Feature: PDF actions - merge, split & rotate (#6094)

shamoon authored 2024-03-25 18:41:24 -07:00, committed by GitHub
parent d6d0071175
commit 4af8070450
31 changed files with 1847 additions and 150 deletions

View File

@@ -1,15 +1,27 @@
import hashlib
import itertools
import logging
import os
from typing import Optional
from celery import chord
from django.conf import settings
from django.db.models import Q
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import StoragePath
from documents.permissions import set_permissions_for_object
from documents.tasks import bulk_update_documents
from documents.tasks import consume_file
from documents.tasks import update_document_archive_file
logger = logging.getLogger("paperless.bulk_edit")
def set_correspondent(doc_ids, correspondent):
    if correspondent:
@@ -146,3 +158,137 @@ def set_permissions(doc_ids, set_permissions, owner=None, merge=False):
    bulk_update_documents.delay(document_ids=affected_docs)

    return "OK"
def rotate(doc_ids: list[int], degrees: int):
    logger.info(
        f"Attempting to rotate {len(doc_ids)} documents by {degrees} degrees.",
    )
    qs = Document.objects.filter(id__in=doc_ids)
    affected_docs = []
    import pikepdf

    rotate_tasks = []
    for doc in qs:
        if doc.mime_type != "application/pdf":
            logger.warning(
                f"Document {doc.id} is not a PDF, skipping rotation.",
            )
            continue
        try:
            with pikepdf.open(doc.source_path, allow_overwriting_input=True) as pdf:
                for page in pdf.pages:
                    page.rotate(degrees, relative=True)
                pdf.save()
                doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
                doc.save()
                rotate_tasks.append(
                    update_document_archive_file.s(
                        document_id=doc.id,
                    ),
                )
                logger.info(
                    f"Rotated document {doc.id} by {degrees} degrees",
                )
                affected_docs.append(doc.id)
        except Exception as e:
            logger.exception(f"Error rotating document {doc.id}: {e}")

    if len(affected_docs) > 0:
        bulk_update_task = bulk_update_documents.s(document_ids=affected_docs)
        chord(header=rotate_tasks, body=bulk_update_task).delay()

    return "OK"
def merge(doc_ids: list[int], metadata_document_id: Optional[int] = None):
    logger.info(
        f"Attempting to merge {len(doc_ids)} documents into a single document.",
    )
    qs = Document.objects.filter(id__in=doc_ids)
    affected_docs = []
    import pikepdf

    merged_pdf = pikepdf.new()
    version = merged_pdf.pdf_version
    # use doc_ids to preserve order
    for doc_id in doc_ids:
        doc = qs.get(id=doc_id)
        try:
            with pikepdf.open(str(doc.source_path)) as pdf:
                version = max(version, pdf.pdf_version)
                merged_pdf.pages.extend(pdf.pages)
            affected_docs.append(doc.id)
        except Exception as e:
            logger.exception(
                f"Error merging document {doc.id}, it will not be included in the merge: {e}",
            )

    if len(affected_docs) == 0:
        logger.warning("No documents were merged")
        return "OK"

    filepath = os.path.join(
        settings.SCRATCH_DIR,
        f"{'_'.join([str(doc_id) for doc_id in doc_ids])[:100]}_merged.pdf",
    )
    merged_pdf.remove_unreferenced_resources()
    merged_pdf.save(filepath, min_version=version)
    merged_pdf.close()

    if metadata_document_id:
        metadata_document = qs.get(id=metadata_document_id)
        if metadata_document is not None:
            overrides = DocumentMetadataOverrides.from_document(metadata_document)
            overrides.title = metadata_document.title + " (merged)"
    else:
        overrides = DocumentMetadataOverrides()

    logger.info("Adding merged document to the task queue.")
    consume_file.delay(
        ConsumableDocument(
            source=DocumentSource.ConsumeFolder,
            original_file=filepath,
        ),
        overrides,
    )

    return "OK"
def split(doc_ids: list[int], pages: list[list[int]]):
    logger.info(
        f"Attempting to split document {doc_ids[0]} into {len(pages)} documents",
    )
    doc = Document.objects.get(id=doc_ids[0])
    import pikepdf

    try:
        with pikepdf.open(doc.source_path) as pdf:
            for idx, split_doc in enumerate(pages):
                dst = pikepdf.new()
                for page in split_doc:
                    dst.pages.append(pdf.pages[page - 1])
                filepath = os.path.join(
                    settings.SCRATCH_DIR,
                    f"{doc.id}_{split_doc[0]}-{split_doc[-1]}.pdf",
                )
                dst.remove_unreferenced_resources()
                dst.save(filepath)
                dst.close()

                overrides = DocumentMetadataOverrides.from_document(doc)
                overrides.title = f"{doc.title} (split {idx + 1})"
                logger.info(
                    f"Adding split document with pages {split_doc} to the task queue.",
                )
                consume_file.delay(
                    ConsumableDocument(
                        source=DocumentSource.ConsumeFolder,
                        original_file=filepath,
                    ),
                    overrides,
                )
    except Exception as e:
        logger.exception(f"Error splitting document {doc.id}: {e}")

    return "OK"

View File

@@ -189,13 +189,21 @@ def refresh_metadata_cache(
    cache.touch(doc_key, timeout)


def clear_metadata_cache(document_id: int) -> None:
    doc_key = get_metadata_cache_key(document_id)
    cache.delete(doc_key)


def get_thumbnail_modified_key(document_id: int) -> str:
    """
    Builds the key to store a thumbnail's timestamp
    """
    return f"doc_{document_id}_thumbnail_modified"


def clear_document_caches(document_id: int) -> None:
    """
    Removes all cached items for the given document
    """
    cache.delete_many(
        [
            get_suggestion_cache_key(document_id),
            get_metadata_cache_key(document_id),
            get_thumbnail_modified_key(document_id),
        ],
    )
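
The new helper batches the three per-document keys into a single cache round trip. A minimal sketch of the same pattern against Django's low-level cache API; the suggestion and metadata key formats below are assumptions, only the thumbnail key is spelled out in this diff:

from django.core.cache import cache

def clear_document_caches_sketch(document_id: int) -> None:
    # delete_many() removes all listed keys in one call; missing keys are ignored.
    cache.delete_many(
        [
            f"doc_{document_id}_suggest",             # assumed key format
            f"doc_{document_id}_metadata",            # assumed key format
            f"doc_{document_id}_thumbnail_modified",  # get_thumbnail_modified_key
        ],
    )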

View File

@@ -5,6 +5,8 @@ from pathlib import Path
from typing import Optional
import magic
from guardian.shortcuts import get_groups_with_perms
from guardian.shortcuts import get_users_with_perms
@dataclasses.dataclass
@@ -88,6 +90,44 @@ class DocumentMetadataOverrides:
        return self

    @staticmethod
    def from_document(doc) -> "DocumentMetadataOverrides":
        """
        Fills in the overrides from a document object
        """
        overrides = DocumentMetadataOverrides()
        overrides.title = doc.title
        overrides.correspondent_id = doc.correspondent.id if doc.correspondent else None
        overrides.document_type_id = doc.document_type.id if doc.document_type else None
        overrides.storage_path_id = doc.storage_path.id if doc.storage_path else None
        overrides.owner_id = doc.owner.id if doc.owner else None
        overrides.tag_ids = list(doc.tags.values_list("id", flat=True))

        overrides.view_users = get_users_with_perms(
            doc,
            only_with_perms_in=["view_document"],
        ).values_list("id", flat=True)
        overrides.change_users = get_users_with_perms(
            doc,
            only_with_perms_in=["change_document"],
        ).values_list("id", flat=True)
        overrides.custom_field_ids = list(
            doc.custom_fields.values_list("id", flat=True),
        )

        groups_with_perms = get_groups_with_perms(
            doc,
            attach_perms=True,
        )
        # attach_perms=True returns a dict of {group: [perm, ...]}, so unpack
        # via .items(); iterating the dict directly would yield only the groups.
        overrides.view_groups = [
            group.id
            for group, perms in groups_with_perms.items()
            if "view_document" in perms
        ]
        overrides.change_groups = [
            group.id
            for group, perms in groups_with_perms.items()
            if "change_document" in perms
        ]

        return overrides
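
merge() and split() above rely on this helper to carry the source document's metadata onto the newly consumed file. A two-line sketch, assuming doc is a Document instance:

# Copy metadata from an existing document, then adjust the title,
# as merge() does when metadata_document_id is supplied.
overrides = DocumentMetadataOverrides.from_document(doc)
overrides.title = f"{doc.title} (merged)"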
class DocumentSource(IntEnum):
    """

View File

@@ -869,6 +869,9 @@ class BulkEditSerializer(DocumentListSerializer, SetPermissionsMixin):
"delete",
"redo_ocr",
"set_permissions",
"rotate",
"merge",
"split",
],
label="Method",
write_only=True,
@@ -906,6 +909,12 @@ class BulkEditSerializer(DocumentListSerializer, SetPermissionsMixin):
            return bulk_edit.redo_ocr
        elif method == "set_permissions":
            return bulk_edit.set_permissions
        elif method == "rotate":
            return bulk_edit.rotate
        elif method == "merge":
            return bulk_edit.merge
        elif method == "split":
            return bulk_edit.split
        else:
            raise serializers.ValidationError("Unsupported method.")
@@ -984,6 +993,39 @@ class BulkEditSerializer(DocumentListSerializer, SetPermissionsMixin):
if "merge" not in parameters:
parameters["merge"] = False
    def _validate_parameters_rotate(self, parameters):
        try:
            if (
                "degrees" not in parameters
                or not float(parameters["degrees"]).is_integer()
            ):
                raise serializers.ValidationError("invalid rotation degrees")
        except ValueError:
            raise serializers.ValidationError("invalid rotation degrees")
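
The check accepts anything float() can coerce to a whole number and rejects the rest. A standalone sketch of the same rule (the validator above only catches ValueError; TypeError is added here so the helper is safe on its own):

def degrees_valid(value) -> bool:
    try:
        return float(value).is_integer()
    except (TypeError, ValueError):
        return False

assert degrees_valid(90) and degrees_valid("180") and degrees_valid(-90)
assert not degrees_valid(90.5) and not degrees_valid("foo")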
    def _validate_parameters_split(self, parameters):
        if "pages" not in parameters:
            raise serializers.ValidationError("pages not specified")
        try:
            pages = []
            docs = parameters["pages"].split(",")
            for doc in docs:
                if "-" in doc:
                    pages.append(
                        [
                            x
                            for x in range(
                                int(doc.split("-")[0]),
                                int(doc.split("-")[1]) + 1,
                            )
                        ],
                    )
                else:
                    pages.append([int(doc)])
            parameters["pages"] = pages
        except ValueError:
            raise serializers.ValidationError("invalid pages specified")
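
The pages parameter is a comma-separated list of single pages and inclusive ranges, so "1,2-4,5-6,7" becomes [[1], [2, 3, 4], [5, 6], [7]] (the same expectation the API test below asserts). A standalone sketch of the parse:

def parse_pages(spec: str) -> list[list[int]]:
    # "1,2-4" -> [[1], [2, 3, 4]]; ranges are inclusive and pages 1-based.
    pages: list[list[int]] = []
    for part in spec.split(","):
        if "-" in part:
            start, end = part.split("-")
            pages.append(list(range(int(start), int(end) + 1)))
        else:
            pages.append([int(part)])
    return pages

assert parse_pages("1,2-4,5-6,7") == [[1], [2, 3, 4], [5, 6], [7]]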
    def validate(self, attrs):
        method = attrs["method"]
        parameters = attrs["parameters"]
@@ -1000,6 +1042,14 @@ class BulkEditSerializer(DocumentListSerializer, SetPermissionsMixin):
            self._validate_storage_path(parameters)
        elif method == bulk_edit.set_permissions:
            self._validate_parameters_set_permissions(parameters)
        elif method == bulk_edit.rotate:
            self._validate_parameters_rotate(parameters)
        elif method == bulk_edit.split:
            if len(attrs["documents"]) > 1:
                raise serializers.ValidationError(
                    "Split method only supports one document",
                )
            self._validate_parameters_split(parameters)

        return attrs

View File

@@ -23,7 +23,7 @@ from filelock import FileLock
from guardian.shortcuts import remove_perm
from documents import matching
from documents.caching import clear_metadata_cache
from documents.caching import clear_document_caches
from documents.classifier import DocumentClassifier
from documents.consumer import parse_doc_title_w_placeholders
from documents.file_handling import create_source_path_directory
@@ -439,7 +439,8 @@ def update_filename_and_move_files(sender, instance: Document, **kwargs):
            archive_filename=instance.archive_filename,
            modified=timezone.now(),
        )
        clear_metadata_cache(instance.pk)
        # Clear any caching for this document. Slightly overkill, but not terrible
        clear_document_caches(instance.pk)
    except (OSError, DatabaseError, CannotMoveFilesException) as e:
        logger.warning(f"Exception during file handling: {e}")

View File

@@ -18,6 +18,7 @@ from whoosh.writing import AsyncWriter
from documents import index
from documents import sanity_checker
from documents.barcodes import BarcodePlugin
from documents.caching import clear_document_caches
from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
from documents.consumer import Consumer
@@ -213,6 +214,7 @@ def bulk_update_documents(document_ids):
    ix = index.open_index()

    for doc in documents:
        clear_document_caches(doc.pk)
        document_updated.send(
            sender=None,
            document=doc,
@@ -305,6 +307,8 @@ def update_document_archive_file(document_id):
        with index.open_index_writer() as writer:
            index.update_document(writer, document)

        clear_document_caches(document.pk)

    except Exception:
        logger.exception(
            f"Error while parsing document {document} (ID: {document_id})",

View File

@@ -781,3 +781,153 @@ class TestBulkEditAPI(DirectoriesMixin, APITestCase):
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        m.assert_called_once()
@mock.patch("documents.serialisers.bulk_edit.rotate")
def test_rotate(self, m):
m.return_value = "OK"
response = self.client.post(
"/api/documents/bulk_edit/",
json.dumps(
{
"documents": [self.doc2.id, self.doc3.id],
"method": "rotate",
"parameters": {"degrees": 90},
},
),
content_type="application/json",
)
self.assertEqual(response.status_code, status.HTTP_200_OK)
m.assert_called_once()
args, kwargs = m.call_args
self.assertCountEqual(args[0], [self.doc2.id, self.doc3.id])
self.assertEqual(kwargs["degrees"], 90)
@mock.patch("documents.serialisers.bulk_edit.rotate")
def test_rotate_invalid_params(self, m):
response = self.client.post(
"/api/documents/bulk_edit/",
json.dumps(
{
"documents": [self.doc2.id, self.doc3.id],
"method": "rotate",
"parameters": {"degrees": "foo"},
},
),
content_type="application/json",
)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
response = self.client.post(
"/api/documents/bulk_edit/",
json.dumps(
{
"documents": [self.doc2.id, self.doc3.id],
"method": "rotate",
"parameters": {"degrees": 90.5},
},
),
content_type="application/json",
)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
m.assert_not_called()
@mock.patch("documents.serialisers.bulk_edit.merge")
def test_merge(self, m):
m.return_value = "OK"
response = self.client.post(
"/api/documents/bulk_edit/",
json.dumps(
{
"documents": [self.doc2.id, self.doc3.id],
"method": "merge",
"parameters": {"metadata_document_id": self.doc3.id},
},
),
content_type="application/json",
)
self.assertEqual(response.status_code, status.HTTP_200_OK)
m.assert_called_once()
args, kwargs = m.call_args
self.assertCountEqual(args[0], [self.doc2.id, self.doc3.id])
self.assertEqual(kwargs["metadata_document_id"], self.doc3.id)
@mock.patch("documents.serialisers.bulk_edit.split")
def test_split(self, m):
m.return_value = "OK"
response = self.client.post(
"/api/documents/bulk_edit/",
json.dumps(
{
"documents": [self.doc2.id],
"method": "split",
"parameters": {"pages": "1,2-4,5-6,7"},
},
),
content_type="application/json",
)
self.assertEqual(response.status_code, status.HTTP_200_OK)
m.assert_called_once()
args, kwargs = m.call_args
self.assertCountEqual(args[0], [self.doc2.id])
self.assertEqual(kwargs["pages"], [[1], [2, 3, 4], [5, 6], [7]])
    def test_split_invalid_params(self):
        response = self.client.post(
            "/api/documents/bulk_edit/",
            json.dumps(
                {
                    "documents": [self.doc2.id],
                    "method": "split",
                    "parameters": {},  # pages not specified
                },
            ),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
        self.assertIn(b"pages not specified", response.content)

        response = self.client.post(
            "/api/documents/bulk_edit/",
            json.dumps(
                {
                    "documents": [self.doc2.id],
                    "method": "split",
                    "parameters": {"pages": "1:7"},  # wrong format
                },
            ),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
        self.assertIn(b"invalid pages specified", response.content)

        response = self.client.post(
            "/api/documents/bulk_edit/",
            json.dumps(
                {
                    "documents": [
                        self.doc1.id,
                        self.doc2.id,
                    ],  # only one document supported
                    "method": "split",
                    "parameters": {"pages": "1-2,3-7"},
                },
            ),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
        self.assertIn(b"Split method only supports one document", response.content)

View File

@@ -1,3 +1,5 @@
import shutil
from pathlib import Path
from unittest import mock
from django.contrib.auth.models import Group
@@ -275,3 +277,262 @@ class TestBulkEdit(DirectoriesMixin, TestCase):
            self.doc1,
        )
        self.assertEqual(groups_with_perms.count(), 2)
class TestPDFActions(DirectoriesMixin, TestCase):
    def setUp(self):
        super().setUp()
        sample1 = self.dirs.scratch_dir / "sample.pdf"
        shutil.copy(
            Path(__file__).parent
            / "samples"
            / "documents"
            / "originals"
            / "0000001.pdf",
            sample1,
        )
        sample1_archive = self.dirs.archive_dir / "sample_archive.pdf"
        shutil.copy(
            Path(__file__).parent
            / "samples"
            / "documents"
            / "originals"
            / "0000001.pdf",
            sample1_archive,
        )
        sample2 = self.dirs.scratch_dir / "sample2.pdf"
        shutil.copy(
            Path(__file__).parent
            / "samples"
            / "documents"
            / "originals"
            / "0000002.pdf",
            sample2,
        )
        sample2_archive = self.dirs.archive_dir / "sample2_archive.pdf"
        shutil.copy(
            Path(__file__).parent
            / "samples"
            / "documents"
            / "originals"
            / "0000002.pdf",
            sample2_archive,
        )
        sample3 = self.dirs.scratch_dir / "sample3.pdf"
        shutil.copy(
            Path(__file__).parent
            / "samples"
            / "documents"
            / "originals"
            / "0000003.pdf",
            sample3,
        )
        self.doc1 = Document.objects.create(
            checksum="A",
            title="A",
            filename=sample1,
            mime_type="application/pdf",
        )
        self.doc1.archive_filename = sample1_archive
        self.doc1.save()
        self.doc2 = Document.objects.create(
            checksum="B",
            title="B",
            filename=sample2,
            mime_type="application/pdf",
        )
        self.doc2.archive_filename = sample2_archive
        self.doc2.save()
        self.doc3 = Document.objects.create(
            checksum="C",
            title="C",
            filename=sample3,
            mime_type="application/pdf",
        )
        img_doc = self.dirs.scratch_dir / "sample_image.jpg"
        shutil.copy(
            Path(__file__).parent / "samples" / "simple.jpg",
            img_doc,
        )
        self.img_doc = Document.objects.create(
            checksum="D",
            title="D",
            filename=img_doc,
            mime_type="image/jpeg",
        )
@mock.patch("documents.tasks.consume_file.delay")
def test_merge(self, mock_consume_file):
"""
GIVEN:
- Existing documents
WHEN:
- Merge action is called with 3 documents
THEN:
- Consume file should be called
"""
doc_ids = [self.doc1.id, self.doc2.id, self.doc3.id]
metadata_document_id = self.doc1.id
result = bulk_edit.merge(doc_ids)
expected_filename = (
f"{'_'.join([str(doc_id) for doc_id in doc_ids])[:100]}_merged.pdf"
)
mock_consume_file.assert_called()
consume_file_args, _ = mock_consume_file.call_args
self.assertEqual(
Path(consume_file_args[0].original_file).name,
expected_filename,
)
self.assertEqual(consume_file_args[1].title, None)
# With metadata_document_id overrides
result = bulk_edit.merge(doc_ids, metadata_document_id=metadata_document_id)
consume_file_args, _ = mock_consume_file.call_args
self.assertEqual(consume_file_args[1].title, "A (merged)")
self.assertEqual(result, "OK")
@mock.patch("documents.tasks.consume_file.delay")
@mock.patch("pikepdf.open")
def test_merge_with_errors(self, mock_open_pdf, mock_consume_file):
"""
GIVEN:
- Existing documents
WHEN:
- Merge action is called with 2 documents
- Error occurs when opening both files
THEN:
- Consume file should not be called
"""
mock_open_pdf.side_effect = Exception("Error opening PDF")
doc_ids = [self.doc2.id, self.doc3.id]
with self.assertLogs("paperless.bulk_edit", level="ERROR") as cm:
bulk_edit.merge(doc_ids)
error_str = cm.output[0]
expected_str = (
"Error merging document 2, it will not be included in the merge"
)
self.assertIn(expected_str, error_str)
mock_consume_file.assert_not_called()
@mock.patch("documents.tasks.consume_file.delay")
def test_split(self, mock_consume_file):
"""
GIVEN:
- Existing documents
WHEN:
- Split action is called with 1 document and 2 pages
THEN:
- Consume file should be called twice
"""
doc_ids = [self.doc2.id]
pages = [[1, 2], [3]]
result = bulk_edit.split(doc_ids, pages)
self.assertEqual(mock_consume_file.call_count, 2)
consume_file_args, _ = mock_consume_file.call_args
self.assertEqual(consume_file_args[1].title, "B (split 2)")
self.assertEqual(result, "OK")
@mock.patch("documents.tasks.consume_file.delay")
@mock.patch("pikepdf.Pdf.save")
def test_split_with_errors(self, mock_save_pdf, mock_consume_file):
"""
GIVEN:
- Existing documents
WHEN:
- Split action is called with 1 document and 2 page groups
- Error occurs when saving the files
THEN:
- Consume file should not be called
"""
mock_save_pdf.side_effect = Exception("Error saving PDF")
doc_ids = [self.doc2.id]
pages = [[1, 2], [3]]
with self.assertLogs("paperless.bulk_edit", level="ERROR") as cm:
bulk_edit.split(doc_ids, pages)
error_str = cm.output[0]
expected_str = "Error splitting document 2"
self.assertIn(expected_str, error_str)
mock_consume_file.assert_not_called()
@mock.patch("documents.tasks.bulk_update_documents.s")
@mock.patch("documents.tasks.update_document_archive_file.s")
@mock.patch("celery.chord.delay")
def test_rotate(self, mock_chord, mock_update_document, mock_update_documents):
"""
GIVEN:
- Existing documents
WHEN:
- Rotate action is called with 2 documents
THEN:
- Rotate action should be called twice
"""
doc_ids = [self.doc1.id, self.doc2.id]
result = bulk_edit.rotate(doc_ids, 90)
self.assertEqual(mock_update_document.call_count, 2)
mock_update_documents.assert_called_once()
mock_chord.assert_called_once()
self.assertEqual(result, "OK")
@mock.patch("documents.tasks.bulk_update_documents.s")
@mock.patch("documents.tasks.update_document_archive_file.s")
@mock.patch("pikepdf.Pdf.save")
def test_rotate_with_error(
self,
mock_pdf_save,
mock_update_archive_file,
mock_update_documents,
):
"""
GIVEN:
- Existing documents
WHEN:
- Rotate action is called with 2 documents
- PikePDF raises an error
THEN:
- Rotate action should be called 0 times
"""
mock_pdf_save.side_effect = Exception("Error saving PDF")
doc_ids = [self.doc2.id, self.doc3.id]
with self.assertLogs("paperless.bulk_edit", level="ERROR") as cm:
bulk_edit.rotate(doc_ids, 90)
error_str = cm.output[0]
expected_str = "Error rotating document"
self.assertIn(expected_str, error_str)
mock_update_archive_file.assert_not_called()
@mock.patch("documents.tasks.bulk_update_documents.s")
@mock.patch("documents.tasks.update_document_archive_file.s")
@mock.patch("celery.chord.delay")
def test_rotate_non_pdf(
self,
mock_chord,
mock_update_document,
mock_update_documents,
):
"""
GIVEN:
- Existing documents
WHEN:
- Rotate action is called with 2 documents, one of which is not a PDF
THEN:
- Rotate action should be performed 1 time, with the non-PDF document skipped
"""
with self.assertLogs("paperless.bulk_edit", level="INFO") as cm:
result = bulk_edit.rotate([self.doc2.id, self.img_doc.id], 90)
output_str = cm.output[1]
expected_str = "Document 4 is not a PDF, skipping rotation"
self.assertIn(expected_str, output_str)
self.assertEqual(mock_update_document.call_count, 1)
mock_update_documents.assert_called_once()
mock_chord.assert_called_once()
self.assertEqual(result, "OK")

View File

@@ -891,7 +891,8 @@ class BulkEditView(GenericAPIView, PassUserMixin):
        document_objs = Document.objects.filter(pk__in=documents)
        has_perms = (
            all((doc.owner == user or doc.owner is None) for doc in document_objs)
            if method == bulk_edit.set_permissions
            if method
            in [bulk_edit.set_permissions, bulk_edit.delete, bulk_edit.rotate]
            else all(
                has_perms_owner_aware(user, "change_document", doc)
                for doc in document_objs