Bulk edit and actions should update version

This commit is contained in:
shamoon
2026-02-10 16:42:38 -08:00
parent 8014932419
commit daa4586eeb
2 changed files with 147 additions and 54 deletions

View File

@@ -1,6 +1,5 @@
from __future__ import annotations from __future__ import annotations
import hashlib
import logging import logging
import tempfile import tempfile
from pathlib import Path from pathlib import Path
@@ -73,6 +72,46 @@ def restore_archive_serial_numbers(backup: dict[int, int | None]) -> None:
logger.info(f"Restored archive serial numbers for documents {list(backup.keys())}") logger.info(f"Restored archive serial numbers for documents {list(backup.keys())}")
def _get_root_ids_by_doc_id(doc_ids: list[int]) -> dict[int, int]:
    """
    Map each provided document id to the id of its root document.

    A root document maps to itself; a version document maps to its
    `root_document_id`.
    """
    mapping: dict[int, int] = {}
    for document in Document.objects.filter(id__in=doc_ids).only(
        "id",
        "root_document_id",
    ):
        # A missing root_document_id means the document is itself the root.
        mapping[document.id] = document.root_document_id or document.id
    return mapping
def _get_root_and_current_docs_by_root_id(
    root_ids: set[int],
) -> tuple[dict[int, Document], dict[int, Document]]:
    """
    Look up root documents and their newest versions.

    Returns:
        - root_docs: root_id -> root Document
        - current_docs: root_id -> newest version Document (or root if none)
    """
    root_docs: dict[int, Document] = {}
    for root in Document.objects.filter(id__in=root_ids).select_related(
        "owner",
    ):
        root_docs[root.id] = root

    # Versions are sorted newest-first within each root, so the first one
    # encountered for a given root id is that root's latest version.
    newest_version_by_root_id: dict[int, Document] = {}
    version_qs = Document.objects.filter(root_document_id__in=root_ids).order_by(
        "root_document_id",
        "-id",
    )
    for version in version_qs:
        if version.root_document_id not in newest_version_by_root_id:
            newest_version_by_root_id[version.root_document_id] = version

    current_docs: dict[int, Document] = {}
    for root_id, root in root_docs.items():
        current_docs[root_id] = newest_version_by_root_id.get(root_id, root)
    return root_docs, current_docs
def set_correspondent( def set_correspondent(
doc_ids: list[int], doc_ids: list[int],
correspondent: Correspondent, correspondent: Correspondent,
@@ -379,43 +418,49 @@ def rotate(doc_ids: list[int], degrees: int) -> Literal["OK"]:
logger.info( logger.info(
f"Attempting to rotate {len(doc_ids)} documents by {degrees} degrees.", f"Attempting to rotate {len(doc_ids)} documents by {degrees} degrees.",
) )
qs = Document.objects.filter(id__in=doc_ids) doc_to_root_id = _get_root_ids_by_doc_id(doc_ids)
root_ids = set(doc_to_root_id.values())
root_docs_by_id, current_docs_by_root_id = _get_root_and_current_docs_by_root_id(
root_ids,
)
import pikepdf import pikepdf
for doc in qs: for root_id in root_ids:
if doc.mime_type != "application/pdf": root_doc = root_docs_by_id[root_id]
source_doc = current_docs_by_root_id[root_id]
if source_doc.mime_type != "application/pdf":
logger.warning( logger.warning(
f"Document {doc.id} is not a PDF, skipping rotation.", f"Document {root_doc.id} is not a PDF, skipping rotation.",
) )
continue continue
try: try:
# Write rotated output to a temp file and create a new version via consume pipeline # Write rotated output to a temp file and create a new version via consume pipeline
filepath: Path = ( filepath: Path = (
Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR)) Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
/ f"{doc.id}_rotated.pdf" / f"{root_doc.id}_rotated.pdf"
) )
with pikepdf.open(doc.source_path) as pdf: with pikepdf.open(source_doc.source_path) as pdf:
for page in pdf.pages: for page in pdf.pages:
page.rotate(degrees, relative=True) page.rotate(degrees, relative=True)
pdf.remove_unreferenced_resources() pdf.remove_unreferenced_resources()
pdf.save(filepath) pdf.save(filepath)
# Preserve metadata/permissions via overrides; mark as new version # Preserve metadata/permissions via overrides; mark as new version
overrides = DocumentMetadataOverrides().from_document(doc) overrides = DocumentMetadataOverrides().from_document(root_doc)
consume_file.delay( consume_file.delay(
ConsumableDocument( ConsumableDocument(
source=DocumentSource.ConsumeFolder, source=DocumentSource.ConsumeFolder,
original_file=filepath, original_file=filepath,
root_document_id=doc.id, root_document_id=root_doc.id,
), ),
overrides, overrides,
) )
logger.info( logger.info(
f"Queued new rotated version for document {doc.id} by {degrees} degrees", f"Queued new rotated version for document {root_doc.id} by {degrees} degrees",
) )
except Exception as e: except Exception as e:
logger.exception(f"Error rotating document {doc.id}: {e}") logger.exception(f"Error rotating document {root_doc.id}: {e}")
return "OK" return "OK"
@@ -604,7 +649,14 @@ def delete_pages(doc_ids: list[int], pages: list[int]) -> Literal["OK"]:
logger.info( logger.info(
f"Attempting to delete pages {pages} from {len(doc_ids)} documents", f"Attempting to delete pages {pages} from {len(doc_ids)} documents",
) )
doc = Document.objects.get(id=doc_ids[0]) doc = Document.objects.select_related("root_document").get(id=doc_ids[0])
root_doc = doc if doc.root_document_id is None else doc.root_document
source_doc = (
Document.objects.filter(Q(id=root_doc.id) | Q(root_document=root_doc))
.order_by("-id")
.first()
or root_doc
)
pages = sorted(pages) # sort pages to avoid index issues pages = sorted(pages) # sort pages to avoid index issues
import pikepdf import pikepdf
@@ -612,9 +664,9 @@ def delete_pages(doc_ids: list[int], pages: list[int]) -> Literal["OK"]:
# Produce edited PDF to a temp file and create a new version # Produce edited PDF to a temp file and create a new version
filepath: Path = ( filepath: Path = (
Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR)) Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
/ f"{doc.id}_pages_deleted.pdf" / f"{root_doc.id}_pages_deleted.pdf"
) )
with pikepdf.open(doc.source_path) as pdf: with pikepdf.open(source_doc.source_path) as pdf:
offset = 1 # pages are 1-indexed offset = 1 # pages are 1-indexed
for page_num in pages: for page_num in pages:
pdf.pages.remove(pdf.pages[page_num - offset]) pdf.pages.remove(pdf.pages[page_num - offset])
@@ -622,20 +674,20 @@ def delete_pages(doc_ids: list[int], pages: list[int]) -> Literal["OK"]:
pdf.remove_unreferenced_resources() pdf.remove_unreferenced_resources()
pdf.save(filepath) pdf.save(filepath)
overrides = DocumentMetadataOverrides().from_document(doc) overrides = DocumentMetadataOverrides().from_document(root_doc)
consume_file.delay( consume_file.delay(
ConsumableDocument( ConsumableDocument(
source=DocumentSource.ConsumeFolder, source=DocumentSource.ConsumeFolder,
original_file=filepath, original_file=filepath,
root_document_id=doc.id, root_document_id=root_doc.id,
), ),
overrides, overrides,
) )
logger.info( logger.info(
f"Queued new version for document {doc.id} after deleting pages {pages}", f"Queued new version for document {root_doc.id} after deleting pages {pages}",
) )
except Exception as e: except Exception as e:
logger.exception(f"Error deleting pages from document {doc.id}: {e}") logger.exception(f"Error deleting pages from document {root_doc.id}: {e}")
return "OK" return "OK"
@@ -660,13 +712,20 @@ def edit_pdf(
logger.info( logger.info(
f"Editing PDF of document {doc_ids[0]} with {len(operations)} operations", f"Editing PDF of document {doc_ids[0]} with {len(operations)} operations",
) )
doc = Document.objects.get(id=doc_ids[0]) doc = Document.objects.select_related("root_document").get(id=doc_ids[0])
root_doc = doc if doc.root_document_id is None else doc.root_document
source_doc = (
Document.objects.filter(Q(id=root_doc.id) | Q(root_document=root_doc))
.order_by("-id")
.first()
or root_doc
)
import pikepdf import pikepdf
pdf_docs: list[pikepdf.Pdf] = [] pdf_docs: list[pikepdf.Pdf] = []
try: try:
with pikepdf.open(doc.source_path) as src: with pikepdf.open(source_doc.source_path) as src:
# prepare output documents # prepare output documents
max_idx = max(op.get("doc", 0) for op in operations) max_idx = max(op.get("doc", 0) for op in operations)
pdf_docs = [pikepdf.new() for _ in range(max_idx + 1)] pdf_docs = [pikepdf.new() for _ in range(max_idx + 1)]
@@ -690,11 +749,11 @@ def edit_pdf(
pdf.remove_unreferenced_resources() pdf.remove_unreferenced_resources()
filepath: Path = ( filepath: Path = (
Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR)) Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
/ f"{doc.id}_edited.pdf" / f"{root_doc.id}_edited.pdf"
) )
pdf.save(filepath) pdf.save(filepath)
overrides = ( overrides = (
DocumentMetadataOverrides().from_document(doc) DocumentMetadataOverrides().from_document(root_doc)
if include_metadata if include_metadata
else DocumentMetadataOverrides() else DocumentMetadataOverrides()
) )
@@ -704,14 +763,14 @@ def edit_pdf(
ConsumableDocument( ConsumableDocument(
source=DocumentSource.ConsumeFolder, source=DocumentSource.ConsumeFolder,
original_file=filepath, original_file=filepath,
root_document_id=doc.id, root_document_id=root_doc.id,
), ),
overrides, overrides,
) )
else: else:
consume_tasks = [] consume_tasks = []
overrides = ( overrides = (
DocumentMetadataOverrides().from_document(doc) DocumentMetadataOverrides().from_document(root_doc)
if include_metadata if include_metadata
else DocumentMetadataOverrides() else DocumentMetadataOverrides()
) )
@@ -720,11 +779,11 @@ def edit_pdf(
if not delete_original: if not delete_original:
overrides.skip_asn_if_exists = True overrides.skip_asn_if_exists = True
if delete_original and len(pdf_docs) == 1: if delete_original and len(pdf_docs) == 1:
overrides.asn = doc.archive_serial_number overrides.asn = root_doc.archive_serial_number
for idx, pdf in enumerate(pdf_docs, start=1): for idx, pdf in enumerate(pdf_docs, start=1):
filepath: Path = ( filepath: Path = (
Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR)) Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
/ f"{doc.id}_edit_{idx}.pdf" / f"{root_doc.id}_edit_{idx}.pdf"
) )
pdf.remove_unreferenced_resources() pdf.remove_unreferenced_resources()
pdf.save(filepath) pdf.save(filepath)
@@ -754,7 +813,7 @@ def edit_pdf(
group(consume_tasks).delay() group(consume_tasks).delay()
except Exception as e: except Exception as e:
logger.exception(f"Error editing document {doc.id}: {e}") logger.exception(f"Error editing document {root_doc.id}: {e}")
raise ValueError( raise ValueError(
f"An error occurred while editing the document: {e}", f"An error occurred while editing the document: {e}",
) from e ) from e
@@ -777,38 +836,53 @@ def remove_password(
import pikepdf import pikepdf
for doc_id in doc_ids: for doc_id in doc_ids:
doc = Document.objects.get(id=doc_id) doc = Document.objects.select_related("root_document").get(id=doc_id)
root_doc = doc if doc.root_document_id is None else doc.root_document
source_doc = (
Document.objects.filter(Q(id=root_doc.id) | Q(root_document=root_doc))
.order_by("-id")
.first()
or root_doc
)
try: try:
logger.info( logger.info(
f"Attempting password removal from document {doc_ids[0]}", f"Attempting password removal from document {doc_ids[0]}",
) )
with pikepdf.open(doc.source_path, password=password) as pdf: with pikepdf.open(source_doc.source_path, password=password) as pdf:
temp_path = doc.source_path.with_suffix(".tmp.pdf") filepath: Path = (
Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
/ f"{root_doc.id}_unprotected.pdf"
)
pdf.remove_unreferenced_resources() pdf.remove_unreferenced_resources()
pdf.save(temp_path) pdf.save(filepath)
if update_document: if update_document:
# replace the original document with the unprotected one # Create a new version rather than modifying the root/original in place.
temp_path.replace(doc.source_path) overrides = (
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest() DocumentMetadataOverrides().from_document(root_doc)
doc.page_count = len(pdf.pages) if include_metadata
doc.save() else DocumentMetadataOverrides()
update_document_content_maybe_archive_file.delay(document_id=doc.id) )
if user is not None:
overrides.owner_id = user.id
consume_file.delay(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=filepath,
root_document_id=root_doc.id,
),
overrides,
)
else: else:
consume_tasks = [] consume_tasks = []
overrides = ( overrides = (
DocumentMetadataOverrides().from_document(doc) DocumentMetadataOverrides().from_document(root_doc)
if include_metadata if include_metadata
else DocumentMetadataOverrides() else DocumentMetadataOverrides()
) )
if user is not None: if user is not None:
overrides.owner_id = user.id overrides.owner_id = user.id
filepath: Path = (
Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
/ f"{doc.id}_unprotected.pdf"
)
temp_path.replace(filepath)
consume_tasks.append( consume_tasks.append(
consume_file.s( consume_file.s(
ConsumableDocument( ConsumableDocument(
@@ -820,12 +894,17 @@ def remove_password(
) )
if delete_original: if delete_original:
chord(header=consume_tasks, body=delete.si([doc.id])).delay() chord(
header=consume_tasks,
body=delete.si([doc.id]),
).delay()
else: else:
group(consume_tasks).delay() group(consume_tasks).delay()
except Exception as e: except Exception as e:
logger.exception(f"Error removing password from document {doc.id}: {e}") logger.exception(
f"Error removing password from document {root_doc.id}: {e}",
)
raise ValueError( raise ValueError(
f"An error occurred while removing the password: {e}", f"An error occurred while removing the password: {e}",
) from e ) from e

View File

@@ -1,4 +1,3 @@
import hashlib
import shutil import shutil
from datetime import date from datetime import date
from pathlib import Path from pathlib import Path
@@ -1241,10 +1240,20 @@ class TestPDFActions(DirectoriesMixin, TestCase):
mock_consume_file.assert_not_called() mock_consume_file.assert_not_called()
@mock.patch("documents.bulk_edit.update_document_content_maybe_archive_file.delay") @mock.patch("documents.bulk_edit.update_document_content_maybe_archive_file.delay")
@mock.patch("documents.tasks.consume_file.delay")
@mock.patch("documents.bulk_edit.tempfile.mkdtemp")
@mock.patch("pikepdf.open") @mock.patch("pikepdf.open")
def test_remove_password_update_document(self, mock_open, mock_update_document): def test_remove_password_update_document(
self,
mock_open,
mock_mkdtemp,
mock_consume_delay,
mock_update_document,
):
doc = self.doc1 doc = self.doc1
original_checksum = doc.checksum temp_dir = self.dirs.scratch_dir / "remove-password-update"
temp_dir.mkdir(parents=True, exist_ok=True)
mock_mkdtemp.return_value = str(temp_dir)
fake_pdf = mock.MagicMock() fake_pdf = mock.MagicMock()
fake_pdf.pages = [mock.Mock(), mock.Mock(), mock.Mock()] fake_pdf.pages = [mock.Mock(), mock.Mock(), mock.Mock()]
@@ -1264,12 +1273,17 @@ class TestPDFActions(DirectoriesMixin, TestCase):
self.assertEqual(result, "OK") self.assertEqual(result, "OK")
mock_open.assert_called_once_with(doc.source_path, password="secret") mock_open.assert_called_once_with(doc.source_path, password="secret")
fake_pdf.remove_unreferenced_resources.assert_called_once() fake_pdf.remove_unreferenced_resources.assert_called_once()
doc.refresh_from_db() mock_update_document.assert_not_called()
self.assertNotEqual(doc.checksum, original_checksum) mock_consume_delay.assert_called_once()
expected_checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest() consumable, overrides = mock_consume_delay.call_args[0]
self.assertEqual(doc.checksum, expected_checksum) expected_path = temp_dir / f"{doc.id}_unprotected.pdf"
self.assertEqual(doc.page_count, len(fake_pdf.pages)) self.assertTrue(expected_path.exists())
mock_update_document.assert_called_once_with(document_id=doc.id) self.assertEqual(
Path(consumable.original_file).resolve(),
expected_path.resolve(),
)
self.assertEqual(consumable.root_document_id, doc.id)
self.assertIsNotNone(overrides)
@mock.patch("documents.bulk_edit.chord") @mock.patch("documents.bulk_edit.chord")
@mock.patch("documents.bulk_edit.group") @mock.patch("documents.bulk_edit.group")