Bulk editing: create a new document version instead of replacing the file in place

This commit is contained in:
shamoon
2025-09-08 08:13:41 -07:00
parent 9e8b6071a3
commit 67f9375d77

View File

@@ -1,6 +1,5 @@
from __future__ import annotations from __future__ import annotations
import hashlib
import itertools import itertools
import logging import logging
import tempfile import tempfile
@@ -283,10 +282,8 @@ def rotate(doc_ids: list[int], degrees: int) -> Literal["OK"]:
f"Attempting to rotate {len(doc_ids)} documents by {degrees} degrees.", f"Attempting to rotate {len(doc_ids)} documents by {degrees} degrees.",
) )
qs = Document.objects.filter(id__in=doc_ids) qs = Document.objects.filter(id__in=doc_ids)
affected_docs: list[int] = []
import pikepdf import pikepdf
rotate_tasks = []
for doc in qs: for doc in qs:
if doc.mime_type != "application/pdf": if doc.mime_type != "application/pdf":
logger.warning( logger.warning(
@@ -294,28 +291,34 @@ def rotate(doc_ids: list[int], degrees: int) -> Literal["OK"]:
) )
continue continue
try: try:
with pikepdf.open(doc.source_path, allow_overwriting_input=True) as pdf: # Write rotated output to a temp file and create a new version via consume pipeline
filepath: Path = (
Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
/ f"{doc.id}_rotated.pdf"
)
with pikepdf.open(doc.source_path) as pdf:
for page in pdf.pages: for page in pdf.pages:
page.rotate(degrees, relative=True) page.rotate(degrees, relative=True)
pdf.save() pdf.remove_unreferenced_resources()
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest() pdf.save(filepath)
doc.save()
rotate_tasks.append( # Preserve metadata/permissions via overrides; mark as new version
update_document_content_maybe_archive_file.s( overrides = DocumentMetadataOverrides().from_document(doc)
document_id=doc.id,
consume_file.delay(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=filepath,
head_version_id=doc.id,
), ),
overrides,
) )
logger.info( logger.info(
f"Rotated document {doc.id} by {degrees} degrees", f"Queued new rotated version for document {doc.id} by {degrees} degrees",
) )
affected_docs.append(doc.id)
except Exception as e: except Exception as e:
logger.exception(f"Error rotating document {doc.id}: {e}") logger.exception(f"Error rotating document {doc.id}: {e}")
if len(affected_docs) > 0:
bulk_update_task = bulk_update_documents.si(document_ids=affected_docs)
chord(header=rotate_tasks, body=bulk_update_task).delay()
return "OK" return "OK"
@@ -478,19 +481,31 @@ def delete_pages(doc_ids: list[int], pages: list[int]) -> Literal["OK"]:
import pikepdf import pikepdf
try: try:
with pikepdf.open(doc.source_path, allow_overwriting_input=True) as pdf: # Produce edited PDF to a temp file and create a new version
filepath: Path = (
Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
/ f"{doc.id}_pages_deleted.pdf"
)
with pikepdf.open(doc.source_path) as pdf:
offset = 1 # pages are 1-indexed offset = 1 # pages are 1-indexed
for page_num in pages: for page_num in pages:
pdf.pages.remove(pdf.pages[page_num - offset]) pdf.pages.remove(pdf.pages[page_num - offset])
offset += 1 # remove() changes the index of the pages offset += 1 # remove() changes the index of the pages
pdf.remove_unreferenced_resources() pdf.remove_unreferenced_resources()
pdf.save() pdf.save(filepath)
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
if doc.page_count is not None: overrides = DocumentMetadataOverrides().from_document(doc)
doc.page_count = doc.page_count - len(pages) consume_file.delay(
doc.save() ConsumableDocument(
update_document_content_maybe_archive_file.delay(document_id=doc.id) source=DocumentSource.ConsumeFolder,
logger.info(f"Deleted pages {pages} from document {doc.id}") original_file=filepath,
head_version_id=doc.id,
),
overrides,
)
logger.info(
f"Queued new version for document {doc.id} after deleting pages {pages}",
)
except Exception as e: except Exception as e:
logger.exception(f"Error deleting pages from document {doc.id}: {e}") logger.exception(f"Error deleting pages from document {doc.id}: {e}")
@@ -542,17 +557,29 @@ def edit_pdf(
dst.pages[-1].rotate(op["rotate"], relative=True) dst.pages[-1].rotate(op["rotate"], relative=True)
if update_document: if update_document:
temp_path = doc.source_path.with_suffix(".tmp.pdf") # Create a new version from the edited PDF rather than replacing in-place
pdf = pdf_docs[0] pdf = pdf_docs[0]
pdf.remove_unreferenced_resources() pdf.remove_unreferenced_resources()
# save the edited PDF to a temporary file in case of errors filepath: Path = (
pdf.save(temp_path) Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
# replace the original document with the edited one / f"{doc.id}_edited.pdf"
temp_path.replace(doc.source_path) )
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest() pdf.save(filepath)
doc.page_count = len(pdf.pages) overrides = (
doc.save() DocumentMetadataOverrides().from_document(doc)
update_document_content_maybe_archive_file.delay(document_id=doc.id) if include_metadata
else DocumentMetadataOverrides()
)
if user is not None:
overrides.owner_id = user.id
consume_file.delay(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=filepath,
head_version_id=doc.id,
),
overrides,
)
else: else:
consume_tasks = [] consume_tasks = []
overrides = ( overrides = (