From 9c1561adfbeecab3fdb30b9a2988f63ccb3f1cbe Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:01:13 -0800 Subject: [PATCH] Change: change update content to handle archive disabled (#8315) --- src/documents/bulk_edit.py | 8 +- .../management/commands/document_archiver.py | 6 +- src/documents/tasks.py | 48 ++++++++---- src/documents/tests/test_bulk_edit.py | 10 +-- src/documents/tests/test_management.py | 10 +-- src/documents/tests/test_tasks.py | 74 +++++++++++++++++++ 6 files changed, 124 insertions(+), 32 deletions(-) diff --git a/src/documents/bulk_edit.py b/src/documents/bulk_edit.py index 4d03769c8..83be5eea9 100644 --- a/src/documents/bulk_edit.py +++ b/src/documents/bulk_edit.py @@ -24,7 +24,7 @@ from documents.models import StoragePath from documents.permissions import set_permissions_for_object from documents.tasks import bulk_update_documents from documents.tasks import consume_file -from documents.tasks import update_document_archive_file +from documents.tasks import update_document_content_maybe_archive_file logger: logging.Logger = logging.getLogger("paperless.bulk_edit") @@ -191,7 +191,7 @@ def delete(doc_ids: list[int]) -> Literal["OK"]: def reprocess(doc_ids: list[int]) -> Literal["OK"]: for document_id in doc_ids: - update_document_archive_file.delay( + update_document_content_maybe_archive_file.delay( document_id=document_id, ) @@ -245,7 +245,7 @@ def rotate(doc_ids: list[int], degrees: int) -> Literal["OK"]: doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest() doc.save() rotate_tasks.append( - update_document_archive_file.s( + update_document_content_maybe_archive_file.s( document_id=doc.id, ), ) @@ -423,7 +423,7 @@ def delete_pages(doc_ids: list[int], pages: list[int]) -> Literal["OK"]: if doc.page_count is not None: doc.page_count = doc.page_count - len(pages) doc.save() - update_document_archive_file.delay(document_id=doc.id) + update_document_content_maybe_archive_file.delay(document_id=doc.id) logger.info(f"Deleted pages {pages} from document {doc.id}") except Exception as e: logger.exception(f"Error deleting pages from document {doc.id}: {e}") diff --git a/src/documents/management/commands/document_archiver.py b/src/documents/management/commands/document_archiver.py index 12677dd79..1aa52117a 100644 --- a/src/documents/management/commands/document_archiver.py +++ b/src/documents/management/commands/document_archiver.py @@ -9,7 +9,7 @@ from django.core.management.base import BaseCommand from documents.management.commands.mixins import MultiProcessMixin from documents.management.commands.mixins import ProgressBarMixin from documents.models import Document -from documents.tasks import update_document_archive_file +from documents.tasks import update_document_content_maybe_archive_file logger = logging.getLogger("paperless.management.archiver") @@ -77,13 +77,13 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): if self.process_count == 1: for doc_id in document_ids: - update_document_archive_file(doc_id) + update_document_content_maybe_archive_file(doc_id) else: # pragma: no cover with multiprocessing.Pool(self.process_count) as pool: list( tqdm.tqdm( pool.imap_unordered( - update_document_archive_file, + update_document_content_maybe_archive_file, document_ids, ), total=len(document_ids), diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 8f5ee51bc..e04cdb34e 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -206,9 +206,10 @@ def bulk_update_documents(document_ids): @shared_task -def update_document_archive_file(document_id): +def update_document_content_maybe_archive_file(document_id): """ - Re-creates the archive file of a document, including new OCR content and thumbnail + Re-creates OCR content and thumbnail for a document, and archive file if + it exists. """ document = Document.objects.get(id=document_id) @@ -234,8 +235,9 @@ def update_document_archive_file(document_id): document.get_public_filename(), ) - if parser.get_archive_path(): - with transaction.atomic(): + with transaction.atomic(): + oldDocument = Document.objects.get(pk=document.pk) + if parser.get_archive_path(): with open(parser.get_archive_path(), "rb") as f: checksum = hashlib.md5(f.read()).hexdigest() # I'm going to save first so that in case the file move @@ -246,7 +248,6 @@ def update_document_archive_file(document_id): document, archive_filename=True, ) - oldDocument = Document.objects.get(pk=document.pk) Document.objects.filter(pk=document.pk).update( archive_checksum=checksum, content=parser.get_text(), @@ -268,24 +269,41 @@ def update_document_archive_file(document_id): ], }, additional_data={ - "reason": "Update document archive file", + "reason": "Update document content", + }, + action=LogEntry.Action.UPDATE, + ) + else: + Document.objects.filter(pk=document.pk).update( + content=parser.get_text(), + ) + + if settings.AUDIT_LOG_ENABLED: + LogEntry.objects.log_create( + instance=oldDocument, + changes={ + "content": [oldDocument.content, parser.get_text()], + }, + additional_data={ + "reason": "Update document content", }, action=LogEntry.Action.UPDATE, ) - with FileLock(settings.MEDIA_LOCK): + with FileLock(settings.MEDIA_LOCK): + if parser.get_archive_path(): create_source_path_directory(document.archive_path) shutil.move(parser.get_archive_path(), document.archive_path) - shutil.move(thumbnail, document.thumbnail_path) + shutil.move(thumbnail, document.thumbnail_path) - document.refresh_from_db() - logger.info( - f"Updating index for document {document_id} ({document.archive_checksum})", - ) - with index.open_index_writer() as writer: - index.update_document(writer, document) + document.refresh_from_db() + logger.info( + f"Updating index for document {document_id} ({document.archive_checksum})", + ) + with index.open_index_writer() as writer: + index.update_document(writer, document) - clear_document_caches(document.pk) + clear_document_caches(document.pk) except Exception: logger.exception( diff --git a/src/documents/tests/test_bulk_edit.py b/src/documents/tests/test_bulk_edit.py index c6e846a77..bb5ebf04d 100644 --- a/src/documents/tests/test_bulk_edit.py +++ b/src/documents/tests/test_bulk_edit.py @@ -607,7 +607,7 @@ class TestPDFActions(DirectoriesMixin, TestCase): mock_consume_file.assert_not_called() @mock.patch("documents.tasks.bulk_update_documents.si") - @mock.patch("documents.tasks.update_document_archive_file.s") + @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s") @mock.patch("celery.chord.delay") def test_rotate(self, mock_chord, mock_update_document, mock_update_documents): """ @@ -626,7 +626,7 @@ class TestPDFActions(DirectoriesMixin, TestCase): self.assertEqual(result, "OK") @mock.patch("documents.tasks.bulk_update_documents.si") - @mock.patch("documents.tasks.update_document_archive_file.s") + @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s") @mock.patch("pikepdf.Pdf.save") def test_rotate_with_error( self, @@ -654,7 +654,7 @@ class TestPDFActions(DirectoriesMixin, TestCase): mock_update_archive_file.assert_not_called() @mock.patch("documents.tasks.bulk_update_documents.si") - @mock.patch("documents.tasks.update_document_archive_file.s") + @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s") @mock.patch("celery.chord.delay") def test_rotate_non_pdf( self, @@ -680,7 +680,7 @@ class TestPDFActions(DirectoriesMixin, TestCase): mock_chord.assert_called_once() self.assertEqual(result, "OK") - @mock.patch("documents.tasks.update_document_archive_file.delay") + @mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay") @mock.patch("pikepdf.Pdf.save") def test_delete_pages(self, mock_pdf_save, mock_update_archive_file): """ @@ -705,7 +705,7 @@ class TestPDFActions(DirectoriesMixin, TestCase): self.doc2.refresh_from_db() self.assertEqual(self.doc2.page_count, expected_page_count) - @mock.patch("documents.tasks.update_document_archive_file.delay") + @mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay") @mock.patch("pikepdf.Pdf.save") def test_delete_pages_with_error(self, mock_pdf_save, mock_update_archive_file): """ diff --git a/src/documents/tests/test_management.py b/src/documents/tests/test_management.py index 76a0a2c74..5340035e7 100644 --- a/src/documents/tests/test_management.py +++ b/src/documents/tests/test_management.py @@ -13,7 +13,7 @@ from django.test import override_settings from documents.file_handling import generate_filename from documents.models import Document -from documents.tasks import update_document_archive_file +from documents.tasks import update_document_content_maybe_archive_file from documents.tests.utils import DirectoriesMixin from documents.tests.utils import FileSystemAssertsMixin @@ -46,7 +46,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase): os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"), ) - update_document_archive_file(doc.pk) + update_document_content_maybe_archive_file(doc.pk) doc = Document.objects.get(id=doc.id) @@ -63,7 +63,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase): doc.save() shutil.copy(sample_file, doc.source_path) - update_document_archive_file(doc.pk) + update_document_content_maybe_archive_file(doc.pk) doc = Document.objects.get(id=doc.id) @@ -94,8 +94,8 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase): os.path.join(self.dirs.originals_dir, "document_01.pdf"), ) - update_document_archive_file(doc2.pk) - update_document_archive_file(doc1.pk) + update_document_content_maybe_archive_file(doc2.pk) + update_document_content_maybe_archive_file(doc1.pk) doc1 = Document.objects.get(id=doc1.id) doc2 = Document.objects.get(id=doc2.id) diff --git a/src/documents/tests/test_tasks.py b/src/documents/tests/test_tasks.py index a1d21b532..0f9f8511f 100644 --- a/src/documents/tests/test_tasks.py +++ b/src/documents/tests/test_tasks.py @@ -1,5 +1,7 @@ import os +import shutil from datetime import timedelta +from pathlib import Path from unittest import mock from django.conf import settings @@ -184,3 +186,75 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase): tasks.empty_trash() self.assertEqual(Document.global_objects.count(), 0) + + +class TestUpdateContent(DirectoriesMixin, TestCase): + def test_update_content_maybe_archive_file(self): + """ + GIVEN: + - Existing document with archive file + WHEN: + - Update content task is called + THEN: + - Document is reprocessed, content and checksum are updated + """ + sample1 = self.dirs.scratch_dir / "sample.pdf" + shutil.copy( + Path(__file__).parent + / "samples" + / "documents" + / "originals" + / "0000001.pdf", + sample1, + ) + sample1_archive = self.dirs.archive_dir / "sample_archive.pdf" + shutil.copy( + Path(__file__).parent + / "samples" + / "documents" + / "originals" + / "0000001.pdf", + sample1_archive, + ) + doc = Document.objects.create( + title="test", + content="my document", + checksum="wow", + archive_checksum="wow", + filename=sample1, + mime_type="application/pdf", + archive_filename=sample1_archive, + ) + + tasks.update_document_content_maybe_archive_file(doc.pk) + self.assertNotEqual(Document.objects.get(pk=doc.pk).content, "test") + self.assertNotEqual(Document.objects.get(pk=doc.pk).archive_checksum, "wow") + + def test_update_content_maybe_archive_file_no_archive(self): + """ + GIVEN: + - Existing document without archive file + WHEN: + - Update content task is called + THEN: + - Document is reprocessed, content is updated + """ + sample1 = self.dirs.scratch_dir / "sample.pdf" + shutil.copy( + Path(__file__).parent + / "samples" + / "documents" + / "originals" + / "0000001.pdf", + sample1, + ) + doc = Document.objects.create( + title="test", + content="my document", + checksum="wow", + filename=sample1, + mime_type="application/pdf", + ) + + tasks.update_document_content_maybe_archive_file(doc.pk) + self.assertNotEqual(Document.objects.get(pk=doc.pk).content, "test")