Change: change update content to handle archive disabled (#8315)

This commit is contained in:
shamoon 2024-11-20 12:01:13 -08:00 committed by GitHub
parent d7d3fed833
commit 9c1561adfb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 124 additions and 32 deletions

View File

@ -24,7 +24,7 @@ from documents.models import StoragePath
from documents.permissions import set_permissions_for_object from documents.permissions import set_permissions_for_object
from documents.tasks import bulk_update_documents from documents.tasks import bulk_update_documents
from documents.tasks import consume_file from documents.tasks import consume_file
from documents.tasks import update_document_archive_file from documents.tasks import update_document_content_maybe_archive_file
logger: logging.Logger = logging.getLogger("paperless.bulk_edit") logger: logging.Logger = logging.getLogger("paperless.bulk_edit")
@ -191,7 +191,7 @@ def delete(doc_ids: list[int]) -> Literal["OK"]:
def reprocess(doc_ids: list[int]) -> Literal["OK"]: def reprocess(doc_ids: list[int]) -> Literal["OK"]:
for document_id in doc_ids: for document_id in doc_ids:
update_document_archive_file.delay( update_document_content_maybe_archive_file.delay(
document_id=document_id, document_id=document_id,
) )
@ -245,7 +245,7 @@ def rotate(doc_ids: list[int], degrees: int) -> Literal["OK"]:
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest() doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
doc.save() doc.save()
rotate_tasks.append( rotate_tasks.append(
update_document_archive_file.s( update_document_content_maybe_archive_file.s(
document_id=doc.id, document_id=doc.id,
), ),
) )
@ -423,7 +423,7 @@ def delete_pages(doc_ids: list[int], pages: list[int]) -> Literal["OK"]:
if doc.page_count is not None: if doc.page_count is not None:
doc.page_count = doc.page_count - len(pages) doc.page_count = doc.page_count - len(pages)
doc.save() doc.save()
update_document_archive_file.delay(document_id=doc.id) update_document_content_maybe_archive_file.delay(document_id=doc.id)
logger.info(f"Deleted pages {pages} from document {doc.id}") logger.info(f"Deleted pages {pages} from document {doc.id}")
except Exception as e: except Exception as e:
logger.exception(f"Error deleting pages from document {doc.id}: {e}") logger.exception(f"Error deleting pages from document {doc.id}: {e}")

View File

@ -9,7 +9,7 @@ from django.core.management.base import BaseCommand
from documents.management.commands.mixins import MultiProcessMixin from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document from documents.models import Document
from documents.tasks import update_document_archive_file from documents.tasks import update_document_content_maybe_archive_file
logger = logging.getLogger("paperless.management.archiver") logger = logging.getLogger("paperless.management.archiver")
@ -77,13 +77,13 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
if self.process_count == 1: if self.process_count == 1:
for doc_id in document_ids: for doc_id in document_ids:
update_document_archive_file(doc_id) update_document_content_maybe_archive_file(doc_id)
else: # pragma: no cover else: # pragma: no cover
with multiprocessing.Pool(self.process_count) as pool: with multiprocessing.Pool(self.process_count) as pool:
list( list(
tqdm.tqdm( tqdm.tqdm(
pool.imap_unordered( pool.imap_unordered(
update_document_archive_file, update_document_content_maybe_archive_file,
document_ids, document_ids,
), ),
total=len(document_ids), total=len(document_ids),

View File

@ -206,9 +206,10 @@ def bulk_update_documents(document_ids):
@shared_task @shared_task
def update_document_archive_file(document_id): def update_document_content_maybe_archive_file(document_id):
""" """
Re-creates the archive file of a document, including new OCR content and thumbnail Re-creates OCR content and thumbnail for a document, and archive file if
it exists.
""" """
document = Document.objects.get(id=document_id) document = Document.objects.get(id=document_id)
@ -234,8 +235,9 @@ def update_document_archive_file(document_id):
document.get_public_filename(), document.get_public_filename(),
) )
if parser.get_archive_path(): with transaction.atomic():
with transaction.atomic(): oldDocument = Document.objects.get(pk=document.pk)
if parser.get_archive_path():
with open(parser.get_archive_path(), "rb") as f: with open(parser.get_archive_path(), "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest() checksum = hashlib.md5(f.read()).hexdigest()
# I'm going to save first so that in case the file move # I'm going to save first so that in case the file move
@ -246,7 +248,6 @@ def update_document_archive_file(document_id):
document, document,
archive_filename=True, archive_filename=True,
) )
oldDocument = Document.objects.get(pk=document.pk)
Document.objects.filter(pk=document.pk).update( Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum, archive_checksum=checksum,
content=parser.get_text(), content=parser.get_text(),
@ -268,24 +269,41 @@ def update_document_archive_file(document_id):
], ],
}, },
additional_data={ additional_data={
"reason": "Update document archive file", "reason": "Update document content",
},
action=LogEntry.Action.UPDATE,
)
else:
Document.objects.filter(pk=document.pk).update(
content=parser.get_text(),
)
if settings.AUDIT_LOG_ENABLED:
LogEntry.objects.log_create(
instance=oldDocument,
changes={
"content": [oldDocument.content, parser.get_text()],
},
additional_data={
"reason": "Update document content",
}, },
action=LogEntry.Action.UPDATE, action=LogEntry.Action.UPDATE,
) )
with FileLock(settings.MEDIA_LOCK): with FileLock(settings.MEDIA_LOCK):
if parser.get_archive_path():
create_source_path_directory(document.archive_path) create_source_path_directory(document.archive_path)
shutil.move(parser.get_archive_path(), document.archive_path) shutil.move(parser.get_archive_path(), document.archive_path)
shutil.move(thumbnail, document.thumbnail_path) shutil.move(thumbnail, document.thumbnail_path)
document.refresh_from_db() document.refresh_from_db()
logger.info( logger.info(
f"Updating index for document {document_id} ({document.archive_checksum})", f"Updating index for document {document_id} ({document.archive_checksum})",
) )
with index.open_index_writer() as writer: with index.open_index_writer() as writer:
index.update_document(writer, document) index.update_document(writer, document)
clear_document_caches(document.pk) clear_document_caches(document.pk)
except Exception: except Exception:
logger.exception( logger.exception(

View File

@ -607,7 +607,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
mock_consume_file.assert_not_called() mock_consume_file.assert_not_called()
@mock.patch("documents.tasks.bulk_update_documents.si") @mock.patch("documents.tasks.bulk_update_documents.si")
@mock.patch("documents.tasks.update_document_archive_file.s") @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
@mock.patch("celery.chord.delay") @mock.patch("celery.chord.delay")
def test_rotate(self, mock_chord, mock_update_document, mock_update_documents): def test_rotate(self, mock_chord, mock_update_document, mock_update_documents):
""" """
@ -626,7 +626,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
self.assertEqual(result, "OK") self.assertEqual(result, "OK")
@mock.patch("documents.tasks.bulk_update_documents.si") @mock.patch("documents.tasks.bulk_update_documents.si")
@mock.patch("documents.tasks.update_document_archive_file.s") @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
@mock.patch("pikepdf.Pdf.save") @mock.patch("pikepdf.Pdf.save")
def test_rotate_with_error( def test_rotate_with_error(
self, self,
@ -654,7 +654,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
mock_update_archive_file.assert_not_called() mock_update_archive_file.assert_not_called()
@mock.patch("documents.tasks.bulk_update_documents.si") @mock.patch("documents.tasks.bulk_update_documents.si")
@mock.patch("documents.tasks.update_document_archive_file.s") @mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
@mock.patch("celery.chord.delay") @mock.patch("celery.chord.delay")
def test_rotate_non_pdf( def test_rotate_non_pdf(
self, self,
@ -680,7 +680,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
mock_chord.assert_called_once() mock_chord.assert_called_once()
self.assertEqual(result, "OK") self.assertEqual(result, "OK")
@mock.patch("documents.tasks.update_document_archive_file.delay") @mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
@mock.patch("pikepdf.Pdf.save") @mock.patch("pikepdf.Pdf.save")
def test_delete_pages(self, mock_pdf_save, mock_update_archive_file): def test_delete_pages(self, mock_pdf_save, mock_update_archive_file):
""" """
@ -705,7 +705,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
self.doc2.refresh_from_db() self.doc2.refresh_from_db()
self.assertEqual(self.doc2.page_count, expected_page_count) self.assertEqual(self.doc2.page_count, expected_page_count)
@mock.patch("documents.tasks.update_document_archive_file.delay") @mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
@mock.patch("pikepdf.Pdf.save") @mock.patch("pikepdf.Pdf.save")
def test_delete_pages_with_error(self, mock_pdf_save, mock_update_archive_file): def test_delete_pages_with_error(self, mock_pdf_save, mock_update_archive_file):
""" """

View File

@ -13,7 +13,7 @@ from django.test import override_settings
from documents.file_handling import generate_filename from documents.file_handling import generate_filename
from documents.models import Document from documents.models import Document
from documents.tasks import update_document_archive_file from documents.tasks import update_document_content_maybe_archive_file
from documents.tests.utils import DirectoriesMixin from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin from documents.tests.utils import FileSystemAssertsMixin
@ -46,7 +46,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"), os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
) )
update_document_archive_file(doc.pk) update_document_content_maybe_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id) doc = Document.objects.get(id=doc.id)
@ -63,7 +63,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
doc.save() doc.save()
shutil.copy(sample_file, doc.source_path) shutil.copy(sample_file, doc.source_path)
update_document_archive_file(doc.pk) update_document_content_maybe_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id) doc = Document.objects.get(id=doc.id)
@ -94,8 +94,8 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
os.path.join(self.dirs.originals_dir, "document_01.pdf"), os.path.join(self.dirs.originals_dir, "document_01.pdf"),
) )
update_document_archive_file(doc2.pk) update_document_content_maybe_archive_file(doc2.pk)
update_document_archive_file(doc1.pk) update_document_content_maybe_archive_file(doc1.pk)
doc1 = Document.objects.get(id=doc1.id) doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id) doc2 = Document.objects.get(id=doc2.id)

View File

@ -1,5 +1,7 @@
import os import os
import shutil
from datetime import timedelta from datetime import timedelta
from pathlib import Path
from unittest import mock from unittest import mock
from django.conf import settings from django.conf import settings
@ -184,3 +186,75 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
tasks.empty_trash() tasks.empty_trash()
self.assertEqual(Document.global_objects.count(), 0) self.assertEqual(Document.global_objects.count(), 0)
class TestUpdateContent(DirectoriesMixin, TestCase):
    """
    Tests for tasks.update_document_content_maybe_archive_file, covering a
    document that has an archive file and one that does not.
    """

    # Content every test document starts with. The assertions compare against
    # this value so they fail if the task leaves the content untouched.
    ORIGINAL_CONTENT = "my document"

    def _copy_sample(self, target):
        """
        Copies the bundled sample PDF to the given target path and returns
        the target.
        """
        shutil.copy(
            Path(__file__).parent
            / "samples"
            / "documents"
            / "originals"
            / "0000001.pdf",
            target,
        )
        return target

    def test_update_content_maybe_archive_file(self):
        """
        GIVEN:
            - Existing document with archive file
        WHEN:
            - Update content task is called
        THEN:
            - Document is reprocessed, content and checksum are updated
        """
        sample1 = self._copy_sample(self.dirs.scratch_dir / "sample.pdf")
        sample1_archive = self._copy_sample(
            self.dirs.archive_dir / "sample_archive.pdf",
        )

        doc = Document.objects.create(
            title="test",
            content=self.ORIGINAL_CONTENT,
            checksum="wow",
            archive_checksum="wow",
            filename=sample1,
            mime_type="application/pdf",
            archive_filename=sample1_archive,
        )

        tasks.update_document_content_maybe_archive_file(doc.pk)

        # Reload once; the task writes through queryset.update(), so the
        # in-memory instance is stale.
        doc.refresh_from_db()
        self.assertNotEqual(doc.content, self.ORIGINAL_CONTENT)
        self.assertNotEqual(doc.archive_checksum, "wow")

    def test_update_content_maybe_archive_file_no_archive(self):
        """
        GIVEN:
            - Existing document without archive file
        WHEN:
            - Update content task is called
        THEN:
            - Document is reprocessed, content is updated
        """
        sample1 = self._copy_sample(self.dirs.scratch_dir / "sample.pdf")

        doc = Document.objects.create(
            title="test",
            content=self.ORIGINAL_CONTENT,
            checksum="wow",
            filename=sample1,
            mime_type="application/pdf",
        )

        tasks.update_document_content_maybe_archive_file(doc.pk)

        # Compare against the original content (not the title) so the check
        # actually detects a task that failed to update the document.
        doc.refresh_from_db()
        self.assertNotEqual(doc.content, self.ORIGINAL_CONTENT)