Change: change update content to handle archive disabled (#8315)

This commit is contained in:
shamoon 2024-11-20 12:01:13 -08:00 committed by GitHub
parent d7d3fed833
commit 9c1561adfb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 124 additions and 32 deletions

View File

@ -24,7 +24,7 @@ from documents.models import StoragePath
from documents.permissions import set_permissions_for_object
from documents.tasks import bulk_update_documents
from documents.tasks import consume_file
from documents.tasks import update_document_archive_file
from documents.tasks import update_document_content_maybe_archive_file
logger: logging.Logger = logging.getLogger("paperless.bulk_edit")
@ -191,7 +191,7 @@ def delete(doc_ids: list[int]) -> Literal["OK"]:
def reprocess(doc_ids: list[int]) -> Literal["OK"]:
for document_id in doc_ids:
update_document_archive_file.delay(
update_document_content_maybe_archive_file.delay(
document_id=document_id,
)
@ -245,7 +245,7 @@ def rotate(doc_ids: list[int], degrees: int) -> Literal["OK"]:
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
doc.save()
rotate_tasks.append(
update_document_archive_file.s(
update_document_content_maybe_archive_file.s(
document_id=doc.id,
),
)
@ -423,7 +423,7 @@ def delete_pages(doc_ids: list[int], pages: list[int]) -> Literal["OK"]:
if doc.page_count is not None:
doc.page_count = doc.page_count - len(pages)
doc.save()
update_document_archive_file.delay(document_id=doc.id)
update_document_content_maybe_archive_file.delay(document_id=doc.id)
logger.info(f"Deleted pages {pages} from document {doc.id}")
except Exception as e:
logger.exception(f"Error deleting pages from document {doc.id}: {e}")

View File

@ -9,7 +9,7 @@ from django.core.management.base import BaseCommand
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.tasks import update_document_archive_file
from documents.tasks import update_document_content_maybe_archive_file
logger = logging.getLogger("paperless.management.archiver")
@ -77,13 +77,13 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
if self.process_count == 1:
for doc_id in document_ids:
update_document_archive_file(doc_id)
update_document_content_maybe_archive_file(doc_id)
else: # pragma: no cover
with multiprocessing.Pool(self.process_count) as pool:
list(
tqdm.tqdm(
pool.imap_unordered(
update_document_archive_file,
update_document_content_maybe_archive_file,
document_ids,
),
total=len(document_ids),

View File

@ -206,9 +206,10 @@ def bulk_update_documents(document_ids):
@shared_task
def update_document_archive_file(document_id):
def update_document_content_maybe_archive_file(document_id):
"""
Re-creates the archive file of a document, including new OCR content and thumbnail
Re-creates OCR content and thumbnail for a document, and archive file if
it exists.
"""
document = Document.objects.get(id=document_id)
@ -234,8 +235,9 @@ def update_document_archive_file(document_id):
document.get_public_filename(),
)
if parser.get_archive_path():
with transaction.atomic():
with transaction.atomic():
oldDocument = Document.objects.get(pk=document.pk)
if parser.get_archive_path():
with open(parser.get_archive_path(), "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
# I'm going to save first so that in case the file move
@ -246,7 +248,6 @@ def update_document_archive_file(document_id):
document,
archive_filename=True,
)
oldDocument = Document.objects.get(pk=document.pk)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text(),
@ -268,24 +269,41 @@ def update_document_archive_file(document_id):
],
},
additional_data={
"reason": "Update document archive file",
"reason": "Update document content",
},
action=LogEntry.Action.UPDATE,
)
else:
Document.objects.filter(pk=document.pk).update(
content=parser.get_text(),
)
if settings.AUDIT_LOG_ENABLED:
LogEntry.objects.log_create(
instance=oldDocument,
changes={
"content": [oldDocument.content, parser.get_text()],
},
additional_data={
"reason": "Update document content",
},
action=LogEntry.Action.UPDATE,
)
with FileLock(settings.MEDIA_LOCK):
with FileLock(settings.MEDIA_LOCK):
if parser.get_archive_path():
create_source_path_directory(document.archive_path)
shutil.move(parser.get_archive_path(), document.archive_path)
shutil.move(thumbnail, document.thumbnail_path)
shutil.move(thumbnail, document.thumbnail_path)
document.refresh_from_db()
logger.info(
f"Updating index for document {document_id} ({document.archive_checksum})",
)
with index.open_index_writer() as writer:
index.update_document(writer, document)
document.refresh_from_db()
logger.info(
f"Updating index for document {document_id} ({document.archive_checksum})",
)
with index.open_index_writer() as writer:
index.update_document(writer, document)
clear_document_caches(document.pk)
clear_document_caches(document.pk)
except Exception:
logger.exception(

View File

@ -607,7 +607,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
mock_consume_file.assert_not_called()
@mock.patch("documents.tasks.bulk_update_documents.si")
@mock.patch("documents.tasks.update_document_archive_file.s")
@mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
@mock.patch("celery.chord.delay")
def test_rotate(self, mock_chord, mock_update_document, mock_update_documents):
"""
@ -626,7 +626,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
self.assertEqual(result, "OK")
@mock.patch("documents.tasks.bulk_update_documents.si")
@mock.patch("documents.tasks.update_document_archive_file.s")
@mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
@mock.patch("pikepdf.Pdf.save")
def test_rotate_with_error(
self,
@ -654,7 +654,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
mock_update_archive_file.assert_not_called()
@mock.patch("documents.tasks.bulk_update_documents.si")
@mock.patch("documents.tasks.update_document_archive_file.s")
@mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
@mock.patch("celery.chord.delay")
def test_rotate_non_pdf(
self,
@ -680,7 +680,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
mock_chord.assert_called_once()
self.assertEqual(result, "OK")
@mock.patch("documents.tasks.update_document_archive_file.delay")
@mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
@mock.patch("pikepdf.Pdf.save")
def test_delete_pages(self, mock_pdf_save, mock_update_archive_file):
"""
@ -705,7 +705,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
self.doc2.refresh_from_db()
self.assertEqual(self.doc2.page_count, expected_page_count)
@mock.patch("documents.tasks.update_document_archive_file.delay")
@mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
@mock.patch("pikepdf.Pdf.save")
def test_delete_pages_with_error(self, mock_pdf_save, mock_update_archive_file):
"""

View File

@ -13,7 +13,7 @@ from django.test import override_settings
from documents.file_handling import generate_filename
from documents.models import Document
from documents.tasks import update_document_archive_file
from documents.tasks import update_document_content_maybe_archive_file
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
@ -46,7 +46,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
)
update_document_archive_file(doc.pk)
update_document_content_maybe_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id)
@ -63,7 +63,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
doc.save()
shutil.copy(sample_file, doc.source_path)
update_document_archive_file(doc.pk)
update_document_content_maybe_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id)
@ -94,8 +94,8 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
os.path.join(self.dirs.originals_dir, "document_01.pdf"),
)
update_document_archive_file(doc2.pk)
update_document_archive_file(doc1.pk)
update_document_content_maybe_archive_file(doc2.pk)
update_document_content_maybe_archive_file(doc1.pk)
doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)

View File

@ -1,5 +1,7 @@
import os
import shutil
from datetime import timedelta
from pathlib import Path
from unittest import mock
from django.conf import settings
@ -184,3 +186,75 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
tasks.empty_trash()
self.assertEqual(Document.global_objects.count(), 0)
class TestUpdateContent(DirectoriesMixin, TestCase):
def test_update_content_maybe_archive_file(self):
"""
GIVEN:
- Existing document with archive file
WHEN:
- Update content task is called
THEN:
- Document is reprocessed, content and checksum are updated
"""
sample1 = self.dirs.scratch_dir / "sample.pdf"
shutil.copy(
Path(__file__).parent
/ "samples"
/ "documents"
/ "originals"
/ "0000001.pdf",
sample1,
)
sample1_archive = self.dirs.archive_dir / "sample_archive.pdf"
shutil.copy(
Path(__file__).parent
/ "samples"
/ "documents"
/ "originals"
/ "0000001.pdf",
sample1_archive,
)
doc = Document.objects.create(
title="test",
content="my document",
checksum="wow",
archive_checksum="wow",
filename=sample1,
mime_type="application/pdf",
archive_filename=sample1_archive,
)
tasks.update_document_content_maybe_archive_file(doc.pk)
self.assertNotEqual(Document.objects.get(pk=doc.pk).content, "test")
self.assertNotEqual(Document.objects.get(pk=doc.pk).archive_checksum, "wow")
def test_update_content_maybe_archive_file_no_archive(self):
"""
GIVEN:
- Existing document without archive file
WHEN:
- Update content task is called
THEN:
- Document is reprocessed, content is updated
"""
sample1 = self.dirs.scratch_dir / "sample.pdf"
shutil.copy(
Path(__file__).parent
/ "samples"
/ "documents"
/ "originals"
/ "0000001.pdf",
sample1,
)
doc = Document.objects.create(
title="test",
content="my document",
checksum="wow",
filename=sample1,
mime_type="application/pdf",
)
tasks.update_document_content_maybe_archive_file(doc.pk)
self.assertNotEqual(Document.objects.get(pk=doc.pk).content, "test")