diff --git a/src/documents/migrations/1012_fix_archive_files.py b/src/documents/migrations/1012_fix_archive_files.py index 4fa258b8a..5f5064396 100644 --- a/src/documents/migrations/1012_fix_archive_files.py +++ b/src/documents/migrations/1012_fix_archive_files.py @@ -160,7 +160,12 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False): ############################################################################### -def create_archive_version(doc, retry_count=4): +def parse_wrapper(parser, path, mime_type, file_name): + # this is here so that I can mock this out for testing. + parser.parse(path, mime_type, file_name) + + +def create_archive_version(doc, retry_count=3): from documents.parsers import get_parser_class_for_mime_type, \ DocumentParser, \ ParseError @@ -172,8 +177,8 @@ def create_archive_version(doc, retry_count=4): for try_num in range(retry_count): parser: DocumentParser = parser_class(None, None) try: - parser.parse(source_path(doc), doc.mime_type, - os.path.basename(doc.filename)) + parse_wrapper(parser, source_path(doc), doc.mime_type, + os.path.basename(doc.filename)) doc.content = parser.get_text() if parser.get_archive_path() and os.path.isfile( @@ -225,25 +230,28 @@ def move_old_to_new_locations(apps, schema_editor): for doc in Document.objects.filter(archive_checksum__isnull=False): old_path = archive_path_old(doc) - if not os.path.isfile(old_path): - raise ValueError( - f"Archived document ID:{doc.id} does not exist at: " - f"{old_path}") - if old_path in old_archive_path_to_id: affected_document_ids.add(doc.id) affected_document_ids.add(old_archive_path_to_id[old_path]) else: old_archive_path_to_id[old_path] = doc.id - # check that we can regenerate these archive versions + # check that archive files of all unaffected documents are in place + for doc in Document.objects.filter(archive_checksum__isnull=False): + old_path = archive_path_old(doc) + if doc.id not in affected_document_ids and not os.path.isfile(old_path): + raise ValueError( + f"Archived document ID:{doc.id} does not exist at: " + f"{old_path}") + + # check that we can regenerate affected archive versions for doc_id in affected_document_ids: from documents.parsers import get_parser_class_for_mime_type doc = Document.objects.get(id=doc_id) parser_class = get_parser_class_for_mime_type(doc.mime_type) if not parser_class: - raise Exception( + raise ValueError( f"Document ID:{doc.id} has an invalid archived document, " f"but no parsers are available. Cannot migrate.") @@ -253,6 +261,9 @@ def move_old_to_new_locations(apps, schema_editor): old_path = archive_path_old(doc) # remove affected archive versions if os.path.isfile(old_path): + logger.debug( + f"Removing {old_path}" + ) os.unlink(old_path) else: # Set archive path for unaffected files @@ -267,8 +278,6 @@ def move_old_to_new_locations(apps, schema_editor): create_archive_version(doc) - - def move_new_to_old_locations(apps, schema_editor): Document = apps.get_model("documents", "Document") diff --git a/src/documents/tests/test_migration_archive_files.py b/src/documents/tests/test_migration_archive_files.py index 637aaeee0..b3f2e916c 100644 --- a/src/documents/tests/test_migration_archive_files.py +++ b/src/documents/tests/test_migration_archive_files.py @@ -2,10 +2,12 @@ import hashlib import os import shutil from pathlib import Path +from unittest import mock from django.conf import settings from django.test import override_settings +from documents.parsers import ParseError from documents.tests.utils import DirectoriesMixin, TestMigrations @@ -169,6 +171,11 @@ class TestMigrateArchiveFilesWithFilenameFormat(TestMigrateArchiveFiles): self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf") +def fake_parse_wrapper(parser, path, mime_type, file_name): + parser.archive_path = None + parser.text = "the text" + + @override_settings(PAPERLESS_FILENAME_FORMAT="") class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations): @@ -185,6 +192,73 @@ class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations): self.assertRaisesMessage(ValueError, "does not exist at: ", self.performMigration) + def test_parser_missing(self): + Document = self.apps.get_model("documents", "Document") + + doc1 = make_test_document(Document, "document", "invalid/typesss768", simple_png, "document.png", simple_pdf) + doc2 = make_test_document(Document, "document", "invalid/typesss768", simple_jpg, "document.jpg", simple_pdf) + + self.assertRaisesMessage(ValueError, "no parsers are available", self.performMigration) + + @mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper") + def test_parser_error(self, m): + m.side_effect = ParseError() + Document = self.apps.get_model("documents", "Document") + + doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf) + doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf) + + self.assertIsNotNone(doc1.archive_checksum) + self.assertIsNotNone(doc2.archive_checksum) + + with self.assertLogs() as capture: + self.performMigration() + + self.assertEqual(m.call_count, 6) + + self.assertEqual( + len(list(filter(lambda log: "Parse error, will try again in 5 seconds" in log, capture.output))), + 4) + + self.assertEqual( + len(list(filter(lambda log: "Unable to regenerate archive document for ID:" in log, capture.output))), + 2) + + Document = self.apps.get_model("documents", "Document") + + doc1 = Document.objects.get(id=doc1.id) + doc2 = Document.objects.get(id=doc2.id) + + self.assertIsNone(doc1.archive_checksum) + self.assertIsNone(doc2.archive_checksum) + self.assertIsNone(doc1.archive_filename) + self.assertIsNone(doc2.archive_filename) + + @mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper") + def test_parser_no_archive(self, m): + m.side_effect = fake_parse_wrapper + + Document = self.apps.get_model("documents", "Document") + + doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf) + doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf) + + with self.assertLogs() as capture: + self.performMigration() + + self.assertEqual( + len(list(filter(lambda log: "Parser did not return an archive document for document" in log, capture.output))), + 2) + + Document = self.apps.get_model("documents", "Document") + + doc1 = Document.objects.get(id=doc1.id) + doc2 = Document.objects.get(id=doc2.id) + + self.assertIsNone(doc1.archive_checksum) + self.assertIsNone(doc2.archive_checksum) + self.assertIsNone(doc1.archive_filename) + self.assertIsNone(doc2.archive_filename) @override_settings(PAPERLESS_FILENAME_FORMAT="") class TestMigrateArchiveFilesBackwards(DirectoriesMixin, TestMigrations):