more testing of the migration

This commit is contained in:
jonaswinkler 2021-02-10 16:58:55 +01:00
parent 6c8f010f7a
commit 04519ee623
2 changed files with 95 additions and 12 deletions

View File

@ -160,7 +160,12 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
############################################################################### ###############################################################################
def parse_wrapper(parser, path, mime_type, file_name):
    """Run ``parser.parse`` on a single document.

    This thin indirection exists only so that tests can patch it and
    simulate parser behaviour without invoking a real parser.

    :param parser: parser instance whose ``parse`` method is invoked.
    :param path: path of the file to parse.
    :param mime_type: MIME type passed through to the parser.
    :param file_name: file name passed through to the parser.
    :return: ``None``; results are read from the parser afterwards.
    """
    # this is here so that I can mock this out for testing.
    parser.parse(path, mime_type, file_name)
def create_archive_version(doc, retry_count=3):
from documents.parsers import get_parser_class_for_mime_type, \ from documents.parsers import get_parser_class_for_mime_type, \
DocumentParser, \ DocumentParser, \
ParseError ParseError
@ -172,7 +177,7 @@ def create_archive_version(doc, retry_count=4):
for try_num in range(retry_count): for try_num in range(retry_count):
parser: DocumentParser = parser_class(None, None) parser: DocumentParser = parser_class(None, None)
try: try:
parser.parse(source_path(doc), doc.mime_type, parse_wrapper(parser, source_path(doc), doc.mime_type,
os.path.basename(doc.filename)) os.path.basename(doc.filename))
doc.content = parser.get_text() doc.content = parser.get_text()
@ -225,25 +230,28 @@ def move_old_to_new_locations(apps, schema_editor):
for doc in Document.objects.filter(archive_checksum__isnull=False): for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc) old_path = archive_path_old(doc)
if not os.path.isfile(old_path):
raise ValueError(
f"Archived document ID:{doc.id} does not exist at: "
f"{old_path}")
if old_path in old_archive_path_to_id: if old_path in old_archive_path_to_id:
affected_document_ids.add(doc.id) affected_document_ids.add(doc.id)
affected_document_ids.add(old_archive_path_to_id[old_path]) affected_document_ids.add(old_archive_path_to_id[old_path])
else: else:
old_archive_path_to_id[old_path] = doc.id old_archive_path_to_id[old_path] = doc.id
# check that we can regenerate these archive versions # check that archive files of all unaffected documents are in place
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
if doc.id not in affected_document_ids and not os.path.isfile(old_path):
raise ValueError(
f"Archived document ID:{doc.id} does not exist at: "
f"{old_path}")
# check that we can regenerate affected archive versions
for doc_id in affected_document_ids: for doc_id in affected_document_ids:
from documents.parsers import get_parser_class_for_mime_type from documents.parsers import get_parser_class_for_mime_type
doc = Document.objects.get(id=doc_id) doc = Document.objects.get(id=doc_id)
parser_class = get_parser_class_for_mime_type(doc.mime_type) parser_class = get_parser_class_for_mime_type(doc.mime_type)
if not parser_class: if not parser_class:
raise Exception( raise ValueError(
f"Document ID:{doc.id} has an invalid archived document, " f"Document ID:{doc.id} has an invalid archived document, "
f"but no parsers are available. Cannot migrate.") f"but no parsers are available. Cannot migrate.")
@ -253,6 +261,9 @@ def move_old_to_new_locations(apps, schema_editor):
old_path = archive_path_old(doc) old_path = archive_path_old(doc)
# remove affected archive versions # remove affected archive versions
if os.path.isfile(old_path): if os.path.isfile(old_path):
logger.debug(
f"Removing {old_path}"
)
os.unlink(old_path) os.unlink(old_path)
else: else:
# Set archive path for unaffected files # Set archive path for unaffected files
@ -267,8 +278,6 @@ def move_old_to_new_locations(apps, schema_editor):
create_archive_version(doc) create_archive_version(doc)
def move_new_to_old_locations(apps, schema_editor): def move_new_to_old_locations(apps, schema_editor):
Document = apps.get_model("documents", "Document") Document = apps.get_model("documents", "Document")

View File

@ -2,10 +2,12 @@ import hashlib
import os import os
import shutil import shutil
from pathlib import Path from pathlib import Path
from unittest import mock
from django.conf import settings from django.conf import settings
from django.test import override_settings from django.test import override_settings
from documents.parsers import ParseError
from documents.tests.utils import DirectoriesMixin, TestMigrations from documents.tests.utils import DirectoriesMixin, TestMigrations
@ -169,6 +171,11 @@ class TestMigrateArchiveFilesWithFilenameFormat(TestMigrateArchiveFiles):
self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf") self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf")
def fake_parse_wrapper(parser, path, mime_type, file_name):
    """Stand-in for ``parse_wrapper`` that performs no parsing.

    It sets the parser's ``text`` to a fixed string and its
    ``archive_path`` to ``None``, i.e. a parse that yields text but no
    archive file.

    ``path``, ``mime_type`` and ``file_name`` are accepted only to match
    the signature of the function being replaced; they are ignored.
    """
    parser.archive_path = None
    parser.text = "the text"
@override_settings(PAPERLESS_FILENAME_FORMAT="") @override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations): class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):
@ -185,6 +192,73 @@ class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):
self.assertRaisesMessage(ValueError, "does not exist at: ", self.performMigration) self.assertRaisesMessage(ValueError, "does not exist at: ", self.performMigration)
def test_parser_missing(self):
    """The migration raises ``ValueError`` when no parser is available.

    Two documents with the same title and the made-up MIME type
    ``invalid/typesss768`` are created; the migration is then expected
    to fail with a "no parsers are available" error.
    """
    Document = self.apps.get_model("documents", "Document")

    # Only the side effect of creating the documents matters here; the
    # returned instances are not needed.
    fixtures = (
        (simple_png, "document.png"),
        (simple_jpg, "document.jpg"),
    )
    for original, file_name in fixtures:
        make_test_document(
            Document, "document", "invalid/typesss768",
            original, file_name, simple_pdf)

    with self.assertRaisesMessage(ValueError, "no parsers are available"):
        self.performMigration()
@mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
def test_parser_error(self, m):
    """Archive metadata is cleared when every parse attempt fails.

    ``parse_wrapper`` is patched to raise ``ParseError`` on each call.
    After the migration, the test expects six parse attempts in total,
    four "will try again" log lines, two "Unable to regenerate" log
    lines, and both documents left without archive checksum or
    archive filename.
    """
    def occurrences(needle, lines):
        # Number of captured log lines containing ``needle``.
        return sum(1 for line in lines if needle in line)

    m.side_effect = ParseError()
    Document = self.apps.get_model("documents", "Document")

    doc1 = make_test_document(
        Document, "document", "image/png",
        simple_png, "document.png", simple_pdf)
    doc2 = make_test_document(
        Document, "document", "application/pdf",
        simple_jpg, "document.jpg", simple_pdf)

    # Precondition: both documents start out with an archive version.
    for created in (doc1, doc2):
        self.assertIsNotNone(created.archive_checksum)

    with self.assertLogs() as capture:
        self.performMigration()

    self.assertEqual(m.call_count, 6)
    self.assertEqual(
        occurrences("Parse error, will try again in 5 seconds",
                    capture.output),
        4)
    self.assertEqual(
        occurrences("Unable to regenerate archive document for ID:",
                    capture.output),
        2)

    # Look the model up again after the migration before re-reading
    # the documents.
    Document = self.apps.get_model("documents", "Document")
    migrated = [Document.objects.get(id=created.id)
                for created in (doc1, doc2)]

    for field in ("archive_checksum", "archive_filename"):
        for doc in migrated:
            self.assertIsNone(getattr(doc, field))
@mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
def test_parser_no_archive(self, m):
    """Archive metadata is cleared when the parser yields no archive.

    ``parse_wrapper`` is replaced by ``fake_parse_wrapper``, which sets
    text on the parser but leaves ``archive_path`` as ``None``. The
    test expects one "Parser did not return an archive document" log
    line per document and both documents left without archive
    checksum or archive filename.
    """
    def occurrences(needle, lines):
        # Number of captured log lines containing ``needle``.
        return sum(1 for line in lines if needle in line)

    m.side_effect = fake_parse_wrapper
    Document = self.apps.get_model("documents", "Document")

    doc1 = make_test_document(
        Document, "document", "image/png",
        simple_png, "document.png", simple_pdf)
    doc2 = make_test_document(
        Document, "document", "application/pdf",
        simple_jpg, "document.jpg", simple_pdf)

    with self.assertLogs() as capture:
        self.performMigration()

    self.assertEqual(
        occurrences(
            "Parser did not return an archive document for document",
            capture.output),
        2)

    # Look the model up again after the migration before re-reading
    # the documents.
    Document = self.apps.get_model("documents", "Document")
    migrated = [Document.objects.get(id=created.id)
                for created in (doc1, doc2)]

    for field in ("archive_checksum", "archive_filename"):
        for doc in migrated:
            self.assertIsNone(getattr(doc, field))
@override_settings(PAPERLESS_FILENAME_FORMAT="") @override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesBackwards(DirectoriesMixin, TestMigrations): class TestMigrateArchiveFilesBackwards(DirectoriesMixin, TestMigrations):