From 24767f62c7a3a60610a6250a35c8ae363c47f92a Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sun, 29 Nov 2020 12:31:26 +0100 Subject: [PATCH] added checksums for archived documents. --- src/documents/consumer.py | 8 ++++++- .../management/commands/document_archiver.py | 5 +++- src/documents/migrations/1005_checksums.py | 23 ++++++++++++++++++ src/documents/models.py | 12 +++++++--- src/documents/sanity_checker.py | 24 +++++++++++-------- 5 files changed, 57 insertions(+), 15 deletions(-) create mode 100644 src/documents/migrations/1005_checksums.py diff --git a/src/documents/consumer.py b/src/documents/consumer.py index f38c726ac..fba89d46c 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -6,6 +6,7 @@ import os import magic from django.conf import settings from django.db import transaction +from django.db.models import Q from django.utils import timezone from .classifier import DocumentClassifier, IncompatibleClassifierVersionError @@ -42,7 +43,7 @@ class Consumer(LoggingMixin): def pre_check_duplicate(self): with open(self.path, "rb") as f: checksum = hashlib.md5(f.read()).hexdigest() - if Document.objects.filter(checksum=checksum).exists(): + if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists(): # NOQA: E501 if settings.CONSUMER_DELETE_DUPLICATES: os.unlink(self.path) raise ConsumerError( @@ -184,6 +185,11 @@ class Consumer(LoggingMixin): self._write(document.storage_type, archive_path, document.archive_path) + with open(archive_path, 'rb') as f: + document.archive_checksum = hashlib.md5( + f.read()).hexdigest() + document.save() + # Delete the file only if it was successfully consumed self.log("debug", "Deleting file {}".format(self.path)) os.unlink(self.path) diff --git a/src/documents/management/commands/document_archiver.py b/src/documents/management/commands/document_archiver.py index 88777be9a..9ade7c0e4 100644 --- a/src/documents/management/commands/document_archiver.py +++ 
b/src/documents/management/commands/document_archiver.py @@ -1,3 +1,4 @@ +import hashlib import multiprocessing import ocrmypdf @@ -27,6 +28,8 @@ def handle_document(document): parser.parse(document.source_path, mime_type) if parser.get_archive_path(): shutil.copy(parser.get_archive_path(), document.archive_path) + with document.archive_file as f: + document.archive_checksum = hashlib.md5(f.read()).hexdigest() else: logging.getLogger(__name__).warning( f"Parser {parser} did not produce an archived document " @@ -35,7 +38,7 @@ def handle_document(document): if parser.get_text(): document.content = parser.get_text() - document.save() + document.save() parser.cleanup() diff --git a/src/documents/migrations/1005_checksums.py b/src/documents/migrations/1005_checksums.py new file mode 100644 index 000000000..401de2e1d --- /dev/null +++ b/src/documents/migrations/1005_checksums.py @@ -0,0 +1,23 @@ +# Generated by Django 3.1.3 on 2020-11-29 00:48 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '1004_sanity_check_schedule'), + ] + + operations = [ + migrations.AddField( + model_name='document', + name='archive_checksum', + field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True), + ), + migrations.AlterField( + model_name='document', + name='checksum', + field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True), + ), + ] diff --git a/src/documents/models.py b/src/documents/models.py index 358749fae..bac6de806 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -157,9 +157,15 @@ class Document(models.Model): max_length=32, editable=False, unique=True, - help_text="The checksum of the original document (before it was " - "encrypted). We use this to prevent duplicate document " - "imports." + help_text="The checksum of the original document." 
+ ) + + archive_checksum = models.CharField( + max_length=32, + editable=False, + blank=True, + null=True, + help_text="The checksum of the archived document." ) created = models.DateTimeField( diff --git a/src/documents/sanity_checker.py b/src/documents/sanity_checker.py index 706a2cd03..d4a809c11 100644 --- a/src/documents/sanity_checker.py +++ b/src/documents/sanity_checker.py @@ -67,30 +67,34 @@ def check_sanity(): f"Original of document {doc.pk} does not exist.")) else: present_files.remove(os.path.normpath(doc.source_path)) - checksum = None try: with doc.source_file as f: checksum = hashlib.md5(f.read()).hexdigest() except OSError as e: messages.append(SanityError( f"Cannot read original file of document {doc.pk}: {e}")) - - if checksum and not checksum == doc.checksum: - messages.append(SanityError( - f"Checksum mismatch of document {doc.pk}. " - f"Stored: {doc.checksum}, actual: {checksum}." - )) + else: + if not checksum == doc.checksum: + messages.append(SanityError( + f"Checksum mismatch of document {doc.pk}. " + f"Stored: {doc.checksum}, actual: {checksum}." + )) if os.path.isfile(doc.archive_path): + present_files.remove(os.path.normpath(doc.archive_path)) try: with doc.archive_file as f: - f.read() + checksum = hashlib.md5(f.read()).hexdigest() except OSError as e: messages.append(SanityError( f"Cannot read archive file of document {doc.pk}: {e}" )) - - present_files.remove(os.path.normpath(doc.archive_path)) + else: + if not checksum == doc.archive_checksum: + messages.append(SanityError( + f"Checksum mismatch of archived document {doc.pk}. " + f"Stored: {doc.archive_checksum}, actual: {checksum}." + )) if not doc.content: messages.append(SanityWarning(