added checksums for archived documents.

jonaswinkler 2020-11-29 12:31:26 +01:00
parent fdaf419a7e
commit 24767f62c7
5 changed files with 57 additions and 15 deletions

View File

@@ -6,6 +6,7 @@ import os
 import magic
 from django.conf import settings
 from django.db import transaction
+from django.db.models import Q
 from django.utils import timezone
 from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
@@ -42,7 +43,7 @@ class Consumer(LoggingMixin):
     def pre_check_duplicate(self):
         with open(self.path, "rb") as f:
             checksum = hashlib.md5(f.read()).hexdigest()
-        if Document.objects.filter(checksum=checksum).exists():
+        if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501
             if settings.CONSUMER_DELETE_DUPLICATES:
                 os.unlink(self.path)
             raise ConsumerError(
@@ -184,6 +185,11 @@ class Consumer(LoggingMixin):
             self._write(document.storage_type,
                         archive_path, document.archive_path)
+
+            with open(archive_path, 'rb') as f:
+                document.archive_checksum = hashlib.md5(
+                    f.read()).hexdigest()
+                document.save()
 
         # Delete the file only if it was successfully consumed
         self.log("debug", "Deleting file {}".format(self.path))
         os.unlink(self.path)
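
A note on the duplicate check above: the consumer hashes the incoming file once with MD5 and now matches the digest against both checksum columns, so re-consuming a copy of a previously produced archive file is also rejected as a duplicate. A minimal standalone sketch of that check, assuming the Document model is importable as documents.models.Document:

import hashlib

from django.db.models import Q

from documents.models import Document  # import path assumed


def is_duplicate(path):
    # Hash the incoming file the same way pre_check_duplicate() does.
    with open(path, "rb") as f:
        checksum = hashlib.md5(f.read()).hexdigest()
    # A match against either the original checksum or the new
    # archive checksum counts as a duplicate.
    return Document.objects.filter(
        Q(checksum=checksum) | Q(archive_checksum=checksum)
    ).exists()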

View File

@@ -1,3 +1,4 @@
+import hashlib
 import multiprocessing
 import ocrmypdf
@@ -27,6 +28,8 @@ def handle_document(document):
    parser.parse(document.source_path, mime_type)
    if parser.get_archive_path():
        shutil.copy(parser.get_archive_path(), document.archive_path)
+        with document.archive_file as f:
+            document.archive_checksum = hashlib.md5(f.read()).hexdigest()
    else:
        logging.getLogger(__name__).warning(
            f"Parser {parser} did not produce an archived document "
@@ -35,7 +38,7 @@ def handle_document(document):
    if parser.get_text():
        document.content = parser.get_text()
-        document.save()
+    document.save()
    parser.cleanup()

View File

@@ -0,0 +1,23 @@
+# Generated by Django 3.1.3 on 2020-11-29 00:48
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '1004_sanity_check_schedule'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='document',
+            name='archive_checksum',
+            field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True),
+        ),
+        migrations.AlterField(
+            model_name='document',
+            name='checksum',
+            field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True),
+        ),
+    ]
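
Because archive_checksum is added with blank=True and null=True, this migration leaves existing rows empty rather than computing digests during the schema change. A backfill could then be run separately, e.g. from a Django shell; a rough sketch, assuming each document's archive_path property points at a readable file:

import hashlib

from documents.models import Document  # import path assumed

for doc in Document.objects.filter(archive_checksum__isnull=True):
    try:
        with open(doc.archive_path, "rb") as f:
            doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
        doc.save(update_fields=["archive_checksum"])
    except OSError:
        # No readable archive file for this document; leave the field empty.
        pass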

View File

@@ -157,9 +157,15 @@ class Document(models.Model):
         max_length=32,
         editable=False,
         unique=True,
-        help_text="The checksum of the original document (before it was "
-                  "encrypted). We use this to prevent duplicate document "
-                  "imports."
+        help_text="The checksum of the original document."
+    )
+
+    archive_checksum = models.CharField(
+        max_length=32,
+        editable=False,
+        blank=True,
+        null=True,
+        help_text="The checksum of the archived document."
     )
 
     created = models.DateTimeField(

View File

@@ -67,30 +67,34 @@ def check_sanity():
                 f"Original of document {doc.pk} does not exist."))
         else:
             present_files.remove(os.path.normpath(doc.source_path))
-            checksum = None
             try:
                 with doc.source_file as f:
                     checksum = hashlib.md5(f.read()).hexdigest()
             except OSError as e:
                 messages.append(SanityError(
                     f"Cannot read original file of document {doc.pk}: {e}"))
-            if checksum and not checksum == doc.checksum:
-                messages.append(SanityError(
-                    f"Checksum mismatch of document {doc.pk}. "
-                    f"Stored: {doc.checksum}, actual: {checksum}."
-                ))
+            else:
+                if not checksum == doc.checksum:
+                    messages.append(SanityError(
+                        f"Checksum mismatch of document {doc.pk}. "
+                        f"Stored: {doc.checksum}, actual: {checksum}."
+                    ))
 
         if os.path.isfile(doc.archive_path):
+            present_files.remove(os.path.normpath(doc.archive_path))
             try:
                 with doc.archive_file as f:
-                    f.read()
+                    checksum = hashlib.md5(f.read()).hexdigest()
             except OSError as e:
                 messages.append(SanityError(
                     f"Cannot read archive file of document {doc.pk}: {e}"
                 ))
-            present_files.remove(os.path.normpath(doc.archive_path))
+            else:
+                if not checksum == doc.archive_checksum:
+                    messages.append(SanityError(
+                        f"Checksum mismatch of document {doc.pk}. "
+                        f"Stored: {doc.checksum}, actual: {checksum}."
+                    ))
 
         if not doc.content:
             messages.append(SanityWarning(
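
Both checksum verifications in this file now converge on the same try/except/else shape: the comparison runs only in the else branch, i.e. only when the file was actually readable, which is why the separate checksum = None guard could be dropped. Stripped down to the pattern itself (function and parameter names hypothetical):

import hashlib


def verify_checksum(path, stored, report):
    try:
        with open(path, "rb") as f:
            actual = hashlib.md5(f.read()).hexdigest()
    except OSError as e:
        report(f"Cannot read {path}: {e}")
    else:
        # Only reached when the read succeeded, so `actual` is always defined.
        if actual != stored:
            report(f"Checksum mismatch: stored {stored}, actual {actual}")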