Added checksums for archived documents.

This commit is contained in:
jonaswinkler
2020-11-29 12:31:26 +01:00
parent fdaf419a7e
commit 24767f62c7
5 changed files with 57 additions and 15 deletions

View File

@@ -6,6 +6,7 @@ import os
import magic
from django.conf import settings
from django.db import transaction
from django.db.models import Q
from django.utils import timezone
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
@@ -42,7 +43,7 @@ class Consumer(LoggingMixin):
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
if Document.objects.filter(checksum=checksum).exists():
if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists(): # NOQA: E501
if settings.CONSUMER_DELETE_DUPLICATES:
os.unlink(self.path)
raise ConsumerError(
@@ -184,6 +185,11 @@ class Consumer(LoggingMixin):
self._write(document.storage_type,
archive_path, document.archive_path)
with open(archive_path, 'rb') as f:
document.archive_checksum = hashlib.md5(
f.read()).hexdigest()
document.save()
# Delete the file only if it was successfully consumed
self.log("debug", "Deleting file {}".format(self.path))
os.unlink(self.path)