mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
added checksums for archived documents.
This commit is contained in:
parent
fdaf419a7e
commit
24767f62c7
@ -6,6 +6,7 @@ import os
|
||||
import magic
|
||||
from django.conf import settings
|
||||
from django.db import transaction
|
||||
from django.db.models import Q
|
||||
from django.utils import timezone
|
||||
|
||||
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
|
||||
@ -42,7 +43,7 @@ class Consumer(LoggingMixin):
|
||||
def pre_check_duplicate(self):
|
||||
with open(self.path, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
if Document.objects.filter(checksum=checksum).exists():
|
||||
if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists(): # NOQA: E501
|
||||
if settings.CONSUMER_DELETE_DUPLICATES:
|
||||
os.unlink(self.path)
|
||||
raise ConsumerError(
|
||||
@ -184,6 +185,11 @@ class Consumer(LoggingMixin):
|
||||
self._write(document.storage_type,
|
||||
archive_path, document.archive_path)
|
||||
|
||||
with open(archive_path, 'rb') as f:
|
||||
document.archive_checksum = hashlib.md5(
|
||||
f.read()).hexdigest()
|
||||
document.save()
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log("debug", "Deleting file {}".format(self.path))
|
||||
os.unlink(self.path)
|
||||
|
@ -1,3 +1,4 @@
|
||||
import hashlib
|
||||
import multiprocessing
|
||||
|
||||
import ocrmypdf
|
||||
@ -27,6 +28,8 @@ def handle_document(document):
|
||||
parser.parse(document.source_path, mime_type)
|
||||
if parser.get_archive_path():
|
||||
shutil.copy(parser.get_archive_path(), document.archive_path)
|
||||
with document.archive_file as f:
|
||||
document.archive_checksum = hashlib.md5(f.read()).hexdigest()
|
||||
else:
|
||||
logging.getLogger(__name__).warning(
|
||||
f"Parser {parser} did not produce an archived document "
|
||||
|
23
src/documents/migrations/1005_checksums.py
Normal file
23
src/documents/migrations/1005_checksums.py
Normal file
@ -0,0 +1,23 @@
|
||||
# Generated by Django 3.1.3 on 2020-11-29 00:48
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Schema migration 1005: add an archive checksum to ``Document``.

    Two operations are applied to the ``document`` model:

    * ``archive_checksum`` is added as an optional (``blank=True``,
      ``null=True``), non-editable 32-character field, so rows without a
      stored archive checksum stay valid.
    * ``checksum`` is re-declared as a non-editable, unique 32-character
      field with the shortened help text.

    Runs after ``1004_sanity_check_schedule``.
    """

    dependencies = [
        ('documents', '1004_sanity_check_schedule'),
    ]

    operations = [
        # New, nullable column holding the checksum of the archived document.
        migrations.AddField(
            model_name='document',
            name='archive_checksum',
            field=models.CharField(
                blank=True,
                editable=False,
                help_text='The checksum of the archived document.',
                max_length=32,
                null=True,
            ),
        ),
        # Existing column for the original document: still unique.
        migrations.AlterField(
            model_name='document',
            name='checksum',
            field=models.CharField(
                editable=False,
                help_text='The checksum of the original document.',
                max_length=32,
                unique=True,
            ),
        ),
    ]
|
@ -157,9 +157,15 @@ class Document(models.Model):
|
||||
max_length=32,
|
||||
editable=False,
|
||||
unique=True,
|
||||
help_text="The checksum of the original document (before it was "
|
||||
"encrypted). We use this to prevent duplicate document "
|
||||
"imports."
|
||||
help_text="The checksum of the original document."
|
||||
)
|
||||
|
||||
archive_checksum = models.CharField(
|
||||
max_length=32,
|
||||
editable=False,
|
||||
blank=True,
|
||||
null=True,
|
||||
help_text="The checksum of the archived document."
|
||||
)
|
||||
|
||||
created = models.DateTimeField(
|
||||
|
@ -67,30 +67,34 @@ def check_sanity():
|
||||
f"Original of document {doc.pk} does not exist."))
|
||||
else:
|
||||
present_files.remove(os.path.normpath(doc.source_path))
|
||||
checksum = None
|
||||
try:
|
||||
with doc.source_file as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.append(SanityError(
|
||||
f"Cannot read original file of document {doc.pk}: {e}"))
|
||||
|
||||
if checksum and not checksum == doc.checksum:
|
||||
else:
|
||||
if not checksum == doc.checksum:
|
||||
messages.append(SanityError(
|
||||
f"Checksum mismatch of document {doc.pk}. "
|
||||
f"Stored: {doc.checksum}, actual: {checksum}."
|
||||
))
|
||||
|
||||
if os.path.isfile(doc.archive_path):
|
||||
present_files.remove(os.path.normpath(doc.archive_path))
|
||||
try:
|
||||
with doc.archive_file as f:
|
||||
f.read()
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.append(SanityError(
|
||||
f"Cannot read archive file of document {doc.pk}: {e}"
|
||||
))
|
||||
|
||||
present_files.remove(os.path.normpath(doc.archive_path))
|
||||
else:
|
||||
if not checksum == doc.archive_checksum:
|
||||
messages.append(SanityError(
|
||||
f"Checksum mismatch of document {doc.pk}. "
|
||||
f"Stored: {doc.checksum}, actual: {checksum}."
|
||||
))
|
||||
|
||||
if not doc.content:
|
||||
messages.append(SanityWarning(
|
||||
|
Loading…
x
Reference in New Issue
Block a user