mirror of https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
added checksums for archived documents.
This commit is contained in:
parent fdaf419a7e
commit 24767f62c7
@@ -6,6 +6,7 @@ import os
 import magic
 from django.conf import settings
 from django.db import transaction
+from django.db.models import Q
 from django.utils import timezone
 
 from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
@@ -42,7 +43,7 @@ class Consumer(LoggingMixin):
     def pre_check_duplicate(self):
         with open(self.path, "rb") as f:
             checksum = hashlib.md5(f.read()).hexdigest()
-        if Document.objects.filter(checksum=checksum).exists():
+        if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501
             if settings.CONSUMER_DELETE_DUPLICATES:
                 os.unlink(self.path)
             raise ConsumerError(
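The duplicate pre-check now treats a match on either checksum column as a
duplicate, so re-consuming a previously produced archive file is also caught.
A self-contained sketch of the equivalent lookup, assuming the Document model
with the checksum and archive_checksum fields this commit introduces:

    import hashlib

    from django.db.models import Q

    from documents.models import Document


    def is_duplicate(path):
        # Hash the incoming file the same way pre_check_duplicate does.
        with open(path, "rb") as f:
            digest = hashlib.md5(f.read()).hexdigest()
        # A hit on either column means the file is already known.
        return Document.objects.filter(
            Q(checksum=digest) | Q(archive_checksum=digest)
        ).exists()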
@@ -184,6 +185,11 @@ class Consumer(LoggingMixin):
                 self._write(document.storage_type,
                             archive_path, document.archive_path)
 
+                with open(archive_path, 'rb') as f:
+                    document.archive_checksum = hashlib.md5(
+                        f.read()).hexdigest()
+                    document.save()
+
             # Delete the file only if it was successfully consumed
             self.log("debug", "Deleting file {}".format(self.path))
             os.unlink(self.path)
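The consumer hashes the archived file right after writing it and saves the
digest on the document. The commit hashes the whole file in a single read; a
chunked variant (a sketch, not what the commit ships) computes the same
digest with flat memory use on large archives:

    import hashlib


    def file_md5(path, chunk_size=1024 * 1024):
        # Stream the file through MD5 in 1 MiB chunks instead of one read.
        digest = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()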
@@ -1,3 +1,4 @@
+import hashlib
 import multiprocessing
 
 import ocrmypdf
@@ -27,6 +28,8 @@ def handle_document(document):
     parser.parse(document.source_path, mime_type)
     if parser.get_archive_path():
         shutil.copy(parser.get_archive_path(), document.archive_path)
+        with document.archive_file as f:
+            document.archive_checksum = hashlib.md5(f.read()).hexdigest()
     else:
         logging.getLogger(__name__).warning(
             f"Parser {parser} did not produce an archived document "
@@ -35,7 +38,7 @@ def handle_document(document):
 
     if parser.get_text():
         document.content = parser.get_text()
-        document.save()
+    document.save()
 
     parser.cleanup()
 
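Here document.save() moves out of the if parser.get_text() branch, so the
archive_checksum set above is persisted even when the parser extracts no
text. Roughly, the post-parse flow now looks like this (a sketch; the
function name archive_and_checksum is illustrative, the warning branch and
error handling are elided, and archive_file is assumed to be a readable
file handle exposed by the Document model, as the diff suggests):

    import hashlib
    import shutil


    def archive_and_checksum(parser, document, mime_type):
        # Sketch of handle_document's post-parse flow after this commit.
        parser.parse(document.source_path, mime_type)
        if parser.get_archive_path():
            shutil.copy(parser.get_archive_path(), document.archive_path)
            with document.archive_file as f:
                document.archive_checksum = hashlib.md5(f.read()).hexdigest()
        if parser.get_text():
            document.content = parser.get_text()
        document.save()  # runs unconditionally, so the checksum always lands
        parser.cleanup()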
src/documents/migrations/1005_checksums.py (Normal file, 23 lines)
@@ -0,0 +1,23 @@
+# Generated by Django 3.1.3 on 2020-11-29 00:48
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '1004_sanity_check_schedule'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='document',
+            name='archive_checksum',
+            field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True),
+        ),
+        migrations.AlterField(
+            model_name='document',
+            name='checksum',
+            field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True),
+        ),
+    ]
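Because the new column is blank and nullable, no data backfill is needed:
rows archived before this commit simply carry an empty archive_checksum
until the document is re-archived. A quick ORM query (a sketch) to find
such rows:

    from documents.models import Document

    # Documents whose archive predates checksum tracking.
    stale = Document.objects.filter(archive_checksum__isnull=True)
    print(stale.count())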
@@ -157,9 +157,15 @@ class Document(models.Model):
         max_length=32,
         editable=False,
         unique=True,
-        help_text="The checksum of the original document (before it was "
-                  "encrypted). We use this to prevent duplicate document "
-                  "imports."
+        help_text="The checksum of the original document."
     )
+
+    archive_checksum = models.CharField(
+        max_length=32,
+        editable=False,
+        blank=True,
+        null=True,
+        help_text="The checksum of the archived document."
+    )
 
     created = models.DateTimeField(
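Both columns use max_length=32 because an MD5 digest rendered as hex is
always exactly 32 characters (128 bits, 4 bits per hex digit):

    import hashlib

    # 128-bit digest -> 32 hex characters, matching max_length=32.
    assert len(hashlib.md5(b"any content").hexdigest()) == 32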
@@ -67,30 +67,34 @@ def check_sanity():
                 f"Original of document {doc.pk} does not exist."))
         else:
             present_files.remove(os.path.normpath(doc.source_path))
-            checksum = None
             try:
                 with doc.source_file as f:
                     checksum = hashlib.md5(f.read()).hexdigest()
             except OSError as e:
                 messages.append(SanityError(
                     f"Cannot read original file of document {doc.pk}: {e}"))
-            if checksum and not checksum == doc.checksum:
+            else:
+                if not checksum == doc.checksum:
                     messages.append(SanityError(
                         f"Checksum mismatch of document {doc.pk}. "
                         f"Stored: {doc.checksum}, actual: {checksum}."
                     ))
 
         if os.path.isfile(doc.archive_path):
+            present_files.remove(os.path.normpath(doc.archive_path))
             try:
                 with doc.archive_file as f:
-                    f.read()
+                    checksum = hashlib.md5(f.read()).hexdigest()
             except OSError as e:
                 messages.append(SanityError(
                     f"Cannot read archive file of document {doc.pk}: {e}"
                 ))
-            present_files.remove(os.path.normpath(doc.archive_path))
+            else:
+                if not checksum == doc.archive_checksum:
+                    messages.append(SanityError(
+                        f"Checksum mismatch of document {doc.pk}. "
+                        f"Stored: {doc.checksum}, actual: {checksum}."
+                    ))
 
         if not doc.content:
             messages.append(SanityWarning(
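The sanity checker drops the checksum = None sentinel in favor of
try/except/else: the comparison runs only in the else branch, i.e. when the
read actually succeeded, and the archive file is now hashed and verified
rather than merely opened and read. The pattern in isolation (a sketch;
verify_file, stored and report are illustrative names):

    import hashlib


    def verify_file(path, stored, report):
        try:
            with open(path, "rb") as f:
                actual = hashlib.md5(f.read()).hexdigest()
        except OSError as e:
            report(f"Cannot read file {path}: {e}")
        else:
            # Only reached when the read above raised nothing.
            if actual != stored:
                report(f"Checksum mismatch: stored {stored}, actual {actual}")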