From 6aca09d4856b458225a0df1bb18a70fc24509664 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 15:06:27 +0100 Subject: [PATCH 1/4] additional note about the automatic matching algorithm --- docs/advanced_usage.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/advanced_usage.rst b/docs/advanced_usage.rst index 653bee1c6..fca3ff4df 100644 --- a/docs/advanced_usage.rst +++ b/docs/advanced_usage.rst @@ -147,7 +147,9 @@ America are tagged with the tag "bofa_123" and the matching algorithm of this tag is set to *Auto*, this neural network will examine your documents and automatically learn when to assign this tag. -There are a couple caveats you need to keep in mind when using this feature: +Paperless tries to hide much of the involved complexity with this approach. +However, there are a couple caveats you need to keep in mind when using this +feature: * Changes to your documents are not immediately reflected by the matching algorithm. The neural network needs to be *trained* on your documents after @@ -167,6 +169,11 @@ There are a couple caveats you need to keep in mind when using this feature: has the correspondent "Very obscure web shop I bought something five years ago", it will probably not assign this correspondent automatically if you buy something from them again. The more documents, the better. +* Paperless also needs a reasonable amount of negative examples to decide when + not to assign a certain tag, correspondent or type. This will usually be the + case as you start filling up paperless with documents. Example: If all your + documents are either from "Webshop" or "Bank", paperless will assign one of + these correspondents to ANY new document, if both are set to automatic matching. 
Hooking into the consumption process #################################### From 751c2ac54bfb69612c26acb2cd6ae66053971e7e Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 16:04:58 +0100 Subject: [PATCH 2/4] added a simple sanity checker. --- .../migrations/1004_sanity_check_schedule.py | 26 +++++ src/documents/sanity_checker.py | 94 +++++++++++++++++++ src/documents/tasks.py | 12 ++- 3 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 src/documents/migrations/1004_sanity_check_schedule.py create mode 100644 src/documents/sanity_checker.py diff --git a/src/documents/migrations/1004_sanity_check_schedule.py b/src/documents/migrations/1004_sanity_check_schedule.py new file mode 100644 index 000000000..b6346d479 --- /dev/null +++ b/src/documents/migrations/1004_sanity_check_schedule.py @@ -0,0 +1,26 @@ +# Generated by Django 3.1.3 on 2020-11-25 14:53 + +from django.db import migrations +from django.db.migrations import RunPython +from django_q.models import Schedule +from django_q.tasks import schedule + + +def add_schedules(apps, schema_editor): + schedule('documents.tasks.sanity_check', name="Perform sanity check", schedule_type=Schedule.WEEKLY) + + +def remove_schedules(apps, schema_editor): + Schedule.objects.filter(func='documents.tasks.sanity_check').delete() + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '1003_mime_types'), + ('django_q', '0013_task_attempt_count'), + ] + + operations = [ + RunPython(add_schedules, remove_schedules) + ] diff --git a/src/documents/sanity_checker.py b/src/documents/sanity_checker.py new file mode 100644 index 000000000..18bb3781c --- /dev/null +++ b/src/documents/sanity_checker.py @@ -0,0 +1,94 @@ +import hashlib +import os + +from django.conf import settings + +from documents.models import Document + + +class SanityMessage: + message = None + + +class SanityWarning(SanityMessage): + def __init__(self, message): + self.message = message + + def __str__(self): + 
return f"Warning: {self.message}" + + +class SanityError(SanityMessage): + def __init__(self, message): + self.message = message + + def __str__(self): + return f"ERROR: {self.message}" + + +class SanityFailedError(Exception): + + def __init__(self, messages): + self.messages = messages + + def __str__(self): + message_string = "\n".join([str(m) for m in self.messages]) + return ( + f"The following issuse were found by the sanity checker:\n" + f"{message_string}\n\n===============\n\n") + + +def check_sanity(): + messages = [] + + present_files = [] + for root, subdirs, files in os.walk(settings.MEDIA_ROOT): + for f in files: + present_files.append(os.path.normpath(os.path.join(root, f))) + + for doc in Document.objects.all(): + # Check thumbnail + if not os.path.isfile(doc.thumbnail_path): + messages.append(SanityError( + f"Thumbnail of document {doc.pk} does not exist.")) + else: + present_files.remove(os.path.normpath(doc.thumbnail_path)) + try: + with doc.thumbnail_file as f: + f.read() + except OSError as e: + messages.append(SanityError( + f"Cannot read thumbnail file of document {doc.pk}: {e}" + )) + + # Check document + if not os.path.isfile(doc.source_path): + messages.append(SanityError( + f"Original of document {doc.pk} does not exist.")) + else: + present_files.remove(os.path.normpath(doc.source_path)) + checksum = None + try: + with doc.source_file as f: + checksum = hashlib.md5(f.read()).hexdigest() + except OSError as e: + messages.append(SanityError( + f"Cannot read original file of document {doc.pk}: {e}")) + + if checksum and not checksum == doc.checksum: + messages.append(SanityError( + f"Checksum mismatch of document {doc.pk}. " + f"Stored: {doc.checksum}, actual: {checksum}." + )) + + if not doc.content: + messages.append(SanityWarning( + f"Document {doc.pk} has no content." 
+ )) + + for extra_file in present_files: + messages.append(SanityWarning( + f"Orphaned file in media dir: {extra_file}" + )) + + return messages diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 40ed8f25e..3c9baad08 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -3,11 +3,12 @@ import logging from django.conf import settings from whoosh.writing import AsyncWriter -from documents import index +from documents import index, sanity_checker from documents.classifier import DocumentClassifier, \ IncompatibleClassifierVersionError from documents.consumer import Consumer, ConsumerError from documents.models import Document +from documents.sanity_checker import SanityFailedError def index_optimize(): @@ -74,3 +75,12 @@ def consume_file(path, else: raise ConsumerError("Unknown error: Returned document was null, but " "no error message was given.") + + +def sanity_check(): + messages = sanity_checker.check_sanity() + + if len(messages) > 0: + raise SanityFailedError(messages) + else: + return "No issues detected." 
From d92214d41204c5a545f0926f8b4123019434611d Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 16:05:52 +0100 Subject: [PATCH 3/4] codestyle --- src/paperless/auth.py | 3 ++- src/paperless_tesseract/checks.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/paperless/auth.py b/src/paperless/auth.py index faf3104bc..ece5d0eba 100644 --- a/src/paperless/auth.py +++ b/src/paperless/auth.py @@ -8,7 +8,8 @@ class AutoLoginMiddleware(MiddlewareMixin): def process_request(self, request): try: - request.user = User.objects.get(username=settings.AUTO_LOGIN_USERNAME) + request.user = User.objects.get( + username=settings.AUTO_LOGIN_USERNAME) except User.DoesNotExist: pass diff --git a/src/paperless_tesseract/checks.py b/src/paperless_tesseract/checks.py index 21f229e65..8a06d7b00 100644 --- a/src/paperless_tesseract/checks.py +++ b/src/paperless_tesseract/checks.py @@ -5,7 +5,8 @@ from django.core.checks import Error, register def get_tesseract_langs(): - with subprocess.Popen(['tesseract', '--list-langs'], stdout=subprocess.PIPE) as p: + with subprocess.Popen(['tesseract', '--list-langs'], + stdout=subprocess.PIPE) as p: stdout, stderr = p.communicate() return stdout.decode().strip().split("\n")[1:] @@ -15,7 +16,7 @@ def get_tesseract_langs(): def check_default_language_available(app_configs, **kwargs): langs = get_tesseract_langs() - if not settings.OCR_LANGUAGE in langs: + if settings.OCR_LANGUAGE not in langs: return [Error( f"The default ocr language {settings.OCR_LANGUAGE} is " f"not installed. 
Paperless cannot OCR your documents " From 1987dccf48dbabdae6203aa796ad8886ad6d4420 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 16:30:53 +0100 Subject: [PATCH 4/4] changelog --- docs/changelog.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/changelog.rst b/docs/changelog.rst index 7a1b1c374..c494cecb9 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -10,6 +10,8 @@ next * Setting ``PAPERLESS_AUTO_LOGIN_USERNAME`` replaces ``PAPERLESS_DISABLE_LOGIN``. You have to specify your username. +* Added a simple sanity checker that checks your documents for missing or orphaned files, + files with wrong checksums, inaccessible files, and documents with empty content. paperless-ng 0.9.2