Merge branch 'dev' into feature-ocrmypdf

2025-12-14 01:21:14 -06:00 · 2020-11-25 16:58:20 +01:00
parent 56ce267f89 1987dccf48
commit b269af7572
7 changed files with 146 additions and 5 deletions
--- a/docs/advanced_usage.rst
+++ b/docs/advanced_usage.rst
@@ -147,7 +147,9 @@ America are tagged with the tag "bofa_123" and the matching algorithm of this
 tag is set to *Auto*, this neural network will examine your documents and
 automatically learn when to assign this tag.

-There are a couple caveats you need to keep in mind when using this feature:
+Paperless tries to hide much of the involved complexity with this approach.
+However, there are a couple of caveats you need to keep in mind when using this
+feature:

 * Changes to your documents are not immediately reflected by the matching
  algorithm. The neural network needs to be *trained* on your documents after
@@ -167,6 +169,11 @@ There are a couple caveats you need to keep in mind when using this feature:
  has the correspondent "Very obscure web shop I bought something five years
  ago", it will probably not assign this correspondent automatically if you buy
  something from them again. The more documents, the better.
+* Paperless also needs a reasonable amount of negative examples to decide when
+  not to assign a certain tag, correspondent or type. This will usually be the
+  case as you start filling up paperless with documents. Example: If all your
+  documents are from either "Webshop" or "Bank", paperless will assign one of
+  these correspondents to ANY new document if both are set to automatic matching.

 Hooking into the consumption process
 ####################################
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -10,6 +10,8 @@ next

 *   Setting ``PAPERLESS_AUTO_LOGIN_USERNAME`` replaces ``PAPERLESS_DISABLE_LOGIN``.
    You have to specify your username.
+*   Added a simple sanity checker that checks your documents for missing or orphaned files,
+    files with wrong checksums, inaccessible files, and documents with empty content.


 paperless-ng 0.9.2
--- a/src/documents/migrations/1004_sanity_check_schedule.py
+++ b/src/documents/migrations/1004_sanity_check_schedule.py
@@ -0,0 +1,26 @@
+# Generated by Django 3.1.3 on 2020-11-25 14:53
+
+from django.db import migrations
+from django.db.migrations import RunPython
+from django_q.models import Schedule
+from django_q.tasks import schedule
+
+
+def add_schedules(apps, schema_editor):
+    schedule('documents.tasks.sanity_check', name="Perform sanity check", schedule_type=Schedule.WEEKLY)
+
+
+def remove_schedules(apps, schema_editor):
+    Schedule.objects.filter(func='documents.tasks.sanity_check').delete()
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '1003_mime_types'),
+        ('django_q', '0013_task_attempt_count'),
+    ]
+
+    operations = [
+        RunPython(add_schedules, remove_schedules)
+    ]
--- a/src/documents/sanity_checker.py
+++ b/src/documents/sanity_checker.py
@@ -0,0 +1,94 @@
+import hashlib
+import os
+
+from django.conf import settings
+
+from documents.models import Document
+
+
+class SanityMessage:
+    message = None
+
+
+class SanityWarning(SanityMessage):
+    def __init__(self, message):
+        self.message = message
+
+    def __str__(self):
+        return f"Warning: {self.message}"
+
+
+class SanityError(SanityMessage):
+    def __init__(self, message):
+        self.message = message
+
+    def __str__(self):
+        return f"ERROR: {self.message}"
+
+
+class SanityFailedError(Exception):
+
+    def __init__(self, messages):
+        self.messages = messages
+
+    def __str__(self):
+        message_string = "\n".join([str(m) for m in self.messages])
+        return (
+            f"The following issues were found by the sanity checker:\n"
+            f"{message_string}\n\n===============\n\n")
+
+
+def check_sanity():
+    messages = []
+
+    present_files = []
+    for root, subdirs, files in os.walk(settings.MEDIA_ROOT):
+        for f in files:
+            present_files.append(os.path.normpath(os.path.join(root, f)))
+
+    for doc in Document.objects.all():
+        # Check thumbnail
+        if not os.path.isfile(doc.thumbnail_path):
+            messages.append(SanityError(
+                f"Thumbnail of document {doc.pk} does not exist."))
+        else:
+            present_files.remove(os.path.normpath(doc.thumbnail_path))
+            try:
+                with doc.thumbnail_file as f:
+                    f.read()
+            except OSError as e:
+                messages.append(SanityError(
+                    f"Cannot read thumbnail file of document {doc.pk}: {e}"
+                ))
+
+        # Check document
+        if not os.path.isfile(doc.source_path):
+            messages.append(SanityError(
+                f"Original of document {doc.pk} does not exist."))
+        else:
+            present_files.remove(os.path.normpath(doc.source_path))
+            checksum = None
+            try:
+                with doc.source_file as f:
+                    checksum = hashlib.md5(f.read()).hexdigest()
+            except OSError as e:
+                messages.append(SanityError(
+                    f"Cannot read original file of document {doc.pk}: {e}"))
+
+            if checksum and not checksum == doc.checksum:
+                messages.append(SanityError(
+                    f"Checksum mismatch of document {doc.pk}. "
+                    f"Stored: {doc.checksum}, actual: {checksum}."
+                ))
+
+        if not doc.content:
+            messages.append(SanityWarning(
+                f"Document {doc.pk} has no content."
+            ))
+
+    for extra_file in present_files:
+        messages.append(SanityWarning(
+            f"Orphaned file in media dir: {extra_file}"
+        ))
+
+    return messages
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -3,11 +3,12 @@ import logging
 from django.conf import settings
 from whoosh.writing import AsyncWriter

-from documents import index
+from documents import index, sanity_checker
 from documents.classifier import DocumentClassifier, \
    IncompatibleClassifierVersionError
 from documents.consumer import Consumer, ConsumerError
 from documents.models import Document
+from documents.sanity_checker import SanityFailedError


 def index_optimize():
@@ -74,3 +75,12 @@ def consume_file(path,
    else:
        raise ConsumerError("Unknown error: Returned document was null, but "
                            "no error message was given.")
+
+
+def sanity_check():
+    messages = sanity_checker.check_sanity()
+
+    if len(messages) > 0:
+        raise SanityFailedError(messages)
+    else:
+        return "No issues detected."
--- a/src/paperless/auth.py
+++ b/src/paperless/auth.py
@@ -8,7 +8,8 @@ class AutoLoginMiddleware(MiddlewareMixin):

    def process_request(self, request):
        try:
-            request.user = User.objects.get(username=settings.AUTO_LOGIN_USERNAME)
+            request.user = User.objects.get(
+                username=settings.AUTO_LOGIN_USERNAME)
        except User.DoesNotExist:
            pass

--- a/src/paperless_tesseract/checks.py
+++ b/src/paperless_tesseract/checks.py
@@ -5,7 +5,8 @@ from django.core.checks import Error, register


 def get_tesseract_langs():
-    with subprocess.Popen(['tesseract', '--list-langs'], stdout=subprocess.PIPE) as p:
+    with subprocess.Popen(['tesseract', '--list-langs'],
+                          stdout=subprocess.PIPE) as p:
        stdout, stderr = p.communicate()

    return stdout.decode().strip().split("\n")[1:]
@@ -15,7 +16,7 @@ def get_tesseract_langs():
 def check_default_language_available(app_configs, **kwargs):
    langs = get_tesseract_langs()

-    if not settings.OCR_LANGUAGE in langs:
+    if settings.OCR_LANGUAGE not in langs:
        return [Error(
            f"The default ocr language {settings.OCR_LANGUAGE} is "
            f"not installed. Paperless cannot OCR your documents "