From 6aca09d4856b458225a0df1bb18a70fc24509664 Mon Sep 17 00:00:00 2001
From: Jonas Winkler <jonas.winkler@jpwinkler.de>
Date: Wed, 25 Nov 2020 15:06:27 +0100
Subject: [PATCH 1/4] additional note about the automatic matching algorithm

---
 docs/advanced_usage.rst | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/docs/advanced_usage.rst b/docs/advanced_usage.rst
index 653bee1c6..fca3ff4df 100644
--- a/docs/advanced_usage.rst
+++ b/docs/advanced_usage.rst
@@ -147,7 +147,9 @@ America are tagged with the tag "bofa_123" and the matching algorithm of this
 tag is set to *Auto*, this neural network will examine your documents and
 automatically learn when to assign this tag.
 
-There are a couple caveats you need to keep in mind when using this feature:
+Paperless tries to hide much of the involved complexity with this approach.
+However, there are a couple of caveats you need to keep in mind when using this
+feature:
 
 * Changes to your documents are not immediately reflected by the matching
   algorithm. The neural network needs to be *trained* on your documents after
@@ -167,6 +169,11 @@ There are a couple caveats you need to keep in mind when using this feature:
   has the correspondent "Very obscure web shop I bought something five years
   ago", it will probably not assign this correspondent automatically if you buy
   something from them again. The more documents, the better.
+* Paperless also needs a reasonable amount of negative examples to decide when
+  not to assign a certain tag, correspondent or type. This will usually be the
+  case as you start filling up paperless with documents. Example: If all your
+  documents are either from "Webshop" or "Bank", paperless will assign one of
+  these correspondents to ANY new document, if both are set to automatic matching.
 
 Hooking into the consumption process
 ####################################

From 751c2ac54bfb69612c26acb2cd6ae66053971e7e Mon Sep 17 00:00:00 2001
From: Jonas Winkler <jonas.winkler@jpwinkler.de>
Date: Wed, 25 Nov 2020 16:04:58 +0100
Subject: [PATCH 2/4] added a simple sanity checker.

---
 .../migrations/1004_sanity_check_schedule.py  | 26 +++++
 src/documents/sanity_checker.py               | 94 +++++++++++++++++++
 src/documents/tasks.py                        | 12 ++-
 3 files changed, 131 insertions(+), 1 deletion(-)
 create mode 100644 src/documents/migrations/1004_sanity_check_schedule.py
 create mode 100644 src/documents/sanity_checker.py

diff --git a/src/documents/migrations/1004_sanity_check_schedule.py b/src/documents/migrations/1004_sanity_check_schedule.py
new file mode 100644
index 000000000..b6346d479
--- /dev/null
+++ b/src/documents/migrations/1004_sanity_check_schedule.py
@@ -0,0 +1,26 @@
+# Generated by Django 3.1.3 on 2020-11-25 14:53
+
+from django.db import migrations
+from django.db.migrations import RunPython
+from django_q.models import Schedule
+from django_q.tasks import schedule
+
+
+def add_schedules(apps, schema_editor):
+    schedule('documents.tasks.sanity_check', name="Perform sanity check", schedule_type=Schedule.WEEKLY)
+
+
+def remove_schedules(apps, schema_editor):
+    Schedule.objects.filter(func='documents.tasks.sanity_check').delete()
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '1003_mime_types'),
+        ('django_q', '0013_task_attempt_count'),
+    ]
+
+    operations = [
+        RunPython(add_schedules, remove_schedules)
+    ]
diff --git a/src/documents/sanity_checker.py b/src/documents/sanity_checker.py
new file mode 100644
index 000000000..18bb3781c
--- /dev/null
+++ b/src/documents/sanity_checker.py
@@ -0,0 +1,94 @@
+import hashlib
+import os
+
+from django.conf import settings
+
+from documents.models import Document
+
+
+class SanityMessage:
+    message = None
+
+
+class SanityWarning(SanityMessage):
+    def __init__(self, message):
+        self.message = message
+
+    def __str__(self):
+        return f"Warning: {self.message}"
+
+
+class SanityError(SanityMessage):
+    def __init__(self, message):
+        self.message = message
+
+    def __str__(self):
+        return f"ERROR: {self.message}"
+
+
+class SanityFailedError(Exception):
+
+    def __init__(self, messages):
+        self.messages = messages
+
+    def __str__(self):
+        message_string = "\n".join([str(m) for m in self.messages])
+        return (
+            f"The following issues were found by the sanity checker:\n"
+            f"{message_string}\n\n===============\n\n")
+
+
+def check_sanity():
+    messages = []
+
+    present_files = []
+    for root, subdirs, files in os.walk(settings.MEDIA_ROOT):
+        for f in files:
+            present_files.append(os.path.normpath(os.path.join(root, f)))
+
+    for doc in Document.objects.all():
+        # Check thumbnail
+        if not os.path.isfile(doc.thumbnail_path):
+            messages.append(SanityError(
+                f"Thumbnail of document {doc.pk} does not exist."))
+        else:
+            present_files.remove(os.path.normpath(doc.thumbnail_path))
+            try:
+                with doc.thumbnail_file as f:
+                    f.read()
+            except OSError as e:
+                messages.append(SanityError(
+                    f"Cannot read thumbnail file of document {doc.pk}: {e}"
+                ))
+
+        # Check document
+        if not os.path.isfile(doc.source_path):
+            messages.append(SanityError(
+                f"Original of document {doc.pk} does not exist."))
+        else:
+            present_files.remove(os.path.normpath(doc.source_path))
+            checksum = None
+            try:
+                with doc.source_file as f:
+                    checksum = hashlib.md5(f.read()).hexdigest()
+            except OSError as e:
+                messages.append(SanityError(
+                    f"Cannot read original file of document {doc.pk}: {e}"))
+
+            if checksum and not checksum == doc.checksum:
+                messages.append(SanityError(
+                    f"Checksum mismatch of document {doc.pk}. "
+                    f"Stored: {doc.checksum}, actual: {checksum}."
+                ))
+
+        if not doc.content:
+            messages.append(SanityWarning(
+                f"Document {doc.pk} has no content."
+            ))
+
+    for extra_file in present_files:
+        messages.append(SanityWarning(
+            f"Orphaned file in media dir: {extra_file}"
+        ))
+
+    return messages
diff --git a/src/documents/tasks.py b/src/documents/tasks.py
index 40ed8f25e..3c9baad08 100644
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -3,11 +3,12 @@ import logging
 from django.conf import settings
 from whoosh.writing import AsyncWriter
 
-from documents import index
+from documents import index, sanity_checker
 from documents.classifier import DocumentClassifier, \
     IncompatibleClassifierVersionError
 from documents.consumer import Consumer, ConsumerError
 from documents.models import Document
+from documents.sanity_checker import SanityFailedError
 
 
 def index_optimize():
@@ -74,3 +75,12 @@ def consume_file(path,
     else:
         raise ConsumerError("Unknown error: Returned document was null, but "
                             "no error message was given.")
+
+
+def sanity_check():
+    messages = sanity_checker.check_sanity()
+
+    if len(messages) > 0:
+        raise SanityFailedError(messages)
+    else:
+        return "No issues detected."

From d92214d41204c5a545f0926f8b4123019434611d Mon Sep 17 00:00:00 2001
From: Jonas Winkler <jonas.winkler@jpwinkler.de>
Date: Wed, 25 Nov 2020 16:05:52 +0100
Subject: [PATCH 3/4] codestyle

---
 src/paperless/auth.py             | 3 ++-
 src/paperless_tesseract/checks.py | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/paperless/auth.py b/src/paperless/auth.py
index faf3104bc..ece5d0eba 100644
--- a/src/paperless/auth.py
+++ b/src/paperless/auth.py
@@ -8,7 +8,8 @@ class AutoLoginMiddleware(MiddlewareMixin):
 
     def process_request(self, request):
         try:
-            request.user = User.objects.get(username=settings.AUTO_LOGIN_USERNAME)
+            request.user = User.objects.get(
+                username=settings.AUTO_LOGIN_USERNAME)
         except User.DoesNotExist:
             pass
 
diff --git a/src/paperless_tesseract/checks.py b/src/paperless_tesseract/checks.py
index 21f229e65..8a06d7b00 100644
--- a/src/paperless_tesseract/checks.py
+++ b/src/paperless_tesseract/checks.py
@@ -5,7 +5,8 @@ from django.core.checks import Error, register
 
 
 def get_tesseract_langs():
-    with subprocess.Popen(['tesseract', '--list-langs'], stdout=subprocess.PIPE) as p:
+    with subprocess.Popen(['tesseract', '--list-langs'],
+                          stdout=subprocess.PIPE) as p:
         stdout, stderr = p.communicate()
 
     return stdout.decode().strip().split("\n")[1:]
@@ -15,7 +16,7 @@ def get_tesseract_langs():
 def check_default_language_available(app_configs, **kwargs):
     langs = get_tesseract_langs()
 
-    if not settings.OCR_LANGUAGE in langs:
+    if settings.OCR_LANGUAGE not in langs:
         return [Error(
             f"The default ocr language {settings.OCR_LANGUAGE} is "
             f"not installed. Paperless cannot OCR your documents "

From 1987dccf48dbabdae6203aa796ad8886ad6d4420 Mon Sep 17 00:00:00 2001
From: Jonas Winkler <jonas.winkler@jpwinkler.de>
Date: Wed, 25 Nov 2020 16:30:53 +0100
Subject: [PATCH 4/4] changelog

---
 docs/changelog.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/changelog.rst b/docs/changelog.rst
index 7a1b1c374..c494cecb9 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -10,6 +10,8 @@ next
 
 *   Setting ``PAPERLESS_AUTO_LOGIN_USERNAME`` replaces ``PAPERLESS_DISABLE_LOGIN``.
     You have to specify your username.
+*   Added a simple sanity checker that checks your documents for missing or orphaned files,
+    files with wrong checksums, inaccessible files, and documents with empty content.
 
 
 paperless-ng 0.9.2