Merge branch 'dev' into feature-ocrmypdf

2025-06-28 15:54:41 -05:00 · 2020-11-25 16:58:20 +01:00 · 2020-11-25 16:58:20 +01:00 · b269af7572
commit b269af7572
parent 56ce267f89 1987dccf48
7 changed files with 146 additions and 5 deletions
--- a/docs/advanced_usage.rst
+++ b/docs/advanced_usage.rst
@ -147,7 +147,9 @@ America are tagged with the tag "bofa_123" and the matching algorithm of this
 tag is set to *Auto*, this neural network will examine your documents and
 automatically learn when to assign this tag.
-There are a couple caveats you need to keep in mind when using this feature:
+Paperless tries to hide much of the involved complexity with this approach.
 However, there are a couple of caveats you need to keep in mind when using this
 feature:
 * Changes to your documents are not immediately reflected by the matching
  algorithm. The neural network needs to be *trained* on your documents after
@ -167,6 +169,11 @@ There are a couple caveats you need to keep in mind when using this feature:
  has the correspondent "Very obscure web shop I bought something five years
  ago", it will probably not assign this correspondent automatically if you buy
  something from them again. The more documents, the better.
 * Paperless also needs a reasonable amount of negative examples to decide when
  not to assign a certain tag, correspondent or type. This will usually be the
  case as you start filling up paperless with documents. Example: If all your
  documents are either from "Webshop" or "Bank", paperless will assign one of
  these correspondents to ANY new document, if both are set to automatic matching.
 Hooking into the consumption process
 ####################################
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -10,6 +10,8 @@ next
 *   Setting ``PAPERLESS_AUTO_LOGIN_USERNAME`` replaces ``PAPERLESS_DISABLE_LOGIN``.
    You have to specify your username.
 *   Added a simple sanity checker that checks your documents for missing or orphaned files,
    files with wrong checksums, inaccessible files, and documents with empty content.
 paperless-ng 0.9.2
--- a/src/documents/migrations/1004_sanity_check_schedule.py
+++ b/src/documents/migrations/1004_sanity_check_schedule.py
@ -0,0 +1,26 @@
 # Generated by Django 3.1.3 on 2020-11-25 14:53
 from django.db import migrations
 from django.db.migrations import RunPython
 from django_q.models import Schedule
 from django_q.tasks import schedule
 def add_schedules(apps, schema_editor):
     """Forward step: create the weekly "Perform sanity check" schedule.

     Both arguments are supplied by ``RunPython`` and are not used here; the
     schedule is created through ``django_q.tasks.schedule``.
     """
     schedule(
         'documents.tasks.sanity_check',
         name="Perform sanity check",
         schedule_type=Schedule.WEEKLY,
     )
 def remove_schedules(apps, schema_editor):
     """Reverse step: delete every schedule that targets the sanity check.

     Both arguments are supplied by ``RunPython`` and are not used here.
     """
     sanity_schedules = Schedule.objects.filter(
         func='documents.tasks.sanity_check')
     sanity_schedules.delete()
 class Migration(migrations.Migration):
     """Data migration that installs (and on rollback removes) the schedule
     for the sanity check task."""

     # The django_q dependency is listed alongside the previous documents
     # migration because the forward/reverse steps use django_q's Schedule.
     dependencies = [
         ('documents', '1003_mime_types'),
         ('django_q', '0013_task_attempt_count'),
     ]

     operations = [
         RunPython(code=add_schedules, reverse_code=remove_schedules),
     ]
--- a/src/documents/sanity_checker.py
+++ b/src/documents/sanity_checker.py
@ -0,0 +1,94 @@
 import hashlib
 import os
 from django.conf import settings
 from documents.models import Document
 class SanityMessage:
     """Common base class for the findings produced by the sanity checker."""

     # Text of the finding; ``None`` here, subclasses assign it per instance.
     message = None
 class SanityWarning(SanityMessage):
     """Warning-level finding; ``str()`` prefixes the text with ``Warning:``."""

     def __init__(self, message):
         # Stored on the instance, shadowing the class-level default.
         self.message = message

     def __str__(self):
         return f"Warning: {self.message}"
 class SanityError(SanityMessage):
     """Error-level finding; ``str()`` prefixes the text with ``ERROR:``."""

     def __init__(self, message):
         # Stored on the instance, shadowing the class-level default.
         self.message = message

     def __str__(self):
         return f"ERROR: {self.message}"
 class SanityFailedError(Exception):
     """Exception that carries the findings of a failed sanity check.

     Args:
         messages: iterable of findings. Each one is converted with ``str()``
             when the exception itself is rendered as text.
     """

     def __init__(self, messages):
         self.messages = messages

     def __str__(self):
         # One finding per line, followed by a separator line.
         message_string = "\n".join(str(m) for m in self.messages)
         return (
             "The following issues were found by the sanity checker:\n"
             f"{message_string}\n\n===============\n\n")
 def check_sanity():
     """Compare the documents in the database with the files on disk.

     Collects every file below ``settings.MEDIA_ROOT``, then for each document
     checks that its thumbnail and its original exist and can be read, that
     the MD5 of the original equals ``doc.checksum``, and that the document
     has content. Files that no document accounted for are reported as
     orphaned.

     Returns:
         list: ``SanityError`` / ``SanityWarning`` instances; empty when
         nothing was found.
     """
     messages = []

     # A set is used so that claiming a file is O(1), and so that ``discard``
     # (unlike ``list.remove``) does not raise when a document's file is not
     # among the walked files, e.g. it lives outside MEDIA_ROOT or was already
     # claimed by another document. One odd path must not abort the check.
     present_files = set()
     for root, _, files in os.walk(settings.MEDIA_ROOT):
         for name in files:
             present_files.add(os.path.normpath(os.path.join(root, name)))

     for doc in Document.objects.all():
         # Check thumbnail
         if not os.path.isfile(doc.thumbnail_path):
             messages.append(SanityError(
                 f"Thumbnail of document {doc.pk} does not exist."))
         else:
             present_files.discard(os.path.normpath(doc.thumbnail_path))
             try:
                 # The content is not used; reading only proves accessibility.
                 with doc.thumbnail_file as f:
                     f.read()
             except OSError as e:
                 messages.append(SanityError(
                     f"Cannot read thumbnail file of document {doc.pk}: {e}"
                 ))

         # Check document
         if not os.path.isfile(doc.source_path):
             messages.append(SanityError(
                 f"Original of document {doc.pk} does not exist."))
         else:
             present_files.discard(os.path.normpath(doc.source_path))
             checksum = None
             try:
                 with doc.source_file as f:
                     checksum = hashlib.md5(f.read()).hexdigest()
             except OSError as e:
                 messages.append(SanityError(
                     f"Cannot read original file of document {doc.pk}: {e}"))

             # ``checksum`` stays None when the file could not be read; that
             # case has already been reported above.
             if checksum is not None and checksum != doc.checksum:
                 messages.append(SanityError(
                     f"Checksum mismatch of document {doc.pk}. "
                     f"Stored: {doc.checksum}, actual: {checksum}."
                 ))

         if not doc.content:
             messages.append(SanityWarning(
                 f"Document {doc.pk} has no content."
             ))

     # Sorted so the report does not depend on directory traversal order.
     for extra_file in sorted(present_files):
         messages.append(SanityWarning(
             f"Orphaned file in media dir: {extra_file}"
         ))

     return messages
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@ -3,11 +3,12 @@ import logging
 from django.conf import settings
 from whoosh.writing import AsyncWriter
-from documents import index
+from documents import index, sanity_checker
 from documents.classifier import DocumentClassifier, \
    IncompatibleClassifierVersionError
 from documents.consumer import Consumer, ConsumerError
 from documents.models import Document
 from documents.sanity_checker import SanityFailedError
 def index_optimize():
@ -74,3 +75,12 @@ def consume_file(path,
    else:
        raise ConsumerError("Unknown error: Returned document was null, but "
                            "no error message was given.")
 def sanity_check():
     """Run the sanity checker and turn its findings into a task outcome.

     Returns:
         str: ``"No issues detected."`` when the checker reports nothing.

     Raises:
         SanityFailedError: carrying all reported messages when there is at
             least one.
     """
     findings = sanity_checker.check_sanity()
     if not findings:
         return "No issues detected."
     raise SanityFailedError(findings)
--- a/src/paperless/auth.py
+++ b/src/paperless/auth.py
@ -8,7 +8,8 @@ class AutoLoginMiddleware(MiddlewareMixin):
    def process_request(self, request):
        try:
-            request.user = User.objects.get(username=settings.AUTO_LOGIN_USERNAME)
+            request.user = User.objects.get(
                username=settings.AUTO_LOGIN_USERNAME)
        except User.DoesNotExist:
            pass
--- a/src/paperless_tesseract/checks.py
+++ b/src/paperless_tesseract/checks.py
@ -5,7 +5,8 @@ from django.core.checks import Error, register
 def get_tesseract_langs():
-    with subprocess.Popen(['tesseract', '--list-langs'], stdout=subprocess.PIPE) as p:
+    with subprocess.Popen(['tesseract', '--list-langs'],
                          stdout=subprocess.PIPE) as p:
        stdout, stderr = p.communicate()
    return stdout.decode().strip().split("\n")[1:]
@ -15,7 +16,7 @@ def get_tesseract_langs():
 def check_default_language_available(app_configs, **kwargs):
    langs = get_tesseract_langs()
-    if not settings.OCR_LANGUAGE in langs:
+    if settings.OCR_LANGUAGE not in langs:
        return [Error(
            f"The default ocr language {settings.OCR_LANGUAGE} is "
            f"not installed. Paperless cannot OCR your documents "