Merge branch 'dev' into feature-ocrmypdf

This commit is contained in:
Jonas Winkler 2020-11-25 16:58:20 +01:00
commit b269af7572
7 changed files with 146 additions and 5 deletions

View File

@ -147,7 +147,9 @@ America are tagged with the tag "bofa_123" and the matching algorithm of this
tag is set to *Auto*, this neural network will examine your documents and tag is set to *Auto*, this neural network will examine your documents and
automatically learn when to assign this tag. automatically learn when to assign this tag.
There are a couple caveats you need to keep in mind when using this feature: Paperless tries to hide much of the involved complexity with this approach.
However, there are a couple caveats you need to keep in mind when using this
feature:
* Changes to your documents are not immediately reflected by the matching * Changes to your documents are not immediately reflected by the matching
algorithm. The neural network needs to be *trained* on your documents after algorithm. The neural network needs to be *trained* on your documents after
@ -167,6 +169,11 @@ There are a couple caveats you need to keep in mind when using this feature:
has the correspondent "Very obscure web shop I bought something five years has the correspondent "Very obscure web shop I bought something five years
ago", it will probably not assign this correspondent automatically if you buy ago", it will probably not assign this correspondent automatically if you buy
something from them again. The more documents, the better. something from them again. The more documents, the better.
* Paperless also needs a reasonable amount of negative examples to decide when
not to assign a certain tag, correspondent or type. This will usually be the
case as you start filling up paperless with documents. Example: If all your
documents are either from "Webshop" or "Bank", paperless will assign one of
these correspondents to ANY new document if both are set to automatic matching.
Hooking into the consumption process Hooking into the consumption process
#################################### ####################################

View File

@ -10,6 +10,8 @@ next
* Setting ``PAPERLESS_AUTO_LOGIN_USERNAME`` replaces ``PAPERLESS_DISABLE_LOGIN``. * Setting ``PAPERLESS_AUTO_LOGIN_USERNAME`` replaces ``PAPERLESS_DISABLE_LOGIN``.
You have to specify your username. You have to specify your username.
* Added a simple sanity checker that checks your documents for missing or orphaned files,
files with wrong checksums, inaccessible files, and documents with empty content.
paperless-ng 0.9.2 paperless-ng 0.9.2

View File

@ -0,0 +1,26 @@
# Generated by Django 3.1.3 on 2020-11-25 14:53
from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
    """Forward step: create the weekly "Perform sanity check" schedule.

    ``apps`` and ``schema_editor`` are accepted because ``RunPython`` passes
    them; neither is used here.
    """
    task_path = 'documents.tasks.sanity_check'
    schedule(
        task_path,
        name="Perform sanity check",
        schedule_type=Schedule.WEEKLY,
    )
def remove_schedules(apps, schema_editor):
    """Reverse step: delete every schedule whose function is the sanity check.

    ``apps`` and ``schema_editor`` are accepted because ``RunPython`` passes
    them; neither is used here.
    """
    sanity_schedules = Schedule.objects.filter(
        func='documents.tasks.sanity_check')
    sanity_schedules.delete()
class Migration(migrations.Migration):
    """Data migration that adds the sanity-check schedule and removes it on rollback."""

    # Must run after these migrations of the two apps involved.
    dependencies = [
        ('documents', '1003_mime_types'),
        ('django_q', '0013_task_attempt_count'),
    ]

    operations = [
        RunPython(code=add_schedules, reverse_code=remove_schedules),
    ]

View File

@ -0,0 +1,94 @@
import hashlib
import os
from django.conf import settings
from documents.models import Document
class SanityMessage:
    """Common base class for the findings produced by the sanity checker."""

    # Text of the finding; the subclasses assign it per instance.
    message = None
class SanityWarning(SanityMessage):
    """A finding that is rendered with a ``Warning:`` prefix."""

    def __init__(self, message):
        self.message = message

    def __str__(self):
        return "Warning: {}".format(self.message)
class SanityError(SanityMessage):
    """A finding that is rendered with an ``ERROR:`` prefix."""

    def __init__(self, message):
        self.message = message

    def __str__(self):
        return "ERROR: {}".format(self.message)
class SanityFailedError(Exception):
    """Raised when the sanity checker reported at least one message.

    Attributes:
        messages: the findings that caused the failure. Each one is rendered
            with ``str()``, one per line, when the exception is printed.
    """

    def __init__(self, messages):
        super().__init__(messages)
        self.messages = messages

    def __str__(self):
        message_string = "\n".join(str(m) for m in self.messages)
        # The trailing separator visually closes the report in the output.
        return (
            "The following issues were found by the sanity checker:\n"
            f"{message_string}\n\n===============\n\n")
def check_sanity():
    """Check that the files on disk and the documents in the database agree.

    For every document this verifies that the thumbnail and the original file
    exist and can be read, that the MD5 digest of the original equals the
    stored checksum, and that the document has content. Afterwards, every file
    below ``settings.MEDIA_ROOT`` that no document referenced is reported as
    orphaned.

    Returns:
        A list of ``SanityError`` / ``SanityWarning`` objects; empty when no
        problem was found.
    """
    messages = []

    # Every file below the media root, in os.walk order.
    present_files = []
    for root, _subdirs, files in os.walk(settings.MEDIA_ROOT):
        for name in files:
            present_files.append(os.path.normpath(os.path.join(root, name)))

    # Paths that belong to some document. Collecting them in a set (instead of
    # calling list.remove() on present_files) keeps the check from aborting
    # with ValueError when a referenced file is not in the walked list, e.g.
    # because two documents point at the same file or the path lies outside
    # the media root. It also makes the bookkeeping O(1) per file.
    claimed_files = set()

    for doc in Document.objects.all():
        # Check thumbnail
        if not os.path.isfile(doc.thumbnail_path):
            messages.append(SanityError(
                f"Thumbnail of document {doc.pk} does not exist."))
        else:
            claimed_files.add(os.path.normpath(doc.thumbnail_path))
            try:
                # Reading the file proves that it is actually accessible.
                with doc.thumbnail_file as f:
                    f.read()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read thumbnail file of document {doc.pk}: {e}"
                ))

        # Check document
        if not os.path.isfile(doc.source_path):
            messages.append(SanityError(
                f"Original of document {doc.pk} does not exist."))
        else:
            claimed_files.add(os.path.normpath(doc.source_path))
            checksum = None
            try:
                with doc.source_file as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read original file of document {doc.pk}: {e}"))

            # checksum stays None when the file could not be read; the read
            # error was already reported above, so skip the comparison.
            if checksum and checksum != doc.checksum:
                messages.append(SanityError(
                    f"Checksum mismatch of document {doc.pk}. "
                    f"Stored: {doc.checksum}, actual: {checksum}."
                ))

        if not doc.content:
            messages.append(SanityWarning(
                f"Document {doc.pk} has no content."
            ))

    for extra_file in present_files:
        if extra_file not in claimed_files:
            messages.append(SanityWarning(
                f"Orphaned file in media dir: {extra_file}"
            ))

    return messages

View File

@ -3,11 +3,12 @@ import logging
from django.conf import settings from django.conf import settings
from whoosh.writing import AsyncWriter from whoosh.writing import AsyncWriter
from documents import index from documents import index, sanity_checker
from documents.classifier import DocumentClassifier, \ from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError IncompatibleClassifierVersionError
from documents.consumer import Consumer, ConsumerError from documents.consumer import Consumer, ConsumerError
from documents.models import Document from documents.models import Document
from documents.sanity_checker import SanityFailedError
def index_optimize(): def index_optimize():
@ -74,3 +75,12 @@ def consume_file(path,
else: else:
raise ConsumerError("Unknown error: Returned document was null, but " raise ConsumerError("Unknown error: Returned document was null, but "
"no error message was given.") "no error message was given.")
def sanity_check():
    """Run the sanity checker and fail if it reports anything.

    Returns:
        The string "No issues detected." when the checker returned no messages.

    Raises:
        SanityFailedError: carrying the returned messages, when there is at
            least one.
    """
    found = sanity_checker.check_sanity()
    if found:
        raise SanityFailedError(found)
    return "No issues detected."

View File

@ -8,7 +8,8 @@ class AutoLoginMiddleware(MiddlewareMixin):
def process_request(self, request): def process_request(self, request):
try: try:
request.user = User.objects.get(username=settings.AUTO_LOGIN_USERNAME) request.user = User.objects.get(
username=settings.AUTO_LOGIN_USERNAME)
except User.DoesNotExist: except User.DoesNotExist:
pass pass

View File

@ -5,7 +5,8 @@ from django.core.checks import Error, register
def get_tesseract_langs(): def get_tesseract_langs():
with subprocess.Popen(['tesseract', '--list-langs'], stdout=subprocess.PIPE) as p: with subprocess.Popen(['tesseract', '--list-langs'],
stdout=subprocess.PIPE) as p:
stdout, stderr = p.communicate() stdout, stderr = p.communicate()
return stdout.decode().strip().split("\n")[1:] return stdout.decode().strip().split("\n")[1:]
@ -15,7 +16,7 @@ def get_tesseract_langs():
def check_default_language_available(app_configs, **kwargs): def check_default_language_available(app_configs, **kwargs):
langs = get_tesseract_langs() langs = get_tesseract_langs()
if not settings.OCR_LANGUAGE in langs: if settings.OCR_LANGUAGE not in langs:
return [Error( return [Error(
f"The default ocr language {settings.OCR_LANGUAGE} is " f"The default ocr language {settings.OCR_LANGUAGE} is "
f"not installed. Paperless cannot OCR your documents " f"not installed. Paperless cannot OCR your documents "