Reworked most of the tesseract parser and improved logging

This commit is contained in:
Jonas Winkler
2020-11-02 15:40:44 +01:00
parent 397503d4c4
commit def3a85858
8 changed files with 90 additions and 123 deletions

View File

@@ -89,11 +89,13 @@ class Consumer:
if self._is_duplicate(doc):
self.log(
"info",
"warning",
"Skipping {} as it appears to be a duplicate".format(doc)
)
return False
self.log("info", "Consuming {}".format(doc))
parser_class = self._get_parser_class(doc)
if not parser_class:
self.log(
@@ -102,7 +104,6 @@ class Consumer:
self.logging_group = uuid.uuid4()
self.log("info", "Consuming {}".format(doc))
document_consumption_started.send(
sender=self.__class__,
@@ -110,23 +111,23 @@ class Consumer:
logging_group=self.logging_group
)
parsed_document = parser_class(doc)
document_parser = parser_class(doc, self.logging_group)
try:
thumbnail = parsed_document.get_optimised_thumbnail()
date = parsed_document.get_date()
thumbnail = document_parser.get_optimised_thumbnail()
date = document_parser.get_date()
document = self._store(
parsed_document.get_text(),
document_parser.get_text(),
doc,
thumbnail,
date
)
except ParseError as e:
self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
parsed_document.cleanup()
self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e))
document_parser.cleanup()
return False
else:
parsed_document.cleanup()
document_parser.cleanup()
self._cleanup_doc(doc)
self.log(
@@ -140,9 +141,10 @@ class Consumer:
self.classifier.reload()
classifier = self.classifier
except FileNotFoundError:
logging.getLogger(__name__).warning("Cannot classify documents, "
"classifier model file was not "
"found.")
self.log("warning", "Cannot classify documents, classifier "
"model file was not found. Consider "
"running python manage.py "
"document_create_classifier.")
document_consumption_finished.send(
sender=self.__class__,
@@ -211,7 +213,7 @@ class Consumer:
document.save()
self.log("info", "Completed")
self.log("debug", "Completed")
return document

View File

@@ -2,15 +2,7 @@ import logging
class PaperlessLogger(logging.StreamHandler):
"""
A logger smart enough to know to log some kinds of messages to the database
for later retrieval in a pretty interface.
"""
def emit(self, record):
logging.StreamHandler.emit(self, record)
# We have to do the import here or Django will barf when it tries to
# load this because the apps aren't loaded at that point
from .models import Log

View File

@@ -3,7 +3,6 @@
import logging
import os
import re
import uuid
from collections import OrderedDict
import dateutil.parser

View File

@@ -39,11 +39,11 @@ class DocumentParser:
`paperless_tesseract.parsers` for inspiration.
"""
def __init__(self, path):
def __init__(self, path, logging_group):
self.document_path = path
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
self.logger = logging.getLogger(__name__)
self.logging_group = None
self.logging_group = logging_group
def get_thumbnail(self):
"""

View File

@@ -56,6 +56,7 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
'Assigning correspondent "{}" to "{}" '.format(selected, document),
logging_group
)
# TODO: during consumption, this saves even though no updates have been made
document.correspondent = selected
document.save(update_fields=("correspondent",))