mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
reworked most of the tesseract parser, better logging
This commit is contained in:
@@ -89,11 +89,13 @@ class Consumer:
|
||||
|
||||
if self._is_duplicate(doc):
|
||||
self.log(
|
||||
"info",
|
||||
"warning",
|
||||
"Skipping {} as it appears to be a duplicate".format(doc)
|
||||
)
|
||||
return False
|
||||
|
||||
self.log("info", "Consuming {}".format(doc))
|
||||
|
||||
parser_class = self._get_parser_class(doc)
|
||||
if not parser_class:
|
||||
self.log(
|
||||
@@ -102,7 +104,6 @@ class Consumer:
|
||||
|
||||
self.logging_group = uuid.uuid4()
|
||||
|
||||
self.log("info", "Consuming {}".format(doc))
|
||||
|
||||
document_consumption_started.send(
|
||||
sender=self.__class__,
|
||||
@@ -110,23 +111,23 @@ class Consumer:
|
||||
logging_group=self.logging_group
|
||||
)
|
||||
|
||||
parsed_document = parser_class(doc)
|
||||
document_parser = parser_class(doc, self.logging_group)
|
||||
|
||||
try:
|
||||
thumbnail = parsed_document.get_optimised_thumbnail()
|
||||
date = parsed_document.get_date()
|
||||
thumbnail = document_parser.get_optimised_thumbnail()
|
||||
date = document_parser.get_date()
|
||||
document = self._store(
|
||||
parsed_document.get_text(),
|
||||
document_parser.get_text(),
|
||||
doc,
|
||||
thumbnail,
|
||||
date
|
||||
)
|
||||
except ParseError as e:
|
||||
self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
|
||||
parsed_document.cleanup()
|
||||
self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e))
|
||||
document_parser.cleanup()
|
||||
return False
|
||||
else:
|
||||
parsed_document.cleanup()
|
||||
document_parser.cleanup()
|
||||
self._cleanup_doc(doc)
|
||||
|
||||
self.log(
|
||||
@@ -140,9 +141,10 @@ class Consumer:
|
||||
self.classifier.reload()
|
||||
classifier = self.classifier
|
||||
except FileNotFoundError:
|
||||
logging.getLogger(__name__).warning("Cannot classify documents, "
|
||||
"classifier model file was not "
|
||||
"found.")
|
||||
self.log("warning", "Cannot classify documents, classifier "
|
||||
"model file was not found. Consider "
|
||||
"running python manage.py "
|
||||
"document_create_classifier.")
|
||||
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
@@ -211,7 +213,7 @@ class Consumer:
|
||||
|
||||
document.save()
|
||||
|
||||
self.log("info", "Completed")
|
||||
self.log("debug", "Completed")
|
||||
|
||||
return document
|
||||
|
||||
|
@@ -2,15 +2,7 @@ import logging
|
||||
|
||||
|
||||
class PaperlessLogger(logging.StreamHandler):
|
||||
"""
|
||||
A logger smart enough to know to log some kinds of messages to the database
|
||||
for later retrieval in a pretty interface.
|
||||
"""
|
||||
|
||||
def emit(self, record):
|
||||
|
||||
logging.StreamHandler.emit(self, record)
|
||||
|
||||
# We have to do the import here or Django will barf when it tries to
|
||||
# load this because the apps aren't loaded at that point
|
||||
from .models import Log
|
||||
|
@@ -3,7 +3,6 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from collections import OrderedDict
|
||||
|
||||
import dateutil.parser
|
||||
|
@@ -39,11 +39,11 @@ class DocumentParser:
|
||||
`paperless_tesseract.parsers` for inspiration.
|
||||
"""
|
||||
|
||||
def __init__(self, path):
|
||||
def __init__(self, path, logging_group):
|
||||
self.document_path = path
|
||||
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.logging_group = None
|
||||
self.logging_group = logging_group
|
||||
|
||||
def get_thumbnail(self):
|
||||
"""
|
||||
|
@@ -56,6 +56,7 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
|
||||
'Assigning correspondent "{}" to "{}" '.format(selected, document),
|
||||
logging_group
|
||||
)
|
||||
# TODO: during consumption, this saves even though no updates have been made
|
||||
|
||||
document.correspondent = selected
|
||||
document.save(update_fields=("correspondent",))
|
||||
|
Reference in New Issue
Block a user