From d15405ef56534a529b499be430c97a1ea7f9c898 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Mon, 2 Nov 2020 15:40:44 +0100 Subject: [PATCH] reworked most of the tesseract parser, better logging --- src/documents/consumer.py | 28 +++--- src/documents/loggers.py | 8 -- src/documents/models.py | 1 - src/documents/parsers.py | 4 +- src/documents/signals/handlers.py | 1 + src/paperless/settings.py | 8 +- src/paperless_tesseract/parsers.py | 147 +++++++++++++---------------- src/paperless_text/parsers.py | 16 +--- 8 files changed, 90 insertions(+), 123 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 1229af680..41eefc948 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -89,11 +89,13 @@ class Consumer: if self._is_duplicate(doc): self.log( - "info", + "warning", "Skipping {} as it appears to be a duplicate".format(doc) ) return False + self.log("info", "Consuming {}".format(doc)) + parser_class = self._get_parser_class(doc) if not parser_class: self.log( @@ -102,7 +104,6 @@ class Consumer: self.logging_group = uuid.uuid4() - self.log("info", "Consuming {}".format(doc)) document_consumption_started.send( sender=self.__class__, @@ -110,23 +111,23 @@ class Consumer: logging_group=self.logging_group ) - parsed_document = parser_class(doc) + document_parser = parser_class(doc, self.logging_group) try: - thumbnail = parsed_document.get_optimised_thumbnail() - date = parsed_document.get_date() + thumbnail = document_parser.get_optimised_thumbnail() + date = document_parser.get_date() document = self._store( - parsed_document.get_text(), + document_parser.get_text(), doc, thumbnail, date ) except ParseError as e: - self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) - parsed_document.cleanup() + self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e)) + document_parser.cleanup() return False else: - parsed_document.cleanup() + document_parser.cleanup() self._cleanup_doc(doc) self.log( @@ -140,9 +141,10 @@ class Consumer: self.classifier.reload() classifier = self.classifier except FileNotFoundError: - logging.getLogger(__name__).warning("Cannot classify documents, " - "classifier model file was not " - "found.") + self.log("warning", "Cannot classify documents, classifier " + "model file was not found. Consider " + "running python manage.py " + "document_create_classifier.") document_consumption_finished.send( sender=self.__class__, @@ -211,7 +213,7 @@ class Consumer: document.save() - self.log("info", "Completed") + self.log("debug", "Completed") return document diff --git a/src/documents/loggers.py b/src/documents/loggers.py index a35841299..4657c9b6c 100644 --- a/src/documents/loggers.py +++ b/src/documents/loggers.py @@ -2,15 +2,7 @@ import logging class PaperlessLogger(logging.StreamHandler): - """ - A logger smart enough to know to log some kinds of messages to the database - for later retrieval in a pretty interface. - """ - def emit(self, record): - - logging.StreamHandler.emit(self, record) - # We have to do the import here or Django will barf when it tries to # load this because the apps aren't loaded at that point from .models import Log diff --git a/src/documents/models.py b/src/documents/models.py index 1881ac249..436f5163a 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -3,7 +3,6 @@ import logging import os import re -import uuid from collections import OrderedDict import dateutil.parser diff --git a/src/documents/parsers.py b/src/documents/parsers.py index fc327a07b..c5357b419 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -39,11 +39,11 @@ class DocumentParser: `paperless_tesseract.parsers` for inspiration. """ - def __init__(self, path): + def __init__(self, path, logging_group): self.document_path = path self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) self.logger = logging.getLogger(__name__) - self.logging_group = None + self.logging_group = logging_group def get_thumbnail(self): """ diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index 5adf9df68..0a96d6b06 100755 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -56,6 +56,7 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None 'Assigning correspondent "{}" to "{}" '.format(selected, document), logging_group ) + # TODO: during consumption, this saves even though no updates have been made document.correspondent = selected document.save(update_fields=("correspondent",)) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 9aa2b98af..5cf2b4b66 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -239,14 +239,14 @@ LOGGING = { "version": 1, "disable_existing_loggers": False, "handlers": { - "consumer": { + "dblogger": { "class": "documents.loggers.PaperlessLogger", } }, "loggers": { "documents": { - "handlers": ["consumer"], - "level": os.getenv("PAPERLESS_CONSUMER_LOG_LEVEL", "INFO"), + "handlers": ["dblogger"], + "level": "DEBUG" }, }, } @@ -260,7 +260,7 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", 4)) # OCR all documents? -OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS") +OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", False) # GNUPG needs a home directory for some reason diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index fd6de1ba4..afd64de65 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -8,9 +8,7 @@ import langdetect import pyocr from django.conf import settings from PIL import Image -from pyocr.libtesseract.tesseract_raw import \ - TesseractError as OtherTesseractError -from pyocr.tesseract import TesseractError +from pyocr import PyocrException import pdftotext from documents.parsers import DocumentParser, ParseError @@ -28,8 +26,8 @@ class RasterisedDocumentParser(DocumentParser): image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) """ - def __init__(self, path): - super().__init__(path) + def __init__(self, path, logging_group): + super().__init__(path, logging_group) self._text = None def get_thumbnail(self): @@ -53,11 +51,7 @@ class RasterisedDocumentParser(DocumentParser): except ParseError: # if convert fails, fall back to extracting # the first PDF page as a PNG using Ghostscript - self.log( - "warning", - "Thumbnail generation with ImageMagick failed, " - "falling back to Ghostscript." - ) + self.log('warning', 'Thumbnail generation with ImageMagick failed, falling back to ghostscript. Check your /etc/ImageMagick-x/policy.xml!') gs_out_path = os.path.join(self.tempdir, "gs_out.png") cmd = [settings.GS_BINARY, "-q", @@ -100,9 +94,33 @@ class RasterisedDocumentParser(DocumentParser): images = self._get_greyscale() + if not images: + raise ParseError("Empty document, nothing to do.") + try: - self._text = self._get_ocr(images) + + sample_page_index = int(len(images) / 2) + self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index+1, len(images))) + sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0] + guessed_language = self._guess_language(sample_page_text) + + if not guessed_language or guessed_language not in ISO639: + self.log("warning", "Language detection failed.") + ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) + elif ISO639[guessed_language] == settings.OCR_LANGUAGE: + self.log("info", "Detected language: {} (default language)".format(guessed_language)) + ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) + elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): + self.log("warning","Detected language {} is not available on this system.".format(guessed_language)) + ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) + else: + self.log("info","Detected language: {}".format(guessed_language)) + ocr_pages = self._ocr(images, ISO639[guessed_language]) + + self.log("info", "OCR completed.") + self._text = strip_excess_whitespace(" ".join(ocr_pages)) return self._text + except OCRError as e: raise ParseError(e) @@ -111,6 +129,8 @@ class RasterisedDocumentParser(DocumentParser): Greyscale images are easier for Tesseract to OCR """ + self.log("info", "Converting document {} into greyscale images...".format(self.document_path)) + # Convert PDF to multiple PNMs pnm = os.path.join(self.tempdir, "convert-%04d.pnm") run_convert( @@ -127,91 +147,43 @@ class RasterisedDocumentParser(DocumentParser): if f.endswith(".pnm"): pnms.append(os.path.join(self.tempdir, f)) + self.log("info", "Running unpaper on {} pages...".format(len(pnms))) + # Run unpaper in parallel on converted images with Pool(processes=settings.OCR_THREADS) as pool: - pool.map(run_unpaper, itertools.product([settings.UNPAPER_BINARY], pnms)) - - # Return list of converted images, processed with unpaper - pnms = [] - for f in os.listdir(self.tempdir): - if f.endswith(".unpaper.pnm"): - pnms.append(os.path.join(self.tempdir, f)) + pnms = pool.map(run_unpaper, pnms) return sorted(filter(lambda __: os.path.isfile(__), pnms)) def _guess_language(self, text): try: guess = langdetect.detect(text) - self.log("debug", "Language detected: {}".format(guess)) return guess except Exception as e: - self.log("warning", "Language detection error: {}".format(e)) - - def _get_ocr(self, imgs): - """ - Attempts to do the best job possible OCR'ing the document based on - simple language detection trial & error. - """ - - if not imgs: - raise OCRError("Empty document, nothing to do.") - - self.log("info", "OCRing the document") - - # Since the division gets rounded down by int, this calculation works - # for every edge-case, i.e. 1 - middle = int(len(imgs) / 2) - raw_text = self._ocr([imgs[middle]], settings.OCR_LANGUAGE) - guessed_language = self._guess_language(raw_text) - - if not guessed_language or guessed_language not in ISO639: - self.log("warning", "Language detection failed!") - - raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) - return raw_text - - if ISO639[guessed_language] == settings.OCR_LANGUAGE: - raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) - return raw_text - - try: - return self._ocr(imgs, ISO639[guessed_language]) - except pyocr.pyocr.tesseract.TesseractError: - self.log( - "warning", - "OCR for {} failed, but we're going to stick with what " - "we've got since FORGIVING_OCR is enabled.".format( - guessed_language - ) - ) - raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) - return raw_text + return None def _ocr(self, imgs, lang): - """ - Performs a single OCR attempt. - """ - - if not imgs: - return "" - - self.log("info", "Parsing for {}".format(lang)) - + self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang)) with Pool(processes=settings.OCR_THREADS) as pool: r = pool.map(image_to_string, itertools.product(imgs, [lang])) - r = " ".join(r) + return r - # Strip out excess white space to allow matching to go smoother - return strip_excess_whitespace(r) - - def _assemble_ocr_sections(self, imgs, middle, text): + def _complete_ocr_default_language(self, images, sample_page_index, sample_page): """ Given a `middle` value and the text that middle page represents, we OCR the remainder of the document and return the whole thing. """ - text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text - text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE) - return text + # text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text + # text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE) + images_copy = list(images) + del images_copy[sample_page_index] + if images_copy: + self.log('info', 'Continuing ocr with default language.') + ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE) + ocr_pages.insert(sample_page_index, sample_page) + return ocr_pages + else: + return [sample_page] def run_convert(*args): @@ -225,13 +197,16 @@ def run_convert(*args): raise ParseError("Convert failed at {}".format(args)) -def run_unpaper(args): - unpaper, pnm = args - command_args = (unpaper, "--overwrite", "--quiet", pnm, - pnm.replace(".pnm", ".unpaper.pnm")) +def run_unpaper(pnm): + pnm_out = pnm.replace(".pnm", ".unpaper.pnm") + + command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm, + pnm_out) if not subprocess.Popen(command_args).wait() == 0: raise ParseError("Unpaper failed at {}".format(command_args)) + return pnm_out + def strip_excess_whitespace(text): collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) @@ -245,14 +220,18 @@ def strip_excess_whitespace(text): def image_to_string(args): img, lang = args ocr = pyocr.get_available_tools()[0] - with Image.open(os.path.join(settings.SCRATCH_DIR, img)) as f: + with Image.open(img) as f: if ocr.can_detect_orientation(): try: orientation = ocr.detect_orientation(f, lang=lang) f = f.rotate(orientation["angle"], expand=1) - except (TesseractError, OtherTesseractError, AttributeError): + except Exception: + # Rotation not possible, ignore pass - return ocr.image_to_string(f, lang=lang) + try: + return ocr.image_to_string(f, lang=lang) + except PyocrException as e: + raise OCRError(e) def get_text_from_pdf(pdf_file): diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index 3ccb78404..0db1e230b 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -11,14 +11,8 @@ class TextDocumentParser(DocumentParser): This parser directly parses a text document (.txt, .md, or .csv) """ - CONVERT = settings.CONVERT_BINARY - THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None - UNPAPER = settings.UNPAPER_BINARY - DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE - OCR_ALWAYS = settings.OCR_ALWAYS - - def __init__(self, path): - super().__init__(path) + def __init__(self, path, logging_group): + super().__init__(path, logging_group) self._text = None def get_thumbnail(self): @@ -44,7 +38,7 @@ class TextDocumentParser(DocumentParser): r = str(round(psize[0] / 10)) rounded = ",".join([r, r]) run_command( - self.CONVERT, + settings.CONVERT_BINARY, "-size ", picsize, ' xc:none -draw ', '"fill ', bg_color, ' roundrectangle 0,0,', work_size, ",", rounded, '" ', # NOQA: E501 @@ -59,7 +53,7 @@ class TextDocumentParser(DocumentParser): def create_txlayer(): run_command( - self.CONVERT, + settings.CONVERT_BINARY, "-background none", "-fill", text_color, @@ -73,7 +67,7 @@ class TextDocumentParser(DocumentParser): create_txlayer() create_bg() run_command( - self.CONVERT, + settings.CONVERT_BINARY, temp_bg, temp_txlayer, "-background None -layers merge ",