mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	reworked most of the tesseract parser, better logging
This commit is contained in:
		| @@ -89,11 +89,13 @@ class Consumer: | ||||
|  | ||||
|         if self._is_duplicate(doc): | ||||
|             self.log( | ||||
|                 "info", | ||||
|                 "warning", | ||||
|                 "Skipping {} as it appears to be a duplicate".format(doc) | ||||
|             ) | ||||
|             return False | ||||
|  | ||||
|         self.log("info", "Consuming {}".format(doc)) | ||||
|  | ||||
|         parser_class = self._get_parser_class(doc) | ||||
|         if not parser_class: | ||||
|             self.log( | ||||
| @@ -102,7 +104,6 @@ class Consumer: | ||||
|  | ||||
|         self.logging_group = uuid.uuid4() | ||||
|  | ||||
|         self.log("info", "Consuming {}".format(doc)) | ||||
|  | ||||
|         document_consumption_started.send( | ||||
|             sender=self.__class__, | ||||
| @@ -110,23 +111,23 @@ class Consumer: | ||||
|             logging_group=self.logging_group | ||||
|         ) | ||||
|  | ||||
|         parsed_document = parser_class(doc) | ||||
|         document_parser = parser_class(doc, self.logging_group) | ||||
|  | ||||
|         try: | ||||
|             thumbnail = parsed_document.get_optimised_thumbnail() | ||||
|             date = parsed_document.get_date() | ||||
|             thumbnail = document_parser.get_optimised_thumbnail() | ||||
|             date = document_parser.get_date() | ||||
|             document = self._store( | ||||
|                 parsed_document.get_text(), | ||||
|                 document_parser.get_text(), | ||||
|                 doc, | ||||
|                 thumbnail, | ||||
|                 date | ||||
|             ) | ||||
|         except ParseError as e: | ||||
|             self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) | ||||
|             parsed_document.cleanup() | ||||
|             self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e)) | ||||
|             document_parser.cleanup() | ||||
|             return False | ||||
|         else: | ||||
|             parsed_document.cleanup() | ||||
|             document_parser.cleanup() | ||||
|             self._cleanup_doc(doc) | ||||
|  | ||||
|             self.log( | ||||
| @@ -140,9 +141,10 @@ class Consumer: | ||||
|                 self.classifier.reload() | ||||
|                 classifier = self.classifier | ||||
|             except FileNotFoundError: | ||||
|                 logging.getLogger(__name__).warning("Cannot classify documents, " | ||||
|                                                   "classifier model file was not " | ||||
|                                                   "found.") | ||||
|                 self.log("warning", "Cannot classify documents, classifier " | ||||
|                                     "model file was not found. Consider " | ||||
|                                     "running python manage.py " | ||||
|                                     "document_create_classifier.") | ||||
|  | ||||
|             document_consumption_finished.send( | ||||
|                 sender=self.__class__, | ||||
| @@ -211,7 +213,7 @@ class Consumer: | ||||
|  | ||||
|         document.save() | ||||
|  | ||||
|         self.log("info", "Completed") | ||||
|         self.log("debug", "Completed") | ||||
|  | ||||
|         return document | ||||
|  | ||||
|   | ||||
| @@ -2,15 +2,7 @@ import logging | ||||
|  | ||||
|  | ||||
| class PaperlessLogger(logging.StreamHandler): | ||||
|     """ | ||||
|     A logger smart enough to know to log some kinds of messages to the database | ||||
|     for later retrieval in a pretty interface. | ||||
|     """ | ||||
|  | ||||
|     def emit(self, record): | ||||
|  | ||||
|         logging.StreamHandler.emit(self, record) | ||||
|  | ||||
|         # We have to do the import here or Django will barf when it tries to | ||||
|         # load this because the apps aren't loaded at that point | ||||
|         from .models import Log | ||||
|   | ||||
| @@ -3,7 +3,6 @@ | ||||
| import logging | ||||
| import os | ||||
| import re | ||||
| import uuid | ||||
| from collections import OrderedDict | ||||
|  | ||||
| import dateutil.parser | ||||
|   | ||||
| @@ -39,11 +39,11 @@ class DocumentParser: | ||||
|     `paperless_tesseract.parsers` for inspiration. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, path): | ||||
|     def __init__(self, path, logging_group): | ||||
|         self.document_path = path | ||||
|         self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||
|         self.logger = logging.getLogger(__name__) | ||||
|         self.logging_group = None | ||||
|         self.logging_group = logging_group | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|         """ | ||||
|   | ||||
| @@ -56,6 +56,7 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None | ||||
|         'Assigning correspondent "{}" to "{}" '.format(selected, document), | ||||
|         logging_group | ||||
|     ) | ||||
|     # TODO: during consumption, this saves even though no updates have been made | ||||
|  | ||||
|     document.correspondent = selected | ||||
|     document.save(update_fields=("correspondent",)) | ||||
|   | ||||
| @@ -239,14 +239,14 @@ LOGGING = { | ||||
|     "version": 1, | ||||
|     "disable_existing_loggers": False, | ||||
|     "handlers": { | ||||
|         "consumer": { | ||||
|         "dblogger": { | ||||
|             "class": "documents.loggers.PaperlessLogger", | ||||
|         } | ||||
|     }, | ||||
|     "loggers": { | ||||
|         "documents": { | ||||
|             "handlers": ["consumer"], | ||||
|             "level": os.getenv("PAPERLESS_CONSUMER_LOG_LEVEL", "INFO"), | ||||
|             "handlers": ["dblogger"], | ||||
|             "level": "DEBUG" | ||||
|         }, | ||||
|     }, | ||||
| } | ||||
| @@ -260,7 +260,7 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | ||||
| OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", 4)) | ||||
|  | ||||
| # OCR all documents? | ||||
| OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS") | ||||
| OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", False) | ||||
|  | ||||
|  | ||||
| # GNUPG needs a home directory for some reason | ||||
|   | ||||
| @@ -8,9 +8,7 @@ import langdetect | ||||
| import pyocr | ||||
| from django.conf import settings | ||||
| from PIL import Image | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
| from pyocr.tesseract import TesseractError | ||||
| from pyocr import PyocrException | ||||
|  | ||||
| import pdftotext | ||||
| from documents.parsers import DocumentParser, ParseError | ||||
| @@ -28,8 +26,8 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, path): | ||||
|         super().__init__(path) | ||||
|     def __init__(self, path, logging_group): | ||||
|         super().__init__(path, logging_group) | ||||
|         self._text = None | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
| @@ -53,11 +51,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         except ParseError: | ||||
|             # if convert fails, fall back to extracting | ||||
|             # the first PDF page as a PNG using Ghostscript | ||||
|             self.log( | ||||
|                 "warning", | ||||
|                 "Thumbnail generation with ImageMagick failed, " | ||||
|                 "falling back to Ghostscript." | ||||
|             ) | ||||
|             self.log('warning', 'Thumbnail generation with ImageMagick failed, falling back to ghostscript. Check your /etc/ImageMagick-x/policy.xml!') | ||||
|             gs_out_path = os.path.join(self.tempdir, "gs_out.png") | ||||
|             cmd = [settings.GS_BINARY, | ||||
|                    "-q", | ||||
| @@ -100,9 +94,33 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|  | ||||
|         images = self._get_greyscale() | ||||
|  | ||||
|         if not images: | ||||
|             raise ParseError("Empty document, nothing to do.") | ||||
|  | ||||
|         try: | ||||
|             self._text = self._get_ocr(images) | ||||
|  | ||||
|             sample_page_index = int(len(images) / 2) | ||||
|             self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index+1, len(images))) | ||||
|             sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0] | ||||
|             guessed_language = self._guess_language(sample_page_text) | ||||
|  | ||||
|             if not guessed_language or guessed_language not in ISO639: | ||||
|                 self.log("warning", "Language detection failed.") | ||||
|                 ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) | ||||
|             elif ISO639[guessed_language] == settings.OCR_LANGUAGE: | ||||
|                 self.log("info", "Detected language: {} (default language)".format(guessed_language)) | ||||
|                 ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) | ||||
|             elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): | ||||
|                 self.log("warning","Detected language {} is not available on this system.".format(guessed_language)) | ||||
|                 ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) | ||||
|             else: | ||||
|                 self.log("info","Detected language: {}".format(guessed_language)) | ||||
|                 ocr_pages = self._ocr(images, ISO639[guessed_language]) | ||||
|  | ||||
|             self.log("info", "OCR completed.") | ||||
|             self._text = strip_excess_whitespace(" ".join(ocr_pages)) | ||||
|             return self._text | ||||
|  | ||||
|         except OCRError as e: | ||||
|             raise ParseError(e) | ||||
|  | ||||
| @@ -111,6 +129,8 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         Greyscale images are easier for Tesseract to OCR | ||||
|         """ | ||||
|  | ||||
|         self.log("info", "Converting document {} into greyscale images...".format(self.document_path)) | ||||
|  | ||||
|         # Convert PDF to multiple PNMs | ||||
|         pnm = os.path.join(self.tempdir, "convert-%04d.pnm") | ||||
|         run_convert( | ||||
| @@ -127,91 +147,43 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|             if f.endswith(".pnm"): | ||||
|                 pnms.append(os.path.join(self.tempdir, f)) | ||||
|  | ||||
|         self.log("info", "Running unpaper on {} pages...".format(len(pnms))) | ||||
|  | ||||
|         # Run unpaper in parallel on converted images | ||||
|         with Pool(processes=settings.OCR_THREADS) as pool: | ||||
|             pool.map(run_unpaper, itertools.product([settings.UNPAPER_BINARY], pnms)) | ||||
|  | ||||
|         # Return list of converted images, processed with unpaper | ||||
|         pnms = [] | ||||
|         for f in os.listdir(self.tempdir): | ||||
|             if f.endswith(".unpaper.pnm"): | ||||
|                 pnms.append(os.path.join(self.tempdir, f)) | ||||
|             pnms = pool.map(run_unpaper, pnms) | ||||
|  | ||||
|         return sorted(filter(lambda __: os.path.isfile(__), pnms)) | ||||
|  | ||||
|     def _guess_language(self, text): | ||||
|         try: | ||||
|             guess = langdetect.detect(text) | ||||
|             self.log("debug", "Language detected: {}".format(guess)) | ||||
|             return guess | ||||
|         except Exception as e: | ||||
|             self.log("warning", "Language detection error: {}".format(e)) | ||||
|  | ||||
|     def _get_ocr(self, imgs): | ||||
|         """ | ||||
|         Attempts to do the best job possible OCR'ing the document based on | ||||
|         simple language detection trial & error. | ||||
|         """ | ||||
|  | ||||
|         if not imgs: | ||||
|             raise OCRError("Empty document, nothing to do.") | ||||
|  | ||||
|         self.log("info", "OCRing the document") | ||||
|  | ||||
|         # Since the division gets rounded down by int, this calculation works | ||||
|         # for every edge-case, i.e. 1 | ||||
|         middle = int(len(imgs) / 2) | ||||
|         raw_text = self._ocr([imgs[middle]], settings.OCR_LANGUAGE) | ||||
|         guessed_language = self._guess_language(raw_text) | ||||
|  | ||||
|         if not guessed_language or guessed_language not in ISO639: | ||||
|             self.log("warning", "Language detection failed!") | ||||
|  | ||||
|             raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||
|             return raw_text | ||||
|  | ||||
|         if ISO639[guessed_language] == settings.OCR_LANGUAGE: | ||||
|             raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||
|             return raw_text | ||||
|  | ||||
|         try: | ||||
|             return self._ocr(imgs, ISO639[guessed_language]) | ||||
|         except pyocr.pyocr.tesseract.TesseractError: | ||||
|             self.log( | ||||
|                 "warning", | ||||
|                 "OCR for {} failed, but we're going to stick with what " | ||||
|                 "we've got since FORGIVING_OCR is enabled.".format( | ||||
|                     guessed_language | ||||
|                 ) | ||||
|             ) | ||||
|             raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||
|             return raw_text | ||||
|             return None | ||||
|  | ||||
|     def _ocr(self, imgs, lang): | ||||
|         """ | ||||
|         Performs a single OCR attempt. | ||||
|         """ | ||||
|  | ||||
|         if not imgs: | ||||
|             return "" | ||||
|  | ||||
|         self.log("info", "Parsing for {}".format(lang)) | ||||
|  | ||||
|         self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang)) | ||||
|         with Pool(processes=settings.OCR_THREADS) as pool: | ||||
|             r = pool.map(image_to_string, itertools.product(imgs, [lang])) | ||||
|             r = " ".join(r) | ||||
|             return r | ||||
|  | ||||
|         # Strip out excess white space to allow matching to go smoother | ||||
|         return strip_excess_whitespace(r) | ||||
|  | ||||
|     def _assemble_ocr_sections(self, imgs, middle, text): | ||||
|     def _complete_ocr_default_language(self, images, sample_page_index, sample_page): | ||||
|         """ | ||||
|         Given a `middle` value and the text that middle page represents, we OCR | ||||
|         the remainder of the document and return the whole thing. | ||||
|         """ | ||||
|         text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text | ||||
|         text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE) | ||||
|         return text | ||||
|         # text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text | ||||
|         # text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE) | ||||
|         images_copy = list(images) | ||||
|         del images_copy[sample_page_index] | ||||
|         if images_copy: | ||||
|             self.log('info', 'Continuing ocr with default language.') | ||||
|             ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE) | ||||
|             ocr_pages.insert(sample_page_index, sample_page) | ||||
|             return ocr_pages | ||||
|         else: | ||||
|             return [sample_page] | ||||
|  | ||||
|  | ||||
| def run_convert(*args): | ||||
| @@ -225,13 +197,16 @@ def run_convert(*args): | ||||
|         raise ParseError("Convert failed at {}".format(args)) | ||||
|  | ||||
|  | ||||
| def run_unpaper(args): | ||||
|     unpaper, pnm = args | ||||
|     command_args = (unpaper, "--overwrite", "--quiet", pnm, | ||||
|                     pnm.replace(".pnm", ".unpaper.pnm")) | ||||
| def run_unpaper(pnm): | ||||
|     pnm_out = pnm.replace(".pnm", ".unpaper.pnm") | ||||
|  | ||||
|     command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm, | ||||
|                     pnm_out) | ||||
|     if not subprocess.Popen(command_args).wait() == 0: | ||||
|         raise ParseError("Unpaper failed at {}".format(command_args)) | ||||
|  | ||||
|     return pnm_out | ||||
|  | ||||
|  | ||||
| def strip_excess_whitespace(text): | ||||
|     collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) | ||||
| @@ -245,14 +220,18 @@ def strip_excess_whitespace(text): | ||||
| def image_to_string(args): | ||||
|     img, lang = args | ||||
|     ocr = pyocr.get_available_tools()[0] | ||||
|     with Image.open(os.path.join(settings.SCRATCH_DIR, img)) as f: | ||||
|     with Image.open(img) as f: | ||||
|         if ocr.can_detect_orientation(): | ||||
|             try: | ||||
|                 orientation = ocr.detect_orientation(f, lang=lang) | ||||
|                 f = f.rotate(orientation["angle"], expand=1) | ||||
|             except (TesseractError, OtherTesseractError, AttributeError): | ||||
|             except Exception: | ||||
|                 # Rotation not possible, ignore | ||||
|                 pass | ||||
|         try: | ||||
|             return ocr.image_to_string(f, lang=lang) | ||||
|         except PyocrException as e: | ||||
|             raise OCRError(e) | ||||
|  | ||||
|  | ||||
| def get_text_from_pdf(pdf_file): | ||||
|   | ||||
| @@ -11,14 +11,8 @@ class TextDocumentParser(DocumentParser): | ||||
|     This parser directly parses a text document (.txt, .md, or .csv) | ||||
|     """ | ||||
|  | ||||
|     CONVERT = settings.CONVERT_BINARY | ||||
|     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None | ||||
|     UNPAPER = settings.UNPAPER_BINARY | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
|     OCR_ALWAYS = settings.OCR_ALWAYS | ||||
|  | ||||
|     def __init__(self, path): | ||||
|         super().__init__(path) | ||||
|     def __init__(self, path, logging_group): | ||||
|         super().__init__(path, logging_group) | ||||
|         self._text = None | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
| @@ -44,7 +38,7 @@ class TextDocumentParser(DocumentParser): | ||||
|             r = str(round(psize[0] / 10)) | ||||
|             rounded = ",".join([r, r]) | ||||
|             run_command( | ||||
|                 self.CONVERT, | ||||
|                 settings.CONVERT_BINARY, | ||||
|                 "-size ", picsize, | ||||
|                 ' xc:none -draw ', | ||||
|                 '"fill ', bg_color, ' roundrectangle 0,0,', work_size, ",", rounded, '" ',  # NOQA: E501 | ||||
| @@ -59,7 +53,7 @@ class TextDocumentParser(DocumentParser): | ||||
|  | ||||
|         def create_txlayer(): | ||||
|             run_command( | ||||
|                 self.CONVERT, | ||||
|                 settings.CONVERT_BINARY, | ||||
|                 "-background none", | ||||
|                 "-fill", | ||||
|                 text_color, | ||||
| @@ -73,7 +67,7 @@ class TextDocumentParser(DocumentParser): | ||||
|         create_txlayer() | ||||
|         create_bg() | ||||
|         run_command( | ||||
|             self.CONVERT, | ||||
|             settings.CONVERT_BINARY, | ||||
|             temp_bg, | ||||
|             temp_txlayer, | ||||
|             "-background None -layers merge ", | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Jonas Winkler
					Jonas Winkler