mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	feat: refactor for pluggable consumers
I've broken out the OCR-specific code from the consumers and dumped it
all into its own app, `paperless_tesseract`.  This new app should serve
as a sample of how to create one's own consumer for different file
types.
Documentation for how to do this isn't ready yet, but for the impatient:
* Create a new app
    * containing a `parsers.py` for your parser modelled after
      `paperless_tesseract.parsers.RasterisedDocumentParser`
    * containing a `signals.py` with a handler modelled after
      `paperless_tesseract.signals.ConsumerDeclaration`
    * connect the signal handler to
      `documents.signals.document_consumer_declaration` in
      `your_app.apps`
* Install the app into Paperless by declaring
  `PAPERLESS_INSTALLED_APPS=your_app`.  Additional apps should be
  separated with commas.
* Restart the consumer
			
			
This commit is contained in:
		| @@ -1,35 +1,21 @@ | ||||
| import datetime | ||||
| import hashlib | ||||
| import logging | ||||
| import os | ||||
| import re | ||||
| import uuid | ||||
| import shutil | ||||
| import hashlib | ||||
| import logging | ||||
| import datetime | ||||
| import tempfile | ||||
| import itertools | ||||
| import subprocess | ||||
| from multiprocessing.pool import Pool | ||||
|  | ||||
| import pyocr | ||||
| import langdetect | ||||
| from PIL import Image | ||||
| from django.conf import settings | ||||
| from django.utils import timezone | ||||
| from paperless.db import GnuPG | ||||
| from pyocr.tesseract import TesseractError | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
|  | ||||
| from .models import Tag, Document, FileInfo | ||||
| from .models import Document, FileInfo, Tag | ||||
| from .parsers import ParseError | ||||
| from .signals import ( | ||||
|     document_consumption_started, | ||||
|     document_consumption_finished | ||||
|     document_consumer_declaration, | ||||
|     document_consumption_finished, | ||||
|     document_consumption_started | ||||
| ) | ||||
| from .languages import ISO639 | ||||
|  | ||||
|  | ||||
| class OCRError(Exception): | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class ConsumerError(Exception): | ||||
| @@ -47,13 +33,7 @@ class Consumer(object): | ||||
|     """ | ||||
|  | ||||
|     SCRATCH = settings.SCRATCH_DIR | ||||
|     CONVERT = settings.CONVERT_BINARY | ||||
|     UNPAPER = settings.UNPAPER_BINARY | ||||
|     CONSUME = settings.CONSUMPTION_DIR | ||||
|     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None | ||||
|     DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 | ||||
|  | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
|  | ||||
|     def __init__(self): | ||||
|  | ||||
| @@ -78,6 +58,16 @@ class Consumer(object): | ||||
|             raise ConsumerError( | ||||
|                 "Consumption directory {} does not exist".format(self.CONSUME)) | ||||
|  | ||||
|         self.parsers = [] | ||||
|         for response in document_consumer_declaration.send(self): | ||||
|             self.parsers.append(response[1]) | ||||
|  | ||||
|         if not self.parsers: | ||||
|             raise ConsumerError( | ||||
|                 "No parsers could be found, not even the default.  " | ||||
|                 "This is a problem." | ||||
|             ) | ||||
|  | ||||
|     def log(self, level, message): | ||||
|         getattr(self.logger, level)(message, extra={ | ||||
|             "group": self.logging_group | ||||
| @@ -109,6 +99,13 @@ class Consumer(object): | ||||
|                 self._ignore.append(doc) | ||||
|                 continue | ||||
|  | ||||
|             parser_class = self._get_parser_class(doc) | ||||
|             if not parser_class: | ||||
|                 self.log( | ||||
|                     "info", "No parsers could be found for {}".format(doc)) | ||||
|                 self._ignore.append(doc) | ||||
|                 continue | ||||
|  | ||||
|             self.logging_group = uuid.uuid4() | ||||
|  | ||||
|             self.log("info", "Consuming {}".format(doc)) | ||||
| @@ -119,25 +116,26 @@ class Consumer(object): | ||||
|                 logging_group=self.logging_group | ||||
|             ) | ||||
|  | ||||
|             tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) | ||||
|             imgs = self._get_greyscale(tempdir, doc) | ||||
|             thumbnail = self._get_thumbnail(tempdir, doc) | ||||
|             parsed_document = parser_class(doc) | ||||
|             thumbnail = parsed_document.get_thumbnail() | ||||
|  | ||||
|             try: | ||||
|  | ||||
|                 document = self._store(self._get_ocr(imgs), doc, thumbnail) | ||||
|  | ||||
|             except OCRError as e: | ||||
|                 document = self._store( | ||||
|                     parsed_document.get_text(), | ||||
|                     doc, | ||||
|                     thumbnail | ||||
|                 ) | ||||
|             except ParseError as e: | ||||
|  | ||||
|                 self._ignore.append(doc) | ||||
|                 self.log("error", "OCR FAILURE for {}: {}".format(doc, e)) | ||||
|                 self._cleanup_tempdir(tempdir) | ||||
|                 self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) | ||||
|                 parsed_document.cleanup() | ||||
|  | ||||
|                 continue | ||||
|  | ||||
|             else: | ||||
|  | ||||
|                 self._cleanup_tempdir(tempdir) | ||||
|                 parsed_document.cleanup() | ||||
|                 self._cleanup_doc(doc) | ||||
|  | ||||
|                 self.log( | ||||
| @@ -151,142 +149,20 @@ class Consumer(object): | ||||
|                     logging_group=self.logging_group | ||||
|                 ) | ||||
|  | ||||
|     def _get_greyscale(self, tempdir, doc): | ||||
|     def _get_parser_class(self, doc): | ||||
|         """ | ||||
|         Greyscale images are easier for Tesseract to OCR | ||||
|         Determine the appropriate parser class based on the file | ||||
|         """ | ||||
|  | ||||
|         self.log("info", "Generating greyscale image from {}".format(doc)) | ||||
|         options = [] | ||||
|         for parser in self.parsers: | ||||
|             result = parser(doc) | ||||
|             if result: | ||||
|                 options.append(result) | ||||
|  | ||||
|         # Convert PDF to multiple PNMs | ||||
|         pnm = os.path.join(tempdir, "convert-%04d.pnm") | ||||
|         run_convert( | ||||
|             self.CONVERT, | ||||
|             "-density", str(self.DENSITY), | ||||
|             "-depth", "8", | ||||
|             "-type", "grayscale", | ||||
|             doc, pnm, | ||||
|         ) | ||||
|  | ||||
|         # Get a list of converted images | ||||
|         pnms = [] | ||||
|         for f in os.listdir(tempdir): | ||||
|             if f.endswith(".pnm"): | ||||
|                 pnms.append(os.path.join(tempdir, f)) | ||||
|  | ||||
|         # Run unpaper in parallel on converted images | ||||
|         with Pool(processes=self.THREADS) as pool: | ||||
|             pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms)) | ||||
|  | ||||
|         # Return list of converted images, processed with unpaper | ||||
|         pnms = [] | ||||
|         for f in os.listdir(tempdir): | ||||
|             if f.endswith(".unpaper.pnm"): | ||||
|                 pnms.append(os.path.join(tempdir, f)) | ||||
|  | ||||
|         return sorted(filter(lambda __: os.path.isfile(__), pnms)) | ||||
|  | ||||
|     def _get_thumbnail(self, tempdir, doc): | ||||
|         """ | ||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||
|         """ | ||||
|  | ||||
|         self.log("info", "Generating the thumbnail") | ||||
|  | ||||
|         run_convert( | ||||
|             self.CONVERT, | ||||
|             "-scale", "500x5000", | ||||
|             "-alpha", "remove", | ||||
|             doc, os.path.join(tempdir, "convert-%04d.png") | ||||
|         ) | ||||
|  | ||||
|         return os.path.join(tempdir, "convert-0000.png") | ||||
|  | ||||
|     def _guess_language(self, text): | ||||
|         try: | ||||
|             guess = langdetect.detect(text) | ||||
|             self.log("debug", "Language detected: {}".format(guess)) | ||||
|             return guess | ||||
|         except Exception as e: | ||||
|             self.log("warning", "Language detection error: {}".format(e)) | ||||
|  | ||||
|     def _get_ocr(self, imgs): | ||||
|         """ | ||||
|         Attempts to do the best job possible OCR'ing the document based on | ||||
|         simple language detection trial & error. | ||||
|         """ | ||||
|  | ||||
|         if not imgs: | ||||
|             raise OCRError("No images found") | ||||
|  | ||||
|         self.log("info", "OCRing the document") | ||||
|  | ||||
|         # Since the division gets rounded down by int, this calculation works | ||||
|         # for every edge-case, i.e. 1 | ||||
|         middle = int(len(imgs) / 2) | ||||
|         raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE) | ||||
|  | ||||
|         guessed_language = self._guess_language(raw_text) | ||||
|  | ||||
|         if not guessed_language or guessed_language not in ISO639: | ||||
|             self.log("warning", "Language detection failed!") | ||||
|             if settings.FORGIVING_OCR: | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     "As FORGIVING_OCR is enabled, we're going to make the " | ||||
|                     "best with what we have." | ||||
|                 ) | ||||
|                 raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||
|                 return raw_text | ||||
|             raise OCRError("Language detection failed") | ||||
|  | ||||
|         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: | ||||
|             raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||
|             return raw_text | ||||
|  | ||||
|         try: | ||||
|             return self._ocr(imgs, ISO639[guessed_language]) | ||||
|         except pyocr.pyocr.tesseract.TesseractError: | ||||
|             if settings.FORGIVING_OCR: | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     "OCR for {} failed, but we're going to stick with what " | ||||
|                     "we've got since FORGIVING_OCR is enabled.".format( | ||||
|                         guessed_language | ||||
|                     ) | ||||
|                 ) | ||||
|                 raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||
|                 return raw_text | ||||
|             raise OCRError( | ||||
|                 "The guessed language is not available in this instance of " | ||||
|                 "Tesseract." | ||||
|             ) | ||||
|  | ||||
|     def _assemble_ocr_sections(self, imgs, middle, text): | ||||
|         """ | ||||
|         Given a `middle` value and the text that middle page represents, we OCR | ||||
|         the remainder of the document and return the whole thing. | ||||
|         """ | ||||
|         text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text | ||||
|         text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) | ||||
|         return text | ||||
|  | ||||
|     def _ocr(self, imgs, lang): | ||||
|         """ | ||||
|         Performs a single OCR attempt. | ||||
|         """ | ||||
|  | ||||
|         if not imgs: | ||||
|             return "" | ||||
|  | ||||
|         self.log("info", "Parsing for {}".format(lang)) | ||||
|  | ||||
|         with Pool(processes=self.THREADS) as pool: | ||||
|             r = pool.map(image_to_string, itertools.product(imgs, [lang])) | ||||
|             r = " ".join(r) | ||||
|  | ||||
|         # Strip out excess white space to allow matching to go smoother | ||||
|         return strip_excess_whitespace(r) | ||||
|         # Return the parser with the highest weight. | ||||
|         return sorted( | ||||
|             options, key=lambda _: _["weight"], reverse=True)[0]["parser"] | ||||
|  | ||||
|     def _store(self, text, doc, thumbnail): | ||||
|  | ||||
| @@ -332,10 +208,6 @@ class Consumer(object): | ||||
|  | ||||
|         return document | ||||
|  | ||||
|     def _cleanup_tempdir(self, d): | ||||
|         self.log("debug", "Deleting directory {}".format(d)) | ||||
|         shutil.rmtree(d) | ||||
|  | ||||
|     def _cleanup_doc(self, doc): | ||||
|         self.log("debug", "Deleting document {}".format(doc)) | ||||
|         os.unlink(doc) | ||||
| @@ -361,41 +233,3 @@ class Consumer(object): | ||||
|         with open(doc, "rb") as f: | ||||
|             checksum = hashlib.md5(f.read()).hexdigest() | ||||
|         return Document.objects.filter(checksum=checksum).exists() | ||||
|  | ||||
|  | ||||
| def strip_excess_whitespace(text): | ||||
|     collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) | ||||
|     no_leading_whitespace = re.sub( | ||||
|         "([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) | ||||
|     no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace) | ||||
|     return no_trailing_whitespace | ||||
|  | ||||
|  | ||||
| def image_to_string(args): | ||||
|     img, lang = args | ||||
|     ocr = pyocr.get_available_tools()[0] | ||||
|     with Image.open(os.path.join(Consumer.SCRATCH, img)) as f: | ||||
|         if ocr.can_detect_orientation(): | ||||
|             try: | ||||
|                 orientation = ocr.detect_orientation(f, lang=lang) | ||||
|                 f = f.rotate(orientation["angle"], expand=1) | ||||
|             except (TesseractError, OtherTesseractError): | ||||
|                 pass | ||||
|         return ocr.image_to_string(f, lang=lang) | ||||
|  | ||||
|  | ||||
| def run_unpaper(args): | ||||
|     unpaper, pnm = args | ||||
|     subprocess.Popen( | ||||
|         (unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait() | ||||
|  | ||||
|  | ||||
| def run_convert(*args): | ||||
|  | ||||
|     environment = os.environ.copy() | ||||
|     if settings.CONVERT_MEMORY_LIMIT: | ||||
|         environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT | ||||
|     if settings.CONVERT_TMPDIR: | ||||
|         environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR | ||||
|  | ||||
|     subprocess.Popen(args, env=environment).wait() | ||||
|   | ||||
							
								
								
									
										45
									
								
								src/documents/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								src/documents/parsers.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,45 @@ | ||||
| import logging | ||||
| import shutil | ||||
| import tempfile | ||||
|  | ||||
| from django.conf import settings | ||||
|  | ||||
|  | ||||
class ParseError(Exception):
    """Signals that a parser failed to process a document."""
|  | ||||
|  | ||||
class DocumentParser(object):
    """
    Base class for document parsers.

    Subclass it and implement `get_thumbnail()` and `get_text()`.  Have a
    look at `paperless_tesseract.parsers` for inspiration.
    """

    SCRATCH = settings.SCRATCH_DIR

    def __init__(self, path):
        """
        Remember the document's path and create a private working directory
        under SCRATCH; `cleanup()` removes that directory again.
        """
        self.logging_group = None
        self.logger = logging.getLogger(__name__)
        self.document_path = path
        self.tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)

    def get_thumbnail(self):
        """
        Return the path to a file usable as a thumbnail for this document.
        Must be provided by the subclass.
        """
        raise NotImplementedError

    def get_text(self):
        """
        Return the text of the document, and nothing but the text.
        Must be provided by the subclass.
        """
        raise NotImplementedError

    def log(self, level, message):
        """
        Send `message` to the logger method named by `level` (e.g. "debug"),
        tagging the record with the current logging group.
        """
        emit = getattr(self.logger, level)
        emit(message, extra={"group": self.logging_group})

    def cleanup(self):
        """
        Delete this parser's working directory and everything inside it.
        """
        target = self.tempdir
        self.log("debug", "Deleting directory {}".format(target))
        shutil.rmtree(target)
| @@ -2,3 +2,4 @@ from django.dispatch import Signal | ||||
|  | ||||
| document_consumption_started = Signal(providing_args=["filename"]) | ||||
| document_consumption_finished = Signal(providing_args=["document"]) | ||||
| document_consumer_declaration = Signal(providing_args=[]) | ||||
|   | ||||
| @@ -1,6 +1,5 @@ | ||||
| import logging | ||||
| import os | ||||
|  | ||||
| from subprocess import Popen | ||||
|  | ||||
| from django.conf import settings | ||||
|   | ||||
| @@ -1,13 +1,6 @@ | ||||
| import os | ||||
| from unittest import mock, skipIf | ||||
|  | ||||
| import pyocr | ||||
| from django.test import TestCase | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
|  | ||||
| from ..models import FileInfo | ||||
| from ..consumer import image_to_string, strip_excess_whitespace | ||||
|  | ||||
|  | ||||
| class TestAttributes(TestCase): | ||||
| @@ -310,69 +303,3 @@ class TestFieldPermutations(TestCase): | ||||
|                             template.format(**spec), **spec) | ||||
|  | ||||
|  | ||||
| class FakeTesseract(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def can_detect_orientation(): | ||||
|         return True | ||||
|  | ||||
|     @staticmethod | ||||
|     def detect_orientation(file_handle, lang): | ||||
|         raise OtherTesseractError("arbitrary status", "message") | ||||
|  | ||||
|     @staticmethod | ||||
|     def image_to_string(file_handle, lang): | ||||
|         return "This is test text" | ||||
|  | ||||
|  | ||||
| class FakePyOcr(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_available_tools(): | ||||
|         return [FakeTesseract] | ||||
|  | ||||
|  | ||||
| class TestOCR(TestCase): | ||||
|  | ||||
|     text_cases = [ | ||||
|         ("simple     string", "simple string"), | ||||
|         ( | ||||
|             "simple    newline\n   testing string", | ||||
|             "simple newline\ntesting string" | ||||
|         ), | ||||
|         ( | ||||
|             "utf-8   строка с пробелами в конце  ", | ||||
|             "utf-8 строка с пробелами в конце" | ||||
|         ) | ||||
|     ] | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|     TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) | ||||
|  | ||||
|     def test_strip_excess_whitespace(self): | ||||
|         for source, result in self.text_cases: | ||||
|             actual_result = strip_excess_whitespace(source) | ||||
|             self.assertEqual( | ||||
|                 result, | ||||
|                 actual_result, | ||||
|                 "strip_exceess_whitespace({}) != '{}', but '{}'".format( | ||||
|                     source, | ||||
|                     result, | ||||
|                     actual_result | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|     @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") | ||||
|     @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES) | ||||
|     @mock.patch("documents.consumer.pyocr", FakePyOcr) | ||||
|     def test_image_to_string_with_text_free_page(self): | ||||
|         """ | ||||
|         This test is sort of silly, since it's really just reproducing an odd | ||||
|         exception thrown by pyocr when it encounters a page with no text. | ||||
|         Actually running this test against an installation of Tesseract results | ||||
|         in a segmentation fault rooted somewhere deep inside pyocr where I | ||||
|         don't care to dig.  Regardless, if you run the consumer normally, | ||||
|         text-free pages are now handled correctly so long as we work around | ||||
|         this weird exception. | ||||
|         """ | ||||
|         image_to_string(["no-text.png", "en"]) | ||||
|   | ||||
| @@ -61,6 +61,7 @@ INSTALLED_APPS = [ | ||||
|     "django_extensions", | ||||
|  | ||||
|     "documents.apps.DocumentsConfig", | ||||
|     "paperless_tesseract.apps.PaperlessTesseractConfig", | ||||
|  | ||||
|     "flat_responsive", | ||||
|     "django.contrib.admin", | ||||
| @@ -70,6 +71,9 @@ INSTALLED_APPS = [ | ||||
|  | ||||
| ] | ||||
|  | ||||
| if os.getenv("PAPERLESS_INSTALLED_APPS"): | ||||
|     INSTALLED_APPS += os.getenv("PAPERLESS_INSTALLED_APPS").split(",") | ||||
|  | ||||
| MIDDLEWARE_CLASSES = [ | ||||
|     'django.middleware.security.SecurityMiddleware', | ||||
|     'django.contrib.sessions.middleware.SessionMiddleware', | ||||
|   | ||||
							
								
								
									
										0
									
								
								src/paperless_tesseract/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless_tesseract/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										16
									
								
								src/paperless_tesseract/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								src/paperless_tesseract/apps.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | ||||
| from django.apps import AppConfig | ||||
|  | ||||
|  | ||||
class PaperlessTesseractConfig(AppConfig):
    """
    App configuration that registers the Tesseract parser declaration with
    the `document_consumer_declaration` signal.
    """

    name = "paperless_tesseract"

    def ready(self):
        """
        Connect `ConsumerDeclaration.handle` to the declaration signal, then
        defer to the base class.
        """
        # Both imports are kept local to this method, as in the original.
        from .signals import ConsumerDeclaration
        from documents.signals import document_consumer_declaration

        handler = ConsumerDeclaration.handle
        document_consumer_declaration.connect(handler)

        super().ready()
							
								
								
									
										214
									
								
								src/paperless_tesseract/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										214
									
								
								src/paperless_tesseract/parsers.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,214 @@ | ||||
| import itertools | ||||
| import os | ||||
| import re | ||||
| import subprocess | ||||
| from multiprocessing.pool import Pool | ||||
|  | ||||
| import langdetect | ||||
| import pyocr | ||||
| from django.conf import settings | ||||
| from documents.parsers import DocumentParser, ParseError | ||||
| from PIL import Image | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
| from pyocr.tesseract import TesseractError | ||||
|  | ||||
| from .languages import ISO639 | ||||
|  | ||||
|  | ||||
class OCRError(Exception):
    """Raised inside the Tesseract parser when OCR cannot produce text."""
|  | ||||
|  | ||||
class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

    CONVERT = settings.CONVERT_BINARY
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    def get_thumbnail(self):
        """
        Render the document with `convert`, scaled to fit 500x5000, and
        return the path of the first page's PNG inside the temporary
        directory.
        """

        run_convert(
            self.CONVERT,
            "-scale", "500x5000",
            "-alpha", "remove",
            self.document_path, os.path.join(self.tempdir, "convert-%04d.png")
        )

        return os.path.join(self.tempdir, "convert-0000.png")

    def get_text(self):
        """
        Return the OCR'd text of the document.

        Raises `ParseError` when OCR fails; the original `OCRError` is kept
        as the exception's cause so the traceback shows where it came from.
        """

        images = self._get_greyscale()

        try:
            return self._get_ocr(images)
        except OCRError as e:
            raise ParseError(e) from e

    def _tempdir_files(self, suffix):
        """
        Return the full paths of the temporary directory's entries whose
        names end with `suffix`.
        """
        return [
            os.path.join(self.tempdir, name)
            for name in os.listdir(self.tempdir)
            if name.endswith(suffix)
        ]

    def _get_greyscale(self):
        """
        Greyscale images are easier for Tesseract to OCR.

        Converts the document into one greyscale PNM per page, runs unpaper
        over each of them, and returns the sorted list of unpaper'd files.
        """

        # Convert PDF to multiple PNMs
        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
        run_convert(
            self.CONVERT,
            "-density", str(self.DENSITY),
            "-depth", "8",
            "-type", "grayscale",
            self.document_path, pnm,
        )

        # Run unpaper in parallel on converted images
        pnms = self._tempdir_files(".pnm")
        with Pool(processes=self.THREADS) as pool:
            pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))

        # Return list of converted images, processed with unpaper
        return sorted(
            path
            for path in self._tempdir_files(".unpaper.pnm")
            if os.path.isfile(path)
        )

    def _guess_language(self, text):
        """
        Return the language code langdetect reports for `text`, or None when
        detection raises (the error is logged as a warning).
        """
        try:
            guess = langdetect.detect(text)
            self.log("debug", "Language detected: {}".format(guess))
            return guess
        except Exception as e:
            # Deliberately best-effort: the caller treats None as "unknown".
            self.log("warning", "Language detection error: {}".format(e))
            return None

    def _get_ocr(self, imgs):
        """
        Attempts to do the best job possible OCR'ing the document based on
        simple language detection trial & error.

        Raises `OCRError` when there are no images, when the language cannot
        be determined, or when OCR in the guessed language fails -- unless
        `settings.FORGIVING_OCR` is set, in which case the text produced with
        the default language is returned instead.
        """

        if not imgs:
            raise OCRError("No images found")

        self.log("info", "OCRing the document")

        # Floor division yields a valid index for every non-empty list,
        # including a single page (index 0).
        middle = len(imgs) // 2
        raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)

        guessed_language = self._guess_language(raw_text)

        if not guessed_language or guessed_language not in ISO639:
            self.log("warning", "Language detection failed!")
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "As FORGIVING_OCR is enabled, we're going to make the "
                    "best with what we have."
                )
                return self._assemble_ocr_sections(imgs, middle, raw_text)
            raise OCRError("Language detection failed")

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            # The middle page is already done in the right language; only
            # the remaining pages need OCR'ing.
            return self._assemble_ocr_sections(imgs, middle, raw_text)

        try:
            return self._ocr(imgs, ISO639[guessed_language])
        except (TesseractError, OtherTesseractError) as e:
            # Both pyocr backends' error classes are handled, matching
            # image_to_string() in this module.
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    )
                )
                return self._assemble_ocr_sections(imgs, middle, raw_text)
            raise OCRError(
                "The guessed language is not available in this instance of "
                "Tesseract."
            ) from e

    def _ocr(self, imgs, lang):
        """
        Performs a single OCR attempt: every image is OCR'd in `lang` by a
        worker pool and the page texts are joined with single spaces.
        Returns "" when `imgs` is empty.
        """

        if not imgs:
            return ""

        self.log("info", "Parsing for {}".format(lang))

        with Pool(processes=self.THREADS) as pool:
            r = pool.map(image_to_string, itertools.product(imgs, [lang]))
            r = " ".join(r)

        # Strip out excess white space to allow matching to go smoother
        return strip_excess_whitespace(r)

    def _assemble_ocr_sections(self, imgs, middle, text):
        """
        Given a `middle` value and the text that middle page represents, we OCR
        the remainder of the document and return the whole thing.
        """
        text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
        return text
|  | ||||
|  | ||||
def run_convert(*args):
    """
    Run `args` as a command and wait for it to finish.

    The child inherits the current environment; MAGICK_MEMORY_LIMIT and
    MAGICK_TMPDIR are added from the CONVERT_MEMORY_LIMIT / CONVERT_TMPDIR
    settings whenever those are set to a truthy value.
    """
    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )

    environment = os.environ.copy()
    for variable, value in overrides:
        if value:
            environment[variable] = value

    process = subprocess.Popen(args, env=environment)
    process.wait()
|  | ||||
|  | ||||
def run_unpaper(args):
    """
    Run unpaper on one image and wait for it to finish.

    `args` is an `(unpaper_binary, pnm_path)` pair, packed into a single
    argument so the function can be used with `Pool.map`.  The output is
    written next to the input as `<name>.unpaper.pnm`.
    """
    unpaper, pnm = args

    suffix = ".pnm"
    if pnm.endswith(suffix):
        # Rewrite only the trailing extension.  A blanket str.replace would
        # also rewrite ".pnm" inside a directory name and point the output
        # at a directory that does not exist.
        cleaned = pnm[:-len(suffix)] + ".unpaper" + suffix
    else:
        # Not reached by the parser, which only passes *.pnm files; the
        # previous behaviour is kept for any other caller.
        cleaned = pnm.replace(suffix, ".unpaper" + suffix)

    subprocess.Popen((unpaper, pnm, cleaned)).wait()
|  | ||||
|  | ||||
def strip_excess_whitespace(text):
    """
    Normalise the horizontal whitespace in `text`.

    Runs of non-newline whitespace collapse to a single space, whitespace
    directly following a line break is dropped, and whitespace at the very
    end of the text is removed.  Line breaks themselves are preserved.
    """
    # Raw strings throughout: "\S" in a plain literal is an invalid string
    # escape and triggers a warning on current Python versions.
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)([^\S\n\r]+)", r"\1", collapsed_spaces)
    no_trailing_whitespace = re.sub(
        r"([^\S\n\r]+)$", "", no_leading_whitespace)
    return no_trailing_whitespace
|  | ||||
|  | ||||
def image_to_string(args):
    """
    OCR a single image and return its text.

    `args` is an `(image_path, language)` pair, packed into one argument so
    the function can be mapped over a worker pool.  The first OCR tool pyocr
    reports as available is used.
    """
    img, lang = args
    ocr = pyocr.get_available_tools()[0]
    path = os.path.join(RasterisedDocumentParser.SCRATCH, img)

    with Image.open(path) as f:
        page = f
        if ocr.can_detect_orientation():
            try:
                angle = ocr.detect_orientation(f, lang=lang)["angle"]
                page = f.rotate(angle, expand=1)
            except (TesseractError, OtherTesseractError):
                # Orientation detection failed; OCR the image as it is.
                pass
        return ocr.image_to_string(page, lang=lang)
							
								
								
									
										23
									
								
								src/paperless_tesseract/signals.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								src/paperless_tesseract/signals.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | ||||
| import re | ||||
|  | ||||
| from .parsers import RasterisedDocumentParser | ||||
|  | ||||
|  | ||||
class ConsumerDeclaration(object):
    """
    Declares ``RasterisedDocumentParser`` as the parser for raster files.

    ``handle`` is the signal handler: it hands back ``test``, which the
    caller can then invoke with a document path to find out whether this
    parser applies to it.
    """

    # Raw string: "\." is not a valid Python string escape, so a plain
    # literal triggers an invalid-escape warning.  The match is
    # case-sensitive and anchored to the whole path.
    MATCHING_FILES = re.compile(r"^.*\.(pdf|jpg|gif|png|tiff|pnm|bmp)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        """Return the callable used to test a document; ignores ``sender``."""
        return cls.test

    @classmethod
    def test(cls, doc):
        """
        Check whether ``doc`` (a path string) has a supported extension.

        Returns a dict with the ``parser`` class and its ``weight`` when
        the path matches ``MATCHING_FILES``, otherwise ``None``.
        """

        if cls.MATCHING_FILES.match(doc):
            return {
                "parser": RasterisedDocumentParser,
                "weight": 0
            }

        return None
							
								
								
									
										0
									
								
								src/paperless_tesseract/tests/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless_tesseract/tests/__init__.py
									
									
									
									
									
										Normal file
									
								
							| Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB | 
							
								
								
									
										80
									
								
								src/paperless_tesseract/tests/test_ocr.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								src/paperless_tesseract/tests/test_ocr.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,80 @@ | ||||
| import os | ||||
| from unittest import mock, skipIf | ||||
|  | ||||
| import pyocr | ||||
| from django.test import TestCase | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
|  | ||||
| from ..parsers import image_to_string, strip_excess_whitespace | ||||
|  | ||||
|  | ||||
class FakeTesseract:
    """
    Stand-in for a pyocr tool whose orientation detection always fails.

    It claims to support orientation detection, raises the libtesseract
    error when asked to detect it, and returns fixed text for recognition.
    """

    @staticmethod
    def can_detect_orientation():
        # Report support so callers actually attempt detection.
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        failure = OtherTesseractError("arbitrary status", "message")
        raise failure

    @staticmethod
    def image_to_string(file_handle, lang):
        recognised = "This is test text"
        return recognised
|  | ||||
|  | ||||
class FakePyOcr:
    """Stand-in for the ``pyocr`` module that only offers FakeTesseract."""

    @staticmethod
    def get_available_tools():
        tools = [FakeTesseract]
        return tools
|  | ||||
|  | ||||
class TestOCR(TestCase):
    """Tests for the whitespace clean-up and OCR helpers in ``parsers``."""

    # Pairs of (raw input, expected output of strip_excess_whitespace).
    text_cases = [
        ("simple     string", "simple string"),
        (
            "simple    newline\n   testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8   строка с пробелами в конце  ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    # Directory holding the sample images used by the OCR test.
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    # Evaluated once at import time; used to skip the OCR test below.
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        """Every entry in ``text_cases`` is cleaned up as expected."""
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract results
        in a segmentation fault rooted somewhere deep inside pyocr where I
        don't care to dig.  Regardless, if you run the consumer normally,
        text-free pages are now handled correctly so long as we work around
        this weird exception.
        """
        text = image_to_string(["no-text.png", "en"])
        # The orientation error raised by the fake tool must be swallowed
        # and recognition must still run, yielding the fake tool's text.
        self.assertEqual(text, "This is test text")
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn