mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	feat: refactor for pluggable consumers
I've broken out the OCR-specific code from the consumers and dumped it
all into its own app, `paperless_tesseract`.  This new app should serve
as a sample of how to create one's own consumer for different file
types.
Documentation for how to do this isn't ready yet, but for the impatient:
* Create a new app
    * containing a `parsers.py` for your parser modelled after
      `paperless_tesseract.parsers.RasterisedDocumentParser`
    * containing a `signals.py` with a handler modelled after
      `paperless_tesseract.signals.ConsumerDeclaration`
    * connect the signal handler to
      `documents.signals.document_consumer_declaration` in
      `your_app.apps`
* Install the app into Paperless by declaring
  `PAPERLESS_INSTALLED_APPS=your_app`.  Additional apps should be
  separated with commas.
* Restart the consumer
			
			
This commit is contained in:
		| @@ -1,35 +1,21 @@ | |||||||
|  | import datetime | ||||||
|  | import hashlib | ||||||
|  | import logging | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
| import uuid | import uuid | ||||||
| import shutil |  | ||||||
| import hashlib |  | ||||||
| import logging |  | ||||||
| import datetime |  | ||||||
| import tempfile |  | ||||||
| import itertools |  | ||||||
| import subprocess |  | ||||||
| from multiprocessing.pool import Pool |  | ||||||
|  |  | ||||||
| import pyocr |  | ||||||
| import langdetect |  | ||||||
| from PIL import Image |  | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
| from paperless.db import GnuPG | from paperless.db import GnuPG | ||||||
| from pyocr.tesseract import TesseractError |  | ||||||
| from pyocr.libtesseract.tesseract_raw import \ |  | ||||||
|     TesseractError as OtherTesseractError |  | ||||||
|  |  | ||||||
| from .models import Tag, Document, FileInfo | from .models import Document, FileInfo, Tag | ||||||
|  | from .parsers import ParseError | ||||||
| from .signals import ( | from .signals import ( | ||||||
|     document_consumption_started, |     document_consumer_declaration, | ||||||
|     document_consumption_finished |     document_consumption_finished, | ||||||
|  |     document_consumption_started | ||||||
| ) | ) | ||||||
| from .languages import ISO639 |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class OCRError(Exception): |  | ||||||
|     pass |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class ConsumerError(Exception): | class ConsumerError(Exception): | ||||||
| @@ -47,13 +33,7 @@ class Consumer(object): | |||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     SCRATCH = settings.SCRATCH_DIR |     SCRATCH = settings.SCRATCH_DIR | ||||||
|     CONVERT = settings.CONVERT_BINARY |  | ||||||
|     UNPAPER = settings.UNPAPER_BINARY |  | ||||||
|     CONSUME = settings.CONSUMPTION_DIR |     CONSUME = settings.CONSUMPTION_DIR | ||||||
|     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None |  | ||||||
|     DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 |  | ||||||
|  |  | ||||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE |  | ||||||
|  |  | ||||||
|     def __init__(self): |     def __init__(self): | ||||||
|  |  | ||||||
| @@ -78,6 +58,16 @@ class Consumer(object): | |||||||
|             raise ConsumerError( |             raise ConsumerError( | ||||||
|                 "Consumption directory {} does not exist".format(self.CONSUME)) |                 "Consumption directory {} does not exist".format(self.CONSUME)) | ||||||
|  |  | ||||||
|  |         self.parsers = [] | ||||||
|  |         for response in document_consumer_declaration.send(self): | ||||||
|  |             self.parsers.append(response[1]) | ||||||
|  |  | ||||||
|  |         if not self.parsers: | ||||||
|  |             raise ConsumerError( | ||||||
|  |                 "No parsers could be found, not even the default.  " | ||||||
|  |                 "This is a problem." | ||||||
|  |             ) | ||||||
|  |  | ||||||
|     def log(self, level, message): |     def log(self, level, message): | ||||||
|         getattr(self.logger, level)(message, extra={ |         getattr(self.logger, level)(message, extra={ | ||||||
|             "group": self.logging_group |             "group": self.logging_group | ||||||
| @@ -109,6 +99,13 @@ class Consumer(object): | |||||||
|                 self._ignore.append(doc) |                 self._ignore.append(doc) | ||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|  |             parser_class = self._get_parser_class(doc) | ||||||
|  |             if not parser_class: | ||||||
|  |                 self.log( | ||||||
|  |                     "info", "No parsers could be found for {}".format(doc)) | ||||||
|  |                 self._ignore.append(doc) | ||||||
|  |                 continue | ||||||
|  |  | ||||||
|             self.logging_group = uuid.uuid4() |             self.logging_group = uuid.uuid4() | ||||||
|  |  | ||||||
|             self.log("info", "Consuming {}".format(doc)) |             self.log("info", "Consuming {}".format(doc)) | ||||||
| @@ -119,25 +116,26 @@ class Consumer(object): | |||||||
|                 logging_group=self.logging_group |                 logging_group=self.logging_group | ||||||
|             ) |             ) | ||||||
|  |  | ||||||
|             tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) |             parsed_document = parser_class(doc) | ||||||
|             imgs = self._get_greyscale(tempdir, doc) |             thumbnail = parsed_document.get_thumbnail() | ||||||
|             thumbnail = self._get_thumbnail(tempdir, doc) |  | ||||||
|  |  | ||||||
|             try: |             try: | ||||||
|  |                 document = self._store( | ||||||
|                 document = self._store(self._get_ocr(imgs), doc, thumbnail) |                     parsed_document.get_text(), | ||||||
|  |                     doc, | ||||||
|             except OCRError as e: |                     thumbnail | ||||||
|  |                 ) | ||||||
|  |             except ParseError as e: | ||||||
|  |  | ||||||
|                 self._ignore.append(doc) |                 self._ignore.append(doc) | ||||||
|                 self.log("error", "OCR FAILURE for {}: {}".format(doc, e)) |                 self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) | ||||||
|                 self._cleanup_tempdir(tempdir) |                 parsed_document.cleanup() | ||||||
|  |  | ||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|             else: |             else: | ||||||
|  |  | ||||||
|                 self._cleanup_tempdir(tempdir) |                 parsed_document.cleanup() | ||||||
|                 self._cleanup_doc(doc) |                 self._cleanup_doc(doc) | ||||||
|  |  | ||||||
|                 self.log( |                 self.log( | ||||||
| @@ -151,142 +149,20 @@ class Consumer(object): | |||||||
|                     logging_group=self.logging_group |                     logging_group=self.logging_group | ||||||
|                 ) |                 ) | ||||||
|  |  | ||||||
|     def _get_greyscale(self, tempdir, doc): |     def _get_parser_class(self, doc): | ||||||
|         """ |         """ | ||||||
|         Greyscale images are easier for Tesseract to OCR |         Determine the appropriate parser class based on the file | ||||||
|         """ |         """ | ||||||
|  |  | ||||||
|         self.log("info", "Generating greyscale image from {}".format(doc)) |         options = [] | ||||||
|  |         for parser in self.parsers: | ||||||
|  |             result = parser(doc) | ||||||
|  |             if result: | ||||||
|  |                 options.append(result) | ||||||
|  |  | ||||||
|         # Convert PDF to multiple PNMs |         # Return the parser with the highest weight. | ||||||
|         pnm = os.path.join(tempdir, "convert-%04d.pnm") |         return sorted( | ||||||
|         run_convert( |             options, key=lambda _: _["weight"], reverse=True)[0]["parser"] | ||||||
|             self.CONVERT, |  | ||||||
|             "-density", str(self.DENSITY), |  | ||||||
|             "-depth", "8", |  | ||||||
|             "-type", "grayscale", |  | ||||||
|             doc, pnm, |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         # Get a list of converted images |  | ||||||
|         pnms = [] |  | ||||||
|         for f in os.listdir(tempdir): |  | ||||||
|             if f.endswith(".pnm"): |  | ||||||
|                 pnms.append(os.path.join(tempdir, f)) |  | ||||||
|  |  | ||||||
|         # Run unpaper in parallel on converted images |  | ||||||
|         with Pool(processes=self.THREADS) as pool: |  | ||||||
|             pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms)) |  | ||||||
|  |  | ||||||
|         # Return list of converted images, processed with unpaper |  | ||||||
|         pnms = [] |  | ||||||
|         for f in os.listdir(tempdir): |  | ||||||
|             if f.endswith(".unpaper.pnm"): |  | ||||||
|                 pnms.append(os.path.join(tempdir, f)) |  | ||||||
|  |  | ||||||
|         return sorted(filter(lambda __: os.path.isfile(__), pnms)) |  | ||||||
|  |  | ||||||
|     def _get_thumbnail(self, tempdir, doc): |  | ||||||
|         """ |  | ||||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         self.log("info", "Generating the thumbnail") |  | ||||||
|  |  | ||||||
|         run_convert( |  | ||||||
|             self.CONVERT, |  | ||||||
|             "-scale", "500x5000", |  | ||||||
|             "-alpha", "remove", |  | ||||||
|             doc, os.path.join(tempdir, "convert-%04d.png") |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|         return os.path.join(tempdir, "convert-0000.png") |  | ||||||
|  |  | ||||||
|     def _guess_language(self, text): |  | ||||||
|         try: |  | ||||||
|             guess = langdetect.detect(text) |  | ||||||
|             self.log("debug", "Language detected: {}".format(guess)) |  | ||||||
|             return guess |  | ||||||
|         except Exception as e: |  | ||||||
|             self.log("warning", "Language detection error: {}".format(e)) |  | ||||||
|  |  | ||||||
|     def _get_ocr(self, imgs): |  | ||||||
|         """ |  | ||||||
|         Attempts to do the best job possible OCR'ing the document based on |  | ||||||
|         simple language detection trial & error. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         if not imgs: |  | ||||||
|             raise OCRError("No images found") |  | ||||||
|  |  | ||||||
|         self.log("info", "OCRing the document") |  | ||||||
|  |  | ||||||
|         # Since the division gets rounded down by int, this calculation works |  | ||||||
|         # for every edge-case, i.e. 1 |  | ||||||
|         middle = int(len(imgs) / 2) |  | ||||||
|         raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE) |  | ||||||
|  |  | ||||||
|         guessed_language = self._guess_language(raw_text) |  | ||||||
|  |  | ||||||
|         if not guessed_language or guessed_language not in ISO639: |  | ||||||
|             self.log("warning", "Language detection failed!") |  | ||||||
|             if settings.FORGIVING_OCR: |  | ||||||
|                 self.log( |  | ||||||
|                     "warning", |  | ||||||
|                     "As FORGIVING_OCR is enabled, we're going to make the " |  | ||||||
|                     "best with what we have." |  | ||||||
|                 ) |  | ||||||
|                 raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) |  | ||||||
|                 return raw_text |  | ||||||
|             raise OCRError("Language detection failed") |  | ||||||
|  |  | ||||||
|         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: |  | ||||||
|             raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) |  | ||||||
|             return raw_text |  | ||||||
|  |  | ||||||
|         try: |  | ||||||
|             return self._ocr(imgs, ISO639[guessed_language]) |  | ||||||
|         except pyocr.pyocr.tesseract.TesseractError: |  | ||||||
|             if settings.FORGIVING_OCR: |  | ||||||
|                 self.log( |  | ||||||
|                     "warning", |  | ||||||
|                     "OCR for {} failed, but we're going to stick with what " |  | ||||||
|                     "we've got since FORGIVING_OCR is enabled.".format( |  | ||||||
|                         guessed_language |  | ||||||
|                     ) |  | ||||||
|                 ) |  | ||||||
|                 raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) |  | ||||||
|                 return raw_text |  | ||||||
|             raise OCRError( |  | ||||||
|                 "The guessed language is not available in this instance of " |  | ||||||
|                 "Tesseract." |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|     def _assemble_ocr_sections(self, imgs, middle, text): |  | ||||||
|         """ |  | ||||||
|         Given a `middle` value and the text that middle page represents, we OCR |  | ||||||
|         the remainder of the document and return the whole thing. |  | ||||||
|         """ |  | ||||||
|         text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text |  | ||||||
|         text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) |  | ||||||
|         return text |  | ||||||
|  |  | ||||||
|     def _ocr(self, imgs, lang): |  | ||||||
|         """ |  | ||||||
|         Performs a single OCR attempt. |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         if not imgs: |  | ||||||
|             return "" |  | ||||||
|  |  | ||||||
|         self.log("info", "Parsing for {}".format(lang)) |  | ||||||
|  |  | ||||||
|         with Pool(processes=self.THREADS) as pool: |  | ||||||
|             r = pool.map(image_to_string, itertools.product(imgs, [lang])) |  | ||||||
|             r = " ".join(r) |  | ||||||
|  |  | ||||||
|         # Strip out excess white space to allow matching to go smoother |  | ||||||
|         return strip_excess_whitespace(r) |  | ||||||
|  |  | ||||||
|     def _store(self, text, doc, thumbnail): |     def _store(self, text, doc, thumbnail): | ||||||
|  |  | ||||||
| @@ -332,10 +208,6 @@ class Consumer(object): | |||||||
|  |  | ||||||
|         return document |         return document | ||||||
|  |  | ||||||
|     def _cleanup_tempdir(self, d): |  | ||||||
|         self.log("debug", "Deleting directory {}".format(d)) |  | ||||||
|         shutil.rmtree(d) |  | ||||||
|  |  | ||||||
|     def _cleanup_doc(self, doc): |     def _cleanup_doc(self, doc): | ||||||
|         self.log("debug", "Deleting document {}".format(doc)) |         self.log("debug", "Deleting document {}".format(doc)) | ||||||
|         os.unlink(doc) |         os.unlink(doc) | ||||||
| @@ -361,41 +233,3 @@ class Consumer(object): | |||||||
|         with open(doc, "rb") as f: |         with open(doc, "rb") as f: | ||||||
|             checksum = hashlib.md5(f.read()).hexdigest() |             checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|         return Document.objects.filter(checksum=checksum).exists() |         return Document.objects.filter(checksum=checksum).exists() | ||||||
|  |  | ||||||
|  |  | ||||||
| def strip_excess_whitespace(text): |  | ||||||
|     collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) |  | ||||||
|     no_leading_whitespace = re.sub( |  | ||||||
|         "([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) |  | ||||||
|     no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace) |  | ||||||
|     return no_trailing_whitespace |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def image_to_string(args): |  | ||||||
|     img, lang = args |  | ||||||
|     ocr = pyocr.get_available_tools()[0] |  | ||||||
|     with Image.open(os.path.join(Consumer.SCRATCH, img)) as f: |  | ||||||
|         if ocr.can_detect_orientation(): |  | ||||||
|             try: |  | ||||||
|                 orientation = ocr.detect_orientation(f, lang=lang) |  | ||||||
|                 f = f.rotate(orientation["angle"], expand=1) |  | ||||||
|             except (TesseractError, OtherTesseractError): |  | ||||||
|                 pass |  | ||||||
|         return ocr.image_to_string(f, lang=lang) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def run_unpaper(args): |  | ||||||
|     unpaper, pnm = args |  | ||||||
|     subprocess.Popen( |  | ||||||
|         (unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait() |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def run_convert(*args): |  | ||||||
|  |  | ||||||
|     environment = os.environ.copy() |  | ||||||
|     if settings.CONVERT_MEMORY_LIMIT: |  | ||||||
|         environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT |  | ||||||
|     if settings.CONVERT_TMPDIR: |  | ||||||
|         environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR |  | ||||||
|  |  | ||||||
|     subprocess.Popen(args, env=environment).wait() |  | ||||||
|   | |||||||
							
								
								
									
										45
									
								
								src/documents/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								src/documents/parsers.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,45 @@ | |||||||
|  | import logging | ||||||
|  | import shutil | ||||||
|  | import tempfile | ||||||
|  |  | ||||||
|  | from django.conf import settings | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class ParseError(Exception): | ||||||
|  |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class DocumentParser(object): | ||||||
|  |     """ | ||||||
|  |     Subclass this to make your own parser.  Have a look at | ||||||
|  |     `paperless_tesseract.parsers` for inspiration. | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     SCRATCH = settings.SCRATCH_DIR | ||||||
|  |  | ||||||
|  |     def __init__(self, path): | ||||||
|  |         self.document_path = path | ||||||
|  |         self.tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) | ||||||
|  |         self.logger = logging.getLogger(__name__) | ||||||
|  |         self.logging_group = None | ||||||
|  |  | ||||||
|  |     def get_thumbnail(self): | ||||||
|  |         """ | ||||||
|  |         Returns the path to a file we can use as a thumbnail for this document. | ||||||
|  |         """ | ||||||
|  |         raise NotImplementedError() | ||||||
|  |  | ||||||
|  |     def get_text(self): | ||||||
|  |         """ | ||||||
|  |         Returns the text from the document and only the text. | ||||||
|  |         """ | ||||||
|  |         raise NotImplementedError() | ||||||
|  |  | ||||||
|  |     def log(self, level, message): | ||||||
|  |         getattr(self.logger, level)(message, extra={ | ||||||
|  |             "group": self.logging_group | ||||||
|  |         }) | ||||||
|  |  | ||||||
|  |     def cleanup(self): | ||||||
|  |         self.log("debug", "Deleting directory {}".format(self.tempdir)) | ||||||
|  |         shutil.rmtree(self.tempdir) | ||||||
| @@ -2,3 +2,4 @@ from django.dispatch import Signal | |||||||
|  |  | ||||||
| document_consumption_started = Signal(providing_args=["filename"]) | document_consumption_started = Signal(providing_args=["filename"]) | ||||||
| document_consumption_finished = Signal(providing_args=["document"]) | document_consumption_finished = Signal(providing_args=["document"]) | ||||||
|  | document_consumer_declaration = Signal(providing_args=[]) | ||||||
|   | |||||||
| @@ -1,6 +1,5 @@ | |||||||
| import logging | import logging | ||||||
| import os | import os | ||||||
|  |  | ||||||
| from subprocess import Popen | from subprocess import Popen | ||||||
|  |  | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
|   | |||||||
| @@ -1,13 +1,6 @@ | |||||||
| import os |  | ||||||
| from unittest import mock, skipIf |  | ||||||
|  |  | ||||||
| import pyocr |  | ||||||
| from django.test import TestCase | from django.test import TestCase | ||||||
| from pyocr.libtesseract.tesseract_raw import \ |  | ||||||
|     TesseractError as OtherTesseractError |  | ||||||
|  |  | ||||||
| from ..models import FileInfo | from ..models import FileInfo | ||||||
| from ..consumer import image_to_string, strip_excess_whitespace |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestAttributes(TestCase): | class TestAttributes(TestCase): | ||||||
| @@ -310,69 +303,3 @@ class TestFieldPermutations(TestCase): | |||||||
|                             template.format(**spec), **spec) |                             template.format(**spec), **spec) | ||||||
|  |  | ||||||
|  |  | ||||||
| class FakeTesseract(object): |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def can_detect_orientation(): |  | ||||||
|         return True |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def detect_orientation(file_handle, lang): |  | ||||||
|         raise OtherTesseractError("arbitrary status", "message") |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def image_to_string(file_handle, lang): |  | ||||||
|         return "This is test text" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class FakePyOcr(object): |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def get_available_tools(): |  | ||||||
|         return [FakeTesseract] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestOCR(TestCase): |  | ||||||
|  |  | ||||||
|     text_cases = [ |  | ||||||
|         ("simple     string", "simple string"), |  | ||||||
|         ( |  | ||||||
|             "simple    newline\n   testing string", |  | ||||||
|             "simple newline\ntesting string" |  | ||||||
|         ), |  | ||||||
|         ( |  | ||||||
|             "utf-8   строка с пробелами в конце  ", |  | ||||||
|             "utf-8 строка с пробелами в конце" |  | ||||||
|         ) |  | ||||||
|     ] |  | ||||||
|  |  | ||||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") |  | ||||||
|     TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) |  | ||||||
|  |  | ||||||
|     def test_strip_excess_whitespace(self): |  | ||||||
|         for source, result in self.text_cases: |  | ||||||
|             actual_result = strip_excess_whitespace(source) |  | ||||||
|             self.assertEqual( |  | ||||||
|                 result, |  | ||||||
|                 actual_result, |  | ||||||
|                 "strip_exceess_whitespace({}) != '{}', but '{}'".format( |  | ||||||
|                     source, |  | ||||||
|                     result, |  | ||||||
|                     actual_result |  | ||||||
|                 ) |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|     @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") |  | ||||||
|     @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES) |  | ||||||
|     @mock.patch("documents.consumer.pyocr", FakePyOcr) |  | ||||||
|     def test_image_to_string_with_text_free_page(self): |  | ||||||
|         """ |  | ||||||
|         This test is sort of silly, since it's really just reproducing an odd |  | ||||||
|         exception thrown by pyocr when it encounters a page with no text. |  | ||||||
|         Actually running this test against an installation of Tesseract results |  | ||||||
|         in a segmentation fault rooted somewhere deep inside pyocr where I |  | ||||||
|         don't care to dig.  Regardless, if you run the consumer normally, |  | ||||||
|         text-free pages are now handled correctly so long as we work around |  | ||||||
|         this weird exception. |  | ||||||
|         """ |  | ||||||
|         image_to_string(["no-text.png", "en"]) |  | ||||||
|   | |||||||
| @@ -61,6 +61,7 @@ INSTALLED_APPS = [ | |||||||
|     "django_extensions", |     "django_extensions", | ||||||
|  |  | ||||||
|     "documents.apps.DocumentsConfig", |     "documents.apps.DocumentsConfig", | ||||||
|  |     "paperless_tesseract.apps.PaperlessTesseractConfig", | ||||||
|  |  | ||||||
|     "flat_responsive", |     "flat_responsive", | ||||||
|     "django.contrib.admin", |     "django.contrib.admin", | ||||||
| @@ -70,6 +71,9 @@ INSTALLED_APPS = [ | |||||||
|  |  | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | if os.getenv("PAPERLESS_INSTALLED_APPS"): | ||||||
|  |     INSTALLED_APPS += os.getenv("PAPERLESS_INSTALLED_APPS").split(",") | ||||||
|  |  | ||||||
| MIDDLEWARE_CLASSES = [ | MIDDLEWARE_CLASSES = [ | ||||||
|     'django.middleware.security.SecurityMiddleware', |     'django.middleware.security.SecurityMiddleware', | ||||||
|     'django.contrib.sessions.middleware.SessionMiddleware', |     'django.contrib.sessions.middleware.SessionMiddleware', | ||||||
|   | |||||||
							
								
								
									
										0
									
								
								src/paperless_tesseract/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless_tesseract/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										16
									
								
								src/paperless_tesseract/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								src/paperless_tesseract/apps.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | |||||||
|  | from django.apps import AppConfig | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class PaperlessTesseractConfig(AppConfig): | ||||||
|  |  | ||||||
|  |     name = "paperless_tesseract" | ||||||
|  |  | ||||||
|  |     def ready(self): | ||||||
|  |  | ||||||
|  |         from documents.signals import document_consumer_declaration | ||||||
|  |  | ||||||
|  |         from .signals import ConsumerDeclaration | ||||||
|  |  | ||||||
|  |         document_consumer_declaration.connect(ConsumerDeclaration.handle) | ||||||
|  |  | ||||||
|  |         AppConfig.ready(self) | ||||||
							
								
								
									
										214
									
								
								src/paperless_tesseract/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										214
									
								
								src/paperless_tesseract/parsers.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,214 @@ | |||||||
|  | import itertools | ||||||
|  | import os | ||||||
|  | import re | ||||||
|  | import subprocess | ||||||
|  | from multiprocessing.pool import Pool | ||||||
|  |  | ||||||
|  | import langdetect | ||||||
|  | import pyocr | ||||||
|  | from django.conf import settings | ||||||
|  | from documents.parsers import DocumentParser, ParseError | ||||||
|  | from PIL import Image | ||||||
|  | from pyocr.libtesseract.tesseract_raw import \ | ||||||
|  |     TesseractError as OtherTesseractError | ||||||
|  | from pyocr.tesseract import TesseractError | ||||||
|  |  | ||||||
|  | from .languages import ISO639 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class OCRError(Exception): | ||||||
|  |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class RasterisedDocumentParser(DocumentParser): | ||||||
|  |     """ | ||||||
|  |     This parser uses Tesseract to try and get some text out of a rasterised | ||||||
|  |     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     CONVERT = settings.CONVERT_BINARY | ||||||
|  |     DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 | ||||||
|  |     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None | ||||||
|  |     UNPAPER = settings.UNPAPER_BINARY | ||||||
|  |     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||||
|  |  | ||||||
|  |     def get_thumbnail(self): | ||||||
|  |         """ | ||||||
|  |         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         run_convert( | ||||||
|  |             self.CONVERT, | ||||||
|  |             "-scale", "500x5000", | ||||||
|  |             "-alpha", "remove", | ||||||
|  |             self.document_path, os.path.join(self.tempdir, "convert-%04d.png") | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         return os.path.join(self.tempdir, "convert-0000.png") | ||||||
|  |  | ||||||
|  |     def get_text(self): | ||||||
|  |  | ||||||
|  |         images = self._get_greyscale() | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |  | ||||||
|  |             return self._get_ocr(images) | ||||||
|  |         except OCRError as e: | ||||||
|  |             raise ParseError(e) | ||||||
|  |  | ||||||
|  |     def _get_greyscale(self): | ||||||
|  |         """ | ||||||
|  |         Greyscale images are easier for Tesseract to OCR | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         # Convert PDF to multiple PNMs | ||||||
|  |         pnm = os.path.join(self.tempdir, "convert-%04d.pnm") | ||||||
|  |         run_convert( | ||||||
|  |             self.CONVERT, | ||||||
|  |             "-density", str(self.DENSITY), | ||||||
|  |             "-depth", "8", | ||||||
|  |             "-type", "grayscale", | ||||||
|  |             self.document_path, pnm, | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         # Get a list of converted images | ||||||
|  |         pnms = [] | ||||||
|  |         for f in os.listdir(self.tempdir): | ||||||
|  |             if f.endswith(".pnm"): | ||||||
|  |                 pnms.append(os.path.join(self.tempdir, f)) | ||||||
|  |  | ||||||
|  |         # Run unpaper in parallel on converted images | ||||||
|  |         with Pool(processes=self.THREADS) as pool: | ||||||
|  |             pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms)) | ||||||
|  |  | ||||||
|  |         # Return list of converted images, processed with unpaper | ||||||
|  |         pnms = [] | ||||||
|  |         for f in os.listdir(self.tempdir): | ||||||
|  |             if f.endswith(".unpaper.pnm"): | ||||||
|  |                 pnms.append(os.path.join(self.tempdir, f)) | ||||||
|  |  | ||||||
|  |         return sorted(filter(lambda __: os.path.isfile(__), pnms)) | ||||||
|  |  | ||||||
|  |     def _guess_language(self, text): | ||||||
|  |         try: | ||||||
|  |             guess = langdetect.detect(text) | ||||||
|  |             self.log("debug", "Language detected: {}".format(guess)) | ||||||
|  |             return guess | ||||||
|  |         except Exception as e: | ||||||
|  |             self.log("warning", "Language detection error: {}".format(e)) | ||||||
|  |  | ||||||
|  |     def _get_ocr(self, imgs): | ||||||
|  |         """ | ||||||
|  |         Attempts to do the best job possible OCR'ing the document based on | ||||||
|  |         simple language detection trial & error. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if not imgs: | ||||||
|  |             raise OCRError("No images found") | ||||||
|  |  | ||||||
|  |         self.log("info", "OCRing the document") | ||||||
|  |  | ||||||
|  |         # Since the division gets rounded down by int, this calculation works | ||||||
|  |         # for every edge-case, i.e. 1 | ||||||
|  |         middle = int(len(imgs) / 2) | ||||||
|  |         raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE) | ||||||
|  |  | ||||||
|  |         guessed_language = self._guess_language(raw_text) | ||||||
|  |  | ||||||
|  |         if not guessed_language or guessed_language not in ISO639: | ||||||
|  |             self.log("warning", "Language detection failed!") | ||||||
|  |             if settings.FORGIVING_OCR: | ||||||
|  |                 self.log( | ||||||
|  |                     "warning", | ||||||
|  |                     "As FORGIVING_OCR is enabled, we're going to make the " | ||||||
|  |                     "best with what we have." | ||||||
|  |                 ) | ||||||
|  |                 raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||||
|  |                 return raw_text | ||||||
|  |             raise OCRError("Language detection failed") | ||||||
|  |  | ||||||
|  |         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: | ||||||
|  |             raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||||
|  |             return raw_text | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             return self._ocr(imgs, ISO639[guessed_language]) | ||||||
|  |         except pyocr.pyocr.tesseract.TesseractError: | ||||||
|  |             if settings.FORGIVING_OCR: | ||||||
|  |                 self.log( | ||||||
|  |                     "warning", | ||||||
|  |                     "OCR for {} failed, but we're going to stick with what " | ||||||
|  |                     "we've got since FORGIVING_OCR is enabled.".format( | ||||||
|  |                         guessed_language | ||||||
|  |                     ) | ||||||
|  |                 ) | ||||||
|  |                 raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||||
|  |                 return raw_text | ||||||
|  |             raise OCRError( | ||||||
|  |                 "The guessed language is not available in this instance of " | ||||||
|  |                 "Tesseract." | ||||||
|  |             ) | ||||||
|  |  | ||||||
|  |     def _ocr(self, imgs, lang): | ||||||
|  |         """ | ||||||
|  |         Performs a single OCR attempt. | ||||||
|  |         """ | ||||||
|  |  | ||||||
|  |         if not imgs: | ||||||
|  |             return "" | ||||||
|  |  | ||||||
|  |         self.log("info", "Parsing for {}".format(lang)) | ||||||
|  |  | ||||||
|  |         with Pool(processes=self.THREADS) as pool: | ||||||
|  |             r = pool.map(image_to_string, itertools.product(imgs, [lang])) | ||||||
|  |             r = " ".join(r) | ||||||
|  |  | ||||||
|  |         # Strip out excess white space to allow matching to go smoother | ||||||
|  |         return strip_excess_whitespace(r) | ||||||
|  |  | ||||||
|  |     def _assemble_ocr_sections(self, imgs, middle, text): | ||||||
|  |         """ | ||||||
|  |         Given a `middle` value and the text that middle page represents, we OCR | ||||||
|  |         the remainder of the document and return the whole thing. | ||||||
|  |         """ | ||||||
|  |         text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text | ||||||
|  |         text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) | ||||||
|  |         return text | ||||||
|  |  | ||||||
|  |  | ||||||
def run_convert(*args):
    """
    Run the command given by `args` and wait for it to finish.

    The child process gets a copy of the current environment.
    MAGICK_MEMORY_LIMIT and MAGICK_TMPDIR are added to it from
    settings.CONVERT_MEMORY_LIMIT and settings.CONVERT_TMPDIR whenever those
    settings are truthy.  The command's exit status is not inspected.
    """
    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )

    environment = os.environ.copy()
    for variable, value in overrides:
        if value:
            environment[variable] = value

    process = subprocess.Popen(args, env=environment)
    process.wait()
|  |  | ||||||
|  |  | ||||||
def run_unpaper(args):
    """
    Run unpaper on a single image and wait for it to finish.

    :param args: an `(unpaper, pnm)` pair holding the unpaper executable and
        the path of the image to process.  Both values arrive packed in one
        argument so the function can be handed to `Pool.map`.

    The result is written next to the input, with ".pnm" in the file name
    replaced by ".unpaper.pnm".  The exit status is not inspected.
    """
    unpaper, pnm = args

    # Only rewrite the file name: a blanket str.replace() on the full path
    # would also mangle any directory whose name happens to contain ".pnm",
    # pointing the output at a directory that doesn't exist.
    directory, filename = os.path.split(pnm)
    cleaned = os.path.join(
        directory, filename.replace(".pnm", ".unpaper.pnm"))

    subprocess.Popen((unpaper, pnm, cleaned)).wait()
|  |  | ||||||
|  |  | ||||||
def strip_excess_whitespace(text):
    """
    Normalise the whitespace in `text` while keeping its line breaks.

    Three passes are applied in order:

    1. every run of whitespace other than line breaks becomes a single space
    2. whitespace directly following a run of line breaks is dropped
    3. whitespace at the very end of the text is dropped

    The patterns are raw strings so that `\\S` reaches the regex engine
    untouched instead of relying on Python passing through an invalid string
    escape.
    """
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)([^\S\n\r]+)", r"\1", collapsed_spaces)
    no_trailing_whitespace = re.sub(
        r"([^\S\n\r]+)$", "", no_leading_whitespace)
    return no_trailing_whitespace
|  |  | ||||||
|  |  | ||||||
def image_to_string(args):
    """
    OCR a single image and return the recognised text.

    :param args: an `(img, lang)` pair, packed into one argument so the
        function can be handed to `Pool.map`.  `img` is joined onto
        `RasterisedDocumentParser.SCRATCH` to locate the file; `lang` is
        passed on to the OCR tool.

    The first tool reported by `pyocr.get_available_tools()` does the work.
    If it can detect orientation, the image is rotated by the detected angle
    first; Tesseract errors raised during that step are ignored and the image
    is OCR'd as it is.
    """
    img, lang = args
    path = os.path.join(RasterisedDocumentParser.SCRATCH, img)
    ocr = pyocr.get_available_tools()[0]

    with Image.open(path) as page:
        if not ocr.can_detect_orientation():
            return ocr.image_to_string(page, lang=lang)

        try:
            angle = ocr.detect_orientation(page, lang=lang)["angle"]
            page = page.rotate(angle, expand=1)
        except (TesseractError, OtherTesseractError):
            # Orientation detection is best-effort only
            pass

        return ocr.image_to_string(page, lang=lang)
							
								
								
									
										23
									
								
								src/paperless_tesseract/signals.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								src/paperless_tesseract/signals.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | |||||||
|  | import re | ||||||
|  |  | ||||||
|  | from .parsers import RasterisedDocumentParser | ||||||
|  |  | ||||||
|  |  | ||||||
class ConsumerDeclaration(object):
    """
    Declares which files `RasterisedDocumentParser` is able to handle.

    `handle` is the signal handler; it returns the `test` callable, which in
    turn tells the caller whether this app's parser applies to a given
    document.
    """

    # Raw string so that `\.` is a regex escape rather than an invalid string
    # escape.  Matching ignores case (scanners commonly emit "SCAN.PDF"), and
    # the alternative spellings ".jpeg" and ".tif" are accepted as well.
    MATCHING_FILES = re.compile(
        r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$", flags=re.IGNORECASE)

    @classmethod
    def handle(cls, sender, **kwargs):
        """
        Signal handler: return the callable used to test a document.
        """
        return cls.test

    @classmethod
    def test(cls, doc):
        """
        Return a dict with the keys "parser" and "weight" if the name `doc`
        matches MATCHING_FILES, or None if it doesn't.
        """

        if cls.MATCHING_FILES.match(doc):
            return {
                "parser": RasterisedDocumentParser,
                "weight": 0
            }

        return None
							
								
								
									
										0
									
								
								src/paperless_tesseract/tests/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless_tesseract/tests/__init__.py
									
									
									
									
									
										Normal file
									
								
							| Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB | 
							
								
								
									
										80
									
								
								src/paperless_tesseract/tests/test_ocr.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								src/paperless_tesseract/tests/test_ocr.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,80 @@ | |||||||
|  | import os | ||||||
|  | from unittest import mock, skipIf | ||||||
|  |  | ||||||
|  | import pyocr | ||||||
|  | from django.test import TestCase | ||||||
|  | from pyocr.libtesseract.tesseract_raw import \ | ||||||
|  |     TesseractError as OtherTesseractError | ||||||
|  |  | ||||||
|  | from ..parsers import image_to_string, strip_excess_whitespace | ||||||
|  |  | ||||||
|  |  | ||||||
class FakeTesseract:
    """
    Stand-in for a pyocr tool whose orientation detection always fails.
    """

    @staticmethod
    def can_detect_orientation():
        # Claim support so that callers go on to try detect_orientation()
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        # Always fail with the libtesseract flavour of TesseractError
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        # Fixed text, regardless of the image or language given
        return "This is test text"
|  |  | ||||||
|  |  | ||||||
class FakePyOcr:
    """
    Stand-in for the `pyocr` module that only ever offers FakeTesseract.
    """

    @staticmethod
    def get_available_tools():
        tools = [FakeTesseract]
        return tools
|  |  | ||||||
|  |  | ||||||
class TestOCR(TestCase):
    """
    Tests for the module-level helpers of `paperless_tesseract.parsers`.
    """

    # (input, expected output) pairs for strip_excess_whitespace()
    text_cases = [
        ("simple     string", "simple string"),
        (
            "simple    newline\n   testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8   строка с пробелами в конце  ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    # Directory holding the sample images used by the tests below
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    # True when pyocr reports at least one available OCR tool
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract results
        in a segmentation fault rooted somewhere deep inside pyocr where I
        don't care to dig.  Regardless, if you run the consumer normally,
        text-free pages are now handled correctly so long as we work around
        this weird exception.
        """
        # The orientation error raised by FakeTesseract must be swallowed and
        # the text returned by FakeTesseract.image_to_string() passed through.
        self.assertEqual(
            image_to_string(["no-text.png", "en"]),
            "This is test text"
        )
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn