Mirror of https://github.com/paperless-ngx/paperless-ngx.git, synced 2025-07-30 18:27:45 -05:00
feat: refactor for pluggable consumers
I've broken out the OCR-specific code from the consumers and moved it all into its own app, `paperless_tesseract`. This new app should serve as a sample of how to create one's own consumer for different file types. Documentation for how to do this isn't ready yet, but for the impatient:

* Create a new app:
  * containing a `parsers.py` for your parser, modelled after `paperless_tesseract.parsers.RasterisedDocumentParser`;
  * containing a `signals.py` with a handler modelled after `paperless_tesseract.signals.ConsumerDeclaration`;
  * connecting the signal handler to `documents.signals.document_consumer_declaration` in `your_app.apps`.
* Install the app into Paperless by declaring `PAPERLESS_INSTALLED_APPS=your_app`. Separate additional apps with commas.
* Restart the consumer.
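Putting those steps together, a minimal sketch of such an app might look like the following. The app name `your_app`, the `TextDocumentParser` class, and the `.txt` pattern are hypothetical placeholders invented for illustration; the signal wiring mirrors the `paperless_tesseract` code in this commit:

# your_app/parsers.py (hypothetical)
from documents.parsers import DocumentParser


class TextDocumentParser(DocumentParser):

    def get_thumbnail(self):
        # Produce a preview image for the document and return its path;
        # see RasterisedDocumentParser.get_thumbnail for a real example.
        raise NotImplementedError()

    def get_text(self):
        # document_path is assumed to be set by the DocumentParser base class,
        # as it is for RasterisedDocumentParser below.
        with open(self.document_path, encoding="utf-8") as f:
            return f.read()


# your_app/signals.py (hypothetical)
import re

from .parsers import TextDocumentParser


class ConsumerDeclaration(object):

    MATCHING_FILES = re.compile(r"^.*\.txt$")

    @classmethod
    def handle(cls, sender, **kwargs):
        return cls.test

    @classmethod
    def test(cls, doc):
        if cls.MATCHING_FILES.match(doc):
            return {"parser": TextDocumentParser, "weight": 0}
        return None


# your_app/apps.py (hypothetical)
from django.apps import AppConfig


class YourAppConfig(AppConfig):

    name = "your_app"

    def ready(self):
        from documents.signals import document_consumer_declaration
        from .signals import ConsumerDeclaration
        document_consumer_declaration.connect(ConsumerDeclaration.handle)
        AppConfig.ready(self)

With that in place, running the consumer with `PAPERLESS_INSTALLED_APPS=your_app` should offer `.txt` files to `TextDocumentParser`.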
0  src/paperless_tesseract/__init__.py  Normal file
16  src/paperless_tesseract/apps.py  Normal file
@@ -0,0 +1,16 @@
from django.apps import AppConfig


class PaperlessTesseractConfig(AppConfig):

    name = "paperless_tesseract"

    def ready(self):

        from documents.signals import document_consumer_declaration

        from .signals import ConsumerDeclaration

        document_consumer_declaration.connect(ConsumerDeclaration.handle)

        AppConfig.ready(self)
194  src/paperless_tesseract/languages.py  Normal file
@@ -0,0 +1,194 @@
# Thanks to the Library of Congress and some creative use of sed and awk:
# http://www.loc.gov/standards/iso639-2/php/English_list.php

ISO639 = {

    "aa": "aar",
    "ab": "abk",
    "ae": "ave",
    "af": "afr",
    "ak": "aka",
    "am": "amh",
    "an": "arg",
    "ar": "ara",
    "as": "asm",
    "av": "ava",
    "ay": "aym",
    "az": "aze",
    "ba": "bak",
    "be": "bel",
    "bg": "bul",
    "bh": "bih",
    "bi": "bis",
    "bm": "bam",
    "bn": "ben",
    "bo": "bod",
    "br": "bre",
    "bs": "bos",
    "ca": "cat",
    "ce": "che",
    "ch": "cha",
    "co": "cos",
    "cr": "cre",
    "cs": "ces",
    "cu": "chu",
    "cv": "chv",
    "cy": "cym",
    "da": "dan",
    "de": "deu",
    "dv": "div",
    "dz": "dzo",
    "ee": "ewe",
    "el": "ell",
    "en": "eng",
    "eo": "epo",
    "es": "spa",
    "et": "est",
    "eu": "eus",
    "fa": "fas",
    "ff": "ful",
    "fi": "fin",
    "fj": "fij",
    "fo": "fao",
    "fr": "fra",
    "fy": "fry",
    "ga": "gle",
    "gd": "gla",
    "gl": "glg",
    "gn": "grn",
    "gu": "guj",
    "gv": "glv",
    "ha": "hau",
    "he": "heb",
    "hi": "hin",
    "ho": "hmo",
    "hr": "hrv",
    "ht": "hat",
    "hu": "hun",
    "hy": "hye",
    "hz": "her",
    "ia": "ina",
    "id": "ind",
    "ie": "ile",
    "ig": "ibo",
    "ii": "iii",
    "ik": "ipk",
    "io": "ido",
    "is": "isl",
    "it": "ita",
    "iu": "iku",
    "ja": "jpn",
    "jv": "jav",
    "ka": "kat",
    "kg": "kon",
    "ki": "kik",
    "kj": "kua",
    "kk": "kaz",
    "kl": "kal",
    "km": "khm",
    "kn": "kan",
    "ko": "kor",
    "kr": "kau",
    "ks": "kas",
    "ku": "kur",
    "kv": "kom",
    "kw": "cor",
    "ky": "kir",
    "la": "lat",
    "lb": "ltz",
    "lg": "lug",
    "li": "lim",
    "ln": "lin",
    "lo": "lao",
    "lt": "lit",
    "lu": "lub",
    "lv": "lav",
    "mg": "mlg",
    "mh": "mah",
    "mi": "mri",
    "mk": "mkd",
    "ml": "mal",
    "mn": "mon",
    "mr": "mar",
    "ms": "msa",
    "mt": "mlt",
    "my": "mya",
    "na": "nau",
    "nb": "nob",
    "nd": "nde",
    "ne": "nep",
    "ng": "ndo",
    "nl": "nld",
    "no": "nor",
    "nr": "nbl",
    "nv": "nav",
    "ny": "nya",
    "oc": "oci",
    "oj": "oji",
    "om": "orm",
    "or": "ori",
    "os": "oss",
    "pa": "pan",
    "pi": "pli",
    "pl": "pol",
    "ps": "pus",
    "pt": "por",
    "qu": "que",
    "rm": "roh",
    "rn": "run",
    "ro": "ron",
    "ru": "rus",
    "rw": "kin",
    "sa": "san",
    "sc": "srd",
    "sd": "snd",
    "se": "sme",
    "sg": "sag",
    "si": "sin",
    "sk": "slk",
    "sl": "slv",
    "sm": "smo",
    "sn": "sna",
    "so": "som",
    "sq": "sqi",
    "sr": "srp",
    "ss": "ssw",
    "st": "sot",
    "su": "sun",
    "sv": "swe",
    "sw": "swa",
    "ta": "tam",
    "te": "tel",
    "tg": "tgk",
    "th": "tha",
    "ti": "tir",
    "tk": "tuk",
    "tl": "tgl",
    "tn": "tsn",
    "to": "ton",
    "tr": "tur",
    "ts": "tso",
    "tt": "tat",
    "tw": "twi",
    "ty": "tah",
    "ug": "uig",
    "uk": "ukr",
    "ur": "urd",
    "uz": "uzb",
    "ve": "ven",
    "vi": "vie",
    "vo": "vol",
    "wa": "wln",
    "wo": "wol",
    "xh": "xho",
    "yi": "yid",
    "yo": "yor",
    "za": "zha",

    # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I
    # have no idea which one is better, so I just picked the bigger file.
    "zh": "chi_tra",

    "zu": "zul"

}
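The two-to-three-letter mapping exists because langdetect (used in parsers.py below) reports two-letter ISO 639-1 codes, while Tesseract names its language packs with three-letter codes. A quick illustration of the lookup:

# e.g. translating a langdetect guess into a Tesseract language name
from paperless_tesseract.languages import ISO639

assert ISO639["de"] == "deu"
assert ISO639["zh"] == "chi_tra"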
214  src/paperless_tesseract/parsers.py  Normal file
@@ -0,0 +1,214 @@
import itertools
import os
import re
import subprocess
from multiprocessing.pool import Pool

import langdetect
import pyocr
from django.conf import settings
from documents.parsers import DocumentParser, ParseError
from PIL import Image
from pyocr.libtesseract.tesseract_raw import \
    TesseractError as OtherTesseractError
from pyocr.tesseract import TesseractError

from .languages import ISO639


class OCRError(Exception):
    pass


class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

    CONVERT = settings.CONVERT_BINARY
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    def get_thumbnail(self):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.
        """

        run_convert(
            self.CONVERT,
            "-scale", "500x5000",
            "-alpha", "remove",
            self.document_path, os.path.join(self.tempdir, "convert-%04d.png")
        )

        return os.path.join(self.tempdir, "convert-0000.png")

    def get_text(self):

        images = self._get_greyscale()

        try:
            return self._get_ocr(images)
        except OCRError as e:
            raise ParseError(e)

    def _get_greyscale(self):
        """
        Greyscale images are easier for Tesseract to OCR
        """

        # Convert PDF to multiple PNMs
        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
        run_convert(
            self.CONVERT,
            "-density", str(self.DENSITY),
            "-depth", "8",
            "-type", "grayscale",
            self.document_path, pnm,
        )

        # Get a list of converted images
        pnms = []
        for f in os.listdir(self.tempdir):
            if f.endswith(".pnm"):
                pnms.append(os.path.join(self.tempdir, f))

        # Run unpaper in parallel on converted images
        with Pool(processes=self.THREADS) as pool:
            pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))

        # Return list of converted images, processed with unpaper
        pnms = []
        for f in os.listdir(self.tempdir):
            if f.endswith(".unpaper.pnm"):
                pnms.append(os.path.join(self.tempdir, f))

        return sorted(filter(lambda __: os.path.isfile(__), pnms))

    def _guess_language(self, text):
        try:
            guess = langdetect.detect(text)
            self.log("debug", "Language detected: {}".format(guess))
            return guess
        except Exception as e:
            self.log("warning", "Language detection error: {}".format(e))

    def _get_ocr(self, imgs):
        """
        Attempts to do the best job possible OCR'ing the document based on
        simple language detection trial & error.
        """

        if not imgs:
            raise OCRError("No images found")

        self.log("info", "OCRing the document")

        # Since the division gets rounded down by int, this calculation works
        # for every edge case, e.g. a single-page document (middle == 0)
        middle = int(len(imgs) / 2)
        raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)

        guessed_language = self._guess_language(raw_text)

        if not guessed_language or guessed_language not in ISO639:
            self.log("warning", "Language detection failed!")
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "As FORGIVING_OCR is enabled, we're going to make the "
                    "best with what we have."
                )
                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
                return raw_text
            raise OCRError("Language detection failed")

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
            return raw_text

        try:
            return self._ocr(imgs, ISO639[guessed_language])
        except pyocr.pyocr.tesseract.TesseractError:
            if settings.FORGIVING_OCR:
                self.log(
                    "warning",
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    )
                )
                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
                return raw_text
            raise OCRError(
                "The guessed language is not available in this instance of "
                "Tesseract."
            )

    def _ocr(self, imgs, lang):
        """
        Performs a single OCR attempt.
        """

        if not imgs:
            return ""

        self.log("info", "Parsing for {}".format(lang))

        with Pool(processes=self.THREADS) as pool:
            r = pool.map(image_to_string, itertools.product(imgs, [lang]))
            r = " ".join(r)

        # Strip out excess white space to allow matching to go smoother
        return strip_excess_whitespace(r)

    def _assemble_ocr_sections(self, imgs, middle, text):
        """
        Given a `middle` value and the text that middle page represents, we
        OCR the remainder of the document and return the whole thing.
        """
        text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
        return text


def run_convert(*args):

    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
    if settings.CONVERT_TMPDIR:
        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR

    subprocess.Popen(args, env=environment).wait()


def run_unpaper(args):
    unpaper, pnm = args
    subprocess.Popen(
        (unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait()


def strip_excess_whitespace(text):
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        "([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
    no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace)
    return no_trailing_whitespace


def image_to_string(args):
    img, lang = args
    ocr = pyocr.get_available_tools()[0]
    with Image.open(os.path.join(RasterisedDocumentParser.SCRATCH, img)) as f:
        if ocr.can_detect_orientation():
            try:
                orientation = ocr.detect_orientation(f, lang=lang)
                f = f.rotate(orientation["angle"], expand=1)
            except (TesseractError, OtherTesseractError):
                pass
        return ocr.image_to_string(f, lang=lang)
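For orientation, a hedged sketch of how this parser would be driven. Only `document_path`, `get_text()` and `get_thumbnail()` appear in this commit, so the constructor signature here is an assumption, not a documented API:

# hypothetical driver code, not part of this commit
parser = RasterisedDocumentParser("/path/to/scan.pdf")  # assumed constructor
text = parser.get_text()        # convert -> unpaper -> tesseract pipeline
thumb = parser.get_thumbnail()  # 500px-wide PNG of the first page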
23  src/paperless_tesseract/signals.py  Normal file
@@ -0,0 +1,23 @@
import re

from .parsers import RasterisedDocumentParser


class ConsumerDeclaration(object):

    MATCHING_FILES = re.compile("^.*\.(pdf|jpg|gif|png|tiff|pnm|bmp)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        return cls.test

    @classmethod
    def test(cls, doc):

        if cls.MATCHING_FILES.match(doc):
            return {
                "parser": RasterisedDocumentParser,
                "weight": 0
            }

        return None
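For context, the consumer side presumably picks a parser by sending this signal and keeping the best-weighted match. A sketch under that assumption (`get_parser_class` is an invented name, not part of this commit; the signal mechanics are standard Django):

# hypothetical consumer-side dispatch, not part of this commit
from documents.signals import document_consumer_declaration


def get_parser_class(doc):
    # Each receiver returns a test callable (ConsumerDeclaration.handle
    # returns cls.test); calling it yields {"parser": ..., "weight": ...}
    # for files the parser claims, or None otherwise.
    options = []
    for _receiver, test in document_consumer_declaration.send(sender=None):
        option = test(doc)
        if option:
            options.append(option)
    if not options:
        return None
    # Highest weight wins when several parsers match the same file.
    return max(options, key=lambda option: option["weight"])["parser"]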
0  src/paperless_tesseract/tests/__init__.py  Normal file
BIN  src/paperless_tesseract/tests/samples/no-text.png  Normal file (binary file not shown; added, 32 KiB)
80  src/paperless_tesseract/tests/test_ocr.py  Normal file
@@ -0,0 +1,80 @@
import os
from unittest import mock, skipIf

import pyocr
from django.test import TestCase
from pyocr.libtesseract.tesseract_raw import \
    TesseractError as OtherTesseractError

from ..parsers import image_to_string, strip_excess_whitespace


class FakeTesseract(object):

    @staticmethod
    def can_detect_orientation():
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        return "This is test text"


class FakePyOcr(object):

    @staticmethod
    def get_available_tools():
        return [FakeTesseract]


class TestOCR(TestCase):

    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract
        results in a segmentation fault rooted somewhere deep inside pyocr
        where I don't care to dig. Regardless, if you run the consumer
        normally, text-free pages are now handled correctly so long as we
        work around this weird exception.
        """
        image_to_string(["no-text.png", "en"])
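Assuming a standard Django project layout, these tests should run with the usual test runner once the app is installed:

python manage.py test paperless_tesseract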