mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-01 04:06:16 -05:00 
			
		
		
		
	feat: refactor for pluggable consumers
I've broken out the OCR-specific code from the consumers and dumped it
all into its own app, `paperless_tesseract`.  This new app should serve
as a sample of how to create one's own consumer for different file
types.
Documentation for how to do this isn't ready yet, but for the impatient:
* Create a new app
    * containing a `parsers.py` for your parser modelled after
      `paperless_tesseract.parsers.RasterisedDocumentParser`
    * containing a `signals.py` with a handler modelled after
      `paperless_tesseract.signals.ConsumerDeclaration`
    * connect the signal handler to
      `documents.signals.document_consumer_declaration` in
      `your_app.apps`
* Install the app into Paperless by declaring
  `PAPERLESS_INSTALLED_APPS=your_app`.  Additional apps should be
  separated with commas.
* Restart the consumer
			
			
This commit is contained in:
		| @@ -1,35 +1,21 @@ | ||||
| import datetime | ||||
| import hashlib | ||||
| import logging | ||||
| import os | ||||
| import re | ||||
| import uuid | ||||
| import shutil | ||||
| import hashlib | ||||
| import logging | ||||
| import datetime | ||||
| import tempfile | ||||
| import itertools | ||||
| import subprocess | ||||
| from multiprocessing.pool import Pool | ||||
|  | ||||
| import pyocr | ||||
| import langdetect | ||||
| from PIL import Image | ||||
| from django.conf import settings | ||||
| from django.utils import timezone | ||||
| from paperless.db import GnuPG | ||||
| from pyocr.tesseract import TesseractError | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
|  | ||||
| from .models import Tag, Document, FileInfo | ||||
| from .models import Document, FileInfo, Tag | ||||
| from .parsers import ParseError | ||||
| from .signals import ( | ||||
|     document_consumption_started, | ||||
|     document_consumption_finished | ||||
|     document_consumer_declaration, | ||||
|     document_consumption_finished, | ||||
|     document_consumption_started | ||||
| ) | ||||
| from .languages import ISO639 | ||||
|  | ||||
|  | ||||
class OCRError(Exception):
    """Raised when OCR of a document fails and no text can be recovered."""
|  | ||||
|  | ||||
| class ConsumerError(Exception): | ||||
| @@ -47,13 +33,7 @@ class Consumer(object): | ||||
|     """ | ||||
|  | ||||
|     SCRATCH = settings.SCRATCH_DIR | ||||
|     CONVERT = settings.CONVERT_BINARY | ||||
|     UNPAPER = settings.UNPAPER_BINARY | ||||
|     CONSUME = settings.CONSUMPTION_DIR | ||||
|     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None | ||||
|     DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 | ||||
|  | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
|  | ||||
|     def __init__(self): | ||||
|  | ||||
| @@ -78,6 +58,16 @@ class Consumer(object): | ||||
|             raise ConsumerError( | ||||
|                 "Consumption directory {} does not exist".format(self.CONSUME)) | ||||
|  | ||||
|         self.parsers = [] | ||||
|         for response in document_consumer_declaration.send(self): | ||||
|             self.parsers.append(response[1]) | ||||
|  | ||||
|         if not self.parsers: | ||||
|             raise ConsumerError( | ||||
|                 "No parsers could be found, not even the default.  " | ||||
|                 "This is a problem." | ||||
|             ) | ||||
|  | ||||
|     def log(self, level, message): | ||||
|         getattr(self.logger, level)(message, extra={ | ||||
|             "group": self.logging_group | ||||
| @@ -109,6 +99,13 @@ class Consumer(object): | ||||
|                 self._ignore.append(doc) | ||||
|                 continue | ||||
|  | ||||
|             parser_class = self._get_parser_class(doc) | ||||
|             if not parser_class: | ||||
|                 self.log( | ||||
|                     "info", "No parsers could be found for {}".format(doc)) | ||||
|                 self._ignore.append(doc) | ||||
|                 continue | ||||
|  | ||||
|             self.logging_group = uuid.uuid4() | ||||
|  | ||||
|             self.log("info", "Consuming {}".format(doc)) | ||||
| @@ -119,25 +116,26 @@ class Consumer(object): | ||||
|                 logging_group=self.logging_group | ||||
|             ) | ||||
|  | ||||
|             tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) | ||||
|             imgs = self._get_greyscale(tempdir, doc) | ||||
|             thumbnail = self._get_thumbnail(tempdir, doc) | ||||
|             parsed_document = parser_class(doc) | ||||
|             thumbnail = parsed_document.get_thumbnail() | ||||
|  | ||||
|             try: | ||||
|  | ||||
|                 document = self._store(self._get_ocr(imgs), doc, thumbnail) | ||||
|  | ||||
|             except OCRError as e: | ||||
|                 document = self._store( | ||||
|                     parsed_document.get_text(), | ||||
|                     doc, | ||||
|                     thumbnail | ||||
|                 ) | ||||
|             except ParseError as e: | ||||
|  | ||||
|                 self._ignore.append(doc) | ||||
|                 self.log("error", "OCR FAILURE for {}: {}".format(doc, e)) | ||||
|                 self._cleanup_tempdir(tempdir) | ||||
|                 self.log("error", "PARSE FAILURE for {}: {}".format(doc, e)) | ||||
|                 parsed_document.cleanup() | ||||
|  | ||||
|                 continue | ||||
|  | ||||
|             else: | ||||
|  | ||||
|                 self._cleanup_tempdir(tempdir) | ||||
|                 parsed_document.cleanup() | ||||
|                 self._cleanup_doc(doc) | ||||
|  | ||||
|                 self.log( | ||||
| @@ -151,142 +149,20 @@ class Consumer(object): | ||||
|                     logging_group=self.logging_group | ||||
|                 ) | ||||
|  | ||||
|     def _get_greyscale(self, tempdir, doc): | ||||
|     def _get_parser_class(self, doc): | ||||
|         """ | ||||
|         Greyscale images are easier for Tesseract to OCR | ||||
|         Determine the appropriate parser class based on the file | ||||
|         """ | ||||
|  | ||||
|         self.log("info", "Generating greyscale image from {}".format(doc)) | ||||
|         options = [] | ||||
|         for parser in self.parsers: | ||||
|             result = parser(doc) | ||||
|             if result: | ||||
|                 options.append(result) | ||||
|  | ||||
|         # Convert PDF to multiple PNMs | ||||
|         pnm = os.path.join(tempdir, "convert-%04d.pnm") | ||||
|         run_convert( | ||||
|             self.CONVERT, | ||||
|             "-density", str(self.DENSITY), | ||||
|             "-depth", "8", | ||||
|             "-type", "grayscale", | ||||
|             doc, pnm, | ||||
|         ) | ||||
|  | ||||
|         # Get a list of converted images | ||||
|         pnms = [] | ||||
|         for f in os.listdir(tempdir): | ||||
|             if f.endswith(".pnm"): | ||||
|                 pnms.append(os.path.join(tempdir, f)) | ||||
|  | ||||
|         # Run unpaper in parallel on converted images | ||||
|         with Pool(processes=self.THREADS) as pool: | ||||
|             pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms)) | ||||
|  | ||||
|         # Return list of converted images, processed with unpaper | ||||
|         pnms = [] | ||||
|         for f in os.listdir(tempdir): | ||||
|             if f.endswith(".unpaper.pnm"): | ||||
|                 pnms.append(os.path.join(tempdir, f)) | ||||
|  | ||||
|         return sorted(filter(lambda __: os.path.isfile(__), pnms)) | ||||
|  | ||||
|     def _get_thumbnail(self, tempdir, doc): | ||||
|         """ | ||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||
|         """ | ||||
|  | ||||
|         self.log("info", "Generating the thumbnail") | ||||
|  | ||||
|         run_convert( | ||||
|             self.CONVERT, | ||||
|             "-scale", "500x5000", | ||||
|             "-alpha", "remove", | ||||
|             doc, os.path.join(tempdir, "convert-%04d.png") | ||||
|         ) | ||||
|  | ||||
|         return os.path.join(tempdir, "convert-0000.png") | ||||
|  | ||||
|     def _guess_language(self, text): | ||||
|         try: | ||||
|             guess = langdetect.detect(text) | ||||
|             self.log("debug", "Language detected: {}".format(guess)) | ||||
|             return guess | ||||
|         except Exception as e: | ||||
|             self.log("warning", "Language detection error: {}".format(e)) | ||||
|  | ||||
|     def _get_ocr(self, imgs): | ||||
|         """ | ||||
|         Attempts to do the best job possible OCR'ing the document based on | ||||
|         simple language detection trial & error. | ||||
|         """ | ||||
|  | ||||
|         if not imgs: | ||||
|             raise OCRError("No images found") | ||||
|  | ||||
|         self.log("info", "OCRing the document") | ||||
|  | ||||
|         # Since the division gets rounded down by int, this calculation works | ||||
|         # for every edge-case, i.e. 1 | ||||
|         middle = int(len(imgs) / 2) | ||||
|         raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE) | ||||
|  | ||||
|         guessed_language = self._guess_language(raw_text) | ||||
|  | ||||
|         if not guessed_language or guessed_language not in ISO639: | ||||
|             self.log("warning", "Language detection failed!") | ||||
|             if settings.FORGIVING_OCR: | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     "As FORGIVING_OCR is enabled, we're going to make the " | ||||
|                     "best with what we have." | ||||
|                 ) | ||||
|                 raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||
|                 return raw_text | ||||
|             raise OCRError("Language detection failed") | ||||
|  | ||||
|         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: | ||||
|             raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||
|             return raw_text | ||||
|  | ||||
|         try: | ||||
|             return self._ocr(imgs, ISO639[guessed_language]) | ||||
|         except pyocr.pyocr.tesseract.TesseractError: | ||||
|             if settings.FORGIVING_OCR: | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     "OCR for {} failed, but we're going to stick with what " | ||||
|                     "we've got since FORGIVING_OCR is enabled.".format( | ||||
|                         guessed_language | ||||
|                     ) | ||||
|                 ) | ||||
|                 raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) | ||||
|                 return raw_text | ||||
|             raise OCRError( | ||||
|                 "The guessed language is not available in this instance of " | ||||
|                 "Tesseract." | ||||
|             ) | ||||
|  | ||||
|     def _assemble_ocr_sections(self, imgs, middle, text): | ||||
|         """ | ||||
|         Given a `middle` value and the text that middle page represents, we OCR | ||||
|         the remainder of the document and return the whole thing. | ||||
|         """ | ||||
|         text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text | ||||
|         text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) | ||||
|         return text | ||||
|  | ||||
|     def _ocr(self, imgs, lang): | ||||
|         """ | ||||
|         Performs a single OCR attempt. | ||||
|         """ | ||||
|  | ||||
|         if not imgs: | ||||
|             return "" | ||||
|  | ||||
|         self.log("info", "Parsing for {}".format(lang)) | ||||
|  | ||||
|         with Pool(processes=self.THREADS) as pool: | ||||
|             r = pool.map(image_to_string, itertools.product(imgs, [lang])) | ||||
|             r = " ".join(r) | ||||
|  | ||||
|         # Strip out excess white space to allow matching to go smoother | ||||
|         return strip_excess_whitespace(r) | ||||
|         # Return the parser with the highest weight. | ||||
|         return sorted( | ||||
|             options, key=lambda _: _["weight"], reverse=True)[0]["parser"] | ||||
|  | ||||
|     def _store(self, text, doc, thumbnail): | ||||
|  | ||||
| @@ -332,10 +208,6 @@ class Consumer(object): | ||||
|  | ||||
|         return document | ||||
|  | ||||
|     def _cleanup_tempdir(self, d): | ||||
|         self.log("debug", "Deleting directory {}".format(d)) | ||||
|         shutil.rmtree(d) | ||||
|  | ||||
|     def _cleanup_doc(self, doc): | ||||
|         self.log("debug", "Deleting document {}".format(doc)) | ||||
|         os.unlink(doc) | ||||
| @@ -361,41 +233,3 @@ class Consumer(object): | ||||
|         with open(doc, "rb") as f: | ||||
|             checksum = hashlib.md5(f.read()).hexdigest() | ||||
|         return Document.objects.filter(checksum=checksum).exists() | ||||
|  | ||||
|  | ||||
def strip_excess_whitespace(text):
    """
    Normalise OCR output so that matching goes smoother.

    Collapses runs of horizontal whitespace to a single space, drops
    whitespace immediately following a line break, and strips whitespace
    trailing the end of the string.  Line breaks themselves are preserved.
    """
    result = re.sub(r"([^\S\r\n]+)", " ", text)
    result = re.sub("([\n\r]+)([^\S\n\r]+)", '\\1', result)
    return re.sub("([^\S\n\r]+)$", '', result)
|  | ||||
|  | ||||
def image_to_string(args):
    """
    Pool worker: OCR a single image and return the recognised text.

    `args` is a (image-file-name, language) pair; the image is resolved
    relative to Consumer.SCRATCH.  If the OCR tool reports orientation
    support, the page is rotated upright first; orientation-detection
    errors are swallowed and the image is OCR'd as-is.
    """
    img, lang = args
    tool = pyocr.get_available_tools()[0]
    path = os.path.join(Consumer.SCRATCH, img)
    with Image.open(path) as image:
        if tool.can_detect_orientation():
            try:
                result = tool.detect_orientation(image, lang=lang)
                image = image.rotate(result["angle"], expand=1)
            except (TesseractError, OtherTesseractError):
                # Pages with no text make pyocr raise; skip rotation then.
                pass
        return tool.image_to_string(image, lang=lang)
|  | ||||
|  | ||||
def run_unpaper(args):
    """
    Pool worker: clean up a single .pnm page with unpaper.

    `args` is a (unpaper-binary, pnm-path) pair; the cleaned image is
    written next to the input as *.unpaper.pnm.
    """
    unpaper, pnm = args
    output = pnm.replace(".pnm", ".unpaper.pnm")
    subprocess.Popen((unpaper, pnm, output)).wait()
|  | ||||
|  | ||||
def run_convert(*args):
    """
    Run ImageMagick's convert with the given command line and wait for it.

    Memory and temp-dir limits from the Django settings are passed to
    convert through its MAGICK_* environment variables when configured.
    """
    environment = dict(os.environ)
    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )
    for variable, value in overrides:
        if value:
            environment[variable] = value

    subprocess.Popen(args, env=environment).wait()
|   | ||||
| @@ -1,194 +0,0 @@ | ||||
# Thanks to the Library of Congress and some creative use of sed and awk:
# http://www.loc.gov/standards/iso639-2/php/English_list.php

# Maps ISO 639-1 two-letter codes (as produced by langdetect) to the
# three-letter codes Tesseract uses to name its language data files.
ISO639 = {

    "aa": "aar",
    "ab": "abk",
    "ae": "ave",
    "af": "afr",
    "ak": "aka",
    "am": "amh",
    "an": "arg",
    "ar": "ara",
    "as": "asm",
    "av": "ava",
    "ay": "aym",
    "az": "aze",
    "ba": "bak",
    "be": "bel",
    "bg": "bul",
    "bh": "bih",
    "bi": "bis",
    "bm": "bam",
    "bn": "ben",
    "bo": "bod",
    "br": "bre",
    "bs": "bos",
    "ca": "cat",
    "ce": "che",
    "ch": "cha",
    "co": "cos",
    "cr": "cre",
    "cs": "ces",
    "cu": "chu",
    "cv": "chv",
    "cy": "cym",
    "da": "dan",
    "de": "deu",
    "dv": "div",
    "dz": "dzo",
    "ee": "ewe",
    "el": "ell",
    "en": "eng",
    "eo": "epo",
    "es": "spa",
    "et": "est",
    "eu": "eus",
    "fa": "fas",
    "ff": "ful",
    "fi": "fin",
    "fj": "fij",
    "fo": "fao",
    "fr": "fra",
    "fy": "fry",
    "ga": "gle",
    "gd": "gla",
    "gl": "glg",
    "gn": "grn",
    "gu": "guj",
    "gv": "glv",
    "ha": "hau",
    "he": "heb",
    "hi": "hin",
    "ho": "hmo",
    "hr": "hrv",
    "ht": "hat",
    "hu": "hun",
    "hy": "hye",
    "hz": "her",
    "ia": "ina",
    "id": "ind",
    "ie": "ile",
    "ig": "ibo",
    "ii": "iii",
    "ik": "ipk",
    "io": "ido",
    "is": "isl",
    "it": "ita",
    "iu": "iku",
    "ja": "jpn",
    "jv": "jav",
    "ka": "kat",
    "kg": "kon",
    "ki": "kik",
    "kj": "kua",
    "kk": "kaz",
    "kl": "kal",
    "km": "khm",
    "kn": "kan",
    "ko": "kor",
    "kr": "kau",
    "ks": "kas",
    "ku": "kur",
    "kv": "kom",
    "kw": "cor",
    "ky": "kir",
    "la": "lat",
    "lb": "ltz",
    "lg": "lug",
    "li": "lim",
    "ln": "lin",
    "lo": "lao",
    "lt": "lit",
    "lu": "lub",
    "lv": "lav",
    "mg": "mlg",
    "mh": "mah",
    "mi": "mri",
    "mk": "mkd",
    "ml": "mal",
    "mn": "mon",
    "mr": "mar",
    "ms": "msa",
    "mt": "mlt",
    "my": "mya",
    "na": "nau",
    "nb": "nob",
    "nd": "nde",
    "ne": "nep",
    "ng": "ndo",
    "nl": "nld",
    "no": "nor",
    "nr": "nbl",
    "nv": "nav",
    "ny": "nya",
    "oc": "oci",
    "oj": "oji",
    "om": "orm",
    "or": "ori",
    "os": "oss",
    "pa": "pan",
    "pi": "pli",
    "pl": "pol",
    "ps": "pus",
    "pt": "por",
    "qu": "que",
    "rm": "roh",
    "rn": "run",
    "ro": "ron",
    "ru": "rus",
    "rw": "kin",
    "sa": "san",
    "sc": "srd",
    "sd": "snd",
    "se": "sme",
    "sg": "sag",
    "si": "sin",
    "sk": "slk",
    "sl": "slv",
    "sm": "smo",
    "sn": "sna",
    "so": "som",
    "sq": "sqi",
    "sr": "srp",
    "ss": "ssw",
    "st": "sot",
    "su": "sun",
    "sv": "swe",
    "sw": "swa",
    "ta": "tam",
    "te": "tel",
    "tg": "tgk",
    "th": "tha",
    "ti": "tir",
    "tk": "tuk",
    "tl": "tgl",
    "tn": "tsn",
    "to": "ton",
    "tr": "tur",
    "ts": "tso",
    "tt": "tat",
    "tw": "twi",
    "ty": "tah",
    "ug": "uig",
    "uk": "ukr",
    "ur": "urd",
    "uz": "uzb",
    "ve": "ven",
    "vi": "vie",
    "vo": "vol",
    "wa": "wln",
    "wo": "wol",
    "xh": "xho",
    "yi": "yid",
    "yo": "yor",
    "za": "zha",

    # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra".  I
    # have no idea which one is better, so I just picked the bigger file.
    "zh": "chi_tra",

    "zu": "zul"

}
							
								
								
									
										45
									
								
								src/documents/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								src/documents/parsers.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,45 @@ | ||||
| import logging | ||||
| import shutil | ||||
| import tempfile | ||||
|  | ||||
| from django.conf import settings | ||||
|  | ||||
|  | ||||
class ParseError(Exception):
    """Raised by a parser when a document cannot be parsed."""
|  | ||||
|  | ||||
class DocumentParser(object):
    """
    Base class for pluggable document parsers.

    Subclass this to make your own parser.  Have a look at
    `paperless_tesseract.parsers` for inspiration.
    """

    # Intermediate files are created beneath the configured scratch dir.
    SCRATCH = settings.SCRATCH_DIR

    def __init__(self, path):
        # Path of the document file being consumed.
        self.document_path = path
        # Private working directory, removed again by cleanup().
        self.tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
        self.logger = logging.getLogger(__name__)
        self.logging_group = None

    def get_thumbnail(self):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        Subclasses must implement this.
        """
        raise NotImplementedError()

    def get_text(self):
        """
        Returns the text from the document and only the text.  Subclasses
        must implement this.
        """
        raise NotImplementedError()

    def log(self, level, message):
        """Log `message` at `level`, tagged with the current logging group."""
        record_extra = {"group": self.logging_group}
        getattr(self.logger, level)(message, extra=record_extra)

    def cleanup(self):
        """Delete the private working directory and everything in it."""
        self.log("debug", "Deleting directory {}".format(self.tempdir))
        shutil.rmtree(self.tempdir)
| @@ -2,3 +2,4 @@ from django.dispatch import Signal | ||||
|  | ||||
# Sent by the consumer when it begins/finishes processing a single file.
document_consumption_started = Signal(providing_args=["filename"])
document_consumption_finished = Signal(providing_args=["document"])
# Sent by the consumer at startup; installed apps connect to this to declare
# the document parsers they provide (see documents.consumer.Consumer).
document_consumer_declaration = Signal(providing_args=[])
|   | ||||
| @@ -1,6 +1,5 @@ | ||||
| import logging | ||||
| import os | ||||
|  | ||||
| from subprocess import Popen | ||||
|  | ||||
| from django.conf import settings | ||||
|   | ||||
										
											Binary file not shown.
										
									
								
							| Before Width: | Height: | Size: 32 KiB | 
| @@ -1,13 +1,6 @@ | ||||
| import os | ||||
| from unittest import mock, skipIf | ||||
|  | ||||
| import pyocr | ||||
| from django.test import TestCase | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
|  | ||||
| from ..models import FileInfo | ||||
| from ..consumer import image_to_string, strip_excess_whitespace | ||||
|  | ||||
|  | ||||
| class TestAttributes(TestCase): | ||||
| @@ -310,69 +303,3 @@ class TestFieldPermutations(TestCase): | ||||
|                             template.format(**spec), **spec) | ||||
|  | ||||
|  | ||||
class FakeTesseract(object):
    """
    Minimal stand-in for a pyocr OCR tool, used by TestOCR below.
    """

    @staticmethod
    def can_detect_orientation():
        # Claim orientation support so the code under test takes the
        # orientation-detection path.
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        # Reproduce pyocr's behaviour on a page with no text.
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        return "This is test text"
|  | ||||
|  | ||||
class FakePyOcr(object):
    """
    Stand-in for the pyocr module that always offers FakeTesseract as the
    only available OCR tool.
    """

    @staticmethod
    def get_available_tools():
        return [FakeTesseract]
|  | ||||
|  | ||||
class TestOCR(TestCase):
    """
    Tests for the OCR helper functions in documents.consumer.
    """

    # (source, expected) pairs for strip_excess_whitespace().
    text_cases = [
        ("simple     string", "simple string"),
        (
            "simple    newline\n   testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8   строка с пробелами в конце  ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                # Fixed typo in the failure message: was
                # "strip_exceess_whitespace".
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
    @mock.patch("documents.consumer.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract results
        in a segmentation fault rooted somewhere deep inside pyocr where I
        don't care to dig.  Regardless, if you run the consumer normally,
        text-free pages are now handled correctly so long as we work around
        this weird exception.
        """
        image_to_string(["no-text.png", "en"])
|   | ||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn