Silenced unpaper and optipng for cleaner output

Moved the parser settings into the settings module. Removed the forgiving OCR option (it is now the default behaviour), since Tesseract is plenty accurate even without the correct language being defined.
2025-12-14 01:21:14 -06:00 · 2020-11-01 23:23:42 +01:00
parent 1b5344ddee
commit 9f55fb668d
4 changed files with 33 additions and 64 deletions
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -212,11 +212,6 @@
 #PAPERLESS_CONSUMER_LOOP_TIME=10
 # By default Paperless stops consuming a document if no language can be
 # detected. Set to true to consume documents even if the language detection
 # fails.
 #PAPERLESS_FORGIVING_OCR="false"
 # By default Paperless does not OCR a document if the text can be retrieved from
 # the document directly. Set to true to always OCR documents.
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -60,7 +60,7 @@ class DocumentParser:
        out_path = os.path.join(self.tempdir, "optipng.png")
-        args = (self.OPTIPNG, "-o5", in_path, "-out", out_path)
+        args = (self.OPTIPNG, "-silent", "-o5", in_path, "-out", out_path)
        if not subprocess.Popen(args).wait() == 0:
            raise ParseError("Optipng failed at {}".format(args))
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -103,9 +103,6 @@ REST_FRAMEWORK = {
        'rest_framework.authentication.BasicAuthentication',
        'rest_framework.authentication.TokenAuthentication',
        'paperless.auth.QueryTokenAuthentication'
    ],
    'DEFAULT_PERMISSION_CLASSES': [
        'rest_framework.permissions.IsAuthenticated',
    ]
 }
@@ -260,15 +257,11 @@ LOGGING = {
 OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
 # The amount of threads to use for OCR
-OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
+OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", 4))
 # OCR all documents?
 OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS")
 # If this is true, any failed attempts to OCR a PDF will result in the PDF
 # being indexed anyway, with whatever we could get.  If it's False, the file
 # will simply be left in the CONSUMPTION_DIR.
 FORGIVING_OCR = __get_boolean("PAPERLESS_FORGIVING_OCR")
 # GNUPG needs a home directory for some reason
 GNUPG_HOME = os.getenv("HOME", "/tmp")
@@ -277,7 +270,7 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
 CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert")
 CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
 CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
-CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY")
+CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300))
 # Ghostscript
 GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
@@ -327,3 +320,8 @@ FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
 FILENAME_PARSE_TRANSFORMS = []
 for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
    FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
 CELERY_TASK_TRACK_STARTED = True
 CELERY_RESULT_BACKEND = 'db+sqlite:///results.sqlite'
 CELERY_WORKER_PREFETCH_MULTIPLIER = 1
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -28,14 +28,6 @@ class RasterisedDocumentParser(DocumentParser):
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """
    CONVERT = settings.CONVERT_BINARY
    GHOSTSCRIPT = settings.GS_BINARY
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS
    def __init__(self, path):
        super().__init__(path)
        self._text = None
@@ -50,7 +42,7 @@ class RasterisedDocumentParser(DocumentParser):
        # Run convert to get a decent thumbnail
        try:
            run_convert(
-                self.CONVERT,
+                settings.CONVERT_BINARY,
                "-density", "300",
                "-scale", "500x5000>",
                "-alpha", "remove",
@@ -67,7 +59,7 @@ class RasterisedDocumentParser(DocumentParser):
                "falling back to Ghostscript."
            )
            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
-            cmd = [self.GHOSTSCRIPT,
+            cmd = [settings.GS_BINARY,
                   "-q",
                   "-sDEVICE=pngalpha",
                   "-o", gs_out_path,
@@ -76,7 +68,7 @@ class RasterisedDocumentParser(DocumentParser):
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
            run_convert(
-                self.CONVERT,
+                settings.CONVERT_BINARY,
                "-density", "300",
                "-scale", "500x5000>",
                "-alpha", "remove",
@@ -101,7 +93,7 @@ class RasterisedDocumentParser(DocumentParser):
        if self._text is not None:
            return self._text
-        if not self.OCR_ALWAYS and self._is_ocred():
+        if not settings.OCR_ALWAYS and self._is_ocred():
            self.log("info", "Skipping OCR, using Text from PDF")
            self._text = get_text_from_pdf(self.document_path)
            return self._text
@@ -122,8 +114,8 @@ class RasterisedDocumentParser(DocumentParser):
        # Convert PDF to multiple PNMs
        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
        run_convert(
-            self.CONVERT,
+            settings.CONVERT_BINARY,
-            "-density", str(self.DENSITY),
+            "-density", str(settings.CONVERT_DENSITY),
            "-depth", "8",
            "-type", "grayscale",
            self.document_path, pnm,
@@ -136,8 +128,8 @@ class RasterisedDocumentParser(DocumentParser):
                pnms.append(os.path.join(self.tempdir, f))
        # Run unpaper in parallel on converted images
-        with Pool(processes=self.THREADS) as pool:
+        with Pool(processes=settings.OCR_THREADS) as pool:
-            pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
+            pool.map(run_unpaper, itertools.product([settings.UNPAPER_BINARY], pnms))
        # Return list of converted images, processed with unpaper
        pnms = []
@@ -162,40 +154,29 @@ class RasterisedDocumentParser(DocumentParser):
        """
        if not imgs:
-            raise OCRError("No images found")
+            raise OCRError("Empty document, nothing to do.")
        self.log("info", "OCRing the document")
        # Since the division gets rounded down by int, this calculation works
        # for every edge-case, i.e. 1
        middle = int(len(imgs) / 2)
-        raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
+        raw_text = self._ocr([imgs[middle]], settings.OCR_LANGUAGE)
        guessed_language = self._guess_language(raw_text)
        if not guessed_language or guessed_language not in ISO639:
            self.log("warning", "Language detection failed!")
-            if settings.FORGIVING_OCR:
+
                self.log(
                    "warning",
                    "As FORGIVING_OCR is enabled, we're going to make the "
                    "best with what we have."
                )
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
            return raw_text
            error_msg = ("Language detection failed. Set "
                         "PAPERLESS_FORGIVING_OCR in config file to continue "
                         "anyway.")
            raise OCRError(error_msg)
-        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
+        if ISO639[guessed_language] == settings.OCR_LANGUAGE:
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
            return raw_text
        try:
            return self._ocr(imgs, ISO639[guessed_language])
        except pyocr.pyocr.tesseract.TesseractError:
            if settings.FORGIVING_OCR:
            self.log(
                "warning",
                "OCR for {} failed, but we're going to stick with what "
@@ -205,10 +186,6 @@ class RasterisedDocumentParser(DocumentParser):
            )
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
            return raw_text
            raise OCRError(
                "The guessed language ({}) is not available in this instance "
                "of Tesseract.".format(guessed_language)
            )
    def _ocr(self, imgs, lang):
        """
@@ -220,7 +197,7 @@ class RasterisedDocumentParser(DocumentParser):
        self.log("info", "Parsing for {}".format(lang))
-        with Pool(processes=self.THREADS) as pool:
+        with Pool(processes=settings.OCR_THREADS) as pool:
            r = pool.map(image_to_string, itertools.product(imgs, [lang]))
            r = " ".join(r)
@@ -232,13 +209,12 @@ class RasterisedDocumentParser(DocumentParser):
        Given a `middle` value and the text that middle page represents, we OCR
        the remainder of the document and return the whole thing.
        """
-        text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
+        text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text
-        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
+        text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE)
        return text
 def run_convert(*args):
    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@@ -251,7 +227,7 @@ def run_convert(*args):
 def run_unpaper(args):
    unpaper, pnm = args
-    command_args = (unpaper, "--overwrite", pnm,
+    command_args = (unpaper, "--overwrite", "--quiet", pnm,
                    pnm.replace(".pnm", ".unpaper.pnm"))
    if not subprocess.Popen(command_args).wait() == 0:
        raise ParseError("Unpaper failed at {}".format(command_args))