Made unpaper and convert a little bit nicer to interact with

2025-12-16 01:31:09 -06:00 · 2020-11-02 19:31:04 +01:00
parent c28b636ffa
commit 3a08a2d206
2 changed files with 73 additions and 49 deletions
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -29,6 +29,46 @@ DATE_REGEX = re.compile(
 )


+logger = logging.getLogger(__name__)
+
+
+def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
+    environment = os.environ.copy()
+    if settings.CONVERT_MEMORY_LIMIT:
+        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
+    if settings.CONVERT_TMPDIR:
+        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
+
+    args = [settings.CONVERT_BINARY]
+    args += ['-density', str(density)] if density else []
+    args += ['-scale', str(scale)] if scale else []
+    args += ['-alpha', str(alpha)] if alpha else []
+    args += ['-strip'] if strip else []
+    args += ['-trim'] if trim else []
+    args += ['-type', str(type)] if type else []
+    args += ['-depth', str(depth)] if depth else []
+    args += [input, output]
+
+    logger.debug("Execute: " + " ".join(args), extra={'group': logging_group})
+
+    if not subprocess.Popen(args, env=environment).wait() == 0:
+        raise ParseError("Convert failed at {}".format(args))
+
+
+def run_unpaper(pnm, logging_group=None):
+    pnm_out = pnm.replace(".pnm", ".unpaper.pnm")
+
+    command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
+                    pnm_out)
+
+    logger.debug("Execute: " + " ".join(command_args), extra={'group': logging_group})
+
+    if not subprocess.Popen(command_args).wait() == 0:
+        raise ParseError("Unpaper failed at {}".format(command_args))
+
+    return pnm_out
+
+
 class ParseError(Exception):
    pass

@@ -56,6 +96,9 @@ class DocumentParser:
        out_path = os.path.join(self.tempdir, "optipng.png")

        args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
+
+        self.log('debug', 'Execute: ' + " ".join(args))
+
        if not subprocess.Popen(args).wait() == 0:
            raise ParseError("Optipng failed at {}".format(args))

--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -11,7 +11,8 @@ from PIL import Image
 from pyocr import PyocrException

 import pdftotext
-from documents.parsers import DocumentParser, ParseError
+from documents.parsers import DocumentParser, ParseError, run_unpaper, \
+    run_convert

 from .languages import ISO639

@@ -39,15 +40,14 @@ class RasterisedDocumentParser(DocumentParser):

        # Run convert to get a decent thumbnail
        try:
-            run_convert(
-                settings.CONVERT_BINARY,
-                "-density", "300",
-                "-scale", "500x5000>",
-                "-alpha", "remove",
-                "-strip", "-trim",
-                "{}[0]".format(self.document_path),
-                out_path
-            )
+            run_convert(density=300,
+                        scale="500x5000>",
+                        alpha="remove",
+                        strip=True,
+                        trim=True,
+                        input="{}[0]".format(self.document_path),
+                        output=out_path,
+                        logging_group=self.logging_group)
        except ParseError:
            # if convert fails, fall back to extracting
            # the first PDF page as a PNG using Ghostscript
@@ -61,15 +61,14 @@ class RasterisedDocumentParser(DocumentParser):
            if not subprocess.Popen(cmd).wait() == 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
-            run_convert(
-                settings.CONVERT_BINARY,
-                "-density", "300",
-                "-scale", "500x5000>",
-                "-alpha", "remove",
-                "-strip", "-trim",
-                gs_out_path,
-                out_path
-            )
+            run_convert(density=300,
+                        scale="500x5000>",
+                        alpha="remove",
+                        strip=True,
+                        trim=True,
+                        input=gs_out_path,
+                        output=out_path,
+                        logging_group=self.logging_group)

        return out_path

@@ -107,14 +106,17 @@ class RasterisedDocumentParser(DocumentParser):
            if not guessed_language or guessed_language not in ISO639:
                self.log("warning", "Language detection failed.")
                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+
            elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
                self.log("info", "Detected language: {} (default language)".format(guessed_language))
                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+
            elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
-                self.log("warning","Detected language {} is not available on this system.".format(guessed_language))
+                self.log("warning", "Detected language {} is not available on this system.".format(guessed_language))
                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+
            else:
-                self.log("info","Detected language: {}".format(guessed_language))
+                self.log("info", "Detected language: {}".format(guessed_language))
                ocr_pages = self._ocr(images, ISO639[guessed_language])

            self.log("info", "OCR completed.")
@@ -133,13 +135,13 @@ class RasterisedDocumentParser(DocumentParser):

        # Convert PDF to multiple PNMs
        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
-        run_convert(
-            settings.CONVERT_BINARY,
-            "-density", str(settings.CONVERT_DENSITY),
-            "-depth", "8",
-            "-type", "grayscale",
-            self.document_path, pnm,
-        )
+
+        run_convert(density=settings.CONVERT_DENSITY,
+                    depth="8",
+                    type="grayscale",
+                    input=self.document_path,
+                    output=pnm,
+                    logging_group=self.logging_group)

        # Get a list of converted images
        pnms = []
@@ -187,27 +189,6 @@ class RasterisedDocumentParser(DocumentParser):
            return [sample_page]


-def run_convert(*args):
-    environment = os.environ.copy()
-    if settings.CONVERT_MEMORY_LIMIT:
-        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
-    if settings.CONVERT_TMPDIR:
-        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
-
-    if not subprocess.Popen(args, env=environment).wait() == 0:
-        raise ParseError("Convert failed at {}".format(args))
-
-
-def run_unpaper(pnm):
-    pnm_out = pnm.replace(".pnm", ".unpaper.pnm")
-
-    command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
-                    pnm_out)
-    if not subprocess.Popen(command_args).wait() == 0:
-        raise ParseError("Unpaper failed at {}".format(command_args))
-
-    return pnm_out
-

 def strip_excess_whitespace(text):
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)