From 3a08a2d2060f1181b212aff69120617a8964c017 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Mon, 2 Nov 2020 19:31:04 +0100 Subject: [PATCH] made unpaper and convert a little bit nicer to interact with --- src/documents/parsers.py | 43 ++++++++++++++++ src/paperless_tesseract/parsers.py | 79 ++++++++++++------------------ 2 files changed, 73 insertions(+), 49 deletions(-) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index c5357b419..63afa906d 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -29,6 +29,46 @@ DATE_REGEX = re.compile( ) +logger = logging.getLogger(__name__) + + +def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None): + environment = os.environ.copy() + if settings.CONVERT_MEMORY_LIMIT: + environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT + if settings.CONVERT_TMPDIR: + environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR + + args = [settings.CONVERT_BINARY] + args += ['-density', str(density)] if density else [] + args += ['-scale', str(scale)] if scale else [] + args += ['-alpha', str(alpha)] if alpha else [] + args += ['-strip'] if strip else [] + args += ['-trim'] if trim else [] + args += ['-type', str(type)] if type else [] + args += ['-depth', str(depth)] if depth else [] + args += [input, output] + + logger.debug("Execute: " + " ".join(args), extra={'group': logging_group}) + + if not subprocess.Popen(args, env=environment).wait() == 0: + raise ParseError("Convert failed at {}".format(args)) + + +def run_unpaper(pnm, logging_group=None): + pnm_out = pnm.replace(".pnm", ".unpaper.pnm") + + command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm, + pnm_out) + + logger.debug("Execute: " + " ".join(command_args), extra={'group': logging_group}) + + if not subprocess.Popen(command_args).wait() == 0: + raise ParseError("Unpaper failed at {}".format(command_args)) + + return pnm_out + + class ParseError(Exception): pass @@ -56,6 +96,9 @@ class DocumentParser: out_path = os.path.join(self.tempdir, "optipng.png") args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path) + + self.log('debug', 'Execute: ' + " ".join(args)) + if not subprocess.Popen(args).wait() == 0: raise ParseError("Optipng failed at {}".format(args)) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 951ad29ba..befc9bcd7 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -11,7 +11,8 @@ from PIL import Image from pyocr import PyocrException import pdftotext -from documents.parsers import DocumentParser, ParseError +from documents.parsers import DocumentParser, ParseError, run_unpaper, \ + run_convert from .languages import ISO639 @@ -39,15 +40,14 @@ class RasterisedDocumentParser(DocumentParser): # Run convert to get a decent thumbnail try: - run_convert( - settings.CONVERT_BINARY, - "-density", "300", - "-scale", "500x5000>", - "-alpha", "remove", - "-strip", "-trim", - "{}[0]".format(self.document_path), - out_path - ) + run_convert(density=300, + scale="500x5000>", + alpha="remove", + strip=True, + trim=True, + input="{}[0]".format(self.document_path), + output=out_path, + logging_group=self.logging_group) except ParseError: # if convert fails, fall back to extracting # the first PDF page as a PNG using Ghostscript @@ -61,15 +61,14 @@ class RasterisedDocumentParser(DocumentParser): if not subprocess.Popen(cmd).wait() == 0: raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) # then run convert on the output from gs - run_convert( - settings.CONVERT_BINARY, - "-density", "300", - "-scale", "500x5000>", - "-alpha", "remove", - "-strip", "-trim", - gs_out_path, - out_path - ) + run_convert(density=300, + scale="500x5000>", + alpha="remove", + strip=True, + trim=True, + input=gs_out_path, + output=out_path, + logging_group=self.logging_group) return out_path @@ -107,14 +106,17 @@ class RasterisedDocumentParser(DocumentParser): if not guessed_language or guessed_language not in ISO639: self.log("warning", "Language detection failed.") ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) + elif ISO639[guessed_language] == settings.OCR_LANGUAGE: self.log("info", "Detected language: {} (default language)".format(guessed_language)) ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) + elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): - self.log("warning","Detected language {} is not available on this system.".format(guessed_language)) + self.log("warning", "Detected language {} is not available on this system.".format(guessed_language)) ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) + else: - self.log("info","Detected language: {}".format(guessed_language)) + self.log("info", "Detected language: {}".format(guessed_language)) ocr_pages = self._ocr(images, ISO639[guessed_language]) self.log("info", "OCR completed.") @@ -133,13 +135,13 @@ class RasterisedDocumentParser(DocumentParser): # Convert PDF to multiple PNMs pnm = os.path.join(self.tempdir, "convert-%04d.pnm") - run_convert( - settings.CONVERT_BINARY, - "-density", str(settings.CONVERT_DENSITY), - "-depth", "8", - "-type", "grayscale", - self.document_path, pnm, - ) + + run_convert(density=settings.CONVERT_DENSITY, + depth="8", + type="grayscale", + input=self.document_path, + output=pnm, + logging_group=self.logging_group) # Get a list of converted images pnms = [] @@ -187,27 +189,6 @@ class RasterisedDocumentParser(DocumentParser): return [sample_page] -def run_convert(*args): - environment = os.environ.copy() - if settings.CONVERT_MEMORY_LIMIT: - environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT - if settings.CONVERT_TMPDIR: - environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR - - if not subprocess.Popen(args, env=environment).wait() == 0: - raise ParseError("Convert failed at {}".format(args)) - - -def run_unpaper(pnm): - pnm_out = pnm.replace(".pnm", ".unpaper.pnm") - - command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm, - pnm_out) - if not subprocess.Popen(command_args).wait() == 0: - raise ParseError("Unpaper failed at {}".format(command_args)) - - return pnm_out - def strip_excess_whitespace(text): collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)