diff --git a/paperless.conf.example b/paperless.conf.example
index aed2eee71..05a6c9cca 100644
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -247,6 +247,9 @@ PAPERLESS_EMAIL_SECRET=""
 # Convert (part of the ImageMagick suite)
 #PAPERLESS_CONVERT_BINARY=/usr/bin/convert
 
+# Ghostscript
+#PAPERLESS_GS_BINARY=/usr/bin/gs
+
 # Unpaper
 #PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper
 
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index eee727287..917d1e64f 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -263,6 +263,9 @@ CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
 CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
 CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY")
 
+# Ghostscript
+GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
+
 # OptiPNG
 OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
 
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index fb8c1c3ec..c2db5a056 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -29,6 +29,7 @@ class RasterisedDocumentParser(DocumentParser):
     """
 
     CONVERT = settings.CONVERT_BINARY
+    GHOSTSCRIPT = settings.GS_BINARY
     DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
     UNPAPER = settings.UNPAPER_BINARY
@@ -47,13 +48,42 @@ class RasterisedDocumentParser(DocumentParser):
         out_path = os.path.join(self.tempdir, "convert.png")
 
         # Run convert to get a decent thumbnail
-        run_convert(
-            self.CONVERT,
-            "-scale", "500x5000",
-            "-alpha", "remove",
-            "{}[0]".format(self.document_path),
-            out_path
-        )
+        try:
+            run_convert(
+                self.CONVERT,
+                "-scale", "500x5000",
+                "-alpha", "remove",
+                "{}[0]".format(self.document_path),
+                out_path
+            )
+        except ParseError:
+            # if convert fails, fall back to extracting
+            # the first PDF page as a PNG using Ghostscript
+            self.log(
+                "warning",
+                "Thumbnail generation with ImageMagick failed, "
+                "falling back to Ghostscript."
+            )
+            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
+            # Render page 1 only: the output name has no %d, so without
+            # these limits every page would be written to the same file.
+            cmd = [self.GHOSTSCRIPT,
+                   "-q",
+                   "-sDEVICE=pngalpha",
+                   "-dFirstPage=1",
+                   "-dLastPage=1",
+                   "-o", gs_out_path,
+                   self.document_path]
+            if subprocess.Popen(cmd).wait() != 0:
+                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
+            # then run convert on the output from gs
+            run_convert(
+                self.CONVERT,
+                "-scale", "500x5000",
+                "-alpha", "remove",
+                gs_out_path,
+                out_path
+            )
 
         return out_path
 