From 02201997669f51e5c612e152678d0169bf72c7d6 Mon Sep 17 00:00:00 2001 From: Jens Pfeifle Date: Tue, 29 Jan 2019 09:02:33 +0100 Subject: [PATCH 1/6] fix parse error of some documents by using gs --- src/paperless_tesseract/parsers.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index fb8c1c3ec..5b468973d 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -45,13 +45,23 @@ class RasterisedDocumentParser(DocumentParser): """ out_path = os.path.join(self.tempdir, "convert.png") + gs_out_path = os.path.join(self.tempdir, "gs_out.png") # Run convert to get a decent thumbnail + + # https://github.com/danielquinn/paperless/issues/447 + # call gs first + environment = os.environ.copy() + cmd = ["gs", "-q", "-sDEVICE=pngalpha", + "-o", gs_out_path, self.document_path] + if not subprocess.Popen(cmd, env=environment).wait() == 0: + raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) + # then run convert on the output from gs run_convert( self.CONVERT, "-scale", "500x5000", "-alpha", "remove", - "{}[0]".format(self.document_path), + "gs_out_path", out_path ) From 50504c3fd8cbf67a4c34c521e0f15a63e2653e95 Mon Sep 17 00:00:00 2001 From: JensPfeifle Date: Wed, 30 Jan 2019 10:03:42 +0100 Subject: [PATCH 2/6] remove unnecessary env arg in Popen --- src/paperless_tesseract/parsers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 5b468973d..01595754c 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -51,10 +51,9 @@ class RasterisedDocumentParser(DocumentParser): # https://github.com/danielquinn/paperless/issues/447 # call gs first - environment = os.environ.copy() cmd = ["gs", "-q", "-sDEVICE=pngalpha", "-o", gs_out_path, self.document_path] - if not subprocess.Popen(cmd, 
env=environment).wait() == 0: + if not subprocess.Popen(cmd).wait() == 0: raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) # then run convert on the output from gs run_convert( From cbf008f37b9ce0fb11e2b877eff95e9ec98a37e9 Mon Sep 17 00:00:00 2001 From: Pit Date: Sat, 2 Feb 2019 23:37:25 +0100 Subject: [PATCH 3/6] Fix quoting in call to run_convert Co-Authored-By: JensPfeifle --- src/paperless_tesseract/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 01595754c..b6b77502a 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -60,7 +60,7 @@ class RasterisedDocumentParser(DocumentParser): self.CONVERT, "-scale", "500x5000", "-alpha", "remove", - "gs_out_path", + gs_out_path, out_path ) From ea282c22baada5868127719907810f8a2e28a27f Mon Sep 17 00:00:00 2001 From: JensPfeifle Date: Sun, 3 Feb 2019 16:57:32 +0100 Subject: [PATCH 4/6] Add GS_BINARY to settings to avoid hardcoded call of "gs" --- paperless.conf.example | 3 +++ src/paperless/settings.py | 3 +++ src/paperless_tesseract/parsers.py | 11 +++++++---- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/paperless.conf.example b/paperless.conf.example index aed2eee71..05a6c9cca 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -247,6 +247,9 @@ PAPERLESS_EMAIL_SECRET="" # Convert (part of the ImageMagick suite) #PAPERLESS_CONVERT_BINARY=/usr/bin/convert +# Ghostscript +#PAPERLESS_GS_BINARY = /usr/bin/gs + # Unpaper #PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper diff --git a/src/paperless/settings.py b/src/paperless/settings.py index eee727287..917d1e64f 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -263,6 +263,9 @@ CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY") +# Ghostscript 
+GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") + # OptiPNG OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng") diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index b6b77502a..6086a5920 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -29,6 +29,7 @@ class RasterisedDocumentParser(DocumentParser): """ CONVERT = settings.CONVERT_BINARY + GHOSTSCRIPT = settings.GS_BINARY DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None UNPAPER = settings.UNPAPER_BINARY @@ -47,12 +48,14 @@ class RasterisedDocumentParser(DocumentParser): out_path = os.path.join(self.tempdir, "convert.png") gs_out_path = os.path.join(self.tempdir, "gs_out.png") - # Run convert to get a decent thumbnail - + # Extract the first PDF page as a PNG using Ghostscript # https://github.com/danielquinn/paperless/issues/447 # call gs first - cmd = ["gs", "-q", "-sDEVICE=pngalpha", - "-o", gs_out_path, self.document_path] + cmd = [self.GHOSTSCRIPT, + "-q", + "-sDEVICE=pngalpha", + "-o", gs_out_path, + self.document_path] if not subprocess.Popen(cmd).wait() == 0: raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) # then run convert on the output from gs From 29b0886950a667a18687221cc5093d7ff79af23a Mon Sep 17 00:00:00 2001 From: JensPfeifle Date: Sun, 3 Feb 2019 18:19:06 +0100 Subject: [PATCH 5/6] try to run convert, but fall back on gs if needed --- src/paperless_tesseract/parsers.py | 50 +++++++++++++++++++----------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 6086a5920..f312eee2a 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -46,25 +46,39 @@ class RasterisedDocumentParser(DocumentParser): """ out_path = os.path.join(self.tempdir, "convert.png") - gs_out_path = 
os.path.join(self.tempdir, "gs_out.png") - # Extract the first PDF page as a PNG using Ghostscript - # https://github.com/danielquinn/paperless/issues/447 - # call gs first - cmd = [self.GHOSTSCRIPT, - "-q", - "-sDEVICE=pngalpha", - "-o", gs_out_path, - self.document_path] - if not subprocess.Popen(cmd).wait() == 0: - raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) - # then run convert on the output from gs - run_convert( - self.CONVERT, - "-scale", "500x5000", - "-alpha", "remove", - gs_out_path, - out_path + # Run convert to get a decent thumbnail + try: + run_convert( + self.CONVERT, + "-scale", "500x5000", + "-alpha", "remove", + "{}[0]".format(self.document_path), + out_path + ) + except ParseError: + # if convert fails, fall back to extracting + # the first PDF page as a PNG using Ghostscript + self.log( + "warning", + "Thumbnail generation with ImageMagick failed, " + "falling back to Ghostscript." + ) + gs_out_path = os.path.join(self.tempdir, "gs_out.png") + cmd = [self.GHOSTSCRIPT, + "-q", + "-sDEVICE=pngalpha", + "-o", gs_out_path, + self.document_path] + if not subprocess.Popen(cmd).wait() == 0: + raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) + # then run convert on the output from gs + run_convert( + self.CONVERT, + "-scale", "500x5000", + "-alpha", "remove", + gs_out_path, + out_path ) return out_path From 336f747f16ecb3af556304cf59f40985f4b095c7 Mon Sep 17 00:00:00 2001 From: jenspfeifle Date: Sun, 3 Mar 2019 20:41:17 +0100 Subject: [PATCH 6/6] make pycodestyle happy --- src/paperless_tesseract/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index f312eee2a..c2db5a056 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -79,7 +79,7 @@ class RasterisedDocumentParser(DocumentParser): "-alpha", "remove", gs_out_path, out_path - ) + ) return out_path