Merge pull request #494 from JensPfeifle/fix_447

Fix parse errors on some documents by falling back to Ghostscript (gs) for thumbnail generation when ImageMagick convert fails
This commit is contained in:
JOKer 2019-03-08 21:45:25 +01:00 committed by GitHub
commit 305d50d7ed
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 39 additions and 7 deletions

View File

@ -247,6 +247,9 @@ PAPERLESS_EMAIL_SECRET=""
# Convert (part of the ImageMagick suite)
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert
# Ghostscript
#PAPERLESS_GS_BINARY=/usr/bin/gs
# Unpaper
#PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper

View File

@ -263,6 +263,9 @@ CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY")
# Ghostscript
GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
# OptiPNG
OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")

View File

@ -29,6 +29,7 @@ class RasterisedDocumentParser(DocumentParser):
"""
CONVERT = settings.CONVERT_BINARY
GHOSTSCRIPT = settings.GS_BINARY
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY
@ -47,13 +48,38 @@ class RasterisedDocumentParser(DocumentParser):
out_path = os.path.join(self.tempdir, "convert.png")
# Run convert to get a decent thumbnail
run_convert(
self.CONVERT,
"-scale", "500x5000",
"-alpha", "remove",
"{}[0]".format(self.document_path),
out_path
)
try:
run_convert(
self.CONVERT,
"-scale", "500x5000",
"-alpha", "remove",
"{}[0]".format(self.document_path),
out_path
)
except ParseError:
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
self.log(
"warning",
"Thumbnail generation with ImageMagick failed, "
"falling back to Ghostscript."
)
gs_out_path = os.path.join(self.tempdir, "gs_out.png")
cmd = [self.GHOSTSCRIPT,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
self.document_path]
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(
self.CONVERT,
"-scale", "500x5000",
"-alpha", "remove",
gs_out_path,
out_path
)
return out_path