From 40ce38254b57b73c7e1952aede21e254df22ef94 Mon Sep 17 00:00:00 2001 From: jonaswinkler <17569239+jonaswinkler@users.noreply.github.com> Date: Sun, 14 Mar 2021 14:42:48 +0100 Subject: [PATCH] fixes #631 --- src/documents/parsers.py | 67 +++++++++++------- src/documents/resources/document.png | Bin 0 -> 10498 bytes src/paperless_tesseract/parsers.py | 1 + .../tests/samples/encrypted.pdf | Bin 81455 -> 46594 bytes .../tests/samples/signed.pdf | Bin 0 -> 81455 bytes src/paperless_tesseract/tests/test_parser.py | 25 +++++-- 6 files changed, 63 insertions(+), 30 deletions(-) create mode 100644 src/documents/resources/document.png create mode 100644 src/paperless_tesseract/tests/samples/signed.pdf diff --git a/src/documents/parsers.py b/src/documents/parsers.py index b2714f6a3..8cb8f5399 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -143,6 +143,46 @@ def run_convert(input_file, raise ParseError("Convert failed at {}".format(args)) +def get_default_thumbnail(): + return os.path.join(os.path.dirname(__file__), "resources", "document.png") + + +def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None): + out_path = os.path.join(temp_dir, "convert_gs.png") + + # if convert fails, fall back to extracting + # the first PDF page as a PNG using Ghostscript + logger.warning( + "Thumbnail generation with ImageMagick failed, falling back " + "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!", + extra={'group': logging_group} + ) + gs_out_path = os.path.join(temp_dir, "gs_out.png") + cmd = [settings.GS_BINARY, + "-q", + "-sDEVICE=pngalpha", + "-o", gs_out_path, + in_path] + try: + if not subprocess.Popen(cmd).wait() == 0: + raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) + # then run convert on the output from gs + run_convert(density=300, + scale="500x5000>", + alpha="remove", + strip=True, + trim=False, + auto_orient=True, + input_file=gs_out_path, + output_file=out_path, + logging_group=logging_group) + + return out_path + + except ParseError: + return get_default_thumbnail() + + def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None): """ The thumbnail of a PDF is just a 500px wide image of the first page. @@ -161,31 +201,8 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None): output_file=out_path, logging_group=logging_group) except ParseError: - # if convert fails, fall back to extracting - # the first PDF page as a PNG using Ghostscript - logger.warning( - "Thumbnail generation with ImageMagick failed, falling back " - "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!", - extra={'group': logging_group} - ) - gs_out_path = os.path.join(temp_dir, "gs_out.png") - cmd = [settings.GS_BINARY, - "-q", - "-sDEVICE=pngalpha", - "-o", gs_out_path, - in_path] - if not subprocess.Popen(cmd).wait() == 0: - raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) - # then run convert on the output from gs - run_convert(density=300, - scale="500x5000>", - alpha="remove", - strip=True, - trim=False, - auto_orient=True, - input_file=gs_out_path, - output_file=out_path, - logging_group=logging_group) + out_path = make_thumbnail_from_pdf_gs_fallback( + in_path, temp_dir, logging_group) return out_path diff --git a/src/documents/resources/document.png b/src/documents/resources/document.png new file mode 100644 index 0000000000000000000000000000000000000000..8c24f9d6ed83af535d3ba5e2d7e396c2addde855 GIT binary patch literal 10498 zcmeHMcT|(vw#RW4rHMGm01AkJB7}s5me4db=|~X)1w#r_gGnHCEEEex=`A=&QxT9( zK)^Cc386@rrW8XHkSc=uzM#(Fb?(fa_1=Ad4QqXCW$*plzkSX=d!KW@kE
lGjKQzTOqS;U=y$2q!SMM&Q6SP`t3
zBaVf|>&2S{t0-wnF0D^iTHr0eik#xIhfMA7m|uG-bTd{1(?`3vazdCF^~$T_GRr`Y
z)J?Y!Z-?G|e5#|z8(-M d|=OAMK9j;ChvHFo12uFi6_6a_vL3{CufWTp!x~ZV~41Q=J@D
z*F1eC+A#BiZ==HnQ%ipY&-OAFG|EEhSUZRD-r3KWE)&gYFqnE$APBmw>8xRzRj3i-
zGBUyS@kew2G{vK*JJ+J(3-6+wC08q7j{S=KsM)jcqxwx@Md^`tGDrksM%NUM+x;*@
zmS6Kg<+1azsR>;B8bU3O5y~}j=B8fe-;xX8qW39bADV|!$iNEaMX=qcwV%o-QQ_@m
zhmJD`9&JYeX1_AboQ26LrPqO-99h8wrdIz(-Sw-0!KEc70!%tt^Vh48ua6I+`AH%e
z3oL747Xe7NCNYG*Gf}3xA);;wW9p9KV8%l+girxmf7m^9sua3kD2Er8 mu{VuEt@p)|?GmlfHW6S6ZfTC~Im=$Dpo2&~j^~bYu_Zo=Q
z=r<~WTyV*}K(s3Ym+o2W4UY>WWD(uVxJ%$!aNb*^qk5`C05vC~3$!D;&NzQAXv D;W3K_T){>1WLCBDssBE@hE#IOI%B$0d9XE*8;$M@X^%lcRH
zk&59QJlz*topexSa{|-nQ7g5c&5_97g6Yqa1UpP(SKpl-NsM_xCCA#21%e1h5EH@d(Gwj(9W)O@OnBf{F9&`JDyz8y
z%x4{@;?QOlIANx-O%B?{-QS-YX1nz8XyDzl?*Nt-mxC062z61F(ThhaO&59IDkaqx
zPY%>Cjv`l;!V>`9Rkn{3LD7#F
k4O#vzAJ@LQ&FHhV@FmjV6F9Oe)+2~Y
zy0D^VhLP!{^+rl;EfXG_z9Jd)W-d(p*-jDGwOhvEVt6!U(Pwcog
?@!6%
zgE#L!&?Qo(9HEFs(j@1!cnll403`KveJf>{
`I@A%=mBwl#d64zKtPaZ|CHQAfObx40@n!7ip
z@Z!62t}+0B4eeUdz++rye&Qz4So!1S?wd(*wPgT00ld>#l5IdbbI!Mi*#~ge>oy#|
z1w9iZPs;-+hr^LP-8Lfb*Er9TP7Op0GF6*p