diff --git a/Dockerfile b/Dockerfile index 968d67da5..55d54cc01 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ ENV PAPERLESS_EXPORT_DIR=/export \ RUN apk update --no-cache && apk add python3 gnupg libmagic bash shadow curl \ - sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \ + sudo poppler tesseract-ocr imagemagick ghostscript unpaper optipng && \ apk add --virtual .build-dependencies \ python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \ # Install python dependencies diff --git a/docs/changelog.rst b/docs/changelog.rst index aefe65c25..5e548301c 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,9 +1,14 @@ Changelog ######### -2.4.1 +2.5.0 ===== +* **New dependency**: Paperless now optimises thumbnail generation with + `optipng`_, so you'll need to install that somewhere in your PATH or declare + its location in ``PAPERLESS_OPTIPNG_BINARY``. The Docker image has already + been updated on the Docker Hub, so you just need to pull the latest one from + there if you're a Docker user. * An annoying bug in the date capture code was causing some bogus dates to be attached to documents, which in turn busted the UI. Thanks to `Andrew Peng`_ for reporting this. `#414`_. @@ -632,3 +637,4 @@ bulk of the work on this big change. .. _pipenv: https://docs.pipenv.org/ .. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/ +.. _optipng: http://optipng.sourceforge.net/ diff --git a/paperless.conf.example b/paperless.conf.example index 05cf81724..3604505cb 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -213,3 +213,23 @@ PAPERLESS_DEBUG="false" # The number of years for which a correspondent will be included in the recent # correspondents filter. 
#PAPERLESS_RECENT_CORRESPONDENT_YEARS=1 + +############################################################################### +#### Third-Party Binaries #### +############################################################################### + +# There are a few external software packages that Paperless expects to find on +# your system when it starts up. Unless you've done something creative with +# their installation, you probably won't need to edit any of these. However, +# if you've installed these programs somewhere where simply typing the name of +# the program doesn't automatically execute it (i.e. the program isn't in your +# $PATH), then you'll need to specify the literal path for that program here. + +# Convert (part of the ImageMagick suite) +#PAPERLESS_CONVERT_BINARY=/usr/bin/convert + +# Unpaper +#PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper + +# Optipng (for optimising thumbnail sizes) +#PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 7dd94ebf1..3cb484b2a 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -149,7 +149,7 @@ class Consumer: parsed_document = parser_class(doc) try: - thumbnail = parsed_document.get_thumbnail() + thumbnail = parsed_document.get_optimised_thumbnail() date = parsed_document.get_date() document = self._store( parsed_document.get_text(), diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 29128eaad..1f60b1479 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -2,6 +2,7 @@ import logging import os import re import shutil +import subprocess import tempfile import dateparser @@ -36,6 +37,7 @@ class DocumentParser: SCRATCH = settings.SCRATCH_DIR DATE_ORDER = settings.DATE_ORDER + OPTIPNG = settings.OPTIPNG_BINARY def __init__(self, path): self.document_path = path @@ -49,6 +51,19 @@ class DocumentParser: """ raise NotImplementedError() + def optimise_thumbnail(self, in_path): + + out_path = 
os.path.join(self.tempdir, "optipng.png") + + args = (self.OPTIPNG, "-o5", in_path, "-out", out_path) + if not subprocess.Popen(args).wait() == 0: + raise ParseError("Optipng failed at {}".format(args)) + + return out_path + + def get_optimised_thumbnail(self): + return self.optimise_thumbnail(self.get_thumbnail()) + def get_text(self): """ Returns the text from the document and only the text. diff --git a/src/paperless/checks.py b/src/paperless/checks.py index 666425f9c..e8c94362a 100644 --- a/src/paperless/checks.py +++ b/src/paperless/checks.py @@ -76,7 +76,12 @@ def binaries_check(app_configs, **kwargs): error = "Paperless can't find {}. Without it, consumption is impossible." hint = "Either it's not in your ${PATH} or it's not installed." - binaries = (settings.CONVERT_BINARY, settings.UNPAPER_BINARY, "tesseract") + binaries = ( + settings.CONVERT_BINARY, + settings.OPTIPNG_BINARY, + settings.UNPAPER_BINARY, + "tesseract" + ) check_messages = [] for binary in binaries: diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 4e788e56b..fb5a399a8 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -247,6 +247,9 @@ CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY") +# OptiPNG +OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng") + # Unpaper UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 8ba162b9f..dc5dbd637 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -44,15 +44,18 @@ class RasterisedDocumentParser(DocumentParser): The thumbnail of a PDF is just a 500px wide image of the first page. 
""" + out_path = os.path.join(self.tempdir, "convert.png") + + # Run convert to get a decent thumbnail run_convert( self.CONVERT, "-scale", "500x5000", "-alpha", "remove", "{}[0]".format(self.document_path), - os.path.join(self.tempdir, "convert.png") + out_path ) - return os.path.join(self.tempdir, "convert.png") + return out_path def _is_ocred(self): diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index afcfb013c..3ccb78404 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -32,7 +32,7 @@ class TextDocumentParser(DocumentParser): text_color = "black" # text color psize = [500, 647] # icon size n_lines = 50 # number of lines to show - output_file = os.path.join(self.tempdir, "convert-txt.png") + out_path = os.path.join(self.tempdir, "convert.png") temp_bg = os.path.join(self.tempdir, "bg.png") temp_txlayer = os.path.join(self.tempdir, "tx.png") @@ -43,9 +43,13 @@ class TextDocumentParser(DocumentParser): work_size = ",".join([str(n - 1) for n in psize]) r = str(round(psize[0] / 10)) rounded = ",".join([r, r]) - run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ', - '"fill ', bg_color, ' roundrectangle 0,0,', - work_size, ",", rounded, '" ', temp_bg) + run_command( + self.CONVERT, + "-size ", picsize, + ' xc:none -draw ', + '"fill ', bg_color, ' roundrectangle 0,0,', work_size, ",", rounded, '" ', # NOQA: E501 + temp_bg + ) def read_text(): with open(self.document_path, 'r') as src: @@ -54,22 +58,29 @@ class TextDocumentParser(DocumentParser): return text.replace('"', "'") def create_txlayer(): - run_command(self.CONVERT, - "-background none", - "-fill", - text_color, - "-pointsize", "12", - "-border 4 -bordercolor none", - "-size ", txsize, - ' caption:"', read_text(), '" ', - temp_txlayer) + run_command( + self.CONVERT, + "-background none", + "-fill", + text_color, + "-pointsize", "12", + "-border 4 -bordercolor none", + "-size ", txsize, + ' caption:"', read_text(), '" ', + temp_txlayer + ) 
create_txlayer() create_bg() - run_command(self.CONVERT, temp_bg, temp_txlayer, - "-background None -layers merge ", output_file) + run_command( + self.CONVERT, + temp_bg, + temp_txlayer, + "-background None -layers merge ", + out_path + ) - return output_file + return out_path def get_text(self):