From 750ab5bf85300db7903bb24e93dbe55f60f02307 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 7 Oct 2018 14:56:38 +0100
Subject: [PATCH] Use optipng to optimise document thumbnails

---
 Dockerfile                         |  2 +-
 docs/changelog.rst                 |  8 +++++-
 paperless.conf.example             | 20 ++++++++++++++
 src/documents/consumer.py          |  2 +-
 src/documents/parsers.py           | 15 +++++++++++
 src/paperless/checks.py            |  7 ++++-
 src/paperless/settings.py          |  3 +++
 src/paperless_tesseract/parsers.py |  7 +++--
 src/paperless_text/parsers.py      | 43 +++++++++++++++++++-----------
 9 files changed, 85 insertions(+), 22 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 968d67da5..55d54cc01 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,7 +14,7 @@ ENV PAPERLESS_EXPORT_DIR=/export \
 
 
 RUN apk update --no-cache && apk add python3 gnupg libmagic bash shadow curl \
-        sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \
+        sudo poppler tesseract-ocr imagemagick ghostscript unpaper optipng && \
     apk add --virtual .build-dependencies \
         python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
 # Install python dependencies
diff --git a/docs/changelog.rst b/docs/changelog.rst
index aefe65c25..5e548301c 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,9 +1,14 @@
 Changelog
 #########
 
-2.4.1
+2.5.0
 =====
 
+* **New dependency**: Paperless now optimises document thumbnails with
+  `optipng`_, so you'll need to install that somewhere in your PATH or declare
+  its location in ``PAPERLESS_OPTIPNG_BINARY``.  The Docker image has already
+  been updated on the Docker Hub, so you just need to pull the latest one from
+  there if you're a Docker user.
 * An annoying bug in the date capture code was causing some bogus dates to be
   attached to documents, which in turn busted the UI.  Thanks to `Andrew Peng`_
   for reporting this. `#414`_.
@@ -632,3 +637,4 @@ bulk of the work on this big change.
 
 .. _pipenv: https://docs.pipenv.org/
 .. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/
+.. _optipng: http://optipng.sourceforge.net/
diff --git a/paperless.conf.example b/paperless.conf.example
index 05cf81724..3604505cb 100644
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -213,3 +213,23 @@ PAPERLESS_DEBUG="false"
 # The number of years for which a correspondent will be included in the recent
 # correspondents filter.
 #PAPERLESS_RECENT_CORRESPONDENT_YEARS=1
+
+###############################################################################
+####                     Third-Party Binaries                              ####
+###############################################################################
+
+# There are a few external software packages that Paperless expects to find on
+# your system when it starts up.  Unless you've done something creative with
+# their installation, you probably won't need to edit any of these.  However,
+# if you've installed these programs somewhere where simply typing the name of
+# the program doesn't automatically execute it (i.e. the program isn't in your
+# $PATH), then you'll need to specify the literal path for that program here.
+
+# Convert (part of the ImageMagick suite)
+#PAPERLESS_CONVERT_BINARY=/usr/bin/convert
+
+# Unpaper
+#PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper
+
+# Optipng (for optimising thumbnail sizes)
+#PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 7dd94ebf1..3cb484b2a 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -149,7 +149,7 @@ class Consumer:
         parsed_document = parser_class(doc)
 
         try:
-            thumbnail = parsed_document.get_thumbnail()
+            thumbnail = parsed_document.get_optimised_thumbnail()
             date = parsed_document.get_date()
             document = self._store(
                 parsed_document.get_text(),
diff --git a/src/documents/parsers.py b/src/documents/parsers.py
index 29128eaad..1f60b1479 100644
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -2,6 +2,7 @@ import logging
 import os
 import re
 import shutil
+import subprocess
 import tempfile
 
 import dateparser
@@ -36,6 +37,7 @@ class DocumentParser:
 
     SCRATCH = settings.SCRATCH_DIR
     DATE_ORDER = settings.DATE_ORDER
+    OPTIPNG = settings.OPTIPNG_BINARY
 
     def __init__(self, path):
         self.document_path = path
@@ -49,6 +51,19 @@ class DocumentParser:
         """
         raise NotImplementedError()
 
+    def optimise_thumbnail(self, in_path):
+        """Run optipng over in_path; return the path of the optimised copy."""
+        out_path = os.path.join(self.tempdir, "optipng.png")
+        args = (self.OPTIPNG, "-o5", in_path, "-out", out_path)
+        # A non-zero optipng exit status is surfaced as a ParseError.
+        if not subprocess.Popen(args).wait() == 0:
+            raise ParseError("Optipng failed at {}".format(args))
+        return out_path
+
+    def get_optimised_thumbnail(self):
+        """Return the path of get_thumbnail()'s output after optipng."""
+        return self.optimise_thumbnail(self.get_thumbnail())
+
     def get_text(self):
         """
         Returns the text from the document and only the text.
diff --git a/src/paperless/checks.py b/src/paperless/checks.py
index 666425f9c..e8c94362a 100644
--- a/src/paperless/checks.py
+++ b/src/paperless/checks.py
@@ -76,7 +76,12 @@ def binaries_check(app_configs, **kwargs):
     error = "Paperless can't find {}. Without it, consumption is impossible."
     hint = "Either it's not in your ${PATH} or it's not installed."
 
-    binaries = (settings.CONVERT_BINARY, settings.UNPAPER_BINARY, "tesseract")
+    binaries = (
+        settings.CONVERT_BINARY,
+        settings.OPTIPNG_BINARY,
+        settings.UNPAPER_BINARY,
+        "tesseract"
+    )
 
     check_messages = []
     for binary in binaries:
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 4e788e56b..fb5a399a8 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -247,6 +247,9 @@ CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
 CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
 CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY")
 
+# OptiPNG
+OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
+
 # Unpaper
 UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
 
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index 8ba162b9f..dc5dbd637 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -44,15 +44,18 @@ class RasterisedDocumentParser(DocumentParser):
         The thumbnail of a PDF is just a 500px wide image of the first page.
         """
 
+        out_path = os.path.join(self.tempdir, "convert.png")
+
+        # Run convert to get a decent thumbnail
         run_convert(
             self.CONVERT,
             "-scale", "500x5000",
             "-alpha", "remove",
             "{}[0]".format(self.document_path),
-            os.path.join(self.tempdir, "convert.png")
+            out_path
         )
 
-        return os.path.join(self.tempdir, "convert.png")
+        return out_path
 
     def _is_ocred(self):
 
diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py
index afcfb013c..3ccb78404 100644
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -32,7 +32,7 @@ class TextDocumentParser(DocumentParser):
         text_color = "black"  # text color
         psize = [500, 647]  # icon size
         n_lines = 50  # number of lines to show
-        output_file = os.path.join(self.tempdir, "convert-txt.png")
+        out_path = os.path.join(self.tempdir, "convert.png")
 
         temp_bg = os.path.join(self.tempdir, "bg.png")
         temp_txlayer = os.path.join(self.tempdir, "tx.png")
@@ -43,9 +43,13 @@ class TextDocumentParser(DocumentParser):
             work_size = ",".join([str(n - 1) for n in psize])
             r = str(round(psize[0] / 10))
             rounded = ",".join([r, r])
-            run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ',
-                        '"fill ', bg_color, ' roundrectangle 0,0,',
-                        work_size, ",", rounded, '" ', temp_bg)
+            run_command(
+                self.CONVERT,
+                "-size ", picsize,
+                ' xc:none -draw ',
+                '"fill ', bg_color, ' roundrectangle 0,0,', work_size, ",", rounded, '" ',  # NOQA: E501
+                temp_bg
+            )
 
         def read_text():
             with open(self.document_path, 'r') as src:
@@ -54,22 +58,29 @@ class TextDocumentParser(DocumentParser):
                 return text.replace('"', "'")
 
         def create_txlayer():
-            run_command(self.CONVERT,
-                        "-background none",
-                        "-fill",
-                        text_color,
-                        "-pointsize", "12",
-                        "-border 4 -bordercolor none",
-                        "-size ", txsize,
-                        ' caption:"', read_text(), '" ',
-                        temp_txlayer)
+            run_command(
+                self.CONVERT,
+                "-background none",
+                "-fill",
+                text_color,
+                "-pointsize", "12",
+                "-border 4 -bordercolor none",
+                "-size ", txsize,
+                ' caption:"', read_text(), '" ',
+                temp_txlayer
+            )
 
         create_txlayer()
         create_bg()
-        run_command(self.CONVERT, temp_bg, temp_txlayer,
-                    "-background None -layers merge ", output_file)
+        run_command(
+            self.CONVERT,
+            temp_bg,
+            temp_txlayer,
+            "-background None -layers merge ",
+            out_path
+        )
 
-        return output_file
+        return out_path
 
     def get_text(self):