mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Use optipng to optimise document thumbnails
This commit is contained in:
		| @@ -14,7 +14,7 @@ ENV PAPERLESS_EXPORT_DIR=/export \ | ||||
|  | ||||
|  | ||||
| RUN apk update --no-cache && apk add python3 gnupg libmagic bash shadow curl \ | ||||
|         sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \ | ||||
|         sudo poppler tesseract-ocr imagemagick ghostscript unpaper optipng && \ | ||||
|     apk add --virtual .build-dependencies \ | ||||
|         python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \ | ||||
| # Install python dependencies | ||||
|   | ||||
| @@ -1,9 +1,14 @@ | ||||
| Changelog | ||||
| ######### | ||||
|  | ||||
| 2.4.1 | ||||
| 2.5.0 | ||||
| ===== | ||||
|  | ||||
| * **New dependency**: Paperless now optimises thumbnail generation with | ||||
|   `optipng`_, so you'll need to install that somewhere in your PATH or declare | ||||
|   its location in ``PAPERLESS_OPTIPNG_BINARY``.  The Docker image has already | ||||
|   been updated on Docker Hub, so you just need to pull the latest one from | ||||
|   there if you're a Docker user. | ||||
| * An annoying bug in the date capture code was causing some bogus dates to be | ||||
|   attached to documents, which in turn busted the UI.  Thanks to `Andrew Peng`_ | ||||
|   for reporting this. `#414`_. | ||||
| @@ -632,3 +637,4 @@ bulk of the work on this big change. | ||||
|  | ||||
| .. _pipenv: https://docs.pipenv.org/ | ||||
| .. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/ | ||||
| .. _optipng: http://optipng.sourceforge.net/ | ||||
|   | ||||
| @@ -213,3 +213,23 @@ PAPERLESS_DEBUG="false" | ||||
| # The number of years for which a correspondent will be included in the recent | ||||
| # correspondents filter. | ||||
| #PAPERLESS_RECENT_CORRESPONDENT_YEARS=1 | ||||
|  | ||||
| ############################################################################### | ||||
| ####                     Third-Party Binaries                              #### | ||||
| ############################################################################### | ||||
|  | ||||
| # There are a few external software packages that Paperless expects to find on | ||||
| # your system when it starts up.  Unless you've done something creative with | ||||
| # their installation, you probably won't need to edit any of these.  However, | ||||
| # if you've installed these programs somewhere where simply typing the name of | ||||
| # the program doesn't automatically execute it (i.e. the program isn't in your | ||||
| # $PATH), then you'll need to specify the literal path for that program here. | ||||
|  | ||||
| # Convert (part of the ImageMagick suite) | ||||
| #PAPERLESS_CONVERT_BINARY=/usr/bin/convert | ||||
|  | ||||
| # Unpaper | ||||
| #PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper | ||||
|  | ||||
| # Optipng (for optimising thumbnail sizes) | ||||
| #PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng | ||||
|   | ||||
| @@ -149,7 +149,7 @@ class Consumer: | ||||
|         parsed_document = parser_class(doc) | ||||
|  | ||||
|         try: | ||||
|             thumbnail = parsed_document.get_thumbnail() | ||||
|             thumbnail = parsed_document.get_optimised_thumbnail() | ||||
|             date = parsed_document.get_date() | ||||
|             document = self._store( | ||||
|                 parsed_document.get_text(), | ||||
|   | ||||
| @@ -2,6 +2,7 @@ import logging | ||||
| import os | ||||
| import re | ||||
| import shutil | ||||
| import subprocess | ||||
| import tempfile | ||||
|  | ||||
| import dateparser | ||||
| @@ -36,6 +37,7 @@ class DocumentParser: | ||||
|  | ||||
|     SCRATCH = settings.SCRATCH_DIR | ||||
|     DATE_ORDER = settings.DATE_ORDER | ||||
|     OPTIPNG = settings.OPTIPNG_BINARY | ||||
|  | ||||
|     def __init__(self, path): | ||||
|         self.document_path = path | ||||
| @@ -49,6 +51,19 @@ class DocumentParser: | ||||
|         """ | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def optimise_thumbnail(self, in_path): | ||||
|  | ||||
|         out_path = os.path.join(self.tempdir, "optipng.png") | ||||
|  | ||||
|         args = (self.OPTIPNG, "-o5", in_path, "-out", out_path) | ||||
|         if not subprocess.Popen(args).wait() == 0: | ||||
|             raise ParseError("Optipng failed at {}".format(args)) | ||||
|  | ||||
|         return out_path | ||||
|  | ||||
|     def get_optimised_thumbnail(self): | ||||
|         return self.optimise_thumbnail(self.get_thumbnail()) | ||||
|  | ||||
|     def get_text(self): | ||||
|         """ | ||||
|         Returns the text from the document and only the text. | ||||
|   | ||||
| @@ -76,7 +76,12 @@ def binaries_check(app_configs, **kwargs): | ||||
|     error = "Paperless can't find {}. Without it, consumption is impossible." | ||||
|     hint = "Either it's not in your ${PATH} or it's not installed." | ||||
|  | ||||
|     binaries = (settings.CONVERT_BINARY, settings.UNPAPER_BINARY, "tesseract") | ||||
|     binaries = ( | ||||
|         settings.CONVERT_BINARY, | ||||
|         settings.OPTIPNG_BINARY, | ||||
|         settings.UNPAPER_BINARY, | ||||
|         "tesseract" | ||||
|     ) | ||||
|  | ||||
|     check_messages = [] | ||||
|     for binary in binaries: | ||||
|   | ||||
| @@ -247,6 +247,9 @@ CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") | ||||
| CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") | ||||
| CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY") | ||||
|  | ||||
| # OptiPNG | ||||
| OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng") | ||||
|  | ||||
| # Unpaper | ||||
| UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") | ||||
|  | ||||
|   | ||||
| @@ -44,15 +44,18 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||
|         """ | ||||
|  | ||||
|         out_path = os.path.join(self.tempdir, "convert.png") | ||||
|  | ||||
|         # Run convert to get a decent thumbnail | ||||
|         run_convert( | ||||
|             self.CONVERT, | ||||
|             "-scale", "500x5000", | ||||
|             "-alpha", "remove", | ||||
|             "{}[0]".format(self.document_path), | ||||
|             os.path.join(self.tempdir, "convert.png") | ||||
|             out_path | ||||
|         ) | ||||
|  | ||||
|         return os.path.join(self.tempdir, "convert.png") | ||||
|         return out_path | ||||
|  | ||||
|     def _is_ocred(self): | ||||
|  | ||||
|   | ||||
| @@ -32,7 +32,7 @@ class TextDocumentParser(DocumentParser): | ||||
|         text_color = "black"  # text color | ||||
|         psize = [500, 647]  # icon size | ||||
|         n_lines = 50  # number of lines to show | ||||
|         output_file = os.path.join(self.tempdir, "convert-txt.png") | ||||
|         out_path = os.path.join(self.tempdir, "convert.png") | ||||
|  | ||||
|         temp_bg = os.path.join(self.tempdir, "bg.png") | ||||
|         temp_txlayer = os.path.join(self.tempdir, "tx.png") | ||||
| @@ -43,9 +43,13 @@ class TextDocumentParser(DocumentParser): | ||||
|             work_size = ",".join([str(n - 1) for n in psize]) | ||||
|             r = str(round(psize[0] / 10)) | ||||
|             rounded = ",".join([r, r]) | ||||
|             run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ', | ||||
|                         '"fill ', bg_color, ' roundrectangle 0,0,', | ||||
|                         work_size, ",", rounded, '" ', temp_bg) | ||||
|             run_command( | ||||
|                 self.CONVERT, | ||||
|                 "-size ", picsize, | ||||
|                 ' xc:none -draw ', | ||||
|                 '"fill ', bg_color, ' roundrectangle 0,0,', work_size, ",", rounded, '" ',  # NOQA: E501 | ||||
|                 temp_bg | ||||
|             ) | ||||
|  | ||||
|         def read_text(): | ||||
|             with open(self.document_path, 'r') as src: | ||||
| @@ -54,22 +58,29 @@ class TextDocumentParser(DocumentParser): | ||||
|                 return text.replace('"', "'") | ||||
|  | ||||
|         def create_txlayer(): | ||||
|             run_command(self.CONVERT, | ||||
|                         "-background none", | ||||
|                         "-fill", | ||||
|                         text_color, | ||||
|                         "-pointsize", "12", | ||||
|                         "-border 4 -bordercolor none", | ||||
|                         "-size ", txsize, | ||||
|                         ' caption:"', read_text(), '" ', | ||||
|                         temp_txlayer) | ||||
|             run_command( | ||||
|                 self.CONVERT, | ||||
|                 "-background none", | ||||
|                 "-fill", | ||||
|                 text_color, | ||||
|                 "-pointsize", "12", | ||||
|                 "-border 4 -bordercolor none", | ||||
|                 "-size ", txsize, | ||||
|                 ' caption:"', read_text(), '" ', | ||||
|                 temp_txlayer | ||||
|             ) | ||||
|  | ||||
|         create_txlayer() | ||||
|         create_bg() | ||||
|         run_command(self.CONVERT, temp_bg, temp_txlayer, | ||||
|                     "-background None -layers merge ", output_file) | ||||
|         run_command( | ||||
|             self.CONVERT, | ||||
|             temp_bg, | ||||
|             temp_txlayer, | ||||
|             "-background None -layers merge ", | ||||
|             out_path | ||||
|         ) | ||||
|  | ||||
|         return output_file | ||||
|         return out_path | ||||
|  | ||||
|     def get_text(self): | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn