Use optipng to optimise document thumbnails

This commit is contained in:
Daniel Quinn
2018-10-07 14:56:38 +01:00
parent 2a3f766b93
commit 750ab5bf85
9 changed files with 85 additions and 22 deletions

View File

@@ -149,7 +149,7 @@ class Consumer:
parsed_document = parser_class(doc)
try:
thumbnail = parsed_document.get_thumbnail()
thumbnail = parsed_document.get_optimised_thumbnail()
date = parsed_document.get_date()
document = self._store(
parsed_document.get_text(),

View File

@@ -2,6 +2,7 @@ import logging
import os
import re
import shutil
import subprocess
import tempfile
import dateparser
@@ -36,6 +37,7 @@ class DocumentParser:
SCRATCH = settings.SCRATCH_DIR
DATE_ORDER = settings.DATE_ORDER
OPTIPNG = settings.OPTIPNG_BINARY
def __init__(self, path):
self.document_path = path
@@ -49,6 +51,19 @@ class DocumentParser:
"""
raise NotImplementedError()
def optimise_thumbnail(self, in_path):
    """
    Run optipng over the image at ``in_path`` and return the path of the
    optimised result.

    The output is always written to ``optipng.png`` inside
    ``self.tempdir``.

    Raises ParseError if optipng cannot be started or exits with a
    non-zero status.
    """

    out_path = os.path.join(self.tempdir, "optipng.png")
    args = (self.OPTIPNG, "-o5", in_path, "-out", out_path)

    try:
        # subprocess.call() starts the process, waits for it to finish and
        # hands back its exit status.
        exit_code = subprocess.call(args)
    except OSError as e:
        # A binary that can't be launched is reported the same way as one
        # that ran and failed, so callers only have to handle ParseError.
        raise ParseError("Optipng failed at {}".format(args)) from e

    if exit_code != 0:
        raise ParseError("Optipng failed at {}".format(args))

    return out_path
def get_optimised_thumbnail(self):
    """
    Generate the thumbnail via get_thumbnail() and return whatever
    optimise_thumbnail() produces for it.
    """
    thumbnail_path = self.get_thumbnail()
    optimised_path = self.optimise_thumbnail(thumbnail_path)
    return optimised_path
def get_text(self):
"""
Returns the text from the document and only the text.

View File

@@ -76,7 +76,12 @@ def binaries_check(app_configs, **kwargs):
error = "Paperless can't find {}. Without it, consumption is impossible."
hint = "Either it's not in your ${PATH} or it's not installed."
binaries = (settings.CONVERT_BINARY, settings.UNPAPER_BINARY, "tesseract")
binaries = (
settings.CONVERT_BINARY,
settings.OPTIPNG_BINARY,
settings.UNPAPER_BINARY,
"tesseract"
)
check_messages = []
for binary in binaries:

View File

@@ -247,6 +247,9 @@ CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
# Optional convert tuning: both stay None unless set in the environment
CONVERT_MEMORY_LIMIT = os.environ.get("PAPERLESS_CONVERT_MEMORY_LIMIT")
CONVERT_DENSITY = os.environ.get("PAPERLESS_CONVERT_DENSITY")

# OptiPNG: binary name/path, overridable with PAPERLESS_OPTIPNG_BINARY
OPTIPNG_BINARY = os.environ.get("PAPERLESS_OPTIPNG_BINARY", "optipng")

# Unpaper: binary name/path, overridable with PAPERLESS_UNPAPER_BINARY
UNPAPER_BINARY = os.environ.get("PAPERLESS_UNPAPER_BINARY", "unpaper")

View File

@@ -44,15 +44,18 @@ class RasterisedDocumentParser(DocumentParser):
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
out_path = os.path.join(self.tempdir, "convert.png")
# Run convert to get a decent thumbnail
run_convert(
self.CONVERT,
"-scale", "500x5000",
"-alpha", "remove",
"{}[0]".format(self.document_path),
os.path.join(self.tempdir, "convert.png")
out_path
)
return os.path.join(self.tempdir, "convert.png")
return out_path
def _is_ocred(self):

View File

@@ -32,7 +32,7 @@ class TextDocumentParser(DocumentParser):
text_color = "black" # text color
psize = [500, 647] # icon size
n_lines = 50 # number of lines to show
output_file = os.path.join(self.tempdir, "convert-txt.png")
out_path = os.path.join(self.tempdir, "convert.png")
temp_bg = os.path.join(self.tempdir, "bg.png")
temp_txlayer = os.path.join(self.tempdir, "tx.png")
@@ -43,9 +43,13 @@ class TextDocumentParser(DocumentParser):
work_size = ",".join([str(n - 1) for n in psize])
r = str(round(psize[0] / 10))
rounded = ",".join([r, r])
run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ',
'"fill ', bg_color, ' roundrectangle 0,0,',
work_size, ",", rounded, '" ', temp_bg)
run_command(
self.CONVERT,
"-size ", picsize,
' xc:none -draw ',
'"fill ', bg_color, ' roundrectangle 0,0,', work_size, ",", rounded, '" ', # NOQA: E501
temp_bg
)
def read_text():
with open(self.document_path, 'r') as src:
@@ -54,22 +58,29 @@ class TextDocumentParser(DocumentParser):
return text.replace('"', "'")
def create_txlayer():
run_command(self.CONVERT,
"-background none",
"-fill",
text_color,
"-pointsize", "12",
"-border 4 -bordercolor none",
"-size ", txsize,
' caption:"', read_text(), '" ',
temp_txlayer)
run_command(
self.CONVERT,
"-background none",
"-fill",
text_color,
"-pointsize", "12",
"-border 4 -bordercolor none",
"-size ", txsize,
' caption:"', read_text(), '" ',
temp_txlayer
)
create_txlayer()
create_bg()
run_command(self.CONVERT, temp_bg, temp_txlayer,
"-background None -layers merge ", output_file)
run_command(
self.CONVERT,
temp_bg,
temp_txlayer,
"-background None -layers merge ",
out_path
)
return output_file
return out_path
def get_text(self):