Entirely removes optipng and updates the Ghostscript fallback to also use WebP. Updates the thumbnail conversion to use a multiprocessing pool

This commit is contained in:
Trenton Holmes 2022-06-11 08:38:49 -07:00
parent 7d9a9033f9
commit e8868d7ebf
17 changed files with 65 additions and 162 deletions

View File

@ -74,7 +74,7 @@ jobs:
name: Install system dependencies name: Install system dependencies
run: | run: |
sudo apt-get update -qq sudo apt-get update -qq
sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript optipng libzbar0 poppler-utils sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript libzbar0 poppler-utils
- -
name: Install Python dependencies name: Install Python dependencies
run: | run: |

View File

@ -77,7 +77,6 @@ ARG RUNTIME_PACKAGES="\
libraqm0 \ libraqm0 \
libgnutls30 \ libgnutls30 \
libjpeg62-turbo \ libjpeg62-turbo \
optipng \
python3 \ python3 \
python3-pip \ python3-pip \
python3-setuptools \ python3-setuptools \

View File

@ -712,13 +712,6 @@ PAPERLESS_CONVERT_TMPDIR=<path>
Default is none, which disables the temporary directory. Default is none, which disables the temporary directory.
PAPERLESS_OPTIMIZE_THUMBNAILS=<bool>
Use optipng to optimize thumbnails. This usually reduces the size of
thumbnails by about 20%, but uses considerable compute time during
consumption.
Defaults to true.
PAPERLESS_POST_CONSUME_SCRIPT=<filename> PAPERLESS_POST_CONSUME_SCRIPT=<filename>
After a document is consumed, Paperless can trigger an arbitrary script if After a document is consumed, Paperless can trigger an arbitrary script if
you like. This script will be passed a number of arguments for you to work you like. This script will be passed a number of arguments for you to work
@ -789,9 +782,6 @@ PAPERLESS_CONVERT_BINARY=<path>
PAPERLESS_GS_BINARY=<path> PAPERLESS_GS_BINARY=<path>
Defaults to "/usr/bin/gs". Defaults to "/usr/bin/gs".
PAPERLESS_OPTIPNG_BINARY=<path>
Defaults to "/usr/bin/optipng".
.. _configuration-docker: .. _configuration-docker:

View File

@ -286,7 +286,6 @@ writing. Windows is not and will never be supported.
* ``fonts-liberation`` for generating thumbnails for plain text files * ``fonts-liberation`` for generating thumbnails for plain text files
* ``imagemagick`` >= 6 for PDF conversion * ``imagemagick`` >= 6 for PDF conversion
* ``optipng`` for optimizing thumbnails
* ``gnupg`` for handling encrypted documents * ``gnupg`` for handling encrypted documents
* ``libpq-dev`` for PostgreSQL * ``libpq-dev`` for PostgreSQL
* ``libmagic-dev`` for mime type detection * ``libmagic-dev`` for mime type detection
@ -298,7 +297,7 @@ writing. Windows is not and will never be supported.
.. code:: .. code::
python3 python3-pip python3-dev imagemagick fonts-liberation optipng gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils python3 python3-pip python3-dev imagemagick fonts-liberation gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils
These dependencies are required for OCRmyPDF, which is used for text recognition. These dependencies are required for OCRmyPDF, which is used for text recognition.
@ -730,8 +729,6 @@ configuring some options in paperless can help improve performance immensely:
* If you want to perform OCR on the device, consider using ``PAPERLESS_OCR_CLEAN=none``. * If you want to perform OCR on the device, consider using ``PAPERLESS_OCR_CLEAN=none``.
This will speed up OCR times and use less memory at the expense of slightly worse This will speed up OCR times and use less memory at the expense of slightly worse
OCR results. OCR results.
* Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption
times. Thumbnails will be about 20% larger.
* If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to * If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to
1. This will save some memory. 1. This will save some memory.

View File

@ -65,7 +65,6 @@
#PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false #PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false
#PAPERLESS_CONSUMER_ENABLE_BARCODES=false #PAPERLESS_CONSUMER_ENABLE_BARCODES=false
#PAPERLESS_CONSUMER_ENABLE_BARCODES=PATCHT #PAPERLESS_CONSUMER_ENABLE_BARCODES=PATCHT
#PAPERLESS_OPTIMIZE_THUMBNAILS=true
#PAPERLESS_PRE_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh #PAPERLESS_PRE_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
#PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
#PAPERLESS_FILENAME_DATE_ORDER=YMD #PAPERLESS_FILENAME_DATE_ORDER=YMD
@ -84,4 +83,3 @@
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert #PAPERLESS_CONVERT_BINARY=/usr/bin/convert
#PAPERLESS_GS_BINARY=/usr/bin/gs #PAPERLESS_GS_BINARY=/usr/bin/gs
#PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng

View File

@ -273,7 +273,7 @@ class Consumer(LoggingMixin):
self.log("debug", f"Generating thumbnail for {self.filename}...") self.log("debug", f"Generating thumbnail for {self.filename}...")
self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL) self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)
thumbnail = document_parser.get_optimised_thumbnail( thumbnail = document_parser.get_thumbnail(
self.path, self.path,
mime_type, mime_type,
self.filename, self.filename,

View File

@ -1,4 +1,5 @@
import logging import logging
import multiprocessing.pool
import shutil import shutil
import tempfile import tempfile
import time import time
@ -8,10 +9,44 @@ from django.core.management.base import BaseCommand
from documents.models import Document from documents.models import Document
from documents.parsers import run_convert from documents.parsers import run_convert
logger = logging.getLogger("paperless.management.convert_thumbnails") logger = logging.getLogger("paperless.management.convert_thumbnails")
def _do_convert(work_package):
    """
    Converts a single PNG thumbnail to WebP and swaps it in for the original.

    ``work_package`` is unpacked into exactly three items: the first is
    ignored here, the second is the path of the existing thumbnail and the
    third is the path the converted file is written to.  Both paths are
    used as ``pathlib.Path``-like objects (``.parent``, ``.name``,
    ``.unlink()``).

    Any exception raised while converting, copying or deleting is logged
    and swallowed, so one failing thumbnail does not propagate an error to
    the caller.  Returns ``None``.
    """
    _ignored, png_path, webp_path = work_package

    # Same conversion settings for every thumbnail; only the first page /
    # frame of the input ("[0]") is rendered.
    convert_options = {
        "density": 300,
        "scale": "500x5000>",
        "alpha": "remove",
        "strip": True,
        "trim": False,
        "auto_orient": True,
        "input_file": f"{png_path}[0]",
        "output_file": str(webp_path),
    }

    try:
        logger.info(f"Converting thumbnail: {png_path}")

        # Produce the WebP file at the requested output location
        run_convert(**convert_options)

        # Place the new WebP next to the old PNG ...
        shutil.copy(webp_path, png_path.parent)

        # ... and only then drop the PNG, so a failure above leaves it intact
        png_path.unlink()

        success_message = (
            "Conversion to WebP completed, "
            f"replaced {png_path.name} with {webp_path.name}"
        )
        logger.info(success_message)
    except Exception as e:
        logger.error(f"Error converting thumbnail (existing file unchanged): {e}")
class Command(BaseCommand): class Command(BaseCommand):
help = """ help = """
@ -24,21 +59,19 @@ class Command(BaseCommand):
def handle(self, *args, **options): def handle(self, *args, **options):
self.stdout.write("Converting all PNG thumbnails to WebP") logger.info("Converting all PNG thumbnails to WebP")
start = time.time() start = time.time()
documents = Document.objects.all() documents = Document.objects.all()
with tempfile.TemporaryDirectory() as tempdir: with tempfile.TemporaryDirectory() as tempdir:
work_packages = []
for document in documents: for document in documents:
existing_thumbnail = Path(document.thumbnail_path).resolve() existing_thumbnail = Path(document.thumbnail_path).resolve()
if existing_thumbnail.suffix == ".png": if existing_thumbnail.suffix == ".png":
self.stdout.write(f"Converting thumbnail: {existing_thumbnail}")
# Change the existing filename suffix from png to webp # Change the existing filename suffix from png to webp
converted_thumbnail_name = existing_thumbnail.with_suffix( converted_thumbnail_name = existing_thumbnail.with_suffix(
".webp", ".webp",
@ -49,46 +82,16 @@ class Command(BaseCommand):
Path(tempdir) / Path(converted_thumbnail_name) Path(tempdir) / Path(converted_thumbnail_name)
).resolve() ).resolve()
try: # Package up the necessary info
# Run actual conversion work_packages.append(
run_convert( (document, existing_thumbnail, converted_thumbnail),
density=300, )
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=f"{existing_thumbnail}[0]",
output_file=str(converted_thumbnail),
)
if converted_thumbnail.exists(): if len(work_packages):
# Copy newly created thumbnail to thumbnail directory with multiprocessing.pool.Pool(processes=4, maxtasksperchild=4) as pool:
shutil.copy(converted_thumbnail, existing_thumbnail.parent) pool.map(_do_convert, work_packages)
# Remove the PNG version
existing_thumbnail.unlink()
self.stdout.write(
self.style.SUCCESS(
"Conversion to WebP completed",
),
)
else:
# Highly unlike to reach here
self.stderr.write(
self.style.WARNING("Converted thumbnail doesn't exist"),
)
except Exception as e:
self.stderr.write(
self.style.ERROR(
f"Error converting thumbnail"
f" (existing file unchanged): {e}",
),
)
end = time.time() end = time.time()
duration = end - start duration = end - start
self.stdout.write(f"Conversion completed in {duration:.3f}s") logger.info(f"Conversion completed in {duration:.3f}s")

View File

@ -41,7 +41,7 @@ def handle_document(document_id):
try: try:
parser.parse(document.source_path, mime_type, document.get_public_filename()) parser.parse(document.source_path, mime_type, document.get_public_filename())
thumbnail = parser.get_optimised_thumbnail( thumbnail = parser.get_thumbnail(
document.source_path, document.source_path,
mime_type, mime_type,
document.get_public_filename(), document.get_public_filename(),

View File

@ -29,7 +29,7 @@ def _process_document(doc_in):
if existing_thumbnail.exists() and existing_thumbnail.suffix == ".png": if existing_thumbnail.exists() and existing_thumbnail.suffix == ".png":
existing_thumbnail.unlink() existing_thumbnail.unlink()
thumb = parser.get_optimised_thumbnail( thumb = parser.get_thumbnail(
document.source_path, document.source_path,
document.mime_type, document.mime_type,
document.get_public_filename(), document.get_public_filename(),

View File

@ -308,17 +308,11 @@ class Document(models.Model):
png_file_path = os.path.join(settings.THUMBNAIL_DIR, png_file_name) png_file_path = os.path.join(settings.THUMBNAIL_DIR, png_file_name)
# 1. Assume the thumbnail is WebP # 1. Assume the thumbnail is WebP
if not os.path.exists(webp_file_path): if os.path.exists(png_file_path):
# 2. If WebP doesn't exist, check PNG thumb = png_file_path
if not os.path.exists(png_file_path):
# 3. If PNG doesn't exist, filename is being constructed, return WebP
thumb = webp_file_path
else:
# 2.1 - PNG file exists, return path to it
thumb = png_file_path
else: else:
# 1.1 - WebP file exists, return path to it
thumb = webp_file_path thumb = webp_file_path
return os.path.normpath(thumb) return os.path.normpath(thumb)
@property @property

View File

@ -150,11 +150,14 @@ def run_convert(
def get_default_thumbnail() -> str: def get_default_thumbnail() -> str:
"""
Returns the path to a generic thumbnail
"""
return os.path.join(os.path.dirname(__file__), "resources", "document.png") return os.path.join(os.path.dirname(__file__), "resources", "document.png")
def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str: def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str:
out_path = os.path.join(temp_dir, "convert_gs.png") out_path = os.path.join(temp_dir, "convert_gs.webp")
# if convert fails, fall back to extracting # if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript # the first PDF page as a PNG using Ghostscript
@ -319,29 +322,6 @@ class DocumentParser(LoggingMixin):
""" """
raise NotImplementedError() raise NotImplementedError()
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
thumbnail = self.get_thumbnail(document_path, mime_type, file_name)
if settings.OPTIMIZE_THUMBNAILS and os.path.splitext(thumbnail)[1] == ".png":
out_path = os.path.join(self.tempdir, "thumb_optipng.png")
args = (
settings.OPTIPNG_BINARY,
"-silent",
"-o5",
thumbnail,
"-out",
out_path,
)
self.log("debug", f"Execute: {' '.join(args)}")
if not subprocess.Popen(args).wait() == 0:
raise ParseError(f"Optipng failed at {args}")
return out_path
else:
return thumbnail
def get_text(self): def get_text(self):
return self.text return self.text

View File

@ -183,7 +183,7 @@ class DummyParser(DocumentParser):
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
self.archive_path = archive_path self.archive_path = archive_path
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None): def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb return self.fake_thumb
def parse(self, document_path, mime_type, file_name=None): def parse(self, document_path, mime_type, file_name=None):
@ -194,7 +194,7 @@ class CopyParser(DocumentParser):
def get_thumbnail(self, document_path, mime_type, file_name=None): def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb return self.fake_thumb
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None): def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb return self.fake_thumb
def __init__(self, logging_group, progress_callback=None): def __init__(self, logging_group, progress_callback=None):
@ -216,7 +216,7 @@ class FaultyParser(DocumentParser):
super().__init__(logging_group) super().__init__(logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None): def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb return self.fake_thumb
def parse(self, document_path, mime_type, file_name=None): def parse(self, document_path, mime_type, file_name=None):

View File

@ -137,32 +137,3 @@ class TestConvertThumbnails(TestCase):
run_convert_mock.assert_called_once() run_convert_mock.assert_called_once()
self.assertIn("Error converting thumbnail", stderr) self.assertIn("Error converting thumbnail", stderr)
self.assertTrue(thumb_file.exists()) self.assertTrue(thumb_file.exists())
@mock.patch("documents.management.commands.convert_thumbnails.run_convert")
def test_convert_single_thumbnail_no_output(self, run_convert_mock):
"""
GIVEN:
- Document exists with PNG thumbnail
WHEN:
- Thumbnail conversion is attempted, but there is no output WebP
THEN:
- Single thumbnail is converted
"""
with tempfile.TemporaryDirectory() as thumbnail_dir:
with override_settings(
THUMBNAIL_DIR=thumbnail_dir,
):
thumb_file = self.create_png_thumbnail_file(thumbnail_dir)
stdout, stderr = self.call_command()
run_convert_mock.assert_called_once()
self.assertIn(f"{thumb_file}", stdout)
self.assertNotIn("Conversion to WebP completed", stdout)
self.assertIn("Converted thumbnail doesn't exist", stderr)
self.assertTrue(thumb_file.exists())
self.assertFalse(thumb_file.with_suffix(".webp").exists())

View File

@ -87,31 +87,6 @@ def fake_get_thumbnail(self, path, mimetype, file_name):
return os.path.join(os.path.dirname(__file__), "examples", "no-text.png") return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
class TestBaseParser(TestCase):
def setUp(self) -> None:
self.scratch = tempfile.mkdtemp()
override_settings(SCRATCH_DIR=self.scratch).enable()
def tearDown(self) -> None:
shutil.rmtree(self.scratch)
@mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
@override_settings(OPTIMIZE_THUMBNAILS=True)
def test_get_optimised_thumbnail(self):
parser = DocumentParser(None)
parser.get_optimised_thumbnail("any", "not important", "document.pdf")
@mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
@override_settings(OPTIMIZE_THUMBNAILS=False)
def test_get_optimised_thumb_disabled(self):
parser = DocumentParser(None)
path = parser.get_optimised_thumbnail("any", "not important", "document.pdf")
self.assertEqual(path, fake_get_thumbnail(None, None, None, None))
class TestParserAvailability(TestCase): class TestParserAvailability(TestCase):
def test_file_extensions(self): def test_file_extensions(self):

View File

@ -72,7 +72,7 @@ def binaries_check(app_configs, **kwargs):
error = "Paperless can't find {}. Without it, consumption is impossible." error = "Paperless can't find {}. Without it, consumption is impossible."
hint = "Either it's not in your ${PATH} or it's not installed." hint = "Either it's not in your ${PATH} or it's not installed."
binaries = (settings.CONVERT_BINARY, settings.OPTIPNG_BINARY, "tesseract") binaries = (settings.CONVERT_BINARY, "tesseract")
check_messages = [] check_messages = []
for binary in binaries: for binary in binaries:

View File

@ -526,8 +526,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean(
CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT") CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0)) OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
# The default language that tesseract will attempt to use when parsing # The default language that tesseract will attempt to use when parsing
@ -570,8 +568,6 @@ CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
# Pre-2.x versions of Paperless stored your documents locally with GPG # Pre-2.x versions of Paperless stored your documents locally with GPG
# encryption, but that is no longer the default. This behaviour is still # encryption, but that is no longer the default. This behaviour is still

View File

@ -13,9 +13,9 @@ class TestChecks(DirectoriesMixin, TestCase):
def test_binaries(self): def test_binaries(self):
self.assertEqual(binaries_check(None), []) self.assertEqual(binaries_check(None), [])
@override_settings(CONVERT_BINARY="uuuhh", OPTIPNG_BINARY="forgot") @override_settings(CONVERT_BINARY="uuuhh")
def test_binaries_fail(self): def test_binaries_fail(self):
self.assertEqual(len(binaries_check(None)), 2) self.assertEqual(len(binaries_check(None)), 1)
def test_paths_check(self): def test_paths_check(self):
self.assertEqual(paths_check(None), []) self.assertEqual(paths_check(None), [])