diff --git a/.github/workflows/reusable-ci-backend.yml b/.github/workflows/reusable-ci-backend.yml index 977011b2c..e872e8696 100644 --- a/.github/workflows/reusable-ci-backend.yml +++ b/.github/workflows/reusable-ci-backend.yml @@ -74,7 +74,7 @@ jobs: name: Install system dependencies run: | sudo apt-get update -qq - sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript optipng libzbar0 poppler-utils + sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript libzbar0 poppler-utils - name: Install Python dependencies run: | diff --git a/Dockerfile b/Dockerfile index 5338d8aa4..fda47998c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -77,7 +77,6 @@ ARG RUNTIME_PACKAGES="\ libraqm0 \ libgnutls30 \ libjpeg62-turbo \ - optipng \ python3 \ python3-pip \ python3-setuptools \ diff --git a/docs/configuration.rst b/docs/configuration.rst index b7ab978f4..a5db55927 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -712,13 +712,6 @@ PAPERLESS_CONVERT_TMPDIR= Default is none, which disables the temporary directory. -PAPERLESS_OPTIMIZE_THUMBNAILS= - Use optipng to optimize thumbnails. This usually reduces the size of - thumbnails by about 20%, but uses considerable compute time during - consumption. - - Defaults to true. - PAPERLESS_POST_CONSUME_SCRIPT= After a document is consumed, Paperless can trigger an arbitrary script if you like. This script will be passed a number of arguments for you to work @@ -789,9 +782,6 @@ PAPERLESS_CONVERT_BINARY= PAPERLESS_GS_BINARY= Defaults to "/usr/bin/gs". -PAPERLESS_OPTIPNG_BINARY= - Defaults to "/usr/bin/optipng". - .. _configuration-docker: diff --git a/docs/setup.rst b/docs/setup.rst index 90b952e4c..b8d3ab8a3 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -286,7 +286,6 @@ writing. Windows is not and will never be supported. 
* ``fonts-liberation`` for generating thumbnails for plain text files * ``imagemagick`` >= 6 for PDF conversion - * ``optipng`` for optimizing thumbnails * ``gnupg`` for handling encrypted documents * ``libpq-dev`` for PostgreSQL * ``libmagic-dev`` for mime type detection @@ -298,7 +297,7 @@ writing. Windows is not and will never be supported. .. code:: - python3 python3-pip python3-dev imagemagick fonts-liberation optipng gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils + python3 python3-pip python3-dev imagemagick fonts-liberation gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils These dependencies are required for OCRmyPDF, which is used for text recognition. @@ -730,8 +729,6 @@ configuring some options in paperless can help improve performance immensely: * If you want to perform OCR on the device, consider using ``PAPERLESS_OCR_CLEAN=none``. This will speed up OCR times and use less memory at the expense of slightly worse OCR results. -* Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption - times. Thumbnails will be about 20% larger. * If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to 1. This will save some memory. 
def _do_convert(work_package):
    """
    Convert one PNG thumbnail to WebP and replace the PNG with the result.

    Designed to be mapped over a list of work packages by a process pool,
    so it takes a single picklable argument and reports problems through
    the module logger instead of raising.

    :param work_package: 3-tuple ``(document, existing_thumbnail,
        converted_thumbnail)``. The first element is not used here.
        ``existing_thumbnail`` is the path of the current PNG thumbnail and
        ``converted_thumbnail`` is the path the WebP file is written to
        before it is copied next to the PNG. Both paths must offer the
        ``pathlib.Path`` API (``.parent``, ``.name``, ``.unlink()``).
    :return: ``None``. Any exception is caught and logged.
    """
    _, existing_thumbnail, converted_thumbnail = work_package
    try:
        logger.info(f"Converting thumbnail: {existing_thumbnail}")

        # Run actual conversion.
        # NOTE(review): the "[0]" suffix presumably selects the first
        # frame of the input (ImageMagick index syntax) - confirm against
        # run_convert.
        run_convert(
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            auto_orient=True,
            input_file=f"{existing_thumbnail}[0]",
            output_file=str(converted_thumbnail),
        )

        # Copy first, delete second: if the conversion produced no output
        # file, shutil.copy raises and the PNG below is never removed.
        shutil.copy(converted_thumbnail, existing_thumbnail.parent)

        # Remove the PNG version
        existing_thumbnail.unlink()

        logger.info(
            "Conversion to WebP completed, "
            f"replaced {existing_thumbnail.name} with {converted_thumbnail.name}",
        )

    except Exception as e:
        # Deliberately broad: a failure for one thumbnail is logged and must
        # not propagate out of the worker. The path is part of the message
        # because output from parallel workers interleaves, so the earlier
        # "Converting thumbnail" line cannot be relied on to identify the file.
        logger.error(
            f"Error converting thumbnail {existing_thumbnail}"
            f" (existing file unchanged): {e}",
        )
), - ) - else: - # Highly unlike to reach here - self.stderr.write( - self.style.WARNING("Converted thumbnail doesn't exist"), - ) - - except Exception as e: - self.stderr.write( - self.style.ERROR( - f"Error converting thumbnail" - f" (existing file unchanged): {e}", - ), - ) + if len(work_packages): + with multiprocessing.pool.Pool(processes=4, maxtasksperchild=4) as pool: + pool.map(_do_convert, work_packages) end = time.time() duration = end - start - self.stdout.write(f"Conversion completed in {duration:.3f}s") + logger.info(f"Conversion completed in {duration:.3f}s") diff --git a/src/documents/management/commands/document_archiver.py b/src/documents/management/commands/document_archiver.py index bf0f352b5..c51f1baeb 100644 --- a/src/documents/management/commands/document_archiver.py +++ b/src/documents/management/commands/document_archiver.py @@ -41,7 +41,7 @@ def handle_document(document_id): try: parser.parse(document.source_path, mime_type, document.get_public_filename()) - thumbnail = parser.get_optimised_thumbnail( + thumbnail = parser.get_thumbnail( document.source_path, mime_type, document.get_public_filename(), diff --git a/src/documents/management/commands/document_thumbnails.py b/src/documents/management/commands/document_thumbnails.py index 595d8ba3b..535a0f670 100644 --- a/src/documents/management/commands/document_thumbnails.py +++ b/src/documents/management/commands/document_thumbnails.py @@ -29,7 +29,7 @@ def _process_document(doc_in): if existing_thumbnail.exists() and existing_thumbnail.suffix == ".png": existing_thumbnail.unlink() - thumb = parser.get_optimised_thumbnail( + thumb = parser.get_thumbnail( document.source_path, document.mime_type, document.get_public_filename(), diff --git a/src/documents/models.py b/src/documents/models.py index 221086ca2..9fed321c3 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -308,17 +308,11 @@ class Document(models.Model): png_file_path = os.path.join(settings.THUMBNAIL_DIR, 
png_file_name) # 1. Assume the thumbnail is WebP - if not os.path.exists(webp_file_path): - # 2. If WebP doesn't exist, check PNG - if not os.path.exists(png_file_path): - # 3. If PNG doesn't exist, filename is being constructed, return WebP - thumb = webp_file_path - else: - # 2.1 - PNG file exists, return path to it - thumb = png_file_path + if os.path.exists(png_file_path): + thumb = png_file_path else: - # 1.1 - WebP file exists, return path to it thumb = webp_file_path + return os.path.normpath(thumb) @property diff --git a/src/documents/parsers.py b/src/documents/parsers.py index bc8af0ec8..721346fb0 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -150,11 +150,14 @@ def run_convert( def get_default_thumbnail() -> str: + """ + Returns the path to a generic thumbnail + """ return os.path.join(os.path.dirname(__file__), "resources", "document.png") def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str: - out_path = os.path.join(temp_dir, "convert_gs.png") + out_path = os.path.join(temp_dir, "convert_gs.webp") # if convert fails, fall back to extracting # the first PDF page as a PNG using Ghostscript @@ -319,29 +322,6 @@ class DocumentParser(LoggingMixin): """ raise NotImplementedError() - def get_optimised_thumbnail(self, document_path, mime_type, file_name=None): - thumbnail = self.get_thumbnail(document_path, mime_type, file_name) - if settings.OPTIMIZE_THUMBNAILS and os.path.splitext(thumbnail)[1] == ".png": - out_path = os.path.join(self.tempdir, "thumb_optipng.png") - - args = ( - settings.OPTIPNG_BINARY, - "-silent", - "-o5", - thumbnail, - "-out", - out_path, - ) - - self.log("debug", f"Execute: {' '.join(args)}") - - if not subprocess.Popen(args).wait() == 0: - raise ParseError(f"Optipng failed at {args}") - - return out_path - else: - return thumbnail - def get_text(self): return self.text diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 
def get_thumbnail(self, document_path, mime_type, file_name=None):
    """
    Return the pre-made fake thumbnail instead of rendering one.

    Single definition for the test parser: renaming the former
    ``get_optimised_thumbnail`` override to ``get_thumbnail`` produced a
    second, identical method of the same name directly below the first,
    which only shadowed it.

    :param document_path: Ignored; accepted to match the parser interface.
    :param mime_type: Ignored; accepted to match the parser interface.
    :param file_name: Ignored; accepted to match the parser interface.
    :return: The value of ``self.fake_thumb``.
    """
    return self.fake_thumb
there is no output WebP - THEN: - - Single thumbnail is converted - """ - - with tempfile.TemporaryDirectory() as thumbnail_dir: - - with override_settings( - THUMBNAIL_DIR=thumbnail_dir, - ): - - thumb_file = self.create_png_thumbnail_file(thumbnail_dir) - - stdout, stderr = self.call_command() - - run_convert_mock.assert_called_once() - self.assertIn(f"{thumb_file}", stdout) - self.assertNotIn("Conversion to WebP completed", stdout) - self.assertIn("Converted thumbnail doesn't exist", stderr) - - self.assertTrue(thumb_file.exists()) - self.assertFalse(thumb_file.with_suffix(".webp").exists()) diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py index 34711bca8..1942fe0dd 100644 --- a/src/documents/tests/test_parsers.py +++ b/src/documents/tests/test_parsers.py @@ -87,31 +87,6 @@ def fake_get_thumbnail(self, path, mimetype, file_name): return os.path.join(os.path.dirname(__file__), "examples", "no-text.png") -class TestBaseParser(TestCase): - def setUp(self) -> None: - - self.scratch = tempfile.mkdtemp() - override_settings(SCRATCH_DIR=self.scratch).enable() - - def tearDown(self) -> None: - shutil.rmtree(self.scratch) - - @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail) - @override_settings(OPTIMIZE_THUMBNAILS=True) - def test_get_optimised_thumbnail(self): - parser = DocumentParser(None) - - parser.get_optimised_thumbnail("any", "not important", "document.pdf") - - @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail) - @override_settings(OPTIMIZE_THUMBNAILS=False) - def test_get_optimised_thumb_disabled(self): - parser = DocumentParser(None) - - path = parser.get_optimised_thumbnail("any", "not important", "document.pdf") - self.assertEqual(path, fake_get_thumbnail(None, None, None, None)) - - class TestParserAvailability(TestCase): def test_file_extensions(self): diff --git a/src/paperless/checks.py b/src/paperless/checks.py index ee9b95e09..26d18b692 100644 --- 
a/src/paperless/checks.py +++ b/src/paperless/checks.py @@ -72,7 +72,7 @@ def binaries_check(app_configs, **kwargs): error = "Paperless can't find {}. Without it, consumption is impossible." hint = "Either it's not in your ${PATH} or it's not installed." - binaries = (settings.CONVERT_BINARY, settings.OPTIPNG_BINARY, "tesseract") + binaries = (settings.CONVERT_BINARY, "tesseract") check_messages = [] for binary in binaries: diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 9a5d9453d..8c8aa8482 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -526,8 +526,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean( CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT") -OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") - OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0)) # The default language that tesseract will attempt to use when parsing @@ -570,8 +568,6 @@ CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") -OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng") - # Pre-2.x versions of Paperless stored your documents locally with GPG # encryption, but that is no longer the default. This behaviour is still diff --git a/src/paperless/tests/test_checks.py b/src/paperless/tests/test_checks.py index df0cb0afd..ba45ebf79 100644 --- a/src/paperless/tests/test_checks.py +++ b/src/paperless/tests/test_checks.py @@ -13,9 +13,9 @@ class TestChecks(DirectoriesMixin, TestCase): def test_binaries(self): self.assertEqual(binaries_check(None), []) - @override_settings(CONVERT_BINARY="uuuhh", OPTIPNG_BINARY="forgot") + @override_settings(CONVERT_BINARY="uuuhh") def test_binaries_fail(self): - self.assertEqual(len(binaries_check(None)), 2) + self.assertEqual(len(binaries_check(None)), 1) def test_paths_check(self): self.assertEqual(paths_check(None), [])