diff --git a/.github/workflows/reusable-ci-backend.yml b/.github/workflows/reusable-ci-backend.yml index 977011b2c..e872e8696 100644 --- a/.github/workflows/reusable-ci-backend.yml +++ b/.github/workflows/reusable-ci-backend.yml @@ -74,7 +74,7 @@ jobs: name: Install system dependencies run: | sudo apt-get update -qq - sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript optipng libzbar0 poppler-utils + sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript libzbar0 poppler-utils - name: Install Python dependencies run: | diff --git a/Dockerfile b/Dockerfile index 5338d8aa4..630cd367c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -77,15 +77,12 @@ ARG RUNTIME_PACKAGES="\ libraqm0 \ libgnutls30 \ libjpeg62-turbo \ - optipng \ python3 \ python3-pip \ python3-setuptools \ postgresql-client \ # For Numpy libatlas3-base \ - # thumbnail size reduction - pngquant \ # OCRmyPDF dependencies tesseract-ocr \ tesseract-ocr-eng \ diff --git a/docker/install_management_commands.sh b/docker/install_management_commands.sh index bf8bbeb93..e5c8b30a0 100755 --- a/docker/install_management_commands.sh +++ b/docker/install_management_commands.sh @@ -2,7 +2,18 @@ set -eu -for command in document_archiver document_exporter document_importer mail_fetcher document_create_classifier document_index document_renamer document_retagger document_thumbnails document_sanity_checker manage_superuser; +for command in decrypt_documents \ + document_archiver \ + document_exporter \ + document_importer \ + mail_fetcher \ + document_create_classifier \ + document_index \ + document_renamer \ + document_retagger \ + document_thumbnails \ + document_sanity_checker \ + manage_superuser; do echo "installing $command..." 
sed "s/management_command/$command/g" management_script.sh > /usr/local/bin/$command diff --git a/docs/configuration.rst b/docs/configuration.rst index b7ab978f4..a5db55927 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -712,13 +712,6 @@ PAPERLESS_CONVERT_TMPDIR= Default is none, which disables the temporary directory. -PAPERLESS_OPTIMIZE_THUMBNAILS= - Use optipng to optimize thumbnails. This usually reduces the size of - thumbnails by about 20%, but uses considerable compute time during - consumption. - - Defaults to true. - PAPERLESS_POST_CONSUME_SCRIPT= After a document is consumed, Paperless can trigger an arbitrary script if you like. This script will be passed a number of arguments for you to work @@ -789,9 +782,6 @@ PAPERLESS_CONVERT_BINARY= PAPERLESS_GS_BINARY= Defaults to "/usr/bin/gs". -PAPERLESS_OPTIPNG_BINARY= - Defaults to "/usr/bin/optipng". - .. _configuration-docker: diff --git a/docs/setup.rst b/docs/setup.rst index 90b952e4c..b8d3ab8a3 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -286,7 +286,6 @@ writing. Windows is not and will never be supported. * ``fonts-liberation`` for generating thumbnails for plain text files * ``imagemagick`` >= 6 for PDF conversion - * ``optipng`` for optimizing thumbnails * ``gnupg`` for handling encrypted documents * ``libpq-dev`` for PostgreSQL * ``libmagic-dev`` for mime type detection @@ -298,7 +297,7 @@ writing. Windows is not and will never be supported. .. code:: - python3 python3-pip python3-dev imagemagick fonts-liberation optipng gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils + python3 python3-pip python3-dev imagemagick fonts-liberation gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils These dependencies are required for OCRmyPDF, which is used for text recognition. @@ -730,8 +729,6 @@ configuring some options in paperless can help improve performance immensely: * If you want to perform OCR on the device, consider using ``PAPERLESS_OCR_CLEAN=none``. 
This will speed up OCR times and use less memory at the expense of slightly worse OCR results. -* Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption - times. Thumbnails will be about 20% larger. * If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to 1. This will save some memory. diff --git a/paperless.conf.example b/paperless.conf.example index 97e907e1f..bb2449e05 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -65,7 +65,6 @@ #PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false #PAPERLESS_CONSUMER_ENABLE_BARCODES=false #PAPERLESS_CONSUMER_ENABLE_BARCODES=PATCHT -#PAPERLESS_OPTIMIZE_THUMBNAILS=true #PAPERLESS_PRE_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh #PAPERLESS_FILENAME_DATE_ORDER=YMD @@ -84,4 +83,3 @@ #PAPERLESS_CONVERT_BINARY=/usr/bin/convert #PAPERLESS_GS_BINARY=/usr/bin/gs -#PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng diff --git a/src/documents/checks.py b/src/documents/checks.py index 4ac49a2c2..a014a0ac2 100644 --- a/src/documents/checks.py +++ b/src/documents/checks.py @@ -11,7 +11,6 @@ from documents.signals import document_consumer_declaration @register() def changed_password_check(app_configs, **kwargs): - from documents.models import Document from paperless.db import GnuPG diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 5e3d01fbc..e5794ce4f 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -273,7 +273,7 @@ class Consumer(LoggingMixin): self.log("debug", f"Generating thumbnail for {self.filename}...") self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL) - thumbnail = document_parser.get_optimised_thumbnail( + thumbnail = document_parser.get_thumbnail( self.path, mime_type, self.filename, diff --git a/src/documents/management/commands/document_archiver.py b/src/documents/management/commands/document_archiver.py index bf0f352b5..c51f1baeb 100644 --- 
a/src/documents/management/commands/document_archiver.py +++ b/src/documents/management/commands/document_archiver.py @@ -41,7 +41,7 @@ def handle_document(document_id): try: parser.parse(document.source_path, mime_type, document.get_public_filename()) - thumbnail = parser.get_optimised_thumbnail( + thumbnail = parser.get_thumbnail( document.source_path, mime_type, document.get_public_filename(), diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index 4bddd51b8..526d59368 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -189,7 +189,7 @@ class Command(BaseCommand): original_target = os.path.join(self.target, original_name) document_dict[EXPORTER_FILE_NAME] = original_name - thumbnail_name = base_name + "-thumbnail.png" + thumbnail_name = base_name + "-thumbnail.webp" thumbnail_target = os.path.join(self.target, thumbnail_name) document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name diff --git a/src/documents/management/commands/document_thumbnails.py b/src/documents/management/commands/document_thumbnails.py index c9928c7cc..b56bc0042 100644 --- a/src/documents/management/commands/document_thumbnails.py +++ b/src/documents/management/commands/document_thumbnails.py @@ -11,7 +11,7 @@ from ...parsers import get_parser_class_for_mime_type def _process_document(doc_in): - document = Document.objects.get(id=doc_in) + document: Document = Document.objects.get(id=doc_in) parser_class = get_parser_class_for_mime_type(document.mime_type) if parser_class: @@ -21,7 +21,8 @@ def _process_document(doc_in): return try: - thumb = parser.get_optimised_thumbnail( + + thumb = parser.get_thumbnail( document.source_path, document.mime_type, document.get_public_filename(), @@ -69,7 +70,7 @@ class Command(BaseCommand): ids = [doc.id for doc in documents] # Note to future self: this prevents django from reusing database - # 
def _do_convert(work_package):
    """
    Convert one PNG thumbnail to WebP and swap it into the thumbnail directory.

    ``work_package`` is an ``(existing_thumbnail, converted_thumbnail)`` pair of
    ``pathlib.Path`` objects: the PNG currently in the thumbnail directory and
    the WebP file to be produced in a scratch directory.

    Conversion errors are logged instead of raised, so that a single bad
    thumbnail does not abort the whole ``pool.map`` run.
    """
    existing_thumbnail, converted_thumbnail = work_package
    try:

        logger.info(f"Converting thumbnail: {existing_thumbnail}")

        # Run actual conversion.
        # NOTE(review): the "[0]" suffix is presumably the converter's
        # "first frame only" selector - confirm against run_convert.
        run_convert(
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            auto_orient=True,
            input_file=f"{existing_thumbnail}[0]",
            output_file=str(converted_thumbnail),
        )

        # Copy newly created thumbnail to thumbnail directory
        shutil.copy(converted_thumbnail, existing_thumbnail.parent)

        # Remove the PNG version only once the WebP copy is in place
        existing_thumbnail.unlink()

        logger.info(
            "Conversion to WebP completed, "
            f"replaced {existing_thumbnail.name} with {converted_thumbnail.name}",
        )

    except Exception as e:
        logger.error(f"Error converting thumbnail (existing file unchanged): {e}")


def _build_work_packages(thumbnail_dir, tempdir):
    """
    Pair every ``*.png`` file in ``thumbnail_dir`` with its WebP output path.

    Returns a list of ``(existing_png, converted_webp)`` tuples of resolved
    ``pathlib.Path`` objects, sorted by PNG name so the work order is
    deterministic. The WebP path lives in ``tempdir`` and keeps the PNG's stem.
    Files that do not end in ``.png`` (for example ``*.png.gpg``) are ignored.
    """
    return [
        (
            png.resolve(),
            (Path(tempdir) / png.resolve().with_suffix(".webp").name).resolve(),
        )
        for png in sorted(Path(thumbnail_dir).glob("*.png"))
    ]


def _convert_thumbnails_to_webp(apps, schema_editor):
    """
    Data migration: convert all PNG thumbnails in THUMBNAIL_DIR to WebP.

    ``apps`` and ``schema_editor`` are supplied by ``migrations.RunPython`` and
    are not used; only files on disk are touched. The conversions are spread
    over a pool of at most four worker processes.
    """
    # monotonic() cannot jump backwards, unlike the wall clock
    start = time.monotonic()
    thumbnail_dir = Path(settings.THUMBNAIL_DIR)

    # Encrypted thumbnails do not match "*.png" and are therefore not
    # converted here. Say so explicitly instead of skipping them silently.
    encrypted_count = sum(1 for _ in thumbnail_dir.glob("*.png.gpg"))
    if encrypted_count:
        logger.warning(
            f"Found {encrypted_count} encrypted thumbnail(s) (*.png.gpg), "
            "these are not converted by this migration",
        )

    with tempfile.TemporaryDirectory() as tempdir:

        work_packages = _build_work_packages(thumbnail_dir, tempdir)

        if not work_packages:
            # Nothing to convert, e.g. a fresh install or a re-run
            return

        logger.info(
            "\n\n"
            " This is a one-time only migration to convert thumbnails for all of your\n"
            " documents into WebP format. If you have a lot of documents though, \n"
            " this may take a while, so a coffee break may be in order."
            "\n",
        )

        with multiprocessing.pool.Pool(
            processes=min(multiprocessing.cpu_count(), 4),
            maxtasksperchild=4,
        ) as pool:
            pool.map(_do_convert, work_packages)

        duration = time.monotonic() - start

        logger.info(f"Conversion completed in {duration:.3f}s")
self.archive_filename is not None @property - def archive_path(self): + def archive_path(self) -> Optional[str]: if self.has_archive_version: return os.path.join(settings.ARCHIVE_DIR, str(self.archive_filename)) else: @@ -271,7 +272,7 @@ class Document(models.Model): def archive_file(self): return open(self.archive_path, "rb") - def get_public_filename(self, archive=False, counter=0, suffix=None): + def get_public_filename(self, archive=False, counter=0, suffix=None) -> str: result = str(self) if counter: @@ -292,12 +293,14 @@ class Document(models.Model): return get_default_file_extension(self.mime_type) @property - def thumbnail_path(self): - file_name = f"{self.pk:07}.png" + def thumbnail_path(self) -> str: + webp_file_name = f"{self.pk:07}.webp" if self.storage_type == self.STORAGE_TYPE_GPG: - file_name += ".gpg" + webp_file_name += ".gpg" - return os.path.join(settings.THUMBNAIL_DIR, file_name) + webp_file_path = os.path.join(settings.THUMBNAIL_DIR, webp_file_name) + + return os.path.normpath(webp_file_path) @property def thumbnail_file(self): diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 469ec2f1e..721346fb0 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -150,11 +150,14 @@ def run_convert( def get_default_thumbnail() -> str: + """ + Returns the path to a generic thumbnail + """ return os.path.join(os.path.dirname(__file__), "resources", "document.png") def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str: - out_path = os.path.join(temp_dir, "convert_gs.png") + out_path = os.path.join(temp_dir, "convert_gs.webp") # if convert fails, fall back to extracting # the first PDF page as a PNG using Ghostscript @@ -191,7 +194,7 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str: """ The thumbnail of a PDF is just a 500px wide image of the first page. 
""" - out_path = os.path.join(temp_dir, "convert.png") + out_path = os.path.join(temp_dir, "convert.webp") # Run convert to get a decent thumbnail try: @@ -319,29 +322,6 @@ class DocumentParser(LoggingMixin): """ raise NotImplementedError() - def get_optimised_thumbnail(self, document_path, mime_type, file_name=None): - thumbnail = self.get_thumbnail(document_path, mime_type, file_name) - if settings.OPTIMIZE_THUMBNAILS: - out_path = os.path.join(self.tempdir, "thumb_optipng.png") - - args = ( - settings.OPTIPNG_BINARY, - "-silent", - "-o5", - thumbnail, - "-out", - out_path, - ) - - self.log("debug", f"Execute: {' '.join(args)}") - - if not subprocess.Popen(args).wait() == 0: - raise ParseError(f"Optipng failed at {args}") - - return out_path - else: - return thumbnail - def get_text(self): return self.text diff --git a/src/documents/tests/samples/documents/thumbnails/0000001.png b/src/documents/tests/samples/documents/thumbnails/0000001.png deleted file mode 100644 index a3a768401..000000000 Binary files a/src/documents/tests/samples/documents/thumbnails/0000001.png and /dev/null differ diff --git a/src/documents/tests/samples/documents/thumbnails/0000001.webp b/src/documents/tests/samples/documents/thumbnails/0000001.webp new file mode 100644 index 000000000..a7ff623b2 Binary files /dev/null and b/src/documents/tests/samples/documents/thumbnails/0000001.webp differ diff --git a/src/documents/tests/samples/documents/thumbnails/0000002.png b/src/documents/tests/samples/documents/thumbnails/0000002.png deleted file mode 100644 index a3a768401..000000000 Binary files a/src/documents/tests/samples/documents/thumbnails/0000002.png and /dev/null differ diff --git a/src/documents/tests/samples/documents/thumbnails/0000002.webp b/src/documents/tests/samples/documents/thumbnails/0000002.webp new file mode 100644 index 000000000..a7ff623b2 Binary files /dev/null and b/src/documents/tests/samples/documents/thumbnails/0000002.webp differ diff --git 
a/src/documents/tests/samples/documents/thumbnails/0000003.png b/src/documents/tests/samples/documents/thumbnails/0000003.png deleted file mode 100644 index a3a768401..000000000 Binary files a/src/documents/tests/samples/documents/thumbnails/0000003.png and /dev/null differ diff --git a/src/documents/tests/samples/documents/thumbnails/0000003.webp b/src/documents/tests/samples/documents/thumbnails/0000003.webp new file mode 100644 index 000000000..a7ff623b2 Binary files /dev/null and b/src/documents/tests/samples/documents/thumbnails/0000003.webp differ diff --git a/src/documents/tests/samples/documents/thumbnails/0000004.png.gpg b/src/documents/tests/samples/documents/thumbnails/0000004.png.gpg deleted file mode 100644 index 8a61a9126..000000000 Binary files a/src/documents/tests/samples/documents/thumbnails/0000004.png.gpg and /dev/null differ diff --git a/src/documents/tests/samples/documents/thumbnails/0000004.webp.gpg b/src/documents/tests/samples/documents/thumbnails/0000004.webp.gpg new file mode 100644 index 000000000..3abc69d36 Binary files /dev/null and b/src/documents/tests/samples/documents/thumbnails/0000004.webp.gpg differ diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index 24bdc3a50..6d659e66d 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -176,7 +176,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): ) with open( - os.path.join(self.dirs.thumbnail_dir, f"{doc.pk:07d}.png"), + os.path.join(self.dirs.thumbnail_dir, f"{doc.pk:07d}.webp"), "wb", ) as f: f.write(content_thumbnail) @@ -1022,7 +1022,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): "samples", "documents", "thumbnails", - "0000001.png", + "0000001.webp", ) archive_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 637c0d95e..48f195903 100644 --- a/src/documents/tests/test_consumer.py +++ 
b/src/documents/tests/test_consumer.py @@ -180,10 +180,10 @@ class DummyParser(DocumentParser): def __init__(self, logging_group, scratch_dir, archive_path): super().__init__(logging_group, None) - _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) + _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) self.archive_path = archive_path - def get_optimised_thumbnail(self, document_path, mime_type, file_name=None): + def get_thumbnail(self, document_path, mime_type, file_name=None): return self.fake_thumb def parse(self, document_path, mime_type, file_name=None): @@ -194,12 +194,12 @@ class CopyParser(DocumentParser): def get_thumbnail(self, document_path, mime_type, file_name=None): return self.fake_thumb - def get_optimised_thumbnail(self, document_path, mime_type, file_name=None): + def get_thumbnail(self, document_path, mime_type, file_name=None): return self.fake_thumb def __init__(self, logging_group, progress_callback=None): super().__init__(logging_group, progress_callback) - _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=self.tempdir) + _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=self.tempdir) def parse(self, document_path, mime_type, file_name=None): self.text = "The text" @@ -214,9 +214,9 @@ class FaultyParser(DocumentParser): def __init__(self, logging_group, scratch_dir): super().__init__(logging_group) - _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) + _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) - def get_optimised_thumbnail(self, document_path, mime_type, file_name=None): + def get_thumbnail(self, document_path, mime_type, file_name=None): return self.fake_thumb def parse(self, document_path, mime_type, file_name=None): @@ -230,6 +230,8 @@ def fake_magic_from_file(file, mime=False): return "application/pdf" elif os.path.splitext(file)[1] == ".png": return "image/png" + elif os.path.splitext(file)[1] == ".webp": + return "image/webp" else: 
# Class decorators are applied bottom-up, so every test method receives the
# run_convert mock first and the Pool.map mock second.
@mock.patch(
    "documents.migrations.1021_webp_thumbnail_conversion.multiprocessing.pool.Pool.map",
)
@mock.patch("documents.migrations.1021_webp_thumbnail_conversion.run_convert")
class TestMigrateWebPThumbnails(TestMigrations):
    """
    Tests for the migration which converts PNG thumbnails to WebP.

    The real converter and the process pool are both mocked, so the tests
    only exercise the file handling of the migration.
    """

    migrate_from = "1020_merge_20220518_1839"
    migrate_to = "1021_webp_thumbnail_conversion"
    auto_migrate = False

    def pretend_convert_output(self, *args, **kwargs):
        """
        Pretends to do the conversion, by copying the input file
        to the output file
        """
        input_file = kwargs["input_file"]
        # Remove exactly the "[0]" selector the migration appends.
        # str.rstrip("[0]") would strip a *set* of characters and so would
        # also eat a trailing "0" or bracket belonging to the file name.
        if input_file.endswith("[0]"):
            input_file = input_file[: -len("[0]")]
        shutil.copy2(
            Path(input_file),
            Path(kwargs["output_file"]),
        )

    def pretend_map(self, func: Callable, iterable: Iterable):
        """
        Pretends to be the map of a multiprocessing.Pool, but secretly does
        everything in series
        """
        for item in iterable:
            func(item)

    def create_dummy_thumbnails(
        self,
        thumb_dir: Path,
        ext: str,
        count: int,
        start_count: int = 0,
    ):
        """
        Helper to create a certain count of empty files of given extension in a
        given directory. Files are named by zero-padded index, starting at
        start_count.
        """
        for idx in range(count):
            (Path(thumb_dir) / Path(f"{start_count + idx:07}.{ext}")).touch()
        # Check the expected files exist before the test relies on them
        self.assert_file_count_by_extension(ext, thumb_dir, count)

    def create_webp_thumbnail_files(
        self,
        thumb_dir: Path,
        count: int,
        start_count: int = 0,
    ):
        """
        Creates count empty WebP thumbnail files in the given directory
        """
        self.create_dummy_thumbnails(thumb_dir, "webp", count, start_count)

    def create_png_thumbnail_file(
        self,
        thumb_dir: Path,
        count: int,
        start_count: int = 0,
    ):
        """
        Creates count empty PNG thumbnail files in the given directory
        """
        self.create_dummy_thumbnails(thumb_dir, "png", count, start_count)

    def assert_file_count_by_extension(
        self,
        ext: str,
        dir: Union[str, Path],
        expected_count: int,
    ):
        """
        Helper to assert a certain count of given extension files in given directory
        """
        # Path() accepts both str and Path, so no isinstance check is needed
        matching_files = list(Path(dir).glob(f"*.{ext}"))
        self.assertEqual(len(matching_files), expected_count)

    def assert_png_file_count(self, dir: Path, expected_count: int):
        """
        Helper to assert a certain count of PNG extension files in given directory
        """
        self.assert_file_count_by_extension("png", dir, expected_count)

    def assert_webp_file_count(self, dir: Path, expected_count: int):
        """
        Helper to assert a certain count of WebP extension files in given directory
        """
        self.assert_file_count_by_extension("webp", dir, expected_count)

    def setUp(self):

        self.thumbnail_dir = Path(tempfile.mkdtemp()).resolve()

        return super().setUp()

    def tearDown(self) -> None:

        shutil.rmtree(self.thumbnail_dir)

        return super().tearDown()

    def test_do_nothing_if_converted(
        self,
        run_convert_mock: mock.MagicMock,
        map_mock: mock.MagicMock,
    ):
        """
        GIVEN:
            - Only WebP thumbnails exist in the thumbnail directory
        WHEN:
            - Thumbnail conversion is attempted
        THEN:
            - Nothing is converted
        """
        map_mock.side_effect = self.pretend_map

        with override_settings(
            THUMBNAIL_DIR=self.thumbnail_dir,
        ):

            self.create_webp_thumbnail_files(self.thumbnail_dir, 3)

            self.performMigration()
            run_convert_mock.assert_not_called()

            self.assert_webp_file_count(self.thumbnail_dir, 3)

    def test_convert_single_thumbnail(
        self,
        run_convert_mock: mock.MagicMock,
        map_mock: mock.MagicMock,
    ):
        """
        GIVEN:
            - Three PNG thumbnails exist in the thumbnail directory
        WHEN:
            - Thumbnail conversion is attempted
        THEN:
            - Every PNG thumbnail is converted to WebP
        """
        map_mock.side_effect = self.pretend_map
        run_convert_mock.side_effect = self.pretend_convert_output

        with override_settings(
            THUMBNAIL_DIR=self.thumbnail_dir,
        ):
            self.create_png_thumbnail_file(self.thumbnail_dir, 3)

            self.performMigration()

            run_convert_mock.assert_called()
            self.assertEqual(run_convert_mock.call_count, 3)

            self.assert_webp_file_count(self.thumbnail_dir, 3)

    def test_convert_errors_out(
        self,
        run_convert_mock: mock.MagicMock,
        map_mock: mock.MagicMock,
    ):
        """
        GIVEN:
            - Three PNG thumbnails exist in the thumbnail directory
        WHEN:
            - Thumbnail conversion is attempted, but raises an exception
        THEN:
            - Conversion is attempted for every thumbnail
            - The PNG thumbnails are left in place and no WebP file is created
        """
        map_mock.side_effect = self.pretend_map
        run_convert_mock.side_effect = OSError

        with override_settings(
            THUMBNAIL_DIR=self.thumbnail_dir,
        ):

            self.create_png_thumbnail_file(self.thumbnail_dir, 3)

            self.performMigration()

            run_convert_mock.assert_called()
            self.assertEqual(run_convert_mock.call_count, 3)

            self.assert_png_file_count(self.thumbnail_dir, 3)
            self.assert_webp_file_count(self.thumbnail_dir, 0)

    def test_convert_mixed(
        self,
        run_convert_mock: mock.MagicMock,
        map_mock: mock.MagicMock,
    ):
        """
        GIVEN:
            - Three PNG thumbnails and two WebP thumbnails exist
        WHEN:
            - Thumbnail conversion is attempted
        THEN:
            - Only the PNG thumbnails are converted
            - All five thumbnails end up as WebP
        """
        map_mock.side_effect = self.pretend_map
        run_convert_mock.side_effect = self.pretend_convert_output

        with override_settings(
            THUMBNAIL_DIR=self.thumbnail_dir,
        ):

            self.create_png_thumbnail_file(self.thumbnail_dir, 3)
            self.create_webp_thumbnail_files(self.thumbnail_dir, 2, start_count=3)

            self.performMigration()

            run_convert_mock.assert_called()
            self.assertEqual(run_convert_mock.call_count, 3)

            self.assert_png_file_count(self.thumbnail_dir, 0)
            self.assert_webp_file_count(self.thumbnail_dir, 5)
@mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail) - @override_settings(OPTIMIZE_THUMBNAILS=False) - def test_get_optimised_thumb_disabled(self): - parser = DocumentParser(None) - - path = parser.get_optimised_thumbnail("any", "not important", "document.pdf") - self.assertEqual(path, fake_get_thumbnail(None, None, None, None)) - - class TestParserAvailability(TestCase): def test_file_extensions(self): diff --git a/src/documents/tests/test_sanity_check.py b/src/documents/tests/test_sanity_check.py index 5ebedd908..9bb424cbc 100644 --- a/src/documents/tests/test_sanity_check.py +++ b/src/documents/tests/test_sanity_check.py @@ -42,9 +42,9 @@ class TestSanityCheck(DirectoriesMixin, TestCase): "samples", "documents", "thumbnails", - "0000001.png", + "0000001.webp", ), - os.path.join(self.dirs.thumbnail_dir, "0000001.png"), + os.path.join(self.dirs.thumbnail_dir, "0000001.webp"), ) return Document.objects.create( diff --git a/src/documents/views.py b/src/documents/views.py index cdd38180b..b8d4075d0 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -362,7 +362,8 @@ class DocumentViewSet( handle = doc.thumbnail_file # TODO: Send ETag information and use that to send new thumbnails # if available - return HttpResponse(handle, content_type="image/png") + + return HttpResponse(handle, content_type="image/webp") except (FileNotFoundError, Document.DoesNotExist): raise Http404() diff --git a/src/paperless/checks.py b/src/paperless/checks.py index ee9b95e09..26d18b692 100644 --- a/src/paperless/checks.py +++ b/src/paperless/checks.py @@ -72,7 +72,7 @@ def binaries_check(app_configs, **kwargs): error = "Paperless can't find {}. Without it, consumption is impossible." hint = "Either it's not in your ${PATH} or it's not installed." 
- binaries = (settings.CONVERT_BINARY, settings.OPTIPNG_BINARY, "tesseract") + binaries = (settings.CONVERT_BINARY, "tesseract") check_messages = [] for binary in binaries: diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 9a5d9453d..8c8aa8482 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -526,8 +526,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean( CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT") -OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") - OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0)) # The default language that tesseract will attempt to use when parsing @@ -570,8 +568,6 @@ CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") -OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng") - # Pre-2.x versions of Paperless stored your documents locally with GPG # encryption, but that is no longer the default. 
This behaviour is still diff --git a/src/paperless/tests/test_checks.py b/src/paperless/tests/test_checks.py index df0cb0afd..ba45ebf79 100644 --- a/src/paperless/tests/test_checks.py +++ b/src/paperless/tests/test_checks.py @@ -13,9 +13,9 @@ class TestChecks(DirectoriesMixin, TestCase): def test_binaries(self): self.assertEqual(binaries_check(None), []) - @override_settings(CONVERT_BINARY="uuuhh", OPTIPNG_BINARY="forgot") + @override_settings(CONVERT_BINARY="uuuhh") def test_binaries_fail(self): - self.assertEqual(len(binaries_check(None)), 2) + self.assertEqual(len(binaries_check(None)), 1) def test_paths_check(self): self.assertEqual(paths_check(None), []) diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index fe7e823b3..4889c54df 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -30,8 +30,8 @@ class TextDocumentParser(DocumentParser): ) draw.text((5, 5), read_text(), font=font, fill="black") - out_path = os.path.join(self.tempdir, "thumb.png") - img.save(out_path) + out_path = os.path.join(self.tempdir, "thumb.webp") + img.save(out_path, format="WEBP") return out_path