Merge pull request #1127 from paperless-ngx/feature-webp-thumbnails

Feature: Change document thumbnails to WebP
This commit is contained in:
shamoon 2022-06-12 09:44:53 -07:00 committed by GitHub
commit 72ee904e67
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
33 changed files with 398 additions and 110 deletions

View File

@ -74,7 +74,7 @@ jobs:
name: Install system dependencies name: Install system dependencies
run: | run: |
sudo apt-get update -qq sudo apt-get update -qq
sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript optipng libzbar0 poppler-utils sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript libzbar0 poppler-utils
- -
name: Install Python dependencies name: Install Python dependencies
run: | run: |

View File

@ -77,15 +77,12 @@ ARG RUNTIME_PACKAGES="\
libraqm0 \ libraqm0 \
libgnutls30 \ libgnutls30 \
libjpeg62-turbo \ libjpeg62-turbo \
optipng \
python3 \ python3 \
python3-pip \ python3-pip \
python3-setuptools \ python3-setuptools \
postgresql-client \ postgresql-client \
# For Numpy # For Numpy
libatlas3-base \ libatlas3-base \
# thumbnail size reduction
pngquant \
# OCRmyPDF dependencies # OCRmyPDF dependencies
tesseract-ocr \ tesseract-ocr \
tesseract-ocr-eng \ tesseract-ocr-eng \

View File

@ -2,7 +2,18 @@
set -eu set -eu
for command in document_archiver document_exporter document_importer mail_fetcher document_create_classifier document_index document_renamer document_retagger document_thumbnails document_sanity_checker manage_superuser; for command in decrypt_documents \
document_archiver \
document_exporter \
document_importer \
mail_fetcher \
document_create_classifier \
document_index \
document_renamer \
document_retagger \
document_thumbnails \
document_sanity_checker \
manage_superuser;
do do
echo "installing $command..." echo "installing $command..."
sed "s/management_command/$command/g" management_script.sh > /usr/local/bin/$command sed "s/management_command/$command/g" management_script.sh > /usr/local/bin/$command

View File

@ -712,13 +712,6 @@ PAPERLESS_CONVERT_TMPDIR=<path>
Default is none, which disables the temporary directory. Default is none, which disables the temporary directory.
PAPERLESS_OPTIMIZE_THUMBNAILS=<bool>
Use optipng to optimize thumbnails. This usually reduces the size of
thumbnails by about 20%, but uses considerable compute time during
consumption.
Defaults to true.
PAPERLESS_POST_CONSUME_SCRIPT=<filename> PAPERLESS_POST_CONSUME_SCRIPT=<filename>
After a document is consumed, Paperless can trigger an arbitrary script if After a document is consumed, Paperless can trigger an arbitrary script if
you like. This script will be passed a number of arguments for you to work you like. This script will be passed a number of arguments for you to work
@ -789,9 +782,6 @@ PAPERLESS_CONVERT_BINARY=<path>
PAPERLESS_GS_BINARY=<path> PAPERLESS_GS_BINARY=<path>
Defaults to "/usr/bin/gs". Defaults to "/usr/bin/gs".
PAPERLESS_OPTIPNG_BINARY=<path>
Defaults to "/usr/bin/optipng".
.. _configuration-docker: .. _configuration-docker:

View File

@ -286,7 +286,6 @@ writing. Windows is not and will never be supported.
* ``fonts-liberation`` for generating thumbnails for plain text files * ``fonts-liberation`` for generating thumbnails for plain text files
* ``imagemagick`` >= 6 for PDF conversion * ``imagemagick`` >= 6 for PDF conversion
* ``optipng`` for optimizing thumbnails
* ``gnupg`` for handling encrypted documents * ``gnupg`` for handling encrypted documents
* ``libpq-dev`` for PostgreSQL * ``libpq-dev`` for PostgreSQL
* ``libmagic-dev`` for mime type detection * ``libmagic-dev`` for mime type detection
@ -298,7 +297,7 @@ writing. Windows is not and will never be supported.
.. code:: .. code::
python3 python3-pip python3-dev imagemagick fonts-liberation optipng gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils python3 python3-pip python3-dev imagemagick fonts-liberation gnupg libpq-dev libmagic-dev mime-support libzbar0 poppler-utils
These dependencies are required for OCRmyPDF, which is used for text recognition. These dependencies are required for OCRmyPDF, which is used for text recognition.
@ -730,8 +729,6 @@ configuring some options in paperless can help improve performance immensely:
* If you want to perform OCR on the device, consider using ``PAPERLESS_OCR_CLEAN=none``. * If you want to perform OCR on the device, consider using ``PAPERLESS_OCR_CLEAN=none``.
This will speed up OCR times and use less memory at the expense of slightly worse This will speed up OCR times and use less memory at the expense of slightly worse
OCR results. OCR results.
* Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption
times. Thumbnails will be about 20% larger.
* If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to * If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to
1. This will save some memory. 1. This will save some memory.

View File

@ -65,7 +65,6 @@
#PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false #PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false
#PAPERLESS_CONSUMER_ENABLE_BARCODES=false #PAPERLESS_CONSUMER_ENABLE_BARCODES=false
#PAPERLESS_CONSUMER_ENABLE_BARCODES=PATCHT #PAPERLESS_CONSUMER_ENABLE_BARCODES=PATCHT
#PAPERLESS_OPTIMIZE_THUMBNAILS=true
#PAPERLESS_PRE_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh #PAPERLESS_PRE_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
#PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
#PAPERLESS_FILENAME_DATE_ORDER=YMD #PAPERLESS_FILENAME_DATE_ORDER=YMD
@ -84,4 +83,3 @@
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert #PAPERLESS_CONVERT_BINARY=/usr/bin/convert
#PAPERLESS_GS_BINARY=/usr/bin/gs #PAPERLESS_GS_BINARY=/usr/bin/gs
#PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng

View File

@ -11,7 +11,6 @@ from documents.signals import document_consumer_declaration
@register() @register()
def changed_password_check(app_configs, **kwargs): def changed_password_check(app_configs, **kwargs):
from documents.models import Document from documents.models import Document
from paperless.db import GnuPG from paperless.db import GnuPG

View File

@ -273,7 +273,7 @@ class Consumer(LoggingMixin):
self.log("debug", f"Generating thumbnail for {self.filename}...") self.log("debug", f"Generating thumbnail for {self.filename}...")
self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL) self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)
thumbnail = document_parser.get_optimised_thumbnail( thumbnail = document_parser.get_thumbnail(
self.path, self.path,
mime_type, mime_type,
self.filename, self.filename,

View File

@ -41,7 +41,7 @@ def handle_document(document_id):
try: try:
parser.parse(document.source_path, mime_type, document.get_public_filename()) parser.parse(document.source_path, mime_type, document.get_public_filename())
thumbnail = parser.get_optimised_thumbnail( thumbnail = parser.get_thumbnail(
document.source_path, document.source_path,
mime_type, mime_type,
document.get_public_filename(), document.get_public_filename(),

View File

@ -189,7 +189,7 @@ class Command(BaseCommand):
original_target = os.path.join(self.target, original_name) original_target = os.path.join(self.target, original_name)
document_dict[EXPORTER_FILE_NAME] = original_name document_dict[EXPORTER_FILE_NAME] = original_name
thumbnail_name = base_name + "-thumbnail.png" thumbnail_name = base_name + "-thumbnail.webp"
thumbnail_target = os.path.join(self.target, thumbnail_name) thumbnail_target = os.path.join(self.target, thumbnail_name)
document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name

View File

@ -11,7 +11,7 @@ from ...parsers import get_parser_class_for_mime_type
def _process_document(doc_in): def _process_document(doc_in):
document = Document.objects.get(id=doc_in) document: Document = Document.objects.get(id=doc_in)
parser_class = get_parser_class_for_mime_type(document.mime_type) parser_class = get_parser_class_for_mime_type(document.mime_type)
if parser_class: if parser_class:
@ -21,7 +21,8 @@ def _process_document(doc_in):
return return
try: try:
thumb = parser.get_optimised_thumbnail(
thumb = parser.get_thumbnail(
document.source_path, document.source_path,
document.mime_type, document.mime_type,
document.get_public_filename(), document.get_public_filename(),
@ -69,7 +70,7 @@ class Command(BaseCommand):
ids = [doc.id for doc in documents] ids = [doc.id for doc in documents]
# Note to future self: this prevents django from reusing database # Note to future self: this prevents django from reusing database
# conncetions between processes, which is bad and does not work # connections between processes, which is bad and does not work
# with postgres. # with postgres.
db.connections.close_all() db.connections.close_all()

View File

@ -0,0 +1,107 @@
# Generated by Django 4.0.5 on 2022-06-11 15:40
import logging
import multiprocessing.pool
import shutil
import tempfile
import time
from pathlib import Path
from django.conf import settings
from django.db import migrations
from documents.parsers import run_convert
logger = logging.getLogger("paperless.migrations")
def _do_convert(work_package):
    """
    Convert one existing PNG thumbnail to WebP and swap it into place.

    work_package is a 2-tuple (existing_thumbnail, converted_thumbnail) of
    Path objects: the PNG currently stored in the thumbnail directory and
    the temporary location the WebP output is written to.

    Any exception is logged and swallowed, so a single failing thumbnail
    does not abort the conversion of the remaining ones. The PNG is only
    removed after the WebP copy has been placed next to it.
    """
    existing_thumbnail, converted_thumbnail = work_package
    try:
        logger.info(f"Converting thumbnail: {existing_thumbnail}")

        # Run actual conversion; "[0]" selects the first frame/page of the input
        run_convert(
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            auto_orient=True,
            input_file=f"{existing_thumbnail}[0]",
            output_file=str(converted_thumbnail),
        )

        # Copy newly created thumbnail to thumbnail directory
        shutil.copy(converted_thumbnail, existing_thumbnail.parent)

        # Remove the PNG version
        existing_thumbnail.unlink()

        logger.info(
            "Conversion to WebP completed, "
            f"replaced {existing_thumbnail.name} with {converted_thumbnail.name}",
        )
    except Exception as e:
        # Name the offending file: this runs in several worker processes at
        # once, so the preceding "Converting thumbnail" line may not be adjacent.
        logger.error(
            f"Error converting thumbnail {existing_thumbnail} "
            f"(existing file unchanged): {e}",
        )
def _convert_thumbnails_to_webp(apps, schema_editor):
    """
    Convert every "*.png" file in settings.THUMBNAIL_DIR to WebP.

    The conversions are written to a temporary directory and handed to a
    multiprocessing pool (at most 4 worker processes), each work item being
    processed by _do_convert. Nothing is logged and no pool is started when
    there are no PNG thumbnails.

    apps and schema_editor are the standard RunPython arguments and are
    not used here.
    """
    # Monotonic clock: the measured duration cannot be skewed by
    # wall-clock adjustments while the conversion is running.
    start = time.monotonic()

    with tempfile.TemporaryDirectory() as tempdir:

        work_packages = []

        # NOTE(review): this glob only matches names ending in ".png"; files
        # named "*.png.gpg" are not picked up - confirm whether encrypted
        # thumbnails need separate handling.
        for file in Path(settings.THUMBNAIL_DIR).glob("*.png"):
            existing_thumbnail = file.resolve()

            # Change the existing filename suffix from png to webp
            converted_thumbnail_name = existing_thumbnail.with_suffix(
                ".webp",
            ).name

            # Create the expected output filename in the tempdir
            converted_thumbnail = (
                Path(tempdir) / Path(converted_thumbnail_name)
            ).resolve()

            # Package up the necessary info
            work_packages.append(
                (existing_thumbnail, converted_thumbnail),
            )

        if work_packages:

            logger.info(
                "\n\n"
                " This is a one-time only migration to convert thumbnails for all of your\n"
                " documents into WebP format. If you have a lot of documents though, \n"
                " this may take a while, so a coffee break may be in order."
                "\n",
            )

            with multiprocessing.pool.Pool(
                processes=min(multiprocessing.cpu_count(), 4),
                maxtasksperchild=4,
            ) as pool:
                pool.map(_do_convert, work_packages)

                duration = time.monotonic() - start

            logger.info(f"Conversion completed in {duration:.3f}s")
class Migration(migrations.Migration):
    """
    Data migration that runs the PNG to WebP thumbnail conversion.

    It changes no database schema; the only operation is a RunPython step.
    Reversing the migration is a no-op.
    """

    dependencies = [("documents", "1020_merge_20220518_1839")]

    operations = [
        # Positional form of RunPython(code=..., reverse_code=...)
        migrations.RunPython(
            _convert_thumbnails_to_webp,
            migrations.RunPython.noop,
        ),
    ]

View File

@ -3,6 +3,7 @@ import logging
import os import os
import re import re
from collections import OrderedDict from collections import OrderedDict
from typing import Optional
import dateutil.parser import dateutil.parser
import pathvalidate import pathvalidate
@ -228,7 +229,7 @@ class Document(models.Model):
verbose_name = _("document") verbose_name = _("document")
verbose_name_plural = _("documents") verbose_name_plural = _("documents")
def __str__(self): def __str__(self) -> str:
# Convert UTC database time to local time # Convert UTC database time to local time
created = datetime.date.isoformat(timezone.localdate(self.created)) created = datetime.date.isoformat(timezone.localdate(self.created))
@ -242,7 +243,7 @@ class Document(models.Model):
return res return res
@property @property
def source_path(self): def source_path(self) -> str:
if self.filename: if self.filename:
fname = str(self.filename) fname = str(self.filename)
else: else:
@ -257,11 +258,11 @@ class Document(models.Model):
return open(self.source_path, "rb") return open(self.source_path, "rb")
@property @property
def has_archive_version(self): def has_archive_version(self) -> bool:
return self.archive_filename is not None return self.archive_filename is not None
@property @property
def archive_path(self): def archive_path(self) -> Optional[str]:
if self.has_archive_version: if self.has_archive_version:
return os.path.join(settings.ARCHIVE_DIR, str(self.archive_filename)) return os.path.join(settings.ARCHIVE_DIR, str(self.archive_filename))
else: else:
@ -271,7 +272,7 @@ class Document(models.Model):
def archive_file(self): def archive_file(self):
return open(self.archive_path, "rb") return open(self.archive_path, "rb")
def get_public_filename(self, archive=False, counter=0, suffix=None): def get_public_filename(self, archive=False, counter=0, suffix=None) -> str:
result = str(self) result = str(self)
if counter: if counter:
@ -292,12 +293,14 @@ class Document(models.Model):
return get_default_file_extension(self.mime_type) return get_default_file_extension(self.mime_type)
@property @property
def thumbnail_path(self): def thumbnail_path(self) -> str:
file_name = f"{self.pk:07}.png" webp_file_name = f"{self.pk:07}.webp"
if self.storage_type == self.STORAGE_TYPE_GPG: if self.storage_type == self.STORAGE_TYPE_GPG:
file_name += ".gpg" webp_file_name += ".gpg"
return os.path.join(settings.THUMBNAIL_DIR, file_name) webp_file_path = os.path.join(settings.THUMBNAIL_DIR, webp_file_name)
return os.path.normpath(webp_file_path)
@property @property
def thumbnail_file(self): def thumbnail_file(self):

View File

@ -150,11 +150,14 @@ def run_convert(
def get_default_thumbnail() -> str: def get_default_thumbnail() -> str:
"""
Returns the path to a generic thumbnail
"""
return os.path.join(os.path.dirname(__file__), "resources", "document.png") return os.path.join(os.path.dirname(__file__), "resources", "document.png")
def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str: def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str:
out_path = os.path.join(temp_dir, "convert_gs.png") out_path = os.path.join(temp_dir, "convert_gs.webp")
# if convert fails, fall back to extracting # if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript # the first PDF page as a PNG using Ghostscript
@ -191,7 +194,7 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
""" """
The thumbnail of a PDF is just a 500px wide image of the first page. The thumbnail of a PDF is just a 500px wide image of the first page.
""" """
out_path = os.path.join(temp_dir, "convert.png") out_path = os.path.join(temp_dir, "convert.webp")
# Run convert to get a decent thumbnail # Run convert to get a decent thumbnail
try: try:
@ -319,29 +322,6 @@ class DocumentParser(LoggingMixin):
""" """
raise NotImplementedError() raise NotImplementedError()
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
thumbnail = self.get_thumbnail(document_path, mime_type, file_name)
if settings.OPTIMIZE_THUMBNAILS:
out_path = os.path.join(self.tempdir, "thumb_optipng.png")
args = (
settings.OPTIPNG_BINARY,
"-silent",
"-o5",
thumbnail,
"-out",
out_path,
)
self.log("debug", f"Execute: {' '.join(args)}")
if not subprocess.Popen(args).wait() == 0:
raise ParseError(f"Optipng failed at {args}")
return out_path
else:
return thumbnail
def get_text(self): def get_text(self):
return self.text return self.text

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.6 KiB

View File

@ -176,7 +176,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
) )
with open( with open(
os.path.join(self.dirs.thumbnail_dir, f"{doc.pk:07d}.png"), os.path.join(self.dirs.thumbnail_dir, f"{doc.pk:07d}.webp"),
"wb", "wb",
) as f: ) as f:
f.write(content_thumbnail) f.write(content_thumbnail)
@ -1022,7 +1022,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
"samples", "samples",
"documents", "documents",
"thumbnails", "thumbnails",
"0000001.png", "0000001.webp",
) )
archive_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") archive_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")

View File

@ -180,10 +180,10 @@ class DummyParser(DocumentParser):
def __init__(self, logging_group, scratch_dir, archive_path): def __init__(self, logging_group, scratch_dir, archive_path):
super().__init__(logging_group, None) super().__init__(logging_group, None)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
self.archive_path = archive_path self.archive_path = archive_path
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None): def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb return self.fake_thumb
def parse(self, document_path, mime_type, file_name=None): def parse(self, document_path, mime_type, file_name=None):
@ -194,12 +194,12 @@ class CopyParser(DocumentParser):
def get_thumbnail(self, document_path, mime_type, file_name=None): def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb return self.fake_thumb
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None): def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb return self.fake_thumb
def __init__(self, logging_group, progress_callback=None): def __init__(self, logging_group, progress_callback=None):
super().__init__(logging_group, progress_callback) super().__init__(logging_group, progress_callback)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=self.tempdir) _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=self.tempdir)
def parse(self, document_path, mime_type, file_name=None): def parse(self, document_path, mime_type, file_name=None):
self.text = "The text" self.text = "The text"
@ -214,9 +214,9 @@ class FaultyParser(DocumentParser):
def __init__(self, logging_group, scratch_dir): def __init__(self, logging_group, scratch_dir):
super().__init__(logging_group) super().__init__(logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None): def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb return self.fake_thumb
def parse(self, document_path, mime_type, file_name=None): def parse(self, document_path, mime_type, file_name=None):
@ -230,6 +230,8 @@ def fake_magic_from_file(file, mime=False):
return "application/pdf" return "application/pdf"
elif os.path.splitext(file)[1] == ".png": elif os.path.splitext(file)[1] == ".png":
return "image/png" return "image/png"
elif os.path.splitext(file)[1] == ".webp":
return "image/webp"
else: else:
return "unknown" return "unknown"
else: else:

View File

@ -150,9 +150,9 @@ class TestDecryptDocuments(TestCase):
"samples", "samples",
"documents", "documents",
"thumbnails", "thumbnails",
f"0000004.png.gpg", f"0000004.webp.gpg",
), ),
os.path.join(thumb_dir, f"{doc.id:07}.png.gpg"), os.path.join(thumb_dir, f"{doc.id:07}.webp.gpg"),
) )
call_command("decrypt_documents") call_command("decrypt_documents")
@ -163,7 +163,7 @@ class TestDecryptDocuments(TestCase):
self.assertEqual(doc.filename, "0000004.pdf") self.assertEqual(doc.filename, "0000004.pdf")
self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000004.pdf"))) self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000004.pdf")))
self.assertTrue(os.path.isfile(doc.source_path)) self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(os.path.join(thumb_dir, f"{doc.id:07}.png"))) self.assertTrue(os.path.isfile(os.path.join(thumb_dir, f"{doc.id:07}.webp")))
self.assertTrue(os.path.isfile(doc.thumbnail_path)) self.assertTrue(os.path.isfile(doc.thumbnail_path))
with doc.source_file as f: with doc.source_file as f:

View File

@ -0,0 +1,231 @@
import shutil
import tempfile
from pathlib import Path
from typing import Callable
from typing import Iterable
from typing import Union
from unittest import mock
from django.test import override_settings
from documents.tests.test_migration_archive_files import thumbnail_path
from documents.tests.utils import TestMigrations
@mock.patch(
    "documents.migrations.1021_webp_thumbnail_conversion.multiprocessing.pool.Pool.map",
)
@mock.patch("documents.migrations.1021_webp_thumbnail_conversion.run_convert")
class TestMigrateWebPThumbnails(TestMigrations):
    """
    Tests for the migration converting PNG thumbnails to WebP.

    Both the real conversion (run_convert) and the multiprocessing pool's
    map are mocked for every test; the mocks are passed to each test method
    as run_convert_mock and map_mock, in that order.
    """

    migrate_from = "1020_merge_20220518_1839"
    migrate_to = "1021_webp_thumbnail_conversion"
    auto_migrate = False

    def pretend_convert_output(self, *args, **kwargs):
        """
        Pretends to do the conversion, by copying the input file
        to the output file
        """
        input_file = kwargs["input_file"]
        # Drop the trailing "[0]" frame selector. str.rstrip("[0]") would
        # strip any trailing run of the characters "[", "0" and "]" instead
        # of this exact suffix.
        frame_selector = "[0]"
        if input_file.endswith(frame_selector):
            input_file = input_file[: -len(frame_selector)]
        shutil.copy2(
            Path(input_file),
            Path(kwargs["output_file"]),
        )

    def pretend_map(self, func: Callable, iterable: Iterable):
        """
        Pretends to be the map of a multiprocessing.Pool, but secretly does
        everything in series
        """
        for item in iterable:
            func(item)

    def create_dummy_thumbnails(
        self,
        thumb_dir: Path,
        ext: str,
        count: int,
        start_count: int = 0,
    ):
        """
        Helper to create a certain count of empty files of given extension in
        a given directory. Files are named with a zero padded, 7 digit index,
        starting at start_count.
        """
        for idx in range(count):
            (Path(thumb_dir) / Path(f"{start_count + idx:07}.{ext}")).touch()
        # Triple check expected files exist
        self.assert_file_count_by_extension(ext, thumb_dir, count)

    def create_webp_thumbnail_files(
        self,
        thumb_dir: Path,
        count: int,
        start_count: int = 0,
    ):
        """
        Creates count dummy (empty) WebP thumbnail files in the given directory
        """
        self.create_dummy_thumbnails(thumb_dir, "webp", count, start_count)

    def create_png_thumbnail_file(
        self,
        thumb_dir: Path,
        count: int,
        start_count: int = 0,
    ):
        """
        Creates count dummy (empty) PNG thumbnail files in the given directory
        """
        self.create_dummy_thumbnails(thumb_dir, "png", count, start_count)

    def assert_file_count_by_extension(
        self,
        ext: str,
        dir: Union[str, Path],
        expected_count: int,
    ):
        """
        Helper to assert a certain count of given extension files in given directory
        """
        if not isinstance(dir, Path):
            dir = Path(dir)
        matching_files = list(dir.glob(f"*.{ext}"))
        self.assertEqual(len(matching_files), expected_count)

    def assert_png_file_count(self, dir: Path, expected_count: int):
        """
        Helper to assert a certain count of PNG extension files in given directory
        """
        self.assert_file_count_by_extension("png", dir, expected_count)

    def assert_webp_file_count(self, dir: Path, expected_count: int):
        """
        Helper to assert a certain count of WebP extension files in given directory
        """
        self.assert_file_count_by_extension("webp", dir, expected_count)

    def setUp(self):
        # Fresh, private thumbnail directory for every test
        self.thumbnail_dir = Path(tempfile.mkdtemp()).resolve()

        return super().setUp()

    def tearDown(self) -> None:
        shutil.rmtree(self.thumbnail_dir)

        return super().tearDown()

    def test_do_nothing_if_converted(
        self,
        run_convert_mock: mock.MagicMock,
        map_mock: mock.MagicMock,
    ):
        """
        GIVEN:
            - Only WebP thumbnails exist in the thumbnail directory
        WHEN:
            - Thumbnail conversion is attempted
        THEN:
            - Nothing is converted
        """
        map_mock.side_effect = self.pretend_map

        with override_settings(
            THUMBNAIL_DIR=self.thumbnail_dir,
        ):
            self.create_webp_thumbnail_files(self.thumbnail_dir, 3)

            self.performMigration()
            run_convert_mock.assert_not_called()

            self.assert_webp_file_count(self.thumbnail_dir, 3)

    def test_convert_single_thumbnail(
        self,
        run_convert_mock: mock.MagicMock,
        map_mock: mock.MagicMock,
    ):
        """
        GIVEN:
            - PNG thumbnails exist in the thumbnail directory
        WHEN:
            - Thumbnail conversion is attempted
        THEN:
            - Every PNG thumbnail is converted to WebP
        """
        map_mock.side_effect = self.pretend_map
        run_convert_mock.side_effect = self.pretend_convert_output

        with override_settings(
            THUMBNAIL_DIR=self.thumbnail_dir,
        ):
            self.create_png_thumbnail_file(self.thumbnail_dir, 3)

            self.performMigration()

            run_convert_mock.assert_called()
            self.assertEqual(run_convert_mock.call_count, 3)

            self.assert_webp_file_count(self.thumbnail_dir, 3)

    def test_convert_errors_out(
        self,
        run_convert_mock: mock.MagicMock,
        map_mock: mock.MagicMock,
    ):
        """
        GIVEN:
            - PNG thumbnails exist in the thumbnail directory
        WHEN:
            - Thumbnail conversion is attempted, but raises an exception
        THEN:
            - Conversion is attempted for each thumbnail
            - The PNG thumbnails are left in place
        """
        map_mock.side_effect = self.pretend_map
        run_convert_mock.side_effect = OSError

        with override_settings(
            THUMBNAIL_DIR=self.thumbnail_dir,
        ):
            self.create_png_thumbnail_file(self.thumbnail_dir, 3)

            self.performMigration()

            run_convert_mock.assert_called()
            self.assertEqual(run_convert_mock.call_count, 3)

            self.assert_png_file_count(self.thumbnail_dir, 3)

    def test_convert_mixed(
        self,
        run_convert_mock: mock.MagicMock,
        map_mock: mock.MagicMock,
    ):
        """
        GIVEN:
            - Both PNG and WebP thumbnails exist in the thumbnail directory
        WHEN:
            - Thumbnail conversion is attempted
        THEN:
            - Only the PNG thumbnails are converted
            - No PNG thumbnails remain
        """
        map_mock.side_effect = self.pretend_map
        run_convert_mock.side_effect = self.pretend_convert_output

        with override_settings(
            THUMBNAIL_DIR=self.thumbnail_dir,
        ):
            self.create_png_thumbnail_file(self.thumbnail_dir, 3)
            self.create_webp_thumbnail_files(self.thumbnail_dir, 2, start_count=3)

            self.performMigration()

            run_convert_mock.assert_called()
            self.assertEqual(run_convert_mock.call_count, 3)

            self.assert_png_file_count(self.thumbnail_dir, 0)
            self.assert_webp_file_count(self.thumbnail_dir, 5)

View File

@ -87,31 +87,6 @@ def fake_get_thumbnail(self, path, mimetype, file_name):
return os.path.join(os.path.dirname(__file__), "examples", "no-text.png") return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
class TestBaseParser(TestCase):
def setUp(self) -> None:
self.scratch = tempfile.mkdtemp()
override_settings(SCRATCH_DIR=self.scratch).enable()
def tearDown(self) -> None:
shutil.rmtree(self.scratch)
@mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
@override_settings(OPTIMIZE_THUMBNAILS=True)
def test_get_optimised_thumbnail(self):
parser = DocumentParser(None)
parser.get_optimised_thumbnail("any", "not important", "document.pdf")
@mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
@override_settings(OPTIMIZE_THUMBNAILS=False)
def test_get_optimised_thumb_disabled(self):
parser = DocumentParser(None)
path = parser.get_optimised_thumbnail("any", "not important", "document.pdf")
self.assertEqual(path, fake_get_thumbnail(None, None, None, None))
class TestParserAvailability(TestCase): class TestParserAvailability(TestCase):
def test_file_extensions(self): def test_file_extensions(self):

View File

@ -42,9 +42,9 @@ class TestSanityCheck(DirectoriesMixin, TestCase):
"samples", "samples",
"documents", "documents",
"thumbnails", "thumbnails",
"0000001.png", "0000001.webp",
), ),
os.path.join(self.dirs.thumbnail_dir, "0000001.png"), os.path.join(self.dirs.thumbnail_dir, "0000001.webp"),
) )
return Document.objects.create( return Document.objects.create(

View File

@ -362,7 +362,8 @@ class DocumentViewSet(
handle = doc.thumbnail_file handle = doc.thumbnail_file
# TODO: Send ETag information and use that to send new thumbnails # TODO: Send ETag information and use that to send new thumbnails
# if available # if available
return HttpResponse(handle, content_type="image/png")
return HttpResponse(handle, content_type="image/webp")
except (FileNotFoundError, Document.DoesNotExist): except (FileNotFoundError, Document.DoesNotExist):
raise Http404() raise Http404()

View File

@ -72,7 +72,7 @@ def binaries_check(app_configs, **kwargs):
error = "Paperless can't find {}. Without it, consumption is impossible." error = "Paperless can't find {}. Without it, consumption is impossible."
hint = "Either it's not in your ${PATH} or it's not installed." hint = "Either it's not in your ${PATH} or it's not installed."
binaries = (settings.CONVERT_BINARY, settings.OPTIPNG_BINARY, "tesseract") binaries = (settings.CONVERT_BINARY, "tesseract")
check_messages = [] check_messages = []
for binary in binaries: for binary in binaries:

View File

@ -526,8 +526,6 @@ CONSUMER_BARCODE_TIFF_SUPPORT = __get_boolean(
CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT") CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT")
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0)) OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
# The default language that tesseract will attempt to use when parsing # The default language that tesseract will attempt to use when parsing
@ -570,8 +568,6 @@ CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
# Pre-2.x versions of Paperless stored your documents locally with GPG # Pre-2.x versions of Paperless stored your documents locally with GPG
# encryption, but that is no longer the default. This behaviour is still # encryption, but that is no longer the default. This behaviour is still

View File

@ -13,9 +13,9 @@ class TestChecks(DirectoriesMixin, TestCase):
def test_binaries(self): def test_binaries(self):
self.assertEqual(binaries_check(None), []) self.assertEqual(binaries_check(None), [])
@override_settings(CONVERT_BINARY="uuuhh", OPTIPNG_BINARY="forgot") @override_settings(CONVERT_BINARY="uuuhh")
def test_binaries_fail(self): def test_binaries_fail(self):
self.assertEqual(len(binaries_check(None)), 2) self.assertEqual(len(binaries_check(None)), 1)
def test_paths_check(self): def test_paths_check(self):
self.assertEqual(paths_check(None), []) self.assertEqual(paths_check(None), [])

View File

@ -30,8 +30,8 @@ class TextDocumentParser(DocumentParser):
) )
draw.text((5, 5), read_text(), font=font, fill="black") draw.text((5, 5), read_text(), font=font, fill="black")
out_path = os.path.join(self.tempdir, "thumb.png") out_path = os.path.join(self.tempdir, "thumb.webp")
img.save(out_path) img.save(out_path, format="WEBP")
return out_path return out_path