From 29e6371cd11de49c37f9b08318de20ec02209fce Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 26 Jun 2024 19:37:50 -0700 Subject: [PATCH] Feature: Upgrade Gotenberg to v8 (#7094) --- Pipfile.lock | 6 +-- docker/compose/docker-compose.ci-test.yml | 2 +- .../compose/docker-compose.mariadb-tika.yml | 2 +- .../compose/docker-compose.postgres-tika.yml | 2 +- docker/compose/docker-compose.sqlite-tika.yml | 2 +- src/paperless_mail/parsers.py | 40 ++++++++++++++----- src/paperless_tika/parsers.py | 5 ++- src/paperless_tika/tests/test_tika_parser.py | 5 ++- 8 files changed, 45 insertions(+), 19 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 19fa61a23..a79f6db12 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -602,12 +602,12 @@ }, "gotenberg-client": { "hashes": [ - "sha256:097151c959d9ad9c6292694dac454a07a511489a353086df924f489190084425", - "sha256:3d6c0449fd1afb82206bfdc2edacfe1d7d98e9de7207332b696b97a3d4dfba6b" + "sha256:94e12d1e2ebaaf8552008c95553f5c635c6b8beef401c65cee6ba0d73bbb146b", + "sha256:b0829ae666ee75fc2bd61cf055c2dea9877f73d52a3b4e1998eea4cee14923f5" ], "index": "pypi", "markers": "python_version >= '3.8'", - "version": "==0.5.0" + "version": "==0.6.0" }, "gunicorn": { "hashes": [ diff --git a/docker/compose/docker-compose.ci-test.yml b/docker/compose/docker-compose.ci-test.yml index d1c75ead2..d67aa9f61 100644 --- a/docker/compose/docker-compose.ci-test.yml +++ b/docker/compose/docker-compose.ci-test.yml @@ -5,7 +5,7 @@ services: gotenberg: - image: docker.io/gotenberg/gotenberg:7.10 + image: docker.io/gotenberg/gotenberg:8.7 hostname: gotenberg container_name: gotenberg network_mode: host diff --git a/docker/compose/docker-compose.mariadb-tika.yml b/docker/compose/docker-compose.mariadb-tika.yml index e23fe8b92..b451ce9e8 100644 --- a/docker/compose/docker-compose.mariadb-tika.yml +++ b/docker/compose/docker-compose.mariadb-tika.yml @@ -77,7 +77,7 @@ services: PAPERLESS_TIKA_ENDPOINT: http://tika:9998 gotenberg: - image: docker.io/gotenberg/gotenberg:7.10 + image: docker.io/gotenberg/gotenberg:8.7 restart: unless-stopped # The gotenberg chromium route is used to convert .eml files. We do not # want to allow external content like tracking pixels or even javascript. diff --git a/docker/compose/docker-compose.postgres-tika.yml b/docker/compose/docker-compose.postgres-tika.yml index fb129f7a2..e168dfadc 100644 --- a/docker/compose/docker-compose.postgres-tika.yml +++ b/docker/compose/docker-compose.postgres-tika.yml @@ -71,7 +71,7 @@ services: PAPERLESS_TIKA_ENDPOINT: http://tika:9998 gotenberg: - image: docker.io/gotenberg/gotenberg:7.10 + image: docker.io/gotenberg/gotenberg:8.7 restart: unless-stopped # The gotenberg chromium route is used to convert .eml files. We do not diff --git a/docker/compose/docker-compose.sqlite-tika.yml b/docker/compose/docker-compose.sqlite-tika.yml index 34c8dbfed..abfb64cdf 100644 --- a/docker/compose/docker-compose.sqlite-tika.yml +++ b/docker/compose/docker-compose.sqlite-tika.yml @@ -59,7 +59,7 @@ services: PAPERLESS_TIKA_ENDPOINT: http://tika:9998 gotenberg: - image: docker.io/gotenberg/gotenberg:7.10 + image: docker.io/gotenberg/gotenberg:8.7 restart: unless-stopped # The gotenberg chromium route is used to convert .eml files. We do not diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index 9275c5579..9047b5f90 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -9,7 +9,9 @@ from django.conf import settings from django.utils.timezone import is_naive from django.utils.timezone import make_aware from gotenberg_client import GotenbergClient -from gotenberg_client.options import Margin +from gotenberg_client.options import MarginType +from gotenberg_client.options import MarginUnitType +from gotenberg_client.options import PageMarginsType from gotenberg_client.options import PageSize from gotenberg_client.options import PdfAFormat from humanize import naturalsize @@ -20,6 +22,7 @@ from tika_client import TikaClient from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.parsers import make_thumbnail_from_pdf +from paperless.models import OutputTypeChoices class MailDocumentParser(DocumentParser): @@ -30,17 +33,22 @@ class MailDocumentParser(DocumentParser): logging_name = "paperless.parsing.mail" - @staticmethod - def _settings_to_gotenberg_pdfa() -> Optional[PdfAFormat]: + def _settings_to_gotenberg_pdfa(self) -> Optional[PdfAFormat]: """ Converts our requested PDF/A output into the Gotenberg API format """ - if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: + if settings.OCR_OUTPUT_TYPE in { + OutputTypeChoices.PDF_A, + OutputTypeChoices.PDF_A2, + }: return PdfAFormat.A2b - elif settings.OCR_OUTPUT_TYPE == "pdfa-1": # pragma: no cover - return PdfAFormat.A1a - elif settings.OCR_OUTPUT_TYPE == "pdfa-3": # pragma: no cover + elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover + self.log.warn( + "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead", + ) + return PdfAFormat.A2b + elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover return PdfAFormat.A3b return None @@ -329,7 +337,14 @@ class MailDocumentParser(DocumentParser): response = ( route.index(email_html_file) .resource(css_file) - .margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)) + .margins( + PageMarginsType( + top=MarginType(0.1, MarginUnitType.Inches), + bottom=MarginType(0.1, MarginUnitType.Inches), + left=MarginType(0.1, MarginUnitType.Inches), + right=MarginType(0.1, MarginUnitType.Inches), + ), + ) .size(PageSize(height=11.7, width=8.27)) .scale(1.0) .run() @@ -404,7 +419,14 @@ class MailDocumentParser(DocumentParser): route.index(html_clean_file) # Set page size, margins - route.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)).size( + route.margins( + PageMarginsType( + top=MarginType(0.1, MarginUnitType.Inches), + bottom=MarginType(0.1, MarginUnitType.Inches), + left=MarginType(0.1, MarginUnitType.Inches), + right=MarginType(0.1, MarginUnitType.Inches), + ), + ).size( PageSize(height=11.7, width=8.27), ).scale(1.0) diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index d5589cca4..519f6c6ae 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -102,7 +102,10 @@ class TikaDocumentParser(DocumentParser): }: route.pdf_format(PdfAFormat.A2b) elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: - route.pdf_format(PdfAFormat.A1a) + self.log.warn( + "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead", + ) + route.pdf_format(PdfAFormat.A2b) elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: route.pdf_format(PdfAFormat.A3b) diff --git a/src/paperless_tika/tests/test_tika_parser.py b/src/paperless_tika/tests/test_tika_parser.py index 81d6f026a..ee010eb49 100644 --- a/src/paperless_tika/tests/test_tika_parser.py +++ b/src/paperless_tika/tests/test_tika_parser.py @@ -109,7 +109,7 @@ class TestTikaParser(HttpxMockMixin, TestCase): for setting, expected_key in [ ("pdfa", "PDF/A-2b"), ("pdfa-2", "PDF/A-2b"), - ("pdfa-1", "PDF/A-1a"), + ("pdfa-1", "PDF/A-2b"), ("pdfa-3", "PDF/A-3b"), ]: with override_settings(OCR_OUTPUT_TYPE=setting): @@ -124,9 +124,10 @@ class TestTikaParser(HttpxMockMixin, TestCase): request = self.httpx_mock.get_request() found = False for field in request.stream.fields: - if isinstance(field, DataField) and field.name == "pdfFormat": + if isinstance(field, DataField) and field.name == "pdfa": self.assertEqual(field.value, expected_key) found = True + break self.assertTrue(found) self.httpx_mock.reset(assert_all_responses_were_requested=False)