Feature: Upgrade Gotenberg to v8 (#7094)

This commit is contained in:
Trenton H 2024-06-26 19:37:50 -07:00 committed by GitHub
parent 80c2d90e74
commit 29e6371cd1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 45 additions and 19 deletions

6
Pipfile.lock generated
View File

@ -602,12 +602,12 @@
}, },
"gotenberg-client": { "gotenberg-client": {
"hashes": [ "hashes": [
"sha256:097151c959d9ad9c6292694dac454a07a511489a353086df924f489190084425", "sha256:94e12d1e2ebaaf8552008c95553f5c635c6b8beef401c65cee6ba0d73bbb146b",
"sha256:3d6c0449fd1afb82206bfdc2edacfe1d7d98e9de7207332b696b97a3d4dfba6b" "sha256:b0829ae666ee75fc2bd61cf055c2dea9877f73d52a3b4e1998eea4cee14923f5"
], ],
"index": "pypi", "index": "pypi",
"markers": "python_version >= '3.8'", "markers": "python_version >= '3.8'",
"version": "==0.5.0" "version": "==0.6.0"
}, },
"gunicorn": { "gunicorn": {
"hashes": [ "hashes": [

View File

@ -5,7 +5,7 @@
services: services:
gotenberg: gotenberg:
image: docker.io/gotenberg/gotenberg:7.10 image: docker.io/gotenberg/gotenberg:8.7
hostname: gotenberg hostname: gotenberg
container_name: gotenberg container_name: gotenberg
network_mode: host network_mode: host

View File

@ -77,7 +77,7 @@ services:
PAPERLESS_TIKA_ENDPOINT: http://tika:9998 PAPERLESS_TIKA_ENDPOINT: http://tika:9998
gotenberg: gotenberg:
image: docker.io/gotenberg/gotenberg:7.10 image: docker.io/gotenberg/gotenberg:8.7
restart: unless-stopped restart: unless-stopped
# The gotenberg chromium route is used to convert .eml files. We do not # The gotenberg chromium route is used to convert .eml files. We do not
# want to allow external content like tracking pixels or even javascript. # want to allow external content like tracking pixels or even javascript.

View File

@ -71,7 +71,7 @@ services:
PAPERLESS_TIKA_ENDPOINT: http://tika:9998 PAPERLESS_TIKA_ENDPOINT: http://tika:9998
gotenberg: gotenberg:
image: docker.io/gotenberg/gotenberg:7.10 image: docker.io/gotenberg/gotenberg:8.7
restart: unless-stopped restart: unless-stopped
# The gotenberg chromium route is used to convert .eml files. We do not # The gotenberg chromium route is used to convert .eml files. We do not

View File

@ -59,7 +59,7 @@ services:
PAPERLESS_TIKA_ENDPOINT: http://tika:9998 PAPERLESS_TIKA_ENDPOINT: http://tika:9998
gotenberg: gotenberg:
image: docker.io/gotenberg/gotenberg:7.10 image: docker.io/gotenberg/gotenberg:8.7
restart: unless-stopped restart: unless-stopped
# The gotenberg chromium route is used to convert .eml files. We do not # The gotenberg chromium route is used to convert .eml files. We do not

View File

@ -9,7 +9,9 @@ from django.conf import settings
from django.utils.timezone import is_naive from django.utils.timezone import is_naive
from django.utils.timezone import make_aware from django.utils.timezone import make_aware
from gotenberg_client import GotenbergClient from gotenberg_client import GotenbergClient
from gotenberg_client.options import Margin from gotenberg_client.options import MarginType
from gotenberg_client.options import MarginUnitType
from gotenberg_client.options import PageMarginsType
from gotenberg_client.options import PageSize from gotenberg_client.options import PageSize
from gotenberg_client.options import PdfAFormat from gotenberg_client.options import PdfAFormat
from humanize import naturalsize from humanize import naturalsize
@ -20,6 +22,7 @@ from tika_client import TikaClient
from documents.parsers import DocumentParser from documents.parsers import DocumentParser
from documents.parsers import ParseError from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf from documents.parsers import make_thumbnail_from_pdf
from paperless.models import OutputTypeChoices
class MailDocumentParser(DocumentParser): class MailDocumentParser(DocumentParser):
@ -30,17 +33,22 @@ class MailDocumentParser(DocumentParser):
logging_name = "paperless.parsing.mail" logging_name = "paperless.parsing.mail"
@staticmethod def _settings_to_gotenberg_pdfa(self) -> Optional[PdfAFormat]:
def _settings_to_gotenberg_pdfa() -> Optional[PdfAFormat]:
""" """
Converts our requested PDF/A output into the Gotenberg API Converts our requested PDF/A output into the Gotenberg API
format format
""" """
if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: if settings.OCR_OUTPUT_TYPE in {
OutputTypeChoices.PDF_A,
OutputTypeChoices.PDF_A2,
}:
return PdfAFormat.A2b return PdfAFormat.A2b
elif settings.OCR_OUTPUT_TYPE == "pdfa-1": # pragma: no cover elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover
return PdfAFormat.A1a self.log.warn(
elif settings.OCR_OUTPUT_TYPE == "pdfa-3": # pragma: no cover "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
)
return PdfAFormat.A2b
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover
return PdfAFormat.A3b return PdfAFormat.A3b
return None return None
@ -329,7 +337,14 @@ class MailDocumentParser(DocumentParser):
response = ( response = (
route.index(email_html_file) route.index(email_html_file)
.resource(css_file) .resource(css_file)
.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)) .margins(
PageMarginsType(
top=MarginType(0.1, MarginUnitType.Inches),
bottom=MarginType(0.1, MarginUnitType.Inches),
left=MarginType(0.1, MarginUnitType.Inches),
right=MarginType(0.1, MarginUnitType.Inches),
),
)
.size(PageSize(height=11.7, width=8.27)) .size(PageSize(height=11.7, width=8.27))
.scale(1.0) .scale(1.0)
.run() .run()
@ -404,7 +419,14 @@ class MailDocumentParser(DocumentParser):
route.index(html_clean_file) route.index(html_clean_file)
# Set page size, margins # Set page size, margins
route.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)).size( route.margins(
PageMarginsType(
top=MarginType(0.1, MarginUnitType.Inches),
bottom=MarginType(0.1, MarginUnitType.Inches),
left=MarginType(0.1, MarginUnitType.Inches),
right=MarginType(0.1, MarginUnitType.Inches),
),
).size(
PageSize(height=11.7, width=8.27), PageSize(height=11.7, width=8.27),
).scale(1.0) ).scale(1.0)

View File

@ -102,7 +102,10 @@ class TikaDocumentParser(DocumentParser):
}: }:
route.pdf_format(PdfAFormat.A2b) route.pdf_format(PdfAFormat.A2b)
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
route.pdf_format(PdfAFormat.A1a) self.log.warn(
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
)
route.pdf_format(PdfAFormat.A2b)
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
route.pdf_format(PdfAFormat.A3b) route.pdf_format(PdfAFormat.A3b)

View File

@ -109,7 +109,7 @@ class TestTikaParser(HttpxMockMixin, TestCase):
for setting, expected_key in [ for setting, expected_key in [
("pdfa", "PDF/A-2b"), ("pdfa", "PDF/A-2b"),
("pdfa-2", "PDF/A-2b"), ("pdfa-2", "PDF/A-2b"),
("pdfa-1", "PDF/A-1a"), ("pdfa-1", "PDF/A-2b"),
("pdfa-3", "PDF/A-3b"), ("pdfa-3", "PDF/A-3b"),
]: ]:
with override_settings(OCR_OUTPUT_TYPE=setting): with override_settings(OCR_OUTPUT_TYPE=setting):
@ -124,9 +124,10 @@ class TestTikaParser(HttpxMockMixin, TestCase):
request = self.httpx_mock.get_request() request = self.httpx_mock.get_request()
found = False found = False
for field in request.stream.fields: for field in request.stream.fields:
if isinstance(field, DataField) and field.name == "pdfFormat": if isinstance(field, DataField) and field.name == "pdfa":
self.assertEqual(field.value, expected_key) self.assertEqual(field.value, expected_key)
found = True found = True
break
self.assertTrue(found) self.assertTrue(found)
self.httpx_mock.reset(assert_all_responses_were_requested=False) self.httpx_mock.reset(assert_all_responses_were_requested=False)