mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Feature: Upgrade Gotenberg to v8 (#7094)
This commit is contained in:
parent
80c2d90e74
commit
29e6371cd1
6
Pipfile.lock
generated
6
Pipfile.lock
generated
@ -602,12 +602,12 @@
|
||||
},
|
||||
"gotenberg-client": {
|
||||
"hashes": [
|
||||
"sha256:097151c959d9ad9c6292694dac454a07a511489a353086df924f489190084425",
|
||||
"sha256:3d6c0449fd1afb82206bfdc2edacfe1d7d98e9de7207332b696b97a3d4dfba6b"
|
||||
"sha256:94e12d1e2ebaaf8552008c95553f5c635c6b8beef401c65cee6ba0d73bbb146b",
|
||||
"sha256:b0829ae666ee75fc2bd61cf055c2dea9877f73d52a3b4e1998eea4cee14923f5"
|
||||
],
|
||||
"index": "pypi",
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==0.5.0"
|
||||
"version": "==0.6.0"
|
||||
},
|
||||
"gunicorn": {
|
||||
"hashes": [
|
||||
|
@ -5,7 +5,7 @@
|
||||
|
||||
services:
|
||||
gotenberg:
|
||||
image: docker.io/gotenberg/gotenberg:7.10
|
||||
image: docker.io/gotenberg/gotenberg:8.7
|
||||
hostname: gotenberg
|
||||
container_name: gotenberg
|
||||
network_mode: host
|
||||
|
@ -77,7 +77,7 @@ services:
|
||||
PAPERLESS_TIKA_ENDPOINT: http://tika:9998
|
||||
|
||||
gotenberg:
|
||||
image: docker.io/gotenberg/gotenberg:7.10
|
||||
image: docker.io/gotenberg/gotenberg:8.7
|
||||
restart: unless-stopped
|
||||
# The gotenberg chromium route is used to convert .eml files. We do not
|
||||
# want to allow external content like tracking pixels or even javascript.
|
||||
|
@ -71,7 +71,7 @@ services:
|
||||
PAPERLESS_TIKA_ENDPOINT: http://tika:9998
|
||||
|
||||
gotenberg:
|
||||
image: docker.io/gotenberg/gotenberg:7.10
|
||||
image: docker.io/gotenberg/gotenberg:8.7
|
||||
restart: unless-stopped
|
||||
|
||||
# The gotenberg chromium route is used to convert .eml files. We do not
|
||||
|
@ -59,7 +59,7 @@ services:
|
||||
PAPERLESS_TIKA_ENDPOINT: http://tika:9998
|
||||
|
||||
gotenberg:
|
||||
image: docker.io/gotenberg/gotenberg:7.10
|
||||
image: docker.io/gotenberg/gotenberg:8.7
|
||||
restart: unless-stopped
|
||||
|
||||
# The gotenberg chromium route is used to convert .eml files. We do not
|
||||
|
@ -9,7 +9,9 @@ from django.conf import settings
|
||||
from django.utils.timezone import is_naive
|
||||
from django.utils.timezone import make_aware
|
||||
from gotenberg_client import GotenbergClient
|
||||
from gotenberg_client.options import Margin
|
||||
from gotenberg_client.options import MarginType
|
||||
from gotenberg_client.options import MarginUnitType
|
||||
from gotenberg_client.options import PageMarginsType
|
||||
from gotenberg_client.options import PageSize
|
||||
from gotenberg_client.options import PdfAFormat
|
||||
from humanize import naturalsize
|
||||
@ -20,6 +22,7 @@ from tika_client import TikaClient
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from paperless.models import OutputTypeChoices
|
||||
|
||||
|
||||
class MailDocumentParser(DocumentParser):
|
||||
@ -30,17 +33,22 @@ class MailDocumentParser(DocumentParser):
|
||||
|
||||
logging_name = "paperless.parsing.mail"
|
||||
|
||||
@staticmethod
|
||||
def _settings_to_gotenberg_pdfa() -> Optional[PdfAFormat]:
|
||||
def _settings_to_gotenberg_pdfa(self) -> Optional[PdfAFormat]:
|
||||
"""
|
||||
Converts our requested PDF/A output into the Gotenberg API
|
||||
format
|
||||
"""
|
||||
if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
|
||||
if settings.OCR_OUTPUT_TYPE in {
|
||||
OutputTypeChoices.PDF_A,
|
||||
OutputTypeChoices.PDF_A2,
|
||||
}:
|
||||
return PdfAFormat.A2b
|
||||
elif settings.OCR_OUTPUT_TYPE == "pdfa-1": # pragma: no cover
|
||||
return PdfAFormat.A1a
|
||||
elif settings.OCR_OUTPUT_TYPE == "pdfa-3": # pragma: no cover
|
||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover
|
||||
self.log.warn(
|
||||
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
|
||||
)
|
||||
return PdfAFormat.A2b
|
||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: # pragma: no cover
|
||||
return PdfAFormat.A3b
|
||||
return None
|
||||
|
||||
@ -329,7 +337,14 @@ class MailDocumentParser(DocumentParser):
|
||||
response = (
|
||||
route.index(email_html_file)
|
||||
.resource(css_file)
|
||||
.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1))
|
||||
.margins(
|
||||
PageMarginsType(
|
||||
top=MarginType(0.1, MarginUnitType.Inches),
|
||||
bottom=MarginType(0.1, MarginUnitType.Inches),
|
||||
left=MarginType(0.1, MarginUnitType.Inches),
|
||||
right=MarginType(0.1, MarginUnitType.Inches),
|
||||
),
|
||||
)
|
||||
.size(PageSize(height=11.7, width=8.27))
|
||||
.scale(1.0)
|
||||
.run()
|
||||
@ -404,7 +419,14 @@ class MailDocumentParser(DocumentParser):
|
||||
route.index(html_clean_file)
|
||||
|
||||
# Set page size, margins
|
||||
route.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)).size(
|
||||
route.margins(
|
||||
PageMarginsType(
|
||||
top=MarginType(0.1, MarginUnitType.Inches),
|
||||
bottom=MarginType(0.1, MarginUnitType.Inches),
|
||||
left=MarginType(0.1, MarginUnitType.Inches),
|
||||
right=MarginType(0.1, MarginUnitType.Inches),
|
||||
),
|
||||
).size(
|
||||
PageSize(height=11.7, width=8.27),
|
||||
).scale(1.0)
|
||||
|
||||
|
@ -102,7 +102,10 @@ class TikaDocumentParser(DocumentParser):
|
||||
}:
|
||||
route.pdf_format(PdfAFormat.A2b)
|
||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
|
||||
route.pdf_format(PdfAFormat.A1a)
|
||||
self.log.warn(
|
||||
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
|
||||
)
|
||||
route.pdf_format(PdfAFormat.A2b)
|
||||
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
|
||||
route.pdf_format(PdfAFormat.A3b)
|
||||
|
||||
|
@ -109,7 +109,7 @@ class TestTikaParser(HttpxMockMixin, TestCase):
|
||||
for setting, expected_key in [
|
||||
("pdfa", "PDF/A-2b"),
|
||||
("pdfa-2", "PDF/A-2b"),
|
||||
("pdfa-1", "PDF/A-1a"),
|
||||
("pdfa-1", "PDF/A-2b"),
|
||||
("pdfa-3", "PDF/A-3b"),
|
||||
]:
|
||||
with override_settings(OCR_OUTPUT_TYPE=setting):
|
||||
@ -124,9 +124,10 @@ class TestTikaParser(HttpxMockMixin, TestCase):
|
||||
request = self.httpx_mock.get_request()
|
||||
found = False
|
||||
for field in request.stream.fields:
|
||||
if isinstance(field, DataField) and field.name == "pdfFormat":
|
||||
if isinstance(field, DataField) and field.name == "pdfa":
|
||||
self.assertEqual(field.value, expected_key)
|
||||
found = True
|
||||
break
|
||||
self.assertTrue(found)
|
||||
|
||||
self.httpx_mock.reset(assert_all_responses_were_requested=False)
|
||||
|
Loading…
x
Reference in New Issue
Block a user