From 1b2cb13a216311202449c5d7808b340ab8942004 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 26 Jan 2023 08:00:02 -0800 Subject: [PATCH] Adds setting to Gotenberg API call for outputting the correct PDF/A format --- src/paperless_mail/parsers.py | 10 ++++++++++ src/paperless_tika/parsers.py | 12 +++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index cc5d4e3c8..f1ee263aa 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -271,6 +271,16 @@ class MailDocumentParser(DocumentParser): "paperHeight": "11.7", "scale": "1.0", } + + # Set the output format of the resulting PDF + # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno + if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: + data["pdfFormat"] = "PDF/A-2b" + elif settings.OCR_OUTPUT_TYPE == "pdfa-1": + data["pdfFormat"] = "PDF/A-1a" + elif settings.OCR_OUTPUT_TYPE == "pdfa-3": + data["pdfFormat"] = "PDF/A-3b" + try: response = requests.post( url, diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 1cfb1eecb..f34ecbbab 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -95,9 +95,19 @@ class TikaDocumentParser(DocumentParser): ), } headers = {} + data = {} + + # Set the output format of the resulting PDF + # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno + if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: + data["pdfFormat"] = "PDF/A-2b" + elif settings.OCR_OUTPUT_TYPE == "pdfa-1": + data["pdfFormat"] = "PDF/A-1a" + elif settings.OCR_OUTPUT_TYPE == "pdfa-3": + data["pdfFormat"] = "PDF/A-3b" try: - response = requests.post(url, files=files, headers=headers) + response = requests.post(url, files=files, headers=headers, data=data) response.raise_for_status() # ensure we notice bad responses except Exception as err: raise ParseError(