Feature: Switches to a new client to handle communication with Gotenberg (#4391)

Switches to a new client to handle communication with Gotenberg for merging and generating PDFs
2026-02-01 23:19:00 -06:00 · 2023-10-19 17:27:29 -07:00
parent 5f0eba694c
commit 999ae678c2
8 changed files with 198 additions and 178 deletions
--- a/1
+++ b/1
@@ -51,6 +51,7 @@ flower = "*"
 bleach = "*"
 zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"}
 django-multiselectfield = "*"
 gotenberg-client = "*"
 [dev-packages]
 # Linting
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
    "_meta": {
        "hash": {
-            "sha256": "3025da2940433d347b2fd2ac222852c21f4aa73eeefbd1ee9152cbfd7a7a48e9"
+            "sha256": "505bd6b18d31ed64988ef307c12a5acb70f611cafd932a391e985a11bbbc8000"
        },
        "pipfile-spec": 6,
        "requires": {},
@@ -539,6 +539,15 @@
            "markers": "python_version >= '3.7'",
            "version": "==2.0.1"
        },
        "gotenberg-client": {
            "hashes": [
                "sha256:4508ecb913ef2d553dd2ceb78e32cee001000ba08c910ba1f9ace38350d1589e",
                "sha256:7a3f8a02caee768391373b3610c6ec25a853cccf391ed6b5d5a1292c3ed15e7e"
            ],
            "index": "pypi",
            "markers": "python_version >= '3.8'",
            "version": "==0.3.0"
        },
        "gunicorn": {
            "hashes": [
                "sha256:3213aa5e8c24949e792bcacfc176fef362e7aac80b76c56f6b5122bf350722f0",
@@ -556,6 +565,13 @@
            "markers": "python_version >= '3.7'",
            "version": "==0.14.0"
        },
        "h2": {
            "hashes": [
                "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d",
                "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"
            ],
            "version": "==4.1.0"
        },
        "hiredis": {
            "hashes": [
                "sha256:071c5814b850574036506a8118034f97c3cbf2fe9947ff45a27b07a48da56240",
@@ -650,6 +666,14 @@
            ],
            "version": "==2.2.3"
        },
        "hpack": {
            "hashes": [
                "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c",
                "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"
            ],
            "markers": "python_full_version >= '3.6.1'",
            "version": "==4.0.0"
        },
        "httpcore": {
            "hashes": [
                "sha256:13b5e5cd1dca1a6636a6aaea212b19f4f85cd88c366a2b82304181b769aab3c9",
@@ -699,6 +723,9 @@
            "version": "==0.6.0"
        },
        "httpx": {
            "extras": [
                "http2"
            ],
            "hashes": [
                "sha256:181ea7f8ba3a82578be86ef4171554dd45fec26a02556a744db029a0a27b7100",
                "sha256:47ecda285389cb32bb2691cc6e069e3ab0205956f681c5b2ad2325719751d875"
@@ -714,6 +741,14 @@
            "markers": "python_version >= '3.8'",
            "version": "==4.8.0"
        },
        "hyperframe": {
            "hashes": [
                "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15",
                "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"
            ],
            "markers": "python_full_version >= '3.6.1'",
            "version": "==6.0.1"
        },
        "idna": {
            "hashes": [
                "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4",
@@ -1782,7 +1817,7 @@
                "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0",
                "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"
            ],
-            "markers": "python_version < '3.11'",
+            "markers": "python_version < '3.10'",
            "version": "==4.8.0"
        },
        "tzdata": {
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -1,13 +1,17 @@
 import re
 from html import escape
 from pathlib import Path
 from typing import Optional
 import httpx
 from bleach import clean
 from bleach import linkify
 from django.conf import settings
 from django.utils.timezone import is_naive
 from django.utils.timezone import make_aware
 from gotenberg_client import GotenbergClient
 from gotenberg_client.options import Margin
 from gotenberg_client.options import PageSize
 from gotenberg_client.options import PdfAFormat
 from humanize import naturalsize
 from imap_tools import MailAttachment
 from imap_tools import MailMessage
@@ -24,11 +28,22 @@ class MailDocumentParser(DocumentParser):
    Gotenberg and sends the html part to a Tika server for text extraction.
    """
    gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
    tika_server = settings.TIKA_ENDPOINT
    logging_name = "paperless.parsing.mail"
    @staticmethod
    def _settings_to_gotenberg_pdfa() -> Optional[PdfAFormat]:
        """
        Map the configured OCR output type onto the matching Gotenberg
        PDF/A format.

        Returns None when settings.OCR_OUTPUT_TYPE is not one of the
        PDF/A variants handled here.
        """
        requested = settings.OCR_OUTPUT_TYPE
        if requested == "pdfa-1":  # pragma: no cover
            return PdfAFormat.A1a
        if requested == "pdfa-3":  # pragma: no cover
            return PdfAFormat.A3b
        if requested in {"pdfa", "pdfa-2"}:
            return PdfAFormat.A2b
        return None
    def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None):
        if not self.archive_path:
            self.archive_path = self.generate_pdf(
@@ -173,7 +188,7 @@ class MailDocumentParser(DocumentParser):
        self.log.info("Sending content to Tika server")
        try:
-            with TikaClient(tika_url=self.tika_server) as client:
+            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
                parsed = client.tika.as_text.from_buffer(html, "text/html")
                if parsed.content is not None:
@@ -182,7 +197,7 @@ class MailDocumentParser(DocumentParser):
        except Exception as err:
            raise ParseError(
                f"Could not parse content with tika server at "
-                f"{self.tika_server}: {err}",
+                f"{settings.TIKA_ENDPOINT}: {err}",
            ) from err
    def generate_pdf(self, mail_message: MailMessage) -> Path:
@@ -195,45 +210,29 @@ class MailDocumentParser(DocumentParser):
        if not mail_message.html:
            archive_path.write_bytes(mail_pdf_file.read_bytes())
        else:
            url_merge = self.gotenberg_server + "/forms/pdfengines/merge"
            pdf_of_html_content = self.generate_pdf_from_html(
                mail_message.html,
                mail_message.attachments,
            )
-            pdf_collection = {
+            with GotenbergClient(
-                "1_mail.pdf": ("1_mail.pdf", mail_pdf_file, "application/pdf"),
+                host=settings.TIKA_GOTENBERG_ENDPOINT,
-                "2_html.pdf": ("2_html.pdf", pdf_of_html_content, "application/pdf"),
+                timeout=settings.CELERY_TASK_TIME_LIMIT,
-            }
+            ) as client, client.merge.merge() as route:
                # Configure requested PDF/A formatting, if any
                pdf_a_format = self._settings_to_gotenberg_pdfa()
                if pdf_a_format is not None:
                    route.pdf_format(pdf_a_format)
                route.merge([mail_pdf_file, pdf_of_html_content])
                try:
-                # Open a handle to each file, replacing the tuple
+                    response = route.run()
                for filename in pdf_collection:
                    file_multi_part = pdf_collection[filename]
                    pdf_collection[filename] = (
                        file_multi_part[0],
                        file_multi_part[1].open("rb"),
                        file_multi_part[2],
                    )
                response = httpx.post(
                    url_merge,
                    files=pdf_collection,
                    timeout=settings.CELERY_TASK_TIME_LIMIT,
                )
                response.raise_for_status()  # ensure we notice bad responses
                    archive_path.write_bytes(response.content)
                except Exception as err:
                    raise ParseError(
                        f"Error while merging email HTML into PDF: {err}",
                    ) from err
            finally:
                for filename in pdf_collection:
                    file_multi_part_handle = pdf_collection[filename][1]
                    file_multi_part_handle.close()
        return archive_path
@@ -299,48 +298,29 @@ class MailDocumentParser(DocumentParser):
        Creates a PDF based on the given email, using the email's values in
        an HTML template
        """
        url = self.gotenberg_server + "/forms/chromium/convert/html"
        self.log.info("Converting mail to PDF")
        css_file = Path(__file__).parent / "templates" / "output.css"
        email_html_file = self.mail_to_html(mail)
-        with css_file.open("rb") as css_handle, email_html_file.open(
+        with GotenbergClient(
-            "rb",
+            host=settings.TIKA_GOTENBERG_ENDPOINT,
-        ) as email_html_handle:
+            timeout=settings.CELERY_TASK_TIME_LIMIT,
-            files = {
+        ) as client, client.chromium.html_to_pdf() as route:
-                "html": ("index.html", email_html_handle, "text/html"),
+            # Configure requested PDF/A formatting, if any
-                "css": ("output.css", css_handle, "text/css"),
+            pdf_a_format = self._settings_to_gotenberg_pdfa()
-            }
+            if pdf_a_format is not None:
-            headers = {}
+                route.pdf_format(pdf_a_format)
            data = {
                "marginTop": "0.1",
                "marginBottom": "0.1",
                "marginLeft": "0.1",
                "marginRight": "0.1",
                "paperWidth": "8.27",
                "paperHeight": "11.7",
                "scale": "1.0",
            }
            # Set the output format of the resulting PDF
            # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno
            if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
                data["pdfFormat"] = "PDF/A-2b"
            elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
                data["pdfFormat"] = "PDF/A-1a"
            elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
                data["pdfFormat"] = "PDF/A-3b"
            try:
-                response = httpx.post(
+                response = (
-                    url,
+                    route.index(email_html_file)
-                    files=files,
+                    .resource(css_file)
-                    headers=headers,
+                    .margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1))
-                    data=data,
+                    .size(PageSize(height=11.7, width=8.27))
-                    timeout=settings.CELERY_TASK_TIME_LIMIT,
+                    .scale(1.0)
                    .run()
                )
                response.raise_for_status()  # ensure we notice bad responses
            except Exception as err:
                raise ParseError(
                    f"Error while converting email to PDF: {err}",
@@ -368,15 +348,25 @@ class MailDocumentParser(DocumentParser):
            text = compiled_close.sub("</div", text)
            return text
        url = self.gotenberg_server + "/forms/chromium/convert/html"
        self.log.info("Converting html to PDF")
        tempdir = Path(self.tempdir)
        html_clean = clean_html_script(orig_html)
        html_clean_file = tempdir / "index.html"
        html_clean_file.write_text(html_clean)
-        files = {}
+        with GotenbergClient(
            host=settings.TIKA_GOTENBERG_ENDPOINT,
            timeout=settings.CELERY_TASK_TIME_LIMIT,
        ) as client, client.chromium.html_to_pdf() as route:
            # Configure requested PDF/A formatting, if any
            pdf_a_format = self._settings_to_gotenberg_pdfa()
            if pdf_a_format is not None:
                route.pdf_format(pdf_a_format)
            # Add attachments as resources, cleaning the filename and replacing
            # it in the index file for inclusion
            for attachment in attachments:
                # Clean the attachment name to be valid
                name_cid = f"cid:{attachment.content_id}"
@@ -386,8 +376,7 @@ class MailDocumentParser(DocumentParser):
                temp_file = tempdir / name_clean
                temp_file.write_bytes(attachment.payload)
-            # Store the attachment for upload
+                route.resource(temp_file)
            files[name_clean] = (name_clean, temp_file, attachment.content_type)
                # Replace as needed the name with the clean name
                html_clean = html_clean.replace(name_cid, name_clean)
@@ -395,42 +384,21 @@ class MailDocumentParser(DocumentParser):
            # Now store the cleaned up HTML version
            html_clean_file = tempdir / "index.html"
            html_clean_file.write_text(html_clean)
            # This is our index file, the main page basically
            route.index(html_clean_file)
-        files["index.html"] = ("index.html", html_clean_file, "text/html")
+            # Set page size, margins
            route.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)).size(
                PageSize(height=11.7, width=8.27),
            ).scale(1.0)
        data = {
            "marginTop": "0.1",
            "marginBottom": "0.1",
            "marginLeft": "0.1",
            "marginRight": "0.1",
            "paperWidth": "8.27",
            "paperHeight": "11.7",
            "scale": "1.0",
        }
            try:
-            # Open a handle to each file, replacing the tuple
+                response = route.run()
            for filename in files:
                file_multi_part = files[filename]
                files[filename] = (
                    file_multi_part[0],
                    file_multi_part[1].open("rb"),
                    file_multi_part[2],
                )
            response = httpx.post(
                url,
                files=files,
                data=data,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            )
            response.raise_for_status()  # ensure we notice bad responses
            except Exception as err:
-            raise ParseError(f"Error while converting document to PDF: {err}") from err
+                raise ParseError(
-        finally:
+                    f"Error while converting document to PDF: {err}",
-            # Ensure all file handles as closed
+                ) from err
            for filename in files:
                file_multi_part_handle = files[filename][1]
                file_multi_part_handle.close()
        html_pdf = tempdir / "html.pdf"
        html_pdf.write_bytes(response.content)
--- a/src/paperless_mail/tests/test_parsers.py
+++ b/src/paperless_mail/tests/test_parsers.py
@@ -341,7 +341,7 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
        )
        parsed = self.parser.tika_parse(html)
        self.assertEqual(expected_text, parsed.strip())
-        self.assertIn(self.parser.tika_server, str(self.httpx_mock.get_request().url))
+        self.assertIn("http://localhost:9998", str(self.httpx_mock.get_request().url))
    def test_tika_parse_exception(self):
        """
@@ -653,5 +653,5 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
        self.assertEqual(
            str(request.url),
-            self.parser.gotenberg_server + "/forms/chromium/convert/html",
+            "http://localhost:3000/forms/chromium/convert/html",
        )
--- a/src/paperless_mail/tests/test_parsers_live.py
+++ b/src/paperless_mail/tests/test_parsers_live.py
@@ -1,11 +1,14 @@
 import os
 import shutil
 import subprocess
 import tempfile
 from pathlib import Path
 from unittest import mock
 import httpx
 import pytest
 from django.test import TestCase
 from imagehash import average_hash
 from pdfminer.high_level import extract_text
 from PIL import Image
 from documents.tests.utils import FileSystemAssertsMixin
@@ -13,6 +16,29 @@ from documents.tests.utils import util_call_with_backoff
 from paperless_mail.tests.test_parsers import BaseMailParserTestCase
 def extract_text(pdf_path: Path) -> str:
    """
    Run poppler's pdftotext on the given PDF, directing its output into a
    temporary file, and return what that file contains.
    """
    with tempfile.NamedTemporaryFile(mode="w+") as output_file:
        command = [
            shutil.which("pdftotext"),
            "-q",
            "-layout",
            "-enc",
            "UTF-8",
            str(pdf_path),
            output_file.name,
        ]
        # check=True raises CalledProcessError on a non-zero exit status
        subprocess.run(command, check=True)
        text = output_file.read()
    return text
 class MailAttachmentMock:
    def __init__(self, payload, content_id):
        self.payload = payload
@@ -150,7 +176,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
        extracted = extract_text(pdf_path)
        expected = (
-            "first\tPDF\tto\tbe\tmerged.\n\n\x0csecond\tPDF\tto\tbe\tmerged.\n\n\x0c"
+            "first   PDF   to   be   merged.\n\x0csecond PDF   to   be   merged.\n\x0c"
        )
        self.assertEqual(expected, extracted)
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -1,9 +1,10 @@
 import os
 from pathlib import Path
 import httpx
 from django.conf import settings
 from django.utils import timezone
 from gotenberg_client import GotenbergClient
 from gotenberg_client.options import PdfAFormat
 from tika_client import TikaClient
 from documents.parsers import DocumentParser
@@ -80,47 +81,33 @@ class TikaDocumentParser(DocumentParser):
        self.archive_path = self.convert_to_pdf(document_path, file_name)
-    def convert_to_pdf(self, document_path, file_name):
+    def convert_to_pdf(self, document_path: Path, file_name):
-        pdf_path = os.path.join(self.tempdir, "convert.pdf")
+        pdf_path = Path(self.tempdir) / "convert.pdf"
        gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
        url = gotenberg_server + "/forms/libreoffice/convert"
        self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
        with open(document_path, "rb") as document_handle:
            files = {
                "files": (
                    "convert" + os.path.splitext(document_path)[-1],
                    document_handle,
                ),
            }
            headers = {}
            data = {}
        with GotenbergClient(
            host=settings.TIKA_GOTENBERG_ENDPOINT,
            timeout=settings.CELERY_TASK_TIME_LIMIT,
        ) as client, client.libre_office.to_pdf() as route:
            # Set the output format of the resulting PDF
            # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno
            if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
-                data["pdfFormat"] = "PDF/A-2b"
+                route.pdf_format(PdfAFormat.A2b)
            elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
-                data["pdfFormat"] = "PDF/A-1a"
+                route.pdf_format(PdfAFormat.A1a)
            elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
-                data["pdfFormat"] = "PDF/A-3b"
+                route.pdf_format(PdfAFormat.A3b)
            route.convert(document_path)
            try:
-                response = httpx.post(
+                response = route.run()
-                    url,
+
-                    files=files,
+                pdf_path.write_bytes(response.content)
-                    headers=headers,
+
-                    data=data,
+                return pdf_path
-                    timeout=settings.CELERY_TASK_TIME_LIMIT,
+
                )
                response.raise_for_status()  # ensure we notice bad responses
            except Exception as err:
                raise ParseError(
                    f"Error while converting document to PDF: {err}",
                ) from err
        with open(pdf_path, "wb") as file:
            file.write(response.content)
            file.close()
        return pdf_path
--- a/src/paperless_tika/tests/test_tika_parser.py
+++ b/src/paperless_tika/tests/test_tika_parser.py
@@ -2,12 +2,11 @@ import datetime
 import os
 import zoneinfo
 from pathlib import Path
 from unittest import mock
 from django.test import TestCase
 from django.test import override_settings
-from httpx import Request
+from httpx import codes
-from httpx import Response
+from httpx._multipart import DataField
 from rest_framework import status
 from documents.parsers import ParseError
@@ -95,8 +94,7 @@ class TestTikaParser(HttpxMockMixin, TestCase):
        with self.assertRaises(ParseError):
            self.parser.convert_to_pdf(file, None)
-    @mock.patch("paperless_tika.parsers.httpx.post")
+    def test_request_pdf_a_format(self):
    def test_request_pdf_a_format(self, post: mock.Mock):
        """
        GIVEN:
            - Document needs to be converted to PDF
@@ -108,10 +106,6 @@ class TestTikaParser(HttpxMockMixin, TestCase):
        file = Path(os.path.join(self.parser.tempdir, "input.odt"))
        file.touch()
        response = Response(status_code=status.HTTP_200_OK)
        response.request = Request("POST", "/somewhere/")
        post.return_value = response
        for setting, expected_key in [
            ("pdfa", "PDF/A-2b"),
            ("pdfa-2", "PDF/A-2b"),
@@ -119,11 +113,20 @@ class TestTikaParser(HttpxMockMixin, TestCase):
            ("pdfa-3", "PDF/A-3b"),
        ]:
            with override_settings(OCR_OUTPUT_TYPE=setting):
                self.httpx_mock.add_response(
                    status_code=codes.OK,
                    content=b"PDF document",
                    method="POST",
                )
                self.parser.convert_to_pdf(file, None)
-                post.assert_called_once()
+                request = self.httpx_mock.get_request()
-                _, kwargs = post.call_args
+                found = False
                for field in request.stream.fields:
                    if isinstance(field, DataField) and field.name == "pdfFormat":
                        self.assertEqual(field.value, expected_key)
                        found = True
                self.assertTrue(found)
-                self.assertEqual(kwargs["data"]["pdfFormat"], expected_key)
+                self.httpx_mock.reset(assert_all_responses_were_requested=False)
                post.reset_mock()
--- a/src/setup.cfg
+++ b/src/setup.cfg
@@ -7,7 +7,7 @@ max-line-length = 88
 [tool:pytest]
 DJANGO_SETTINGS_MODULE=paperless.settings
-addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --quiet --durations=50
+addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50
 env =
  PAPERLESS_DISABLE_DBHANDLER=true