Feature: Switches to a new client to handle communication with Gotenberg (#4391)

Switches to a new client to handle communication with Gotenberg for merging and generating PDFs
2026-01-10 21:34:20 -06:00 · 2023-10-19 17:27:29 -07:00
parent 5f0eba694c
commit 999ae678c2
8 changed files with 198 additions and 178 deletions
--- a/src/paperless_mail/tests/test_parsers.py
+++ b/src/paperless_mail/tests/test_parsers.py
@@ -341,7 +341,7 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
        )
        parsed = self.parser.tika_parse(html)
        self.assertEqual(expected_text, parsed.strip())
-        self.assertIn(self.parser.tika_server, str(self.httpx_mock.get_request().url))
+        self.assertIn("http://localhost:9998", str(self.httpx_mock.get_request().url))

    def test_tika_parse_exception(self):
        """
@@ -653,5 +653,5 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)

        self.assertEqual(
            str(request.url),
-            self.parser.gotenberg_server + "/forms/chromium/convert/html",
+            "http://localhost:3000/forms/chromium/convert/html",
        )
--- a/src/paperless_mail/tests/test_parsers_live.py
+++ b/src/paperless_mail/tests/test_parsers_live.py
@@ -1,11 +1,14 @@
 import os
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
 from unittest import mock

 import httpx
 import pytest
 from django.test import TestCase
 from imagehash import average_hash
-from pdfminer.high_level import extract_text
 from PIL import Image

 from documents.tests.utils import FileSystemAssertsMixin
@@ -13,6 +16,29 @@ from documents.tests.utils import util_call_with_backoff
 from paperless_mail.tests.test_parsers import BaseMailParserTestCase


+def extract_text(pdf_path: Path) -> str:
+    """
+    Extract the text of a PDF using pdftotext (poppler).
+
+    pdftotext writes UTF-8 text into a temporary file, which is then read
+    back and returned, so the file is opened as UTF-8 regardless of locale.
+    """
+    with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as tmp:
+        subprocess.run(
+            [
+                shutil.which("pdftotext"),
+                "-q",
+                "-layout",
+                "-enc",
+                "UTF-8",
+                str(pdf_path),
+                tmp.name,
+            ],
+            check=True,
+        )
+        return tmp.read()
+
+
 class MailAttachmentMock:
    def __init__(self, payload, content_id):
        self.payload = payload
@@ -150,7 +176,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):

        extracted = extract_text(pdf_path)
        expected = (
-            "first\tPDF\tto\tbe\tmerged.\n\n\x0csecond\tPDF\tto\tbe\tmerged.\n\n\x0c"
+            "first   PDF   to   be   merged.\n\x0csecond PDF   to   be   merged.\n\x0c"
        )

        self.assertEqual(expected, extracted)