Feature: Switches to a new client to handle communication with Gotenberg (#4391)

Switches to a new client to handle communication with Gotenberg for merging and generating PDFs.
This commit is contained in:
Trenton H
2023-10-19 17:27:29 -07:00
committed by GitHub
parent 5f0eba694c
commit 999ae678c2
8 changed files with 198 additions and 178 deletions

View File

@@ -341,7 +341,7 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
)
parsed = self.parser.tika_parse(html)
self.assertEqual(expected_text, parsed.strip())
self.assertIn(self.parser.tika_server, str(self.httpx_mock.get_request().url))
self.assertIn("http://localhost:9998", str(self.httpx_mock.get_request().url))
def test_tika_parse_exception(self):
"""
@@ -653,5 +653,5 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
self.assertEqual(
str(request.url),
self.parser.gotenberg_server + "/forms/chromium/convert/html",
"http://localhost:3000/forms/chromium/convert/html",
)

View File

@@ -1,11 +1,14 @@
import os
import shutil
import subprocess
import tempfile
from pathlib import Path
from unittest import mock
import httpx
import pytest
from django.test import TestCase
from imagehash import average_hash
from pdfminer.high_level import extract_text
from PIL import Image
from documents.tests.utils import FileSystemAssertsMixin
@@ -13,6 +16,29 @@ from documents.tests.utils import util_call_with_backoff
from paperless_mail.tests.test_parsers import BaseMailParserTestCase
def extract_text(pdf_path: Path) -> str:
    """
    Using pdftotext from poppler, extracts the text of a PDF into a file,
    then reads the file contents and returns it

    Args:
        pdf_path: Location of the PDF to extract text from.

    Returns:
        The contents of the file pdftotext produced, decoded as UTF-8.

    Raises:
        FileNotFoundError: If no pdftotext executable can be found on PATH.
        subprocess.CalledProcessError: If pdftotext exits with a non-zero
            status (check=True is passed to subprocess.run).
    """
    pdftotext = shutil.which("pdftotext")
    if pdftotext is None:
        # Fail with a clear message instead of the TypeError subprocess
        # raises when it is handed None as the executable
        raise FileNotFoundError("pdftotext (poppler) was not found on PATH")

    # pdftotext is asked for UTF-8 output below, so decode with that same
    # encoding rather than whatever the locale default happens to be
    with tempfile.NamedTemporaryFile(
        mode="w+",
        encoding="utf-8",
    ) as tmp:
        subprocess.run(
            [
                pdftotext,
                "-q",
                "-layout",
                "-enc",
                "UTF-8",
                str(pdf_path),
                tmp.name,
            ],
            check=True,
        )
        # Nothing was written through this handle, so it is still positioned
        # at the start of the file pdftotext wrote to by name
        return tmp.read()
class MailAttachmentMock:
def __init__(self, payload, content_id):
self.payload = payload
@@ -150,7 +176,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
extracted = extract_text(pdf_path)
expected = (
"first\tPDF\tto\tbe\tmerged.\n\n\x0csecond\tPDF\tto\tbe\tmerged.\n\n\x0c"
"first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c"
)
self.assertEqual(expected, extracted)