Feature: Switches to a new client to handle communication with Gotenberg (#4391)

Switches to a new client to handle communication with Gotenberg for merging and generating PDFs
This commit is contained in:
Trenton H 2023-10-19 17:27:29 -07:00 committed by GitHub
parent 5f0eba694c
commit 999ae678c2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 198 additions and 178 deletions

View File

@ -51,6 +51,7 @@ flower = "*"
bleach = "*"
zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"}
django-multiselectfield = "*"
gotenberg-client = "*"
[dev-packages]
# Linting

39
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "3025da2940433d347b2fd2ac222852c21f4aa73eeefbd1ee9152cbfd7a7a48e9"
"sha256": "505bd6b18d31ed64988ef307c12a5acb70f611cafd932a391e985a11bbbc8000"
},
"pipfile-spec": 6,
"requires": {},
@ -539,6 +539,15 @@
"markers": "python_version >= '3.7'",
"version": "==2.0.1"
},
"gotenberg-client": {
"hashes": [
"sha256:4508ecb913ef2d553dd2ceb78e32cee001000ba08c910ba1f9ace38350d1589e",
"sha256:7a3f8a02caee768391373b3610c6ec25a853cccf391ed6b5d5a1292c3ed15e7e"
],
"index": "pypi",
"markers": "python_version >= '3.8'",
"version": "==0.3.0"
},
"gunicorn": {
"hashes": [
"sha256:3213aa5e8c24949e792bcacfc176fef362e7aac80b76c56f6b5122bf350722f0",
@ -556,6 +565,13 @@
"markers": "python_version >= '3.7'",
"version": "==0.14.0"
},
"h2": {
"hashes": [
"sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d",
"sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"
],
"version": "==4.1.0"
},
"hiredis": {
"hashes": [
"sha256:071c5814b850574036506a8118034f97c3cbf2fe9947ff45a27b07a48da56240",
@ -650,6 +666,14 @@
],
"version": "==2.2.3"
},
"hpack": {
"hashes": [
"sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c",
"sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"
],
"markers": "python_full_version >= '3.6.1'",
"version": "==4.0.0"
},
"httpcore": {
"hashes": [
"sha256:13b5e5cd1dca1a6636a6aaea212b19f4f85cd88c366a2b82304181b769aab3c9",
@ -699,6 +723,9 @@
"version": "==0.6.0"
},
"httpx": {
"extras": [
"http2"
],
"hashes": [
"sha256:181ea7f8ba3a82578be86ef4171554dd45fec26a02556a744db029a0a27b7100",
"sha256:47ecda285389cb32bb2691cc6e069e3ab0205956f681c5b2ad2325719751d875"
@ -714,6 +741,14 @@
"markers": "python_version >= '3.8'",
"version": "==4.8.0"
},
"hyperframe": {
"hashes": [
"sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15",
"sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"
],
"markers": "python_full_version >= '3.6.1'",
"version": "==6.0.1"
},
"idna": {
"hashes": [
"sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4",
@ -1782,7 +1817,7 @@
"sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0",
"sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"
],
"markers": "python_version < '3.11'",
"markers": "python_version < '3.10'",
"version": "==4.8.0"
},
"tzdata": {

View File

@ -1,13 +1,17 @@
import re
from html import escape
from pathlib import Path
from typing import Optional
import httpx
from bleach import clean
from bleach import linkify
from django.conf import settings
from django.utils.timezone import is_naive
from django.utils.timezone import make_aware
from gotenberg_client import GotenbergClient
from gotenberg_client.options import Margin
from gotenberg_client.options import PageSize
from gotenberg_client.options import PdfAFormat
from humanize import naturalsize
from imap_tools import MailAttachment
from imap_tools import MailMessage
@ -24,11 +28,22 @@ class MailDocumentParser(DocumentParser):
Gotenberg and sends the html part to a Tika server for text extraction.
"""
gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
tika_server = settings.TIKA_ENDPOINT
logging_name = "paperless.parsing.mail"
@staticmethod
def _settings_to_gotenberg_pdfa() -> Optional[PdfAFormat]:
    """
    Translate the configured OCR output type into the matching
    Gotenberg PDF/A format.

    Reads settings.OCR_OUTPUT_TYPE and returns the PdfAFormat member to
    request from Gotenberg, or None when the configured value does not
    name a PDF/A flavour (no PDF/A conversion is requested then).
    """
    # Both the generic "pdfa" and the explicit "pdfa-2" map to PDF/A-2b
    format_by_output_type = {
        "pdfa": PdfAFormat.A2b,
        "pdfa-2": PdfAFormat.A2b,
        "pdfa-1": PdfAFormat.A1a,
        "pdfa-3": PdfAFormat.A3b,
    }
    return format_by_output_type.get(settings.OCR_OUTPUT_TYPE)
def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None):
if not self.archive_path:
self.archive_path = self.generate_pdf(
@ -173,7 +188,7 @@ class MailDocumentParser(DocumentParser):
self.log.info("Sending content to Tika server")
try:
with TikaClient(tika_url=self.tika_server) as client:
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
parsed = client.tika.as_text.from_buffer(html, "text/html")
if parsed.content is not None:
@ -182,7 +197,7 @@ class MailDocumentParser(DocumentParser):
except Exception as err:
raise ParseError(
f"Could not parse content with tika server at "
f"{self.tika_server}: {err}",
f"{settings.TIKA_ENDPOINT}: {err}",
) from err
def generate_pdf(self, mail_message: MailMessage) -> Path:
@ -195,45 +210,29 @@ class MailDocumentParser(DocumentParser):
if not mail_message.html:
archive_path.write_bytes(mail_pdf_file.read_bytes())
else:
url_merge = self.gotenberg_server + "/forms/pdfengines/merge"
pdf_of_html_content = self.generate_pdf_from_html(
mail_message.html,
mail_message.attachments,
)
pdf_collection = {
"1_mail.pdf": ("1_mail.pdf", mail_pdf_file, "application/pdf"),
"2_html.pdf": ("2_html.pdf", pdf_of_html_content, "application/pdf"),
}
with GotenbergClient(
host=settings.TIKA_GOTENBERG_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
) as client, client.merge.merge() as route:
# Configure requested PDF/A formatting, if any
pdf_a_format = self._settings_to_gotenberg_pdfa()
if pdf_a_format is not None:
route.pdf_format(pdf_a_format)
try:
# Open a handle to each file, replacing the tuple
for filename in pdf_collection:
file_multi_part = pdf_collection[filename]
pdf_collection[filename] = (
file_multi_part[0],
file_multi_part[1].open("rb"),
file_multi_part[2],
)
route.merge([mail_pdf_file, pdf_of_html_content])
response = httpx.post(
url_merge,
files=pdf_collection,
timeout=settings.CELERY_TASK_TIME_LIMIT,
)
response.raise_for_status() # ensure we notice bad responses
archive_path.write_bytes(response.content)
except Exception as err:
raise ParseError(
f"Error while merging email HTML into PDF: {err}",
) from err
finally:
for filename in pdf_collection:
file_multi_part_handle = pdf_collection[filename][1]
file_multi_part_handle.close()
try:
response = route.run()
archive_path.write_bytes(response.content)
except Exception as err:
raise ParseError(
f"Error while merging email HTML into PDF: {err}",
) from err
return archive_path
@ -299,48 +298,29 @@ class MailDocumentParser(DocumentParser):
Creates a PDF based on the given email, using the email's values in
an HTML template
"""
url = self.gotenberg_server + "/forms/chromium/convert/html"
self.log.info("Converting mail to PDF")
css_file = Path(__file__).parent / "templates" / "output.css"
email_html_file = self.mail_to_html(mail)
with css_file.open("rb") as css_handle, email_html_file.open(
"rb",
) as email_html_handle:
files = {
"html": ("index.html", email_html_handle, "text/html"),
"css": ("output.css", css_handle, "text/css"),
}
headers = {}
data = {
"marginTop": "0.1",
"marginBottom": "0.1",
"marginLeft": "0.1",
"marginRight": "0.1",
"paperWidth": "8.27",
"paperHeight": "11.7",
"scale": "1.0",
}
# Set the output format of the resulting PDF
# Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno
if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
data["pdfFormat"] = "PDF/A-2b"
elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
data["pdfFormat"] = "PDF/A-1a"
elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
data["pdfFormat"] = "PDF/A-3b"
with GotenbergClient(
host=settings.TIKA_GOTENBERG_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
) as client, client.chromium.html_to_pdf() as route:
# Configure requested PDF/A formatting, if any
pdf_a_format = self._settings_to_gotenberg_pdfa()
if pdf_a_format is not None:
route.pdf_format(pdf_a_format)
try:
response = httpx.post(
url,
files=files,
headers=headers,
data=data,
timeout=settings.CELERY_TASK_TIME_LIMIT,
response = (
route.index(email_html_file)
.resource(css_file)
.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1))
.size(PageSize(height=11.7, width=8.27))
.scale(1.0)
.run()
)
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(
f"Error while converting email to PDF: {err}",
@ -368,69 +348,57 @@ class MailDocumentParser(DocumentParser):
text = compiled_close.sub("</div", text)
return text
url = self.gotenberg_server + "/forms/chromium/convert/html"
self.log.info("Converting html to PDF")
tempdir = Path(self.tempdir)
html_clean = clean_html_script(orig_html)
files = {}
for attachment in attachments:
# Clean the attachment name to be valid
name_cid = f"cid:{attachment.content_id}"
name_clean = "".join(e for e in name_cid if e.isalnum())
# Write attachment payload to a temp file
temp_file = tempdir / name_clean
temp_file.write_bytes(attachment.payload)
# Store the attachment for upload
files[name_clean] = (name_clean, temp_file, attachment.content_type)
# Replace as needed the name with the clean name
html_clean = html_clean.replace(name_cid, name_clean)
# Now store the cleaned up HTML version
html_clean_file = tempdir / "index.html"
html_clean_file.write_text(html_clean)
files["index.html"] = ("index.html", html_clean_file, "text/html")
with GotenbergClient(
host=settings.TIKA_GOTENBERG_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
) as client, client.chromium.html_to_pdf() as route:
# Configure requested PDF/A formatting, if any
pdf_a_format = self._settings_to_gotenberg_pdfa()
if pdf_a_format is not None:
route.pdf_format(pdf_a_format)
data = {
"marginTop": "0.1",
"marginBottom": "0.1",
"marginLeft": "0.1",
"marginRight": "0.1",
"paperWidth": "8.27",
"paperHeight": "11.7",
"scale": "1.0",
}
try:
# Open a handle to each file, replacing the tuple
for filename in files:
file_multi_part = files[filename]
files[filename] = (
file_multi_part[0],
file_multi_part[1].open("rb"),
file_multi_part[2],
)
# Add attachments as resources, cleaning the filename and replacing
# it in the index file for inclusion
for attachment in attachments:
# Clean the attachment name to be valid
name_cid = f"cid:{attachment.content_id}"
name_clean = "".join(e for e in name_cid if e.isalnum())
response = httpx.post(
url,
files=files,
data=data,
timeout=settings.CELERY_TASK_TIME_LIMIT,
)
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(f"Error while converting document to PDF: {err}") from err
finally:
# Ensure all file handles are closed
for filename in files:
file_multi_part_handle = files[filename][1]
file_multi_part_handle.close()
# Write attachment payload to a temp file
temp_file = tempdir / name_clean
temp_file.write_bytes(attachment.payload)
route.resource(temp_file)
# Replace as needed the name with the clean name
html_clean = html_clean.replace(name_cid, name_clean)
# Now store the cleaned up HTML version
html_clean_file = tempdir / "index.html"
html_clean_file.write_text(html_clean)
# This is our index file, the main page basically
route.index(html_clean_file)
# Set page size, margins
route.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)).size(
PageSize(height=11.7, width=8.27),
).scale(1.0)
try:
response = route.run()
except Exception as err:
raise ParseError(
f"Error while converting document to PDF: {err}",
) from err
html_pdf = tempdir / "html.pdf"
html_pdf.write_bytes(response.content)

View File

@ -341,7 +341,7 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
)
parsed = self.parser.tika_parse(html)
self.assertEqual(expected_text, parsed.strip())
self.assertIn(self.parser.tika_server, str(self.httpx_mock.get_request().url))
self.assertIn("http://localhost:9998", str(self.httpx_mock.get_request().url))
def test_tika_parse_exception(self):
"""
@ -653,5 +653,5 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
self.assertEqual(
str(request.url),
self.parser.gotenberg_server + "/forms/chromium/convert/html",
"http://localhost:3000/forms/chromium/convert/html",
)

View File

@ -1,11 +1,14 @@
import os
import shutil
import subprocess
import tempfile
from pathlib import Path
from unittest import mock
import httpx
import pytest
from django.test import TestCase
from imagehash import average_hash
from pdfminer.high_level import extract_text
from PIL import Image
from documents.tests.utils import FileSystemAssertsMixin
@ -13,6 +16,29 @@ from documents.tests.utils import util_call_with_backoff
from paperless_mail.tests.test_parsers import BaseMailParserTestCase
def extract_text(pdf_path: Path) -> str:
    """
    Using pdftotext from poppler, extracts the text of a PDF into a
    temporary file, then reads the file contents and returns it.

    Args:
        pdf_path: Path of the PDF file to extract text from.

    Returns:
        The text pdftotext wrote for the PDF, decoded as UTF-8.

    Raises:
        FileNotFoundError: If no pdftotext executable is found on PATH.
        subprocess.CalledProcessError: If pdftotext exits with a non-zero
            status.
    """
    pdftotext = shutil.which("pdftotext")
    if pdftotext is None:
        # Fail with a clear message instead of handing None to subprocess
        raise FileNotFoundError("pdftotext executable was not found on PATH")

    # pdftotext is told to write UTF-8 below, so the file must be read back
    # as UTF-8 rather than with the locale's default encoding
    with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as tmp:
        subprocess.run(
            [
                pdftotext,
                "-q",
                "-layout",
                "-enc",
                "UTF-8",
                str(pdf_path),
                tmp.name,
            ],
            check=True,
        )
        return tmp.read()
class MailAttachmentMock:
def __init__(self, payload, content_id):
self.payload = payload
@ -150,7 +176,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
extracted = extract_text(pdf_path)
expected = (
"first\tPDF\tto\tbe\tmerged.\n\n\x0csecond\tPDF\tto\tbe\tmerged.\n\n\x0c"
"first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c"
)
self.assertEqual(expected, extracted)

View File

@ -1,9 +1,10 @@
import os
from pathlib import Path
import httpx
from django.conf import settings
from django.utils import timezone
from gotenberg_client import GotenbergClient
from gotenberg_client.options import PdfAFormat
from tika_client import TikaClient
from documents.parsers import DocumentParser
@ -80,47 +81,33 @@ class TikaDocumentParser(DocumentParser):
self.archive_path = self.convert_to_pdf(document_path, file_name)
def convert_to_pdf(self, document_path, file_name):
pdf_path = os.path.join(self.tempdir, "convert.pdf")
gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
url = gotenberg_server + "/forms/libreoffice/convert"
def convert_to_pdf(self, document_path: Path, file_name):
pdf_path = Path(self.tempdir) / "convert.pdf"
self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
with open(document_path, "rb") as document_handle:
files = {
"files": (
"convert" + os.path.splitext(document_path)[-1],
document_handle,
),
}
headers = {}
data = {}
with GotenbergClient(
host=settings.TIKA_GOTENBERG_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
) as client, client.libre_office.to_pdf() as route:
# Set the output format of the resulting PDF
# Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno
if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
data["pdfFormat"] = "PDF/A-2b"
route.pdf_format(PdfAFormat.A2b)
elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
data["pdfFormat"] = "PDF/A-1a"
route.pdf_format(PdfAFormat.A1a)
elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
data["pdfFormat"] = "PDF/A-3b"
route.pdf_format(PdfAFormat.A3b)
route.convert(document_path)
try:
response = httpx.post(
url,
files=files,
headers=headers,
data=data,
timeout=settings.CELERY_TASK_TIME_LIMIT,
)
response.raise_for_status() # ensure we notice bad responses
response = route.run()
pdf_path.write_bytes(response.content)
return pdf_path
except Exception as err:
raise ParseError(
f"Error while converting document to PDF: {err}",
) from err
with open(pdf_path, "wb") as file:
file.write(response.content)
file.close()
return pdf_path

View File

@ -2,12 +2,11 @@ import datetime
import os
import zoneinfo
from pathlib import Path
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from httpx import Request
from httpx import Response
from httpx import codes
from httpx._multipart import DataField
from rest_framework import status
from documents.parsers import ParseError
@ -95,8 +94,7 @@ class TestTikaParser(HttpxMockMixin, TestCase):
with self.assertRaises(ParseError):
self.parser.convert_to_pdf(file, None)
@mock.patch("paperless_tika.parsers.httpx.post")
def test_request_pdf_a_format(self, post: mock.Mock):
def test_request_pdf_a_format(self):
"""
GIVEN:
- Document needs to be converted to PDF
@ -108,10 +106,6 @@ class TestTikaParser(HttpxMockMixin, TestCase):
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
response = Response(status_code=status.HTTP_200_OK)
response.request = Request("POST", "/somewhere/")
post.return_value = response
for setting, expected_key in [
("pdfa", "PDF/A-2b"),
("pdfa-2", "PDF/A-2b"),
@ -119,11 +113,20 @@ class TestTikaParser(HttpxMockMixin, TestCase):
("pdfa-3", "PDF/A-3b"),
]:
with override_settings(OCR_OUTPUT_TYPE=setting):
self.httpx_mock.add_response(
status_code=codes.OK,
content=b"PDF document",
method="POST",
)
self.parser.convert_to_pdf(file, None)
post.assert_called_once()
_, kwargs = post.call_args
request = self.httpx_mock.get_request()
found = False
for field in request.stream.fields:
if isinstance(field, DataField) and field.name == "pdfFormat":
self.assertEqual(field.value, expected_key)
found = True
self.assertTrue(found)
self.assertEqual(kwargs["data"]["pdfFormat"], expected_key)
post.reset_mock()
self.httpx_mock.reset(assert_all_responses_were_requested=False)

View File

@ -7,7 +7,7 @@ max-line-length = 88
[tool:pytest]
DJANGO_SETTINGS_MODULE=paperless.settings
addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --quiet --durations=50
addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50
env =
PAPERLESS_DISABLE_DBHANDLER=true