diff --git a/Pipfile b/Pipfile index af6f4e4fd..d8b66d719 100644 --- a/Pipfile +++ b/Pipfile @@ -51,6 +51,7 @@ flower = "*" bleach = "*" zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"} django-multiselectfield = "*" +gotenberg-client = "*" [dev-packages] # Linting diff --git a/Pipfile.lock b/Pipfile.lock index 8a469ca92..67cdc29b1 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "3025da2940433d347b2fd2ac222852c21f4aa73eeefbd1ee9152cbfd7a7a48e9" + "sha256": "505bd6b18d31ed64988ef307c12a5acb70f611cafd932a391e985a11bbbc8000" }, "pipfile-spec": 6, "requires": {}, @@ -539,6 +539,15 @@ "markers": "python_version >= '3.7'", "version": "==2.0.1" }, + "gotenberg-client": { + "hashes": [ + "sha256:4508ecb913ef2d553dd2ceb78e32cee001000ba08c910ba1f9ace38350d1589e", + "sha256:7a3f8a02caee768391373b3610c6ec25a853cccf391ed6b5d5a1292c3ed15e7e" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==0.3.0" + }, "gunicorn": { "hashes": [ "sha256:3213aa5e8c24949e792bcacfc176fef362e7aac80b76c56f6b5122bf350722f0", @@ -556,6 +565,13 @@ "markers": "python_version >= '3.7'", "version": "==0.14.0" }, + "h2": { + "hashes": [ + "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d", + "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb" + ], + "version": "==4.1.0" + }, "hiredis": { "hashes": [ "sha256:071c5814b850574036506a8118034f97c3cbf2fe9947ff45a27b07a48da56240", @@ -650,6 +666,14 @@ ], "version": "==2.2.3" }, + "hpack": { + "hashes": [ + "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c", + "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095" + ], + "markers": "python_full_version >= '3.6.1'", + "version": "==4.0.0" + }, "httpcore": { "hashes": [ "sha256:13b5e5cd1dca1a6636a6aaea212b19f4f85cd88c366a2b82304181b769aab3c9", @@ -699,6 +723,9 @@ "version": "==0.6.0" }, "httpx": { + "extras": [ + "http2" + ], "hashes": [ "sha256:181ea7f8ba3a82578be86ef4171554dd45fec26a02556a744db029a0a27b7100", "sha256:47ecda285389cb32bb2691cc6e069e3ab0205956f681c5b2ad2325719751d875" @@ -714,6 +741,14 @@ "markers": "python_version >= '3.8'", "version": "==4.8.0" }, + "hyperframe": { + "hashes": [ + "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15", + "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914" + ], + "markers": "python_full_version >= '3.6.1'", + "version": "==6.0.1" + }, "idna": { "hashes": [ "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", @@ -1782,7 +1817,7 @@ "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0", "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef" ], - "markers": "python_version < '3.11'", + "markers": "python_version < '3.10'", "version": "==4.8.0" }, "tzdata": { diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index bcfdd5b3d..4b3e5686e 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -1,13 +1,17 @@ import re from html import escape from pathlib import Path +from typing import Optional -import httpx from bleach import clean from bleach import linkify from django.conf import settings from django.utils.timezone import is_naive from django.utils.timezone import make_aware +from gotenberg_client import GotenbergClient +from gotenberg_client.options import Margin +from gotenberg_client.options import PageSize +from gotenberg_client.options import PdfAFormat from humanize import naturalsize from imap_tools import MailAttachment from imap_tools import MailMessage @@ -24,11 +28,22 @@ class MailDocumentParser(DocumentParser): Gotenberg and sends the html part to a Tika server for text extraction. """ - gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT - tika_server = settings.TIKA_ENDPOINT - logging_name = "paperless.parsing.mail" + @staticmethod + def _settings_to_gotenberg_pdfa() -> Optional[PdfAFormat]: + """ + Converts our requested PDF/A output into the Gotenberg API + format + """ + if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: + return PdfAFormat.A2b + elif settings.OCR_OUTPUT_TYPE == "pdfa-1": # pragma: no cover + return PdfAFormat.A1a + elif settings.OCR_OUTPUT_TYPE == "pdfa-3": # pragma: no cover + return PdfAFormat.A3b + return None + def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None): if not self.archive_path: self.archive_path = self.generate_pdf( @@ -173,7 +188,7 @@ class MailDocumentParser(DocumentParser): self.log.info("Sending content to Tika server") try: - with TikaClient(tika_url=self.tika_server) as client: + with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: parsed = client.tika.as_text.from_buffer(html, "text/html") if parsed.content is not None: @@ -182,7 +197,7 @@ class MailDocumentParser(DocumentParser): except Exception as err: raise ParseError( f"Could not parse content with tika server at " - f"{self.tika_server}: {err}", + f"{settings.TIKA_ENDPOINT}: {err}", ) from err def generate_pdf(self, mail_message: MailMessage) -> Path: @@ -195,45 +210,29 @@ class MailDocumentParser(DocumentParser): if not mail_message.html: archive_path.write_bytes(mail_pdf_file.read_bytes()) else: - url_merge = self.gotenberg_server + "/forms/pdfengines/merge" - pdf_of_html_content = self.generate_pdf_from_html( mail_message.html, mail_message.attachments, ) - pdf_collection = { - "1_mail.pdf": ("1_mail.pdf", mail_pdf_file, "application/pdf"), - "2_html.pdf": ("2_html.pdf", pdf_of_html_content, "application/pdf"), - } + with GotenbergClient( + host=settings.TIKA_GOTENBERG_ENDPOINT, + timeout=settings.CELERY_TASK_TIME_LIMIT, + ) as client, client.merge.merge() as route: + # Configure requested PDF/A formatting, if any + pdf_a_format = self._settings_to_gotenberg_pdfa() + if pdf_a_format is not None: + route.pdf_format(pdf_a_format) - try: - # Open a handle to each file, replacing the tuple - for filename in pdf_collection: - file_multi_part = pdf_collection[filename] - pdf_collection[filename] = ( - file_multi_part[0], - file_multi_part[1].open("rb"), - file_multi_part[2], - ) + route.merge([mail_pdf_file, pdf_of_html_content]) - response = httpx.post( - url_merge, - files=pdf_collection, - timeout=settings.CELERY_TASK_TIME_LIMIT, - ) - response.raise_for_status() # ensure we notice bad responses - - archive_path.write_bytes(response.content) - - except Exception as err: - raise ParseError( - f"Error while merging email HTML into PDF: {err}", - ) from err - finally: - for filename in pdf_collection: - file_multi_part_handle = pdf_collection[filename][1] - file_multi_part_handle.close() + try: + response = route.run() + archive_path.write_bytes(response.content) + except Exception as err: + raise ParseError( + f"Error while merging email HTML into PDF: {err}", + ) from err return archive_path @@ -299,48 +298,29 @@ class MailDocumentParser(DocumentParser): Creates a PDF based on the given email, using the email's values in a an HTML template """ - url = self.gotenberg_server + "/forms/chromium/convert/html" self.log.info("Converting mail to PDF") css_file = Path(__file__).parent / "templates" / "output.css" email_html_file = self.mail_to_html(mail) - with css_file.open("rb") as css_handle, email_html_file.open( - "rb", - ) as email_html_handle: - files = { - "html": ("index.html", email_html_handle, "text/html"), - "css": ("output.css", css_handle, "text/css"), - } - headers = {} - data = { - "marginTop": "0.1", - "marginBottom": "0.1", - "marginLeft": "0.1", - "marginRight": "0.1", - "paperWidth": "8.27", - "paperHeight": "11.7", - "scale": "1.0", - } - - # Set the output format of the resulting PDF - # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno - if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: - data["pdfFormat"] = "PDF/A-2b" - elif settings.OCR_OUTPUT_TYPE == "pdfa-1": - data["pdfFormat"] = "PDF/A-1a" - elif settings.OCR_OUTPUT_TYPE == "pdfa-3": - data["pdfFormat"] = "PDF/A-3b" + with GotenbergClient( + host=settings.TIKA_GOTENBERG_ENDPOINT, + timeout=settings.CELERY_TASK_TIME_LIMIT, + ) as client, client.chromium.html_to_pdf() as route: + # Configure requested PDF/A formatting, if any + pdf_a_format = self._settings_to_gotenberg_pdfa() + if pdf_a_format is not None: + route.pdf_format(pdf_a_format) try: - response = httpx.post( - url, - files=files, - headers=headers, - data=data, - timeout=settings.CELERY_TASK_TIME_LIMIT, + response = ( + route.index(email_html_file) + .resource(css_file) + .margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)) + .size(PageSize(height=11.7, width=8.27)) + .scale(1.0) + .run() ) - response.raise_for_status() # ensure we notice bad responses except Exception as err: raise ParseError( f"Error while converting email to PDF: {err}", @@ -368,69 +348,57 @@ class MailDocumentParser(DocumentParser): text = compiled_close.sub(" str: + """ + Using pdftotext from poppler, extracts the text of a PDF into a file, + then reads the file contents and returns it + """ + with tempfile.NamedTemporaryFile( + mode="w+", + ) as tmp: + subprocess.run( + [ + shutil.which("pdftotext"), + "-q", + "-layout", + "-enc", + "UTF-8", + str(pdf_path), + tmp.name, + ], + check=True, + ) + return tmp.read() + + class MailAttachmentMock: def __init__(self, payload, content_id): self.payload = payload @@ -150,7 +176,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): extracted = extract_text(pdf_path) expected = ( - "first\tPDF\tto\tbe\tmerged.\n\n\x0csecond\tPDF\tto\tbe\tmerged.\n\n\x0c" + "first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c" ) self.assertEqual(expected, extracted) diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index c410594bb..c9056d90d 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -1,9 +1,10 @@ -import os from pathlib import Path import httpx from django.conf import settings from django.utils import timezone +from gotenberg_client import GotenbergClient +from gotenberg_client.options import PdfAFormat from tika_client import TikaClient from documents.parsers import DocumentParser @@ -80,47 +81,33 @@ class TikaDocumentParser(DocumentParser): self.archive_path = self.convert_to_pdf(document_path, file_name) - def convert_to_pdf(self, document_path, file_name): - pdf_path = os.path.join(self.tempdir, "convert.pdf") - gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT - url = gotenberg_server + "/forms/libreoffice/convert" + def convert_to_pdf(self, document_path: Path, file_name): + pdf_path = Path(self.tempdir) / "convert.pdf" self.log.info(f"Converting {document_path} to PDF as {pdf_path}") - with open(document_path, "rb") as document_handle: - files = { - "files": ( - "convert" + os.path.splitext(document_path)[-1], - document_handle, - ), - } - headers = {} - data = {} + with GotenbergClient( + host=settings.TIKA_GOTENBERG_ENDPOINT, + timeout=settings.CELERY_TASK_TIME_LIMIT, + ) as client, client.libre_office.to_pdf() as route: # Set the output format of the resulting PDF - # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: - data["pdfFormat"] = "PDF/A-2b" + route.pdf_format(PdfAFormat.A2b) elif settings.OCR_OUTPUT_TYPE == "pdfa-1": - data["pdfFormat"] = "PDF/A-1a" + route.pdf_format(PdfAFormat.A1a) elif settings.OCR_OUTPUT_TYPE == "pdfa-3": - data["pdfFormat"] = "PDF/A-3b" + route.pdf_format(PdfAFormat.A3b) + + route.convert(document_path) try: - response = httpx.post( - url, - files=files, - headers=headers, - data=data, - timeout=settings.CELERY_TASK_TIME_LIMIT, - ) - response.raise_for_status() # ensure we notice bad responses + response = route.run() + + pdf_path.write_bytes(response.content) + + return pdf_path + except Exception as err: raise ParseError( f"Error while converting document to PDF: {err}", ) from err - - with open(pdf_path, "wb") as file: - file.write(response.content) - file.close() - - return pdf_path diff --git a/src/paperless_tika/tests/test_tika_parser.py b/src/paperless_tika/tests/test_tika_parser.py index f693aa4e7..81d6f026a 100644 --- a/src/paperless_tika/tests/test_tika_parser.py +++ b/src/paperless_tika/tests/test_tika_parser.py @@ -2,12 +2,11 @@ import datetime import os import zoneinfo from pathlib import Path -from unittest import mock from django.test import TestCase from django.test import override_settings -from httpx import Request -from httpx import Response +from httpx import codes +from httpx._multipart import DataField from rest_framework import status from documents.parsers import ParseError @@ -95,8 +94,7 @@ class TestTikaParser(HttpxMockMixin, TestCase): with self.assertRaises(ParseError): self.parser.convert_to_pdf(file, None) - @mock.patch("paperless_tika.parsers.httpx.post") - def test_request_pdf_a_format(self, post: mock.Mock): + def test_request_pdf_a_format(self): """ GIVEN: - Document needs to be converted to PDF @@ -108,10 +106,6 @@ class TestTikaParser(HttpxMockMixin, TestCase): file = Path(os.path.join(self.parser.tempdir, "input.odt")) file.touch() - response = Response(status_code=status.HTTP_200_OK) - response.request = Request("POST", "/somewhere/") - post.return_value = response - for setting, expected_key in [ ("pdfa", "PDF/A-2b"), ("pdfa-2", "PDF/A-2b"), @@ -119,11 +113,20 @@ class TestTikaParser(HttpxMockMixin, TestCase): ("pdfa-3", "PDF/A-3b"), ]: with override_settings(OCR_OUTPUT_TYPE=setting): + self.httpx_mock.add_response( + status_code=codes.OK, + content=b"PDF document", + method="POST", + ) + self.parser.convert_to_pdf(file, None) - post.assert_called_once() - _, kwargs = post.call_args + request = self.httpx_mock.get_request() + found = False + for field in request.stream.fields: + if isinstance(field, DataField) and field.name == "pdfFormat": + self.assertEqual(field.value, expected_key) + found = True + self.assertTrue(found) - self.assertEqual(kwargs["data"]["pdfFormat"], expected_key) - - post.reset_mock() + self.httpx_mock.reset(assert_all_responses_were_requested=False) diff --git a/src/setup.cfg b/src/setup.cfg index fb6ecf315..e2e5cf8ea 100644 --- a/src/setup.cfg +++ b/src/setup.cfg @@ -7,7 +7,7 @@ max-line-length = 88 [tool:pytest] DJANGO_SETTINGS_MODULE=paperless.settings -addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --quiet --durations=50 +addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50 env = PAPERLESS_DISABLE_DBHANDLER=true