mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Feature: Switches to a new client to handle communication with Gotenberg (#4391)
Switches to a new client to handle communication with Gotenberg for merging and generating PDFs
This commit is contained in:
@@ -1,9 +1,10 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from gotenberg_client import GotenbergClient
|
||||
from gotenberg_client.options import PdfAFormat
|
||||
from tika_client import TikaClient
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
@@ -80,47 +81,33 @@ class TikaDocumentParser(DocumentParser):
|
||||
|
||||
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
||||
|
||||
def convert_to_pdf(self, document_path, file_name):
|
||||
pdf_path = os.path.join(self.tempdir, "convert.pdf")
|
||||
gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
|
||||
url = gotenberg_server + "/forms/libreoffice/convert"
|
||||
def convert_to_pdf(self, document_path: Path, file_name):
|
||||
pdf_path = Path(self.tempdir) / "convert.pdf"
|
||||
|
||||
self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
|
||||
with open(document_path, "rb") as document_handle:
|
||||
files = {
|
||||
"files": (
|
||||
"convert" + os.path.splitext(document_path)[-1],
|
||||
document_handle,
|
||||
),
|
||||
}
|
||||
headers = {}
|
||||
data = {}
|
||||
|
||||
with GotenbergClient(
|
||||
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
) as client, client.libre_office.to_pdf() as route:
|
||||
# Set the output format of the resulting PDF
|
||||
# Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno
|
||||
if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
|
||||
data["pdfFormat"] = "PDF/A-2b"
|
||||
route.pdf_format(PdfAFormat.A2b)
|
||||
elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
|
||||
data["pdfFormat"] = "PDF/A-1a"
|
||||
route.pdf_format(PdfAFormat.A1a)
|
||||
elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
|
||||
data["pdfFormat"] = "PDF/A-3b"
|
||||
route.pdf_format(PdfAFormat.A3b)
|
||||
|
||||
route.convert(document_path)
|
||||
|
||||
try:
|
||||
response = httpx.post(
|
||||
url,
|
||||
files=files,
|
||||
headers=headers,
|
||||
data=data,
|
||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||
)
|
||||
response.raise_for_status() # ensure we notice bad responses
|
||||
response = route.run()
|
||||
|
||||
pdf_path.write_bytes(response.content)
|
||||
|
||||
return pdf_path
|
||||
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Error while converting document to PDF: {err}",
|
||||
) from err
|
||||
|
||||
with open(pdf_path, "wb") as file:
|
||||
file.write(response.content)
|
||||
file.close()
|
||||
|
||||
return pdf_path
|
||||
|
@@ -2,12 +2,11 @@ import datetime
|
||||
import os
|
||||
import zoneinfo
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
from httpx import Request
|
||||
from httpx import Response
|
||||
from httpx import codes
|
||||
from httpx._multipart import DataField
|
||||
from rest_framework import status
|
||||
|
||||
from documents.parsers import ParseError
|
||||
@@ -95,8 +94,7 @@ class TestTikaParser(HttpxMockMixin, TestCase):
|
||||
with self.assertRaises(ParseError):
|
||||
self.parser.convert_to_pdf(file, None)
|
||||
|
||||
@mock.patch("paperless_tika.parsers.httpx.post")
|
||||
def test_request_pdf_a_format(self, post: mock.Mock):
|
||||
def test_request_pdf_a_format(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Document needs to be converted to PDF
|
||||
@@ -108,10 +106,6 @@ class TestTikaParser(HttpxMockMixin, TestCase):
|
||||
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
|
||||
file.touch()
|
||||
|
||||
response = Response(status_code=status.HTTP_200_OK)
|
||||
response.request = Request("POST", "/somewhere/")
|
||||
post.return_value = response
|
||||
|
||||
for setting, expected_key in [
|
||||
("pdfa", "PDF/A-2b"),
|
||||
("pdfa-2", "PDF/A-2b"),
|
||||
@@ -119,11 +113,20 @@ class TestTikaParser(HttpxMockMixin, TestCase):
|
||||
("pdfa-3", "PDF/A-3b"),
|
||||
]:
|
||||
with override_settings(OCR_OUTPUT_TYPE=setting):
|
||||
self.httpx_mock.add_response(
|
||||
status_code=codes.OK,
|
||||
content=b"PDF document",
|
||||
method="POST",
|
||||
)
|
||||
|
||||
self.parser.convert_to_pdf(file, None)
|
||||
|
||||
post.assert_called_once()
|
||||
_, kwargs = post.call_args
|
||||
request = self.httpx_mock.get_request()
|
||||
found = False
|
||||
for field in request.stream.fields:
|
||||
if isinstance(field, DataField) and field.name == "pdfFormat":
|
||||
self.assertEqual(field.value, expected_key)
|
||||
found = True
|
||||
self.assertTrue(found)
|
||||
|
||||
self.assertEqual(kwargs["data"]["pdfFormat"], expected_key)
|
||||
|
||||
post.reset_mock()
|
||||
self.httpx_mock.reset(assert_all_responses_were_requested=False)
|
||||
|
Reference in New Issue
Block a user