mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Feature: Switches to a new client to handle communication with Gotenberg (#4391)
Switches to a new client to handle communication with Gotenberg for merging and generating PDFs
This commit is contained in:
parent
5f0eba694c
commit
999ae678c2
1
Pipfile
1
Pipfile
@ -51,6 +51,7 @@ flower = "*"
|
|||||||
bleach = "*"
|
bleach = "*"
|
||||||
zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"}
|
zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"}
|
||||||
django-multiselectfield = "*"
|
django-multiselectfield = "*"
|
||||||
|
gotenberg-client = "*"
|
||||||
|
|
||||||
[dev-packages]
|
[dev-packages]
|
||||||
# Linting
|
# Linting
|
||||||
|
39
Pipfile.lock
generated
39
Pipfile.lock
generated
@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"_meta": {
|
"_meta": {
|
||||||
"hash": {
|
"hash": {
|
||||||
"sha256": "3025da2940433d347b2fd2ac222852c21f4aa73eeefbd1ee9152cbfd7a7a48e9"
|
"sha256": "505bd6b18d31ed64988ef307c12a5acb70f611cafd932a391e985a11bbbc8000"
|
||||||
},
|
},
|
||||||
"pipfile-spec": 6,
|
"pipfile-spec": 6,
|
||||||
"requires": {},
|
"requires": {},
|
||||||
@ -539,6 +539,15 @@
|
|||||||
"markers": "python_version >= '3.7'",
|
"markers": "python_version >= '3.7'",
|
||||||
"version": "==2.0.1"
|
"version": "==2.0.1"
|
||||||
},
|
},
|
||||||
|
"gotenberg-client": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:4508ecb913ef2d553dd2ceb78e32cee001000ba08c910ba1f9ace38350d1589e",
|
||||||
|
"sha256:7a3f8a02caee768391373b3610c6ec25a853cccf391ed6b5d5a1292c3ed15e7e"
|
||||||
|
],
|
||||||
|
"index": "pypi",
|
||||||
|
"markers": "python_version >= '3.8'",
|
||||||
|
"version": "==0.3.0"
|
||||||
|
},
|
||||||
"gunicorn": {
|
"gunicorn": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:3213aa5e8c24949e792bcacfc176fef362e7aac80b76c56f6b5122bf350722f0",
|
"sha256:3213aa5e8c24949e792bcacfc176fef362e7aac80b76c56f6b5122bf350722f0",
|
||||||
@ -556,6 +565,13 @@
|
|||||||
"markers": "python_version >= '3.7'",
|
"markers": "python_version >= '3.7'",
|
||||||
"version": "==0.14.0"
|
"version": "==0.14.0"
|
||||||
},
|
},
|
||||||
|
"h2": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d",
|
||||||
|
"sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"
|
||||||
|
],
|
||||||
|
"version": "==4.1.0"
|
||||||
|
},
|
||||||
"hiredis": {
|
"hiredis": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:071c5814b850574036506a8118034f97c3cbf2fe9947ff45a27b07a48da56240",
|
"sha256:071c5814b850574036506a8118034f97c3cbf2fe9947ff45a27b07a48da56240",
|
||||||
@ -650,6 +666,14 @@
|
|||||||
],
|
],
|
||||||
"version": "==2.2.3"
|
"version": "==2.2.3"
|
||||||
},
|
},
|
||||||
|
"hpack": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c",
|
||||||
|
"sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"
|
||||||
|
],
|
||||||
|
"markers": "python_full_version >= '3.6.1'",
|
||||||
|
"version": "==4.0.0"
|
||||||
|
},
|
||||||
"httpcore": {
|
"httpcore": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:13b5e5cd1dca1a6636a6aaea212b19f4f85cd88c366a2b82304181b769aab3c9",
|
"sha256:13b5e5cd1dca1a6636a6aaea212b19f4f85cd88c366a2b82304181b769aab3c9",
|
||||||
@ -699,6 +723,9 @@
|
|||||||
"version": "==0.6.0"
|
"version": "==0.6.0"
|
||||||
},
|
},
|
||||||
"httpx": {
|
"httpx": {
|
||||||
|
"extras": [
|
||||||
|
"http2"
|
||||||
|
],
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:181ea7f8ba3a82578be86ef4171554dd45fec26a02556a744db029a0a27b7100",
|
"sha256:181ea7f8ba3a82578be86ef4171554dd45fec26a02556a744db029a0a27b7100",
|
||||||
"sha256:47ecda285389cb32bb2691cc6e069e3ab0205956f681c5b2ad2325719751d875"
|
"sha256:47ecda285389cb32bb2691cc6e069e3ab0205956f681c5b2ad2325719751d875"
|
||||||
@ -714,6 +741,14 @@
|
|||||||
"markers": "python_version >= '3.8'",
|
"markers": "python_version >= '3.8'",
|
||||||
"version": "==4.8.0"
|
"version": "==4.8.0"
|
||||||
},
|
},
|
||||||
|
"hyperframe": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15",
|
||||||
|
"sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"
|
||||||
|
],
|
||||||
|
"markers": "python_full_version >= '3.6.1'",
|
||||||
|
"version": "==6.0.1"
|
||||||
|
},
|
||||||
"idna": {
|
"idna": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4",
|
"sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4",
|
||||||
@ -1782,7 +1817,7 @@
|
|||||||
"sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0",
|
"sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0",
|
||||||
"sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"
|
"sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"
|
||||||
],
|
],
|
||||||
"markers": "python_version < '3.11'",
|
"markers": "python_version < '3.10'",
|
||||||
"version": "==4.8.0"
|
"version": "==4.8.0"
|
||||||
},
|
},
|
||||||
"tzdata": {
|
"tzdata": {
|
||||||
|
@ -1,13 +1,17 @@
|
|||||||
import re
|
import re
|
||||||
from html import escape
|
from html import escape
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import httpx
|
|
||||||
from bleach import clean
|
from bleach import clean
|
||||||
from bleach import linkify
|
from bleach import linkify
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.utils.timezone import is_naive
|
from django.utils.timezone import is_naive
|
||||||
from django.utils.timezone import make_aware
|
from django.utils.timezone import make_aware
|
||||||
|
from gotenberg_client import GotenbergClient
|
||||||
|
from gotenberg_client.options import Margin
|
||||||
|
from gotenberg_client.options import PageSize
|
||||||
|
from gotenberg_client.options import PdfAFormat
|
||||||
from humanize import naturalsize
|
from humanize import naturalsize
|
||||||
from imap_tools import MailAttachment
|
from imap_tools import MailAttachment
|
||||||
from imap_tools import MailMessage
|
from imap_tools import MailMessage
|
||||||
@ -24,11 +28,22 @@ class MailDocumentParser(DocumentParser):
|
|||||||
Gotenberg and sends the html part to a Tika server for text extraction.
|
Gotenberg and sends the html part to a Tika server for text extraction.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
|
|
||||||
tika_server = settings.TIKA_ENDPOINT
|
|
||||||
|
|
||||||
logging_name = "paperless.parsing.mail"
|
logging_name = "paperless.parsing.mail"
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _settings_to_gotenberg_pdfa() -> Optional[PdfAFormat]:
|
||||||
|
"""
|
||||||
|
Converts our requested PDF/A output into the Gotenberg API
|
||||||
|
format
|
||||||
|
"""
|
||||||
|
if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
|
||||||
|
return PdfAFormat.A2b
|
||||||
|
elif settings.OCR_OUTPUT_TYPE == "pdfa-1": # pragma: no cover
|
||||||
|
return PdfAFormat.A1a
|
||||||
|
elif settings.OCR_OUTPUT_TYPE == "pdfa-3": # pragma: no cover
|
||||||
|
return PdfAFormat.A3b
|
||||||
|
return None
|
||||||
|
|
||||||
def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None):
|
def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None):
|
||||||
if not self.archive_path:
|
if not self.archive_path:
|
||||||
self.archive_path = self.generate_pdf(
|
self.archive_path = self.generate_pdf(
|
||||||
@ -173,7 +188,7 @@ class MailDocumentParser(DocumentParser):
|
|||||||
self.log.info("Sending content to Tika server")
|
self.log.info("Sending content to Tika server")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with TikaClient(tika_url=self.tika_server) as client:
|
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
|
||||||
parsed = client.tika.as_text.from_buffer(html, "text/html")
|
parsed = client.tika.as_text.from_buffer(html, "text/html")
|
||||||
|
|
||||||
if parsed.content is not None:
|
if parsed.content is not None:
|
||||||
@ -182,7 +197,7 @@ class MailDocumentParser(DocumentParser):
|
|||||||
except Exception as err:
|
except Exception as err:
|
||||||
raise ParseError(
|
raise ParseError(
|
||||||
f"Could not parse content with tika server at "
|
f"Could not parse content with tika server at "
|
||||||
f"{self.tika_server}: {err}",
|
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||||
) from err
|
) from err
|
||||||
|
|
||||||
def generate_pdf(self, mail_message: MailMessage) -> Path:
|
def generate_pdf(self, mail_message: MailMessage) -> Path:
|
||||||
@ -195,45 +210,29 @@ class MailDocumentParser(DocumentParser):
|
|||||||
if not mail_message.html:
|
if not mail_message.html:
|
||||||
archive_path.write_bytes(mail_pdf_file.read_bytes())
|
archive_path.write_bytes(mail_pdf_file.read_bytes())
|
||||||
else:
|
else:
|
||||||
url_merge = self.gotenberg_server + "/forms/pdfengines/merge"
|
|
||||||
|
|
||||||
pdf_of_html_content = self.generate_pdf_from_html(
|
pdf_of_html_content = self.generate_pdf_from_html(
|
||||||
mail_message.html,
|
mail_message.html,
|
||||||
mail_message.attachments,
|
mail_message.attachments,
|
||||||
)
|
)
|
||||||
|
|
||||||
pdf_collection = {
|
with GotenbergClient(
|
||||||
"1_mail.pdf": ("1_mail.pdf", mail_pdf_file, "application/pdf"),
|
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||||
"2_html.pdf": ("2_html.pdf", pdf_of_html_content, "application/pdf"),
|
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||||
}
|
) as client, client.merge.merge() as route:
|
||||||
|
# Configure requested PDF/A formatting, if any
|
||||||
|
pdf_a_format = self._settings_to_gotenberg_pdfa()
|
||||||
|
if pdf_a_format is not None:
|
||||||
|
route.pdf_format(pdf_a_format)
|
||||||
|
|
||||||
try:
|
route.merge([mail_pdf_file, pdf_of_html_content])
|
||||||
# Open a handle to each file, replacing the tuple
|
|
||||||
for filename in pdf_collection:
|
|
||||||
file_multi_part = pdf_collection[filename]
|
|
||||||
pdf_collection[filename] = (
|
|
||||||
file_multi_part[0],
|
|
||||||
file_multi_part[1].open("rb"),
|
|
||||||
file_multi_part[2],
|
|
||||||
)
|
|
||||||
|
|
||||||
response = httpx.post(
|
try:
|
||||||
url_merge,
|
response = route.run()
|
||||||
files=pdf_collection,
|
archive_path.write_bytes(response.content)
|
||||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
except Exception as err:
|
||||||
)
|
raise ParseError(
|
||||||
response.raise_for_status() # ensure we notice bad responses
|
f"Error while merging email HTML into PDF: {err}",
|
||||||
|
) from err
|
||||||
archive_path.write_bytes(response.content)
|
|
||||||
|
|
||||||
except Exception as err:
|
|
||||||
raise ParseError(
|
|
||||||
f"Error while merging email HTML into PDF: {err}",
|
|
||||||
) from err
|
|
||||||
finally:
|
|
||||||
for filename in pdf_collection:
|
|
||||||
file_multi_part_handle = pdf_collection[filename][1]
|
|
||||||
file_multi_part_handle.close()
|
|
||||||
|
|
||||||
return archive_path
|
return archive_path
|
||||||
|
|
||||||
@ -299,48 +298,29 @@ class MailDocumentParser(DocumentParser):
|
|||||||
Creates a PDF based on the given email, using the email's values in a
|
Creates a PDF based on the given email, using the email's values in a
|
||||||
an HTML template
|
an HTML template
|
||||||
"""
|
"""
|
||||||
url = self.gotenberg_server + "/forms/chromium/convert/html"
|
|
||||||
self.log.info("Converting mail to PDF")
|
self.log.info("Converting mail to PDF")
|
||||||
|
|
||||||
css_file = Path(__file__).parent / "templates" / "output.css"
|
css_file = Path(__file__).parent / "templates" / "output.css"
|
||||||
email_html_file = self.mail_to_html(mail)
|
email_html_file = self.mail_to_html(mail)
|
||||||
|
|
||||||
with css_file.open("rb") as css_handle, email_html_file.open(
|
with GotenbergClient(
|
||||||
"rb",
|
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||||
) as email_html_handle:
|
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||||
files = {
|
) as client, client.chromium.html_to_pdf() as route:
|
||||||
"html": ("index.html", email_html_handle, "text/html"),
|
# Configure requested PDF/A formatting, if any
|
||||||
"css": ("output.css", css_handle, "text/css"),
|
pdf_a_format = self._settings_to_gotenberg_pdfa()
|
||||||
}
|
if pdf_a_format is not None:
|
||||||
headers = {}
|
route.pdf_format(pdf_a_format)
|
||||||
data = {
|
|
||||||
"marginTop": "0.1",
|
|
||||||
"marginBottom": "0.1",
|
|
||||||
"marginLeft": "0.1",
|
|
||||||
"marginRight": "0.1",
|
|
||||||
"paperWidth": "8.27",
|
|
||||||
"paperHeight": "11.7",
|
|
||||||
"scale": "1.0",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Set the output format of the resulting PDF
|
|
||||||
# Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno
|
|
||||||
if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
|
|
||||||
data["pdfFormat"] = "PDF/A-2b"
|
|
||||||
elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
|
|
||||||
data["pdfFormat"] = "PDF/A-1a"
|
|
||||||
elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
|
|
||||||
data["pdfFormat"] = "PDF/A-3b"
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = httpx.post(
|
response = (
|
||||||
url,
|
route.index(email_html_file)
|
||||||
files=files,
|
.resource(css_file)
|
||||||
headers=headers,
|
.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1))
|
||||||
data=data,
|
.size(PageSize(height=11.7, width=8.27))
|
||||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
.scale(1.0)
|
||||||
|
.run()
|
||||||
)
|
)
|
||||||
response.raise_for_status() # ensure we notice bad responses
|
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
raise ParseError(
|
raise ParseError(
|
||||||
f"Error while converting email to PDF: {err}",
|
f"Error while converting email to PDF: {err}",
|
||||||
@ -368,69 +348,57 @@ class MailDocumentParser(DocumentParser):
|
|||||||
text = compiled_close.sub("</div", text)
|
text = compiled_close.sub("</div", text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
url = self.gotenberg_server + "/forms/chromium/convert/html"
|
|
||||||
self.log.info("Converting html to PDF")
|
self.log.info("Converting html to PDF")
|
||||||
|
|
||||||
tempdir = Path(self.tempdir)
|
tempdir = Path(self.tempdir)
|
||||||
|
|
||||||
html_clean = clean_html_script(orig_html)
|
html_clean = clean_html_script(orig_html)
|
||||||
|
|
||||||
files = {}
|
|
||||||
|
|
||||||
for attachment in attachments:
|
|
||||||
# Clean the attachment name to be valid
|
|
||||||
name_cid = f"cid:{attachment.content_id}"
|
|
||||||
name_clean = "".join(e for e in name_cid if e.isalnum())
|
|
||||||
|
|
||||||
# Write attachment payload to a temp file
|
|
||||||
temp_file = tempdir / name_clean
|
|
||||||
temp_file.write_bytes(attachment.payload)
|
|
||||||
|
|
||||||
# Store the attachment for upload
|
|
||||||
files[name_clean] = (name_clean, temp_file, attachment.content_type)
|
|
||||||
|
|
||||||
# Replace as needed the name with the clean name
|
|
||||||
html_clean = html_clean.replace(name_cid, name_clean)
|
|
||||||
|
|
||||||
# Now store the cleaned up HTML version
|
|
||||||
html_clean_file = tempdir / "index.html"
|
html_clean_file = tempdir / "index.html"
|
||||||
html_clean_file.write_text(html_clean)
|
html_clean_file.write_text(html_clean)
|
||||||
|
|
||||||
files["index.html"] = ("index.html", html_clean_file, "text/html")
|
with GotenbergClient(
|
||||||
|
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||||
|
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||||
|
) as client, client.chromium.html_to_pdf() as route:
|
||||||
|
# Configure requested PDF/A formatting, if any
|
||||||
|
pdf_a_format = self._settings_to_gotenberg_pdfa()
|
||||||
|
if pdf_a_format is not None:
|
||||||
|
route.pdf_format(pdf_a_format)
|
||||||
|
|
||||||
data = {
|
# Add attachments as resources, cleaning the filename and replacing
|
||||||
"marginTop": "0.1",
|
# it in the index file for inclusion
|
||||||
"marginBottom": "0.1",
|
for attachment in attachments:
|
||||||
"marginLeft": "0.1",
|
# Clean the attachment name to be valid
|
||||||
"marginRight": "0.1",
|
name_cid = f"cid:{attachment.content_id}"
|
||||||
"paperWidth": "8.27",
|
name_clean = "".join(e for e in name_cid if e.isalnum())
|
||||||
"paperHeight": "11.7",
|
|
||||||
"scale": "1.0",
|
|
||||||
}
|
|
||||||
try:
|
|
||||||
# Open a handle to each file, replacing the tuple
|
|
||||||
for filename in files:
|
|
||||||
file_multi_part = files[filename]
|
|
||||||
files[filename] = (
|
|
||||||
file_multi_part[0],
|
|
||||||
file_multi_part[1].open("rb"),
|
|
||||||
file_multi_part[2],
|
|
||||||
)
|
|
||||||
|
|
||||||
response = httpx.post(
|
# Write attachment payload to a temp file
|
||||||
url,
|
temp_file = tempdir / name_clean
|
||||||
files=files,
|
temp_file.write_bytes(attachment.payload)
|
||||||
data=data,
|
|
||||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
route.resource(temp_file)
|
||||||
)
|
|
||||||
response.raise_for_status() # ensure we notice bad responses
|
# Replace as needed the name with the clean name
|
||||||
except Exception as err:
|
html_clean = html_clean.replace(name_cid, name_clean)
|
||||||
raise ParseError(f"Error while converting document to PDF: {err}") from err
|
|
||||||
finally:
|
# Now store the cleaned up HTML version
|
||||||
# Ensure all file handles as closed
|
html_clean_file = tempdir / "index.html"
|
||||||
for filename in files:
|
html_clean_file.write_text(html_clean)
|
||||||
file_multi_part_handle = files[filename][1]
|
# This is our index file, the main page basically
|
||||||
file_multi_part_handle.close()
|
route.index(html_clean_file)
|
||||||
|
|
||||||
|
# Set page size, margins
|
||||||
|
route.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)).size(
|
||||||
|
PageSize(height=11.7, width=8.27),
|
||||||
|
).scale(1.0)
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = route.run()
|
||||||
|
|
||||||
|
except Exception as err:
|
||||||
|
raise ParseError(
|
||||||
|
f"Error while converting document to PDF: {err}",
|
||||||
|
) from err
|
||||||
|
|
||||||
html_pdf = tempdir / "html.pdf"
|
html_pdf = tempdir / "html.pdf"
|
||||||
html_pdf.write_bytes(response.content)
|
html_pdf.write_bytes(response.content)
|
||||||
|
@ -341,7 +341,7 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
|
|||||||
)
|
)
|
||||||
parsed = self.parser.tika_parse(html)
|
parsed = self.parser.tika_parse(html)
|
||||||
self.assertEqual(expected_text, parsed.strip())
|
self.assertEqual(expected_text, parsed.strip())
|
||||||
self.assertIn(self.parser.tika_server, str(self.httpx_mock.get_request().url))
|
self.assertIn("http://localhost:9998", str(self.httpx_mock.get_request().url))
|
||||||
|
|
||||||
def test_tika_parse_exception(self):
|
def test_tika_parse_exception(self):
|
||||||
"""
|
"""
|
||||||
@ -653,5 +653,5 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
|
|||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
str(request.url),
|
str(request.url),
|
||||||
self.parser.gotenberg_server + "/forms/chromium/convert/html",
|
"http://localhost:3000/forms/chromium/convert/html",
|
||||||
)
|
)
|
||||||
|
@ -1,11 +1,14 @@
|
|||||||
import os
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
import pytest
|
import pytest
|
||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
from imagehash import average_hash
|
from imagehash import average_hash
|
||||||
from pdfminer.high_level import extract_text
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from documents.tests.utils import FileSystemAssertsMixin
|
from documents.tests.utils import FileSystemAssertsMixin
|
||||||
@ -13,6 +16,29 @@ from documents.tests.utils import util_call_with_backoff
|
|||||||
from paperless_mail.tests.test_parsers import BaseMailParserTestCase
|
from paperless_mail.tests.test_parsers import BaseMailParserTestCase
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text(pdf_path: Path) -> str:
|
||||||
|
"""
|
||||||
|
Using pdftotext from poppler, extracts the text of a PDF into a file,
|
||||||
|
then reads the file contents and returns it
|
||||||
|
"""
|
||||||
|
with tempfile.NamedTemporaryFile(
|
||||||
|
mode="w+",
|
||||||
|
) as tmp:
|
||||||
|
subprocess.run(
|
||||||
|
[
|
||||||
|
shutil.which("pdftotext"),
|
||||||
|
"-q",
|
||||||
|
"-layout",
|
||||||
|
"-enc",
|
||||||
|
"UTF-8",
|
||||||
|
str(pdf_path),
|
||||||
|
tmp.name,
|
||||||
|
],
|
||||||
|
check=True,
|
||||||
|
)
|
||||||
|
return tmp.read()
|
||||||
|
|
||||||
|
|
||||||
class MailAttachmentMock:
|
class MailAttachmentMock:
|
||||||
def __init__(self, payload, content_id):
|
def __init__(self, payload, content_id):
|
||||||
self.payload = payload
|
self.payload = payload
|
||||||
@ -150,7 +176,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
|
|||||||
|
|
||||||
extracted = extract_text(pdf_path)
|
extracted = extract_text(pdf_path)
|
||||||
expected = (
|
expected = (
|
||||||
"first\tPDF\tto\tbe\tmerged.\n\n\x0csecond\tPDF\tto\tbe\tmerged.\n\n\x0c"
|
"first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(expected, extracted)
|
self.assertEqual(expected, extracted)
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
import os
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
from gotenberg_client import GotenbergClient
|
||||||
|
from gotenberg_client.options import PdfAFormat
|
||||||
from tika_client import TikaClient
|
from tika_client import TikaClient
|
||||||
|
|
||||||
from documents.parsers import DocumentParser
|
from documents.parsers import DocumentParser
|
||||||
@ -80,47 +81,33 @@ class TikaDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
||||||
|
|
||||||
def convert_to_pdf(self, document_path, file_name):
|
def convert_to_pdf(self, document_path: Path, file_name):
|
||||||
pdf_path = os.path.join(self.tempdir, "convert.pdf")
|
pdf_path = Path(self.tempdir) / "convert.pdf"
|
||||||
gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
|
|
||||||
url = gotenberg_server + "/forms/libreoffice/convert"
|
|
||||||
|
|
||||||
self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
|
self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
|
||||||
with open(document_path, "rb") as document_handle:
|
|
||||||
files = {
|
|
||||||
"files": (
|
|
||||||
"convert" + os.path.splitext(document_path)[-1],
|
|
||||||
document_handle,
|
|
||||||
),
|
|
||||||
}
|
|
||||||
headers = {}
|
|
||||||
data = {}
|
|
||||||
|
|
||||||
|
with GotenbergClient(
|
||||||
|
host=settings.TIKA_GOTENBERG_ENDPOINT,
|
||||||
|
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
||||||
|
) as client, client.libre_office.to_pdf() as route:
|
||||||
# Set the output format of the resulting PDF
|
# Set the output format of the resulting PDF
|
||||||
# Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno
|
|
||||||
if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
|
if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
|
||||||
data["pdfFormat"] = "PDF/A-2b"
|
route.pdf_format(PdfAFormat.A2b)
|
||||||
elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
|
elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
|
||||||
data["pdfFormat"] = "PDF/A-1a"
|
route.pdf_format(PdfAFormat.A1a)
|
||||||
elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
|
elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
|
||||||
data["pdfFormat"] = "PDF/A-3b"
|
route.pdf_format(PdfAFormat.A3b)
|
||||||
|
|
||||||
|
route.convert(document_path)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = httpx.post(
|
response = route.run()
|
||||||
url,
|
|
||||||
files=files,
|
pdf_path.write_bytes(response.content)
|
||||||
headers=headers,
|
|
||||||
data=data,
|
return pdf_path
|
||||||
timeout=settings.CELERY_TASK_TIME_LIMIT,
|
|
||||||
)
|
|
||||||
response.raise_for_status() # ensure we notice bad responses
|
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
raise ParseError(
|
raise ParseError(
|
||||||
f"Error while converting document to PDF: {err}",
|
f"Error while converting document to PDF: {err}",
|
||||||
) from err
|
) from err
|
||||||
|
|
||||||
with open(pdf_path, "wb") as file:
|
|
||||||
file.write(response.content)
|
|
||||||
file.close()
|
|
||||||
|
|
||||||
return pdf_path
|
|
||||||
|
@ -2,12 +2,11 @@ import datetime
|
|||||||
import os
|
import os
|
||||||
import zoneinfo
|
import zoneinfo
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest import mock
|
|
||||||
|
|
||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
from django.test import override_settings
|
from django.test import override_settings
|
||||||
from httpx import Request
|
from httpx import codes
|
||||||
from httpx import Response
|
from httpx._multipart import DataField
|
||||||
from rest_framework import status
|
from rest_framework import status
|
||||||
|
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
@ -95,8 +94,7 @@ class TestTikaParser(HttpxMockMixin, TestCase):
|
|||||||
with self.assertRaises(ParseError):
|
with self.assertRaises(ParseError):
|
||||||
self.parser.convert_to_pdf(file, None)
|
self.parser.convert_to_pdf(file, None)
|
||||||
|
|
||||||
@mock.patch("paperless_tika.parsers.httpx.post")
|
def test_request_pdf_a_format(self):
|
||||||
def test_request_pdf_a_format(self, post: mock.Mock):
|
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
- Document needs to be converted to PDF
|
- Document needs to be converted to PDF
|
||||||
@ -108,10 +106,6 @@ class TestTikaParser(HttpxMockMixin, TestCase):
|
|||||||
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
|
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
|
||||||
file.touch()
|
file.touch()
|
||||||
|
|
||||||
response = Response(status_code=status.HTTP_200_OK)
|
|
||||||
response.request = Request("POST", "/somewhere/")
|
|
||||||
post.return_value = response
|
|
||||||
|
|
||||||
for setting, expected_key in [
|
for setting, expected_key in [
|
||||||
("pdfa", "PDF/A-2b"),
|
("pdfa", "PDF/A-2b"),
|
||||||
("pdfa-2", "PDF/A-2b"),
|
("pdfa-2", "PDF/A-2b"),
|
||||||
@ -119,11 +113,20 @@ class TestTikaParser(HttpxMockMixin, TestCase):
|
|||||||
("pdfa-3", "PDF/A-3b"),
|
("pdfa-3", "PDF/A-3b"),
|
||||||
]:
|
]:
|
||||||
with override_settings(OCR_OUTPUT_TYPE=setting):
|
with override_settings(OCR_OUTPUT_TYPE=setting):
|
||||||
|
self.httpx_mock.add_response(
|
||||||
|
status_code=codes.OK,
|
||||||
|
content=b"PDF document",
|
||||||
|
method="POST",
|
||||||
|
)
|
||||||
|
|
||||||
self.parser.convert_to_pdf(file, None)
|
self.parser.convert_to_pdf(file, None)
|
||||||
|
|
||||||
post.assert_called_once()
|
request = self.httpx_mock.get_request()
|
||||||
_, kwargs = post.call_args
|
found = False
|
||||||
|
for field in request.stream.fields:
|
||||||
|
if isinstance(field, DataField) and field.name == "pdfFormat":
|
||||||
|
self.assertEqual(field.value, expected_key)
|
||||||
|
found = True
|
||||||
|
self.assertTrue(found)
|
||||||
|
|
||||||
self.assertEqual(kwargs["data"]["pdfFormat"], expected_key)
|
self.httpx_mock.reset(assert_all_responses_were_requested=False)
|
||||||
|
|
||||||
post.reset_mock()
|
|
||||||
|
@ -7,7 +7,7 @@ max-line-length = 88
|
|||||||
|
|
||||||
[tool:pytest]
|
[tool:pytest]
|
||||||
DJANGO_SETTINGS_MODULE=paperless.settings
|
DJANGO_SETTINGS_MODULE=paperless.settings
|
||||||
addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --quiet --durations=50
|
addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50
|
||||||
env =
|
env =
|
||||||
PAPERLESS_DISABLE_DBHANDLER=true
|
PAPERLESS_DISABLE_DBHANDLER=true
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user