mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Enhancement: add layout options for email conversion (#8907)
--------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
This commit is contained in:
28
src/paperless_mail/migrations/0029_mailrule_pdf_layout.py
Normal file
28
src/paperless_mail/migrations/0029_mailrule_pdf_layout.py
Normal file
@@ -0,0 +1,28 @@
|
||||
# Generated by Django 5.1.3 on 2024-11-24 12:39
|
||||
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add the ``pdf_layout`` column to the ``mailrule`` model.

    The new field is a positive integer restricted to five choices
    (0-4) and defaults to 0 ("System default"), so existing rows are
    populated with the default when the migration is applied.
    """

    # Must run after the previous paperless_mail migration.
    dependencies = [
        ("paperless_mail", "0028_alter_mailaccount_password_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="mailrule",
            name="pdf_layout",
            field=models.PositiveIntegerField(
                verbose_name="pdf layout",
                default=0,
                # Integer values are persisted; the labels are display text.
                choices=[
                    (0, "System default"),
                    (1, "Text, then HTML"),
                    (2, "HTML, then text"),
                    (3, "HTML only"),
                    (4, "Text only"),
                ],
            ),
        ),
    ]
|
@@ -115,6 +115,13 @@ class MailRule(document_models.ModelWithOwner):
|
||||
ATTACHMENTS_ONLY = 1, _("Only process attachments.")
|
||||
EVERYTHING = 2, _("Process all files, including 'inline' attachments.")
|
||||
|
||||
class PdfLayout(models.IntegerChoices):
    """Choices for how the PDF generated from an e-mail is laid out.

    Each member pairs the stored integer with a translatable label.
    """

    # Falsy value (0): a consumer using ``layout or fallback`` will pick
    # the fallback for this member.
    DEFAULT = (0, _("System default"))
    # Plain-text rendering first, HTML rendering second.
    TEXT_HTML = (1, _("Text, then HTML"))
    # HTML rendering first, plain-text rendering second.
    HTML_TEXT = (2, _("HTML, then text"))
    # Only the HTML rendering.
    HTML_ONLY = (3, _("HTML only"))
    # Only the plain-text rendering.
    TEXT_ONLY = (4, _("Text only"))
|
||||
|
||||
class MailAction(models.IntegerChoices):
|
||||
DELETE = 1, _("Delete")
|
||||
MOVE = 2, _("Move to specified folder")
|
||||
@@ -230,6 +237,12 @@ class MailRule(document_models.ModelWithOwner):
|
||||
default=ConsumptionScope.ATTACHMENTS_ONLY,
|
||||
)
|
||||
|
||||
pdf_layout = models.PositiveIntegerField(
|
||||
_("pdf layout"),
|
||||
choices=PdfLayout.choices,
|
||||
default=PdfLayout.DEFAULT,
|
||||
)
|
||||
|
||||
action = models.PositiveIntegerField(
|
||||
_("action"),
|
||||
choices=MailAction.choices,
|
||||
|
@@ -22,6 +22,7 @@ from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from paperless.models import OutputTypeChoices
|
||||
from paperless_mail.models import MailRule
|
||||
|
||||
|
||||
class MailDocumentParser(DocumentParser):
|
||||
@@ -121,7 +122,13 @@ class MailDocumentParser(DocumentParser):
|
||||
result.sort(key=lambda item: (item["prefix"], item["key"]))
|
||||
return result
|
||||
|
||||
def parse(self, document_path: Path, mime_type: str, file_name=None):
|
||||
def parse(
|
||||
self,
|
||||
document_path: Path,
|
||||
mime_type: str,
|
||||
file_name=None,
|
||||
mailrule_id: int | None = None,
|
||||
):
|
||||
"""
|
||||
Parses the given .eml into formatted text, based on the decoded email.
|
||||
|
||||
@@ -180,7 +187,11 @@ class MailDocumentParser(DocumentParser):
|
||||
self.date = mail.date
|
||||
|
||||
self.log.debug("Creating a PDF from the email")
|
||||
self.archive_path = self.generate_pdf(mail)
|
||||
if mailrule_id:
|
||||
rule = MailRule.objects.get(pk=mailrule_id)
|
||||
self.archive_path = self.generate_pdf(mail, rule.pdf_layout)
|
||||
else:
|
||||
self.archive_path = self.generate_pdf(mail)
|
||||
|
||||
@staticmethod
|
||||
def parse_file_to_message(filepath: Path) -> MailMessage:
|
||||
@@ -217,11 +228,19 @@ class MailDocumentParser(DocumentParser):
|
||||
f"{settings.TIKA_ENDPOINT}: {err}",
|
||||
) from err
|
||||
|
||||
def generate_pdf(self, mail_message: MailMessage) -> Path:
|
||||
def generate_pdf(
|
||||
self,
|
||||
mail_message: MailMessage,
|
||||
pdf_layout: MailRule.PdfLayout | None = None,
|
||||
) -> Path:
|
||||
archive_path = Path(self.tempdir) / "merged.pdf"
|
||||
|
||||
mail_pdf_file = self.generate_pdf_from_mail(mail_message)
|
||||
|
||||
pdf_layout = (
|
||||
pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
|
||||
) # EMAIL_PARSE_DEFAULT_LAYOUT is a MailRule.PdfLayout
|
||||
|
||||
# If no HTML content, create the PDF from the message
|
||||
# Otherwise, create 2 PDFs and merge them with Gotenberg
|
||||
if not mail_message.html:
|
||||
@@ -246,7 +265,15 @@ class MailDocumentParser(DocumentParser):
|
||||
if pdf_a_format is not None:
|
||||
route.pdf_format(pdf_a_format)
|
||||
|
||||
route.merge([mail_pdf_file, pdf_of_html_content])
|
||||
match pdf_layout:
|
||||
case MailRule.PdfLayout.HTML_TEXT:
|
||||
route.merge([pdf_of_html_content, mail_pdf_file])
|
||||
case MailRule.PdfLayout.HTML_ONLY:
|
||||
route.merge([pdf_of_html_content])
|
||||
case MailRule.PdfLayout.TEXT_ONLY:
|
||||
route.merge([mail_pdf_file])
|
||||
case MailRule.PdfLayout.TEXT_HTML | _:
|
||||
route.merge([mail_pdf_file, pdf_of_html_content])
|
||||
|
||||
try:
|
||||
response = route.run()
|
||||
|
@@ -96,6 +96,7 @@ class MailRuleSerializer(OwnedObjectSerializer):
|
||||
"order",
|
||||
"attachment_type",
|
||||
"consumption_scope",
|
||||
"pdf_layout",
|
||||
"owner",
|
||||
"user_can_change",
|
||||
"permissions",
|
||||
|
@@ -1,6 +1,7 @@
|
||||
import datetime
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
@@ -662,3 +663,67 @@ class TestParser:
|
||||
request = httpx_mock.get_request()
|
||||
|
||||
assert str(request.url) == "http://localhost:3000/forms/chromium/convert/html"
|
||||
|
||||
@pytest.mark.httpx_mock(can_send_already_matched_responses=True)
@mock.patch("gotenberg_client._merge.MergeRoute.merge")
@mock.patch("paperless_mail.models.MailRule.objects.get")
def test_generate_pdf_layout_options(
    self,
    mock_mailrule_get: mock.Mock,
    mock_merge_route: mock.Mock,
    httpx_mock: HTTPXMock,
    mail_parser: MailDocumentParser,
    html_email_file: Path,
    html_email_pdf_file: Path,
):
    """
    GIVEN:
        - Email message
    WHEN:
        - Email is parsed with different layout options
    THEN:
        - Gotenberg is called with the correct layout option
    """
    # Canned replies for the three HTTP endpoints hit while parsing. The
    # marker on this test lets each reply be served more than once, since
    # the parser runs once per layout below.
    canned_responses = (
        {
            "url": "http://localhost:9998/tika/text",
            "method": "PUT",
            "json": {
                "Content-Type": "text/html",
                "X-TIKA:Parsed-By": [],
                "X-TIKA:content": "This is some Tika HTML text",
            },
        },
        {
            "url": "http://localhost:3000/forms/chromium/convert/html",
            "method": "POST",
            "content": html_email_pdf_file.read_bytes(),
        },
        {
            "url": "http://localhost:3000/forms/pdfengines/merge",
            "method": "POST",
            "content": b"Pretend merged PDF content",
        },
    )
    for response_kwargs in canned_responses:
        httpx_mock.add_response(**response_kwargs)

    # (layout value, file names expected in the merge call, in order)
    layout_cases = (
        # 1 = MailRule.PdfLayout.TEXT_HTML
        (1, ("email_as_pdf.pdf", "html.pdf")),
        # 2 = MailRule.PdfLayout.HTML_TEXT
        (2, ("html.pdf", "email_as_pdf.pdf")),
        # 3 = MailRule.PdfLayout.HTML_ONLY
        (3, ("html.pdf",)),
        # 4 = MailRule.PdfLayout.TEXT_ONLY
        (4, ("email_as_pdf.pdf",)),
    )

    for layout_value, names_in_order in layout_cases:
        # The rule lookup is patched, so any id returns this stand-in rule.
        mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_value)
        mail_parser.parse(
            document_path=html_email_file,
            mime_type="message/rfc822",
            mailrule_id=1,
        )

        # Inspect only the most recent merge call: first positional
        # argument is the list of files handed to the merge route.
        merged_files = mock_merge_route.call_args[0][0]
        assert len(merged_files) == len(names_in_order)
        for position, pdf_name in enumerate(names_in_order):
            assert merged_files[position].name == pdf_name
|
||||
|
Reference in New Issue
Block a user