Enhancement: add layout options for email conversion (#8907)

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
This commit is contained in:
Silvia Bigler
2025-02-07 19:32:35 +01:00
committed by GitHub
parent 7f36163c3b
commit 71472a6a82
16 changed files with 421 additions and 89 deletions

View File

@@ -48,6 +48,7 @@ from documents.templating.workflows import parse_w_workflow_placeholders
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless_mail.parsers import MailDocumentParser
class WorkflowTriggerPlugin(
@@ -479,7 +480,18 @@ class ConsumerPlugin(
ConsumerStatusShortMessage.PARSING_DOCUMENT,
)
self.log.debug(f"Parsing {self.filename}...")
document_parser.parse(self.working_copy, mime_type, self.filename)
if (
isinstance(document_parser, MailDocumentParser)
and self.input_doc.mailrule_id
):
document_parser.parse(
self.working_copy,
mime_type,
self.filename,
self.input_doc.mailrule_id,
)
else:
document_parser.parse(self.working_copy, mime_type, self.filename)
self.log.debug(f"Generating thumbnail for {self.filename}...")
self._send_progress(

View File

@@ -21,6 +21,7 @@ from guardian.core import ObjectPermissionChecker
from documents.consumer import ConsumerError
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import Document
@@ -35,6 +36,8 @@ from documents.tasks import sanity_check
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import GetConsumerMixin
from paperless_mail.models import MailRule
from paperless_mail.parsers import MailDocumentParser
class TestAttributes(UnittestTestCase):
@@ -243,6 +246,8 @@ def fake_magic_from_file(file, *, mime=False):
return "image/png"
elif os.path.splitext(file)[1] == ".webp":
return "image/webp"
elif os.path.splitext(file)[1] == ".eml":
return "message/rfc822"
else:
return "unknown"
else:
@@ -975,6 +980,59 @@ class TestConsumer(
self.assertEqual(command[0], "qpdf")
self.assertEqual(command[1], "--replace-input")
@mock.patch("paperless_mail.models.MailRule.objects.get")
@mock.patch("paperless_mail.parsers.MailDocumentParser.parse")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_mail_parser_receives_mailrule(
    self,
    mock_consumer_declaration_send: mock.Mock,
    mock_mail_parser_parse: mock.Mock,
    mock_mailrule_get: mock.Mock,
):
    """
    GIVEN:
        - A mail document from a mail rule
    WHEN:
        - The consumer is run
    THEN:
        - The mail parser should receive the id of the mail rule
    """
    mailrule_id = 1

    # Force the consumer to pick the mail parser for .eml files
    mock_consumer_declaration_send.return_value = [
        (
            None,
            {
                "parser": MailDocumentParser,
                "mime_types": {"message/rfc822": ".eml"},
                "weight": 0,
            },
        ),
    ]
    mock_mailrule_get.return_value = mock.Mock(
        pdf_layout=MailRule.PdfLayout.HTML_ONLY,
    )

    sample_eml = (
        Path(__file__).parent.parent.parent
        / Path("paperless_mail")
        / Path("tests")
        / Path("samples")
    ).resolve() / "html.eml"

    with self.get_consumer(
        filepath=sample_eml,
        source=DocumentSource.MailFetch,
        mailrule_id=mailrule_id,
    ) as consumer:
        # fails because no gotenberg
        with self.assertRaises(ConsumerError):
            consumer.run()

        # Checked outside of assertRaises: anything placed after the raising
        # call inside that block would never execute.
        # The consumer hands the rule id over positionally, after the file name.
        mock_mail_parser_parse.assert_called_once_with(
            consumer.working_copy,
            "message/rfc822",
            consumer.filename,
            mailrule_id,
        )
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):

View File

@@ -8,7 +8,7 @@ class TestMigrateWorkflow(TestMigrations):
dependencies = (
(
"paperless_mail",
"0028_alter_mailaccount_password_and_more",
"0029_mailrule_pdf_layout",
),
)

View File

@@ -340,11 +340,16 @@ class GetConsumerMixin:
filepath: Path,
overrides: DocumentMetadataOverrides | None = None,
source: DocumentSource = DocumentSource.ConsumeFolder,
mailrule_id: int | None = None,
) -> Generator[ConsumerPlugin, None, None]:
# Store this for verification
self.status = DummyProgressManager(filepath.name, None)
reader = ConsumerPlugin(
ConsumableDocument(source, original_file=filepath),
ConsumableDocument(
source,
original_file=filepath,
mailrule_id=mailrule_id or None,
),
overrides or DocumentMetadataOverrides(),
self.status, # type: ignore
self.dirs.scratch_dir,

View File

@@ -2,7 +2,7 @@ msgid ""
msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2025-01-27 08:19-0800\n"
"POT-Creation-Date: 2025-01-28 12:17-0800\n"
"PO-Revision-Date: 2022-02-17 04:17\n"
"Last-Translator: \n"
"Language-Team: English\n"
@@ -90,7 +90,7 @@ msgid "Automatic"
msgstr ""
#: documents/models.py:67 documents/models.py:433 documents/models.py:1493
#: paperless_mail/models.py:23 paperless_mail/models.py:136
#: paperless_mail/models.py:23 paperless_mail/models.py:143
msgid "name"
msgstr ""
@@ -276,7 +276,7 @@ msgstr ""
msgid "warning"
msgstr ""
#: documents/models.py:387 paperless_mail/models.py:350
#: documents/models.py:387 paperless_mail/models.py:363
msgid "error"
msgstr ""
@@ -818,7 +818,7 @@ msgstr ""
msgid "filter filename"
msgstr ""
#: documents/models.py:1066 paperless_mail/models.py:193
#: documents/models.py:1066 paperless_mail/models.py:200
msgid ""
"Only consume documents which entirely match this filename if specified. "
"Wildcards such as *.pdf or *invoice* are allowed. Case insensitive."
@@ -988,15 +988,15 @@ msgid ""
"Assign a document title, can include some placeholders, see documentation."
msgstr ""
#: documents/models.py:1287 paperless_mail/models.py:261
#: documents/models.py:1287 paperless_mail/models.py:274
msgid "assign this tag"
msgstr ""
#: documents/models.py:1296 paperless_mail/models.py:269
#: documents/models.py:1296 paperless_mail/models.py:282
msgid "assign this document type"
msgstr ""
#: documents/models.py:1305 paperless_mail/models.py:283
#: documents/models.py:1305 paperless_mail/models.py:296
msgid "assign this correspondent"
msgstr ""
@@ -1112,7 +1112,7 @@ msgstr ""
msgid "workflow actions"
msgstr ""
#: documents/models.py:1495 paperless_mail/models.py:138
#: documents/models.py:1495 paperless_mail/models.py:145
msgid "order"
msgstr ""
@@ -1124,7 +1124,7 @@ msgstr ""
msgid "actions"
msgstr ""
#: documents/models.py:1511 paperless_mail/models.py:147
#: documents/models.py:1511 paperless_mail/models.py:154
msgid "enabled"
msgstr ""
@@ -1838,161 +1838,185 @@ msgid "Process all files, including 'inline' attachments."
msgstr ""
#: paperless_mail/models.py:119
msgid "Delete"
msgid "System default"
msgstr ""
#: paperless_mail/models.py:120
msgid "Move to specified folder"
msgid "Text, then HTML"
msgstr ""
#: paperless_mail/models.py:121
msgid "Mark as read, don't process read mails"
msgid "HTML, then text"
msgstr ""
#: paperless_mail/models.py:122
msgid "Flag the mail, don't process flagged mails"
msgid "HTML only"
msgstr ""
#: paperless_mail/models.py:123
msgid "Tag the mail with specified tag, don't process tagged mails"
msgid "Text only"
msgstr ""
#: paperless_mail/models.py:126
msgid "Use subject as title"
msgid "Delete"
msgstr ""
#: paperless_mail/models.py:127
msgid "Use attachment filename as title"
msgid "Move to specified folder"
msgstr ""
#: paperless_mail/models.py:128
msgid "Do not assign title from rule"
msgid "Mark as read, don't process read mails"
msgstr ""
#: paperless_mail/models.py:131
msgid "Do not assign a correspondent"
#: paperless_mail/models.py:129
msgid "Flag the mail, don't process flagged mails"
msgstr ""
#: paperless_mail/models.py:132
msgid "Use mail address"
#: paperless_mail/models.py:130
msgid "Tag the mail with specified tag, don't process tagged mails"
msgstr ""
#: paperless_mail/models.py:133
msgid "Use name (or mail address if not available)"
msgid "Use subject as title"
msgstr ""
#: paperless_mail/models.py:134
msgid "Use attachment filename as title"
msgstr ""
#: paperless_mail/models.py:135
msgid "Do not assign title from rule"
msgstr ""
#: paperless_mail/models.py:138
msgid "Do not assign a correspondent"
msgstr ""
#: paperless_mail/models.py:139
msgid "Use mail address"
msgstr ""
#: paperless_mail/models.py:140
msgid "Use name (or mail address if not available)"
msgstr ""
#: paperless_mail/models.py:141
msgid "Use correspondent selected below"
msgstr ""
#: paperless_mail/models.py:144
#: paperless_mail/models.py:151
msgid "account"
msgstr ""
#: paperless_mail/models.py:150 paperless_mail/models.py:305
#: paperless_mail/models.py:157 paperless_mail/models.py:318
msgid "folder"
msgstr ""
#: paperless_mail/models.py:154
#: paperless_mail/models.py:161
msgid ""
"Subfolders must be separated by a delimiter, often a dot ('.') or slash "
"('/'), but it varies by mail server."
msgstr ""
#: paperless_mail/models.py:160
#: paperless_mail/models.py:167
msgid "filter from"
msgstr ""
#: paperless_mail/models.py:167
#: paperless_mail/models.py:174
msgid "filter to"
msgstr ""
#: paperless_mail/models.py:174
#: paperless_mail/models.py:181
msgid "filter subject"
msgstr ""
#: paperless_mail/models.py:181
#: paperless_mail/models.py:188
msgid "filter body"
msgstr ""
#: paperless_mail/models.py:188
#: paperless_mail/models.py:195
msgid "filter attachment filename inclusive"
msgstr ""
#: paperless_mail/models.py:200
#: paperless_mail/models.py:207
msgid "filter attachment filename exclusive"
msgstr ""
#: paperless_mail/models.py:205
#: paperless_mail/models.py:212
msgid ""
"Do not consume documents which entirely match this filename if specified. "
"Wildcards such as *.pdf or *invoice* are allowed. Case insensitive."
msgstr ""
#: paperless_mail/models.py:212
#: paperless_mail/models.py:219
msgid "maximum age"
msgstr ""
#: paperless_mail/models.py:214
#: paperless_mail/models.py:221
msgid "Specified in days."
msgstr ""
#: paperless_mail/models.py:218
#: paperless_mail/models.py:225
msgid "attachment type"
msgstr ""
#: paperless_mail/models.py:222
#: paperless_mail/models.py:229
msgid ""
"Inline attachments include embedded images, so it's best to combine this "
"option with a filename filter."
msgstr ""
#: paperless_mail/models.py:228
#: paperless_mail/models.py:235
msgid "consumption scope"
msgstr ""
#: paperless_mail/models.py:234
#: paperless_mail/models.py:241
msgid "pdf layout"
msgstr ""
#: paperless_mail/models.py:247
msgid "action"
msgstr ""
#: paperless_mail/models.py:240
#: paperless_mail/models.py:253
msgid "action parameter"
msgstr ""
#: paperless_mail/models.py:245
#: paperless_mail/models.py:258
msgid ""
"Additional parameter for the action selected above, i.e., the target folder "
"of the move to folder action. Subfolders must be separated by dots."
msgstr ""
#: paperless_mail/models.py:253
#: paperless_mail/models.py:266
msgid "assign title from"
msgstr ""
#: paperless_mail/models.py:273
#: paperless_mail/models.py:286
msgid "assign correspondent from"
msgstr ""
#: paperless_mail/models.py:287
#: paperless_mail/models.py:300
msgid "Assign the rule owner to documents"
msgstr ""
#: paperless_mail/models.py:313
#: paperless_mail/models.py:326
msgid "uid"
msgstr ""
#: paperless_mail/models.py:321
#: paperless_mail/models.py:334
msgid "subject"
msgstr ""
#: paperless_mail/models.py:329
#: paperless_mail/models.py:342
msgid "received"
msgstr ""
#: paperless_mail/models.py:336
#: paperless_mail/models.py:349
msgid "processed"
msgstr ""
#: paperless_mail/models.py:342
#: paperless_mail/models.py:355
msgid "status"
msgstr ""

View File

@@ -1030,6 +1030,11 @@ CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
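# Fallback layout for .eml consumption, used when no mail rule applies or the
# rule's pdf layout is left at "System default". Values follow
# MailRule.PdfLayout: 1 = text then HTML, 2 = HTML then text, 3 = HTML only,
# 4 = text only.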
EMAIL_PARSE_DEFAULT_LAYOUT = __get_int(
"PAPERLESS_EMAIL_PARSE_DEFAULT_LAYOUT",
1, # MailRule.PdfLayout.TEXT_HTML but that can't be imported here
)
# Pre-2.x versions of Paperless stored your documents locally with GPG
# encryption, but that is no longer the default. This behaviour is still

View File

@@ -0,0 +1,28 @@
# Generated by Django 5.1.3 on 2024-11-24 12:39
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
    """Add the ``pdf_layout`` choice column to the ``mailrule`` model."""

    dependencies = [
        ("paperless_mail", "0028_alter_mailaccount_password_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="mailrule",
            name="pdf_layout",
            field=models.PositiveIntegerField(
                # Existing rows get 0 ("System default")
                default=0,
                verbose_name="pdf layout",
                choices=[
                    (0, "System default"),
                    (1, "Text, then HTML"),
                    (2, "HTML, then text"),
                    (3, "HTML only"),
                    (4, "Text only"),
                ],
            ),
        ),
    ]

View File

@@ -115,6 +115,13 @@ class MailRule(document_models.ModelWithOwner):
ATTACHMENTS_ONLY = 1, _("Only process attachments.")
EVERYTHING = 2, _("Process all files, including 'inline' attachments.")
class PdfLayout(models.IntegerChoices):
    """Selectable layouts for the PDF generated from an email."""

    # 0 is falsy on purpose-friendly terms: callers can use `layout or fallback`
    DEFAULT = 0, _("System default")
    # Both parts, in the stated order
    TEXT_HTML = 1, _("Text, then HTML")
    HTML_TEXT = 2, _("HTML, then text")
    # A single part only
    HTML_ONLY = 3, _("HTML only")
    TEXT_ONLY = 4, _("Text only")
class MailAction(models.IntegerChoices):
DELETE = 1, _("Delete")
MOVE = 2, _("Move to specified folder")
@@ -230,6 +237,12 @@ class MailRule(document_models.ModelWithOwner):
default=ConsumptionScope.ATTACHMENTS_ONLY,
)
# Layout choice for PDFs generated from mails matched by this rule;
# defaults to PdfLayout.DEFAULT ("System default").
pdf_layout = models.PositiveIntegerField(
_("pdf layout"),
choices=PdfLayout.choices,
default=PdfLayout.DEFAULT,
)
action = models.PositiveIntegerField(
_("action"),
choices=MailAction.choices,

View File

@@ -22,6 +22,7 @@ from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.models import OutputTypeChoices
from paperless_mail.models import MailRule
class MailDocumentParser(DocumentParser):
@@ -121,7 +122,13 @@ class MailDocumentParser(DocumentParser):
result.sort(key=lambda item: (item["prefix"], item["key"]))
return result
def parse(self, document_path: Path, mime_type: str, file_name=None):
def parse(
self,
document_path: Path,
mime_type: str,
file_name=None,
mailrule_id: int | None = None,
):
"""
Parses the given .eml into formatted text, based on the decoded email.
@@ -180,7 +187,11 @@ class MailDocumentParser(DocumentParser):
self.date = mail.date
self.log.debug("Creating a PDF from the email")
self.archive_path = self.generate_pdf(mail)
if mailrule_id:
rule = MailRule.objects.get(pk=mailrule_id)
self.archive_path = self.generate_pdf(mail, rule.pdf_layout)
else:
self.archive_path = self.generate_pdf(mail)
@staticmethod
def parse_file_to_message(filepath: Path) -> MailMessage:
@@ -217,11 +228,19 @@ class MailDocumentParser(DocumentParser):
f"{settings.TIKA_ENDPOINT}: {err}",
) from err
def generate_pdf(self, mail_message: MailMessage) -> Path:
def generate_pdf(
self,
mail_message: MailMessage,
pdf_layout: MailRule.PdfLayout | None = None,
) -> Path:
archive_path = Path(self.tempdir) / "merged.pdf"
mail_pdf_file = self.generate_pdf_from_mail(mail_message)
pdf_layout = (
pdf_layout or settings.EMAIL_PARSE_DEFAULT_LAYOUT
) # EMAIL_PARSE_DEFAULT_LAYOUT is a MailRule.PdfLayout
# If no HTML content, create the PDF from the message
# Otherwise, create 2 PDFs and merge them with Gotenberg
if not mail_message.html:
@@ -246,7 +265,15 @@ class MailDocumentParser(DocumentParser):
if pdf_a_format is not None:
route.pdf_format(pdf_a_format)
route.merge([mail_pdf_file, pdf_of_html_content])
match pdf_layout:
case MailRule.PdfLayout.HTML_TEXT:
route.merge([pdf_of_html_content, mail_pdf_file])
case MailRule.PdfLayout.HTML_ONLY:
route.merge([pdf_of_html_content])
case MailRule.PdfLayout.TEXT_ONLY:
route.merge([mail_pdf_file])
case MailRule.PdfLayout.TEXT_HTML | _:
route.merge([mail_pdf_file, pdf_of_html_content])
try:
response = route.run()

View File

@@ -96,6 +96,7 @@ class MailRuleSerializer(OwnedObjectSerializer):
"order",
"attachment_type",
"consumption_scope",
"pdf_layout",
"owner",
"user_can_change",
"permissions",

View File

@@ -1,6 +1,7 @@
import datetime
import logging
from pathlib import Path
from unittest import mock
import httpx
import pytest
@@ -662,3 +663,67 @@ class TestParser:
request = httpx_mock.get_request()
assert str(request.url) == "http://localhost:3000/forms/chromium/convert/html"
@pytest.mark.httpx_mock(can_send_already_matched_responses=True)
@mock.patch("gotenberg_client._merge.MergeRoute.merge")
@mock.patch("paperless_mail.models.MailRule.objects.get")
def test_generate_pdf_layout_options(
    self,
    mock_mailrule_get: mock.Mock,
    mock_merge_route: mock.Mock,
    httpx_mock: HTTPXMock,
    mail_parser: MailDocumentParser,
    html_email_file: Path,
    html_email_pdf_file: Path,
):
    """
    GIVEN:
        - Email message
    WHEN:
        - Email is parsed with different layout options
    THEN:
        - Gotenberg is called with the correct layout option
    """
    # Canned replies for Tika, the Gotenberg HTML conversion and the merge
    canned_responses = (
        {
            "url": "http://localhost:9998/tika/text",
            "method": "PUT",
            "json": {
                "Content-Type": "text/html",
                "X-TIKA:Parsed-By": [],
                "X-TIKA:content": "This is some Tika HTML text",
            },
        },
        {
            "url": "http://localhost:3000/forms/chromium/convert/html",
            "method": "POST",
            "content": html_email_pdf_file.read_bytes(),
        },
        {
            "url": "http://localhost:3000/forms/pdfengines/merge",
            "method": "POST",
            "content": b"Pretend merged PDF content",
        },
    )
    for response_kwargs in canned_responses:
        httpx_mock.add_response(**response_kwargs)

    # (layout value, number of merged files, merged file names in order)
    layout_cases = (
        # 1 = MailRule.PdfLayout.TEXT_HTML
        (1, 2, ["email_as_pdf.pdf", "html.pdf"]),
        # 2 = MailRule.PdfLayout.HTML_TEXT
        (2, 2, ["html.pdf", "email_as_pdf.pdf"]),
        # 3 = MailRule.PdfLayout.HTML_ONLY
        (3, 1, ["html.pdf"]),
        # 4 = MailRule.PdfLayout.TEXT_ONLY
        (4, 1, ["email_as_pdf.pdf"]),
    )

    for layout_option, expected_calls, expected_pdf_names in layout_cases:
        mock_mailrule_get.return_value = mock.Mock(pdf_layout=layout_option)
        mail_parser.parse(
            document_path=html_email_file,
            mime_type="message/rfc822",
            mailrule_id=1,
        )
        # Only the most recent merge call belongs to the current layout
        merged_files = mock_merge_route.call_args.args[0]
        assert len(merged_files) == expected_calls
        for position, pdf_name in enumerate(expected_pdf_names):
            assert merged_files[position].name == pdf_name