From 2c1cd25be4a54670161450bc1696f361a651631d Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Thu, 1 Jun 2023 14:50:08 -0700
Subject: [PATCH] Rewrites the email parsing to be clearer and more concise.
Adds tests that use mocked httpx responses to stand in for a server, even offline
---
.github/workflows/ci.yml | 15 +-
Pipfile.lock | 6 +-
src/documents/tests/utils.py | 14 +
src/paperless_mail/parsers.py | 327 +++++----
src/paperless_mail/tests/test_parsers.py | 663 +++++++++---------
src/paperless_mail/tests/test_parsers_live.py | 355 +++-------
src/paperless_tika/parsers.py | 54 +-
src/paperless_tika/tests/test_live_tika.py | 7 +-
src/paperless_tika/tests/test_tika_parser.py | 83 ++-
9 files changed, 701 insertions(+), 823 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e8c9bb533..9d2c510ca 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -106,15 +106,6 @@ jobs:
matrix:
python-version: ['3.8', '3.9', '3.10']
fail-fast: false
- env:
- # Enable Tika end to end testing
- TIKA_LIVE: 1
- # Enable paperless_mail testing against real server
- PAPERLESS_MAIL_TEST_HOST: ${{ secrets.TEST_MAIL_HOST }}
- PAPERLESS_MAIL_TEST_USER: ${{ secrets.TEST_MAIL_USER }}
- PAPERLESS_MAIL_TEST_PASSWD: ${{ secrets.TEST_MAIL_PASSWD }}
- # Enable Gotenberg end to end testing
- GOTENBERG_LIVE: 1
steps:
-
name: Checkout
@@ -156,6 +147,12 @@ jobs:
pipenv --python ${{ steps.setup-python.outputs.python-version }} run pip list
-
name: Tests
+ env:
+ PAPERLESS_CI_TEST: 1
+ # Enable paperless_mail testing against a real server
+ PAPERLESS_MAIL_TEST_HOST: ${{ secrets.TEST_MAIL_HOST }}
+ PAPERLESS_MAIL_TEST_USER: ${{ secrets.TEST_MAIL_USER }}
+ PAPERLESS_MAIL_TEST_PASSWD: ${{ secrets.TEST_MAIL_PASSWD }}
run: |
cd src/
pipenv --python ${{ steps.setup-python.outputs.python-version }} run pytest -ra
diff --git a/Pipfile.lock b/Pipfile.lock
index e92c913c4..d9e6b8d56 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1745,11 +1745,11 @@
},
"tika-client": {
"hashes": [
- "sha256:6f2afab12eb46cd7b4ed6c34c9c2a1791a45d2f479c0da0076936dc6dbfe8061",
- "sha256:f2c23cb76677b7b8be70e2d95ac3418ed046b1514bff920f7460beae1ca3342b"
+ "sha256:43b53816b3783c9c77e16df314cad5ad66ab606391c26ad4bc94a784d473a156",
+ "sha256:e1ef3447b4307059e4a836e3786088498637323733f83a2f807b77f998d77610"
],
"index": "pypi",
- "version": "==0.0.2"
+ "version": "==0.0.3"
},
"tornado": {
"hashes": [
diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py
index fbde3345c..483d3b12d 100644
--- a/src/documents/tests/utils.py
+++ b/src/documents/tests/utils.py
@@ -105,6 +105,20 @@ class FileSystemAssertsMixin:
def assertIsNotDir(self, path: Union[PathLike, str]):
self.assertFalse(Path(path).resolve().is_dir(), f"Dir does exist: {path}")
+ def assertFilesEqual(
+ self,
+ path1: Union[PathLike, str],
+ path2: Union[PathLike, str],
+ ):
+ path1 = Path(path1)
+ path2 = Path(path2)
+ import hashlib
+
+ hash1 = hashlib.sha256(path1.read_bytes()).hexdigest()
+ hash2 = hashlib.sha256(path2.read_bytes()).hexdigest()
+
+ self.assertEqual(hash1, hash2, "File SHA256 mismatch")
+
class ConsumerProgressMixin:
def setUp(self) -> None:
diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py
index 7cd5e06e6..3ec3e64a0 100644
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -1,8 +1,7 @@
-import os
import re
from html import escape
-from io import BytesIO
-from io import StringIO
+from pathlib import Path
+from typing import List
import httpx
from bleach import clean
@@ -11,8 +10,9 @@ from django.conf import settings
from django.utils.timezone import is_naive
from django.utils.timezone import make_aware
from humanfriendly import format_size
+from imap_tools import MailAttachment
from imap_tools import MailMessage
-from tika import parser
+from tika_client import TikaClient
from documents.parsers import DocumentParser
from documents.parsers import ParseError
@@ -22,33 +22,15 @@ from documents.parsers import make_thumbnail_from_pdf
class MailDocumentParser(DocumentParser):
"""
This parser uses imap_tools to parse .eml files, generates pdf using
- gotenbergs and sends the html part to a local tika server for text extraction.
+ Gotenberg and sends the html part to a Tika server for text extraction.
"""
gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
tika_server = settings.TIKA_ENDPOINT
logging_name = "paperless.parsing.mail"
- _parsed = None
- def get_parsed(self, document_path) -> MailMessage:
- if not self._parsed:
- try:
- with open(document_path, "rb") as eml:
- self._parsed = MailMessage.from_bytes(eml.read())
- except Exception as err:
- raise ParseError(
- f"Could not parse {document_path}: {err}",
- ) from err
- if not self._parsed.from_values:
- self._parsed = None
- raise ParseError(
- f"Could not parse {document_path}: Missing 'from'",
- )
-
- return self._parsed
-
- def get_thumbnail(self, document_path, mime_type, file_name=None):
+ def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None):
if not self.archive_path:
self.archive_path = self.generate_pdf(document_path)
@@ -58,11 +40,11 @@ class MailDocumentParser(DocumentParser):
self.logging_group,
)
- def extract_metadata(self, document_path, mime_type):
+ def extract_metadata(self, document_path: Path, mime_type: str):
result = []
try:
- mail = self.get_parsed(document_path)
+ mail = self.parse_file_to_message(document_path)
except ParseError as e:
self.log.warning(
f"Error while fetching document metadata for {document_path}: {e}",
@@ -106,101 +88,157 @@ class MailDocumentParser(DocumentParser):
result.sort(key=lambda item: (item["prefix"], item["key"]))
return result
- def parse(self, document_path, mime_type, file_name=None):
+ def parse(self, document_path: Path, mime_type: str, file_name=None):
+ """
+ Parses the given .eml into formatted text, based on the decoded email.
+
+ """
+
def strip_text(text: str):
+ """
+ Reduces the spacing of the given text string
+ """
text = re.sub(r"\s+", " ", text)
text = re.sub(r"(\n *)+", "\n", text)
return text.strip()
- mail = self.get_parsed(document_path)
+ def build_formatted_text(mail_message: MailMessage) -> str:
+ """
+ Constructs a formatted string, based on the given email. Basically tries
+ to get most of the email content, including front matter, into a nice string
+ """
+ fmt_text = f"Subject: {mail_message.subject}\n\n"
+ fmt_text += f"From: {mail_message.from_values.full}\n\n"
+ to_list = [address.full for address in mail_message.to_values]
+ fmt_text += f"To: {', '.join(to_list)}\n\n"
+ if mail_message.cc_values:
+ fmt_text += (
+ f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
+ )
+ if mail_message.bcc_values:
+ fmt_text += (
+ f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
+ )
+ if mail_message.attachments:
+ att = []
+ for a in mail.attachments:
+ att.append(f"{a.filename} ({format_size(a.size, binary=True)})")
+ fmt_text += f"Attachments: {', '.join(att)}\n\n"
- self.text = f"Subject: {mail.subject}\n\n"
- self.text += f"From: {mail.from_values.full}\n\n"
- self.text += f"To: {', '.join(address.full for address in mail.to_values)}\n\n"
- if len(mail.cc_values) >= 1:
- self.text += (
- f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
- )
- if len(mail.bcc_values) >= 1:
- self.text += (
- f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
- )
- if len(mail.attachments) >= 1:
- att = []
- for a in mail.attachments:
- att.append(f"{a.filename} ({format_size(a.size, binary=True)})")
+ if mail.html:
+ fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
- self.text += f"Attachments: {', '.join(att)}\n\n"
+ fmt_text += f"\n\n{strip_text(mail.text)}"
- if mail.html:
- self.text += "HTML content: " + strip_text(self.tika_parse(mail.html))
+ return fmt_text
- self.text += f"\n\n{strip_text(mail.text)}"
+ self.log.debug(f"Parsing file {document_path.name} into an email")
+ mail = self.parse_file_to_message(document_path)
+
+ self.log.debug("Building formatted text from email")
+ self.text = build_formatted_text(mail)
if is_naive(mail.date):
self.date = make_aware(mail.date)
else:
self.date = mail.date
- self.archive_path = self.generate_pdf(document_path)
+ self.log.debug("Creating a PDF from the email")
+ self.archive_path = self.generate_pdf(mail)
+
+ @staticmethod
+ def parse_file_to_message(filepath: Path) -> MailMessage:
+ """
+ Parses the given .eml file into a MailMessage object
+ """
+ try:
+ with filepath.open("rb") as eml:
+ parsed = MailMessage.from_bytes(eml.read())
+ if parsed.from_values is None:
+ raise ParseError(
+ f"Could not parse {filepath}: Missing 'from'",
+ )
+ except Exception as err:
+ raise ParseError(
+ f"Could not parse {filepath}: {err}",
+ ) from err
+
+ return parsed
def tika_parse(self, html: str):
self.log.info("Sending content to Tika server")
try:
- parsed = parser.from_buffer(html, self.tika_server)
+ with TikaClient(tika_url=self.tika_server) as client:
+ parsed = client.tika.as_text.from_buffer(html, "text/html")
+
+ if "X-TIKA:content" in parsed.data:
+ return parsed.data["X-TIKA:content"].strip()
+ return ""
except Exception as err:
raise ParseError(
f"Could not parse content with tika server at "
f"{self.tika_server}: {err}",
) from err
- if parsed["content"]:
- return parsed["content"]
+
+ def generate_pdf(self, mail_message: MailMessage) -> Path:
+ archive_path = Path(self.tempdir) / "merged.pdf"
+
+ mail_pdf_file = self.generate_pdf_from_mail(mail_message)
+
+ # If no HTML content, create the PDF from the message
+ # Otherwise, create 2 PDFs and merge them with Gotenberg
+ if not mail_message.html:
+ archive_path.write_bytes(mail_pdf_file.read_bytes())
else:
- return ""
+ url_merge = self.gotenberg_server + "/forms/pdfengines/merge"
- def generate_pdf(self, document_path):
- pdf_collection = []
- url_merge = self.gotenberg_server + "/forms/pdfengines/merge"
- pdf_path = os.path.join(self.tempdir, "merged.pdf")
- mail = self.get_parsed(document_path)
-
- pdf_collection.append(("1_mail.pdf", self.generate_pdf_from_mail(mail)))
-
- if not mail.html:
- with open(pdf_path, "wb") as file:
- file.write(pdf_collection[0][1])
- file.close()
- return pdf_path
- else:
- pdf_collection.append(
- (
- "2_html.pdf",
- self.generate_pdf_from_html(mail.html, mail.attachments),
- ),
+ pdf_of_html_content = self.generate_pdf_from_html(
+ mail_message.html,
+ mail_message.attachments,
)
- files = {}
- for name, content in pdf_collection:
- files[name] = (name, BytesIO(content))
- headers = {}
- try:
- response = httpx.post(url_merge, files=files, headers=headers)
- response.raise_for_status() # ensure we notice bad responses
- except Exception as err:
- raise ParseError(f"Error while converting document to PDF: {err}") from err
+ pdf_collection = {
+ "1_mail.pdf": ("1_mail.pdf", mail_pdf_file, "application/pdf"),
+ "2_html.pdf": ("2_html.pdf", pdf_of_html_content, "application/pdf"),
+ }
- with open(pdf_path, "wb") as file:
- file.write(response.content)
- file.close()
+ try:
+ # Open a handle to each file, replacing the tuple
+ for filename in pdf_collection:
+ file_multi_part = pdf_collection[filename]
+ pdf_collection[filename] = (
+ file_multi_part[0],
+ file_multi_part[1].open("rb"),
+ file_multi_part[2],
+ )
- return pdf_path
+ response = httpx.post(url_merge, files=pdf_collection)
+ response.raise_for_status() # ensure we notice bad responses
- @staticmethod
- def mail_to_html(mail: MailMessage) -> StringIO:
- data = {}
+ archive_path.write_bytes(response.content)
- def clean_html(text: str):
+ except Exception as err:
+ raise ParseError(
+ f"Error while merging email HTML into PDF: {err}",
+ ) from err
+ finally:
+ for filename in pdf_collection:
+ file_multi_part_handle = pdf_collection[filename][1]
+ file_multi_part_handle.close()
+
+ return archive_path
+
+ def mail_to_html(self, mail: MailMessage) -> Path:
+ """
+ Converts the given email into an HTML file, formatted
+ based on the given template
+ """
+
+ def clean_html(text: str) -> str:
+ """
+ Attempts to clean, escape and linkify the given HTML string
+ """
if isinstance(text, list):
text = "\n".join([str(e) for e in text])
if type(text) != str:
@@ -211,6 +249,8 @@ class MailDocumentParser(DocumentParser):
text = text.replace("\n", "
")
return text
+ data = {}
+
data["subject"] = clean_html(mail.subject)
if data["subject"]:
data["subject_label"] = "Subject"
@@ -237,27 +277,33 @@ class MailDocumentParser(DocumentParser):
data["date"] = clean_html(mail.date.astimezone().strftime("%Y-%m-%d %H:%M"))
data["content"] = clean_html(mail.text.strip())
- html = StringIO()
-
from django.template.loader import render_to_string
- rendered = render_to_string("email_msg_template.html", context=data)
+ html_file = Path(self.tempdir) / "email_as_html.html"
+ html_file.write_text(render_to_string("email_msg_template.html", context=data))
- html.write(rendered)
- html.seek(0)
+ return html_file
- return html
-
- def generate_pdf_from_mail(self, mail):
+ def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
+ """
+ Creates a PDF based on the given email, using the email's values in
+ an HTML template
+ """
url = self.gotenberg_server + "/forms/chromium/convert/html"
self.log.info("Converting mail to PDF")
- css_file = os.path.join(os.path.dirname(__file__), "templates/output.css")
+ css_file = Path(__file__).parent / "templates" / "output.css"
+ email_html_file = self.mail_to_html(mail)
- with open(css_file, "rb") as css_handle:
+ print(css_file)
+ print(email_html_file)
+
+ with css_file.open("rb") as css_handle, email_html_file.open(
+ "rb",
+ ) as email_html_handle:
files = {
- "html": ("index.html", self.mail_to_html(mail)),
- "css": ("output.css", css_handle),
+ "html": ("index.html", email_html_handle, "text/html"),
+ "css": ("output.css", css_handle, "text/css"),
}
headers = {}
data = {
@@ -289,13 +335,23 @@ class MailDocumentParser(DocumentParser):
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(
- f"Error while converting document to PDF: {err}",
+ f"Error while converting email to PDF: {err}",
) from err
- return response.content
+ email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf"
+ email_as_pdf_file.write_bytes(response.content)
+
+ return email_as_pdf_file
+
+ def generate_pdf_from_html(
+ self,
+ orig_html: str,
+ attachments: List[MailAttachment],
+ ) -> Path:
+ """
+ Generates a PDF file based on the HTML and attachments of the email
+ """
- @staticmethod
- def transform_inline_html(html, attachments):
def clean_html_script(text: str):
compiled_open = re.compile(re.escape("