Rewrites the email parsing to be more clear and concise.

Adds testing to use httpx mocked responses to stand in as a server even offline
This commit is contained in:
Trenton H
2023-06-01 14:50:08 -07:00
parent 6e65558ea4
commit 2c1cd25be4
9 changed files with 701 additions and 823 deletions

View File

@@ -1,8 +1,7 @@
import os
import re
from html import escape
from io import BytesIO
from io import StringIO
from pathlib import Path
from typing import List
import httpx
from bleach import clean
@@ -11,8 +10,9 @@ from django.conf import settings
from django.utils.timezone import is_naive
from django.utils.timezone import make_aware
from humanfriendly import format_size
from imap_tools import MailAttachment
from imap_tools import MailMessage
from tika import parser
from tika_client import TikaClient
from documents.parsers import DocumentParser
from documents.parsers import ParseError
@@ -22,33 +22,15 @@ from documents.parsers import make_thumbnail_from_pdf
class MailDocumentParser(DocumentParser):
"""
This parser uses imap_tools to parse .eml files, generates pdf using
gotenbergs and sends the html part to a local tika server for text extraction.
Gotenberg and sends the html part to a Tika server for text extraction.
"""
gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
tika_server = settings.TIKA_ENDPOINT
logging_name = "paperless.parsing.mail"
_parsed = None
def get_parsed(self, document_path) -> MailMessage:
if not self._parsed:
try:
with open(document_path, "rb") as eml:
self._parsed = MailMessage.from_bytes(eml.read())
except Exception as err:
raise ParseError(
f"Could not parse {document_path}: {err}",
) from err
if not self._parsed.from_values:
self._parsed = None
raise ParseError(
f"Could not parse {document_path}: Missing 'from'",
)
return self._parsed
def get_thumbnail(self, document_path, mime_type, file_name=None):
def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None):
if not self.archive_path:
self.archive_path = self.generate_pdf(document_path)
@@ -58,11 +40,11 @@ class MailDocumentParser(DocumentParser):
self.logging_group,
)
def extract_metadata(self, document_path, mime_type):
def extract_metadata(self, document_path: Path, mime_type: str):
result = []
try:
mail = self.get_parsed(document_path)
mail = self.parse_file_to_message(document_path)
except ParseError as e:
self.log.warning(
f"Error while fetching document metadata for {document_path}: {e}",
@@ -106,101 +88,157 @@ class MailDocumentParser(DocumentParser):
result.sort(key=lambda item: (item["prefix"], item["key"]))
return result
def parse(self, document_path, mime_type, file_name=None):
def parse(self, document_path: Path, mime_type: str, file_name=None):
"""
Parses the given .eml into formatted text, based on the decoded email.
"""
def strip_text(text: str):
"""
Reduces the spacing of the given text string
"""
text = re.sub(r"\s+", " ", text)
text = re.sub(r"(\n *)+", "\n", text)
return text.strip()
mail = self.get_parsed(document_path)
def build_formatted_text(mail_message: MailMessage) -> str:
"""
Constructs a formatted string, based on the given email. Basically tries
to get most of the email content, included front matter, into a nice string
"""
fmt_text = f"Subject: {mail_message.subject}\n\n"
fmt_text += f"From: {mail_message.from_values.full}\n\n"
to_list = [address.full for address in mail_message.to_values]
fmt_text += f"To: {', '.join(to_list)}\n\n"
if mail_message.cc_values:
fmt_text += (
f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
)
if mail_message.bcc_values:
fmt_text += (
f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
)
if mail_message.attachments:
att = []
for a in mail.attachments:
att.append(f"{a.filename} ({format_size(a.size, binary=True)})")
fmt_text += f"Attachments: {', '.join(att)}\n\n"
self.text = f"Subject: {mail.subject}\n\n"
self.text += f"From: {mail.from_values.full}\n\n"
self.text += f"To: {', '.join(address.full for address in mail.to_values)}\n\n"
if len(mail.cc_values) >= 1:
self.text += (
f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
)
if len(mail.bcc_values) >= 1:
self.text += (
f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
)
if len(mail.attachments) >= 1:
att = []
for a in mail.attachments:
att.append(f"{a.filename} ({format_size(a.size, binary=True)})")
if mail.html:
fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
self.text += f"Attachments: {', '.join(att)}\n\n"
fmt_text += f"\n\n{strip_text(mail.text)}"
if mail.html:
self.text += "HTML content: " + strip_text(self.tika_parse(mail.html))
return fmt_text
self.text += f"\n\n{strip_text(mail.text)}"
self.log.debug(f"Parsing file {document_path.name} into an email")
mail = self.parse_file_to_message(document_path)
self.log.debug("Building formatted text from email")
self.text = build_formatted_text(mail)
if is_naive(mail.date):
self.date = make_aware(mail.date)
else:
self.date = mail.date
self.archive_path = self.generate_pdf(document_path)
self.log.debug("Creating a PDF from the email")
self.archive_path = self.generate_pdf(mail)
@staticmethod
def parse_file_to_message(filepath: Path) -> MailMessage:
"""
Parses the given .eml file into a MailMessage object
"""
try:
with filepath.open("rb") as eml:
parsed = MailMessage.from_bytes(eml.read())
if parsed.from_values is None:
raise ParseError(
f"Could not parse {filepath}: Missing 'from'",
)
except Exception as err:
raise ParseError(
f"Could not parse {filepath}: {err}",
) from err
return parsed
def tika_parse(self, html: str):
self.log.info("Sending content to Tika server")
try:
parsed = parser.from_buffer(html, self.tika_server)
with TikaClient(tika_url=self.tika_server) as client:
parsed = client.tika.as_text.from_buffer(html, "text/html")
if "X-TIKA:content" in parsed.data:
return parsed.data["X-TIKA:content"].strip()
return ""
except Exception as err:
raise ParseError(
f"Could not parse content with tika server at "
f"{self.tika_server}: {err}",
) from err
if parsed["content"]:
return parsed["content"]
def generate_pdf(self, mail_message: MailMessage) -> Path:
archive_path = Path(self.tempdir) / "merged.pdf"
mail_pdf_file = self.generate_pdf_from_mail(mail_message)
# If no HTML content, create the PDF from the message
# Otherwise, create 2 PDFs and merge them with Gotenberg
if not mail_message.html:
archive_path.write_bytes(mail_pdf_file.read_bytes())
else:
return ""
url_merge = self.gotenberg_server + "/forms/pdfengines/merge"
def generate_pdf(self, document_path):
pdf_collection = []
url_merge = self.gotenberg_server + "/forms/pdfengines/merge"
pdf_path = os.path.join(self.tempdir, "merged.pdf")
mail = self.get_parsed(document_path)
pdf_collection.append(("1_mail.pdf", self.generate_pdf_from_mail(mail)))
if not mail.html:
with open(pdf_path, "wb") as file:
file.write(pdf_collection[0][1])
file.close()
return pdf_path
else:
pdf_collection.append(
(
"2_html.pdf",
self.generate_pdf_from_html(mail.html, mail.attachments),
),
pdf_of_html_content = self.generate_pdf_from_html(
mail_message.html,
mail_message.attachments,
)
files = {}
for name, content in pdf_collection:
files[name] = (name, BytesIO(content))
headers = {}
try:
response = httpx.post(url_merge, files=files, headers=headers)
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(f"Error while converting document to PDF: {err}") from err
pdf_collection = {
"1_mail.pdf": ("1_mail.pdf", mail_pdf_file, "application/pdf"),
"2_html.pdf": ("2_html.pdf", pdf_of_html_content, "application/pdf"),
}
with open(pdf_path, "wb") as file:
file.write(response.content)
file.close()
try:
# Open a handle to each file, replacing the tuple
for filename in pdf_collection:
file_multi_part = pdf_collection[filename]
pdf_collection[filename] = (
file_multi_part[0],
file_multi_part[1].open("rb"),
file_multi_part[2],
)
return pdf_path
response = httpx.post(url_merge, files=pdf_collection)
response.raise_for_status() # ensure we notice bad responses
@staticmethod
def mail_to_html(mail: MailMessage) -> StringIO:
data = {}
archive_path.write_bytes(response.content)
def clean_html(text: str):
except Exception as err:
raise ParseError(
f"Error while merging email HTML into PDF: {err}",
) from err
finally:
for filename in pdf_collection:
file_multi_part_handle = pdf_collection[filename][1]
file_multi_part_handle.close()
return archive_path
def mail_to_html(self, mail: MailMessage) -> Path:
"""
Converts the given email into an HTML file, formatted
based on the given template
"""
def clean_html(text: str) -> str:
"""
Attempts to clean, escape and linkify the given HTML string
"""
if isinstance(text, list):
text = "\n".join([str(e) for e in text])
if type(text) != str:
@@ -211,6 +249,8 @@ class MailDocumentParser(DocumentParser):
text = text.replace("\n", "<br>")
return text
data = {}
data["subject"] = clean_html(mail.subject)
if data["subject"]:
data["subject_label"] = "Subject"
@@ -237,27 +277,33 @@ class MailDocumentParser(DocumentParser):
data["date"] = clean_html(mail.date.astimezone().strftime("%Y-%m-%d %H:%M"))
data["content"] = clean_html(mail.text.strip())
html = StringIO()
from django.template.loader import render_to_string
rendered = render_to_string("email_msg_template.html", context=data)
html_file = Path(self.tempdir) / "email_as_html.html"
html_file.write_text(render_to_string("email_msg_template.html", context=data))
html.write(rendered)
html.seek(0)
return html_file
return html
def generate_pdf_from_mail(self, mail):
def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
"""
Creates a PDF based on the given email, using the email's values in a
an HTML template
"""
url = self.gotenberg_server + "/forms/chromium/convert/html"
self.log.info("Converting mail to PDF")
css_file = os.path.join(os.path.dirname(__file__), "templates/output.css")
css_file = Path(__file__).parent / "templates" / "output.css"
email_html_file = self.mail_to_html(mail)
with open(css_file, "rb") as css_handle:
print(css_file)
print(email_html_file)
with css_file.open("rb") as css_handle, email_html_file.open(
"rb",
) as email_html_handle:
files = {
"html": ("index.html", self.mail_to_html(mail)),
"css": ("output.css", css_handle),
"html": ("index.html", email_html_handle, "text/html"),
"css": ("output.css", css_handle, "text/css"),
}
headers = {}
data = {
@@ -289,13 +335,23 @@ class MailDocumentParser(DocumentParser):
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(
f"Error while converting document to PDF: {err}",
f"Error while converting email to PDF: {err}",
) from err
return response.content
email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf"
email_as_pdf_file.write_bytes(response.content)
return email_as_pdf_file
def generate_pdf_from_html(
self,
orig_html: str,
attachments: List[MailAttachment],
) -> Path:
"""
Generates a PDF file based on the HTML and attachments of the email
"""
@staticmethod
def transform_inline_html(html, attachments):
def clean_html_script(text: str):
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
text = compiled_open.sub("<div hidden ", text)
@@ -304,28 +360,36 @@ class MailDocumentParser(DocumentParser):
text = compiled_close.sub("</div", text)
return text
html_clean = clean_html_script(html)
files = []
for a in attachments:
name_cid = "cid:" + a.content_id
name_clean = "".join(e for e in name_cid if e.isalnum())
files.append((name_clean, BytesIO(a.payload)))
html_clean = html_clean.replace(name_cid, name_clean)
files.append(("index.html", StringIO(html_clean)))
return files
def generate_pdf_from_html(self, orig_html, attachments):
url = self.gotenberg_server + "/forms/chromium/convert/html"
self.log.info("Converting html to PDF")
files = {}
for name, file in self.transform_inline_html(orig_html, attachments):
files[name] = (name, file)
tempdir = Path(self.tempdir)
html_clean = clean_html_script(orig_html)
files = {}
for attachment in attachments:
# Clean the attachment name to be valid
name_cid = f"cid:{attachment.content_id}"
name_clean = "".join(e for e in name_cid if e.isalnum())
# Write attachment payload to a temp file
temp_file = tempdir / name_clean
temp_file.write_bytes(attachment.payload)
# Store the attachment for upload
files[name_clean] = (name_clean, temp_file, attachment.content_type)
# Replace as needed the name with the clean name
html_clean = html_clean.replace(name_cid, name_clean)
# Now store the cleaned up HTML version
html_clean_file = tempdir / "index.html"
html_clean_file.write_text(html_clean)
files["index.html"] = ("index.html", html_clean_file, "text/html")
headers = {}
data = {
"marginTop": "0.1",
"marginBottom": "0.1",
@@ -336,14 +400,29 @@ class MailDocumentParser(DocumentParser):
"scale": "1.0",
}
try:
# Open a handle to each file, replacing the tuple
for filename in files:
file_multi_part = files[filename]
files[filename] = (
file_multi_part[0],
file_multi_part[1].open("rb"),
file_multi_part[2],
)
response = httpx.post(
url,
files=files,
headers=headers,
data=data,
)
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(f"Error while converting document to PDF: {err}") from err
finally:
# Ensure all file handles as closed
for filename in files:
file_multi_part_handle = files[filename][1]
file_multi_part_handle.close()
return response.content
html_pdf = tempdir / "html.pdf"
html_pdf.write_bytes(response.content)
return html_pdf