diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e8c9bb533..9d2c510ca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -106,15 +106,6 @@ jobs: matrix: python-version: ['3.8', '3.9', '3.10'] fail-fast: false - env: - # Enable Tika end to end testing - TIKA_LIVE: 1 - # Enable paperless_mail testing against real server - PAPERLESS_MAIL_TEST_HOST: ${{ secrets.TEST_MAIL_HOST }} - PAPERLESS_MAIL_TEST_USER: ${{ secrets.TEST_MAIL_USER }} - PAPERLESS_MAIL_TEST_PASSWD: ${{ secrets.TEST_MAIL_PASSWD }} - # Enable Gotenberg end to end testing - GOTENBERG_LIVE: 1 steps: - name: Checkout @@ -156,6 +147,12 @@ jobs: pipenv --python ${{ steps.setup-python.outputs.python-version }} run pip list - name: Tests + env: + PAPERLESS_CI_TEST: 1 + # Enable paperless_mail testing against real server + PAPERLESS_MAIL_TEST_HOST: ${{ secrets.TEST_MAIL_HOST }} + PAPERLESS_MAIL_TEST_USER: ${{ secrets.TEST_MAIL_USER }} + PAPERLESS_MAIL_TEST_PASSWD: ${{ secrets.TEST_MAIL_PASSWD }} run: | cd src/ pipenv --python ${{ steps.setup-python.outputs.python-version }} run pytest -ra diff --git a/Pipfile.lock b/Pipfile.lock index e92c913c4..d9e6b8d56 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1745,11 +1745,11 @@ }, "tika-client": { "hashes": [ - "sha256:6f2afab12eb46cd7b4ed6c34c9c2a1791a45d2f479c0da0076936dc6dbfe8061", - "sha256:f2c23cb76677b7b8be70e2d95ac3418ed046b1514bff920f7460beae1ca3342b" + "sha256:43b53816b3783c9c77e16df314cad5ad66ab606391c26ad4bc94a784d473a156", + "sha256:e1ef3447b4307059e4a836e3786088498637323733f83a2f807b77f998d77610" ], "index": "pypi", - "version": "==0.0.2" + "version": "==0.0.3" }, "tornado": { "hashes": [ diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py index fbde3345c..483d3b12d 100644 --- a/src/documents/tests/utils.py +++ b/src/documents/tests/utils.py @@ -105,6 +105,20 @@ class FileSystemAssertsMixin: def assertIsNotDir(self, path: Union[PathLike, str]): self.assertFalse(Path(path).resolve().is_dir(), 
f"Dir does exist: {path}") + def assertFilesEqual( + self, + path1: Union[PathLike, str], + path2: Union[PathLike, str], + ): + path1 = Path(path1) + path2 = Path(path2) + import hashlib + + hash1 = hashlib.sha256(path1.read_bytes()).hexdigest() + hash2 = hashlib.sha256(path2.read_bytes()).hexdigest() + + self.assertEqual(hash1, hash2, "File SHA256 mismatch") + class ConsumerProgressMixin: def setUp(self) -> None: diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index 7cd5e06e6..3ec3e64a0 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -1,8 +1,7 @@ -import os import re from html import escape -from io import BytesIO -from io import StringIO +from pathlib import Path +from typing import List import httpx from bleach import clean @@ -11,8 +10,9 @@ from django.conf import settings from django.utils.timezone import is_naive from django.utils.timezone import make_aware from humanfriendly import format_size +from imap_tools import MailAttachment from imap_tools import MailMessage -from tika import parser +from tika_client import TikaClient from documents.parsers import DocumentParser from documents.parsers import ParseError @@ -22,33 +22,15 @@ from documents.parsers import make_thumbnail_from_pdf class MailDocumentParser(DocumentParser): """ This parser uses imap_tools to parse .eml files, generates pdf using - gotenbergs and sends the html part to a local tika server for text extraction. + Gotenberg and sends the html part to a Tika server for text extraction. 
""" gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT tika_server = settings.TIKA_ENDPOINT logging_name = "paperless.parsing.mail" - _parsed = None - def get_parsed(self, document_path) -> MailMessage: - if not self._parsed: - try: - with open(document_path, "rb") as eml: - self._parsed = MailMessage.from_bytes(eml.read()) - except Exception as err: - raise ParseError( - f"Could not parse {document_path}: {err}", - ) from err - if not self._parsed.from_values: - self._parsed = None - raise ParseError( - f"Could not parse {document_path}: Missing 'from'", - ) - - return self._parsed - - def get_thumbnail(self, document_path, mime_type, file_name=None): + def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None): if not self.archive_path: - self.archive_path = self.generate_pdf(document_path) + self.archive_path = self.generate_pdf(self.parse_file_to_message(document_path)) @@ -58,11 +40,11 @@ class MailDocumentParser(DocumentParser): self.logging_group, ) - def extract_metadata(self, document_path, mime_type): + def extract_metadata(self, document_path: Path, mime_type: str): result = [] try: - mail = self.get_parsed(document_path) + mail = self.parse_file_to_message(document_path) except ParseError as e: self.log.warning( f"Error while fetching document metadata for {document_path}: {e}", @@ -106,101 +88,157 @@ class MailDocumentParser(DocumentParser): result.sort(key=lambda item: (item["prefix"], item["key"])) return result - def parse(self, document_path, mime_type, file_name=None): + def parse(self, document_path: Path, mime_type: str, file_name=None): + """ + Parses the given .eml into formatted text, based on the decoded email. + + """ + def strip_text(text: str): + """ + Reduces the spacing of the given text string + """ text = re.sub(r"\s+", " ", text) text = re.sub(r"(\n *)+", "\n", text) return text.strip() - mail = self.get_parsed(document_path) + def build_formatted_text(mail_message: MailMessage) -> str: + """ + Constructs a formatted string, based on the given email. 
Basically tries + to get most of the email content, including front matter, into a nice string + """ + fmt_text = f"Subject: {mail_message.subject}\n\n" + fmt_text += f"From: {mail_message.from_values.full}\n\n" + to_list = [address.full for address in mail_message.to_values] + fmt_text += f"To: {', '.join(to_list)}\n\n" + if mail_message.cc_values: + fmt_text += ( + f"CC: {', '.join(address.full for address in mail_message.cc_values)}\n\n" + ) + if mail_message.bcc_values: + fmt_text += ( + f"BCC: {', '.join(address.full for address in mail_message.bcc_values)}\n\n" + ) + if mail_message.attachments: + att = [] + for a in mail_message.attachments: + att.append(f"{a.filename} ({format_size(a.size, binary=True)})") + fmt_text += f"Attachments: {', '.join(att)}\n\n" - self.text = f"Subject: {mail.subject}\n\n" - self.text += f"From: {mail.from_values.full}\n\n" - self.text += f"To: {', '.join(address.full for address in mail.to_values)}\n\n" - if len(mail.cc_values) >= 1: - self.text += ( - f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n" - ) - if len(mail.bcc_values) >= 1: - self.text += ( - f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n" - ) - if len(mail.attachments) >= 1: - att = [] - for a in mail.attachments: - att.append(f"{a.filename} ({format_size(a.size, binary=True)})") + if mail_message.html: + fmt_text += "HTML content: " + strip_text(self.tika_parse(mail_message.html)) - self.text += f"Attachments: {', '.join(att)}\n\n" + fmt_text += f"\n\n{strip_text(mail_message.text)}" - if mail.html: - self.text += "HTML content: " + strip_text(self.tika_parse(mail.html)) + return fmt_text - self.text += f"\n\n{strip_text(mail.text)}" + self.log.debug(f"Parsing file {document_path.name} into an email") + mail = self.parse_file_to_message(document_path) + + self.log.debug("Building formatted text from email") + self.text = build_formatted_text(mail) if is_naive(mail.date): self.date = make_aware(mail.date) else: self.date = mail.date - self.archive_path = 
self.generate_pdf(document_path) + self.log.debug("Creating a PDF from the email") + self.archive_path = self.generate_pdf(mail) + + @staticmethod + def parse_file_to_message(filepath: Path) -> MailMessage: + """ + Parses the given .eml file into a MailMessage object + """ + try: + with filepath.open("rb") as eml: + parsed = MailMessage.from_bytes(eml.read()) + except Exception as err: + raise ParseError( + f"Could not parse {filepath}: {err}", + ) from err + if parsed.from_values is None: + raise ParseError( + f"Could not parse {filepath}: Missing 'from'", + ) + + return parsed def tika_parse(self, html: str): self.log.info("Sending content to Tika server") try: - parsed = parser.from_buffer(html, self.tika_server) + with TikaClient(tika_url=self.tika_server) as client: + parsed = client.tika.as_text.from_buffer(html, "text/html") + + if "X-TIKA:content" in parsed.data: + return parsed.data["X-TIKA:content"].strip() + return "" except Exception as err: raise ParseError( f"Could not parse content with tika server at " f"{self.tika_server}: {err}", ) from err - if parsed["content"]: - return parsed["content"] + + def generate_pdf(self, mail_message: MailMessage) -> Path: + archive_path = Path(self.tempdir) / "merged.pdf" + + mail_pdf_file = self.generate_pdf_from_mail(mail_message) + + # If no HTML content, create the PDF from the message + # Otherwise, create 2 PDFs and merge them with Gotenberg + if not mail_message.html: + archive_path.write_bytes(mail_pdf_file.read_bytes()) else: - return "" + url_merge = self.gotenberg_server + "/forms/pdfengines/merge" - def generate_pdf(self, document_path): - pdf_collection = [] - url_merge = self.gotenberg_server + "/forms/pdfengines/merge" - pdf_path = os.path.join(self.tempdir, "merged.pdf") - mail = self.get_parsed(document_path) - - pdf_collection.append(("1_mail.pdf", self.generate_pdf_from_mail(mail))) - - if not mail.html: - with open(pdf_path, "wb") as file: - file.write(pdf_collection[0][1]) - file.close() - return 
pdf_path - else: - pdf_collection.append( - ( - "2_html.pdf", - self.generate_pdf_from_html(mail.html, mail.attachments), - ), + pdf_of_html_content = self.generate_pdf_from_html( + mail_message.html, + mail_message.attachments, ) - files = {} - for name, content in pdf_collection: - files[name] = (name, BytesIO(content)) - headers = {} - try: - response = httpx.post(url_merge, files=files, headers=headers) - response.raise_for_status() # ensure we notice bad responses - except Exception as err: - raise ParseError(f"Error while converting document to PDF: {err}") from err + pdf_collection = { + "1_mail.pdf": ("1_mail.pdf", mail_pdf_file, "application/pdf"), + "2_html.pdf": ("2_html.pdf", pdf_of_html_content, "application/pdf"), + } - with open(pdf_path, "wb") as file: - file.write(response.content) - file.close() + try: + # Open a handle to each file, replacing the tuple + for filename in pdf_collection: + file_multi_part = pdf_collection[filename] + pdf_collection[filename] = ( + file_multi_part[0], + file_multi_part[1].open("rb"), + file_multi_part[2], + ) - return pdf_path + response = httpx.post(url_merge, files=pdf_collection) + response.raise_for_status() # ensure we notice bad responses - @staticmethod - def mail_to_html(mail: MailMessage) -> StringIO: - data = {} + archive_path.write_bytes(response.content) - def clean_html(text: str): + except Exception as err: + raise ParseError( + f"Error while merging email HTML into PDF: {err}", + ) from err + finally: + for filename in pdf_collection: + file_multi_part_handle = pdf_collection[filename][1] + file_multi_part_handle.close() + + return archive_path + + def mail_to_html(self, mail: MailMessage) -> Path: + """ + Converts the given email into an HTML file, formatted + based on the given template + """ + + def clean_html(text: str) -> str: + """ + Attempts to clean, escape and linkify the given HTML string + """ if isinstance(text, list): text = "\n".join([str(e) for e in text]) if type(text) != str: @@ 
-211,6 +249,8 @@ class MailDocumentParser(DocumentParser): text = text.replace("\n", "<br>") return text + data = {} + data["subject"] = clean_html(mail.subject) if data["subject"]: data["subject_label"] = "Subject" @@ -237,27 +277,33 @@ class MailDocumentParser(DocumentParser): data["date"] = clean_html(mail.date.astimezone().strftime("%Y-%m-%d %H:%M")) data["content"] = clean_html(mail.text.strip()) - html = StringIO() - from django.template.loader import render_to_string - rendered = render_to_string("email_msg_template.html", context=data) + html_file = Path(self.tempdir) / "email_as_html.html" + html_file.write_text(render_to_string("email_msg_template.html", context=data)) - html.write(rendered) - html.seek(0) + return html_file - return html - - def generate_pdf_from_mail(self, mail): + def generate_pdf_from_mail(self, mail: MailMessage) -> Path: + """ + Creates a PDF based on the given email, using the email's values in + an HTML template + """ url = self.gotenberg_server + "/forms/chromium/convert/html" self.log.info("Converting mail to PDF") - css_file = os.path.join(os.path.dirname(__file__), "templates/output.css") + css_file = Path(__file__).parent / "templates" / "output.css" + email_html_file = self.mail_to_html(mail) - with open(css_file, "rb") as css_handle: + self.log.debug(f"Using CSS file {css_file}") + self.log.debug(f"Using email HTML file {email_html_file}") + + with css_file.open("rb") as css_handle, email_html_file.open( + "rb", + ) as email_html_handle: files = { - "html": ("index.html", self.mail_to_html(mail)), - "css": ("output.css", css_handle), + "html": ("index.html", email_html_handle, "text/html"), + "css": ("output.css", css_handle, "text/css"), } headers = {} data = { @@ -289,13 +335,23 @@ class MailDocumentParser(DocumentParser): response.raise_for_status() # ensure we notice bad responses except Exception as err: raise ParseError( - f"Error while converting document to PDF: {err}", + f"Error while converting email to PDF: {err}", ) from err - return response.content + email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf" + email_as_pdf_file.write_bytes(response.content) + 
+ return email_as_pdf_file + + def generate_pdf_from_html( + self, + orig_html: str, + attachments: List[MailAttachment], + ) -> Path: + """ + Generates a PDF file based on the HTML and attachments of the email + """ - @staticmethod - def transform_inline_html(html, attachments): def clean_html_script(text: str): compiled_open = re.compile(re.escape("