diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py
index f981023f3..b701b6e00 100644
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -16,9 +16,12 @@ from tika import parser
class MailDocumentParser(DocumentParser):
"""
- This parser sends documents to a local tika server
+ This parser uses imap_tools to parse .eml files, generates a PDF using
+ Gotenberg, and sends the HTML part to a local Tika server for text extraction.
"""
+ gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
+
logging_name = "paperless.parsing.mail"
_parsed = None
@@ -93,7 +96,7 @@ class MailDocumentParser(DocumentParser):
return result
def parse(self, document_path, mime_type, file_name=None):
- def strip_content(text: str):
+ def strip_text(text: str):
text = re.sub("\t", " ", text)
text = re.sub(" +", " ", text)
text = re.sub("(\n *)+", "\n", text)
@@ -101,7 +104,7 @@ class MailDocumentParser(DocumentParser):
mail = self.get_parsed(document_path)
- self.text = f"{strip_content(mail.text)}\n\n"
+ self.text = f"{strip_text(mail.text)}\n\n"
self.text += f"Subject: {mail.subject}\n\n"
self.text += f"From: {mail.from_values.full}\n\n"
self.text += f"To: {', '.join(address.full for address in mail.to_values)}\n\n"
@@ -118,17 +121,17 @@ class MailDocumentParser(DocumentParser):
self.text += f"Attachments: {att}\n\n"
if mail.html != "":
- self.text += "HTML content: " + strip_content(self.tika_parse(mail.html))
+ self.text += "HTML content: " + strip_text(self.tika_parse(mail.html))
self.date = mail.date
self.archive_path = self.generate_pdf(document_path)
- def tika_parse(self, input):
+ def tika_parse(self, html: str):
self.log("info", "Sending content to Tika server")
tika_server = settings.PAPERLESS_TIKA_ENDPOINT
try:
- parsed = parser.from_buffer(input, tika_server)
+ parsed = parser.from_buffer(html, tika_server)
except Exception as err:
raise ParseError(
f"Could not parse content with tika server at " f"{tika_server}: {err}",
@@ -139,6 +142,46 @@ class MailDocumentParser(DocumentParser):
return ""
def generate_pdf(self, document_path):
+        """Render the mail at ``document_path`` to a PDF and return its path.
+
+        The mail PDF is always produced; when an HTML part exists, it is
+        rendered too and both are merged by Gotenberg into ``merged.pdf``.
+        Raises ParseError when the merge request fails.
+        """
+        pdf_path = os.path.join(self.tempdir, "merged.pdf")
+        mail = self.get_parsed(document_path)
+        pdf_collection = [("1_mail.pdf", self.generate_pdf_from_mail(mail))]
+        if mail.html != "":
+            pdf_collection.append(
+                (
+                    "2_html.pdf",
+                    self.generate_pdf_from_html(mail.html, mail.attachments),
+                ),
+            )
+
+        if len(pdf_collection) == 1:
+            # Nothing to merge: write the single PDF without a server call.
+            with open(pdf_path, "wb") as file:
+                file.write(pdf_collection[0][1])
+            return pdf_path
+
+        url_merge = self.gotenberg_server + "/forms/pdfengines/merge"
+        files = {name: (name, BytesIO(content)) for name, content in pdf_collection}
+        try:
+            response = requests.post(url_merge, files=files, headers={})
+            response.raise_for_status()  # ensure we notice bad responses
+        except Exception as err:
+            raise ParseError(
+                f"Error while converting document to PDF: {err}",
+            ) from err
+
+        with open(pdf_path, "wb") as file:
+            file.write(response.content)
+        return pdf_path
+
+ def mail_to_html(self, mail):
+ data = {}
+
def clean_html(text: str):
if isinstance(text, list):
text = "\n".join([str(e) for e in text])
@@ -155,135 +198,104 @@ class MailDocumentParser(DocumentParser):
text = text.replace("\n", "
")
return text
- def clean_html_script(text: str):
- text = text.replace("