From 466afa820352a0da59fc60b3d0e08fcc037ac11b Mon Sep 17 00:00:00 2001 From: phail Date: Wed, 4 May 2022 23:42:59 +0200 Subject: [PATCH] fix consumption of mails without html split pdf generation functions --- src/paperless_mail/parsers.py | 212 ++++++++++++++++++---------------- 1 file changed, 112 insertions(+), 100 deletions(-) diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index f981023f3..b701b6e00 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -16,9 +16,12 @@ from tika import parser class MailDocumentParser(DocumentParser): """ - This parser sends documents to a local tika server + This parser uses imap_tools to parse .eml files, generates pdf using + gotenbergs and sends the html part to a local tika server for text extraction. """ + gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT + logging_name = "paperless.parsing.mail" _parsed = None @@ -93,7 +96,7 @@ class MailDocumentParser(DocumentParser): return result def parse(self, document_path, mime_type, file_name=None): - def strip_content(text: str): + def strip_text(text: str): text = re.sub("\t", " ", text) text = re.sub(" +", " ", text) text = re.sub("(\n *)+", "\n", text) @@ -101,7 +104,7 @@ class MailDocumentParser(DocumentParser): mail = self.get_parsed(document_path) - self.text = f"{strip_content(mail.text)}\n\n" + self.text = f"{strip_text(mail.text)}\n\n" self.text += f"Subject: {mail.subject}\n\n" self.text += f"From: {mail.from_values.full}\n\n" self.text += f"To: {', '.join(address.full for address in mail.to_values)}\n\n" @@ -118,17 +121,17 @@ class MailDocumentParser(DocumentParser): self.text += f"Attachments: {att}\n\n" if mail.html != "": - self.text += "HTML content: " + strip_content(self.tika_parse(mail.html)) + self.text += "HTML content: " + strip_text(self.tika_parse(mail.html)) self.date = mail.date self.archive_path = self.generate_pdf(document_path) - def tika_parse(self, input): + def tika_parse(self, html: str): self.log("info", "Sending content to Tika server") tika_server = settings.PAPERLESS_TIKA_ENDPOINT try: - parsed = parser.from_buffer(input, tika_server) + parsed = parser.from_buffer(html, tika_server) except Exception as err: raise ParseError( f"Could not parse content with tika server at " f"{tika_server}: {err}", @@ -139,6 +142,46 @@ class MailDocumentParser(DocumentParser): return "" def generate_pdf(self, document_path): + pdf_collection = [] + url_merge = self.gotenberg_server + "/forms/pdfengines/merge" + pdf_path = os.path.join(self.tempdir, "merged.pdf") + mail = self.get_parsed(document_path) + + pdf_collection.append(("1_mail.pdf", self.generate_pdf_from_mail(mail))) + + if mail.html != "": + pdf_collection.append( + ( + "2_html.pdf", + self.generate_pdf_from_html(mail.html, mail.attachments), + ), + ) + + if len(pdf_collection) == 1: + with open(pdf_path, "wb") as file: + file.write(pdf_collection[0][1]) + file.close() + return pdf_path + + files = {} + for name, content in pdf_collection: + files[name] = (name, BytesIO(content)) + headers = {} + try: + response = requests.post(url_merge, files=files, headers=headers) + response.raise_for_status() # ensure we notice bad responses + except Exception as err: + raise ParseError(f"Error while converting document to PDF: {err}") + + with open(pdf_path, "wb") as file: + file.write(response.content) + file.close() + + return pdf_path + + def mail_to_html(self, mail): + data = {} + def clean_html(text: str): if isinstance(text, list): text = "\n".join([str(e) for e in text]) @@ -155,135 +198,104 @@ class MailDocumentParser(DocumentParser): text = text.replace("\n", "
") return text - def clean_html_script(text: str): - text = text.replace("