From 027897ff0309423f524626b894981298a3606c8b Mon Sep 17 00:00:00 2001 From: phail Date: Tue, 19 Apr 2022 00:39:00 +0200 Subject: [PATCH] work in progress Mail parsing --- Pipfile | 1 + src/paperless_mail/mail.py | 2 +- src/paperless_tika/apps.py | 2 + src/paperless_tika/parsers.py | 148 ++++++++++++++++++++++++++++++++++ src/paperless_tika/signals.py | 16 ++++ 5 files changed, 168 insertions(+), 1 deletion(-) diff --git a/Pipfile b/Pipfile index 0feabe237..da2644251 100644 --- a/Pipfile +++ b/Pipfile @@ -53,6 +53,7 @@ concurrent-log-handler = "*" zipp = {version = "*", markers = "python_version < '3.9'"} pyzbar = "*" pdf2image = "*" +click = "==8.0.4" [dev-packages] coveralls = "*" diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py index 72a74639c..865581446 100644 --- a/src/paperless_mail/mail.py +++ b/src/paperless_mail/mail.py @@ -199,7 +199,7 @@ class MailAccountHandler(LoggingMixin): return total_processed_files - def handle_mail_rule(self, M, rule): + def handle_mail_rule(self, M, rule: MailRule): self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}") diff --git a/src/paperless_tika/apps.py b/src/paperless_tika/apps.py index 5cab21427..791d234a0 100644 --- a/src/paperless_tika/apps.py +++ b/src/paperless_tika/apps.py @@ -1,6 +1,7 @@ from django.apps import AppConfig from django.conf import settings from paperless_tika.signals import tika_consumer_declaration +from paperless_tika.signals import tika_consumer_declaration_eml class PaperlessTikaConfig(AppConfig): @@ -11,4 +12,5 @@ class PaperlessTikaConfig(AppConfig): if settings.PAPERLESS_TIKA_ENABLED: document_consumer_declaration.connect(tika_consumer_declaration) + document_consumer_declaration.connect(tika_consumer_declaration_eml) AppConfig.ready(self) diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 22218dfe7..294f637ef 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -1,4 +1,6 @@ import os +import re +from io import StringIO import dateutil.parser import requests @@ -6,6 +8,9 @@ from django.conf import settings from documents.parsers import DocumentParser from documents.parsers import make_thumbnail_from_pdf from documents.parsers import ParseError +from PIL import Image +from PIL import ImageDraw +from PIL import ImageFont from tika import parser @@ -97,3 +102,146 @@ class TikaDocumentParser(DocumentParser): file.close() return pdf_path + + +class TikaDocumentParserEml(DocumentParser): + """ + This parser sends documents to a local tika server + """ + + logging_name = "paperless.parsing.tikaeml" + + def get_thumbnail(self, document_path, mime_type, file_name=None): + + img = Image.new("RGB", (500, 700), color="white") + draw = ImageDraw.Draw(img) + font = ImageFont.truetype( + font=settings.THUMBNAIL_FONT_NAME, + size=20, + layout_engine=ImageFont.LAYOUT_BASIC, + ) + draw.text((5, 5), self.text, font=font, fill="black") + + out_path = os.path.join(self.tempdir, "thumb.png") + img.save(out_path) + + return out_path + + def extract_metadata(self, document_path, mime_type): + tika_server = settings.PAPERLESS_TIKA_ENDPOINT + try: + parsed = parser.from_file(document_path, tika_server) + except Exception as e: + self.log( + "warning", + f"Error while fetching document metadata for " f"{document_path}: {e}", + ) + return [] + + return [ + { + "namespace": "", + "prefix": "", + "key": key, + "value": parsed["metadata"][key], + } + for key in parsed["metadata"] + ] + + def parse(self, document_path, mime_type, file_name=None): + self.log("info", f"Sending {document_path} to Tika server") + tika_server = settings.PAPERLESS_TIKA_ENDPOINT + + try: + parsed = parser.from_file(document_path, tika_server) + except Exception as err: + raise ParseError( + f"Could not parse {document_path} with tika server at " + f"{tika_server}: {err}", + ) + + text = re.sub(" +", " ", str(parsed)) + text = re.sub("\n+", "\n", text) + self.text = text + + print(text) + + try: + self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"]) + except Exception as e: + self.log( + "warning", + f"Unable to extract date for document " f"{document_path}: {e}", + ) + + md_path = self.convert_to_md(document_path, file_name) + self.archive_path = self.convert_md_to_pdf(md_path) + + def convert_md_to_pdf(self, md_path): + pdf_path = os.path.join(self.tempdir, "convert.pdf") + gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT + url = gotenberg_server + "/forms/chromium/convert/markdown" + + self.log("info", f"Converting {md_path} to PDF as {pdf_path}") + html = StringIO( + """ + + + + + My PDF + + + {{ toHTML "convert.md" }} + + + """, + ) + md = StringIO( + """ +# Subject + +blub \nblah +blib + """, + ) + + files = { + "md": ( + os.path.basename(md_path), + md, + ), + "html": ( + "index.html", + html, + ), + } + headers = {} + + try: + response = requests.post(url, files=files, headers=headers) + response.raise_for_status() # ensure we notice bad responses + except Exception as err: + raise ParseError(f"Error while converting document to PDF: {err}") + + with open(pdf_path, "wb") as file: + file.write(response.content) + file.close() + + return pdf_path + + def convert_to_md(self, document_path, file_name): + md_path = os.path.join(self.tempdir, "convert.md") + + self.log("info", f"Converting {document_path} to markdown as {md_path}") + + with open(md_path, "w") as file: + md = [ + "# Subject", + "\n\n", + "blah", + ] + file.writelines(md) + file.close() + + return md_path diff --git a/src/paperless_tika/signals.py b/src/paperless_tika/signals.py index 39838f076..a852cfdb2 100644 --- a/src/paperless_tika/signals.py +++ b/src/paperless_tika/signals.py @@ -22,3 +22,19 @@ def tika_consumer_declaration(sender, **kwargs): "text/rtf": ".rtf", }, } + + +def get_parser_eml(*args, **kwargs): + from .parsers import TikaDocumentParserEml + + return TikaDocumentParserEml(*args, **kwargs) + + +def tika_consumer_declaration_eml(sender, **kwargs): + return { + "parser": get_parser_eml, + "weight": 10, + "mime_types": { + "message/rfc822": ".eml", + }, + }