work in progress Mail parsing

2026-02-01 23:19:00 -06:00 · 2022-04-19 00:39:00 +02:00
parent cca576f518
commit 027897ff03
5 changed files with 168 additions and 1 deletions
--- a/1
+++ b/1
@@ -53,6 +53,7 @@ concurrent-log-handler = "*"
 zipp = {version = "*", markers = "python_version < '3.9'"}
 pyzbar = "*"
 pdf2image = "*"
+click = "==8.0.4"

 [dev-packages]
 coveralls = "*"
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@@ -199,7 +199,7 @@ class MailAccountHandler(LoggingMixin):

        return total_processed_files

-    def handle_mail_rule(self, M, rule):
+    def handle_mail_rule(self, M, rule: MailRule):

        self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}")

--- a/src/paperless_tika/apps.py
+++ b/src/paperless_tika/apps.py
@@ -1,6 +1,7 @@
 from django.apps import AppConfig
 from django.conf import settings
 from paperless_tika.signals import tika_consumer_declaration
+from paperless_tika.signals import tika_consumer_declaration_eml


 class PaperlessTikaConfig(AppConfig):
@@ -11,4 +12,5 @@ class PaperlessTikaConfig(AppConfig):

        if settings.PAPERLESS_TIKA_ENABLED:
            document_consumer_declaration.connect(tika_consumer_declaration)
+            document_consumer_declaration.connect(tika_consumer_declaration_eml)
        AppConfig.ready(self)
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -1,4 +1,6 @@
 import os
+import re
+from io import StringIO

 import dateutil.parser
 import requests
@@ -6,6 +8,9 @@ from django.conf import settings
 from documents.parsers import DocumentParser
 from documents.parsers import make_thumbnail_from_pdf
 from documents.parsers import ParseError
+from PIL import Image
+from PIL import ImageDraw
+from PIL import ImageFont
 from tika import parser


@@ -97,3 +102,146 @@ class TikaDocumentParser(DocumentParser):
            file.close()

        return pdf_path
+
+
+class TikaDocumentParserEml(DocumentParser):
+    """
+    Work-in-progress parser for e-mail (.eml) documents.
+
+    The document is sent to the configured Tika server for text and metadata
+    extraction, and a Markdown rendering is sent to Gotenberg to produce the
+    archive PDF.
+    """
+
+    logging_name = "paperless.parsing.tikaeml"
+
+    def get_thumbnail(self, document_path, mime_type, file_name=None):
+        """
+        Draw ``self.text`` onto a white 500x700 image and return its path.
+
+        The image is saved as ``thumb.png`` inside ``self.tempdir``.
+        """
+        # Pillow 9.1 introduced the ImageFont.Layout enum and Pillow 10
+        # removed the old LAYOUT_* constants, so accept either spelling.
+        layout_enum = getattr(ImageFont, "Layout", None)
+        if layout_enum is not None:
+            layout_engine = layout_enum.BASIC
+        else:
+            layout_engine = ImageFont.LAYOUT_BASIC
+
+        img = Image.new("RGB", (500, 700), color="white")
+        draw = ImageDraw.Draw(img)
+        font = ImageFont.truetype(
+            font=settings.THUMBNAIL_FONT_NAME,
+            size=20,
+            layout_engine=layout_engine,
+        )
+        draw.text((5, 5), self.text, font=font, fill="black")
+
+        out_path = os.path.join(self.tempdir, "thumb.png")
+        img.save(out_path)
+
+        return out_path
+
+    def extract_metadata(self, document_path, mime_type):
+        """
+        Return the metadata reported by Tika as a list of dicts.
+
+        Each entry carries the keys ``namespace``, ``prefix``, ``key`` and
+        ``value``. If the Tika call raises, a warning is logged and an empty
+        list is returned.
+        """
+        tika_server = settings.PAPERLESS_TIKA_ENDPOINT
+        try:
+            parsed = parser.from_file(document_path, tika_server)
+        except Exception as e:
+            self.log(
+                "warning",
+                f"Error while fetching document metadata for {document_path}: {e}",
+            )
+            return []
+
+        # The response may carry no metadata at all (missing or None);
+        # treat that as "nothing to report" instead of failing.
+        metadata = parsed.get("metadata") or {}
+
+        return [
+            {
+                "namespace": "",
+                "prefix": "",
+                "key": key,
+                "value": value,
+            }
+            for key, value in metadata.items()
+        ]
+
+    def parse(self, document_path, mime_type, file_name=None):
+        """
+        Parse the document with Tika and build the archive PDF.
+
+        Sets ``self.text``, ``self.date`` (only when a creation date can be
+        read from the Tika metadata) and ``self.archive_path``.
+
+        Raises ParseError if the Tika request or the PDF conversion fails.
+        """
+        self.log("info", f"Sending {document_path} to Tika server")
+        tika_server = settings.PAPERLESS_TIKA_ENDPOINT
+
+        try:
+            parsed = parser.from_file(document_path, tika_server)
+        except Exception as err:
+            raise ParseError(
+                f"Could not parse {document_path} with tika server at "
+                f"{tika_server}: {err}",
+            ) from err
+
+        # NOTE(review): this stringifies the whole Tika response (metadata
+        # included), not only its text content -- confirm this is intended.
+        text = re.sub(" +", " ", str(parsed))
+        text = re.sub("\n+", "\n", text)
+        self.text = text
+
+        try:
+            self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"])
+        except Exception as e:
+            # A missing or malformed date is not fatal; keep going without it.
+            self.log(
+                "warning",
+                f"Unable to extract date for document {document_path}: {e}",
+            )
+
+        md_path = self.convert_to_md(document_path, file_name)
+        self.archive_path = self.convert_md_to_pdf(md_path)
+
+    def convert_md_to_pdf(self, md_path):
+        """
+        Convert the Markdown file at ``md_path`` to PDF via Gotenberg.
+
+        The Markdown file is uploaded together with an ``index.html`` wrapper
+        template. The returned PDF is written to ``convert.pdf`` inside
+        ``self.tempdir`` and that path is returned.
+
+        Raises ParseError if the Markdown file cannot be read or the
+        Gotenberg request fails.
+        """
+        pdf_path = os.path.join(self.tempdir, "convert.pdf")
+        gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
+        url = gotenberg_server + "/forms/chromium/convert/markdown"
+
+        self.log("info", f"Converting {md_path} to PDF as {pdf_path}")
+        # NOTE: the template refers to the Markdown upload by the file name
+        # "convert.md", which is the name convert_to_md() writes to.
+        html = StringIO(
+            """
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>My PDF</title>
+  </head>
+  <body>
+    {{ toHTML "convert.md" }}
+  </body>
+</html>
+        """,
+        )
+        headers = {}
+
+        try:
+            # Upload the actual Markdown file rather than placeholder text.
+            with open(md_path, "rb") as md_file:
+                files = {
+                    "md": (
+                        os.path.basename(md_path),
+                        md_file,
+                    ),
+                    "html": (
+                        "index.html",
+                        html,
+                    ),
+                }
+                response = requests.post(url, files=files, headers=headers)
+            response.raise_for_status()  # ensure we notice bad responses
+        except Exception as err:
+            raise ParseError(
+                f"Error while converting document to PDF: {err}",
+            ) from err
+
+        with open(pdf_path, "wb") as file:
+            file.write(response.content)
+
+        return pdf_path
+
+    def convert_to_md(self, document_path, file_name):
+        """
+        Write the Markdown rendering to ``convert.md`` and return its path.
+
+        NOTE: still a placeholder -- the text written does not yet depend on
+        ``document_path`` or ``file_name``.
+        """
+        md_path = os.path.join(self.tempdir, "convert.md")
+
+        self.log("info", f"Converting {document_path} to markdown as {md_path}")
+
+        md = [
+            "# Subject",
+            "\n\n",
+            "blah",
+        ]
+        with open(md_path, "w", encoding="utf-8") as file:
+            file.writelines(md)
+
+        return md_path
--- a/src/paperless_tika/signals.py
+++ b/src/paperless_tika/signals.py
@@ -22,3 +22,19 @@ def tika_consumer_declaration(sender, **kwargs):
            "text/rtf": ".rtf",
        },
    }
+
+
+def get_parser_eml(*args, **kwargs):
+    """
+    Build a TikaDocumentParserEml, forwarding all arguments unchanged.
+
+    The parser class is imported only when this function is called.
+    """
+    from .parsers import TikaDocumentParserEml as parser_class
+
+    eml_parser = parser_class(*args, **kwargs)
+    return eml_parser
+
+
+def tika_consumer_declaration_eml(sender, **kwargs):
+    """
+    Declare the e-mail parser: its factory, weight and supported MIME types.
+
+    ``sender`` and any extra keyword arguments are accepted but not used.
+    """
+    # Maps each handled MIME type to its default file extension.
+    supported_mime_types = {"message/rfc822": ".eml"}
+
+    declaration = dict(
+        parser=get_parser_eml,
+        weight=10,
+        mime_types=supported_mime_types,
+    )
+    return declaration