diff --git a/Pipfile b/Pipfile index 0feabe237..da2644251 100644 --- a/Pipfile +++ b/Pipfile @@ -53,6 +53,7 @@ concurrent-log-handler = "*" zipp = {version = "*", markers = "python_version < '3.9'"} pyzbar = "*" pdf2image = "*" +click = "==8.0.4" [dev-packages] coveralls = "*" diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py index 72a74639c..865581446 100644 --- a/src/paperless_mail/mail.py +++ b/src/paperless_mail/mail.py @@ -199,7 +199,7 @@ class MailAccountHandler(LoggingMixin): return total_processed_files - def handle_mail_rule(self, M, rule): + def handle_mail_rule(self, M, rule: MailRule): self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}") diff --git a/src/paperless_tika/apps.py b/src/paperless_tika/apps.py index 5cab21427..791d234a0 100644 --- a/src/paperless_tika/apps.py +++ b/src/paperless_tika/apps.py @@ -1,6 +1,7 @@ from django.apps import AppConfig from django.conf import settings from paperless_tika.signals import tika_consumer_declaration +from paperless_tika.signals import tika_consumer_declaration_eml class PaperlessTikaConfig(AppConfig): @@ -11,4 +12,5 @@ class PaperlessTikaConfig(AppConfig): if settings.PAPERLESS_TIKA_ENABLED: document_consumer_declaration.connect(tika_consumer_declaration) + document_consumer_declaration.connect(tika_consumer_declaration_eml) AppConfig.ready(self) diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 22218dfe7..294f637ef 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -1,4 +1,6 @@ import os +import re +from io import StringIO import dateutil.parser import requests @@ -6,6 +8,9 @@ from django.conf import settings from documents.parsers import DocumentParser from documents.parsers import make_thumbnail_from_pdf from documents.parsers import ParseError +from PIL import Image +from PIL import ImageDraw +from PIL import ImageFont from tika import parser @@ -97,3 +102,146 @@ class TikaDocumentParser(DocumentParser): 
file.close() return pdf_path
+
+
+class TikaDocumentParserEml(DocumentParser):
+    """
+    Parser for EML documents that extracts text and metadata through the
+    Tika server configured in ``settings.PAPERLESS_TIKA_ENDPOINT``.
+    """
+
+    logging_name = "paperless.parsing.tikaeml"
+
+    def get_thumbnail(self, document_path, mime_type, file_name=None):
+        """
+        Draw the extracted text onto a white 500x700 image.
+
+        None of the arguments are read; the thumbnail is rendered from
+        ``self.text``, which ``parse`` assigns.
+
+        Returns the path of the PNG written into ``self.tempdir``.
+        """
+        img = Image.new("RGB", (500, 700), color="white")
+        draw = ImageDraw.Draw(img)
+        font = ImageFont.truetype(
+            font=settings.THUMBNAIL_FONT_NAME,
+            size=20,
+            layout_engine=ImageFont.LAYOUT_BASIC,
+        )
+        # Guard against self.text still being None (nothing parsed yet):
+        # draw a blank thumbnail rather than hand Pillow a non-string.
+        draw.text((5, 5), self.text or "", font=font, fill="black")
+
+        out_path = os.path.join(self.tempdir, "thumb.png")
+        img.save(out_path)
+
+        return out_path
+
+    def extract_metadata(self, document_path, mime_type):
+        """
+        Return the metadata Tika reports for ``document_path``.
+
+        Returns a list of dicts with the keys "namespace", "prefix", "key"
+        and "value", one per metadata entry. The list is empty when the
+        Tika request fails (a warning is logged) or when the response
+        carries no metadata.
+        """
+        tika_server = settings.PAPERLESS_TIKA_ENDPOINT
+        try:
+            parsed = parser.from_file(document_path, tika_server)
+        except Exception as e:
+            # Metadata is best-effort: log and carry on without it.
+            self.log(
+                "warning",
+                "Error while fetching document metadata for "
+                f"{document_path}: {e}",
+            )
+            return []
+
+        # The "metadata" key can be absent or empty in a Tika result; treat
+        # both as "no metadata" instead of raising outside the try block.
+        metadata = parsed.get("metadata") or {}
+        return [
+            {
+                "namespace": "",
+                "prefix": "",
+                "key": key,
+                "value": value,
+            }
+            for key, value in metadata.items()
+        ]
+
+    def parse(self, document_path, mime_type, file_name=None):
+        """
+        Extract text and creation date via Tika and build the archive PDF.
+
+        Sets ``self.text`` (runs of spaces and of newlines collapsed),
+        ``self.date`` when the "Creation-Date" metadata entry parses as an
+        ISO date, and ``self.archive_path`` from ``convert_to_md`` followed
+        by ``convert_md_to_pdf``.
+
+        Raises ParseError if the Tika request fails.
+        """
+        self.log("info", f"Sending {document_path} to Tika server")
+        tika_server = settings.PAPERLESS_TIKA_ENDPOINT
+
+        try:
+            parsed = parser.from_file(document_path, tika_server)
+        except Exception as err:
+            raise ParseError(
+                f"Could not parse {document_path} with tika server at "
+                f"{tika_server}: {err}",
+            ) from err
+
+        # Use the extracted body text, not str(parsed): the dict repr embeds
+        # keys, quotes and metadata, and escapes newlines so that the
+        # newline collapsing below could never match.
+        content = parsed.get("content") or ""
+        text = re.sub(" +", " ", content)
+        self.text = re.sub("\n+", "\n", text)
+
+        try:
+            self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"])
+        except Exception as e:
+            # A missing or malformed date must not abort the parse.
+            self.log(
+                "warning",
+                f"Unable to extract date for document {document_path}: {e}",
+            )
+
+        md_path = self.convert_to_md(document_path, file_name)
+        self.archive_path = self.convert_md_to_pdf(md_path)
+ + def convert_md_to_pdf(self, md_path): + pdf_path = os.path.join(self.tempdir, "convert.pdf") + gotenberg_server =
settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT + url = gotenberg_server + "/forms/chromium/convert/markdown" + + self.log("info", f"Converting {md_path} to PDF as {pdf_path}") + html = StringIO( + """ + + +
+ +