From 790bcf05ed7f478dd497a8f4fcb47c10063a7859 Mon Sep 17 00:00:00 2001 From: phail Date: Mon, 25 Apr 2022 20:55:00 +0200 Subject: [PATCH] add prototype archive pdf --- src/paperless_mail/mail_template/index.html | 46 +++++ src/paperless_mail/mail_template/output.css | 0 src/paperless_tika/parsers.py | 186 ++++++++++++-------- 3 files changed, 157 insertions(+), 75 deletions(-) create mode 100644 src/paperless_mail/mail_template/index.html create mode 100644 src/paperless_mail/mail_template/output.css diff --git a/src/paperless_mail/mail_template/index.html b/src/paperless_mail/mail_template/index.html new file mode 100644 index 000000000..b1f332f75 --- /dev/null +++ b/src/paperless_mail/mail_template/index.html @@ -0,0 +1,46 @@ + + + + + + + + + + +
+ +
+ +
23.04.2022 18:18
+ +
From
+
{{ from }}
+ +
Subject
+
{{ Subject }} +
+ +
To
+
{{ To }} +
+ +
CC
+
{{ CC }} +
+ +
BCC
+
{{ BCC }} +
+
+ + +
+ + +
{{ content }} +
+ + + + diff --git a/src/paperless_mail/mail_template/output.css b/src/paperless_mail/mail_template/output.css new file mode 100644 index 000000000..e69de29bb diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 294f637ef..9eed095b3 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -8,9 +8,6 @@ from django.conf import settings from documents.parsers import DocumentParser from documents.parsers import make_thumbnail_from_pdf from documents.parsers import ParseError -from PIL import Image -from PIL import ImageDraw -from PIL import ImageFont from tika import parser @@ -112,22 +109,19 @@ class TikaDocumentParserEml(DocumentParser): logging_name = "paperless.parsing.tikaeml" def get_thumbnail(self, document_path, mime_type, file_name=None): + if not self.archive_path: + self.archive_path = self.generate_pdf(document_path) - img = Image.new("RGB", (500, 700), color="white") - draw = ImageDraw.Draw(img) - font = ImageFont.truetype( - font=settings.THUMBNAIL_FONT_NAME, - size=20, - layout_engine=ImageFont.LAYOUT_BASIC, + return make_thumbnail_from_pdf( + self.archive_path, + self.tempdir, + self.logging_group, ) - draw.text((5, 5), self.text, font=font, fill="black") - - out_path = os.path.join(self.tempdir, "thumb.png") - img.save(out_path) - - return out_path def extract_metadata(self, document_path, mime_type): + result = [] + prefix_pattern = re.compile(r"(.*):(.*)") + tika_server = settings.PAPERLESS_TIKA_ENDPOINT try: parsed = parser.from_file(document_path, tika_server) @@ -136,17 +130,38 @@ class TikaDocumentParserEml(DocumentParser): "warning", f"Error while fetching document metadata for " f"{document_path}: {e}", ) - return [] + return result - return [ - { - "namespace": "", - "prefix": "", - "key": key, - "value": parsed["metadata"][key], - } - for key in parsed["metadata"] - ] + for key, value in parsed["metadata"].items(): + if isinstance(value, list): + value = ", ".join([str(e) for e in value]) + value = str(value) + try: + m = prefix_pattern.match(key) + result.append( + { + "namespace": "", + "prefix": m.group(1), + "key": m.group(2), + "value": value, + }, + ) + except AttributeError: + result.append( + { + "namespace": "", + "prefix": "", + "key": key, + "value": value, + }, + ) + except Exception as e: + self.log( + "warning", + f"Error while reading metadata {key}: {value}. Error: " f"{e}", + ) + result.sort(key=lambda item: (item["prefix"], item["key"])) + return result def parse(self, document_path, mime_type, file_name=None): self.log("info", f"Sending {document_path} to Tika server") @@ -160,57 +175,94 @@ class TikaDocumentParserEml(DocumentParser): f"{tika_server}: {err}", ) - text = re.sub(" +", " ", str(parsed)) - text = re.sub("\n+", "\n", text) - self.text = text + metadata = parsed["metadata"].copy() - print(text) + subject = metadata.pop("dc:subject", "") + content = parsed["content"].strip() + + if content.startswith(subject): + content = content[len(subject) :].strip() + + content = re.sub(" +", " ", content) + content = re.sub("\n+", "\n", content) + + self.text = ( + f"{content}\n" + f"______________________\n" + f"From: {metadata.pop('Message-From', '')}\n" + f"To: {metadata.pop('Message-To', '')}\n" + f"CC: {metadata.pop('Message-CC', '')}" + ) try: - self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"]) + self.date = dateutil.parser.isoparse(parsed["metadata"]["dcterms:created"]) except Exception as e: self.log( "warning", f"Unable to extract date for document " f"{document_path}: {e}", ) - md_path = self.convert_to_md(document_path, file_name) - self.archive_path = self.convert_md_to_pdf(md_path) + self.archive_path = self.generate_pdf(document_path, parsed) + + def generate_pdf(self, document_path, parsed=None): + if not parsed: + self.log("info", f"Sending {document_path} to Tika server") + tika_server = settings.PAPERLESS_TIKA_ENDPOINT + + try: + parsed = parser.from_file(document_path, tika_server) + except Exception as err: + raise ParseError( + f"Could not parse {document_path} with tika server at " + f"{tika_server}: {err}", + ) + + def clean_html(text: str): + if isinstance(text, list): + text = ", ".join([str(e) for e in text]) + if type(text) != str: + text = str(text) + text = text.replace("&", "&") + text = text.replace("<", "<") + text = text.replace(">", ">") + text = text.replace(" ", " ") + text = text.replace("'", "'") + text = text.replace('"', """) + return text - def convert_md_to_pdf(self, md_path): pdf_path = os.path.join(self.tempdir, "convert.pdf") gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT - url = gotenberg_server + "/forms/chromium/convert/markdown" + url = gotenberg_server + "/forms/chromium/convert/html" + + self.log("info", f"Converting {document_path} to PDF as {pdf_path}") + + subject = parsed["metadata"].pop("dc:subject", "") + content = parsed.pop("content", "").strip() + + if content.startswith(subject): + content = content[len(subject) :].strip() - self.log("info", f"Converting {md_path} to PDF as {pdf_path}") html = StringIO( - """ - - - - - My PDF - - - {{ toHTML "convert.md" }} - - - """, - ) - md = StringIO( - """ -# Subject - -blub \nblah -blib - """, + f""" + + + + + My PDF + + +

{clean_html(subject)}

+

From: {clean_html(parsed['metadata'].pop('Message-From', ''))} +

To: {clean_html(parsed['metadata'].pop('Message-To', ''))} +

CC: {clean_html(parsed['metadata'].pop('Message-CC', ''))} +

Date: {clean_html(parsed['metadata'].pop('dcterms:created', ''))} +

{clean_html(content)}
+ + + """, ) files = { - "md": ( - os.path.basename(md_path), - md, - ), "html": ( "index.html", html, @@ -229,19 +281,3 @@ blib file.close() return pdf_path - - def convert_to_md(self, document_path, file_name): - md_path = os.path.join(self.tempdir, "convert.md") - - self.log("info", f"Converting {document_path} to markdown as {md_path}") - - with open(md_path, "w") as file: - md = [ - "# Subject", - "\n\n", - "blah", - ] - file.writelines(md) - file.close() - - return md_path