Work in progress: mail parsing

This commit is contained in:
phail 2022-04-19 00:39:00 +02:00
parent cca576f518
commit 027897ff03
5 changed files with 168 additions and 1 deletions

View File

@ -53,6 +53,7 @@ concurrent-log-handler = "*"
zipp = {version = "*", markers = "python_version < '3.9'"}
pyzbar = "*"
pdf2image = "*"
click = "==8.0.4"
[dev-packages]
coveralls = "*"

View File

@ -199,7 +199,7 @@ class MailAccountHandler(LoggingMixin):
return total_processed_files
def handle_mail_rule(self, M, rule):
def handle_mail_rule(self, M, rule: MailRule):
self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}")

View File

@ -1,6 +1,7 @@
from django.apps import AppConfig
from django.conf import settings
from paperless_tika.signals import tika_consumer_declaration
from paperless_tika.signals import tika_consumer_declaration_eml
class PaperlessTikaConfig(AppConfig):
@ -11,4 +12,5 @@ class PaperlessTikaConfig(AppConfig):
if settings.PAPERLESS_TIKA_ENABLED:
document_consumer_declaration.connect(tika_consumer_declaration)
document_consumer_declaration.connect(tika_consumer_declaration_eml)
AppConfig.ready(self)

View File

@ -1,4 +1,6 @@
import os
import re
from io import StringIO
import dateutil.parser
import requests
@ -6,6 +8,9 @@ from django.conf import settings
from documents.parsers import DocumentParser
from documents.parsers import make_thumbnail_from_pdf
from documents.parsers import ParseError
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from tika import parser
@ -97,3 +102,146 @@ class TikaDocumentParser(DocumentParser):
file.close()
return pdf_path
class TikaDocumentParserEml(DocumentParser):
    """
    Parser for e-mail (``message/rfc822``) documents.

    Text and metadata are requested from the Tika server configured in
    ``settings.PAPERLESS_TIKA_ENDPOINT``; the archive PDF is rendered from a
    markdown file by the Gotenberg server configured in
    ``settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT``.
    """

    logging_name = "paperless.parsing.tikaeml"

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Draw ``self.text`` onto a white 500x700 image and return its path.

        The image is saved as ``thumb.png`` inside ``self.tempdir``.
        """
        # NOTE(review): reads self.text, which this class only assigns in
        # parse() -- presumably parse() always runs first; confirm with caller.
        img = Image.new("RGB", (500, 700), color="white")
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(
            font=settings.THUMBNAIL_FONT_NAME,
            size=20,
            layout_engine=ImageFont.LAYOUT_BASIC,
        )
        draw.text((5, 5), self.text, font=font, fill="black")

        out_path = os.path.join(self.tempdir, "thumb.png")
        img.save(out_path)

        return out_path

    def extract_metadata(self, document_path, mime_type):
        """
        Return the metadata Tika reports for ``document_path``.

        Each entry is a dict with the keys ``namespace``, ``prefix``, ``key``
        and ``value``. An empty list is returned when the Tika request fails
        (a warning is logged) or when the response carries no metadata.
        """
        tika_server = settings.PAPERLESS_TIKA_ENDPOINT
        try:
            parsed = parser.from_file(document_path, tika_server)
        except Exception as e:
            self.log(
                "warning",
                f"Error while fetching document metadata for {document_path}: {e}",
            )
            return []

        # A missing or empty "metadata" entry must not raise here.
        metadata = parsed.get("metadata") or {}
        return [
            {
                "namespace": "",
                "prefix": "",
                "key": key,
                "value": value,
            }
            for key, value in metadata.items()
        ]

    def parse(self, document_path, mime_type, file_name=None):
        """
        Extract text and date from the document and build the archive PDF.

        Sets ``self.text``, ``self.archive_path`` and, when the
        ``Creation-Date`` metadata entry can be parsed, ``self.date``.

        Raises:
            ParseError: if the Tika request or the PDF conversion fails.
        """
        self.log("info", f"Sending {document_path} to Tika server")
        tika_server = settings.PAPERLESS_TIKA_ENDPOINT

        try:
            parsed = parser.from_file(document_path, tika_server)
        except Exception as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
                f"{tika_server}: {err}",
            ) from err

        # NOTE(review): the whole Tika response is stringified here, not only
        # its text content -- confirm this is intended.
        text = re.sub(" +", " ", str(parsed))
        text = re.sub("\n+", "\n", text)
        self.text = text

        try:
            self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"])
        except Exception as e:
            # A missing or malformed date is not fatal; the document is kept.
            self.log(
                "warning",
                f"Unable to extract date for document {document_path}: {e}",
            )

        md_path = self.convert_to_md(document_path, file_name)
        self.archive_path = self.convert_md_to_pdf(md_path)

    def convert_md_to_pdf(self, md_path):
        """
        Render the markdown file at ``md_path`` to PDF through Gotenberg.

        The markdown file is uploaded together with a minimal ``index.html``
        wrapper that embeds it by file name. The resulting PDF is written to
        ``convert.pdf`` inside ``self.tempdir`` and its path is returned.

        Raises:
            ParseError: if the markdown file cannot be read or the Gotenberg
                request fails.
        """
        pdf_path = os.path.join(self.tempdir, "convert.pdf")
        gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
        url = gotenberg_server + "/forms/chromium/convert/markdown"
        md_name = os.path.basename(md_path)

        self.log("info", f"Converting {md_path} to PDF as {pdf_path}")

        # The template must reference the markdown file by the exact name it
        # is uploaded under, so the name is derived from md_path.
        html = StringIO(
            "<!doctype html>\n"
            '<html lang="en">\n'
            "<head>\n"
            '<meta charset="utf-8">\n'
            "<title>My PDF</title>\n"
            "</head>\n"
            "<body>\n"
            f'{{{{ toHTML "{md_name}" }}}}\n'
            "</body>\n"
            "</html>\n",
        )

        try:
            # Upload the real markdown file instead of a placeholder string.
            with open(md_path, "rb") as md_file:
                files = {
                    "md": (md_name, md_file),
                    "html": ("index.html", html),
                }
                response = requests.post(url, files=files)
            response.raise_for_status()  # ensure we notice bad responses
        except Exception as err:
            raise ParseError(
                f"Error while converting document to PDF: {err}",
            ) from err

        with open(pdf_path, "wb") as file:
            file.write(response.content)

        return pdf_path

    def convert_to_md(self, document_path, file_name):
        """
        Write the markdown file for the document and return its path.

        The file is created as ``convert.md`` inside ``self.tempdir``.
        """
        md_path = os.path.join(self.tempdir, "convert.md")

        self.log("info", f"Converting {document_path} to markdown as {md_path}")

        # NOTE(review): placeholder content -- neither document_path nor
        # file_name is read yet.
        md = [
            "# Subject",
            "\n\n",
            "blah",
        ]
        with open(md_path, "w", encoding="utf-8") as file:
            file.writelines(md)

        return md_path

View File

@ -22,3 +22,19 @@ def tika_consumer_declaration(sender, **kwargs):
"text/rtf": ".rtf",
},
}
def get_parser_eml(*args, **kwargs):
    """
    Build a ``TikaDocumentParserEml``, forwarding every argument to it.

    The parser class is imported when the function is called, not when this
    module is imported.
    """
    from .parsers import TikaDocumentParserEml as parser_class

    eml_parser = parser_class(*args, **kwargs)
    return eml_parser
def tika_consumer_declaration_eml(sender, **kwargs):
    """
    Describe the e-mail parser for the consumer declaration signal.

    Returns a dict holding the parser factory, its weight and the mapping of
    supported MIME types to default file extensions.
    """
    supported_types = {"message/rfc822": ".eml"}
    declaration = dict(
        parser=get_parser_eml,
        weight=10,
        mime_types=supported_types,
    )
    return declaration