mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	work in progress Mail parsing
This commit is contained in:
		
							
								
								
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							| @@ -53,6 +53,7 @@ concurrent-log-handler = "*" | |||||||
| zipp = {version = "*", markers = "python_version < '3.9'"} | zipp = {version = "*", markers = "python_version < '3.9'"} | ||||||
| pyzbar = "*" | pyzbar = "*" | ||||||
| pdf2image = "*" | pdf2image = "*" | ||||||
|  | click = "==8.0.4" | ||||||
|  |  | ||||||
| [dev-packages] | [dev-packages] | ||||||
| coveralls = "*" | coveralls = "*" | ||||||
|   | |||||||
| @@ -199,7 +199,7 @@ class MailAccountHandler(LoggingMixin): | |||||||
|  |  | ||||||
|         return total_processed_files |         return total_processed_files | ||||||
|  |  | ||||||
|     def handle_mail_rule(self, M, rule): |     def handle_mail_rule(self, M, rule: MailRule): | ||||||
|  |  | ||||||
|         self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}") |         self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}") | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,6 +1,7 @@ | |||||||
| from django.apps import AppConfig | from django.apps import AppConfig | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from paperless_tika.signals import tika_consumer_declaration | from paperless_tika.signals import tika_consumer_declaration | ||||||
|  | from paperless_tika.signals import tika_consumer_declaration_eml | ||||||
|  |  | ||||||
|  |  | ||||||
| class PaperlessTikaConfig(AppConfig): | class PaperlessTikaConfig(AppConfig): | ||||||
| @@ -11,4 +12,5 @@ class PaperlessTikaConfig(AppConfig): | |||||||
|  |  | ||||||
|         if settings.PAPERLESS_TIKA_ENABLED: |         if settings.PAPERLESS_TIKA_ENABLED: | ||||||
|             document_consumer_declaration.connect(tika_consumer_declaration) |             document_consumer_declaration.connect(tika_consumer_declaration) | ||||||
|  |             document_consumer_declaration.connect(tika_consumer_declaration_eml) | ||||||
|         AppConfig.ready(self) |         AppConfig.ready(self) | ||||||
|   | |||||||
| @@ -1,4 +1,6 @@ | |||||||
| import os | import os | ||||||
|  | import re | ||||||
|  | from io import StringIO | ||||||
|  |  | ||||||
| import dateutil.parser | import dateutil.parser | ||||||
| import requests | import requests | ||||||
| @@ -6,6 +8,9 @@ from django.conf import settings | |||||||
| from documents.parsers import DocumentParser | from documents.parsers import DocumentParser | ||||||
| from documents.parsers import make_thumbnail_from_pdf | from documents.parsers import make_thumbnail_from_pdf | ||||||
| from documents.parsers import ParseError | from documents.parsers import ParseError | ||||||
|  | from PIL import Image | ||||||
|  | from PIL import ImageDraw | ||||||
|  | from PIL import ImageFont | ||||||
| from tika import parser | from tika import parser | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -97,3 +102,146 @@ class TikaDocumentParser(DocumentParser): | |||||||
|             file.close() |             file.close() | ||||||
|  |  | ||||||
|         return pdf_path |         return pdf_path | ||||||
|  |  | ||||||
|  |  | ||||||
class TikaDocumentParserEml(DocumentParser):
    """
    Parser for e-mail (``.eml``) documents backed by a Tika server.

    The document is sent to the Tika endpoint configured in
    ``settings.PAPERLESS_TIKA_ENDPOINT`` for text and metadata extraction.
    A markdown rendition is then converted into the archive PDF by the
    Gotenberg server at ``settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT``.
    """

    logging_name = "paperless.parsing.tikaeml"

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Draw ``self.text`` onto a white 500x700 image and save it as a PNG.

        Returns the path of ``thumb.png`` inside ``self.tempdir``. The
        ``document_path``, ``mime_type`` and ``file_name`` arguments are not
        used; the thumbnail is built purely from the text stored on the
        instance.
        """
        img = Image.new("RGB", (500, 700), color="white")
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(
            font=settings.THUMBNAIL_FONT_NAME,
            size=20,
            layout_engine=ImageFont.LAYOUT_BASIC,
        )
        draw.text((5, 5), self.text, font=font, fill="black")

        out_path = os.path.join(self.tempdir, "thumb.png")
        img.save(out_path)

        return out_path

    def extract_metadata(self, document_path, mime_type):
        """
        Ask the Tika server for the document's metadata.

        Returns a list of dicts with the keys ``namespace``, ``prefix``,
        ``key`` and ``value`` (namespace and prefix are always empty), one
        per metadata entry reported by Tika. Any failure while contacting
        Tika is logged as a warning and an empty list is returned instead.
        """
        tika_server = settings.PAPERLESS_TIKA_ENDPOINT
        try:
            parsed = parser.from_file(document_path, tika_server)
        except Exception as e:
            # Metadata is best-effort: log and carry on without it.
            self.log(
                "warning",
                f"Error while fetching document metadata for {document_path}: {e}",
            )
            return []

        return [
            {
                "namespace": "",
                "prefix": "",
                "key": key,
                "value": value,
            }
            for key, value in parsed["metadata"].items()
        ]

    def parse(self, document_path, mime_type, file_name=None):
        """
        Extract text and date via Tika and build the archive PDF.

        Sets ``self.text``, ``self.date`` (only when a creation date can be
        parsed) and ``self.archive_path``.

        Raises:
            ParseError: if the Tika request fails, or if the PDF conversion
                performed by ``convert_md_to_pdf`` fails.
        """
        self.log("info", f"Sending {document_path} to Tika server")
        tika_server = settings.PAPERLESS_TIKA_ENDPOINT

        try:
            parsed = parser.from_file(document_path, tika_server)
        except Exception as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
                f"{tika_server}: {err}",
            ) from err

        # NOTE(review): this stringifies the whole Tika response, metadata
        # included. If ``parsed`` is a plain dict, its string form shows
        # newlines inside values as escape sequences, so the "\n+" collapse
        # below would have little effect. Presumably only the body text is
        # wanted here -- confirm against the Tika response layout.
        text = re.sub(" +", " ", str(parsed))
        text = re.sub("\n+", "\n", text)
        self.text = text

        try:
            self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"])
        except Exception as e:
            # A missing or malformed date must not abort the whole parse.
            self.log(
                "warning",
                f"Unable to extract date for document {document_path}: {e}",
            )

        md_path = self.convert_to_md(document_path, file_name)
        self.archive_path = self.convert_md_to_pdf(md_path)

    def convert_md_to_pdf(self, md_path):
        """
        Convert the markdown file at ``md_path`` to PDF through Gotenberg.

        The markdown file is uploaded together with an ``index.html``
        wrapper that pulls the markdown in via Gotenberg's ``toHTML``
        template function. The response body is written to ``convert.pdf``
        inside ``self.tempdir`` and that path is returned.

        Raises:
            ParseError: if the markdown file cannot be read, the request
                fails, or Gotenberg answers with an error status.
        """
        pdf_path = os.path.join(self.tempdir, "convert.pdf")
        gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
        url = gotenberg_server + "/forms/chromium/convert/markdown"
        md_name = os.path.basename(md_path)

        self.log("info", f"Converting {md_path} to PDF as {pdf_path}")

        # The template must reference the markdown file by the exact name it
        # is uploaded under. The doubled braces emit literal "{{ ... }}".
        html = StringIO(
            "<!doctype html>\n"
            '<html lang="en">\n'
            "  <head>\n"
            '    <meta charset="utf-8">\n'
            "    <title>My PDF</title>\n"
            "  </head>\n"
            "  <body>\n"
            f'    {{{{ toHTML "{md_name}" }}}}\n'
            "  </body>\n"
            "</html>\n",
        )

        try:
            # Upload the real markdown file rather than placeholder content.
            with open(md_path, "rb") as md_file:
                files = {
                    "md": (md_name, md_file),
                    "html": ("index.html", html),
                }
                response = requests.post(url, files=files)
            response.raise_for_status()  # ensure we notice bad responses
        except Exception as err:
            raise ParseError(
                f"Error while converting document to PDF: {err}",
            ) from err

        with open(pdf_path, "wb") as file:
            file.write(response.content)

        return pdf_path

    def convert_to_md(self, document_path, file_name):
        """
        Write the markdown rendition of the document into ``self.tempdir``.

        Returns the path of the generated ``convert.md``. The content is
        currently a fixed placeholder: ``document_path`` is only used for
        the log message and ``file_name`` is not used at all.
        """
        md_path = os.path.join(self.tempdir, "convert.md")

        self.log("info", f"Converting {document_path} to markdown as {md_path}")

        md = [
            "# Subject",
            "\n\n",
            "blah",
        ]
        # Explicit encoding so the output does not depend on the locale.
        with open(md_path, "w", encoding="utf-8") as file:
            file.writelines(md)

        return md_path
|   | |||||||
| @@ -22,3 +22,19 @@ def tika_consumer_declaration(sender, **kwargs): | |||||||
|             "text/rtf": ".rtf", |             "text/rtf": ".rtf", | ||||||
|         }, |         }, | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
def get_parser_eml(*args, **kwargs):
    """
    Create a ``TikaDocumentParserEml``, forwarding all arguments to it.

    The parser class is imported inside the function, so the parsers module
    is only loaded when a parser is actually requested.
    """
    from .parsers import TikaDocumentParserEml as eml_parser_class

    eml_parser = eml_parser_class(*args, **kwargs)
    return eml_parser
|  |  | ||||||
|  |  | ||||||
def tika_consumer_declaration_eml(sender, **kwargs):
    """
    Describe the e-mail parser to the document consumer.

    Returns a dict holding the parser factory, its weight (10) and the
    mapping from supported MIME types to their default file extension.
    ``sender`` and any extra keyword arguments are ignored.
    """
    supported_mime_types = {"message/rfc822": ".eml"}
    declaration = dict(
        parser=get_parser_eml,
        weight=10,
        mime_types=supported_mime_types,
    )
    return declaration
|   | |||||||
		Reference in New Issue
	
	Block a user
	 phail
					phail