mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	work in progress Mail parsing
This commit is contained in:
		
							
								
								
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								Pipfile
									
									
									
									
									
								
							| @@ -53,6 +53,7 @@ concurrent-log-handler = "*" | ||||
| zipp = {version = "*", markers = "python_version < '3.9'"} | ||||
| pyzbar = "*" | ||||
| pdf2image = "*" | ||||
| click = "==8.0.4" | ||||
|  | ||||
| [dev-packages] | ||||
| coveralls = "*" | ||||
|   | ||||
| @@ -199,7 +199,7 @@ class MailAccountHandler(LoggingMixin): | ||||
|  | ||||
|         return total_processed_files | ||||
|  | ||||
|     def handle_mail_rule(self, M, rule): | ||||
|     def handle_mail_rule(self, M, rule: MailRule): | ||||
|  | ||||
|         self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}") | ||||
|  | ||||
|   | ||||
| @@ -1,6 +1,7 @@ | ||||
| from django.apps import AppConfig | ||||
| from django.conf import settings | ||||
| from paperless_tika.signals import tika_consumer_declaration | ||||
| from paperless_tika.signals import tika_consumer_declaration_eml | ||||
|  | ||||
|  | ||||
| class PaperlessTikaConfig(AppConfig): | ||||
| @@ -11,4 +12,5 @@ class PaperlessTikaConfig(AppConfig): | ||||
|  | ||||
|         if settings.PAPERLESS_TIKA_ENABLED: | ||||
|             document_consumer_declaration.connect(tika_consumer_declaration) | ||||
|             document_consumer_declaration.connect(tika_consumer_declaration_eml) | ||||
|         AppConfig.ready(self) | ||||
|   | ||||
| @@ -1,4 +1,6 @@ | ||||
| import os | ||||
| import re | ||||
| from io import StringIO | ||||
|  | ||||
| import dateutil.parser | ||||
| import requests | ||||
| @@ -6,6 +8,9 @@ from django.conf import settings | ||||
| from documents.parsers import DocumentParser | ||||
| from documents.parsers import make_thumbnail_from_pdf | ||||
| from documents.parsers import ParseError | ||||
| from PIL import Image | ||||
| from PIL import ImageDraw | ||||
| from PIL import ImageFont | ||||
| from tika import parser | ||||
|  | ||||
|  | ||||
| @@ -97,3 +102,146 @@ class TikaDocumentParser(DocumentParser): | ||||
|             file.close() | ||||
|  | ||||
|         return pdf_path | ||||
|  | ||||
|  | ||||
| class TikaDocumentParserEml(DocumentParser): | ||||
|     """ | ||||
|     This parser sends documents to a local tika server | ||||
|     """ | ||||
|  | ||||
|     logging_name = "paperless.parsing.tikaeml" | ||||
|  | ||||
|     def get_thumbnail(self, document_path, mime_type, file_name=None): | ||||
|  | ||||
|         img = Image.new("RGB", (500, 700), color="white") | ||||
|         draw = ImageDraw.Draw(img) | ||||
|         font = ImageFont.truetype( | ||||
|             font=settings.THUMBNAIL_FONT_NAME, | ||||
|             size=20, | ||||
|             layout_engine=ImageFont.LAYOUT_BASIC, | ||||
|         ) | ||||
|         draw.text((5, 5), self.text, font=font, fill="black") | ||||
|  | ||||
|         out_path = os.path.join(self.tempdir, "thumb.png") | ||||
|         img.save(out_path) | ||||
|  | ||||
|         return out_path | ||||
|  | ||||
|     def extract_metadata(self, document_path, mime_type): | ||||
|         tika_server = settings.PAPERLESS_TIKA_ENDPOINT | ||||
|         try: | ||||
|             parsed = parser.from_file(document_path, tika_server) | ||||
|         except Exception as e: | ||||
|             self.log( | ||||
|                 "warning", | ||||
|                 f"Error while fetching document metadata for " f"{document_path}: {e}", | ||||
|             ) | ||||
|             return [] | ||||
|  | ||||
|         return [ | ||||
|             { | ||||
|                 "namespace": "", | ||||
|                 "prefix": "", | ||||
|                 "key": key, | ||||
|                 "value": parsed["metadata"][key], | ||||
|             } | ||||
|             for key in parsed["metadata"] | ||||
|         ] | ||||
|  | ||||
|     def parse(self, document_path, mime_type, file_name=None): | ||||
|         self.log("info", f"Sending {document_path} to Tika server") | ||||
|         tika_server = settings.PAPERLESS_TIKA_ENDPOINT | ||||
|  | ||||
|         try: | ||||
|             parsed = parser.from_file(document_path, tika_server) | ||||
|         except Exception as err: | ||||
|             raise ParseError( | ||||
|                 f"Could not parse {document_path} with tika server at " | ||||
|                 f"{tika_server}: {err}", | ||||
|             ) | ||||
|  | ||||
|         text = re.sub(" +", " ", str(parsed)) | ||||
|         text = re.sub("\n+", "\n", text) | ||||
|         self.text = text | ||||
|  | ||||
|         print(text) | ||||
|  | ||||
|         try: | ||||
|             self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"]) | ||||
|         except Exception as e: | ||||
|             self.log( | ||||
|                 "warning", | ||||
|                 f"Unable to extract date for document " f"{document_path}: {e}", | ||||
|             ) | ||||
|  | ||||
|         md_path = self.convert_to_md(document_path, file_name) | ||||
|         self.archive_path = self.convert_md_to_pdf(md_path) | ||||
|  | ||||
|     def convert_md_to_pdf(self, md_path): | ||||
|         pdf_path = os.path.join(self.tempdir, "convert.pdf") | ||||
|         gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT | ||||
|         url = gotenberg_server + "/forms/chromium/convert/markdown" | ||||
|  | ||||
|         self.log("info", f"Converting {md_path} to PDF as {pdf_path}") | ||||
|         html = StringIO( | ||||
|             """ | ||||
| <!doctype html> | ||||
| <html lang="en"> | ||||
|   <head> | ||||
|     <meta charset="utf-8"> | ||||
|     <title>My PDF</title> | ||||
|   </head> | ||||
|   <body> | ||||
|     {{ toHTML "convert.md" }} | ||||
|   </body> | ||||
| </html> | ||||
|         """, | ||||
|         ) | ||||
|         md = StringIO( | ||||
|             """ | ||||
| # Subject | ||||
|  | ||||
| blub  \nblah | ||||
| blib | ||||
|         """, | ||||
|         ) | ||||
|  | ||||
|         files = { | ||||
|             "md": ( | ||||
|                 os.path.basename(md_path), | ||||
|                 md, | ||||
|             ), | ||||
|             "html": ( | ||||
|                 "index.html", | ||||
|                 html, | ||||
|             ), | ||||
|         } | ||||
|         headers = {} | ||||
|  | ||||
|         try: | ||||
|             response = requests.post(url, files=files, headers=headers) | ||||
|             response.raise_for_status()  # ensure we notice bad responses | ||||
|         except Exception as err: | ||||
|             raise ParseError(f"Error while converting document to PDF: {err}") | ||||
|  | ||||
|         with open(pdf_path, "wb") as file: | ||||
|             file.write(response.content) | ||||
|             file.close() | ||||
|  | ||||
|         return pdf_path | ||||
|  | ||||
|     def convert_to_md(self, document_path, file_name): | ||||
|         md_path = os.path.join(self.tempdir, "convert.md") | ||||
|  | ||||
|         self.log("info", f"Converting {document_path} to markdown as {md_path}") | ||||
|  | ||||
|         with open(md_path, "w") as file: | ||||
|             md = [ | ||||
|                 "# Subject", | ||||
|                 "\n\n", | ||||
|                 "blah", | ||||
|             ] | ||||
|             file.writelines(md) | ||||
|             file.close() | ||||
|  | ||||
|         return md_path | ||||
|   | ||||
| @@ -22,3 +22,19 @@ def tika_consumer_declaration(sender, **kwargs): | ||||
|             "text/rtf": ".rtf", | ||||
|         }, | ||||
|     } | ||||
|  | ||||
|  | ||||
| def get_parser_eml(*args, **kwargs): | ||||
|     from .parsers import TikaDocumentParserEml | ||||
|  | ||||
|     return TikaDocumentParserEml(*args, **kwargs) | ||||
|  | ||||
|  | ||||
| def tika_consumer_declaration_eml(sender, **kwargs): | ||||
|     return { | ||||
|         "parser": get_parser_eml, | ||||
|         "weight": 10, | ||||
|         "mime_types": { | ||||
|             "message/rfc822": ".eml", | ||||
|         }, | ||||
|     } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 phail
					phail