mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	first stab at text consumer
This commit is contained in:
		| @@ -67,6 +67,7 @@ INSTALLED_APPS = [ | ||||
|     "documents.apps.DocumentsConfig", | ||||
|     "reminders.apps.RemindersConfig", | ||||
|     "paperless_tesseract.apps.PaperlessTesseractConfig", | ||||
|     "paperless_text.apps.PaperlessTextConfig", | ||||
|  | ||||
|     "flat_responsive",  # TODO: Remove as of Django 2.x | ||||
|     "django.contrib.admin", | ||||
|   | ||||
							
								
								
									
										0
									
								
								src/paperless_text/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless_text/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										16
									
								
								src/paperless_text/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								src/paperless_text/apps.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | ||||
| from django.apps import AppConfig | ||||
|  | ||||
|  | ||||
class PaperlessTextConfig(AppConfig):
    """
    Django application configuration for the ``paperless_text`` app.
    """

    name = "paperless_text"

    def ready(self):
        """
        Connect this app's consumer declaration to the documents signal and
        then run the base class ``ready`` hook.
        """
        # Both imports are deliberately local to this method, exactly as in
        # the original layout (presumably so they only run once Django has
        # finished loading the app registry -- confirm before moving them).
        from documents.signals import (
            document_consumer_declaration as declaration_signal,
        )
        from .signals import ConsumerDeclaration

        declaration_signal.connect(ConsumerDeclaration.handle)

        super().ready()
							
								
								
									
										113
									
								
								src/paperless_text/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										113
									
								
								src/paperless_text/parsers.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,113 @@ | ||||
| import os | ||||
| import re | ||||
| import subprocess | ||||
|  | ||||
| import dateparser | ||||
| from django.conf import settings | ||||
|  | ||||
| from documents.parsers import DocumentParser, ParseError | ||||
|  | ||||
|  | ||||
class TextDocumentParser(DocumentParser):
    """
    This parser directly parses a text document (.txt or .md)

    The content of the file is used verbatim as the document text.
    """

    CONVERT = settings.CONVERT_BINARY
    # NOTE(review): THREADS, UNPAPER, DEFAULT_OCR_LANGUAGE and OCR_ALWAYS are
    # not read by any method of this class. They are kept so that existing
    # attribute access on the class keeps working.
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DATE_ORDER = settings.DATE_ORDER
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS

    # Limits on how much of the document is handed to the convert binary
    # for the thumbnail. The text travels as a single command line argument,
    # so it must stay reasonably small.
    THUMBNAIL_MAX_LINES = 50
    THUMBNAIL_MAX_CHARS = 5000

    # This regular expression will try to find dates in the document at
    # hand and will match the following formats:
    # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
    # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
    # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
    # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
    # - MONTH ZZZZ, with ZZZZ being 4 digits
    # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
    # Compiled once at class creation instead of on every get_date() call.
    DATE_PATTERN = re.compile(
        r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
        r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
        r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
        r'\b([^\W\d_]{3,9} [0-9]{4})\b')

    def __init__(self, path):
        super().__init__(path)
        # Cache for the file content, filled on the first get_text() call.
        self._text = None

    def get_thumbnail(self):
        """
        The thumbnail of a txt is just a 500px wide image of the text
        rendered onto a letter-sized page.

        Returns the path of the generated PNG inside ``self.tempdir``.
        Any error raised by ``run_convert`` is propagated to the caller.
        """

        out_path = os.path.join(self.tempdir, "convert-txt.png")

        # The command is executed from an argument list without a shell, so
        # a "$(cat <file>)" substitution would never be expanded. Read the
        # text here and hand it to the draw primitive directly.
        lines = self.get_text().splitlines()[:self.THUMBNAIL_MAX_LINES]
        excerpt = "\n".join(lines)[:self.THUMBNAIL_MAX_CHARS]
        # A NUL byte cannot be part of a process argument.
        excerpt = excerpt.replace("\x00", "")

        args = [self.CONVERT, "-size", "500x647", "xc:white"]

        if excerpt.strip():
            # The string is wrapped in single quotes inside the draw
            # primitive, so backslashes and single quotes are escaped.
            escaped = excerpt.replace("\\", "\\\\").replace("'", "\\'")
            args += [
                "-pointsize", "12",
                "-fill", "black",
                "-draw", "text 0,12 '{}'".format(escaped),
            ]
        # An empty document simply yields the blank white page.

        args.append(out_path)
        run_convert(*args)

        return out_path

    def get_text(self):
        """
        Return the content of the document file as a string.

        The file is read once; later calls return the cached value.
        """

        if self._text is not None:
            return self._text

        with open(self.document_path, 'r') as f:
            self._text = f.read()

        return self._text

    def get_date(self):
        """
        Return the first date found in the document text, or None.

        Every match of ``DATE_PATTERN`` is handed to ``dateparser`` in
        document order; the first one that parses wins. The outcome is
        logged at "info" level either way.
        """
        date = None
        datestring = None

        try:
            text = self.get_text()
        except ParseError:
            return None

        # Iterate through all regex matches and try to parse the date
        for m in self.DATE_PATTERN.finditer(text):
            datestring = m.group(0)

            try:
                date = dateparser.parse(
                           datestring,
                           settings={'DATE_ORDER': self.DATE_ORDER,
                                     'PREFER_DAY_OF_MONTH': 'first',
                                     'RETURN_AS_TIMEZONE_AWARE': True})
            except TypeError:
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None:
                break

        if date is not None:
            self.log("info", "Detected document date " + date.isoformat() +
                             " based on string " + datestring)
        else:
            self.log("info", "Unable to detect date for document")

        return date
|  | ||||
|  | ||||
def run_convert(*args):
    """
    Run the given command line and wait for it to finish.

    The child process gets a copy of the current environment, with
    MAGICK_MEMORY_LIMIT and MAGICK_TMPDIR set from the CONVERT_MEMORY_LIMIT
    and CONVERT_TMPDIR settings when those are set.

    Raises ParseError if the process exits with a non-zero status.
    """
    child_env = os.environ.copy()

    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )
    for variable, value in overrides:
        # Unset or empty settings leave the inherited environment untouched.
        if value:
            child_env[variable] = value

    exit_status = subprocess.Popen(args, env=child_env).wait()
    if exit_status != 0:
        raise ParseError("Convert failed at {}".format(args))
							
								
								
									
										23
									
								
								src/paperless_text/signals.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								src/paperless_text/signals.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | ||||
| import re | ||||
|  | ||||
| from .parsers import TextDocumentParser | ||||
|  | ||||
|  | ||||
class ConsumerDeclaration:
    """
    Tells the document consumer which files the text parser can handle.
    """

    # Raw string: in a normal string literal "\." is an invalid escape
    # sequence, which current Python versions warn about.
    MATCHING_FILES = re.compile(r"^.*\.(txt|md)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        """
        Signal receiver: return the callable that decides whether a given
        file is handled by this app.
        """
        return cls.test

    @classmethod
    def test(cls, doc):
        """
        Return the parser declaration for ``doc`` or None.

        ``doc`` is the file name or path as a string. It is lower-cased
        before matching, so the extension check is case-insensitive.
        """

        if cls.MATCHING_FILES.match(doc.lower()):
            return {
                "parser": TextDocumentParser,
                "weight": 10
            }

        return None
		Reference in New Issue
	
	Block a user
	 Joshua Taillon
					Joshua Taillon