mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Add support for a heuristic that extracts the document date from its text
This commit is contained in:
		| @@ -3,6 +3,7 @@ import os | ||||
| import re | ||||
| import subprocess | ||||
| from multiprocessing.pool import Pool | ||||
| import dateparser | ||||
|  | ||||
| import langdetect | ||||
| import pyocr | ||||
| @@ -30,6 +31,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|     DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 | ||||
|     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None | ||||
|     UNPAPER = settings.UNPAPER_BINARY | ||||
|     DATE_ORDER = settings.DATE_ORDER | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
| @@ -175,6 +177,29 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) | ||||
|         return text | ||||
|  | ||||
|     def get_date(self): | ||||
|         text = self.get_text() | ||||
|  | ||||
|         # This regular expression will try to find dates in the document at | ||||
|         # hand and will match the following formats: | ||||
|         # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits | ||||
|         # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits | ||||
|         # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits | ||||
|         # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits | ||||
|         # - MONTH ZZZZ | ||||
|         m = re.search( | ||||
|             r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + | ||||
|             r'\b([0-9]{1,2}\. [^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + | ||||
|             r'\b([^ ]{3,9} [0-9]{4})\b', text) | ||||
|  | ||||
|         if m is None: | ||||
|             return None | ||||
|  | ||||
|         return dateparser.parse(m.group(0), | ||||
|                                 settings={'DATE_ORDER': self.DATE_ORDER, | ||||
|                                           'PREFER_DAY_OF_MONTH': 'first', | ||||
|                                           'RETURN_AS_TIMEZONE_AWARE': True}) | ||||
|  | ||||
|  | ||||
| def run_convert(*args): | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Wolf-Bastian Pöttner
					Wolf-Bastian Pöttner