mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Add support for a heuristic that extracts the document date from its text
This commit is contained in:
		| @@ -13,6 +13,7 @@ python-dateutil>=2.6.0 | ||||
| python-dotenv>=0.6.2 | ||||
| python-gnupg>=0.3.9 | ||||
| pytz>=2016.10 | ||||
| dateparser>=0.6.0 | ||||
| gunicorn==19.7.1 | ||||
|  | ||||
| # For the tests | ||||
|   | ||||
| @@ -118,12 +118,14 @@ class Consumer(object): | ||||
|  | ||||
|             parsed_document = parser_class(doc) | ||||
|             thumbnail = parsed_document.get_thumbnail() | ||||
|             date = parsed_document.get_date() | ||||
|  | ||||
|             try: | ||||
|                 document = self._store( | ||||
|                     parsed_document.get_text(), | ||||
|                     doc, | ||||
|                     thumbnail | ||||
|                     thumbnail, | ||||
|                     date | ||||
|                 ) | ||||
|             except ParseError as e: | ||||
|  | ||||
| @@ -174,7 +176,7 @@ class Consumer(object): | ||||
|         return sorted( | ||||
|             options, key=lambda _: _["weight"], reverse=True)[0]["parser"] | ||||
|  | ||||
|     def _store(self, text, doc, thumbnail): | ||||
|     def _store(self, text, doc, thumbnail, date): | ||||
|  | ||||
|         file_info = FileInfo.from_path(doc) | ||||
|  | ||||
| @@ -182,7 +184,7 @@ class Consumer(object): | ||||
|  | ||||
|         self.log("debug", "Saving record to database") | ||||
|  | ||||
|         created = file_info.created or timezone.make_aware( | ||||
|         created = file_info.created or date or timezone.make_aware( | ||||
|                     datetime.datetime.fromtimestamp(stats.st_mtime)) | ||||
|  | ||||
|         with open(doc, "rb") as f: | ||||
|   | ||||
| @@ -35,6 +35,12 @@ class DocumentParser(object): | ||||
|         """ | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def get_date(self): | ||||
|         """ | ||||
|         Returns the date of the document. | ||||
|         """ | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def log(self, level, message): | ||||
|         getattr(self.logger, level)(message, extra={ | ||||
|             "group": self.logging_group | ||||
|   | ||||
| @@ -258,3 +258,6 @@ PAPERLESS_LIST_PER_PAGE = int(os.getenv("PAPERLESS_LIST_PER_PAGE", 100)) | ||||
|  | ||||
| FY_START = os.getenv("PAPERLESS_FINANCIAL_YEAR_START") | ||||
| FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END") | ||||
|  | ||||
| # Specify the default date order (for autodetected dates) | ||||
| DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY") | ||||
|   | ||||
| @@ -3,6 +3,7 @@ import os | ||||
| import re | ||||
| import subprocess | ||||
| from multiprocessing.pool import Pool | ||||
| import dateparser | ||||
|  | ||||
| import langdetect | ||||
| import pyocr | ||||
| @@ -30,6 +31,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|     DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300 | ||||
|     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None | ||||
|     UNPAPER = settings.UNPAPER_BINARY | ||||
|     DATE_ORDER = settings.DATE_ORDER | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
| @@ -175,6 +177,29 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) | ||||
|         return text | ||||
|  | ||||
|     def get_date(self): | ||||
|         text = self.get_text() | ||||
|  | ||||
|         # This regular expression tries to find the first date in the | ||||
|         # document text. It matches the following formats, where XX and YY | ||||
|         # are 1 or 2 digits and MONTH is 3 to 9 non-space characters: | ||||
|         # - XX.YY.ZZZZ with ZZZZ being 2 or 4 digits | ||||
|         # - XX/YY/ZZZZ with ZZZZ being 2 or 4 digits | ||||
|         # - XX-YY-ZZZZ with ZZZZ being 2 or 4 digits | ||||
|         # - XX. MONTH ZZZZ with ZZZZ being 2 or 4 digits | ||||
|         # - MONTH ZZZZ with ZZZZ being 4 digits | ||||
|         m = re.search( | ||||
|             r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + | ||||
|             r'\b([0-9]{1,2}\. [^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + | ||||
|             r'\b([^ ]{3,9} [0-9]{4})\b', text) | ||||
|  | ||||
|         if m is None: | ||||
|             return None | ||||
|  | ||||
|         return dateparser.parse(m.group(0), | ||||
|                                 settings={'DATE_ORDER': self.DATE_ORDER, | ||||
|                                           'PREFER_DAY_OF_MONTH': 'first', | ||||
|                                           'RETURN_AS_TIMEZONE_AWARE': True}) | ||||
|  | ||||
|  | ||||
| def run_convert(*args): | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Wolf-Bastian Pöttner
					Wolf-Bastian Pöttner