mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Consolidate get_date onto the DocumentParser parent class
This commit is contained in:
		| @@ -1,9 +1,12 @@ | ||||
| import logging | ||||
| import os | ||||
| import re | ||||
| import shutil | ||||
| import tempfile | ||||
| import re | ||||
|  | ||||
| import dateparser | ||||
| from django.conf import settings | ||||
| from django.utils import timezone | ||||
|  | ||||
| # This regular expression will try to find dates in the document at | ||||
| # hand and will match the following formats: | ||||
| @@ -32,6 +35,7 @@ class DocumentParser: | ||||
|     """ | ||||
|  | ||||
|     SCRATCH = settings.SCRATCH_DIR | ||||
|     DATE_ORDER = settings.DATE_ORDER | ||||
|  | ||||
|     def __init__(self, path): | ||||
|         self.document_path = path | ||||
| @@ -55,7 +59,52 @@ class DocumentParser: | ||||
|         """ | ||||
|         Returns the date of the document. | ||||
|         """ | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|         date = None | ||||
|         date_string = None | ||||
|  | ||||
|         try: | ||||
|             text = self.get_text() | ||||
|         except ParseError: | ||||
|             return None | ||||
|  | ||||
|         next_year = timezone.now().year + 5  # Arbitrary 5 year future limit | ||||
|  | ||||
|         # Iterate through all regex matches and try to parse the date | ||||
|         for m in re.finditer(DATE_REGEX, text): | ||||
|  | ||||
|             date_string = m.group(0) | ||||
|  | ||||
|             try: | ||||
|                 date = dateparser.parse( | ||||
|                     date_string, | ||||
|                     settings={ | ||||
|                         "DATE_ORDER": self.DATE_ORDER, | ||||
|                         "PREFER_DAY_OF_MONTH": "first", | ||||
|                         "RETURN_AS_TIMEZONE_AWARE": True | ||||
|                     } | ||||
|                 ) | ||||
|             except TypeError: | ||||
|                 # Skip all matches that do not parse to a proper date | ||||
|                 continue | ||||
|  | ||||
|             if date is not None and next_year > date.year > 1900: | ||||
|                 break | ||||
|             else: | ||||
|                 date = None | ||||
|  | ||||
|         if date is not None: | ||||
|             self.log( | ||||
|                 "info", | ||||
|                 "Detected document date {} based on string {}".format( | ||||
|                     date.isoformat(), | ||||
|                     date_string | ||||
|                 ) | ||||
|             ) | ||||
|         else: | ||||
|             self.log("info", "Unable to detect date for document") | ||||
|  | ||||
|         return date | ||||
|  | ||||
|     def log(self, level, message): | ||||
|         getattr(self.logger, level)(message, extra={ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn