mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Updates the ignore date parsing to use the date order defined in the settings, instead of guessing
This commit is contained in:
		 Trenton Holmes
					Trenton Holmes
				
			
				
					committed by
					
						 Trenton Holmes
						Trenton Holmes
					
				
			
			
				
	
			
			
			 Trenton Holmes
						Trenton Holmes
					
				
			
						parent
						
							a944ef1ca6
						
					
				
				
					commit
					304d5b0d5a
				
			| @@ -247,10 +247,8 @@ def parse_date(filename, text) -> Optional[datetime.datetime]: | |||||||
|  |  | ||||||
|     # if filename date parsing is enabled, search there first: |     # if filename date parsing is enabled, search there first: | ||||||
|     if settings.FILENAME_DATE_ORDER: |     if settings.FILENAME_DATE_ORDER: | ||||||
|         logger.info("Attempting parsing from filename") |  | ||||||
|         for m in re.finditer(DATE_REGEX, filename): |         for m in re.finditer(DATE_REGEX, filename): | ||||||
|             date_string = m.group(0) |             date_string = m.group(0) | ||||||
|             logger.info(f"Found potential date: {date_string}") |  | ||||||
|  |  | ||||||
|             try: |             try: | ||||||
|                 date = __parser(date_string, settings.FILENAME_DATE_ORDER) |                 date = __parser(date_string, settings.FILENAME_DATE_ORDER) | ||||||
| @@ -260,16 +258,11 @@ def parse_date(filename, text) -> Optional[datetime.datetime]: | |||||||
|  |  | ||||||
|             date = __filter(date) |             date = __filter(date) | ||||||
|             if date is not None: |             if date is not None: | ||||||
|                 logger.info(f"Found date: {date}") |  | ||||||
|                 return date |                 return date | ||||||
|             else: |  | ||||||
|                 logger.info("Filtered date out") |  | ||||||
|  |  | ||||||
|     logger.info("Attempting parsing from content") |  | ||||||
|     # Iterate through all regex matches in text and try to parse the date |     # Iterate through all regex matches in text and try to parse the date | ||||||
|     for m in re.finditer(DATE_REGEX, text): |     for m in re.finditer(DATE_REGEX, text): | ||||||
|         date_string = m.group(0) |         date_string = m.group(0) | ||||||
|         logger.info(f"Found potential date: {date_string}") |  | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
|             date = __parser(date_string, settings.DATE_ORDER) |             date = __parser(date_string, settings.DATE_ORDER) | ||||||
| @@ -279,10 +272,7 @@ def parse_date(filename, text) -> Optional[datetime.datetime]: | |||||||
|  |  | ||||||
|         date = __filter(date) |         date = __filter(date) | ||||||
|         if date is not None: |         if date is not None: | ||||||
|             logger.info(f"Found date: {date}") |  | ||||||
|             return date |             return date | ||||||
|         else: |  | ||||||
|             logger.info("Filtered date out") |  | ||||||
|  |  | ||||||
|     return date |     return date | ||||||
|  |  | ||||||
|   | |||||||
| @@ -605,21 +605,40 @@ PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv( | |||||||
| if PAPERLESS_TIKA_ENABLED: | if PAPERLESS_TIKA_ENABLED: | ||||||
|     INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig") |     INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig") | ||||||
|  |  | ||||||
| # List dates that should be ignored when trying to parse date from document text |  | ||||||
| IGNORE_DATES: Set[datetime.date] = set() |  | ||||||
|  |  | ||||||
|  | def _parse_ignore_dates( | ||||||
|  |     env_ignore: str, | ||||||
|  |     date_order: str = DATE_ORDER, | ||||||
|  | ) -> Set[datetime.datetime]: | ||||||
|  |     """ | ||||||
|  |     If the PAPERLESS_IGNORE_DATES environment variable is set, parse the | ||||||
|  |     user provided string(s) into dates | ||||||
|  |  | ||||||
| def _parse_ignore_dates(env_ignore: str) -> Set[datetime.datetime]: |     Args: | ||||||
|  |         env_ignore (str): The value of the environment variable, comma separated dates | ||||||
|  |         date_order (str, optional): The format of the date strings. Defaults to DATE_ORDER. | ||||||
|  |  | ||||||
|  |     Returns: | ||||||
|  |         Set[datetime.datetime]: The set of parsed date objects | ||||||
|  |     """ | ||||||
|     import dateparser |     import dateparser | ||||||
|  |  | ||||||
|     ignored_dates = set() |     ignored_dates = set() | ||||||
|     for s in env_ignore.split(","): |     for s in env_ignore.split(","): | ||||||
|         d = dateparser.parse(s) |         d = dateparser.parse( | ||||||
|  |             s, | ||||||
|  |             settings={ | ||||||
|  |                 "DATE_ORDER": date_order, | ||||||
|  |             }, | ||||||
|  |         ) | ||||||
|         if d: |         if d: | ||||||
|             ignored_dates.add(d.date()) |             ignored_dates.add(d.date()) | ||||||
|     return ignored_dates |     return ignored_dates | ||||||
|  |  | ||||||
|  |  | ||||||
|  | # List dates that should be ignored when trying to parse date from document text | ||||||
|  | IGNORE_DATES: Set[datetime.date] = set() | ||||||
|  |  | ||||||
| if os.getenv("PAPERLESS_IGNORE_DATES") is not None: | if os.getenv("PAPERLESS_IGNORE_DATES") is not None: | ||||||
|     IGNORE_DATES = _parse_ignore_dates(os.getenv("PAPERLESS_IGNORE_DATES")) |     IGNORE_DATES = _parse_ignore_dates(os.getenv("PAPERLESS_IGNORE_DATES")) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -9,6 +9,20 @@ class TestIgnoreDateParsing(TestCase): | |||||||
|     Tests the parsing of the PAPERLESS_IGNORE_DATES setting value |     Tests the parsing of the PAPERLESS_IGNORE_DATES setting value | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|  |     def _parse_checker(self, test_cases): | ||||||
|  |         """ | ||||||
|  |         Helper function to check ignore date parsing | ||||||
|  |  | ||||||
|  |         Args: | ||||||
|  |             test_cases (list): Tuples of (env_str, date_format, expected_date_set) | ||||||
|  |         """ | ||||||
|  |         for env_str, date_format, expected_date_set in test_cases: | ||||||
|  |  | ||||||
|  |             self.assertSetEqual( | ||||||
|  |                 _parse_ignore_dates(env_str, date_format), | ||||||
|  |                 expected_date_set, | ||||||
|  |             ) | ||||||
|  |  | ||||||
|     def test_no_ignore_dates_set(self): |     def test_no_ignore_dates_set(self): | ||||||
|         """ |         """ | ||||||
|         GIVEN: |         GIVEN: | ||||||
| @@ -26,20 +40,19 @@ class TestIgnoreDateParsing(TestCase): | |||||||
|             - All ignore dates are parsed |             - All ignore dates are parsed | ||||||
|         """ |         """ | ||||||
|         test_cases = [ |         test_cases = [ | ||||||
|             ("1985-05-01", [datetime.date(1985, 5, 1)]), |             ("1985-05-01", "YMD", {datetime.date(1985, 5, 1)}), | ||||||
|             ( |             ( | ||||||
|                 "1985-05-01,1991-12-05", |                 "1985-05-01,1991-12-05", | ||||||
|                 [datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)], |                 "YMD", | ||||||
|  |                 {datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)}, | ||||||
|  |             ), | ||||||
|  |             ("2010-12-13", "YMD", {datetime.date(2010, 12, 13)}), | ||||||
|  |             ("11.01.10", "DMY", {datetime.date(2010, 1, 11)}), | ||||||
|  |             ( | ||||||
|  |                 "11.01.2001,15-06-1996", | ||||||
|  |                 "DMY", | ||||||
|  |                 {datetime.date(2001, 1, 11), datetime.date(1996, 6, 15)}, | ||||||
|             ), |             ), | ||||||
|             ("2010-12-13", [datetime.date(2010, 12, 13)]), |  | ||||||
|         ] |         ] | ||||||
|         for env_str, expected_dates in test_cases: |  | ||||||
|             expected_date_set = set() |  | ||||||
|  |  | ||||||
|             for expected_date in expected_dates: |         self._parse_checker(test_cases) | ||||||
|                 expected_date_set.add(expected_date) |  | ||||||
|  |  | ||||||
|             self.assertSetEqual( |  | ||||||
|                 _parse_ignore_dates(env_str), |  | ||||||
|                 expected_date_set, |  | ||||||
|             ) |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user