mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Merge pull request #302 from BastianPoe/bugfix/extend_regex_to_find_more_dates
Extends the regex to find more dates in documents, as reported by @isaacsando
This commit is contained in:
		| @@ -202,7 +202,13 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         return text | ||||
|  | ||||
|     def get_date(self): | ||||
|         text = self.get_text() | ||||
|         date = None | ||||
|         datestring = None | ||||
|  | ||||
|         try: | ||||
|             text = self.get_text() | ||||
|         except ParseError as e: | ||||
|             return None | ||||
|  | ||||
|         # This regular expression will try to find dates in the document at | ||||
|         # hand and will match the following formats: | ||||
| @@ -210,19 +216,38 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits | ||||
|         # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits | ||||
|         # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits | ||||
|         # - MONTH ZZZZ | ||||
|         m = re.search( | ||||
|         # - MONTH ZZZZ, with ZZZZ being 4 digits | ||||
|         # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits | ||||
|         pattern = re.compile( | ||||
|             r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + | ||||
|             r'\b([0-9]{1,2}\. [^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + | ||||
|             r'\b([^ ]{3,9} [0-9]{4})\b', text) | ||||
|             r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + | ||||
|             r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + | ||||
|             r'\b([^\W\d_]{3,9} [0-9]{4})\b') | ||||
|  | ||||
|         if m is None: | ||||
|             return None | ||||
|         # Iterate through all regex matches and try to parse the date | ||||
|         for m in re.finditer(pattern, text): | ||||
|             datestring = m.group(0) | ||||
|  | ||||
|         return dateparser.parse(m.group(0), | ||||
|                                 settings={'DATE_ORDER': self.DATE_ORDER, | ||||
|                                           'PREFER_DAY_OF_MONTH': 'first', | ||||
|                                           'RETURN_AS_TIMEZONE_AWARE': True}) | ||||
|             try: | ||||
|                 date = dateparser.parse( | ||||
|                            datestring, | ||||
|                            settings={'DATE_ORDER': self.DATE_ORDER, | ||||
|                                      'PREFER_DAY_OF_MONTH': 'first', | ||||
|                                      'RETURN_AS_TIMEZONE_AWARE': True}) | ||||
|             except TypeError: | ||||
|                 # Skip all matches that do not parse to a proper date | ||||
|                 continue | ||||
|  | ||||
|             if date is not None: | ||||
|                 break | ||||
|  | ||||
|         if date is not None: | ||||
|             self.log("info", "Detected document date " + date.strftime("%x") + | ||||
|                              " based on string " + datestring) | ||||
|         else: | ||||
|             self.log("info", "Unable to detect date for document") | ||||
|  | ||||
|         return date | ||||
|  | ||||
|  | ||||
| def run_convert(*args): | ||||
|   | ||||
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/tests_date_8.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/tests_date_8.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/tests_date_9.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/tests_date_9.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| @@ -25,6 +25,97 @@ class TestDate(TestCase): | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||
|         SCRATCH | ||||
|     ) | ||||
|     def test_date_format_1(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file) | ||||
|         document.TEXT_CACHE = "lorem ipsum 130218 lorem ipsum" | ||||
|         self.assertEqual(document.get_date(), | ||||
|                          None) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||
|         SAMPLE_FILES | ||||
|     ) | ||||
|     def test_date_format_2(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file) | ||||
|         document.TEXT_CACHE = "lorem ipsum 2018 lorem ipsum" | ||||
|         self.assertEqual(document.get_date(), | ||||
|                          None) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||
|         SAMPLE_FILES | ||||
|     ) | ||||
|     def test_date_format_3(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file) | ||||
|         document.TEXT_CACHE = "lorem ipsum 20180213 lorem ipsum" | ||||
|         self.assertEqual(document.get_date(), | ||||
|                          None) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||
|         SAMPLE_FILES | ||||
|     ) | ||||
|     def test_date_format_4(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file) | ||||
|         document.TEXT_CACHE = "lorem ipsum 13.02.2018 lorem ipsum" | ||||
|         self.assertEqual(document.get_date(), | ||||
|                          datetime.datetime(2018, 2, 13, 0, 0, | ||||
|                                            tzinfo=tz.tzutc())) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||
|         SAMPLE_FILES | ||||
|     ) | ||||
|     def test_date_format_5(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file) | ||||
|         document.TEXT_CACHE = ("lorem ipsum 130218, 2018, 20180213 and " | ||||
|                                "13.02.2018 lorem ipsum") | ||||
|         self.assertEqual(document.get_date(), | ||||
|                          datetime.datetime(2018, 2, 13, 0, 0, | ||||
|                                            tzinfo=tz.tzutc())) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||
|         SAMPLE_FILES | ||||
|     ) | ||||
|     def test_date_format_6(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file) | ||||
|         document.TEXT_CACHE = ("lorem ipsum\n" | ||||
|                                "Wohnort\n" | ||||
|                                "3100\n" | ||||
|                                "IBAN\n" | ||||
|                                "AT87 4534\n" | ||||
|                                "1234\n" | ||||
|                                "1234 5678\n" | ||||
|                                "BIC\n" | ||||
|                                "lorem ipsum") | ||||
|         self.assertEqual(document.get_date(), | ||||
|                          None) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||
|         SAMPLE_FILES | ||||
|     ) | ||||
|     def test_date_format_7(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file) | ||||
|         document.TEXT_CACHE = ("lorem ipsum\n" | ||||
|                                "März 2019\n" | ||||
|                                "lorem ipsum") | ||||
|         self.assertEqual(document.get_date(), | ||||
|                          datetime.datetime(2019, 3, 1, 0, 0, | ||||
|                                            tzinfo=tz.tzutc())) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||
|         SAMPLE_FILES | ||||
|     ) | ||||
|     def test_get_text_1_pdf(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf") | ||||
|         document = RasterisedDocumentParser(input_file) | ||||
| @@ -213,3 +304,29 @@ class TestDate(TestCase): | ||||
|         self.assertEqual(document.get_date(), | ||||
|                          datetime.datetime(2018, 4, 1, 0, 0, | ||||
|                                            tzinfo=tz.tzutc())) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||
|         SAMPLE_FILES | ||||
|     ) | ||||
|     def test_get_text_8_pdf(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf") | ||||
|         document = RasterisedDocumentParser(input_file) | ||||
|         document.get_text() | ||||
|         self.assertEqual(document._is_ocred(), True) | ||||
|         self.assertEqual(document.get_date(), | ||||
|                          datetime.datetime(2017, 12, 31, 0, 0, | ||||
|                                            tzinfo=tz.tzutc())) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||
|         SAMPLE_FILES | ||||
|     ) | ||||
|     def test_get_text_9_pdf(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf") | ||||
|         document = RasterisedDocumentParser(input_file) | ||||
|         document.get_text() | ||||
|         self.assertEqual(document._is_ocred(), True) | ||||
|         self.assertEqual(document.get_date(), | ||||
|                          datetime.datetime(2017, 12, 31, 0, 0, | ||||
|                                            tzinfo=tz.tzutc())) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn