mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Add option for parsing of date from filename (and associated tests)
This commit is contained in:
		| @@ -122,6 +122,14 @@ PAPERLESS_EMAIL_SECRET="" | |||||||
| # "true", the document will instead be opened in the browser, if possible. | # "true", the document will instead be opened in the browser, if possible. | ||||||
| #PAPERLESS_INLINE_DOC="false" | #PAPERLESS_INLINE_DOC="false" | ||||||
|  |  | ||||||
|  | # By default, paperless will check the document text for document date information. | ||||||
|  | # Uncomment the line below to enable checking the document filename for date | ||||||
|  | # information. The date order can be set to any DATE_ORDER value supported by | ||||||
|  | # the dateparser library (for example "YMD", "DMY" or "MDY"), as specified in | ||||||
|  | # https://dateparser.readthedocs.io/en/latest/#settings. The filename will be | ||||||
|  | # checked first, and if nothing is found, the document text will be checked | ||||||
|  | # as normal. | ||||||
|  | #PAPERLESS_FILENAME_DATE_ORDER="YMD" | ||||||
|  |  | ||||||
| # | # | ||||||
| # The following values use sensible defaults for modern systems, but if you're | # The following values use sensible defaults for modern systems, but if you're | ||||||
| # running Paperless on a low-resource device (like a Raspberry Pi), modifying | # running Paperless on a low-resource device (like a Raspberry Pi), modifying | ||||||
|   | |||||||
| @@ -292,3 +292,4 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END") | |||||||
|  |  | ||||||
| # Specify the default date order (for autodetected dates) | # Specify the default date order (for autodetected dates) | ||||||
| DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY") | DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY") | ||||||
|  | FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") | ||||||
|   | |||||||
| @@ -34,6 +34,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None |     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None | ||||||
|     UNPAPER = settings.UNPAPER_BINARY |     UNPAPER = settings.UNPAPER_BINARY | ||||||
|     DATE_ORDER = settings.DATE_ORDER |     DATE_ORDER = settings.DATE_ORDER | ||||||
|  |     FILENAME_DATE_ORDER = settings.FILENAME_DATE_ORDER | ||||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE |     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||||
|     OCR_ALWAYS = settings.OCR_ALWAYS |     OCR_ALWAYS = settings.OCR_ALWAYS | ||||||
|  |  | ||||||
| @@ -206,7 +207,30 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|         date = None |         date = None | ||||||
|         datestring = None |         datestring = None | ||||||
|  |  | ||||||
|  |         if self.FILENAME_DATE_ORDER: | ||||||
|  |             self.log("info", "Checking document title for date") | ||||||
|  |             text = os.path.basename(self.document_path) | ||||||
|  |             for m in re.finditer(DATE_REGEX, text): | ||||||
|  |                 datestring = m.group(0) | ||||||
|  |                 try: | ||||||
|  |                     date = dateparser.parse( | ||||||
|  |                         datestring, | ||||||
|  |                         settings={'DATE_ORDER': self.FILENAME_DATE_ORDER, | ||||||
|  |                                   'PREFER_DAY_OF_MONTH': 'first', | ||||||
|  |                                   'RETURN_AS_TIMEZONE_AWARE': True}) | ||||||
|  |                 except TypeError: | ||||||
|  |                     # Skip all matches that do not parse to a proper date | ||||||
|  |                     continue | ||||||
|  |  | ||||||
|  |                 if date is not None: | ||||||
|  |                     self.log("info", | ||||||
|  |                              "Detected document date {} based on string {} " | ||||||
|  |                              "from document title" | ||||||
|  |                              "".format(date.isoformat(), datestring)) | ||||||
|  |                     return date | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
|  |             self.log('info', "Checking document text for date") | ||||||
|             text = self.get_text() |             text = self.get_text() | ||||||
|         except ParseError as e: |         except ParseError as e: | ||||||
|             return None |             return None | ||||||
|   | |||||||
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 136 KiB | 
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 136 KiB | 
| @@ -425,4 +425,72 @@ class TestDate(TestCase): | |||||||
|             datetime.datetime(2017, 12, 31, 0, 0, |             datetime.datetime(2017, 12, 31, 0, 0, | ||||||
|                               tzinfo=tz.gettz(settings.TIME_ZONE)) |                               tzinfo=tz.gettz(settings.TIME_ZONE)) | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |     @mock.patch( | ||||||
|  |         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||||
|  |         SCRATCH | ||||||
|  |     ) | ||||||
|  |     def test_filename_date_1_pdf(self): | ||||||
|  |         input_file = os.path.join(self.SAMPLE_FILES, | ||||||
|  |                                   "tests_date_in_filename_2018-03-20_1.pdf") | ||||||
|  |         document = RasterisedDocumentParser(input_file) | ||||||
|  |         document.FILENAME_DATE_ORDER = 'YMD' | ||||||
|  |         document.get_text() | ||||||
|  |         date = document.get_date() | ||||||
|  |         self.assertEqual(document._is_ocred(), True) | ||||||
|  |         self.assertEqual( | ||||||
|  |             date, | ||||||
|  |             datetime.datetime(2018, 3, 20, 0, 0, | ||||||
|  |                               tzinfo=tz.gettz(settings.TIME_ZONE)) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @mock.patch( | ||||||
|  |         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||||
|  |         SCRATCH | ||||||
|  |     ) | ||||||
|  |     def test_filename_date_1_png(self): | ||||||
|  |         input_file = os.path.join(self.SAMPLE_FILES, | ||||||
|  |                                   "tests_date_in_filename_2018-03-20_1.png") | ||||||
|  |         document = RasterisedDocumentParser(input_file) | ||||||
|  |         document.FILENAME_DATE_ORDER = 'YMD' | ||||||
|  |         date = document.get_date() | ||||||
|  |         self.assertEqual(document._is_ocred(), False) | ||||||
|  |         self.assertEqual( | ||||||
|  |             date, | ||||||
|  |             datetime.datetime(2018, 3, 20, 0, 0, | ||||||
|  |                               tzinfo=tz.gettz(settings.TIME_ZONE)) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @mock.patch( | ||||||
|  |         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||||
|  |         SCRATCH | ||||||
|  |     ) | ||||||
|  |     def test_filename_date_2_pdf(self): | ||||||
|  |         input_file = os.path.join(self.SAMPLE_FILES, | ||||||
|  |                                   "2013-12-11_tests_date_in_filename_2.pdf") | ||||||
|  |         document = RasterisedDocumentParser(input_file) | ||||||
|  |         document.FILENAME_DATE_ORDER = 'YMD' | ||||||
|  |         date = document.get_date() | ||||||
|  |         self.assertEqual(document._is_ocred(), True) | ||||||
|  |         self.assertEqual( | ||||||
|  |             date, | ||||||
|  |             datetime.datetime(2013, 12, 11, 0, 0, | ||||||
|  |                               tzinfo=tz.gettz(settings.TIME_ZONE)) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @mock.patch( | ||||||
|  |         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", | ||||||
|  |         SCRATCH | ||||||
|  |     ) | ||||||
|  |     def test_filename_date_2_png(self): | ||||||
|  |         input_file = os.path.join(self.SAMPLE_FILES, | ||||||
|  |                                   "2013-12-11_tests_date_in_filename_2.png") | ||||||
|  |         document = RasterisedDocumentParser(input_file) | ||||||
|  |         document.FILENAME_DATE_ORDER = 'YMD' | ||||||
|  |         date = document.get_date() | ||||||
|  |         self.assertEqual(document._is_ocred(), False) | ||||||
|  |         self.assertEqual( | ||||||
|  |             date, | ||||||
|  |             datetime.datetime(2013, 12, 11, 0, 0, | ||||||
|  |                               tzinfo=tz.gettz(settings.TIME_ZONE)) | ||||||
|         ) |         ) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Joshua Taillon
					Joshua Taillon