mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-28 03:46:06 -05:00 
			
		
		
		
	Merge pull request #251 from jayme-github/ignore-date
Add option to ignore certain dates in parse_date
This commit is contained in:
		| @@ -441,6 +441,19 @@ PAPERLESS_THUMBNAIL_FONT_NAME=<filename> | |||||||
|  |  | ||||||
|     Defaults to ``/usr/share/fonts/liberation/LiberationSerif-Regular.ttf``. |     Defaults to ``/usr/share/fonts/liberation/LiberationSerif-Regular.ttf``. | ||||||
|  |  | ||||||
|  | PAPERLESS_IGNORE_DATES=<string> | ||||||
|  |     Paperless parses a document's creation date from filename and file content. | ||||||
|  |     You may specify a comma-separated list of dates that should be ignored during | ||||||
|  |     this process. This is useful for special dates (like date of birth) that appear | ||||||
|  |     in documents regularly but are very unlikely to be the document's creation date. | ||||||
|  |  | ||||||
|  |     You may specify dates in a multitude of formats supported by dateparser (see | ||||||
|  |     https://dateparser.readthedocs.io/en/latest/#popular-formats) but as the dates | ||||||
|  |     need to be comma separated, the options are limited. | ||||||
|  |     Example: "2020-12-02,22.04.1999" | ||||||
|  |  | ||||||
|  |     Defaults to an empty string to not ignore any dates. | ||||||
|  |  | ||||||
|  |  | ||||||
| Binaries | Binaries | ||||||
| ######## | ######## | ||||||
|   | |||||||
| @@ -50,11 +50,14 @@ | |||||||
| #PAPERLESS_TIME_ZONE=UTC | #PAPERLESS_TIME_ZONE=UTC | ||||||
| #PAPERLESS_CONSUMER_POLLING=10 | #PAPERLESS_CONSUMER_POLLING=10 | ||||||
| #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false | #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false | ||||||
|  | #PAPERLESS_CONSUMER_RECURSIVE=false | ||||||
|  | #PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false | ||||||
| #PAPERLESS_OPTIMIZE_THUMBNAILS=true | #PAPERLESS_OPTIMIZE_THUMBNAILS=true | ||||||
| #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh | #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh | ||||||
| #PAPERLESS_FILENAME_DATE_ORDER=YMD | #PAPERLESS_FILENAME_DATE_ORDER=YMD | ||||||
| #PAPERLESS_FILENAME_PARSE_TRANSFORMS=[] | #PAPERLESS_FILENAME_PARSE_TRANSFORMS=[] | ||||||
| #PAPERLESS_THUMBNAIL_FONT_NAME= | #PAPERLESS_THUMBNAIL_FONT_NAME= | ||||||
|  | #PAPERLESS_IGNORE_DATES= | ||||||
|  |  | ||||||
| # Tika settings | # Tika settings | ||||||
|  |  | ||||||
|   | |||||||
| @@ -210,6 +210,13 @@ def parse_date(filename, text): | |||||||
|             } |             } | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |     def __filter(date): | ||||||
|  |         if date and date.year > 1900 and \ | ||||||
|  |                 date <= timezone.now() and \ | ||||||
|  |                 date.date() not in settings.IGNORE_DATES: | ||||||
|  |             return date | ||||||
|  |         return None | ||||||
|  |  | ||||||
|     date = None |     date = None | ||||||
|  |  | ||||||
|     # if filename date parsing is enabled, search there first: |     # if filename date parsing is enabled, search there first: | ||||||
| @@ -223,7 +230,8 @@ def parse_date(filename, text): | |||||||
|                 # Skip all matches that do not parse to a proper date |                 # Skip all matches that do not parse to a proper date | ||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|             if date and date.year > 1900 and date <= timezone.now(): |             date = __filter(date) | ||||||
|  |             if date is not None: | ||||||
|                 return date |                 return date | ||||||
|  |  | ||||||
|     # Iterate through all regex matches in text and try to parse the date |     # Iterate through all regex matches in text and try to parse the date | ||||||
| @@ -236,10 +244,9 @@ def parse_date(filename, text): | |||||||
|             # Skip all matches that do not parse to a proper date |             # Skip all matches that do not parse to a proper date | ||||||
|             continue |             continue | ||||||
|  |  | ||||||
|         if date and date.year > 1900 and date <= timezone.now(): |         date = __filter(date) | ||||||
|  |         if date is not None: | ||||||
|             break |             break | ||||||
|         else: |  | ||||||
|             date = None |  | ||||||
|  |  | ||||||
|     return date |     return date | ||||||
|  |  | ||||||
|   | |||||||
| @@ -138,3 +138,18 @@ class TestDate(TestCase): | |||||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") |     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||||
|     def test_filename_date_parse_invalid(self, *args): |     def test_filename_date_parse_invalid(self, *args): | ||||||
|         self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")) |         self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")) | ||||||
|  |  | ||||||
|  |     @override_settings(IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))) | ||||||
|  |     def test_ignored_dates(self, *args): | ||||||
|  |         text = ( | ||||||
|  |             "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem " | ||||||
|  |             "ipsum" | ||||||
|  |         ) | ||||||
|  |         date = parse_date("", text) | ||||||
|  |         self.assertEqual( | ||||||
|  |             date, | ||||||
|  |             datetime.datetime( | ||||||
|  |                 2018, 2, 13, 0, 0, | ||||||
|  |                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
| @@ -4,6 +4,7 @@ import multiprocessing | |||||||
| import os | import os | ||||||
| import re | import re | ||||||
|  |  | ||||||
|  | import dateparser | ||||||
| from dotenv import load_dotenv | from dotenv import load_dotenv | ||||||
|  |  | ||||||
| from django.utils.translation import gettext_lazy as _ | from django.utils.translation import gettext_lazy as _ | ||||||
| @@ -446,3 +447,10 @@ PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost | |||||||
| PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv( | PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv( | ||||||
|     "PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000" |     "PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000" | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  | # Dates that should be ignored when parsing a document's date from its filename or text | ||||||
|  | IGNORE_DATES = set() | ||||||
|  | for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","): | ||||||
|  |     d = dateparser.parse(s) | ||||||
|  |     if d: | ||||||
|  |         IGNORE_DATES.add(d.date()) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Jonas Winkler
					Jonas Winkler