mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-28 03:46:06 -05:00 
			
		
		
		
	Merge pull request #251 from jayme-github/ignore-date
Add option to ignore certain dates in parse_date
This commit is contained in:
		| @@ -441,6 +441,19 @@ PAPERLESS_THUMBNAIL_FONT_NAME=<filename> | ||||
|  | ||||
|     Defaults to ``/usr/share/fonts/liberation/LiberationSerif-Regular.ttf``. | ||||
|  | ||||
| PAPERLESS_IGNORE_DATES=<string> | ||||
|     Paperless parses a document's creation date from filename and file content. | ||||
|     You may specify a comma-separated list of dates that should be ignored during | ||||
|     this process. This is useful for special dates (like date of birth) that appear | ||||
|     in documents regularly but are very unlikely to be the document's creation date. | ||||
|  | ||||
|     You may specify dates in a multitude of formats supported by dateparser (see | ||||
|     https://dateparser.readthedocs.io/en/latest/#popular-formats) but as the dates | ||||
|     need to be comma-separated, formats that themselves contain a comma cannot be used. | ||||
|     Example: "2020-12-02,22.04.1999" | ||||
|  | ||||
|     Defaults to an empty string, so that no dates are ignored. | ||||
|  | ||||
|  | ||||
| Binaries | ||||
| ######## | ||||
|   | ||||
| @@ -50,11 +50,14 @@ | ||||
| #PAPERLESS_TIME_ZONE=UTC | ||||
| #PAPERLESS_CONSUMER_POLLING=10 | ||||
| #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false | ||||
| #PAPERLESS_CONSUMER_RECURSIVE=false | ||||
| #PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false | ||||
| #PAPERLESS_OPTIMIZE_THUMBNAILS=true | ||||
| #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh | ||||
| #PAPERLESS_FILENAME_DATE_ORDER=YMD | ||||
| #PAPERLESS_FILENAME_PARSE_TRANSFORMS=[] | ||||
| #PAPERLESS_THUMBNAIL_FONT_NAME= | ||||
| #PAPERLESS_IGNORE_DATES= | ||||
|  | ||||
| # Tika settings | ||||
|  | ||||
|   | ||||
| @@ -210,6 +210,13 @@ def parse_date(filename, text): | ||||
|             } | ||||
|         ) | ||||
|  | ||||
|     def __filter(date): | ||||
|         if date and date.year > 1900 and \ | ||||
|                 date <= timezone.now() and \ | ||||
|                 date.date() not in settings.IGNORE_DATES: | ||||
|             return date | ||||
|         return None | ||||
|  | ||||
|     date = None | ||||
|  | ||||
|     # if filename date parsing is enabled, search there first: | ||||
| @@ -223,7 +230,8 @@ def parse_date(filename, text): | ||||
|                 # Skip all matches that do not parse to a proper date | ||||
|                 continue | ||||
|  | ||||
|             if date and date.year > 1900 and date <= timezone.now(): | ||||
|             date = __filter(date) | ||||
|             if date is not None: | ||||
|                 return date | ||||
|  | ||||
|     # Iterate through all regex matches in text and try to parse the date | ||||
| @@ -236,10 +244,9 @@ def parse_date(filename, text): | ||||
|             # Skip all matches that do not parse to a proper date | ||||
|             continue | ||||
|  | ||||
|         if date and date.year > 1900 and date <= timezone.now(): | ||||
|         date = __filter(date) | ||||
|         if date is not None: | ||||
|             break | ||||
|         else: | ||||
|             date = None | ||||
|  | ||||
|     return date | ||||
|  | ||||
|   | ||||
| @@ -138,3 +138,18 @@ class TestDate(TestCase): | ||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||
|     def test_filename_date_parse_invalid(self, *args): | ||||
|         self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")) | ||||
|  | ||||
|     @override_settings(IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))) | ||||
|     def test_ignored_dates(self, *args): | ||||
|         text = ( | ||||
|             "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem " | ||||
|             "ipsum" | ||||
|         ) | ||||
|         date = parse_date("", text) | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2018, 2, 13, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
| @@ -4,6 +4,7 @@ import multiprocessing | ||||
| import os | ||||
| import re | ||||
|  | ||||
| import dateparser | ||||
| from dotenv import load_dotenv | ||||
|  | ||||
| from django.utils.translation import gettext_lazy as _ | ||||
| @@ -446,3 +447,10 @@ PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost | ||||
| PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv( | ||||
|     "PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000" | ||||
| ) | ||||
|  | ||||
| # Dates that should be ignored when parsing a document's date from its filename or text | ||||
| IGNORE_DATES = set() | ||||
| for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","): | ||||
|     d = dateparser.parse(s) | ||||
|     if d: | ||||
|         IGNORE_DATES.add(d.date()) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Jonas Winkler
					Jonas Winkler