diff --git a/docs/configuration.rst b/docs/configuration.rst index 454377283..7d297a760 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -441,6 +441,19 @@ PAPERLESS_THUMBNAIL_FONT_NAME= Defaults to ``/usr/share/fonts/liberation/LiberationSerif-Regular.ttf``. +PAPERLESS_IGNORE_DATES= + Paperless parses a document's creation date from filename and file content. + You may specify a comma-separated list of dates that should be ignored during + this process. This is useful for special dates (like date of birth) that appear + in documents regularly but are very unlikely to be the document's creation date. + + You may specify dates in a multitude of formats supported by dateparser (see + https://dateparser.readthedocs.io/en/latest/#popular-formats) but as the dates + need to be comma separated, the options are limited. + Example: "2020-12-02,22.04.1999" + + Defaults to an empty string to not ignore any dates. + Binaries ######## diff --git a/paperless.conf.example b/paperless.conf.example index d732bb5cb..cda52cc19 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -50,11 +50,14 @@ #PAPERLESS_TIME_ZONE=UTC #PAPERLESS_CONSUMER_POLLING=10 #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false +#PAPERLESS_CONSUMER_RECURSIVE=false +#PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false #PAPERLESS_OPTIMIZE_THUMBNAILS=true #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh #PAPERLESS_FILENAME_DATE_ORDER=YMD #PAPERLESS_FILENAME_PARSE_TRANSFORMS=[] #PAPERLESS_THUMBNAIL_FONT_NAME= +#PAPERLESS_IGNORE_DATES= # Tika settings diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 371a53c4b..c0039207f 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -210,6 +210,13 @@ def parse_date(filename, text): } ) + def __filter(date): + if date and date.year > 1900 and \ + date <= timezone.now() and \ + date.date() not in settings.IGNORE_DATES: + return date + return None + date = None # if filename date parsing is enabled, search there 
first: @@ -223,7 +230,8 @@ def parse_date(filename, text): # Skip all matches that do not parse to a proper date continue - if date and date.year > 1900 and date <= timezone.now(): + date = __filter(date) + if date is not None: return date # Iterate through all regex matches in text and try to parse the date @@ -236,10 +244,9 @@ def parse_date(filename, text): # Skip all matches that do not parse to a proper date continue - if date and date.year > 1900 and date <= timezone.now(): + date = __filter(date) + if date is not None: break - else: - date = None return date diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py index 357b0937e..9cbb19c2b 100644 --- a/src/documents/tests/test_date_parsing.py +++ b/src/documents/tests/test_date_parsing.py @@ -138,3 +138,18 @@ class TestDate(TestCase): @override_settings(FILENAME_DATE_ORDER="YMD") def test_filename_date_parse_invalid(self, *args): self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")) + + @override_settings(IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))) + def test_ignored_dates(self, *args): + text = ( + "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem " + "ipsum" + ) + date = parse_date("", text) + self.assertEqual( + date, + datetime.datetime( + 2018, 2, 13, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) \ No newline at end of file diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 1fd54823f..9f770aeae 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -4,6 +4,7 @@ import multiprocessing import os import re +import dateparser from dotenv import load_dotenv from django.utils.translation import gettext_lazy as _ @@ -446,3 +447,10 @@ PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv( "PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000" ) + +# List dates that should be ignored 
when trying to parse date from document text +IGNORE_DATES = set() +for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","): + d = dateparser.parse(s) + if d: + IGNORE_DATES.add(d.date())