From 654ee4e62e774aad562bfd708fe873e21fbcee1c Mon Sep 17 00:00:00 2001 From: jayme-github Date: Sat, 2 Jan 2021 14:40:56 +0100 Subject: [PATCH 1/3] Add option to ignore certain dates in parse_date PAPERLESS_IGNORE_DATES allows to specify a comma separated list of dates to ignore during date parsing (from filename and content). This can be used so specify dates that do appear often in documents but are usually not the documents creation date (like your date of birth). --- src/documents/parsers.py | 15 +++++++++++---- src/documents/tests/test_date_parsing.py | 15 +++++++++++++++ src/paperless/settings.py | 8 ++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index e14607bd0..cf413a449 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -209,6 +209,13 @@ def parse_date(filename, text): } ) + def __filter(date): + if date and date.year > 1900 and \ + date <= timezone.now() and \ + date.date() not in settings.IGNORE_DATES: + return date + return None + date = None # if filename date parsing is enabled, search there first: @@ -222,7 +229,8 @@ def parse_date(filename, text): # Skip all matches that do not parse to a proper date continue - if date and date.year > 1900 and date <= timezone.now(): + date = __filter(date) + if date is not None: return date # Iterate through all regex matches in text and try to parse the date @@ -235,10 +243,9 @@ def parse_date(filename, text): # Skip all matches that do not parse to a proper date continue - if date and date.year > 1900 and date <= timezone.now(): + date = __filter(date) + if date is not None: break - else: - date = None return date diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py index 357b0937e..9cbb19c2b 100644 --- a/src/documents/tests/test_date_parsing.py +++ b/src/documents/tests/test_date_parsing.py @@ -138,3 +138,18 @@ class TestDate(TestCase): @override_settings(FILENAME_DATE_ORDER="YMD") def test_filename_date_parse_invalid(self, *args): self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")) + + @override_settings(IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))) + def test_ignored_dates(self, *args): + text = ( + "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem " + "ipsum" + ) + date = parse_date("", text) + self.assertEqual( + date, + datetime.datetime( + 2018, 2, 13, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) \ No newline at end of file diff --git a/src/paperless/settings.py b/src/paperless/settings.py index e8b44e8cd..5191803d0 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -4,6 +4,7 @@ import multiprocessing import os import re +import dateparser from dotenv import load_dotenv from django.utils.translation import gettext_lazy as _ @@ -444,3 +445,10 @@ PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv( "PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000" ) + +# List dates that should be ignored when trying to parse date from document text +IGNORE_DATES = set() +for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","): + d = dateparser.parse(s) + if d: + IGNORE_DATES.add(d.date()) From 2aa2086dfbdbdef5839f35e08d2cc8a996ca4446 Mon Sep 17 00:00:00 2001 From: jayme-github Date: Sun, 3 Jan 2021 14:35:28 +0100 Subject: [PATCH 2/3] Add missing config options to example file --- paperless.conf.example | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paperless.conf.example b/paperless.conf.example index d9d0f5b06..2e46b91de 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -50,11 +50,14 @@ #PAPERLESS_TIME_ZONE=UTC #PAPERLESS_CONSUMER_POLLING=10 #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false +#PAPERLESS_CONSUMER_RECURSIVE=false +#PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false #PAPERLESS_OPTIMIZE_THUMBNAILS=true #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh #PAPERLESS_FILENAME_DATE_ORDER=YMD #PAPERLESS_FILENAME_PARSE_TRANSFORMS=[] #PAPERLESS_THUMBNAIL_FONT_NAME= +#PAPERLESS_IGNORE_DATES= # Binaries From be2061b74d9ad99a4caf0568aa49275bc5eb2eb7 Mon Sep 17 00:00:00 2001 From: jayme-github Date: Sun, 3 Jan 2021 14:47:04 +0100 Subject: [PATCH 3/3] Add PAPERLESS_IGNORE_DATES to docs --- docs/configuration.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index 49c95bff1..5036d6aa4 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -438,6 +438,19 @@ PAPERLESS_THUMBNAIL_FONT_NAME= Defaults to ``/usr/share/fonts/liberation/LiberationSerif-Regular.ttf``. +PAPERLESS_IGNORE_DATES= + Paperless parses a documents creation date from filename and file content. + You may specify a comma separated list of dates that should be ignored during + this process. This is useful for special dates (like date of birth) that appear + in documents regularly but are very unlikely to be the documents creation date. + + You may specify dates in a multitude of formats supported by dateparser (see + https://dateparser.readthedocs.io/en/latest/#popular-formats) but as the dates + need to be comma separated, the options are limited. + Example: "2020-12-02,22.04.1999" + + Defaults to an empty string to not ignore any dates. + Binaries ########