Merge pull request #251 from jayme-github/ignore-date

Add option to ignore certain dates in parse_date
2026-02-01 23:19:00 -06:00 · 2021-01-05 00:19:13 +01:00
parent 7587150f96 be2061b74d
commit be94a8e49a
5 changed files with 50 additions and 4 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -441,6 +441,19 @@ PAPERLESS_THUMBNAIL_FONT_NAME=<filename>
    Defaults to ``/usr/share/fonts/liberation/LiberationSerif-Regular.ttf``.
 PAPERLESS_IGNORE_DATES=<string>
    Paperless parses a document's creation date from filename and file content.
    You may specify a comma-separated list of dates that should be ignored during
    this process. This is useful for special dates (like date of birth) that appear
    in documents regularly but are very unlikely to be the document's creation date.
    You may specify dates in a multitude of formats supported by dateparser (see
    https://dateparser.readthedocs.io/en/latest/#popular-formats). Because the
    list is split on commas, date formats that themselves contain a comma cannot
    be used.
    Example: "2020-12-02,22.04.1999"
    Defaults to an empty string to not ignore any dates.
 Binaries
 ########
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -50,11 +50,14 @@
 #PAPERLESS_TIME_ZONE=UTC
 #PAPERLESS_CONSUMER_POLLING=10
 #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
 #PAPERLESS_CONSUMER_RECURSIVE=false
 #PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false
 #PAPERLESS_OPTIMIZE_THUMBNAILS=true
 #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
 #PAPERLESS_FILENAME_DATE_ORDER=YMD
 #PAPERLESS_FILENAME_PARSE_TRANSFORMS=[]
 #PAPERLESS_THUMBNAIL_FONT_NAME=
 #PAPERLESS_IGNORE_DATES=
 # Tika settings
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -210,6 +210,13 @@ def parse_date(filename, text):
            }
        )
    def __filter(date):
        """
        Return ``date`` when it is acceptable as a document date, else None.

        A date is rejected when it is falsy, when its year is 1900 or
        earlier, when it lies after ``timezone.now()``, or when its calendar
        day is contained in ``settings.IGNORE_DATES``.
        """
        if not date:
            return None
        # Reject implausibly old dates and dates in the future.
        if date.year <= 1900 or date > timezone.now():
            return None
        # Reject dates the configuration explicitly asks to ignore.
        if date.date() in settings.IGNORE_DATES:
            return None
        return date
    date = None
    # if filename date parsing is enabled, search there first:
@@ -223,7 +230,8 @@ def parse_date(filename, text):
                # Skip all matches that do not parse to a proper date
                continue
-            if date and date.year > 1900 and date <= timezone.now():
+            date = __filter(date)
            if date is not None:
                return date
    # Iterate through all regex matches in text and try to parse the date
@@ -236,10 +244,9 @@ def parse_date(filename, text):
            # Skip all matches that do not parse to a proper date
            continue
-        if date and date.year > 1900 and date <= timezone.now():
+        date = __filter(date)
        if date is not None:
            break
        else:
            date = None
    return date
--- a/src/documents/tests/test_date_parsing.py
+++ b/src/documents/tests/test_date_parsing.py
@@ -138,3 +138,18 @@ class TestDate(TestCase):
    @override_settings(FILENAME_DATE_ORDER="YMD")
    def test_filename_date_parse_invalid(self, *args):
        self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))
    @override_settings(IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)))
    def test_ignored_dates(self, *args):
        """
        With IGNORE_DATES overridden to 2019-11-03 and 2020-01-17, parsing
        the sample text is expected to yield 13.02.2018 at midnight in
        settings.TIME_ZONE.
        """
        expected = datetime.datetime(
            2018, 2, 13, 0, 0,
            tzinfo=tz.gettz(settings.TIME_ZONE)
        )
        content = (
            "lorem ipsum 110319, 20200117 and "
            "lorem 13.02.2018 lorem ipsum"
        )
        self.assertEqual(parse_date("", content), expected)
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -4,6 +4,7 @@ import multiprocessing
 import os
 import re
 import dateparser
 from dotenv import load_dotenv
 from django.utils.translation import gettext_lazy as _
@@ -446,3 +447,10 @@ PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost
 PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
    "PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000"
 )
 # Calendar dates that must never be reported as a document's date.
 # Built from the comma-separated PAPERLESS_IGNORE_DATES environment variable;
 # entries that dateparser cannot parse (including the empty default) are
 # dropped without error.
 IGNORE_DATES = set()
 for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","):
    d = dateparser.parse(s)
    if not d:
        continue
    # Only the calendar day is stored, not the time of day.
    IGNORE_DATES.add(d.date())