Merge pull request #251 from jayme-github/ignore-date

Add option to ignore certain dates in parse_date
2026-02-05 23:32:46 -06:00 · 2021-01-05 00:19:13 +01:00
parent 7587150f96 be2061b74d
commit be94a8e49a
5 changed files with 50 additions and 4 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -441,6 +441,19 @@ PAPERLESS_THUMBNAIL_FONT_NAME=<filename>

    Defaults to ``/usr/share/fonts/liberation/LiberationSerif-Regular.ttf``.

+PAPERLESS_IGNORE_DATES=<string>
+    Paperless parses a document's creation date from filename and file content.
+    You may specify a comma-separated list of dates that should be ignored during
+    this process. This is useful for special dates (like date of birth) that appear
+    in documents regularly but are very unlikely to be the document's creation date.
+
+    You may specify dates in a multitude of formats supported by dateparser (see
+    https://dateparser.readthedocs.io/en/latest/#popular-formats), but because the
+    list is split on commas, date formats that contain a comma cannot be used.
+    Example: "2020-12-02,22.04.1999"
+
+    Defaults to an empty string to not ignore any dates.
+

 Binaries
 ########
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -50,11 +50,14 @@
 #PAPERLESS_TIME_ZONE=UTC
 #PAPERLESS_CONSUMER_POLLING=10
 #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
+#PAPERLESS_CONSUMER_RECURSIVE=false
+#PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false
 #PAPERLESS_OPTIMIZE_THUMBNAILS=true
 #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
 #PAPERLESS_FILENAME_DATE_ORDER=YMD
 #PAPERLESS_FILENAME_PARSE_TRANSFORMS=[]
 #PAPERLESS_THUMBNAIL_FONT_NAME=
+#PAPERLESS_IGNORE_DATES=

 # Tika settings

--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -210,6 +210,13 @@ def parse_date(filename, text):
            }
        )

+    def __filter(date):
+        if date and date.year > 1900 and \
+                date <= timezone.now() and \
+                date.date() not in settings.IGNORE_DATES:
+            return date
+        return None
+
    date = None

    # if filename date parsing is enabled, search there first:
@@ -223,7 +230,8 @@ def parse_date(filename, text):
                # Skip all matches that do not parse to a proper date
                continue

-            if date and date.year > 1900 and date <= timezone.now():
+            date = __filter(date)
+            if date is not None:
                return date

    # Iterate through all regex matches in text and try to parse the date
@@ -236,10 +244,9 @@ def parse_date(filename, text):
            # Skip all matches that do not parse to a proper date
            continue

-        if date and date.year > 1900 and date <= timezone.now():
+        date = __filter(date)
+        if date is not None:
            break
-        else:
-            date = None

    return date

--- a/src/documents/tests/test_date_parsing.py
+++ b/src/documents/tests/test_date_parsing.py
@@ -138,3 +138,18 @@ class TestDate(TestCase):
    @override_settings(FILENAME_DATE_ORDER="YMD")
    def test_filename_date_parse_invalid(self, *args):
        self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))
+
+    @override_settings(IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)))
+    def test_ignored_dates(self, *args):
+        text = (
+            "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem "
+            "ipsum"
+        )
+        date = parse_date("", text)
+        self.assertEqual(
+            date,
+            datetime.datetime(
+                2018, 2, 13, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
+        )
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -4,6 +4,7 @@ import multiprocessing
 import os
 import re

+import dateparser
 from dotenv import load_dotenv

 from django.utils.translation import gettext_lazy as _
@@ -446,3 +447,10 @@ PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost
 PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
    "PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000"
 )
+
+# Dates to ignore when parsing a document's date from its filename or content
+IGNORE_DATES = set()
+for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","):
+    d = dateparser.parse(s)
+    if d:
+        IGNORE_DATES.add(d.date())