From cd15490e91e7db755ea3a5feb6edd4471d10d45f Mon Sep 17 00:00:00 2001
From: jayme-github <jayme-github@users.noreply.github.com>
Date: Sat, 2 Jan 2021 14:40:56 +0100
Subject: [PATCH] Add option to ignore certain dates in parse_date

PAPERLESS_IGNORE_DATES allows specifying a comma-separated list of dates
to ignore during date parsing (from filename and content). This can be
used to specify dates that appear often in documents but are usually
not the document's creation date (like your date of birth).
---
 src/documents/parsers.py                 | 15 +++++++++++----
 src/documents/tests/test_date_parsing.py | 15 +++++++++++++++
 src/paperless/settings.py                |  8 ++++++++
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/src/documents/parsers.py b/src/documents/parsers.py
index e14607bd0..cf413a449 100644
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -209,6 +209,13 @@ def parse_date(filename, text):
             }
         )
 
+    def __filter(date):
+        if date and date.year > 1900 and \
+                date <= timezone.now() and \
+                date.date() not in settings.IGNORE_DATES:
+            return date
+        return None
+
     date = None
 
     # if filename date parsing is enabled, search there first:
@@ -222,7 +229,8 @@ def parse_date(filename, text):
                 # Skip all matches that do not parse to a proper date
                 continue
 
-            if date and date.year > 1900 and date <= timezone.now():
+            date = __filter(date)
+            if date is not None:
                 return date
 
     # Iterate through all regex matches in text and try to parse the date
@@ -235,10 +243,9 @@ def parse_date(filename, text):
             # Skip all matches that do not parse to a proper date
             continue
 
-        if date and date.year > 1900 and date <= timezone.now():
+        date = __filter(date)
+        if date is not None:
             break
-        else:
-            date = None
 
     return date
 
diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py
index 357b0937e..9cbb19c2b 100644
--- a/src/documents/tests/test_date_parsing.py
+++ b/src/documents/tests/test_date_parsing.py
@@ -138,3 +138,18 @@ class TestDate(TestCase):
     @override_settings(FILENAME_DATE_ORDER="YMD")
     def test_filename_date_parse_invalid(self, *args):
         self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))
+
+    @override_settings(IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)))
+    def test_ignored_dates(self, *args):
+        text = (
+            "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem "
+            "ipsum"
+        )
+        date = parse_date("", text)
+        self.assertEqual(
+            date,
+            datetime.datetime(
+                2018, 2, 13, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
+        )
\ No newline at end of file
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index e8b44e8cd..5191803d0 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -4,6 +4,7 @@ import multiprocessing
 import os
 import re
 
+import dateparser
 from dotenv import load_dotenv
 
 from django.utils.translation import gettext_lazy as _
@@ -444,3 +445,10 @@ PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost
 PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
     "PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000"
 )
+
+# Dates to ignore when parsing a document's date (from filename and content)
+IGNORE_DATES = set()
+for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","):
+    d = dateparser.parse(s)
+    if d:
+        IGNORE_DATES.add(d.date())