Merge pull request #251 from jayme-github/ignore-date

Add option to ignore certain dates in parse_date
This commit is contained in:
Jonas Winkler
2021-01-05 00:19:13 +01:00
committed by GitHub
5 changed files with 50 additions and 4 deletions

View File

@@ -210,6 +210,13 @@ def parse_date(filename, text):
}
)
def __filter(date):
if date and date.year > 1900 and \
date <= timezone.now() and \
date.date() not in settings.IGNORE_DATES:
return date
return None
date = None
# if filename date parsing is enabled, search there first:
@@ -223,7 +230,8 @@ def parse_date(filename, text):
# Skip all matches that do not parse to a proper date
continue
if date and date.year > 1900 and date <= timezone.now():
date = __filter(date)
if date is not None:
return date
# Iterate through all regex matches in text and try to parse the date
@@ -236,10 +244,9 @@ def parse_date(filename, text):
# Skip all matches that do not parse to a proper date
continue
if date and date.year > 1900 and date <= timezone.now():
date = __filter(date)
if date is not None:
break
else:
date = None
return date

View File

@@ -138,3 +138,18 @@ class TestDate(TestCase):
@override_settings(FILENAME_DATE_ORDER="YMD")
def test_filename_date_parse_invalid(self, *args):
self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))
@override_settings(IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)))
def test_ignored_dates(self, *args):
text = (
"lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem "
"ipsum"
)
date = parse_date("", text)
self.assertEqual(
date,
datetime.datetime(
2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)

View File

@@ -4,6 +4,7 @@ import multiprocessing
import os
import re
import dateparser
from dotenv import load_dotenv
from django.utils.translation import gettext_lazy as _
@@ -446,3 +447,10 @@ PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost
PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
"PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000"
)
# List dates that should be ignored when trying to parse date from document text
IGNORE_DATES = set()
for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","):
d = dateparser.parse(s)
if d:
IGNORE_DATES.add(d.date())