mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Add option to ignore certain dates in parse_date
PAPERLESS_IGNORE_DATES allows to specify a comma separated list of dates to ignore during date parsing (from filename and content). This can be used so specify dates that do appear often in documents but are usually not the documents creation date (like your date of birth).
This commit is contained in:
parent
7bca5bf40e
commit
654ee4e62e
@ -209,6 +209,13 @@ def parse_date(filename, text):
|
||||
}
|
||||
)
|
||||
|
||||
def __filter(date):
|
||||
if date and date.year > 1900 and \
|
||||
date <= timezone.now() and \
|
||||
date.date() not in settings.IGNORE_DATES:
|
||||
return date
|
||||
return None
|
||||
|
||||
date = None
|
||||
|
||||
# if filename date parsing is enabled, search there first:
|
||||
@ -222,7 +229,8 @@ def parse_date(filename, text):
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date and date.year > 1900 and date <= timezone.now():
|
||||
date = __filter(date)
|
||||
if date is not None:
|
||||
return date
|
||||
|
||||
# Iterate through all regex matches in text and try to parse the date
|
||||
@ -235,10 +243,9 @@ def parse_date(filename, text):
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date and date.year > 1900 and date <= timezone.now():
|
||||
date = __filter(date)
|
||||
if date is not None:
|
||||
break
|
||||
else:
|
||||
date = None
|
||||
|
||||
return date
|
||||
|
||||
|
@ -138,3 +138,18 @@ class TestDate(TestCase):
|
||||
@override_settings(FILENAME_DATE_ORDER="YMD")
|
||||
def test_filename_date_parse_invalid(self, *args):
|
||||
self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))
|
||||
|
||||
@override_settings(IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)))
|
||||
def test_ignored_dates(self, *args):
|
||||
text = (
|
||||
"lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem "
|
||||
"ipsum"
|
||||
)
|
||||
date = parse_date("", text)
|
||||
self.assertEqual(
|
||||
date,
|
||||
datetime.datetime(
|
||||
2018, 2, 13, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
@ -4,6 +4,7 @@ import multiprocessing
|
||||
import os
|
||||
import re
|
||||
|
||||
import dateparser
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
@ -444,3 +445,10 @@ PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost
|
||||
PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
|
||||
"PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000"
|
||||
)
|
||||
|
||||
# List dates that should be ignored when trying to parse date from document text
|
||||
IGNORE_DATES = set()
|
||||
for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","):
|
||||
d = dateparser.parse(s)
|
||||
if d:
|
||||
IGNORE_DATES.add(d.date())
|
||||
|
Loading…
x
Reference in New Issue
Block a user