mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #251 from jayme-github/ignore-date
Add option to ignore certain dates in parse_date
This commit is contained in:
commit
be94a8e49a
@ -441,6 +441,19 @@ PAPERLESS_THUMBNAIL_FONT_NAME=<filename>
|
||||
|
||||
Defaults to ``/usr/share/fonts/liberation/LiberationSerif-Regular.ttf``.
|
||||
|
||||
PAPERLESS_IGNORE_DATES=<string>
|
||||
Paperless parses a documents creation date from filename and file content.
|
||||
You may specify a comma separated list of dates that should be ignored during
|
||||
this process. This is useful for special dates (like date of birth) that appear
|
||||
in documents regularly but are very unlikely to be the documents creation date.
|
||||
|
||||
You may specify dates in a multitude of formats supported by dateparser (see
|
||||
https://dateparser.readthedocs.io/en/latest/#popular-formats) but as the dates
|
||||
need to be comma separated, the options are limited.
|
||||
Example: "2020-12-02,22.04.1999"
|
||||
|
||||
Defaults to an empty string to not ignore any dates.
|
||||
|
||||
|
||||
Binaries
|
||||
########
|
||||
|
@ -50,11 +50,14 @@
|
||||
#PAPERLESS_TIME_ZONE=UTC
|
||||
#PAPERLESS_CONSUMER_POLLING=10
|
||||
#PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
|
||||
#PAPERLESS_CONSUMER_RECURSIVE=false
|
||||
#PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false
|
||||
#PAPERLESS_OPTIMIZE_THUMBNAILS=true
|
||||
#PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
|
||||
#PAPERLESS_FILENAME_DATE_ORDER=YMD
|
||||
#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[]
|
||||
#PAPERLESS_THUMBNAIL_FONT_NAME=
|
||||
#PAPERLESS_IGNORE_DATES=
|
||||
|
||||
# Tika settings
|
||||
|
||||
|
@ -210,6 +210,13 @@ def parse_date(filename, text):
|
||||
}
|
||||
)
|
||||
|
||||
def __filter(date):
|
||||
if date and date.year > 1900 and \
|
||||
date <= timezone.now() and \
|
||||
date.date() not in settings.IGNORE_DATES:
|
||||
return date
|
||||
return None
|
||||
|
||||
date = None
|
||||
|
||||
# if filename date parsing is enabled, search there first:
|
||||
@ -223,7 +230,8 @@ def parse_date(filename, text):
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date and date.year > 1900 and date <= timezone.now():
|
||||
date = __filter(date)
|
||||
if date is not None:
|
||||
return date
|
||||
|
||||
# Iterate through all regex matches in text and try to parse the date
|
||||
@ -236,10 +244,9 @@ def parse_date(filename, text):
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date and date.year > 1900 and date <= timezone.now():
|
||||
date = __filter(date)
|
||||
if date is not None:
|
||||
break
|
||||
else:
|
||||
date = None
|
||||
|
||||
return date
|
||||
|
||||
|
@ -138,3 +138,18 @@ class TestDate(TestCase):
|
||||
@override_settings(FILENAME_DATE_ORDER="YMD")
|
||||
def test_filename_date_parse_invalid(self, *args):
|
||||
self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))
|
||||
|
||||
@override_settings(IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)))
|
||||
def test_ignored_dates(self, *args):
|
||||
text = (
|
||||
"lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem "
|
||||
"ipsum"
|
||||
)
|
||||
date = parse_date("", text)
|
||||
self.assertEqual(
|
||||
date,
|
||||
datetime.datetime(
|
||||
2018, 2, 13, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
@ -4,6 +4,7 @@ import multiprocessing
|
||||
import os
|
||||
import re
|
||||
|
||||
import dateparser
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
@ -446,3 +447,10 @@ PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost
|
||||
PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
|
||||
"PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000"
|
||||
)
|
||||
|
||||
# List dates that should be ignored when trying to parse date from document text
|
||||
IGNORE_DATES = set()
|
||||
for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","):
|
||||
d = dateparser.parse(s)
|
||||
if d:
|
||||
IGNORE_DATES.add(d.date())
|
||||
|
Loading…
x
Reference in New Issue
Block a user