mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #251 from jayme-github/ignore-date
Add option to ignore certain dates in parse_date
This commit is contained in:
commit
be94a8e49a
@ -441,6 +441,19 @@ PAPERLESS_THUMBNAIL_FONT_NAME=<filename>
|
|||||||
|
|
||||||
Defaults to ``/usr/share/fonts/liberation/LiberationSerif-Regular.ttf``.
|
Defaults to ``/usr/share/fonts/liberation/LiberationSerif-Regular.ttf``.
|
||||||
|
|
||||||
|
PAPERLESS_IGNORE_DATES=<string>
|
||||||
|
Paperless parses a document's creation date from the filename and file content.
|
||||||
|
You may specify a comma-separated list of dates that should be ignored during
|
||||||
|
this process. This is useful for special dates (like date of birth) that appear
|
||||||
|
in documents regularly but are very unlikely to be the document's creation date.
|
||||||
|
|
||||||
|
You may specify dates in a multitude of formats supported by dateparser (see
|
||||||
|
https://dateparser.readthedocs.io/en/latest/#popular-formats) but as the dates
|
||||||
|
need to be comma-separated, formats that themselves contain a comma cannot be used.
|
||||||
|
Example: "2020-12-02,22.04.1999"
|
||||||
|
|
||||||
|
Defaults to an empty string to not ignore any dates.
|
||||||
|
|
||||||
|
|
||||||
Binaries
|
Binaries
|
||||||
########
|
########
|
||||||
|
@ -50,11 +50,14 @@
|
|||||||
#PAPERLESS_TIME_ZONE=UTC
|
#PAPERLESS_TIME_ZONE=UTC
|
||||||
#PAPERLESS_CONSUMER_POLLING=10
|
#PAPERLESS_CONSUMER_POLLING=10
|
||||||
#PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
|
#PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
|
||||||
|
#PAPERLESS_CONSUMER_RECURSIVE=false
|
||||||
|
#PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false
|
||||||
#PAPERLESS_OPTIMIZE_THUMBNAILS=true
|
#PAPERLESS_OPTIMIZE_THUMBNAILS=true
|
||||||
#PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
|
#PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
|
||||||
#PAPERLESS_FILENAME_DATE_ORDER=YMD
|
#PAPERLESS_FILENAME_DATE_ORDER=YMD
|
||||||
#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[]
|
#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[]
|
||||||
#PAPERLESS_THUMBNAIL_FONT_NAME=
|
#PAPERLESS_THUMBNAIL_FONT_NAME=
|
||||||
|
#PAPERLESS_IGNORE_DATES=
|
||||||
|
|
||||||
# Tika settings
|
# Tika settings
|
||||||
|
|
||||||
|
@ -210,6 +210,13 @@ def parse_date(filename, text):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def __filter(date):
|
||||||
|
if date and date.year > 1900 and \
|
||||||
|
date <= timezone.now() and \
|
||||||
|
date.date() not in settings.IGNORE_DATES:
|
||||||
|
return date
|
||||||
|
return None
|
||||||
|
|
||||||
date = None
|
date = None
|
||||||
|
|
||||||
# if filename date parsing is enabled, search there first:
|
# if filename date parsing is enabled, search there first:
|
||||||
@ -223,7 +230,8 @@ def parse_date(filename, text):
|
|||||||
# Skip all matches that do not parse to a proper date
|
# Skip all matches that do not parse to a proper date
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if date and date.year > 1900 and date <= timezone.now():
|
date = __filter(date)
|
||||||
|
if date is not None:
|
||||||
return date
|
return date
|
||||||
|
|
||||||
# Iterate through all regex matches in text and try to parse the date
|
# Iterate through all regex matches in text and try to parse the date
|
||||||
@ -236,10 +244,9 @@ def parse_date(filename, text):
|
|||||||
# Skip all matches that do not parse to a proper date
|
# Skip all matches that do not parse to a proper date
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if date and date.year > 1900 and date <= timezone.now():
|
date = __filter(date)
|
||||||
|
if date is not None:
|
||||||
break
|
break
|
||||||
else:
|
|
||||||
date = None
|
|
||||||
|
|
||||||
return date
|
return date
|
||||||
|
|
||||||
|
@ -138,3 +138,18 @@ class TestDate(TestCase):
|
|||||||
@override_settings(FILENAME_DATE_ORDER="YMD")
|
@override_settings(FILENAME_DATE_ORDER="YMD")
|
||||||
def test_filename_date_parse_invalid(self, *args):
|
def test_filename_date_parse_invalid(self, *args):
|
||||||
self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))
|
self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))
|
||||||
|
|
||||||
|
@override_settings(IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)))
|
||||||
|
def test_ignored_dates(self, *args):
|
||||||
|
text = (
|
||||||
|
"lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem "
|
||||||
|
"ipsum"
|
||||||
|
)
|
||||||
|
date = parse_date("", text)
|
||||||
|
self.assertEqual(
|
||||||
|
date,
|
||||||
|
datetime.datetime(
|
||||||
|
2018, 2, 13, 0, 0,
|
||||||
|
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||||
|
)
|
||||||
|
)
|
@ -4,6 +4,7 @@ import multiprocessing
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
import dateparser
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
from django.utils.translation import gettext_lazy as _
|
from django.utils.translation import gettext_lazy as _
|
||||||
@ -446,3 +447,10 @@ PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost
|
|||||||
PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
|
PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
|
||||||
"PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000"
|
"PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Dates that should be ignored when parsing a date from the filename or document text
|
||||||
|
IGNORE_DATES = set()
|
||||||
|
for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","):
|
||||||
|
d = dateparser.parse(s)
|
||||||
|
if d:
|
||||||
|
IGNORE_DATES.add(d.date())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user