mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Updates the ignore date parsing to use the date order defined in the settings, instead of guessing at it
This commit is contained in:
parent
8a6aaf4e2d
commit
5b96944940
@ -247,10 +247,8 @@ def parse_date(filename, text) -> Optional[datetime.datetime]:
|
|||||||
|
|
||||||
# if filename date parsing is enabled, search there first:
|
# if filename date parsing is enabled, search there first:
|
||||||
if settings.FILENAME_DATE_ORDER:
|
if settings.FILENAME_DATE_ORDER:
|
||||||
logger.info("Attempting parsing from filename")
|
|
||||||
for m in re.finditer(DATE_REGEX, filename):
|
for m in re.finditer(DATE_REGEX, filename):
|
||||||
date_string = m.group(0)
|
date_string = m.group(0)
|
||||||
logger.info(f"Found potential date: {date_string}")
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
date = __parser(date_string, settings.FILENAME_DATE_ORDER)
|
date = __parser(date_string, settings.FILENAME_DATE_ORDER)
|
||||||
@ -260,16 +258,11 @@ def parse_date(filename, text) -> Optional[datetime.datetime]:
|
|||||||
|
|
||||||
date = __filter(date)
|
date = __filter(date)
|
||||||
if date is not None:
|
if date is not None:
|
||||||
logger.info(f"Found date: {date}")
|
|
||||||
return date
|
return date
|
||||||
else:
|
|
||||||
logger.info("Filtered date out")
|
|
||||||
|
|
||||||
logger.info("Attempting parsing from content")
|
|
||||||
# Iterate through all regex matches in text and try to parse the date
|
# Iterate through all regex matches in text and try to parse the date
|
||||||
for m in re.finditer(DATE_REGEX, text):
|
for m in re.finditer(DATE_REGEX, text):
|
||||||
date_string = m.group(0)
|
date_string = m.group(0)
|
||||||
logger.info(f"Found potential date: {date_string}")
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
date = __parser(date_string, settings.DATE_ORDER)
|
date = __parser(date_string, settings.DATE_ORDER)
|
||||||
@ -279,10 +272,7 @@ def parse_date(filename, text) -> Optional[datetime.datetime]:
|
|||||||
|
|
||||||
date = __filter(date)
|
date = __filter(date)
|
||||||
if date is not None:
|
if date is not None:
|
||||||
logger.info(f"Found date: {date}")
|
|
||||||
return date
|
return date
|
||||||
else:
|
|
||||||
logger.info("Filtered date out")
|
|
||||||
|
|
||||||
return date
|
return date
|
||||||
|
|
||||||
|
@ -605,21 +605,40 @@ PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
|
|||||||
if PAPERLESS_TIKA_ENABLED:
|
if PAPERLESS_TIKA_ENABLED:
|
||||||
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
|
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
|
||||||
|
|
||||||
# List dates that should be ignored when trying to parse date from document text
|
|
||||||
IGNORE_DATES: Set[datetime.date] = set()
|
|
||||||
|
|
||||||
|
def _parse_ignore_dates(
|
||||||
|
env_ignore: str,
|
||||||
|
date_order: str = DATE_ORDER,
|
||||||
|
) -> Set[datetime.datetime]:
|
||||||
|
"""
|
||||||
|
If the PAPERLESS_IGNORE_DATES environment variable is set, parse the
|
||||||
|
user provided string(s) into dates
|
||||||
|
|
||||||
def _parse_ignore_dates(env_ignore: str) -> Set[datetime.datetime]:
|
Args:
|
||||||
|
env_ignore (str): The value of the environment variable, comma separated dates
|
||||||
|
date_order (str, optional): The format of the date strings. Defaults to DATE_ORDER.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Set[datetime.date]: The set of parsed date objects
|
||||||
|
"""
|
||||||
import dateparser
|
import dateparser
|
||||||
|
|
||||||
ignored_dates = set()
|
ignored_dates = set()
|
||||||
for s in env_ignore.split(","):
|
for s in env_ignore.split(","):
|
||||||
d = dateparser.parse(s)
|
d = dateparser.parse(
|
||||||
|
s,
|
||||||
|
settings={
|
||||||
|
"DATE_ORDER": date_order,
|
||||||
|
},
|
||||||
|
)
|
||||||
if d:
|
if d:
|
||||||
ignored_dates.add(d.date())
|
ignored_dates.add(d.date())
|
||||||
return ignored_dates
|
return ignored_dates
|
||||||
|
|
||||||
|
|
||||||
|
# List dates that should be ignored when trying to parse date from document text
|
||||||
|
IGNORE_DATES: Set[datetime.date] = set()
|
||||||
|
|
||||||
if os.getenv("PAPERLESS_IGNORE_DATES") is not None:
|
if os.getenv("PAPERLESS_IGNORE_DATES") is not None:
|
||||||
IGNORE_DATES = _parse_ignore_dates(os.getenv("PAPERLESS_IGNORE_DATES"))
|
IGNORE_DATES = _parse_ignore_dates(os.getenv("PAPERLESS_IGNORE_DATES"))
|
||||||
|
|
||||||
|
@ -9,6 +9,20 @@ class TestIgnoreDateParsing(TestCase):
|
|||||||
Tests the parsing of the PAPERLESS_IGNORE_DATES setting value
|
Tests the parsing of the PAPERLESS_IGNORE_DATES setting value
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def _parse_checker(self, test_cases):
|
||||||
|
"""
|
||||||
|
Helper function to check ignore date parsing
|
||||||
|
|
||||||
|
Args:
|
||||||
|
test_cases (list of tuples): Each entry is (environment variable string, date order, expected set of parsed dates)
|
||||||
|
"""
|
||||||
|
for env_str, date_format, expected_date_set in test_cases:
|
||||||
|
|
||||||
|
self.assertSetEqual(
|
||||||
|
_parse_ignore_dates(env_str, date_format),
|
||||||
|
expected_date_set,
|
||||||
|
)
|
||||||
|
|
||||||
def test_no_ignore_dates_set(self):
|
def test_no_ignore_dates_set(self):
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
@ -26,20 +40,19 @@ class TestIgnoreDateParsing(TestCase):
|
|||||||
- All ignore dates are parsed
|
- All ignore dates are parsed
|
||||||
"""
|
"""
|
||||||
test_cases = [
|
test_cases = [
|
||||||
("1985-05-01", [datetime.date(1985, 5, 1)]),
|
("1985-05-01", "YMD", {datetime.date(1985, 5, 1)}),
|
||||||
(
|
(
|
||||||
"1985-05-01,1991-12-05",
|
"1985-05-01,1991-12-05",
|
||||||
[datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)],
|
"YMD",
|
||||||
|
{datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)},
|
||||||
|
),
|
||||||
|
("2010-12-13", "YMD", {datetime.date(2010, 12, 13)}),
|
||||||
|
("11.01.10", "DMY", {datetime.date(2010, 1, 11)}),
|
||||||
|
(
|
||||||
|
"11.01.2001,15-06-1996",
|
||||||
|
"DMY",
|
||||||
|
{datetime.date(2001, 1, 11), datetime.date(1996, 6, 15)},
|
||||||
),
|
),
|
||||||
("2010-12-13", [datetime.date(2010, 12, 13)]),
|
|
||||||
]
|
]
|
||||||
for env_str, expected_dates in test_cases:
|
|
||||||
expected_date_set = set()
|
|
||||||
|
|
||||||
for expected_date in expected_dates:
|
self._parse_checker(test_cases)
|
||||||
expected_date_set.add(expected_date)
|
|
||||||
|
|
||||||
self.assertSetEqual(
|
|
||||||
_parse_ignore_dates(env_str),
|
|
||||||
expected_date_set,
|
|
||||||
)
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user