diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 8335d433e..b910778bd 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -247,10 +247,8 @@ def parse_date(filename, text) -> Optional[datetime.datetime]: # if filename date parsing is enabled, search there first: if settings.FILENAME_DATE_ORDER: - logger.info("Attempting parsing from filename") for m in re.finditer(DATE_REGEX, filename): date_string = m.group(0) - logger.info(f"Found potential date: {date_string}") try: date = __parser(date_string, settings.FILENAME_DATE_ORDER) @@ -260,16 +258,11 @@ def parse_date(filename, text) -> Optional[datetime.datetime]: date = __filter(date) if date is not None: - logger.info(f"Found date: {date}") return date - else: - logger.info("Filtered date out") - logger.info("Attempting parsing from content") # Iterate through all regex matches in text and try to parse the date for m in re.finditer(DATE_REGEX, text): date_string = m.group(0) - logger.info(f"Found potential date: {date_string}") try: date = __parser(date_string, settings.DATE_ORDER) @@ -279,10 +272,7 @@ def parse_date(filename, text) -> Optional[datetime.datetime]: date = __filter(date) if date is not None: - logger.info(f"Found date: {date}") return date - else: - logger.info("Filtered date out") return date diff --git a/src/paperless/settings.py b/src/paperless/settings.py index c58a45945..b5be6c420 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -605,21 +605,40 @@ PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv( if PAPERLESS_TIKA_ENABLED: INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig") -# List dates that should be ignored when trying to parse date from document text -IGNORE_DATES: Set[datetime.date] = set() +def _parse_ignore_dates( + env_ignore: str, + date_order: str = DATE_ORDER, +) -> Set[datetime.datetime]: + """ + If the PAPERLESS_IGNORE_DATES environment variable is set, parse the + user provided string(s) into dates -def 
_parse_ignore_dates(env_ignore: str) -> Set[datetime.datetime]: + Args: + env_ignore (str): The value of the environment variable, comma separated dates + date_order (str, optional): The format of the date strings. Defaults to DATE_ORDER. + + Returns: + Set[datetime.datetime]: The set of parsed date objects + """ import dateparser ignored_dates = set() for s in env_ignore.split(","): - d = dateparser.parse(s) + d = dateparser.parse( + s, + settings={ + "DATE_ORDER": date_order, + }, + ) if d: ignored_dates.add(d.date()) return ignored_dates +# List dates that should be ignored when trying to parse date from document text +IGNORE_DATES: Set[datetime.date] = set() + if os.getenv("PAPERLESS_IGNORE_DATES") is not None: IGNORE_DATES = _parse_ignore_dates(os.getenv("PAPERLESS_IGNORE_DATES")) diff --git a/src/paperless/tests/test_settings.py b/src/paperless/tests/test_settings.py index cedcb0509..57481df5b 100644 --- a/src/paperless/tests/test_settings.py +++ b/src/paperless/tests/test_settings.py @@ -9,6 +9,20 @@ class TestIgnoreDateParsing(TestCase): Tests the parsing of the PAPERLESS_IGNORE_DATES setting value """ + def _parse_checker(self, test_cases): + """ + Helper function to check ignore date parsing + + Args: + test_cases (Iterable[tuple]): (env_str, date_format, expected_date_set) triples to check + """ + for env_str, date_format, expected_date_set in test_cases: + + self.assertSetEqual( + _parse_ignore_dates(env_str, date_format), + expected_date_set, + ) + def test_no_ignore_dates_set(self): """ GIVEN: @@ -26,20 +40,19 @@ class TestIgnoreDateParsing(TestCase): - All ignore dates are parsed """ test_cases = [ - ("1985-05-01", [datetime.date(1985, 5, 1)]), + ("1985-05-01", "YMD", {datetime.date(1985, 5, 1)}), ( "1985-05-01,1991-12-05", - [datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)], + "YMD", + {datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)}, + ), + ("2010-12-13", "YMD", {datetime.date(2010, 12, 13)}), + ("11.01.10", "DMY", {datetime.date(2010, 1, 11)}), + ( + "11.01.2001,15-06-1996", + "DMY", + 
{datetime.date(2001, 1, 11), datetime.date(1996, 6, 15)}, ), - ("2010-12-13", [datetime.date(2010, 12, 13)]), ] - for env_str, expected_dates in test_cases: - expected_date_set = set() - for expected_date in expected_dates: - expected_date_set.add(expected_date) - - self.assertSetEqual( - _parse_ignore_dates(env_str), - expected_date_set, - ) + self._parse_checker(test_cases)