logger = logging.getLogger(__name__)

DATE_PARSER_ENTRY_POINT_GROUP: Final = "paperless_ngx.date_parsers"


@lru_cache(maxsize=1)
def _discover_parser_class() -> type[DateParserPluginBase]:
    """
    Discover the date parser plugin class to use.

    Every entry point registered in ``DATE_PARSER_ENTRY_POINT_GROUP`` is
    loaded exactly once. Entry points that fail to load, or whose loaded
    object is not a subclass of ``DateParserPluginBase``, are logged and
    skipped.

    - If one or more valid plugins are found, they are sorted by entry point
      name and the first one is returned. A warning is logged when more than
      one valid plugin is present.
    - If no valid plugin is found, the default ``RegexDateParserPlugin`` is
      returned.

    The result is memoized via ``lru_cache``, so discovery only runs again
    after ``_discover_parser_class.cache_clear()``.

    Returns:
        The parser class (not an instance) that should be instantiated.
    """
    eps: tuple[EntryPoint, ...]
    try:
        eps = entry_points(group=DATE_PARSER_ENTRY_POINT_GROUP)
    except Exception as e:
        # Discovery is best-effort: a broken metadata lookup must not stop
        # date parsing, so fall through to the default parser.
        logger.warning("Could not query entry points for date parsers: %s", e)
        eps = tuple()

    # Keep the loaded class next to its entry point name so the class that
    # was validated is the one returned (no second, unguarded ep.load()).
    candidates: list[tuple[str, type[DateParserPluginBase]]] = []
    for ep in eps:
        try:
            plugin_class = ep.load()
        except Exception as e:
            # Third-party code may raise anything on import.
            logger.error("Unable to load date parser plugin %s: %s", ep.name, e)
            continue

        # issubclass() raises TypeError for non-class objects, so check that
        # first and report such objects as "not a subclass".
        if isinstance(plugin_class, type) and issubclass(
            plugin_class,
            DateParserPluginBase,
        ):
            candidates.append((ep.name, plugin_class))
        else:
            logger.warning("Plugin %s does not subclass DateParser.", ep.name)

    if not candidates:
        return RegexDateParserPlugin

    # Sort on the name only; the sort is stable, so equal names keep their
    # discovery order and classes are never compared with each other.
    candidates.sort(key=lambda candidate: candidate[0])
    chosen_name, chosen_class = candidates[0]

    if len(candidates) > 1:
        logger.warning(
            "Multiple date parsers found: %s. Using the first one by name: '%s'.",
            [name for name, _ in candidates],
            chosen_name,
        )

    return chosen_class


def get_date_parser() -> DateParserPluginBase:
    """
    Factory function returning an initialized date parser instance.

    Steps:
    1. Discover the parser class (plugin or default); this lookup is cached.
    2. Build a ``DateParserConfig`` from the Django settings, using the
       current time as the reference time.
    3. Instantiate the discovered class with that configuration.

    Returns:
        A new parser instance on every call.
    """
    parser_class = _discover_parser_class()

    # TODO: Get the language from the settings and/or configuration object, depending
    config = DateParserConfig(
        languages=settings.DATE_PARSER_LANGUAGES,
        timezone_str=settings.TIME_ZONE,
        ignore_dates=settings.IGNORE_DATES,
        reference_time=timezone.now(),
        filename_date_order=settings.FILENAME_DATE_ORDER,
        content_date_order=settings.DATE_ORDER,
    )

    return parser_class(config=config)
+ reference_time: datetime.datetime + + # Settings for the default RegexDateParser + filename_date_order: str | None + content_date_order: str + + +class DateParserPluginBase(ABC): + """ + Abstract base class for date parsing strategies. + + Instances are configured via a DateParserConfig object. + """ + + def __init__(self, config: DateParserConfig): + """ + Initializes the parser with its configuration. + """ + self.config = config + + def _parse_string( + self, + date_string: str, + date_order: str, + ) -> datetime.datetime | None: + """ + Helper method to parse a single date string using dateparser. + + Uses configuration from `self.config`. + """ + try: + return dateparser.parse( + date_string, + settings={ + "DATE_ORDER": date_order, + "PREFER_DAY_OF_MONTH": "first", + "RETURN_AS_TIMEZONE_AWARE": True, + "TIMEZONE": self.config.timezone_str, + }, + locales=self.config.languages, + ) + except Exception as e: + logger.error(f"Error while parsing date string '{date_string}': {e}") + return None + + def _filter_date( + self, + date: datetime.datetime | None, + ) -> datetime.datetime | None: + """ + Helper method to validate a parsed datetime object. + + Uses configuration from `self.config`. + """ + if ( + date is not None + and date.year > 1900 + and date <= self.config.reference_time + and date.date() not in self.config.ignore_dates + ): + return date + return None + + @abstractmethod + def parse(self, filename: Path, content: str) -> Iterator[datetime.datetime]: + """ + Parses a document's filename and content, yielding valid datetime objects. 
class RegexDateParserPlugin(DateParserPluginBase):
    """
    Default date parser: finds date-like substrings with one large regular
    expression and hands each of them to the base class helpers.

    All behaviour is driven by the DateParserConfig given to the constructor.
    """

    # One alternative per supported textual date layout.
    # NOTE(review): the leading "(?!=([_-]))" is a negative lookahead for a
    # literal "=" followed by "_" or "-". It looks like a lookbehind may have
    # been intended; confirm before changing, since it affects what matches.
    DATE_REGEX = re.compile(
        r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
        re.IGNORECASE,
    )

    def _process_match(
        self,
        match: Match[str],
        date_order: str,
    ) -> datetime.datetime | None:
        """
        Parse the full text of one regex match and validate the result.

        Returns ``None`` when the text cannot be parsed or the parsed date is
        filtered out.
        """
        parsed = self._parse_string(match.group(0), date_order)
        return self._filter_date(parsed)

    def _process_content(
        self,
        content: str,
        date_order: str,
    ) -> Iterator[datetime.datetime]:
        """
        Yield every valid date found in ``content``, in order of appearance.
        """
        for candidate in self.DATE_REGEX.finditer(content):
            if (found := self._process_match(candidate, date_order)) is not None:
                yield found

    def parse(self, filename: Path, content: str) -> Iterator[datetime.datetime]:
        """
        Yield dates from the filename first (if enabled), then from the content.

        The filename is only inspected when ``config.filename_date_order`` is
        truthy; the content always uses ``config.content_date_order``.
        """
        filename_order = self.config.filename_date_order
        if filename_order:
            yield from self._process_content(filename.name, filename_order)

        yield from self._process_content(content, self.config.content_date_order)
def _reference_time() -> datetime.datetime:
    """Fixed "now" shared by the config fixtures: 2024-01-15 12:00:00 UTC."""
    return datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc)


@pytest.fixture
def base_config() -> DateParserConfig:
    """English-only UTC configuration with no ignored dates."""
    return DateParserConfig(
        languages=["en"],
        timezone_str="UTC",
        ignore_dates=set(),
        reference_time=_reference_time(),
        filename_date_order="YMD",
        content_date_order="DMY",
    )


@pytest.fixture
def config_with_ignore_dates() -> DateParserConfig:
    """Configuration that ignores 2024-01-01 and 2024-12-25."""
    ignored = {datetime.date(2024, 1, 1), datetime.date(2024, 12, 25)}
    return DateParserConfig(
        languages=["en", "de"],
        timezone_str="America/New_York",
        ignore_dates=ignored,
        reference_time=_reference_time(),
        filename_date_order="DMY",
        content_date_order="MDY",
    )


@pytest.fixture
def regex_parser(base_config: DateParserConfig) -> RegexDateParserPlugin:
    """A RegexDateParserPlugin built from ``base_config``."""
    parser = RegexDateParserPlugin(base_config)
    return parser


@pytest.fixture
def clear_lru_cache() -> Generator[None, None, None]:
    """
    Reset the cache of _discover_parser_class both before and after a test,
    so cached discovery results never leak between tests.
    """
    reset = _discover_parser_class.cache_clear
    reset()
    yield
    reset()


@pytest.fixture
def mock_date_parser_settings(settings: pytest_django.fixtures.SettingsWrapper) -> Any:
    """
    Override the Django settings read by the date parser factory and return
    the settings wrapper.
    """
    overrides = {
        "DATE_PARSER_LANGUAGES": ["en", "de"],
        "TIME_ZONE": "UTC",
        "IGNORE_DATES": [datetime.date(1900, 1, 1)],
        "FILENAME_DATE_ORDER": "YMD",
        "DATE_ORDER": "DMY",
    }
    for setting_name, value in overrides.items():
        setattr(settings, setting_name, value)
    return settings
class AlphaParser(DateParserPluginBase):
    """Minimal concrete parser used as a stand-in plugin class."""

    def parse(self, filename: Path, content: str) -> Iterator[datetime.datetime]:
        yield timezone.now()


class BetaParser(DateParserPluginBase):
    """Second stand-in plugin class, distinct from AlphaParser."""

    def parse(self, filename: Path, content: str) -> Iterator[datetime.datetime]:
        yield timezone.now()


def _fake_entry_point(
    mocker: pytest_mock.MockerFixture,
    name: str,
    *,
    loads: object = None,
    raises: BaseException | None = None,
):
    """
    Build a MagicMock shaped like an EntryPoint.

    ``load()`` raises ``raises`` when given, otherwise it returns ``loads``.
    """
    ep = mocker.MagicMock(spec=EntryPoint)
    ep.name = name
    if raises is not None:
        ep.load.side_effect = raises
    else:
        ep.load.return_value = loads
    return ep


@pytest.mark.date_parsing
@pytest.mark.usefixtures("clear_lru_cache")
class TestDiscoverParserClass:
    """Tests for the _discover_parser_class() function."""

    # Patch target: the entry_points name as imported by the plugin package.
    ENTRY_POINTS = "documents.plugins.date_parsing.entry_points"
    LOGGER_NAME = "documents.plugins.date_parsing"

    def test_returns_default_when_no_plugins_found(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        mocker.patch(self.ENTRY_POINTS, return_value=tuple())

        assert _discover_parser_class() is RegexDateParserPlugin

    def test_returns_default_when_entrypoint_query_fails(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        mocker.patch(self.ENTRY_POINTS, side_effect=RuntimeError("boom"))

        discovered = _discover_parser_class()

        assert discovered is RegexDateParserPlugin
        assert "Could not query entry points" in caplog.text

    def test_filters_out_invalid_plugins(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        # ``object`` is a class, but not a DateParserPluginBase subclass
        bad = _fake_entry_point(mocker, "bad_plugin", loads=object)
        mocker.patch(self.ENTRY_POINTS, return_value=(bad,))

        discovered = _discover_parser_class()

        assert discovered is RegexDateParserPlugin
        assert "does not subclass DateParser" in caplog.text

    def test_skips_plugins_that_fail_to_load(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        failing = _fake_entry_point(
            mocker,
            "failing_plugin",
            raises=ImportError("cannot import"),
        )
        mocker.patch(self.ENTRY_POINTS, return_value=(failing,))

        discovered = _discover_parser_class()

        assert discovered is RegexDateParserPlugin
        assert "Unable to load date parser plugin failing_plugin" in caplog.text

    def test_returns_single_valid_plugin_without_warning(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """A lone valid plugin is returned and no "multiple parsers" warning is logged."""

        class AlphaPlugin(DateParserPluginBase):
            def parse(
                self,
                filename: Path,
                content: str,
            ) -> Iterator[datetime.datetime]:
                yield timezone.now()

        only = _fake_entry_point(mocker, "alpha", loads=AlphaPlugin)
        patched = mocker.patch(self.ENTRY_POINTS, return_value=(only,))

        with caplog.at_level(logging.WARNING, logger=self.LOGGER_NAME):
            discovered = _discover_parser_class()

        # The lookup must be restricted to the date parser group
        patched.assert_called_once_with(group=DATE_PARSER_ENTRY_POINT_GROUP)

        # Exactly the class behind the entry point comes back
        assert discovered is AlphaPlugin

        # And nothing complained about competing plugins
        assert all(
            "Multiple date parsers found" not in record.message
            for record in caplog.records
        ), "Unexpected warning logged when only one plugin was found"

    def test_returns_first_valid_plugin_by_name(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        alpha = _fake_entry_point(mocker, "alpha", loads=AlphaParser)
        beta = _fake_entry_point(mocker, "beta", loads=BetaParser)
        # Deliberately supplied in reverse alphabetical order
        mocker.patch(self.ENTRY_POINTS, return_value=(beta, alpha))

        assert _discover_parser_class() is AlphaParser

    def test_logs_warning_if_multiple_plugins_found(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        first = _fake_entry_point(mocker, "a", loads=AlphaParser)
        second = _fake_entry_point(mocker, "b", loads=BetaParser)
        mocker.patch(self.ENTRY_POINTS, return_value=(first, second))

        with caplog.at_level(logging.WARNING, logger=self.LOGGER_NAME):
            discovered = _discover_parser_class()

        # Alphabetically first plugin ("a") wins
        assert discovered is AlphaParser

        # A warning must mention that several parsers were found
        messages = [record.message for record in caplog.records]
        assert any(
            "Multiple date parsers found" in message for message in messages
        ), "Expected a warning about multiple date parsers"

    def test_cache_behavior_only_runs_once(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        patched = mocker.patch(self.ENTRY_POINTS, return_value=tuple())

        # The first call fills the cache, the second must be served from it
        for _ in range(2):
            _discover_parser_class()

        patched.assert_called_once()
@pytest.mark.date_parsing
@pytest.mark.usefixtures("mock_date_parser_settings")
class TestGetDateParser:
    """Tests for the get_date_parser() factory function."""

    # Patch target: the discovery function as looked up by get_date_parser.
    DISCOVER = "documents.plugins.date_parsing._discover_parser_class"

    def test_returns_instance_of_discovered_class(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        mocker.patch(self.DISCOVER, return_value=AlphaParser)

        parser = get_date_parser()
        config = parser.config

        assert isinstance(parser, AlphaParser)
        assert isinstance(config, DateParserConfig)

        # Every field must mirror the overridden Django settings
        assert config.languages == ["en", "de"]
        assert config.timezone_str == "UTC"
        assert config.ignore_dates == [datetime.date(1900, 1, 1)]
        assert config.filename_date_order == "YMD"
        assert config.content_date_order == "DMY"

        # reference_time is taken at call time, so it must be close to now
        drift = (config.reference_time - timezone.now()).total_seconds()
        assert abs(drift) < 2

    def test_uses_default_regex_parser_when_no_plugins(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        mocker.patch(self.DISCOVER, return_value=RegexDateParserPlugin)

        assert isinstance(get_date_parser(), RegexDateParserPlugin)
def _utc(*parts: int) -> datetime.datetime:
    """Shorthand for a timezone-aware UTC datetime built from ``parts``."""
    return datetime.datetime(*parts, tzinfo=datetime.timezone.utc)


@pytest.mark.date_parsing
class TestParseString:
    """Tests for DateParser._parse_string method via RegexDateParser."""

    @pytest.mark.parametrize(
        ("date_string", "date_order", "expected_year"),
        [
            pytest.param("15/01/2024", "DMY", 2024, id="dmy_slash"),
            pytest.param("01/15/2024", "MDY", 2024, id="mdy_slash"),
            pytest.param("2024/01/15", "YMD", 2024, id="ymd_slash"),
            pytest.param("January 15, 2024", "DMY", 2024, id="month_name_comma"),
            pytest.param("15 Jan 2024", "DMY", 2024, id="day_abbr_month_year"),
            pytest.param("15.01.2024", "DMY", 2024, id="dmy_dot"),
            pytest.param("2024-01-15", "YMD", 2024, id="ymd_dash"),
        ],
    )
    def test_parse_string_valid_formats(
        self,
        regex_parser: RegexDateParserPlugin,
        date_string: str,
        date_order: str,
        expected_year: int,
    ) -> None:
        """Each supported format parses to a datetime in the expected year."""
        parsed = regex_parser._parse_string(date_string, date_order)

        assert parsed is not None
        assert parsed.year == expected_year

    @pytest.mark.parametrize(
        "invalid_string",
        [
            pytest.param("not a date", id="plain_text"),
            pytest.param("32/13/2024", id="invalid_day_month"),
            pytest.param("", id="empty_string"),
            pytest.param("abc123xyz", id="alphanumeric_gibberish"),
            pytest.param("99/99/9999", id="out_of_range"),
        ],
    )
    def test_parse_string_invalid_input(
        self,
        regex_parser: RegexDateParserPlugin,
        invalid_string: str,
    ) -> None:
        """Strings that are not dates yield None."""
        assert regex_parser._parse_string(invalid_string, "DMY") is None

    def test_parse_string_handles_exceptions(
        self,
        caplog: pytest.LogCaptureFixture,
        mocker: pytest_mock.MockerFixture,
        regex_parser: RegexDateParserPlugin,
    ) -> None:
        """An exception raised by dateparser is logged and turned into None."""
        with caplog.at_level(
            logging.ERROR,
            logger="documents.plugins.date_parsing.base",
        ):
            # Force dateparser.parse itself to blow up
            mocker.patch(
                "documents.plugins.date_parsing.base.dateparser.parse",
                side_effect=ValueError(
                    "Parsing error: 01/01/2024",
                ),
            )

            outcome = regex_parser._parse_string("01/01/2024", "DMY")

            assert outcome is None

            # Exactly one record, at ERROR level
            (record,) = caplog.records
            assert record.levelname == "ERROR"

            # Both the generic prefix and the original exception text are logged
            assert "Error while parsing date string" in caplog.text
            assert "Parsing error: 01/01/2024" in caplog.text


@pytest.mark.date_parsing
class TestFilterDate:
    """Tests for DateParser._filter_date method via RegexDateParser."""

    @pytest.mark.parametrize(
        ("date", "expected_output"),
        [
            # Accepted dates come back unchanged
            pytest.param(_utc(2024, 1, 10), _utc(2024, 1, 10), id="valid_past_date"),
            pytest.param(
                _utc(2024, 1, 15, 12, 0, 0),
                _utc(2024, 1, 15, 12, 0, 0),
                id="exactly_at_reference",
            ),
            pytest.param(_utc(1901, 1, 1), _utc(1901, 1, 1), id="year_1901_valid"),
            # Later than reference_time
            pytest.param(_utc(2024, 1, 16), None, id="future_date_day_after"),
            # Calendar date listed in ignore_dates
            pytest.param(
                _utc(2024, 1, 1, 0, 0, 0),
                None,
                id="ignored_date_midnight_jan1",
            ),
            pytest.param(
                _utc(2024, 1, 1, 10, 30, 0),
                None,
                id="ignored_date_midday_jan1",
            ),
            pytest.param(
                _utc(2024, 12, 25, 15, 0, 0),
                None,
                id="ignored_date_dec25_future",
            ),
            # Year 1900 or earlier
            pytest.param(_utc(1899, 12, 31), None, id="year_1899"),
            pytest.param(_utc(1900, 1, 1), None, id="year_1900_boundary"),
            # No date at all
            pytest.param(None, None, id="none_input"),
        ],
    )
    def test_filter_date_validation_rules(
        self,
        config_with_ignore_dates: DateParserConfig,
        date: datetime.datetime | None,
        expected_output: datetime.datetime | None,
    ) -> None:
        """Every validation rule accepts or rejects as expected."""
        parser = RegexDateParserPlugin(config_with_ignore_dates)

        assert parser._filter_date(date) == expected_output

    def test_filter_date_respects_ignore_dates(
        self,
        config_with_ignore_dates: DateParserConfig,
    ) -> None:
        """Dates whose calendar day is in ignore_dates are dropped."""
        parser = RegexDateParserPlugin(config_with_ignore_dates)

        for ignored in (_utc(2024, 1, 1, 12, 0), _utc(2024, 12, 25, 15, 30)):
            assert parser._filter_date(ignored) is None

        allowed = _utc(2024, 1, 2, 12, 0)
        assert parser._filter_date(allowed) == allowed

    def test_filter_date_timezone_aware(
        self,
        regex_parser: RegexDateParserPlugin,
    ) -> None:
        """Timezone-aware input stays timezone-aware."""
        kept = regex_parser._filter_date(_utc(2024, 1, 10, 12, 0))

        assert kept is not None
        assert kept.tzinfo is not None
@pytest.mark.date_parsing
@pytest.mark.regex_date_parser
class TestRegexDateParser:
    """
    High-level tests for RegexDateParserPlugin.parse.

    dateparser.parse is replaced by small deterministic fakes, so these tests
    exercise the regex matching, call ordering and filtering only.
    """

    # Patch target: dateparser.parse as referenced by the base module.
    DATEPARSER_PARSE = "documents.plugins.date_parsing.base.dateparser.parse"

    @pytest.mark.parametrize(
        ("filename", "content", "expected"),
        [
            pytest.param(
                "report-2023-12-25.txt",
                "Event recorded on 25/12/2022.",
                [
                    datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc),
                    datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
                ],
                id="filename-y-m-d_and_content-d-m-y",
            ),
            pytest.param(
                "img_2023.01.02.jpg",
                "Taken on 01/02/2023",
                [
                    datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc),
                    datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc),
                ],
                id="ambiguous-dates-respect-orders",
            ),
            pytest.param(
                "notes.txt",
                "bad date 99/99/9999 and 25/12/2022",
                [
                    datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
                ],
                id="parse-exception-skips-bad-and-yields-good",
            ),
        ],
    )
    def test_parse_returns_expected_dates(
        self,
        base_config: DateParserConfig,
        mocker: pytest_mock.MockerFixture,
        filename: str,
        content: str,
        expected: list[datetime.datetime],
    ) -> None:
        """
        parse() yields filename dates first, then content dates, each parsed
        with its own date order; strings whose parsing raises are skipped.
        """
        utc = datetime.timezone.utc
        parser = RegexDateParserPlugin(base_config)

        def fake_parse(date_string: str, settings=None, locales=None):
            date_order = settings.get("DATE_ORDER") if settings else None

            # Filename-style YYYY-MM-DD / YYYY.MM.DD
            if "2023-12-25" in date_string or "2023.12.25" in date_string:
                return datetime.datetime(2023, 12, 25, tzinfo=utc)

            # Content DMY 25/12/2022
            if "25/12/2022" in date_string or "25-12-2022" in date_string:
                return datetime.datetime(2022, 12, 25, tzinfo=utc)

            # Filename YMD 2023.01.02
            if "2023.01.02" in date_string or "2023-01-02" in date_string:
                return datetime.datetime(2023, 1, 2, tzinfo=utc)

            # Ambiguous 01/02/2023: only YMD reads it as January 2nd; DMY and
            # any other order fall back to February 1st
            if "01/02/2023" in date_string:
                if date_order == "YMD":
                    return datetime.datetime(2023, 1, 2, tzinfo=utc)
                return datetime.datetime(2023, 2, 1, tzinfo=utc)

            # Simulate a parse failure for malformed input
            if "99/99/9999" in date_string or "bad date" in date_string:
                raise Exception("parse failed for malformed date")

            return None

        mocker.patch(self.DATEPARSER_PARSE, side_effect=fake_parse)

        results = list(parser.parse(Path(filename), content))

        assert results == expected
        assert all(dt.tzinfo is not None for dt in results)

    def test_parse_filters_future_and_ignored_dates(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """
        parse() must drop:
        - dates after reference_time
        - dates whose .date() is in ignore_dates
        """
        utc = datetime.timezone.utc
        cfg = DateParserConfig(
            languages=["en"],
            timezone_str="UTC",
            ignore_dates={datetime.date(2023, 12, 10)},
            reference_time=datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=utc),
            filename_date_order="YMD",
            content_date_order="DMY",
        )
        parser = RegexDateParserPlugin(cfg)

        def fake_parse(date_string: str, settings=None, locales=None):
            if "10/12/2023" in date_string or "10-12-2023" in date_string:
                # Listed in ignore_dates
                return datetime.datetime(2023, 12, 10, tzinfo=utc)
            if "01/02/2024" in date_string or "01-02-2024" in date_string:
                # After reference_time
                return datetime.datetime(2024, 2, 1, tzinfo=utc)
            if "05/01/2023" in date_string or "05-01-2023" in date_string:
                # The only acceptable date
                return datetime.datetime(2023, 1, 5, tzinfo=utc)
            return None

        mocker.patch(self.DATEPARSER_PARSE, side_effect=fake_parse)

        content = "Ignored: 10/12/2023, Future: 01/02/2024, Keep: 05/01/2023"
        results = list(parser.parse(Path("whatever.txt"), content))

        assert results == [datetime.datetime(2023, 1, 5, tzinfo=utc)]

    def test_parse_handles_no_matches_and_returns_empty_list(
        self,
        base_config: DateParserConfig,
    ) -> None:
        """
        With no date-like substrings in filename or content, parse() yields nothing.
        """
        parser = RegexDateParserPlugin(base_config)

        results = list(
            parser.parse(Path("no-dates.txt"), "this has no dates whatsoever"),
        )

        assert results == []

    def test_parse_skips_filename_when_filename_date_order_none(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """
        When filename_date_order is None the filename must not be parsed at all:
        only dates found in the content may reach dateparser.parse.
        """
        utc = datetime.timezone.utc
        cfg = DateParserConfig(
            languages=["en"],
            timezone_str="UTC",
            ignore_dates=set(),
            reference_time=datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=utc),
            filename_date_order=None,
            content_date_order="DMY",
        )
        parser = RegexDateParserPlugin(cfg)

        def fake_parse(date_string: str, settings=None, locales=None):
            # Distinct results make it visible which source was parsed
            if "25/12/2022" in date_string:
                return datetime.datetime(2022, 12, 25, tzinfo=utc)
            if "2023-12-25" in date_string:
                return datetime.datetime(2023, 12, 25, tzinfo=utc)
            return None

        mock = mocker.patch(self.DATEPARSER_PARSE, side_effect=fake_parse)

        filename = "report-2023-12-25.txt"
        content = "Event recorded on 25/12/2022."

        results = list(parser.parse(Path(filename), content))

        # Only the content date was handed to dateparser -> a single call
        assert mock.call_count == 1

        # First call, first positional argument
        called_date_string = mock.call_args_list[0][0][0]
        assert "25/12/2022" in called_date_string

        # And parse() yielded the datetime belonging to the content date
        assert results == [
            datetime.datetime(2022, 12, 25, tzinfo=utc),
        ]