mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-01-28 22:59:03 -06:00
Copy over the code and tests, to see if this even works
This commit is contained in:
88
src/documents/plugins/date_parsing/__init__.py
Normal file
88
src/documents/plugins/date_parsing/__init__.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import logging
|
||||
from functools import lru_cache
|
||||
from importlib.metadata import EntryPoint
|
||||
from importlib.metadata import entry_points
|
||||
from typing import Final
|
||||
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
|
||||
from documents.plugins.date_parsing.base import DateParserConfig
|
||||
from documents.plugins.date_parsing.base import DateParserPluginBase
|
||||
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DATE_PARSER_ENTRY_POINT_GROUP: Final = "paperless_ngx.date_parsers"
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def _discover_parser_class() -> type[DateParserPluginBase]:
    """
    Discovers the date parser plugin class to use.

    Entry points registered under ``DATE_PARSER_ENTRY_POINT_GROUP`` are loaded
    and kept only when they resolve to a subclass of ``DateParserPluginBase``.

    - If one or more valid plugins are found, they are sorted by entry point
      name and the class of the first one is returned. A warning is logged
      when more than one valid plugin was found.
    - If no valid plugins are found, or the entry points cannot be queried,
      the default ``RegexDateParserPlugin`` is returned.

    The result is memoized by ``lru_cache``; call ``cache_clear()`` on this
    function to force a new discovery.
    """
    eps: tuple[EntryPoint, ...]
    try:
        eps = entry_points(group=DATE_PARSER_ENTRY_POINT_GROUP)
    except Exception as e:
        # Discovery is best-effort: fall through to the default parser.
        logger.warning(f"Could not query entry points for date parsers: {e}")
        eps = ()

    # Remember the class returned by the guarded load next to its entry point
    # name, so the selected plugin never has to be loaded a second time
    # outside of the try/except below.
    valid_plugins: list[tuple[str, type[DateParserPluginBase]]] = []
    for ep in eps:
        try:
            plugin_class = ep.load()
            if plugin_class and issubclass(plugin_class, DateParserPluginBase):
                valid_plugins.append((ep.name, plugin_class))
            else:
                logger.warning(f"Plugin {ep.name} does not subclass DateParser.")
        except Exception as e:
            # Covers import failures as well as issubclass() being handed a
            # non-class object.
            logger.error(f"Unable to load date parser plugin {ep.name}: {e}")

    if not valid_plugins:
        return RegexDateParserPlugin

    # Deterministic choice: alphabetical by entry point name.
    valid_plugins.sort(key=lambda plugin: plugin[0])
    selected_name, selected_class = valid_plugins[0]

    if len(valid_plugins) > 1:
        logger.warning(
            f"Multiple date parsers found: "
            f"{[name for name, _ in valid_plugins]}. "
            f"Using the first one by name: '{selected_name}'.",
        )

    return selected_class
|
||||
|
||||
|
||||
def get_date_parser() -> DateParserPluginBase:
    """
    Build and return a configured date parser instance.

    Steps, in order:
    1. Resolve the parser class via ``_discover_parser_class()`` (cached).
    2. Assemble a ``DateParserConfig`` from the Django settings, using the
       current time as the reference time.
    3. Instantiate the resolved class with that configuration.
    """
    plugin_cls = _discover_parser_class()

    # TODO: Get the language from the settings and/or configuration object
    parser_config = DateParserConfig(
        languages=settings.DATE_PARSER_LANGUAGES,
        timezone_str=settings.TIME_ZONE,
        ignore_dates=settings.IGNORE_DATES,
        reference_time=timezone.now(),
        filename_date_order=settings.FILENAME_DATE_ORDER,
        content_date_order=settings.DATE_ORDER,
    )

    return plugin_cls(config=parser_config)
|
||||
96
src/documents/plugins/date_parsing/base.py
Normal file
96
src/documents/plugins/date_parsing/base.py
Normal file
@@ -0,0 +1,96 @@
|
||||
import datetime
|
||||
import logging
|
||||
from abc import ABC
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Iterator
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import dateparser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class DateParserConfig:
|
||||
"""
|
||||
Configuration for a DateParser instance.
|
||||
|
||||
This object is created by the factory and passed to the
|
||||
parser's constructor, decoupling the parser from settings.
|
||||
"""
|
||||
|
||||
languages: list[str]
|
||||
timezone_str: str
|
||||
ignore_dates: set[datetime.date]
|
||||
|
||||
# A "now" timestamp for filtering future dates.
|
||||
# Passed in by the factory.
|
||||
reference_time: datetime.datetime
|
||||
|
||||
# Settings for the default RegexDateParser
|
||||
filename_date_order: str | None
|
||||
content_date_order: str
|
||||
|
||||
|
||||
class DateParserPluginBase(ABC):
    """
    Common base for every date parsing strategy.

    Subclasses implement ``parse``; the helpers here cover the shared work of
    turning a string into a datetime and deciding whether that datetime is
    acceptable. All options come from the ``DateParserConfig`` given to the
    constructor.
    """

    def __init__(self, config: DateParserConfig):
        """
        Store the configuration this parser will work with.
        """
        self.config = config

    def _parse_string(
        self,
        date_string: str,
        date_order: str,
    ) -> datetime.datetime | None:
        """
        Parse one candidate date string with dateparser.

        The date order is supplied by the caller; timezone and languages are
        taken from `self.config`. Any exception raised while parsing is
        logged as an error and reported as None.
        """
        try:
            parser_settings = {
                "DATE_ORDER": date_order,
                "PREFER_DAY_OF_MONTH": "first",
                "RETURN_AS_TIMEZONE_AWARE": True,
                "TIMEZONE": self.config.timezone_str,
            }
            return dateparser.parse(
                date_string,
                settings=parser_settings,
                locales=self.config.languages,
            )
        except Exception as e:
            logger.error(f"Error while parsing date string '{date_string}': {e}")
            return None

    def _filter_date(
        self,
        date: datetime.datetime | None,
    ) -> datetime.datetime | None:
        """
        Return the datetime unchanged if it is acceptable, otherwise None.

        A datetime is rejected when it is None, its year is 1900 or earlier,
        it lies after `self.config.reference_time`, or its calendar date is
        listed in `self.config.ignore_dates`.
        """
        if date is None:
            return None
        if date.year <= 1900:
            return None
        if date > self.config.reference_time:
            return None
        if date.date() in self.config.ignore_dates:
            return None
        return date

    @abstractmethod
    def parse(self, filename: Path, content: str) -> Iterator[datetime.datetime]:
        """
        Yield the valid datetimes found in a document's filename and content.
        """
|
||||
66
src/documents/plugins/date_parsing/regex_parser.py
Normal file
66
src/documents/plugins/date_parsing/regex_parser.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import datetime
|
||||
import re
|
||||
from collections.abc import Iterator
|
||||
from pathlib import Path
|
||||
from re import Match
|
||||
|
||||
from documents.plugins.date_parsing.base import DateParserPluginBase
|
||||
|
||||
|
||||
class RegexDateParserPlugin(DateParserPluginBase):
    """
    Default date parser: finds date-like substrings with one combined regular
    expression and hands each of them to the base class helpers.

    All options are read from the DateParserConfig given to the constructor.
    """

    # One alternation per supported notation (numeric and month-name forms),
    # matched case-insensitively.
    # NOTE(review): "(?!=" is a negative lookahead for a literal "=", it may
    # have been meant as a lookbehind "(?<=" - confirm before changing.
    DATE_REGEX = re.compile(
        r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
        re.IGNORECASE,
    )

    def _process_match(
        self,
        match: Match[str],
        date_order: str,
    ) -> datetime.datetime | None:
        """
        Turn one regex match into a validated datetime, or None.
        """
        parsed = self._parse_string(match.group(0), date_order)
        return self._filter_date(parsed)

    def _process_content(
        self,
        content: str,
        date_order: str,
    ) -> Iterator[datetime.datetime]:
        """
        Yield a valid datetime for every regex match in content that parses
        and passes filtering, in order of appearance.
        """
        for hit in self.DATE_REGEX.finditer(content):
            if (candidate := self._process_match(hit, date_order)) is not None:
                yield candidate

    def parse(self, filename: Path, content: str) -> Iterator[datetime.datetime]:
        """
        Yield dates from the filename first, then from the content.

        The filename is only searched when `self.config.filename_date_order`
        is set; the content is always searched using
        `self.config.content_date_order`.
        """
        filename_order = self.config.filename_date_order
        if filename_order:
            yield from self._process_content(filename.name, filename_order)

        yield from self._process_content(content, self.config.content_date_order)
|
||||
0
src/documents/tests/date_parsing/__init__.py
Normal file
0
src/documents/tests/date_parsing/__init__.py
Normal file
82
src/documents/tests/date_parsing/conftest.py
Normal file
82
src/documents/tests/date_parsing/conftest.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import datetime
|
||||
from collections.abc import Generator
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import pytest_django
|
||||
|
||||
from documents.plugins.date_parsing import _discover_parser_class
|
||||
from documents.plugins.date_parsing.base import DateParserConfig
|
||||
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||
|
||||
|
||||
@pytest.fixture
def base_config() -> DateParserConfig:
    """English/UTC configuration with no ignored dates, used by most tests."""
    reference = datetime.datetime(
        2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc
    )
    return DateParserConfig(
        languages=["en"],
        timezone_str="UTC",
        ignore_dates=set(),
        reference_time=reference,
        filename_date_order="YMD",
        content_date_order="DMY",
    )
|
||||
|
||||
|
||||
@pytest.fixture
def config_with_ignore_dates() -> DateParserConfig:
    """Configuration whose ignore list holds 1 Jan 2024 and 25 Dec 2024."""
    reference = datetime.datetime(
        2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc
    )
    ignored = {datetime.date(2024, 1, 1), datetime.date(2024, 12, 25)}
    return DateParserConfig(
        languages=["en", "de"],
        timezone_str="America/New_York",
        ignore_dates=ignored,
        reference_time=reference,
        filename_date_order="DMY",
        content_date_order="MDY",
    )
|
||||
|
||||
|
||||
@pytest.fixture
def regex_parser(base_config: DateParserConfig) -> RegexDateParserPlugin:
    """A RegexDateParserPlugin built from the ``base_config`` fixture."""
    parser = RegexDateParserPlugin(base_config)
    return parser
|
||||
|
||||
|
||||
@pytest.fixture
def clear_lru_cache() -> Generator[None, None, None]:
    """
    Reset the memoized result of _discover_parser_class around a test.

    The cache is emptied once before the test body runs and once more
    afterwards, so a cached class never carries over between tests.
    """
    reset = _discover_parser_class.cache_clear
    reset()
    yield
    reset()
|
||||
|
||||
|
||||
@pytest.fixture
def mock_date_parser_settings(settings: pytest_django.fixtures.SettingsWrapper) -> Any:
    """
    Set the Django settings read by the date parser factory to fixed values
    and return the settings wrapper.
    """
    overrides = {
        "DATE_PARSER_LANGUAGES": ["en", "de"],
        "TIME_ZONE": "UTC",
        "IGNORE_DATES": [datetime.date(1900, 1, 1)],
        "FILENAME_DATE_ORDER": "YMD",
        "DATE_ORDER": "DMY",
    }
    # Assign in the same order as listed above.
    for setting_name, value in overrides.items():
        setattr(settings, setting_name, value)
    return settings
|
||||
@@ -0,0 +1,237 @@
|
||||
import datetime
|
||||
import logging
|
||||
from collections.abc import Iterator
|
||||
from importlib.metadata import EntryPoint
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import pytest_mock
|
||||
from django.utils import timezone
|
||||
|
||||
from documents.plugins.date_parsing import DATE_PARSER_ENTRY_POINT_GROUP
|
||||
from documents.plugins.date_parsing import _discover_parser_class
|
||||
from documents.plugins.date_parsing import get_date_parser
|
||||
from documents.plugins.date_parsing.base import DateParserConfig
|
||||
from documents.plugins.date_parsing.base import DateParserPluginBase
|
||||
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||
|
||||
|
||||
class AlphaParser(DateParserPluginBase):
    """Minimal concrete parser used as a stand-in plugin in these tests."""

    def parse(self, filename: Path, content: str) -> Iterator[datetime.datetime]:
        """Yield the current time once, ignoring both arguments."""
        current = timezone.now()
        yield current
|
||||
|
||||
|
||||
class BetaParser(DateParserPluginBase):
    """Second minimal concrete parser, for tests that need two plugins."""

    def parse(self, filename: Path, content: str) -> Iterator[datetime.datetime]:
        """Yield the current time once, ignoring both arguments."""
        current = timezone.now()
        yield current
|
||||
|
||||
|
||||
@pytest.mark.date_parsing
@pytest.mark.usefixtures("clear_lru_cache")
class TestDiscoverParserClass:
    """Covers plugin discovery performed by _discover_parser_class()."""

    # Name under which the factory module looks up entry_points().
    ENTRY_POINTS_TARGET = "documents.plugins.date_parsing.entry_points"
    # Logger used by the factory module.
    FACTORY_LOGGER = "documents.plugins.date_parsing"

    @staticmethod
    def _stub_entry_point(mocker: pytest_mock.MockerFixture, name: str):
        """Create an EntryPoint-shaped mock carrying the given name."""
        stub = mocker.MagicMock(spec=EntryPoint)
        stub.name = name
        return stub

    def test_returns_default_when_no_plugins_found(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """With no entry points registered the default parser is chosen."""
        mocker.patch(self.ENTRY_POINTS_TARGET, return_value=tuple())

        assert _discover_parser_class() is RegexDateParserPlugin

    def test_returns_default_when_entrypoint_query_fails(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """A failing entry point query is logged and the default is used."""
        mocker.patch(self.ENTRY_POINTS_TARGET, side_effect=RuntimeError("boom"))

        discovered = _discover_parser_class()

        assert discovered is RegexDateParserPlugin
        assert "Could not query entry points" in caplog.text

    def test_filters_out_invalid_plugins(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """A plugin that is not a DateParserPluginBase subclass is rejected."""
        bad_ep = self._stub_entry_point(mocker, "bad_plugin")
        bad_ep.load.return_value = object  # not subclass of DateParser
        mocker.patch(self.ENTRY_POINTS_TARGET, return_value=(bad_ep,))

        discovered = _discover_parser_class()

        assert discovered is RegexDateParserPlugin
        assert "does not subclass DateParser" in caplog.text

    def test_skips_plugins_that_fail_to_load(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """A plugin whose load() raises is logged and skipped."""
        broken_ep = self._stub_entry_point(mocker, "failing_plugin")
        broken_ep.load.side_effect = ImportError("cannot import")
        mocker.patch(self.ENTRY_POINTS_TARGET, return_value=(broken_ep,))

        discovered = _discover_parser_class()

        assert discovered is RegexDateParserPlugin
        assert "Unable to load date parser plugin failing_plugin" in caplog.text

    def test_returns_single_valid_plugin_without_warning(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """A lone valid plugin is returned and no "multiple parsers" warning appears."""

        class AlphaPlugin(DateParserPluginBase):
            def parse(
                self,
                filename: Path,
                content: str,
            ) -> Iterator[datetime.datetime]:
                yield timezone.now()

        only_ep = self._stub_entry_point(mocker, "alpha")
        only_ep.load.return_value = AlphaPlugin
        patched_entry_points = mocker.patch(
            self.ENTRY_POINTS_TARGET,
            return_value=(only_ep,),
        )

        with caplog.at_level(logging.WARNING, logger=self.FACTORY_LOGGER):
            discovered = _discover_parser_class()

        # The query must be restricted to the date parser group
        patched_entry_points.assert_called_once_with(
            group=DATE_PARSER_ENTRY_POINT_GROUP,
        )

        # The very class registered above comes back
        assert discovered is AlphaPlugin

        # Nothing about multiple parsers may have been logged
        assert not any(
            "Multiple date parsers found" in record.message for record in caplog.records
        ), "Unexpected warning logged when only one plugin was found"

    def test_returns_first_valid_plugin_by_name(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """With several plugins the alphabetically first name wins."""
        alpha_ep = self._stub_entry_point(mocker, "alpha")
        alpha_ep.load.return_value = AlphaParser

        beta_ep = self._stub_entry_point(mocker, "beta")
        beta_ep.load.return_value = BetaParser

        # Deliberately handed over in reverse alphabetical order
        mocker.patch(self.ENTRY_POINTS_TARGET, return_value=(beta_ep, alpha_ep))

        assert _discover_parser_class() is AlphaParser

    def test_logs_warning_if_multiple_plugins_found(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """Finding more than one valid plugin produces a warning."""
        first_ep = self._stub_entry_point(mocker, "a")
        first_ep.load.return_value = AlphaParser

        second_ep = self._stub_entry_point(mocker, "b")
        second_ep.load.return_value = BetaParser

        mocker.patch(self.ENTRY_POINTS_TARGET, return_value=(first_ep, second_ep))

        with caplog.at_level(logging.WARNING, logger=self.FACTORY_LOGGER):
            discovered = _discover_parser_class()

        # "a" sorts before "b", so AlphaParser is selected
        assert discovered is AlphaParser

        # The warning about several parsers must be present
        assert any(
            "Multiple date parsers found" in record.message for record in caplog.records
        ), "Expected a warning about multiple date parsers"

    def test_cache_behavior_only_runs_once(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """Two consecutive calls query the entry points only once."""
        patched_entry_points = mocker.patch(
            self.ENTRY_POINTS_TARGET,
            return_value=tuple(),
        )

        # The first call fills the cache, the second must be served from it
        for _ in range(2):
            _discover_parser_class()

        patched_entry_points.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.date_parsing
@pytest.mark.usefixtures("mock_date_parser_settings")
class TestGetDateParser:
    """Covers the get_date_parser() factory function."""

    # Name under which the factory looks up the discovery function.
    DISCOVER_TARGET = "documents.plugins.date_parsing._discover_parser_class"

    def test_returns_instance_of_discovered_class(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """The factory instantiates the discovered class with settings-based config."""
        mocker.patch(self.DISCOVER_TARGET, return_value=AlphaParser)

        parser = get_date_parser()
        config = parser.config

        assert isinstance(parser, AlphaParser)
        assert isinstance(config, DateParserConfig)
        assert config.languages == ["en", "de"]
        assert config.timezone_str == "UTC"
        assert config.ignore_dates == [datetime.date(1900, 1, 1)]
        assert config.filename_date_order == "YMD"
        assert config.content_date_order == "DMY"

        # The reference time must be close to the current time
        drift_seconds = abs((config.reference_time - timezone.now()).total_seconds())
        assert drift_seconds < 2

    def test_uses_default_regex_parser_when_no_plugins(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """When discovery yields the default class, an instance of it is returned."""
        mocker.patch(self.DISCOVER_TARGET, return_value=RegexDateParserPlugin)

        assert isinstance(get_date_parser(), RegexDateParserPlugin)
|
||||
422
src/documents/tests/date_parsing/test_date_parsing.py
Normal file
422
src/documents/tests/date_parsing/test_date_parsing.py
Normal file
@@ -0,0 +1,422 @@
|
||||
import datetime
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import pytest_mock
|
||||
|
||||
from documents.plugins.date_parsing.base import DateParserConfig
|
||||
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||
|
||||
|
||||
@pytest.mark.date_parsing
class TestParseString:
    """Covers DateParserPluginBase._parse_string, exercised through the regex parser."""

    @pytest.mark.parametrize(
        ("date_string", "date_order", "expected_year"),
        [
            pytest.param("15/01/2024", "DMY", 2024, id="dmy_slash"),
            pytest.param("01/15/2024", "MDY", 2024, id="mdy_slash"),
            pytest.param("2024/01/15", "YMD", 2024, id="ymd_slash"),
            pytest.param("January 15, 2024", "DMY", 2024, id="month_name_comma"),
            pytest.param("15 Jan 2024", "DMY", 2024, id="day_abbr_month_year"),
            pytest.param("15.01.2024", "DMY", 2024, id="dmy_dot"),
            pytest.param("2024-01-15", "YMD", 2024, id="ymd_dash"),
        ],
    )
    def test_parse_string_valid_formats(
        self,
        regex_parser: RegexDateParserPlugin,
        date_string: str,
        date_order: str,
        expected_year: int,
    ) -> None:
        """Each supported notation parses to a datetime in the expected year."""
        parsed = regex_parser._parse_string(date_string, date_order)

        assert parsed is not None
        assert parsed.year == expected_year

    @pytest.mark.parametrize(
        "invalid_string",
        [
            pytest.param("not a date", id="plain_text"),
            pytest.param("32/13/2024", id="invalid_day_month"),
            pytest.param("", id="empty_string"),
            pytest.param("abc123xyz", id="alphanumeric_gibberish"),
            pytest.param("99/99/9999", id="out_of_range"),
        ],
    )
    def test_parse_string_invalid_input(
        self,
        regex_parser: RegexDateParserPlugin,
        invalid_string: str,
    ) -> None:
        """Strings that are not dates produce None."""
        assert regex_parser._parse_string(invalid_string, "DMY") is None

    def test_parse_string_handles_exceptions(
        self,
        caplog: pytest.LogCaptureFixture,
        mocker: pytest_mock.MockerFixture,
        regex_parser: RegexDateParserPlugin,
    ) -> None:
        """An exception raised by dateparser is logged and turned into None."""
        # dateparser.parse has to be replaced to force the exception
        mocker.patch(
            "documents.plugins.date_parsing.base.dateparser.parse",
            side_effect=ValueError("Parsing error: 01/01/2024"),
        )

        with caplog.at_level(
            logging.ERROR,
            logger="documents.plugins.date_parsing.base",
        ):
            parsed = regex_parser._parse_string("01/01/2024", "DMY")

        assert parsed is None

        # Exactly one record, at ERROR level
        assert len(caplog.records) == 1
        assert caplog.records[0].levelname == "ERROR"

        # The log text names the failure and carries the exception message
        assert "Error while parsing date string" in caplog.text
        assert "Parsing error: 01/01/2024" in caplog.text
|
||||
|
||||
|
||||
@pytest.mark.date_parsing
class TestFilterDate:
    """Covers DateParserPluginBase._filter_date, exercised through the regex parser."""

    # Shorthand used by the parametrized cases below.
    _UTC = datetime.timezone.utc

    @pytest.mark.parametrize(
        ("date", "expected_output"),
        [
            # Accepted dates
            pytest.param(
                datetime.datetime(2024, 1, 10, tzinfo=_UTC),
                datetime.datetime(2024, 1, 10, tzinfo=_UTC),
                id="valid_past_date",
            ),
            pytest.param(
                datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=_UTC),
                datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=_UTC),
                id="exactly_at_reference",
            ),
            pytest.param(
                datetime.datetime(1901, 1, 1, tzinfo=_UTC),
                datetime.datetime(1901, 1, 1, tzinfo=_UTC),
                id="year_1901_valid",
            ),
            # Later than reference_time
            pytest.param(
                datetime.datetime(2024, 1, 16, tzinfo=_UTC),
                None,
                id="future_date_day_after",
            ),
            # Calendar date listed in ignore_dates
            pytest.param(
                datetime.datetime(2024, 1, 1, 0, 0, 0, tzinfo=_UTC),
                None,
                id="ignored_date_midnight_jan1",
            ),
            pytest.param(
                datetime.datetime(2024, 1, 1, 10, 30, 0, tzinfo=_UTC),
                None,
                id="ignored_date_midday_jan1",
            ),
            pytest.param(
                datetime.datetime(2024, 12, 25, 15, 0, 0, tzinfo=_UTC),
                None,
                id="ignored_date_dec25_future",
            ),
            # Year 1900 or earlier
            pytest.param(
                datetime.datetime(1899, 12, 31, tzinfo=_UTC),
                None,
                id="year_1899",
            ),
            pytest.param(
                datetime.datetime(1900, 1, 1, tzinfo=_UTC),
                None,
                id="year_1900_boundary",
            ),
            # No date at all
            pytest.param(None, None, id="none_input"),
        ],
    )
    def test_filter_date_validation_rules(
        self,
        config_with_ignore_dates: DateParserConfig,
        date: datetime.datetime | None,
        expected_output: datetime.datetime | None,
    ) -> None:
        """Every validation rule accepts or rejects as expected."""
        plugin = RegexDateParserPlugin(config_with_ignore_dates)

        assert plugin._filter_date(date) == expected_output

    def test_filter_date_respects_ignore_dates(
        self,
        config_with_ignore_dates: DateParserConfig,
    ) -> None:
        """Dates on the ignore list are dropped, neighbouring dates are kept."""
        plugin = RegexDateParserPlugin(config_with_ignore_dates)
        utc = datetime.timezone.utc

        jan_first = datetime.datetime(2024, 1, 1, 12, 0, tzinfo=utc)
        christmas = datetime.datetime(2024, 12, 25, 15, 30, tzinfo=utc)
        jan_second = datetime.datetime(2024, 1, 2, 12, 0, tzinfo=utc)

        assert plugin._filter_date(jan_first) is None
        assert plugin._filter_date(christmas) is None
        assert plugin._filter_date(jan_second) == jan_second

    def test_filter_date_timezone_aware(
        self,
        regex_parser: RegexDateParserPlugin,
    ) -> None:
        """A timezone-aware datetime passes through with its tzinfo intact."""
        aware = datetime.datetime(2024, 1, 10, 12, 0, tzinfo=datetime.timezone.utc)

        filtered = regex_parser._filter_date(aware)

        assert filtered is not None
        assert filtered.tzinfo is not None
|
||||
|
||||
|
||||
@pytest.mark.date_parsing
@pytest.mark.regex_date_parser
class TestRegexDateParser:
    """End-to-end tests of RegexDateParserPlugin.parse with dateparser stubbed out."""

    # Shorthand used by the parametrized cases below.
    _UTC = datetime.timezone.utc
    # Name under which the base module looks up dateparser.parse.
    PARSE_TARGET = "documents.plugins.date_parsing.base.dateparser.parse"

    @staticmethod
    def _make_config(
        *,
        ignore_dates: set[datetime.date],
        filename_date_order: str | None,
    ) -> DateParserConfig:
        """Build an English/UTC config with the fixed reference time used here."""
        return DateParserConfig(
            languages=["en"],
            timezone_str="UTC",
            ignore_dates=ignore_dates,
            reference_time=datetime.datetime(
                2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc
            ),
            filename_date_order=filename_date_order,
            content_date_order="DMY",
        )

    @pytest.mark.parametrize(
        ("filename", "content", "expected"),
        [
            pytest.param(
                "report-2023-12-25.txt",
                "Event recorded on 25/12/2022.",
                [
                    datetime.datetime(2023, 12, 25, tzinfo=_UTC),
                    datetime.datetime(2022, 12, 25, tzinfo=_UTC),
                ],
                id="filename-y-m-d_and_content-d-m-y",
            ),
            pytest.param(
                "img_2023.01.02.jpg",
                "Taken on 01/02/2023",
                [
                    datetime.datetime(2023, 1, 2, tzinfo=_UTC),
                    datetime.datetime(2023, 2, 1, tzinfo=_UTC),
                ],
                id="ambiguous-dates-respect-orders",
            ),
            pytest.param(
                "notes.txt",
                "bad date 99/99/9999 and 25/12/2022",
                [
                    datetime.datetime(2022, 12, 25, tzinfo=_UTC),
                ],
                id="parse-exception-skips-bad-and-yields-good",
            ),
        ],
    )
    def test_parse_returns_expected_dates(
        self,
        base_config: DateParserConfig,
        mocker: pytest_mock.MockerFixture,
        filename: str,
        content: str,
        expected: list[datetime.datetime],
    ) -> None:
        """
        Drive RegexDateParserPlugin.parse as a whole.

        dateparser.parse is replaced by a stub so results are deterministic.
        """
        plugin = RegexDateParserPlugin(base_config)
        utc = datetime.timezone.utc

        def fake_parse(date_string: str, settings=None, locales=None):
            date_order = settings.get("DATE_ORDER") if settings else None

            # Filename-style YYYY-MM-DD / YYYY.MM.DD
            if "2023-12-25" in date_string or "2023.12.25" in date_string:
                return datetime.datetime(2023, 12, 25, tzinfo=utc)

            # Content date in DMY notation
            if "25/12/2022" in date_string or "25-12-2022" in date_string:
                return datetime.datetime(2022, 12, 25, tzinfo=utc)

            # Filename date in YMD notation
            if "2023.01.02" in date_string or "2023-01-02" in date_string:
                return datetime.datetime(2023, 1, 2, tzinfo=utc)

            # Ambiguous 01/02/2023: YMD order reads it as 2 Jan, "DMY" and
            # every other order read it as 1 Feb
            if "01/02/2023" in date_string:
                if date_order == "YMD":
                    return datetime.datetime(2023, 1, 2, tzinfo=utc)
                return datetime.datetime(2023, 2, 1, tzinfo=utc)

            # Malformed input makes the stub raise
            if "99/99/9999" in date_string or "bad date" in date_string:
                raise Exception("parse failed for malformed date")

            return None

        mocker.patch(self.PARSE_TARGET, side_effect=fake_parse)

        found = list(plugin.parse(Path(filename), content))

        assert found == expected
        for dt in found:
            assert dt.tzinfo is not None

    def test_parse_filters_future_and_ignored_dates(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """
        The parser must drop:
        - dates later than reference_time
        - dates whose .date() is listed in ignore_dates
        """
        plugin = RegexDateParserPlugin(
            self._make_config(
                ignore_dates={datetime.date(2023, 12, 10)},
                filename_date_order="YMD",
            ),
        )
        utc = datetime.timezone.utc

        def fake_parse(date_string: str, settings=None, locales=None):
            # On the ignore list
            if "10/12/2023" in date_string or "10-12-2023" in date_string:
                return datetime.datetime(2023, 12, 10, tzinfo=utc)
            # After reference_time, so it gets filtered
            if "01/02/2024" in date_string or "01-02-2024" in date_string:
                return datetime.datetime(2024, 2, 1, tzinfo=utc)
            # Acceptable
            if "05/01/2023" in date_string or "05-01-2023" in date_string:
                return datetime.datetime(2023, 1, 5, tzinfo=utc)
            return None

        mocker.patch(self.PARSE_TARGET, side_effect=fake_parse)

        text = "Ignored: 10/12/2023, Future: 01/02/2024, Keep: 05/01/2023"
        found = list(plugin.parse(Path("whatever.txt"), text))

        assert found == [datetime.datetime(2023, 1, 5, tzinfo=utc)]

    def test_parse_handles_no_matches_and_returns_empty_list(
        self,
        base_config: DateParserConfig,
    ) -> None:
        """
        Without any date-like substring, parse yields nothing.
        """
        plugin = RegexDateParserPlugin(base_config)

        found = list(
            plugin.parse(Path("no-dates.txt"), "this has no dates whatsoever"),
        )

        assert found == []

    def test_parse_skips_filename_when_filename_date_order_none(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """
        With filename_date_order set to None the filename must not be parsed.
        Only dates from the content may reach dateparser.parse.
        """
        plugin = RegexDateParserPlugin(
            self._make_config(ignore_dates=set(), filename_date_order=None),
        )
        utc = datetime.timezone.utc

        def fake_parse(date_string: str, settings=None, locales=None):
            # Different results per source, to tell filename and content apart
            if "25/12/2022" in date_string:
                return datetime.datetime(2022, 12, 25, tzinfo=utc)
            if "2023-12-25" in date_string:
                return datetime.datetime(2023, 12, 25, tzinfo=utc)
            return None

        # Keep the mock so its calls can be inspected afterwards
        parse_mock = mocker.patch(self.PARSE_TARGET, side_effect=fake_parse)

        found = list(
            plugin.parse(
                Path("report-2023-12-25.txt"),
                "Event recorded on 25/12/2022.",
            ),
        )

        # Exactly one call: the date from the content
        assert parse_mock.call_count == 1

        # First positional argument of that first call
        called_date_string = parse_mock.call_args_list[0][0][0]
        assert "25/12/2022" in called_date_string

        # And the matching datetime is what the parser yielded
        assert found == [
            datetime.datetime(2022, 12, 25, tzinfo=utc),
        ]
|
||||
Reference in New Issue
Block a user