mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-02-11 23:59:31 -06:00
Feature: Enable users to customize date parsing via plugins (#11931)
This commit is contained in:
0
src/documents/tests/date_parsing/__init__.py
Normal file
0
src/documents/tests/date_parsing/__init__.py
Normal file
82
src/documents/tests/date_parsing/conftest.py
Normal file
82
src/documents/tests/date_parsing/conftest.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import datetime
|
||||
from collections.abc import Generator
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import pytest_django
|
||||
|
||||
from documents.plugins.date_parsing import _discover_parser_class
|
||||
from documents.plugins.date_parsing.base import DateParserConfig
|
||||
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||
|
||||
|
||||
@pytest.fixture
def base_config() -> DateParserConfig:
    """
    Provide a minimal, English-only parser configuration in UTC.

    The reference time is pinned to 2024-01-15 12:00:00 UTC, the ignore set is
    empty, filenames use the YMD order and content uses the DMY order.
    """
    pinned_now = datetime.datetime(
        2024,
        1,
        15,
        12,
        0,
        0,
        tzinfo=datetime.timezone.utc,
    )
    return DateParserConfig(
        languages=["en"],
        timezone_str="UTC",
        ignore_dates=set(),
        reference_time=pinned_now,
        filename_date_order="YMD",
        content_date_order="DMY",
    )
|
||||
|
||||
|
||||
@pytest.fixture
def config_with_ignore_dates() -> DateParserConfig:
    """
    Provide a configuration whose ignore set holds two calendar days.

    2024-01-01 and 2024-12-25 are listed in ``ignore_dates``. The reference
    time is 2024-01-15 12:00:00 UTC, the timezone string is America/New_York,
    and the languages are English and German.
    """
    skipped_days = {datetime.date(2024, 1, 1), datetime.date(2024, 12, 25)}
    pinned_now = datetime.datetime(
        2024,
        1,
        15,
        12,
        0,
        0,
        tzinfo=datetime.timezone.utc,
    )
    return DateParserConfig(
        languages=["en", "de"],
        timezone_str="America/New_York",
        ignore_dates=skipped_days,
        reference_time=pinned_now,
        filename_date_order="DMY",
        content_date_order="MDY",
    )
|
||||
|
||||
|
||||
@pytest.fixture
def regex_parser(base_config: DateParserConfig) -> RegexDateParserPlugin:
    """Build a RegexDateParserPlugin from the ``base_config`` fixture."""
    plugin = RegexDateParserPlugin(base_config)
    return plugin
|
||||
|
||||
|
||||
@pytest.fixture
def clear_lru_cache() -> Generator[None, None, None]:
    """
    Reset the cache of ``_discover_parser_class`` around a test.

    The cache is cleared once before the test body runs and once more after
    it, so a cached discovery result does not carry over in either direction.
    """
    reset_cache = _discover_parser_class.cache_clear
    reset_cache()
    yield
    reset_cache()
|
||||
|
||||
|
||||
@pytest.fixture
def mock_date_parser_settings(settings: pytest_django.fixtures.SettingsWrapper) -> Any:
    """
    Apply fixed date-parsing values to the Django settings for one test.

    The overrides are written through the pytest-django ``settings`` wrapper,
    and that same wrapper is returned to the caller.
    """
    overrides = {
        "DATE_PARSER_LANGUAGES": ["en", "de"],
        "TIME_ZONE": "UTC",
        "IGNORE_DATES": [datetime.date(1900, 1, 1)],
        "FILENAME_DATE_ORDER": "YMD",
        "DATE_ORDER": "DMY",
    }
    # dicts preserve insertion order, so the settings are assigned in the
    # order they are listed above.
    for setting_name, setting_value in overrides.items():
        setattr(settings, setting_name, setting_value)
    return settings
|
||||
@@ -0,0 +1,229 @@
|
||||
import datetime
|
||||
import logging
|
||||
from collections.abc import Iterator
|
||||
from importlib.metadata import EntryPoint
|
||||
|
||||
import pytest
|
||||
import pytest_mock
|
||||
from django.utils import timezone
|
||||
|
||||
from documents.plugins.date_parsing import DATE_PARSER_ENTRY_POINT_GROUP
|
||||
from documents.plugins.date_parsing import _discover_parser_class
|
||||
from documents.plugins.date_parsing import get_date_parser
|
||||
from documents.plugins.date_parsing.base import DateParserConfig
|
||||
from documents.plugins.date_parsing.base import DateParserPluginBase
|
||||
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||
|
||||
|
||||
class AlphaParser(DateParserPluginBase):
    """Stub parser plugin used as a stand-in for a discovered plugin class."""

    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
        """Yield the current time once; both arguments are ignored."""
        current_time = timezone.now()
        yield current_time
|
||||
|
||||
|
||||
class BetaParser(DateParserPluginBase):
    """Second stub parser plugin, distinct from AlphaParser only by identity."""

    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
        """Yield the current time once; both arguments are ignored."""
        current_time = timezone.now()
        yield current_time
|
||||
|
||||
|
||||
@pytest.mark.date_parsing
@pytest.mark.usefixtures("clear_lru_cache")
class TestDiscoverParserClass:
    """Exercise _discover_parser_class() against a patched entry point lookup."""

    @staticmethod
    def _fake_entry_point(mocker: pytest_mock.MockerFixture, name: str):
        """Return a MagicMock specced to EntryPoint carrying the given name."""
        entry_point = mocker.MagicMock(spec=EntryPoint)
        entry_point.name = name
        return entry_point

    @staticmethod
    def _patch_entry_points(mocker: pytest_mock.MockerFixture, **patch_kwargs):
        """Patch the entry_points name used by the plugin module; return the mock."""
        return mocker.patch(
            "documents.plugins.date_parsing.entry_points",
            **patch_kwargs,
        )

    def test_returns_default_when_no_plugins_found(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """An empty entry point result falls back to RegexDateParserPlugin."""
        self._patch_entry_points(mocker, return_value=())

        discovered = _discover_parser_class()

        assert discovered is RegexDateParserPlugin

    def test_returns_default_when_entrypoint_query_fails(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """A raising entry point query falls back to the default and is logged."""
        self._patch_entry_points(mocker, side_effect=RuntimeError("boom"))

        discovered = _discover_parser_class()

        assert discovered is RegexDateParserPlugin
        assert "Could not query entry points" in caplog.text

    def test_filters_out_invalid_plugins(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """An entry point loading a non-parser class is rejected and logged."""
        bad_entry_point = self._fake_entry_point(mocker, "bad_plugin")
        # ``object`` is deliberately not a subclass of the parser base class
        bad_entry_point.load.return_value = object
        self._patch_entry_points(mocker, return_value=(bad_entry_point,))

        discovered = _discover_parser_class()

        assert discovered is RegexDateParserPlugin
        assert "does not subclass DateParser" in caplog.text

    def test_skips_plugins_that_fail_to_load(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """An entry point whose load() raises is skipped and logged."""
        broken_entry_point = self._fake_entry_point(mocker, "failing_plugin")
        broken_entry_point.load.side_effect = ImportError("cannot import")
        self._patch_entry_points(mocker, return_value=(broken_entry_point,))

        discovered = _discover_parser_class()

        assert discovered is RegexDateParserPlugin
        assert "Unable to load date parser plugin failing_plugin" in caplog.text

    def test_returns_single_valid_plugin_without_warning(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """A lone valid plugin is returned and no multi-plugin warning is logged."""
        only_entry_point = self._fake_entry_point(mocker, "alpha")
        only_entry_point.load.return_value = AlphaParser
        entry_points_mock = self._patch_entry_points(
            mocker,
            return_value=(only_entry_point,),
        )

        with caplog.at_level(
            logging.WARNING,
            logger="documents.plugins.date_parsing",
        ):
            discovered = _discover_parser_class()

        # The lookup must be restricted to the date parser entry point group
        entry_points_mock.assert_called_once_with(group=DATE_PARSER_ENTRY_POINT_GROUP)

        # The stub class itself must come back, not a wrapper or the default
        assert discovered is AlphaParser

        # The multi-plugin warning must be absent from the captured records
        logged_messages = [record.message for record in caplog.records]
        assert not any(
            "Multiple date parsers found" in message for message in logged_messages
        ), "Unexpected warning logged when only one plugin was found"

    def test_returns_first_valid_plugin_by_name(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """With "beta" listed before "alpha", the "alpha" plugin is still chosen."""
        alpha_entry_point = self._fake_entry_point(mocker, "alpha")
        alpha_entry_point.load.return_value = AlphaParser

        beta_entry_point = self._fake_entry_point(mocker, "beta")
        beta_entry_point.load.return_value = BetaParser

        # Hand the entry points over in reverse name order on purpose
        self._patch_entry_points(
            mocker,
            return_value=(beta_entry_point, alpha_entry_point),
        )

        discovered = _discover_parser_class()

        assert discovered is AlphaParser

    def test_logs_warning_if_multiple_plugins_found(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """Two valid plugins yield the one named "a" plus a logged warning."""
        first_entry_point = self._fake_entry_point(mocker, "a")
        first_entry_point.load.return_value = AlphaParser

        second_entry_point = self._fake_entry_point(mocker, "b")
        second_entry_point.load.return_value = BetaParser

        self._patch_entry_points(
            mocker,
            return_value=(first_entry_point, second_entry_point),
        )

        with caplog.at_level(
            logging.WARNING,
            logger="documents.plugins.date_parsing",
        ):
            discovered = _discover_parser_class()

        # The plugin registered under "a" is the expected pick
        assert discovered is AlphaParser

        # A warning that mentions multiple parsers must have been captured
        logged_messages = [record.message for record in caplog.records]
        assert any(
            "Multiple date parsers found" in message for message in logged_messages
        ), "Expected a warning about multiple date parsers"

    def test_cache_behavior_only_runs_once(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """Two consecutive calls query the entry points only once."""
        entry_points_mock = self._patch_entry_points(mocker, return_value=())

        # The first call fills the cache; the second must be served from it
        for _ in range(2):
            _discover_parser_class()

        entry_points_mock.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.django_db
@pytest.mark.date_parsing
@pytest.mark.usefixtures("mock_date_parser_settings")
class TestGetDateParser:
    """Exercise the get_date_parser() factory with discovery patched out."""

    def test_returns_instance_of_discovered_class(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """The factory instantiates the discovered class with settings-derived config."""
        mocker.patch(
            "documents.plugins.date_parsing._discover_parser_class",
            return_value=AlphaParser,
        )

        parser = get_date_parser()
        assert isinstance(parser, AlphaParser)

        config = parser.config
        assert isinstance(config, DateParserConfig)
        assert config.languages == ["en", "de"]
        assert config.timezone_str == "UTC"
        assert config.ignore_dates == [datetime.date(1900, 1, 1)]
        assert config.filename_date_order == "YMD"
        assert config.content_date_order == "DMY"

        # reference_time should be within two seconds of the current time
        drift_seconds = abs((config.reference_time - timezone.now()).total_seconds())
        assert drift_seconds < 2

    def test_uses_default_regex_parser_when_no_plugins(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """When discovery returns RegexDateParserPlugin, an instance of it is built."""
        mocker.patch(
            "documents.plugins.date_parsing._discover_parser_class",
            return_value=RegexDateParserPlugin,
        )

        assert isinstance(get_date_parser(), RegexDateParserPlugin)
|
||||
433
src/documents/tests/date_parsing/test_date_parsing.py
Normal file
433
src/documents/tests/date_parsing/test_date_parsing.py
Normal file
@@ -0,0 +1,433 @@
|
||||
import datetime
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import pytest_mock
|
||||
|
||||
from documents.plugins.date_parsing.base import DateParserConfig
|
||||
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||
|
||||
|
||||
@pytest.mark.date_parsing
class TestParseString:
    """Exercise the _parse_string method through RegexDateParserPlugin."""

    @pytest.mark.parametrize(
        ("date_string", "date_order", "expected_year"),
        [
            pytest.param("15/01/2024", "DMY", 2024, id="dmy_slash"),
            pytest.param("01/15/2024", "MDY", 2024, id="mdy_slash"),
            pytest.param("2024/01/15", "YMD", 2024, id="ymd_slash"),
            pytest.param("January 15, 2024", "DMY", 2024, id="month_name_comma"),
            pytest.param("15 Jan 2024", "DMY", 2024, id="day_abbr_month_year"),
            pytest.param("15.01.2024", "DMY", 2024, id="dmy_dot"),
            pytest.param("2024-01-15", "YMD", 2024, id="ymd_dash"),
        ],
    )
    def test_parse_string_valid_formats(
        self,
        regex_parser: RegexDateParserPlugin,
        date_string: str,
        date_order: str,
        expected_year: int,
    ) -> None:
        """Each well-formed input parses to a value carrying the expected year."""
        parsed = regex_parser._parse_string(date_string, date_order)

        assert parsed is not None
        assert parsed.year == expected_year

    @pytest.mark.parametrize(
        "invalid_string",
        [
            pytest.param("not a date", id="plain_text"),
            pytest.param("32/13/2024", id="invalid_day_month"),
            pytest.param("", id="empty_string"),
            pytest.param("abc123xyz", id="alphanumeric_gibberish"),
            pytest.param("99/99/9999", id="out_of_range"),
        ],
    )
    def test_parse_string_invalid_input(
        self,
        regex_parser: RegexDateParserPlugin,
        invalid_string: str,
    ) -> None:
        """Inputs that are not dates produce None under the DMY order."""
        assert regex_parser._parse_string(invalid_string, "DMY") is None

    def test_parse_string_handles_exceptions(
        self,
        caplog: pytest.LogCaptureFixture,
        mocker: pytest_mock.MockerFixture,
        regex_parser: RegexDateParserPlugin,
    ) -> None:
        """An exception raised by dateparser.parse is logged and None is returned."""
        # Force the underlying dateparser call to raise
        mocker.patch(
            "documents.plugins.date_parsing.base.dateparser.parse",
            side_effect=ValueError(
                "Parsing error: 01/01/2024",
            ),
        )

        with caplog.at_level(
            logging.ERROR,
            logger="documents.plugins.date_parsing.base",
        ):
            parsed = regex_parser._parse_string("01/01/2024", "DMY")

            assert parsed is None

            # Exactly one record, at ERROR level, must have been captured
            assert [record.levelname for record in caplog.records] == ["ERROR"]

            # The log output names the failure and carries the exception text
            assert "Error while parsing date string" in caplog.text
            assert "Parsing error: 01/01/2024" in caplog.text
|
||||
|
||||
|
||||
@pytest.mark.date_parsing
class TestFilterDate:
    """Exercise the _filter_date method through RegexDateParserPlugin."""

    @pytest.mark.parametrize(
        ("date", "expected_output"),
        [
            # Accepted: returned unchanged
            pytest.param(
                datetime.datetime(2024, 1, 10, tzinfo=datetime.timezone.utc),
                datetime.datetime(2024, 1, 10, tzinfo=datetime.timezone.utc),
                id="valid_past_date",
            ),
            pytest.param(
                datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc),
                datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc),
                id="exactly_at_reference",
            ),
            pytest.param(
                datetime.datetime(1901, 1, 1, tzinfo=datetime.timezone.utc),
                datetime.datetime(1901, 1, 1, tzinfo=datetime.timezone.utc),
                id="year_1901_valid",
            ),
            # Rejected: later than the configured reference_time
            pytest.param(
                datetime.datetime(2024, 1, 16, tzinfo=datetime.timezone.utc),
                None,
                id="future_date_day_after",
            ),
            # Rejected: calendar day is listed in ignore_dates
            pytest.param(
                datetime.datetime(2024, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc),
                None,
                id="ignored_date_midnight_jan1",
            ),
            pytest.param(
                datetime.datetime(2024, 1, 1, 10, 30, 0, tzinfo=datetime.timezone.utc),
                None,
                id="ignored_date_midday_jan1",
            ),
            pytest.param(
                datetime.datetime(2024, 12, 25, 15, 0, 0, tzinfo=datetime.timezone.utc),
                None,
                id="ignored_date_dec25_future",
            ),
            # Rejected: year is 1900 or earlier
            pytest.param(
                datetime.datetime(1899, 12, 31, tzinfo=datetime.timezone.utc),
                None,
                id="year_1899",
            ),
            pytest.param(
                datetime.datetime(1900, 1, 1, tzinfo=datetime.timezone.utc),
                None,
                id="year_1900_boundary",
            ),
            # Rejected: nothing to filter
            pytest.param(None, None, id="none_input"),
        ],
    )
    def test_filter_date_validation_rules(
        self,
        config_with_ignore_dates: DateParserConfig,
        date: datetime.datetime | None,
        expected_output: datetime.datetime | None,
    ) -> None:
        """Each candidate is either passed through unchanged or turned into None."""
        plugin = RegexDateParserPlugin(config_with_ignore_dates)

        assert plugin._filter_date(date) == expected_output

    def test_filter_date_respects_ignore_dates(
        self,
        config_with_ignore_dates: DateParserConfig,
    ) -> None:
        """Datetimes on an ignored day are dropped; the following day is kept."""
        plugin = RegexDateParserPlugin(config_with_ignore_dates)
        utc = datetime.timezone.utc

        on_ignored_days = (
            datetime.datetime(2024, 1, 1, 12, 0, tzinfo=utc),
            datetime.datetime(2024, 12, 25, 15, 30, tzinfo=utc),
        )
        for candidate in on_ignored_days:
            assert plugin._filter_date(candidate) is None

        kept = datetime.datetime(2024, 1, 2, 12, 0, tzinfo=utc)
        assert plugin._filter_date(kept) == kept

    def test_filter_date_timezone_aware(
        self,
        regex_parser: RegexDateParserPlugin,
    ) -> None:
        """A timezone-aware input is accepted and the result still has a tzinfo."""
        aware_input = datetime.datetime(2024, 1, 10, 12, 0, tzinfo=datetime.timezone.utc)

        filtered = regex_parser._filter_date(aware_input)

        assert filtered is not None
        assert filtered.tzinfo is not None
|
||||
|
||||
|
||||
@pytest.mark.date_parsing
class TestRegexDateParser:
    """
    High-level tests for RegexDateParserPlugin.parse.

    Where a test patches ``dateparser.parse``, the replacement maps known
    substrings to fixed datetimes so the outcome does not depend on the real
    dateparser library.
    """

    @pytest.mark.parametrize(
        ("filename", "content", "expected"),
        [
            pytest.param(
                "report-2023-12-25.txt",
                "Event recorded on 25/12/2022.",
                [
                    datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc),
                    datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
                ],
                id="filename-y-m-d_and_content-d-m-y",
            ),
            pytest.param(
                "img_2023.01.02.jpg",
                "Taken on 01/02/2023",
                [
                    datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc),
                    datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc),
                ],
                id="ambiguous-dates-respect-orders",
            ),
            pytest.param(
                "notes.txt",
                "bad date 99/99/9999 and 25/12/2022",
                [
                    datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
                ],
                id="parse-exception-skips-bad-and-yields-good",
            ),
        ],
    )
    def test_parse_returns_expected_dates(
        self,
        base_config: DateParserConfig,
        mocker: pytest_mock.MockerFixture,
        filename: str,
        content: str,
        expected: list[datetime.datetime],
    ) -> None:
        """
        High-level tests that exercise RegexDateParser.parse only.

        dateparser.parse is mocked so tests are deterministic. The expected
        lists place the filename-derived date before the content-derived one.
        """
        parser = RegexDateParserPlugin(base_config)

        # Patch the dateparser.parse
        target = "documents.plugins.date_parsing.base.dateparser.parse"

        def fake_parse(
            date_string: str,
            settings: dict[str, Any] | None = None,
            locales: None = None,
        ) -> datetime.datetime | None:
            """Map known substrings to fixed datetimes; raise for malformed input."""
            date_order = settings.get("DATE_ORDER") if settings else None

            # Filename-style YYYY-MM-DD / YYYY.MM.DD
            # (the original chain tested "2023-12-25" twice; the redundant
            # operand has been removed)
            if "2023-12-25" in date_string or "2023.12.25" in date_string:
                return datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc)

            # content DMY 25/12/2022
            if "25/12/2022" in date_string or "25-12-2022" in date_string:
                return datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc)

            # filename YMD 2023.01.02
            if "2023.01.02" in date_string or "2023-01-02" in date_string:
                return datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc)

            # ambiguous 01/02/2023 -> respect DATE_ORDER setting
            if "01/02/2023" in date_string:
                if date_order == "DMY":
                    return datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc)
                if date_order == "YMD":
                    return datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc)
                # fallback
                return datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc)

            # simulate parse failure for malformed input
            if "99/99/9999" in date_string or "bad date" in date_string:
                raise Exception("parse failed for malformed date")

            return None

        mocker.patch(target, side_effect=fake_parse)

        results = list(parser.parse(filename, content))

        assert results == expected
        for dt in results:
            assert dt.tzinfo is not None

    def test_parse_filters_future_and_ignored_dates(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """
        Ensure parser filters out:
        - dates after reference_time
        - dates whose .date() are in ignore_dates
        """
        cfg = DateParserConfig(
            languages=["en"],
            timezone_str="UTC",
            ignore_dates={datetime.date(2023, 12, 10)},
            reference_time=datetime.datetime(
                2024,
                1,
                15,
                12,
                0,
                0,
                tzinfo=datetime.timezone.utc,
            ),
            filename_date_order="YMD",
            content_date_order="DMY",
        )
        parser = RegexDateParserPlugin(cfg)

        target = "documents.plugins.date_parsing.base.dateparser.parse"

        def fake_parse(
            date_string: str,
            settings: dict[str, Any] | None = None,
            locales: None = None,
        ) -> datetime.datetime | None:
            """Return one ignored, one future and one valid datetime by substring."""
            if "10/12/2023" in date_string or "10-12-2023" in date_string:
                # ignored date
                return datetime.datetime(2023, 12, 10, tzinfo=datetime.timezone.utc)
            if "01/02/2024" in date_string or "01-02-2024" in date_string:
                # future relative to reference_time -> filtered
                return datetime.datetime(2024, 2, 1, tzinfo=datetime.timezone.utc)
            if "05/01/2023" in date_string or "05-01-2023" in date_string:
                # valid
                return datetime.datetime(2023, 1, 5, tzinfo=datetime.timezone.utc)
            return None

        mocker.patch(target, side_effect=fake_parse)

        content = "Ignored: 10/12/2023, Future: 01/02/2024, Keep: 05/01/2023"
        results = list(parser.parse("whatever.txt", content))

        assert results == [datetime.datetime(2023, 1, 5, tzinfo=datetime.timezone.utc)]

    def test_parse_handles_no_matches_and_returns_empty_list(
        self,
        base_config: DateParserConfig,
    ) -> None:
        """
        When there are no matching date-like substrings, parse should yield nothing.
        """
        parser = RegexDateParserPlugin(base_config)
        results = list(
            parser.parse("no-dates.txt", "this has no dates whatsoever"),
        )
        assert results == []

    def test_parse_skips_filename_when_filename_date_order_none(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """
        When filename_date_order is None the parser must not attempt to parse the filename.
        Only dates found in the content should be passed to dateparser.parse.
        """
        cfg = DateParserConfig(
            languages=["en"],
            timezone_str="UTC",
            ignore_dates=set(),
            reference_time=datetime.datetime(
                2024,
                1,
                15,
                12,
                0,
                0,
                tzinfo=datetime.timezone.utc,
            ),
            filename_date_order=None,
            content_date_order="DMY",
        )
        parser = RegexDateParserPlugin(cfg)

        # Patch the module's dateparser.parse so we can inspect calls
        target = "documents.plugins.date_parsing.base.dateparser.parse"

        def fake_parse(
            date_string: str,
            settings: dict[str, Any] | None = None,
            locales: None = None,
        ) -> datetime.datetime | None:
            """Return a different datetime for the content and filename dates."""
            # return distinct datetimes so we can tell which source was parsed
            if "25/12/2022" in date_string:
                return datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc)
            if "2023-12-25" in date_string:
                return datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc)
            return None

        mock = mocker.patch(target, side_effect=fake_parse)

        filename = "report-2023-12-25.txt"
        content = "Event recorded on 25/12/2022."

        results = list(parser.parse(filename, content))

        # Only the content date should have been parsed -> one call
        assert mock.call_count == 1

        # First call, first positional argument
        called_date_string = mock.call_args_list[0].args[0]
        assert "25/12/2022" in called_date_string
        # And the parser should have yielded the corresponding datetime
        assert results == [
            datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
        ]
|
||||
@@ -1989,11 +1989,11 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
|
||||
@mock.patch("documents.parsers.parse_date_generator")
|
||||
@mock.patch("documents.views.get_date_parser")
|
||||
@override_settings(NUMBER_OF_SUGGESTED_DATES=0)
|
||||
def test_get_suggestions_dates_disabled(
|
||||
self,
|
||||
parse_date_generator,
|
||||
mock_get_date_parser: mock.MagicMock,
|
||||
):
|
||||
"""
|
||||
GIVEN:
|
||||
@@ -2010,7 +2010,8 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
)
|
||||
|
||||
self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
||||
self.assertFalse(parse_date_generator.called)
|
||||
|
||||
mock_get_date_parser.assert_not_called()
|
||||
|
||||
def test_saved_views(self) -> None:
|
||||
u1 = User.objects.create_superuser("user1")
|
||||
|
||||
@@ -1,538 +0,0 @@
|
||||
import datetime
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
import pytest
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
|
||||
from documents.parsers import parse_date
|
||||
from documents.parsers import parse_date_generator
|
||||
|
||||
|
||||
@pytest.mark.django_db()
|
||||
class TestDate:
|
||||
def test_date_format_1(self) -> None:
|
||||
text = "lorem ipsum 130218 lorem ipsum"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_2(self) -> None:
|
||||
text = "lorem ipsum 2018 lorem ipsum"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_3(self) -> None:
|
||||
text = "lorem ipsum 20180213 lorem ipsum"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_4(self, settings_timezone: ZoneInfo) -> None:
|
||||
text = "lorem ipsum 13.02.2018 lorem ipsum"
|
||||
date = parse_date("", text)
|
||||
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
|
||||
|
||||
def test_date_format_5(self, settings_timezone: ZoneInfo) -> None:
|
||||
text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum"
|
||||
date = parse_date("", text)
|
||||
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
|
||||
|
||||
def test_date_format_6(self) -> None:
|
||||
text = (
|
||||
"lorem ipsum\n"
|
||||
"Wohnort\n"
|
||||
"3100\n"
|
||||
"IBAN\n"
|
||||
"AT87 4534\n"
|
||||
"1234\n"
|
||||
"1234 5678\n"
|
||||
"BIC\n"
|
||||
"lorem ipsum"
|
||||
)
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_7(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
settings_timezone: ZoneInfo,
|
||||
) -> None:
|
||||
settings.DATE_PARSER_LANGUAGES = ["de"]
|
||||
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
|
||||
date = parse_date("", text)
|
||||
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
|
||||
|
||||
def test_date_format_8(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
settings_timezone: ZoneInfo,
|
||||
) -> None:
|
||||
settings.DATE_PARSER_LANGUAGES = ["de"]
|
||||
text = (
|
||||
"lorem ipsum\n"
|
||||
"Wohnort\n"
|
||||
"3100\n"
|
||||
"IBAN\n"
|
||||
"AT87 4534\n"
|
||||
"1234\n"
|
||||
"1234 5678\n"
|
||||
"BIC\n"
|
||||
"lorem ipsum\n"
|
||||
"März 2020"
|
||||
)
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2020,
|
||||
3,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_9(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
settings_timezone: ZoneInfo,
|
||||
) -> None:
|
||||
settings.DATE_PARSER_LANGUAGES = ["de"]
|
||||
text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2020,
|
||||
3,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_10(self, settings_timezone: ZoneInfo) -> None:
|
||||
text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
22,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_11(self, settings_timezone: ZoneInfo) -> None:
|
||||
text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
22,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_12(self, settings_timezone: ZoneInfo) -> None:
|
||||
text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
22,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_13(self, settings_timezone: ZoneInfo) -> None:
|
||||
text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
22,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_14(self, settings_timezone: ZoneInfo) -> None:
|
||||
text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
22,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_15(self) -> None:
|
||||
text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_16(self) -> None:
|
||||
text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_17(self) -> None:
|
||||
text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_18(self) -> None:
|
||||
text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_19(self, settings_timezone: ZoneInfo) -> None:
|
||||
text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
21,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_20(self, settings_timezone: ZoneInfo) -> None:
    """An ordinal day with a full month name (``22nd March 2022``) is parsed."""
    content = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
    expected = datetime.datetime(2022, 3, 22, 0, 0, tzinfo=settings_timezone)

    assert parse_date("", content) == expected
|
||||
|
||||
def test_date_format_21(self, settings_timezone: ZoneInfo) -> None:
    """A single-digit ordinal day (``2nd MAR 2022``) is parsed."""
    content = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
    expected = datetime.datetime(2022, 3, 2, 0, 0, tzinfo=settings_timezone)

    assert parse_date("", content) == expected
|
||||
|
||||
def test_date_format_22(self, settings_timezone: ZoneInfo) -> None:
    """An ordinal day with the ``rd`` suffix (``23rd MAR 2022``) is parsed."""
    content = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
    expected = datetime.datetime(2022, 3, 23, 0, 0, tzinfo=settings_timezone)

    assert parse_date("", content) == expected
|
||||
|
||||
def test_date_format_23(self, settings_timezone: ZoneInfo) -> None:
    """An ordinal day with the ``th`` suffix (``24th MAR 2022``) is parsed."""
    content = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
    expected = datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone)

    assert parse_date("", content) == expected
|
||||
|
||||
def test_date_format_24(self, settings_timezone: ZoneInfo) -> None:
    """A hyphen-separated ``DD-MON-YYYY`` date with an upper-case month is parsed."""
    content = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
    expected = datetime.datetime(2022, 3, 21, 0, 0, tzinfo=settings_timezone)

    assert parse_date("", content) == expected
|
||||
|
||||
def test_date_format_25(self, settings_timezone: ZoneInfo) -> None:
    """An upper-case ordinal suffix (``25TH MAR 2022``) is parsed."""
    content = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
    expected = datetime.datetime(2022, 3, 25, 0, 0, tzinfo=settings_timezone)

    assert parse_date("", content) == expected
|
||||
|
||||
def test_date_format_26(self, settings_timezone: ZoneInfo) -> None:
    """A ``Month DD, YYYY`` date surrounded by other text is parsed."""
    content = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. P0 Box 182051"
    expected = datetime.datetime(2019, 9, 25, 0, 0, tzinfo=settings_timezone)

    assert parse_date("", content) == expected
|
||||
|
||||
def test_crazy_date_past(self) -> None:
    """No date is returned for content dated in the year 0590."""
    parsed = parse_date("", "01-07-0590 00:00:00")

    assert parsed is None
|
||||
|
||||
def test_crazy_date_future(self) -> None:
    """No date is returned for content dated in the year 2350."""
    parsed = parse_date("", "01-07-2350 00:00:00")

    assert parsed is None
|
||||
|
||||
def test_crazy_date_with_spaces(self) -> None:
    """No date is returned for the space-separated string ``20 408000l 2475``."""
    parsed = parse_date("", "20 408000l 2475")

    assert parsed is None
|
||||
|
||||
def test_utf_month_names(
    self,
    settings: SettingsWrapper,
    settings_timezone: ZoneInfo,
) -> None:
    """Month names containing non-ASCII characters are parsed from content.

    ``DATE_PARSER_LANGUAGES`` is set to six language codes before parsing.
    Each sample must resolve to midnight of the listed day in
    ``settings_timezone``.
    """
    settings.DATE_PARSER_LANGUAGES = ["fr", "de", "hr", "cs", "pl", "tr"]

    # (content, (year, month, day)) pairs, asserted in this order.
    samples = (
        ("13 décembre 2023", (2023, 12, 13)),
        ("13 août 2022", (2022, 8, 13)),
        ("11 März 2020", (2020, 3, 11)),
        ("17. ožujka 2018.", (2018, 3, 17)),
        ("1. veljače 2016.", (2016, 2, 1)),
        ("15. února 1985", (1985, 2, 15)),
        ("30. září 2011", (2011, 9, 30)),
        ("28. května 1990", (1990, 5, 28)),
        ("1. grudzień 1997", (1997, 12, 1)),
        ("17 Şubat 2024", (2024, 2, 17)),
        ("30 Ağustos 2012", (2012, 8, 30)),
        ("17 Eylül 2000", (2000, 9, 17)),
        ("5. október 1992", (1992, 10, 5)),
    )

    for content, (year, month, day) in samples:
        expected = datetime.datetime(
            year,
            month,
            day,
            0,
            0,
            tzinfo=settings_timezone,
        )
        assert parse_date("", content) == expected
|
||||
|
||||
def test_multiple_dates(self, settings_timezone: ZoneInfo) -> None:
    """All acceptable dates in the content are yielded, in order of appearance.

    The content holds four date-like strings; the generator is expected to
    produce the first three and nothing for ``24-12-9999``.
    """
    text = """This text has multiple dates.
For example 02.02.2018, 22 July 2022 and December 2021.
But not 24-12-9999 because it's in the future..."""

    # "December 2021" carries no day; the expected value is the 1st.
    expected = [
        datetime.datetime(year, month, day, 0, 0, tzinfo=settings_timezone)
        for year, month, day in ((2018, 2, 2), (2022, 7, 22), (2021, 12, 1))
    ]

    found = [*parse_date_generator("", text)]

    assert found == expected
|
||||
|
||||
def test_filename_date_parse_valid_ymd(
    self,
    settings: SettingsWrapper,
    settings_timezone: ZoneInfo,
) -> None:
    """
    GIVEN:
        - FILENAME_DATE_ORDER is set to Year Month Day (YMD)
        - Filename contains a date in that order
        - File content contains no date

    THEN:
        - The date is taken from the filename
    """
    settings.FILENAME_DATE_ORDER = "YMD"

    filename = "/tmp/Scan-2022-04-01.pdf"
    content = "No date in here"
    expected = datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone)

    assert parse_date(filename, content) == expected
|
||||
|
||||
def test_filename_date_parse_valid_dmy(
    self,
    settings: SettingsWrapper,
    settings_timezone: ZoneInfo,
) -> None:
    """
    GIVEN:
        - FILENAME_DATE_ORDER is set to Day Month Year (DMY)
        - Filename contains a date in that order
        - File content contains no date

    THEN:
        - The date is taken from the filename
    """
    settings.FILENAME_DATE_ORDER = "DMY"

    filename = "/tmp/Scan-10.01.2021.pdf"
    content = "No date in here"
    expected = datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone)

    assert parse_date(filename, content) == expected
|
||||
|
||||
def test_filename_date_parse_invalid(self, settings: SettingsWrapper) -> None:
    """
    GIVEN:
        - FILENAME_DATE_ORDER is set to Year Month Day (YMD)
        - Filename includes no date
        - File content includes no date

    THEN:
        - No date is parsed
    """
    settings.FILENAME_DATE_ORDER = "YMD"

    filename = "/tmp/20 408000l 2475 - test.pdf"
    content = "No date in here"

    assert parse_date(filename, content) is None
|
||||
|
||||
def test_filename_date_ignored_use_content(
    self,
    settings: SettingsWrapper,
    settings_timezone: ZoneInfo,
) -> None:
    """
    GIVEN:
        - FILENAME_DATE_ORDER is set to Year Month Day (YMD)
        - Filename contains a date in that order
        - That filename date is listed in IGNORE_DATES
        - File content includes a different date

    THEN:
        - The date is taken from the content, not the filename
    """
    settings.FILENAME_DATE_ORDER = "YMD"
    settings.IGNORE_DATES = (datetime.date(2022, 4, 1),)

    filename = "/tmp/Scan-2022-04-01.pdf"
    content = "The matching date is 24.03.2022"
    expected = datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone)

    assert parse_date(filename, content) == expected
|
||||
|
||||
def test_ignored_dates_default_order(
    self,
    settings: SettingsWrapper,
    settings_timezone: ZoneInfo,
) -> None:
    """
    GIVEN:
        - Ignore dates have been set
        - File content includes ignored dates
        - File content includes 1 non-ignored date

    THEN:
        - The non-ignored date is parsed from the content
    """
    settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))

    content = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum"
    expected = datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)

    assert parse_date("", content) == expected
|
||||
|
||||
def test_ignored_dates_order_ymd(
    self,
    settings: SettingsWrapper,
    settings_timezone: ZoneInfo,
) -> None:
    """
    GIVEN:
        - Ignore dates have been set
        - FILENAME_DATE_ORDER is set to Year Month Day (YMD)
        - File content includes ignored dates
        - File content includes 1 non-ignored date

    THEN:
        - The non-ignored date is parsed from the content
    """
    # NOTE(review): only the filename date order is overridden here, while the
    # dates under test are in the content - confirm that is the intended setting.
    settings.FILENAME_DATE_ORDER = "YMD"
    settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))

    content = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum"
    expected = datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)

    assert parse_date("", content) == expected
|
||||
Reference in New Issue
Block a user