Compare commits

..

12 Commits

Author SHA1 Message Date
Trenton H
972f9a069c One more tuple here 2026-01-28 15:57:33 -08:00
Trenton H
bd99fb66cf Resolves Sonarr issues 2026-01-28 15:50:11 -08:00
Trenton H
7704bc5399 To enable cleanup, use as a context manager 2026-01-28 15:45:27 -08:00
Trenton H
a055de0ce4 Restores environment 2026-01-28 15:25:14 -08:00
Trenton H
e0fdf1caa9 Adds example type checking configuration, with a default broad ignore and a tight scoped check 2026-01-28 15:19:22 -08:00
Trenton H
f80ae51a7d Two more missed 2026-01-28 14:44:06 -08:00
Trenton H
e101019924 Got to update the tests too 2026-01-28 14:33:48 -08:00
Trenton H
7afc8ceb24 Change the contract, just take the actual filename, not the file path 2026-01-28 14:24:14 -08:00
Trenton H
dfe0012872 Forgot the marker again 2026-01-28 14:14:11 -08:00
Trenton H
32771391ad Hooks up the class and fixes up the old testing. Includes ocr to date parser conversion we now do 2026-01-28 14:13:29 -08:00
Trenton H
9b7ae1c8ea Copy over the code and tests, to see if this even works 2026-01-28 13:54:53 -08:00
Trenton H
66593ec660 Chore: Bulk backend updates (#11543) 2026-01-28 13:30:12 -08:00
16 changed files with 3343 additions and 2462 deletions

View File

@@ -37,7 +37,7 @@ repos:
- json - json
# See https://github.com/prettier/prettier/issues/15742 for the fork reason # See https://github.com/prettier/prettier/issues/15742 for the fork reason
- repo: https://github.com/rbubley/mirrors-prettier - repo: https://github.com/rbubley/mirrors-prettier
rev: 'v3.6.2' rev: 'v3.8.1'
hooks: hooks:
- id: prettier - id: prettier
types_or: types_or:
@@ -49,7 +49,7 @@ repos:
- 'prettier-plugin-organize-imports@4.1.0' - 'prettier-plugin-organize-imports@4.1.0'
# Python hooks # Python hooks
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.5 rev: v0.14.14
hooks: hooks:
- id: ruff-check - id: ruff-check
- id: ruff-format - id: ruff-format
@@ -76,7 +76,7 @@ repos:
hooks: hooks:
- id: shellcheck - id: shellcheck
- repo: https://github.com/google/yamlfmt - repo: https://github.com/google/yamlfmt
rev: v0.20.0 rev: v0.21.0
hooks: hooks:
- id: yamlfmt - id: yamlfmt
exclude: "^src-ui/pnpm-lock.yaml" exclude: "^src-ui/pnpm-lock.yaml"

View File

@@ -16,18 +16,17 @@ classifiers = [
# This will allow testing to not install a webserver, mysql, etc # This will allow testing to not install a webserver, mysql, etc
dependencies = [ dependencies = [
"adrf~=0.1.12",
"azure-ai-documentintelligence>=1.0.2", "azure-ai-documentintelligence>=1.0.2",
"babel>=2.17", "babel>=2.17",
"bleach~=6.3.0", "bleach~=6.3.0",
"celery[redis]~=5.5.1", "celery[redis]~=5.6.2",
"channels~=4.2", "channels~=4.2",
"channels-redis~=4.2", "channels-redis~=4.2",
"concurrent-log-handler~=0.9.25", "concurrent-log-handler~=0.9.25",
"dateparser~=1.2", "dateparser~=1.2",
# WARNING: django does not use semver. # WARNING: django does not use semver.
# Only patch versions are guaranteed to not introduce breaking changes. # Only patch versions are guaranteed to not introduce breaking changes.
"django~=5.2.5", "django~=5.2.10",
"django-allauth[mfa,socialaccount]~=65.13.1", "django-allauth[mfa,socialaccount]~=65.13.1",
"django-auditlog~=3.4.1", "django-auditlog~=3.4.1",
"django-cachalot~=2.8.0", "django-cachalot~=2.8.0",
@@ -80,7 +79,7 @@ dependencies = [
"torch~=2.9.1", "torch~=2.9.1",
"tqdm~=4.67.1", "tqdm~=4.67.1",
"watchfiles>=1.1.1", "watchfiles>=1.1.1",
"whitenoise~=6.9", "whitenoise~=6.11",
"whoosh-reloaded>=2.7.5", "whoosh-reloaded>=2.7.5",
"zxing-cpp~=2.3.0", "zxing-cpp~=2.3.0",
] ]
@@ -89,13 +88,13 @@ optional-dependencies.mariadb = [
"mysqlclient~=2.2.7", "mysqlclient~=2.2.7",
] ]
optional-dependencies.postgres = [ optional-dependencies.postgres = [
"psycopg[c,pool]==3.2.12", "psycopg[c,pool]==3.3",
# Direct dependency for proper resolution of the pre-built wheels # Direct dependency for proper resolution of the pre-built wheels
"psycopg-c==3.2.12", "psycopg-c==3.3",
"psycopg-pool==3.3", "psycopg-pool==3.3",
] ]
optional-dependencies.webserver = [ optional-dependencies.webserver = [
"granian[uvloop]~=2.5.1", "granian[uvloop]~=2.6.0",
] ]
[dependency-groups] [dependency-groups]
@@ -153,7 +152,7 @@ typing = [
] ]
[tool.uv] [tool.uv]
required-version = ">=0.5.14" required-version = ">=0.9.0"
package = false package = false
environments = [ environments = [
"sys_platform == 'darwin'", "sys_platform == 'darwin'",
@@ -163,8 +162,8 @@ environments = [
[tool.uv.sources] [tool.uv.sources]
# Markers are chosen to select these almost exclusively when building the Docker image # Markers are chosen to select these almost exclusively when building the Docker image
psycopg-c = [ psycopg-c = [
{ url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-bookworm-3.2.12/psycopg_c-3.2.12-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux' and platform_machine == 'x86_64' and python_version == '3.12'" }, { url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-trixie-3.3.0/psycopg_c-3.3.0-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux' and platform_machine == 'x86_64' and python_version == '3.12'" },
{ url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-bookworm-3.2.12/psycopg_c-3.2.12-cp312-cp312-linux_aarch64.whl", marker = "sys_platform == 'linux' and platform_machine == 'aarch64' and python_version == '3.12'" }, { url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-trixie-3.3.0/psycopg_c-3.3.0-cp312-cp312-linux_aarch64.whl", marker = "sys_platform == 'linux' and platform_machine == 'aarch64' and python_version == '3.12'" },
] ]
zxing-cpp = [ zxing-cpp = [
{ url = "https://github.com/paperless-ngx/builder/releases/download/zxing-2.3.0/zxing_cpp-2.3.0-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux' and platform_machine == 'x86_64' and python_version == '3.12'" }, { url = "https://github.com/paperless-ngx/builder/releases/download/zxing-2.3.0/zxing_cpp-2.3.0-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux' and platform_machine == 'x86_64' and python_version == '3.12'" },
@@ -307,6 +306,7 @@ markers = [
"gotenberg: Tests requiring Gotenberg service", "gotenberg: Tests requiring Gotenberg service",
"tika: Tests requiring Tika service", "tika: Tests requiring Tika service",
"greenmail: Tests requiring Greenmail service", "greenmail: Tests requiring Greenmail service",
"date_parsing: Tests which cover date parsing from content or filename",
] ]
[tool.pytest_env] [tool.pytest_env]
@@ -333,6 +333,10 @@ exclude_also = [
[tool.mypy] [tool.mypy]
mypy_path = "src" mypy_path = "src"
files = [
"src/documents/plugins/date_parsing",
"src/documents/tests/date_parsing",
]
plugins = [ plugins = [
"mypy_django_plugin.main", "mypy_django_plugin.main",
"mypy_drf_plugin.main", "mypy_drf_plugin.main",
@@ -344,5 +348,28 @@ disallow_untyped_defs = true
warn_redundant_casts = true warn_redundant_casts = true
warn_unused_ignores = true warn_unused_ignores = true
# This prevents errors from imports, but allows type-checking logic to work
follow_imports = "silent"
[[tool.mypy.overrides]]
module = [
"documents.*",
"paperless.*",
"paperless_ai.*",
"paperless_mail.*",
"paperless_tesseract.*",
"paperless_remote.*",
"paperless_text.*",
"paperless_tika.*",
]
ignore_errors = true
[[tool.mypy.overrides]]
module = [
"documents.plugins.date_parsing.*",
"documents.tests.date_parsing.*",
]
ignore_errors = false
[tool.django-stubs] [tool.django-stubs]
django_settings_module = "paperless.settings" django_settings_module = "paperless.settings"

View File

@@ -32,12 +32,12 @@ from documents.models import WorkflowTrigger
from documents.parsers import DocumentParser from documents.parsers import DocumentParser
from documents.parsers import ParseError from documents.parsers import ParseError
from documents.parsers import get_parser_class_for_mime_type from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import parse_date
from documents.permissions import set_permissions_for_object from documents.permissions import set_permissions_for_object
from documents.plugins.base import AlwaysRunPluginMixin from documents.plugins.base import AlwaysRunPluginMixin
from documents.plugins.base import ConsumeTaskPlugin from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import NoCleanupPluginMixin from documents.plugins.base import NoCleanupPluginMixin
from documents.plugins.base import NoSetupPluginMixin from documents.plugins.base import NoSetupPluginMixin
from documents.plugins.date_parsing import get_date_parser
from documents.plugins.helpers import ProgressManager from documents.plugins.helpers import ProgressManager
from documents.plugins.helpers import ProgressStatusOptions from documents.plugins.helpers import ProgressStatusOptions
from documents.signals import document_consumption_finished from documents.signals import document_consumption_finished
@@ -426,7 +426,8 @@ class ConsumerPlugin(
ProgressStatusOptions.WORKING, ProgressStatusOptions.WORKING,
ConsumerStatusShortMessage.PARSE_DATE, ConsumerStatusShortMessage.PARSE_DATE,
) )
date = parse_date(self.filename, text) with get_date_parser() as date_parser:
date = next(date_parser.parse(self.filename, text), None)
archive_path = document_parser.get_archive_path() archive_path = document_parser.get_archive_path()
page_count = document_parser.get_page_count(self.working_copy, mime_type) page_count = document_parser.get_page_count(self.working_copy, mime_type)

View File

@@ -9,22 +9,17 @@ import subprocess
import tempfile import tempfile
from functools import lru_cache from functools import lru_cache
from pathlib import Path from pathlib import Path
from re import Match
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from django.conf import settings from django.conf import settings
from django.utils import timezone
from documents.loggers import LoggingMixin from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration from documents.signals import document_consumer_declaration
from documents.utils import copy_file_with_basic_stats from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess from documents.utils import run_subprocess
from paperless.config import OcrConfig
from paperless.utils import ocr_to_dateparser_languages
if TYPE_CHECKING: if TYPE_CHECKING:
import datetime import datetime
from collections.abc import Iterator
# This regular expression will try to find dates in the document at # This regular expression will try to find dates in the document at
# hand and will match the following formats: # hand and will match the following formats:
@@ -259,75 +254,6 @@ def make_thumbnail_from_pdf(in_path: Path, temp_dir: Path, logging_group=None) -
return out_path return out_path
def parse_date(filename, text) -> datetime.datetime | None:
return next(parse_date_generator(filename, text), None)
def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
"""
Returns the date of the document.
"""
def __parser(ds: str, date_order: str) -> datetime.datetime:
"""
Call dateparser.parse with a particular date ordering
"""
import dateparser
ocr_config = OcrConfig()
languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
ocr_config.language,
)
return dateparser.parse(
ds,
settings={
"DATE_ORDER": date_order,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE": True,
"TIMEZONE": settings.TIME_ZONE,
},
locales=languages,
)
def __filter(date: datetime.datetime) -> datetime.datetime | None:
if (
date is not None
and date.year > 1900
and date <= timezone.now()
and date.date() not in settings.IGNORE_DATES
):
return date
return None
def __process_match(
match: Match[str],
date_order: str,
) -> datetime.datetime | None:
date_string = match.group(0)
try:
date = __parser(date_string, date_order)
except Exception:
# Skip all matches that do not parse to a proper date
date = None
return __filter(date)
def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
for m in re.finditer(DATE_REGEX, content):
date = __process_match(m, date_order)
if date is not None:
yield date
# if filename date parsing is enabled, search there first:
if settings.FILENAME_DATE_ORDER:
yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
# Iterate through all regex matches in text and try to parse the date
yield from __process_content(text, settings.DATE_ORDER)
class ParseError(Exception): class ParseError(Exception):
pass pass

View File

@@ -0,0 +1,92 @@
import logging
from functools import lru_cache
from importlib.metadata import EntryPoint
from importlib.metadata import entry_points
from typing import Final
from django.conf import settings
from django.utils import timezone
from documents.plugins.date_parsing.base import DateParserConfig
from documents.plugins.date_parsing.base import DateParserPluginBase
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
from paperless.utils import ocr_to_dateparser_languages
logger = logging.getLogger(__name__)
DATE_PARSER_ENTRY_POINT_GROUP: Final = "paperless_ngx.date_parsers"
@lru_cache(maxsize=1)
def _discover_parser_class() -> type[DateParserPluginBase]:
    """
    Discovers the date parser plugin class to use.

    - If one or more plugins are found, sorts them by name and returns the first.
    - If no plugins are found, returns the default RegexDateParser.

    The result is cached (lru_cache) so entry point discovery runs once per
    process.
    """
    eps: tuple[EntryPoint, ...]
    try:
        eps = entry_points(group=DATE_PARSER_ENTRY_POINT_GROUP)
    except Exception as e:
        # Discovery failures must never break consumption; log and fall back.
        logger.warning(f"Could not query entry points for date parsers: {e}")
        eps = ()
    # Keep the loaded class next to the entry point name so we never call
    # ep.load() twice: a second load could re-run import side effects or fail
    # differently from the first (the original returned valid_plugins[0].load()).
    valid_plugins: list[tuple[str, type[DateParserPluginBase]]] = []
    for ep in eps:
        try:
            plugin_class = ep.load()
            if plugin_class and issubclass(plugin_class, DateParserPluginBase):
                valid_plugins.append((ep.name, plugin_class))
            else:
                logger.warning(f"Plugin {ep.name} does not subclass DateParser.")
        except Exception as e:
            logger.error(f"Unable to load date parser plugin {ep.name}: {e}")
    if not valid_plugins:
        return RegexDateParserPlugin
    # Deterministic selection: alphabetical by entry point name.
    valid_plugins.sort(key=lambda item: item[0])
    if len(valid_plugins) > 1:
        logger.warning(
            f"Multiple date parsers found: "
            f"{[name for name, _ in valid_plugins]}. "
            f"Using the first one by name: '{valid_plugins[0][0]}'.",
        )
    return valid_plugins[0][1]
def get_date_parser() -> DateParserPluginBase:
    """
    Factory function to get an initialized date parser instance.

    This function is responsible for:
    1. Discovering the correct parser class (plugin or default).
    2. Loading configuration from Django settings.
    3. Instantiating the parser with the configuration.
    """
    # 1. Discover the class (this is cached)
    parser_class = _discover_parser_class()
    # 2. Load configuration from settings
    # TODO: Get the language from the settings and/or configuration object, depending
    # (fix: the original accidentally wrote "languages = languages = (...)")
    languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
        settings.OCR_LANGUAGE,
    )
    config = DateParserConfig(
        languages=languages,
        timezone_str=settings.TIME_ZONE,
        ignore_dates=settings.IGNORE_DATES,
        reference_time=timezone.now(),
        filename_date_order=settings.FILENAME_DATE_ORDER,
        content_date_order=settings.DATE_ORDER,
    )
    # 3. Instantiate the discovered class with the config
    return parser_class(config=config)

View File

@@ -0,0 +1,124 @@
import datetime
import logging
from abc import ABC
from abc import abstractmethod
from collections.abc import Iterator
from dataclasses import dataclass
from types import TracebackType
try:
from typing import Self
except ImportError:
from typing_extensions import Self
import dateparser
logger = logging.getLogger(__name__)
@dataclass(frozen=True, slots=True)
class DateParserConfig:
    """
    Configuration for a DateParser instance.
    This object is created by the factory and passed to the
    parser's constructor, decoupling the parser from settings.
    """

    # Locale codes handed to dateparser (e.g. ["en", "de"]).
    languages: list[str]
    # Timezone name (e.g. "UTC", "America/New_York") used to make parsed
    # dates timezone-aware.
    timezone_str: str
    # Calendar dates that must never be accepted as a document date.
    ignore_dates: set[datetime.date]
    # A "now" timestamp for filtering future dates.
    # Passed in by the factory.
    reference_time: datetime.datetime
    # Settings for the default RegexDateParser
    # Other plugins should use or consider these, but it is not required
    filename_date_order: str | None
    content_date_order: str
class DateParserPluginBase(ABC):
    """
    Abstract base class for date parsing strategies.

    Instances are configured via a DateParserConfig object and may be used
    as context managers so subclasses can acquire/release resources.
    """

    def __init__(self, config: DateParserConfig):
        """Store the configuration this parser operates with."""
        self.config = config

    def __enter__(self) -> Self:
        """
        Enter the runtime context related to this object.

        The default acquires nothing; subclasses may override to open
        connections or other handles.
        """
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """
        Exit the runtime context related to this object.

        The default releases nothing and returns None, so any in-flight
        exception propagates to the caller.
        """

    def _parse_string(
        self,
        date_string: str,
        date_order: str,
    ) -> datetime.datetime | None:
        """
        Parse a single candidate string with dateparser, using the locales
        and timezone from `self.config`.  Returns None on any failure.
        """
        parser_settings = {
            "DATE_ORDER": date_order,
            "PREFER_DAY_OF_MONTH": "first",
            "RETURN_AS_TIMEZONE_AWARE": True,
            "TIMEZONE": self.config.timezone_str,
        }
        try:
            return dateparser.parse(
                date_string,
                settings=parser_settings,
                locales=self.config.languages,
            )
        except Exception as e:
            logger.error(f"Error while parsing date string '{date_string}': {e}")
            return None

    def _filter_date(
        self,
        date: datetime.datetime | None,
    ) -> datetime.datetime | None:
        """
        Validate a parsed datetime against the configured constraints.

        Rejects None, years <= 1900, dates after the reference time, and
        dates in the configured ignore list.
        """
        if date is None:
            return None
        if date.year <= 1900:
            return None
        if date > self.config.reference_time:
            return None
        if date.date() in self.config.ignore_dates:
            return None
        return date

    @abstractmethod
    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
        """
        Parses a document's filename and content, yielding valid datetime objects.
        """

View File

@@ -0,0 +1,65 @@
import datetime
import re
from collections.abc import Iterator
from re import Match
from documents.plugins.date_parsing.base import DateParserPluginBase
class RegexDateParserPlugin(DateParserPluginBase):
    """
    The default date parser, using a series of regular expressions.

    All behaviour is driven by the DateParserConfig supplied to the
    constructor; this class holds no other state.
    """

    # Matches numeric dates in several separators/orders plus a handful of
    # textual month formats, anchored at word (or underscore/dash) boundaries.
    DATE_REGEX = re.compile(
        r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
        re.IGNORECASE,
    )

    def _process_match(
        self,
        match: Match[str],
        date_order: str,
    ) -> datetime.datetime | None:
        """Parse and validate one regex match via the base-class helpers."""
        candidate = self._parse_string(match.group(0), date_order)
        return self._filter_date(candidate)

    def _process_content(
        self,
        content: str,
        date_order: str,
    ) -> Iterator[datetime.datetime]:
        """Scan content for date-like substrings, yielding the valid ones."""
        for candidate_match in re.finditer(self.DATE_REGEX, content):
            parsed = self._process_match(candidate_match, date_order)
            if parsed is None:
                continue
            yield parsed

    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
        """
        Implementation of the abstract parse method.

        Yields dates found in the filename first (when a filename date order
        is configured), then dates found in the content.
        """
        filename_order = self.config.filename_date_order
        if filename_order:
            yield from self._process_content(filename, filename_order)
        yield from self._process_content(content, self.config.content_date_order)

View File

@@ -0,0 +1,82 @@
import datetime
from collections.abc import Generator
from typing import Any
import pytest
import pytest_django
from documents.plugins.date_parsing import _discover_parser_class
from documents.plugins.date_parsing.base import DateParserConfig
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
@pytest.fixture
def base_config() -> DateParserConfig:
    """A minimal, permissive configuration for date parser tests."""
    reference = datetime.datetime(
        year=2024,
        month=1,
        day=15,
        hour=12,
        tzinfo=datetime.timezone.utc,
    )
    return DateParserConfig(
        languages=["en"],
        timezone_str="UTC",
        ignore_dates=set(),
        reference_time=reference,
        filename_date_order="YMD",
        content_date_order="DMY",
    )
@pytest.fixture
def config_with_ignore_dates() -> DateParserConfig:
    """A configuration that blacklists two specific calendar dates."""
    blocked = {
        datetime.date(2024, 1, 1),
        datetime.date(2024, 12, 25),
    }
    reference = datetime.datetime(
        year=2024,
        month=1,
        day=15,
        hour=12,
        tzinfo=datetime.timezone.utc,
    )
    return DateParserConfig(
        languages=["en", "de"],
        timezone_str="America/New_York",
        ignore_dates=blocked,
        reference_time=reference,
        filename_date_order="DMY",
        content_date_order="MDY",
    )
@pytest.fixture
def regex_parser(base_config: DateParserConfig) -> RegexDateParserPlugin:
    """A RegexDateParserPlugin built from the basic configuration."""
    parser = RegexDateParserPlugin(base_config)
    return parser
@pytest.fixture
def clear_lru_cache() -> Generator[None, None, None]:
    """
    Ensure the LRU cache for _discover_parser_class is cleared
    before and after any test that depends on it.

    The teardown clear runs in a ``finally`` block so the cache is reset
    even if an exception is thrown into the generator during teardown,
    keeping later tests isolated from stale discovery results.
    """
    _discover_parser_class.cache_clear()
    try:
        yield
    finally:
        _discover_parser_class.cache_clear()
@pytest.fixture
def mock_date_parser_settings(settings: pytest_django.fixtures.SettingsWrapper) -> Any:
    """
    Override Django settings for the duration of date parser tests.
    """
    overrides = {
        "DATE_PARSER_LANGUAGES": ["en", "de"],
        "TIME_ZONE": "UTC",
        "IGNORE_DATES": [datetime.date(1900, 1, 1)],
        "FILENAME_DATE_ORDER": "YMD",
        "DATE_ORDER": "DMY",
    }
    for name, value in overrides.items():
        setattr(settings, name, value)
    return settings

View File

@@ -0,0 +1,228 @@
import datetime
import logging
from collections.abc import Iterator
from importlib.metadata import EntryPoint
import pytest
import pytest_mock
from django.utils import timezone
from documents.plugins.date_parsing import DATE_PARSER_ENTRY_POINT_GROUP
from documents.plugins.date_parsing import _discover_parser_class
from documents.plugins.date_parsing import get_date_parser
from documents.plugins.date_parsing.base import DateParserConfig
from documents.plugins.date_parsing.base import DateParserPluginBase
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
class AlphaParser(DateParserPluginBase):
    """Stand-in plugin for discovery tests; yields the current time."""

    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
        now = timezone.now()
        yield now
class BetaParser(DateParserPluginBase):
    """Second stand-in plugin, used to exercise name-ordered selection."""

    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
        now = timezone.now()
        yield now
@pytest.mark.date_parsing
@pytest.mark.usefixtures("clear_lru_cache")
class TestDiscoverParserClass:
    """Tests for the _discover_parser_class() function."""

    def test_returns_default_when_no_plugins_found(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        # No entry points registered -> the built-in regex parser is used.
        mocker.patch(
            "documents.plugins.date_parsing.entry_points",
            return_value=(),
        )
        result = _discover_parser_class()
        assert result is RegexDateParserPlugin

    def test_returns_default_when_entrypoint_query_fails(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        # entry_points() itself raising must not propagate: a warning is
        # logged and the default parser is returned.
        mocker.patch(
            "documents.plugins.date_parsing.entry_points",
            side_effect=RuntimeError("boom"),
        )
        result = _discover_parser_class()
        assert result is RegexDateParserPlugin
        assert "Could not query entry points" in caplog.text

    def test_filters_out_invalid_plugins(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        # A plugin whose loaded object is not a DateParserPluginBase subclass
        # is skipped with a warning, and the default is used instead.
        fake_ep = mocker.MagicMock(spec=EntryPoint)
        fake_ep.name = "bad_plugin"
        fake_ep.load.return_value = object  # not subclass of DateParser
        mocker.patch(
            "documents.plugins.date_parsing.entry_points",
            return_value=(fake_ep,),
        )
        result = _discover_parser_class()
        assert result is RegexDateParserPlugin
        assert "does not subclass DateParser" in caplog.text

    def test_skips_plugins_that_fail_to_load(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        # An ImportError during ep.load() is logged and the plugin ignored.
        fake_ep = mocker.MagicMock(spec=EntryPoint)
        fake_ep.name = "failing_plugin"
        fake_ep.load.side_effect = ImportError("cannot import")
        mocker.patch(
            "documents.plugins.date_parsing.entry_points",
            return_value=(fake_ep,),
        )
        result = _discover_parser_class()
        assert result is RegexDateParserPlugin
        assert "Unable to load date parser plugin failing_plugin" in caplog.text

    def test_returns_single_valid_plugin_without_warning(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """If exactly one valid plugin is discovered, it should be returned without logging a warning."""
        ep = mocker.MagicMock(spec=EntryPoint)
        ep.name = "alpha"
        ep.load.return_value = AlphaParser
        mock_entry_points = mocker.patch(
            "documents.plugins.date_parsing.entry_points",
            return_value=(ep,),
        )
        with caplog.at_level(
            logging.WARNING,
            logger="documents.plugins.date_parsing",
        ):
            result = _discover_parser_class()
        # It should have called entry_points with the correct group
        mock_entry_points.assert_called_once_with(group=DATE_PARSER_ENTRY_POINT_GROUP)
        # The discovered class should be exactly our AlphaParser
        assert result is AlphaParser
        # No warnings should have been logged
        assert not any(
            "Multiple date parsers found" in record.message for record in caplog.records
        ), "Unexpected warning logged when only one plugin was found"

    def test_returns_first_valid_plugin_by_name(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        # Entry points are provided out of order; selection must be by name.
        ep_a = mocker.MagicMock(spec=EntryPoint)
        ep_a.name = "alpha"
        ep_a.load.return_value = AlphaParser
        ep_b = mocker.MagicMock(spec=EntryPoint)
        ep_b.name = "beta"
        ep_b.load.return_value = BetaParser
        mocker.patch(
            "documents.plugins.date_parsing.entry_points",
            return_value=(ep_b, ep_a),
        )
        result = _discover_parser_class()
        assert result is AlphaParser

    def test_logs_warning_if_multiple_plugins_found(
        self,
        mocker: pytest_mock.MockerFixture,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        # Two valid plugins: still deterministic, but a warning is emitted.
        ep1 = mocker.MagicMock(spec=EntryPoint)
        ep1.name = "a"
        ep1.load.return_value = AlphaParser
        ep2 = mocker.MagicMock(spec=EntryPoint)
        ep2.name = "b"
        ep2.load.return_value = BetaParser
        mocker.patch(
            "documents.plugins.date_parsing.entry_points",
            return_value=(ep1, ep2),
        )
        with caplog.at_level(
            logging.WARNING,
            logger="documents.plugins.date_parsing",
        ):
            result = _discover_parser_class()
        # Should select alphabetically first plugin ("a")
        assert result is AlphaParser
        # Should log a warning mentioning multiple parsers
        assert any(
            "Multiple date parsers found" in record.message for record in caplog.records
        ), "Expected a warning about multiple date parsers"

    def test_cache_behavior_only_runs_once(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        # lru_cache(maxsize=1) means discovery runs once per process.
        mock_entry_points = mocker.patch(
            "documents.plugins.date_parsing.entry_points",
            return_value=(),
        )
        # First call populates cache
        _discover_parser_class()
        # Second call should not re-invoke entry_points
        _discover_parser_class()
        mock_entry_points.assert_called_once()
@pytest.mark.date_parsing
@pytest.mark.usefixtures("mock_date_parser_settings")
class TestGetDateParser:
    """Tests for the get_date_parser() factory function."""

    def test_returns_instance_of_discovered_class(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        # Force discovery to hand back our stand-in plugin class.
        mocker.patch(
            "documents.plugins.date_parsing._discover_parser_class",
            return_value=AlphaParser,
        )
        parser = get_date_parser()
        assert isinstance(parser, AlphaParser)
        assert isinstance(parser.config, DateParserConfig)
        # The config must mirror the overridden Django settings.
        config = parser.config
        assert config.languages == ["en", "de"]
        assert config.timezone_str == "UTC"
        assert config.ignore_dates == [datetime.date(1900, 1, 1)]
        assert config.filename_date_order == "YMD"
        assert config.content_date_order == "DMY"
        # reference_time should be "now" within a small tolerance.
        age_seconds = (timezone.now() - config.reference_time).total_seconds()
        assert abs(age_seconds) < 2

    def test_uses_default_regex_parser_when_no_plugins(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        # With the default class discovered, the factory builds a regex parser.
        mocker.patch(
            "documents.plugins.date_parsing._discover_parser_class",
            return_value=RegexDateParserPlugin,
        )
        parser = get_date_parser()
        assert isinstance(parser, RegexDateParserPlugin)

View File

@@ -0,0 +1,433 @@
import datetime
import logging
from typing import Any
import pytest
import pytest_mock
from documents.plugins.date_parsing.base import DateParserConfig
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
@pytest.mark.date_parsing
class TestParseString:
    """Tests for DateParser._parse_string method via RegexDateParser."""

    @pytest.mark.parametrize(
        ("date_string", "date_order", "expected_year"),
        [
            pytest.param("15/01/2024", "DMY", 2024, id="dmy_slash"),
            pytest.param("01/15/2024", "MDY", 2024, id="mdy_slash"),
            pytest.param("2024/01/15", "YMD", 2024, id="ymd_slash"),
            pytest.param("January 15, 2024", "DMY", 2024, id="month_name_comma"),
            pytest.param("15 Jan 2024", "DMY", 2024, id="day_abbr_month_year"),
            pytest.param("15.01.2024", "DMY", 2024, id="dmy_dot"),
            pytest.param("2024-01-15", "YMD", 2024, id="ymd_dash"),
        ],
    )
    def test_parse_string_valid_formats(
        self,
        regex_parser: RegexDateParserPlugin,
        date_string: str,
        date_order: str,
        expected_year: int,
    ) -> None:
        """Should correctly parse various valid date formats."""
        result = regex_parser._parse_string(date_string, date_order)
        assert result is not None
        assert result.year == expected_year

    @pytest.mark.parametrize(
        "invalid_string",
        [
            pytest.param("not a date", id="plain_text"),
            pytest.param("32/13/2024", id="invalid_day_month"),
            pytest.param("", id="empty_string"),
            pytest.param("abc123xyz", id="alphanumeric_gibberish"),
            pytest.param("99/99/9999", id="out_of_range"),
        ],
    )
    def test_parse_string_invalid_input(
        self,
        regex_parser: RegexDateParserPlugin,
        invalid_string: str,
    ) -> None:
        """Should return None for invalid date strings."""
        result = regex_parser._parse_string(invalid_string, "DMY")
        assert result is None

    def test_parse_string_handles_exceptions(
        self,
        caplog: pytest.LogCaptureFixture,
        mocker: pytest_mock.MockerFixture,
        regex_parser: RegexDateParserPlugin,
    ) -> None:
        """Should handle and log exceptions from dateparser gracefully."""
        with caplog.at_level(
            logging.ERROR,
            logger="documents.plugins.date_parsing.base",
        ):
            # We still need to mock dateparser.parse to force the exception
            mocker.patch(
                "documents.plugins.date_parsing.base.dateparser.parse",
                side_effect=ValueError(
                    "Parsing error: 01/01/2024",
                ),
            )
            # 1. Execute the function under test
            result = regex_parser._parse_string("01/01/2024", "DMY")
            assert result is None
            # Check if an error was logged
            assert len(caplog.records) == 1
            assert caplog.records[0].levelname == "ERROR"
            # Check if the specific error message is present
            assert "Error while parsing date string" in caplog.text
            # Optional: Check for the exact exception message if it's included in the log
            assert "Parsing error: 01/01/2024" in caplog.text
@pytest.mark.date_parsing
class TestFilterDate:
    """Tests for DateParser._filter_date method via RegexDateParser."""

    @pytest.mark.parametrize(
        ("date", "expected_output"),
        [
            # Valid Dates
            pytest.param(
                datetime.datetime(2024, 1, 10, tzinfo=datetime.timezone.utc),
                datetime.datetime(2024, 1, 10, tzinfo=datetime.timezone.utc),
                id="valid_past_date",
            ),
            pytest.param(
                datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc),
                datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc),
                id="exactly_at_reference",
            ),
            pytest.param(
                datetime.datetime(1901, 1, 1, tzinfo=datetime.timezone.utc),
                datetime.datetime(1901, 1, 1, tzinfo=datetime.timezone.utc),
                id="year_1901_valid",
            ),
            # Date is > reference_time
            pytest.param(
                datetime.datetime(2024, 1, 16, tzinfo=datetime.timezone.utc),
                None,
                id="future_date_day_after",
            ),
            # date.date() in ignore_dates
            pytest.param(
                datetime.datetime(2024, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc),
                None,
                id="ignored_date_midnight_jan1",
            ),
            pytest.param(
                datetime.datetime(2024, 1, 1, 10, 30, 0, tzinfo=datetime.timezone.utc),
                None,
                id="ignored_date_midday_jan1",
            ),
            pytest.param(
                datetime.datetime(2024, 12, 25, 15, 0, 0, tzinfo=datetime.timezone.utc),
                None,
                id="ignored_date_dec25_future",
            ),
            # date.year <= 1900
            pytest.param(
                datetime.datetime(1899, 12, 31, tzinfo=datetime.timezone.utc),
                None,
                id="year_1899",
            ),
            pytest.param(
                datetime.datetime(1900, 1, 1, tzinfo=datetime.timezone.utc),
                None,
                id="year_1900_boundary",
            ),
            # date is None
            pytest.param(None, None, id="none_input"),
        ],
    )
    def test_filter_date_validation_rules(
        self,
        config_with_ignore_dates: DateParserConfig,
        date: datetime.datetime | None,
        expected_output: datetime.datetime | None,
    ) -> None:
        """Should correctly validate dates against various rules."""
        parser = RegexDateParserPlugin(config_with_ignore_dates)
        result = parser._filter_date(date)
        assert result == expected_output

    def test_filter_date_respects_ignore_dates(
        self,
        config_with_ignore_dates: DateParserConfig,
    ) -> None:
        """Should filter out dates in the ignore_dates set."""
        parser = RegexDateParserPlugin(config_with_ignore_dates)
        # Both ignored days are blocked regardless of time-of-day.
        ignored_date = datetime.datetime(
            2024,
            1,
            1,
            12,
            0,
            tzinfo=datetime.timezone.utc,
        )
        another_ignored = datetime.datetime(
            2024,
            12,
            25,
            15,
            30,
            tzinfo=datetime.timezone.utc,
        )
        allowed_date = datetime.datetime(
            2024,
            1,
            2,
            12,
            0,
            tzinfo=datetime.timezone.utc,
        )
        assert parser._filter_date(ignored_date) is None
        assert parser._filter_date(another_ignored) is None
        assert parser._filter_date(allowed_date) == allowed_date

    def test_filter_date_timezone_aware(
        self,
        regex_parser: RegexDateParserPlugin,
    ) -> None:
        """Should work with timezone-aware datetimes."""
        date_utc = datetime.datetime(2024, 1, 10, 12, 0, tzinfo=datetime.timezone.utc)
        result = regex_parser._filter_date(date_utc)
        assert result is not None
        assert result.tzinfo is not None
@pytest.mark.date_parsing
class TestRegexDateParser:
    """End-to-end tests for RegexDateParserPlugin.parse.

    ``dateparser.parse`` is patched in every test so the outcomes are
    deterministic and independent of the real dateparser heuristics.
    """

    @pytest.mark.parametrize(
        ("filename", "content", "expected"),
        [
            pytest.param(
                "report-2023-12-25.txt",
                "Event recorded on 25/12/2022.",
                [
                    datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc),
                    datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
                ],
                id="filename-y-m-d_and_content-d-m-y",
            ),
            pytest.param(
                "img_2023.01.02.jpg",
                "Taken on 01/02/2023",
                [
                    datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc),
                    datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc),
                ],
                id="ambiguous-dates-respect-orders",
            ),
            pytest.param(
                "notes.txt",
                "bad date 99/99/9999 and 25/12/2022",
                [
                    datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
                ],
                id="parse-exception-skips-bad-and-yields-good",
            ),
        ],
    )
    def test_parse_returns_expected_dates(
        self,
        base_config: DateParserConfig,
        mocker: pytest_mock.MockerFixture,
        filename: str,
        content: str,
        expected: list[datetime.datetime],
    ) -> None:
        """
        High-level tests that exercise RegexDateParser.parse only.
        dateparser.parse is mocked so tests are deterministic.
        """
        parser = RegexDateParserPlugin(base_config)
        # Patch the dateparser.parse
        target = "documents.plugins.date_parsing.base.dateparser.parse"

        def fake_parse(
            date_string: str,
            settings: dict[str, Any] | None = None,
            locales: None = None,
        ) -> datetime.datetime | None:
            date_order = settings.get("DATE_ORDER") if settings else None
            # Filename-style YYYY-MM-DD / YYYY.MM.DD
            # (was a triple "or" that tested "2023-12-25" twice)
            if "2023-12-25" in date_string or "2023.12.25" in date_string:
                return datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc)
            # content DMY 25/12/2022
            if "25/12/2022" in date_string or "25-12-2022" in date_string:
                return datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc)
            # filename YMD 2023.01.02
            if "2023.01.02" in date_string or "2023-01-02" in date_string:
                return datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc)
            # ambiguous 01/02/2023 -> respect DATE_ORDER setting
            if "01/02/2023" in date_string:
                if date_order == "DMY":
                    return datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc)
                if date_order == "YMD":
                    return datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc)
                # fallback
                return datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc)
            # simulate parse failure for malformed input; the plugin is
            # expected to log and skip, not propagate
            if "99/99/9999" in date_string or "bad date" in date_string:
                raise Exception("parse failed for malformed date")
            return None

        mocker.patch(target, side_effect=fake_parse)
        results = list(parser.parse(filename, content))
        assert results == expected
        # Every yielded datetime must be timezone-aware.
        for dt in results:
            assert dt.tzinfo is not None

    def test_parse_filters_future_and_ignored_dates(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """
        Ensure parser filters out:
          - dates after reference_time
          - dates whose .date() are in ignore_dates
        """
        cfg = DateParserConfig(
            languages=["en"],
            timezone_str="UTC",
            ignore_dates={datetime.date(2023, 12, 10)},
            reference_time=datetime.datetime(
                2024,
                1,
                15,
                12,
                0,
                0,
                tzinfo=datetime.timezone.utc,
            ),
            filename_date_order="YMD",
            content_date_order="DMY",
        )
        parser = RegexDateParserPlugin(cfg)
        target = "documents.plugins.date_parsing.base.dateparser.parse"

        def fake_parse(
            date_string: str,
            settings: dict[str, Any] | None = None,
            locales: None = None,
        ) -> datetime.datetime | None:
            if "10/12/2023" in date_string or "10-12-2023" in date_string:
                # ignored date
                return datetime.datetime(2023, 12, 10, tzinfo=datetime.timezone.utc)
            if "01/02/2024" in date_string or "01-02-2024" in date_string:
                # future relative to reference_time -> filtered
                return datetime.datetime(2024, 2, 1, tzinfo=datetime.timezone.utc)
            if "05/01/2023" in date_string or "05-01-2023" in date_string:
                # valid
                return datetime.datetime(2023, 1, 5, tzinfo=datetime.timezone.utc)
            return None

        mocker.patch(target, side_effect=fake_parse)
        content = "Ignored: 10/12/2023, Future: 01/02/2024, Keep: 05/01/2023"
        results = list(parser.parse("whatever.txt", content))
        # Only the non-ignored, non-future date survives filtering.
        assert results == [datetime.datetime(2023, 1, 5, tzinfo=datetime.timezone.utc)]

    def test_parse_handles_no_matches_and_returns_empty_list(
        self,
        base_config: DateParserConfig,
    ) -> None:
        """
        When there are no matching date-like substrings, parse should yield nothing.
        """
        parser = RegexDateParserPlugin(base_config)
        results = list(
            parser.parse("no-dates.txt", "this has no dates whatsoever"),
        )
        assert results == []

    def test_parse_skips_filename_when_filename_date_order_none(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """
        When filename_date_order is None the parser must not attempt to parse the filename.
        Only dates found in the content should be passed to dateparser.parse.
        """
        cfg = DateParserConfig(
            languages=["en"],
            timezone_str="UTC",
            ignore_dates=set(),
            reference_time=datetime.datetime(
                2024,
                1,
                15,
                12,
                0,
                0,
                tzinfo=datetime.timezone.utc,
            ),
            filename_date_order=None,
            content_date_order="DMY",
        )
        parser = RegexDateParserPlugin(cfg)
        # Patch the module's dateparser.parse so we can inspect calls
        target = "documents.plugins.date_parsing.base.dateparser.parse"

        def fake_parse(
            date_string: str,
            settings: dict[str, Any] | None = None,
            locales: None = None,
        ) -> datetime.datetime | None:
            # return distinct datetimes so we can tell which source was parsed
            if "25/12/2022" in date_string:
                return datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc)
            if "2023-12-25" in date_string:
                return datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc)
            return None

        mock = mocker.patch(target, side_effect=fake_parse)
        filename = "report-2023-12-25.txt"
        content = "Event recorded on 25/12/2022."
        results = list(parser.parse(filename, content))
        # Only the content date should have been parsed -> one call
        assert mock.call_count == 1
        # First call, first positional argument
        called_date_string = mock.call_args_list[0].args[0]
        assert "25/12/2022" in called_date_string
        # And the parser should have yielded the corresponding datetime
        assert results == [
            datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
        ]

View File

@@ -1978,11 +1978,11 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/") response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.status_code, status.HTTP_200_OK)
@mock.patch("documents.parsers.parse_date_generator") @mock.patch("documents.views.get_date_parser")
@override_settings(NUMBER_OF_SUGGESTED_DATES=0) @override_settings(NUMBER_OF_SUGGESTED_DATES=0)
def test_get_suggestions_dates_disabled( def test_get_suggestions_dates_disabled(
self, self,
parse_date_generator, mock_get_date_parser: mock.MagicMock,
): ):
""" """
GIVEN: GIVEN:
@@ -1999,7 +1999,8 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
) )
self.client.get(f"/api/documents/{doc.pk}/suggestions/") self.client.get(f"/api/documents/{doc.pk}/suggestions/")
self.assertFalse(parse_date_generator.called)
mock_get_date_parser.assert_not_called()
def test_saved_views(self): def test_saved_views(self):
u1 = User.objects.create_superuser("user1") u1 = User.objects.create_superuser("user1")

View File

@@ -1,538 +0,0 @@
import datetime
from zoneinfo import ZoneInfo
import pytest
from pytest_django.fixtures import SettingsWrapper
from documents.parsers import parse_date
from documents.parsers import parse_date_generator
@pytest.mark.django_db()
class TestDate:
def test_date_format_1(self):
text = "lorem ipsum 130218 lorem ipsum"
assert parse_date("", text) is None
def test_date_format_2(self):
text = "lorem ipsum 2018 lorem ipsum"
assert parse_date("", text) is None
def test_date_format_3(self):
text = "lorem ipsum 20180213 lorem ipsum"
assert parse_date("", text) is None
def test_date_format_4(self, settings_timezone: ZoneInfo):
text = "lorem ipsum 13.02.2018 lorem ipsum"
date = parse_date("", text)
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
def test_date_format_5(self, settings_timezone: ZoneInfo):
text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum"
date = parse_date("", text)
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
def test_date_format_6(self):
text = (
"lorem ipsum\n"
"Wohnort\n"
"3100\n"
"IBAN\n"
"AT87 4534\n"
"1234\n"
"1234 5678\n"
"BIC\n"
"lorem ipsum"
)
assert parse_date("", text) is None
def test_date_format_7(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
settings.DATE_PARSER_LANGUAGES = ["de"]
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
date = parse_date("", text)
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
def test_date_format_8(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
settings.DATE_PARSER_LANGUAGES = ["de"]
text = (
"lorem ipsum\n"
"Wohnort\n"
"3100\n"
"IBAN\n"
"AT87 4534\n"
"1234\n"
"1234 5678\n"
"BIC\n"
"lorem ipsum\n"
"März 2020"
)
assert parse_date("", text) == datetime.datetime(
2020,
3,
1,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_9(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
settings.DATE_PARSER_LANGUAGES = ["de"]
text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
assert parse_date("", text) == datetime.datetime(
2020,
3,
1,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_10(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
22,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_11(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
22,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_12(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
22,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_13(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
22,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_14(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
22,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_15(self):
text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304"
assert parse_date("", text) is None
def test_date_format_16(self):
text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304"
assert parse_date("", text) is None
def test_date_format_17(self):
text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304"
assert parse_date("", text) is None
def test_date_format_18(self):
text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
assert parse_date("", text) is None
def test_date_format_19(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
21,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_20(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
22,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_21(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
2,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_22(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
23,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_23(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
24,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_24(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
21,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_25(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
25,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_26(self, settings_timezone: ZoneInfo):
text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. P0 Box 182051"
assert parse_date("", text) == datetime.datetime(
2019,
9,
25,
0,
0,
tzinfo=settings_timezone,
)
def test_crazy_date_past(self):
assert parse_date("", "01-07-0590 00:00:00") is None
def test_crazy_date_future(self):
assert parse_date("", "01-07-2350 00:00:00") is None
def test_crazy_date_with_spaces(self):
assert parse_date("", "20 408000l 2475") is None
def test_utf_month_names(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
settings.DATE_PARSER_LANGUAGES = ["fr", "de", "hr", "cs", "pl", "tr"]
assert parse_date("", "13 décembre 2023") == datetime.datetime(
2023,
12,
13,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "13 août 2022") == datetime.datetime(
2022,
8,
13,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "11 März 2020") == datetime.datetime(
2020,
3,
11,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "17. ožujka 2018.") == datetime.datetime(
2018,
3,
17,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "1. veljače 2016.") == datetime.datetime(
2016,
2,
1,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "15. února 1985") == datetime.datetime(
1985,
2,
15,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "30. září 2011") == datetime.datetime(
2011,
9,
30,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "28. května 1990") == datetime.datetime(
1990,
5,
28,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "1. grudzień 1997") == datetime.datetime(
1997,
12,
1,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "17 Şubat 2024") == datetime.datetime(
2024,
2,
17,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "30 Ağustos 2012") == datetime.datetime(
2012,
8,
30,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "17 Eylül 2000") == datetime.datetime(
2000,
9,
17,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "5. október 1992") == datetime.datetime(
1992,
10,
5,
0,
0,
tzinfo=settings_timezone,
)
def test_multiple_dates(self, settings_timezone: ZoneInfo):
text = """This text has multiple dates.
For example 02.02.2018, 22 July 2022 and December 2021.
But not 24-12-9999 because it's in the future..."""
dates = list(parse_date_generator("", text))
assert dates == [
datetime.datetime(2018, 2, 2, 0, 0, tzinfo=settings_timezone),
datetime.datetime(
2022,
7,
22,
0,
0,
tzinfo=settings_timezone,
),
datetime.datetime(
2021,
12,
1,
0,
0,
tzinfo=settings_timezone,
),
]
def test_filename_date_parse_valid_ymd(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
"""
GIVEN:
- Date parsing from the filename is enabled
- Filename date format is with Year Month Day (YMD)
- Filename contains date matching the format
THEN:
- Should parse the date from the filename
"""
settings.FILENAME_DATE_ORDER = "YMD"
assert parse_date(
"/tmp/Scan-2022-04-01.pdf",
"No date in here",
) == datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone)
def test_filename_date_parse_valid_dmy(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
"""
GIVEN:
- Date parsing from the filename is enabled
- Filename date format is with Day Month Year (DMY)
- Filename contains date matching the format
THEN:
- Should parse the date from the filename
"""
settings.FILENAME_DATE_ORDER = "DMY"
assert parse_date(
"/tmp/Scan-10.01.2021.pdf",
"No date in here",
) == datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone)
def test_filename_date_parse_invalid(self, settings: SettingsWrapper):
"""
GIVEN:
- Date parsing from the filename is enabled
- Filename includes no date
- File content includes no date
THEN:
- No date is parsed
"""
settings.FILENAME_DATE_ORDER = "YMD"
assert parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here") is None
def test_filename_date_ignored_use_content(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
"""
GIVEN:
- Date parsing from the filename is enabled
- Filename date format is with Day Month Year (YMD)
- Date order is Day Month Year (DMY, the default)
- Filename contains date matching the format
- Filename date is an ignored date
- File content includes a date
THEN:
- Should parse the date from the content not filename
"""
settings.FILENAME_DATE_ORDER = "YMD"
settings.IGNORE_DATES = (datetime.date(2022, 4, 1),)
assert parse_date(
"/tmp/Scan-2022-04-01.pdf",
"The matching date is 24.03.2022",
) == datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone)
def test_ignored_dates_default_order(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
"""
GIVEN:
- Ignore dates have been set
- File content includes ignored dates
- File content includes 1 non-ignored date
THEN:
- Should parse the date non-ignored date from content
"""
settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum"
assert parse_date("", text) == datetime.datetime(
2018,
2,
13,
0,
0,
tzinfo=settings_timezone,
)
def test_ignored_dates_order_ymd(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
"""
GIVEN:
- Ignore dates have been set
- Date order is Year Month Date (YMD)
- File content includes ignored dates
- File content includes 1 non-ignored date
THEN:
- Should parse the date non-ignored date from content
"""
settings.FILENAME_DATE_ORDER = "YMD"
settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum"
assert parse_date("", text) == datetime.datetime(
2018,
2,
13,
0,
0,
tzinfo=settings_timezone,
)

View File

@@ -148,7 +148,6 @@ from documents.models import Workflow
from documents.models import WorkflowAction from documents.models import WorkflowAction
from documents.models import WorkflowTrigger from documents.models import WorkflowTrigger
from documents.parsers import get_parser_class_for_mime_type from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import parse_date_generator
from documents.permissions import AcknowledgeTasksPermissions from documents.permissions import AcknowledgeTasksPermissions
from documents.permissions import PaperlessAdminPermissions from documents.permissions import PaperlessAdminPermissions
from documents.permissions import PaperlessNotePermissions from documents.permissions import PaperlessNotePermissions
@@ -158,6 +157,7 @@ from documents.permissions import get_document_count_filter_for_user
from documents.permissions import get_objects_for_user_owner_aware from documents.permissions import get_objects_for_user_owner_aware
from documents.permissions import has_perms_owner_aware from documents.permissions import has_perms_owner_aware
from documents.permissions import set_permissions_for_object from documents.permissions import set_permissions_for_object
from documents.plugins.date_parsing import get_date_parser
from documents.schema import generate_object_with_permissions_schema from documents.schema import generate_object_with_permissions_schema
from documents.serialisers import AcknowledgeTasksViewSerializer from documents.serialisers import AcknowledgeTasksViewSerializer
from documents.serialisers import BulkDownloadSerializer from documents.serialisers import BulkDownloadSerializer
@@ -1023,16 +1023,17 @@ class DocumentViewSet(
dates = [] dates = []
if settings.NUMBER_OF_SUGGESTED_DATES > 0: if settings.NUMBER_OF_SUGGESTED_DATES > 0:
gen = parse_date_generator(doc.filename, doc.content) with get_date_parser() as date_parser:
dates = sorted( gen = date_parser.parse(doc.filename, doc.content)
{ dates = sorted(
i {
for i in itertools.islice( i
gen, for i in itertools.islice(
settings.NUMBER_OF_SUGGESTED_DATES, gen,
) settings.NUMBER_OF_SUGGESTED_DATES,
}, )
) },
)
resp_data = { resp_data = {
"correspondents": [ "correspondents": [

View File

@@ -1,12 +1,7 @@
import datetime import datetime
import logging import logging
from datetime import timedelta from datetime import timedelta
from typing import Any
from adrf.views import APIView
from adrf.viewsets import ModelViewSet
from adrf.viewsets import ReadOnlyModelViewSet
from asgiref.sync import sync_to_async
from django.http import HttpResponseBadRequest from django.http import HttpResponseBadRequest
from django.http import HttpResponseForbidden from django.http import HttpResponseForbidden
from django.http import HttpResponseRedirect from django.http import HttpResponseRedirect
@@ -20,9 +15,11 @@ from httpx_oauth.oauth2 import GetAccessTokenError
from rest_framework import serializers from rest_framework import serializers
from rest_framework.decorators import action from rest_framework.decorators import action
from rest_framework.filters import OrderingFilter from rest_framework.filters import OrderingFilter
from rest_framework.generics import GenericAPIView
from rest_framework.permissions import IsAuthenticated from rest_framework.permissions import IsAuthenticated
from rest_framework.request import Request
from rest_framework.response import Response from rest_framework.response import Response
from rest_framework.viewsets import ModelViewSet
from rest_framework.viewsets import ReadOnlyModelViewSet
from documents.filters import ObjectOwnedOrGrantedPermissionsFilter from documents.filters import ObjectOwnedOrGrantedPermissionsFilter
from documents.permissions import PaperlessObjectPermissions from documents.permissions import PaperlessObjectPermissions
@@ -42,8 +39,6 @@ from paperless_mail.serialisers import MailRuleSerializer
from paperless_mail.serialisers import ProcessedMailSerializer from paperless_mail.serialisers import ProcessedMailSerializer
from paperless_mail.tasks import process_mail_accounts from paperless_mail.tasks import process_mail_accounts
logger: logging.Logger = logging.getLogger("paperless_mail")
@extend_schema_view( @extend_schema_view(
test=extend_schema( test=extend_schema(
@@ -71,75 +66,71 @@ logger: logging.Logger = logging.getLogger("paperless_mail")
), ),
) )
class MailAccountViewSet(ModelViewSet, PassUserMixin): class MailAccountViewSet(ModelViewSet, PassUserMixin):
model = MailAccount
queryset = MailAccount.objects.all().order_by("pk") queryset = MailAccount.objects.all().order_by("pk")
serializer_class = MailAccountSerializer serializer_class = MailAccountSerializer
pagination_class = StandardPagination pagination_class = StandardPagination
permission_classes = (IsAuthenticated, PaperlessObjectPermissions) permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,) filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,)
def get_permissions(self) -> list[Any]: def get_permissions(self):
if self.action == "test": if self.action == "test":
return [IsAuthenticated()] # Test action does not require object level permissions
self.permission_classes = (IsAuthenticated,)
return super().get_permissions() return super().get_permissions()
@action(methods=["post"], detail=False) @action(methods=["post"], detail=False)
async def test(self, request: Request) -> Response | HttpResponseBadRequest: def test(self, request):
logger = logging.getLogger("paperless_mail")
request.data["name"] = datetime.datetime.now().isoformat() request.data["name"] = datetime.datetime.now().isoformat()
serializer = self.get_serializer(data=request.data) serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
# Validation must be wrapped because of sync DB validators # account exists, use the password from there instead of *** and refresh_token / expiration
await sync_to_async(serializer.is_valid)(raise_exception=True)
validated_data: dict[str, Any] = serializer.validated_data
if ( if (
len(str(validated_data.get("password", "")).replace("*", "")) == 0 len(serializer.validated_data.get("password").replace("*", "")) == 0
and request.data.get("id") is not None and request.data["id"] is not None
): ):
existing_account = await MailAccount.objects.aget(pk=request.data["id"]) existing_account = MailAccount.objects.get(pk=request.data["id"])
validated_data.update( serializer.validated_data["password"] = existing_account.password
{ serializer.validated_data["account_type"] = existing_account.account_type
"password": existing_account.password, serializer.validated_data["refresh_token"] = existing_account.refresh_token
"account_type": existing_account.account_type, serializer.validated_data["expiration"] = existing_account.expiration
"refresh_token": existing_account.refresh_token,
"expiration": existing_account.expiration,
},
)
account = MailAccount(**validated_data) account = MailAccount(**serializer.validated_data)
with get_mailbox(
def _blocking_imap_test() -> bool: account.imap_server,
with get_mailbox( account.imap_port,
account.imap_server, account.imap_security,
account.imap_port, ) as M:
account.imap_security, try:
) as m_box:
if ( if (
account.is_token account.is_token
and account.expiration and account.expiration is not None
and account.expiration < timezone.now() and account.expiration < timezone.now()
): ):
oauth_manager = PaperlessMailOAuth2Manager() oauth_manager = PaperlessMailOAuth2Manager()
if oauth_manager.refresh_account_oauth_token(existing_account): if oauth_manager.refresh_account_oauth_token(existing_account):
# User is not changing password and token needs to be refreshed # User is not changing password and token needs to be refreshed
existing_account.refresh_from_db()
account.password = existing_account.password account.password = existing_account.password
else: else:
raise MailError("Unable to refresh oauth token") raise MailError("Unable to refresh oauth token")
mailbox_login(m_box, account)
return True
try: mailbox_login(M, account)
await sync_to_async(_blocking_imap_test, thread_sensitive=False)() return Response({"success": True})
return Response({"success": True}) except MailError as e:
except MailError as e: logger.error(
logger.error(f"Mail account {account} test failed: {e}") f"Mail account {account} test failed: {e}",
return HttpResponseBadRequest("Unable to connect to server") )
return HttpResponseBadRequest("Unable to connect to server")
@action(methods=["post"], detail=True) @action(methods=["post"], detail=True)
async def process(self, request: Request, pk: int | None = None) -> Response: def process(self, request, pk=None):
# FIX: Use aget_object() provided by adrf to avoid SynchronousOnlyOperation account = self.get_object()
account = await self.aget_object()
process_mail_accounts.delay([account.pk]) process_mail_accounts.delay([account.pk])
return Response({"result": "OK"}) return Response({"result": "OK"})
@@ -153,38 +144,21 @@ class ProcessedMailViewSet(ReadOnlyModelViewSet, PassUserMixin):
ObjectOwnedOrGrantedPermissionsFilter, ObjectOwnedOrGrantedPermissionsFilter,
) )
filterset_class = ProcessedMailFilterSet filterset_class = ProcessedMailFilterSet
queryset = ProcessedMail.objects.all().order_by("-processed") queryset = ProcessedMail.objects.all().order_by("-processed")
@action(methods=["post"], detail=False) @action(methods=["post"], detail=False)
async def bulk_delete( def bulk_delete(self, request):
self, mail_ids = request.data.get("mail_ids", [])
request: Request,
) -> Response | HttpResponseBadRequest | HttpResponseForbidden:
mail_ids: list[int] = request.data.get("mail_ids", [])
if not isinstance(mail_ids, list) or not all( if not isinstance(mail_ids, list) or not all(
isinstance(i, int) for i in mail_ids isinstance(i, int) for i in mail_ids
): ):
return HttpResponseBadRequest("mail_ids must be a list of integers") return HttpResponseBadRequest("mail_ids must be a list of integers")
mails = ProcessedMail.objects.filter(id__in=mail_ids)
# Store objects to delete after verification for mail in mails:
to_delete: list[ProcessedMail] = [] if not has_perms_owner_aware(request.user, "delete_processedmail", mail):
# We must verify permissions for every requested ID
async for mail in ProcessedMail.objects.filter(id__in=mail_ids):
can_delete = await sync_to_async(has_perms_owner_aware)(
request.user,
"delete_processedmail",
mail,
)
if not can_delete:
# This is what the test is looking for: 403 on permission failure
return HttpResponseForbidden("Insufficient permissions") return HttpResponseForbidden("Insufficient permissions")
to_delete.append(mail) mail.delete()
# Only perform deletions if all items passed the permission check
for mail in to_delete:
await mail.adelete()
return Response({"result": "OK", "deleted_mail_ids": mail_ids}) return Response({"result": "OK", "deleted_mail_ids": mail_ids})
@@ -204,74 +178,77 @@ class MailRuleViewSet(ModelViewSet, PassUserMixin):
responses={200: None}, responses={200: None},
), ),
) )
class OauthCallbackView(APIView): class OauthCallbackView(GenericAPIView):
permission_classes = (IsAuthenticated,) permission_classes = (IsAuthenticated,)
async def get( def get(self, request, format=None):
self, if not (
request: Request, request.user and request.user.has_perms(["paperless_mail.add_mailaccount"])
) -> Response | HttpResponseBadRequest | HttpResponseRedirect: ):
has_perm = await sync_to_async(request.user.has_perm)(
"paperless_mail.add_mailaccount",
)
if not has_perm:
return HttpResponseBadRequest( return HttpResponseBadRequest(
"You do not have permission to add mail accounts", "You do not have permission to add mail accounts",
) )
code: str | None = request.query_params.get("code") logger = logging.getLogger("paperless_mail")
state: str | None = request.query_params.get("state") code = request.query_params.get("code")
scope: str | None = request.query_params.get("scope") # Gmail passes scope as a query param, Outlook does not
scope = request.query_params.get("scope")
if not code or not state: if code is None:
return HttpResponseBadRequest("Invalid request parameters") logger.error(
f"Invalid oauth callback request, code: {code}, scope: {scope}",
)
return HttpResponseBadRequest("Invalid request, see logs for more detail")
oauth_manager = PaperlessMailOAuth2Manager( oauth_manager = PaperlessMailOAuth2Manager(
state=request.session.get("oauth_state"), state=request.session.get("oauth_state"),
) )
state = request.query_params.get("state", "")
if not oauth_manager.validate_state(state): if not oauth_manager.validate_state(state):
return HttpResponseBadRequest("Invalid OAuth state") logger.error(
f"Invalid oauth callback request received state: {state}, expected: {oauth_manager.state}",
)
return HttpResponseBadRequest("Invalid request, see logs for more detail")
try: try:
defaults: dict[str, Any] = { if scope is not None and "google" in scope:
"username": "", # Google
"imap_security": MailAccount.ImapSecurity.SSL,
"imap_port": 993,
}
if scope and "google" in scope:
account_type = MailAccount.MailAccountType.GMAIL_OAUTH account_type = MailAccount.MailAccountType.GMAIL_OAUTH
imap_server = "imap.gmail.com" imap_server = "imap.gmail.com"
defaults.update( defaults = {
{ "name": f"Gmail OAuth {timezone.now()}",
"name": f"Gmail OAuth {timezone.now()}", "username": "",
"account_type": account_type, "imap_security": MailAccount.ImapSecurity.SSL,
}, "imap_port": 993,
) "account_type": account_type,
result = await sync_to_async(oauth_manager.get_gmail_access_token)(code) }
else: result = oauth_manager.get_gmail_access_token(code)
elif scope is None:
# Outlook
account_type = MailAccount.MailAccountType.OUTLOOK_OAUTH account_type = MailAccount.MailAccountType.OUTLOOK_OAUTH
imap_server = "outlook.office365.com" imap_server = "outlook.office365.com"
defaults.update( defaults = {
{ "name": f"Outlook OAuth {timezone.now()}",
"name": f"Outlook OAuth {timezone.now()}", "username": "",
"account_type": account_type, "imap_security": MailAccount.ImapSecurity.SSL,
}, "imap_port": 993,
) "account_type": account_type,
result = await sync_to_async(oauth_manager.get_outlook_access_token)( }
code,
)
account, _ = await MailAccount.objects.aupdate_or_create( result = oauth_manager.get_outlook_access_token(code)
access_token = result["access_token"]
refresh_token = result["refresh_token"]
expires_in = result["expires_in"]
account, _ = MailAccount.objects.update_or_create(
password=access_token,
is_token=True,
imap_server=imap_server, imap_server=imap_server,
refresh_token=result["refresh_token"], refresh_token=refresh_token,
defaults={ expiration=timezone.now() + timedelta(seconds=expires_in),
**defaults, defaults=defaults,
"password": result["access_token"],
"is_token": True,
"expiration": timezone.now()
+ timedelta(seconds=result["expires_in"]),
},
) )
return HttpResponseRedirect( return HttpResponseRedirect(
f"{oauth_manager.oauth_redirect_url}?oauth_success=1&account_id={account.pk}", f"{oauth_manager.oauth_redirect_url}?oauth_success=1&account_id={account.pk}",

3866
uv.lock generated

File diff suppressed because it is too large Load Diff