mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-01-28 22:59:03 -06:00
Compare commits
12 Commits
feature-as
...
feature-da
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
972f9a069c | ||
|
|
bd99fb66cf | ||
|
|
7704bc5399 | ||
|
|
a055de0ce4 | ||
|
|
e0fdf1caa9 | ||
|
|
f80ae51a7d | ||
|
|
e101019924 | ||
|
|
7afc8ceb24 | ||
|
|
dfe0012872 | ||
|
|
32771391ad | ||
|
|
9b7ae1c8ea | ||
|
|
66593ec660 |
@@ -37,7 +37,7 @@ repos:
|
|||||||
- json
|
- json
|
||||||
# See https://github.com/prettier/prettier/issues/15742 for the fork reason
|
# See https://github.com/prettier/prettier/issues/15742 for the fork reason
|
||||||
- repo: https://github.com/rbubley/mirrors-prettier
|
- repo: https://github.com/rbubley/mirrors-prettier
|
||||||
rev: 'v3.6.2'
|
rev: 'v3.8.1'
|
||||||
hooks:
|
hooks:
|
||||||
- id: prettier
|
- id: prettier
|
||||||
types_or:
|
types_or:
|
||||||
@@ -49,7 +49,7 @@ repos:
|
|||||||
- 'prettier-plugin-organize-imports@4.1.0'
|
- 'prettier-plugin-organize-imports@4.1.0'
|
||||||
# Python hooks
|
# Python hooks
|
||||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||||
rev: v0.14.5
|
rev: v0.14.14
|
||||||
hooks:
|
hooks:
|
||||||
- id: ruff-check
|
- id: ruff-check
|
||||||
- id: ruff-format
|
- id: ruff-format
|
||||||
@@ -76,7 +76,7 @@ repos:
|
|||||||
hooks:
|
hooks:
|
||||||
- id: shellcheck
|
- id: shellcheck
|
||||||
- repo: https://github.com/google/yamlfmt
|
- repo: https://github.com/google/yamlfmt
|
||||||
rev: v0.20.0
|
rev: v0.21.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: yamlfmt
|
- id: yamlfmt
|
||||||
exclude: "^src-ui/pnpm-lock.yaml"
|
exclude: "^src-ui/pnpm-lock.yaml"
|
||||||
|
|||||||
@@ -16,18 +16,17 @@ classifiers = [
|
|||||||
# This will allow testing to not install a webserver, mysql, etc
|
# This will allow testing to not install a webserver, mysql, etc
|
||||||
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"adrf~=0.1.12",
|
|
||||||
"azure-ai-documentintelligence>=1.0.2",
|
"azure-ai-documentintelligence>=1.0.2",
|
||||||
"babel>=2.17",
|
"babel>=2.17",
|
||||||
"bleach~=6.3.0",
|
"bleach~=6.3.0",
|
||||||
"celery[redis]~=5.5.1",
|
"celery[redis]~=5.6.2",
|
||||||
"channels~=4.2",
|
"channels~=4.2",
|
||||||
"channels-redis~=4.2",
|
"channels-redis~=4.2",
|
||||||
"concurrent-log-handler~=0.9.25",
|
"concurrent-log-handler~=0.9.25",
|
||||||
"dateparser~=1.2",
|
"dateparser~=1.2",
|
||||||
# WARNING: django does not use semver.
|
# WARNING: django does not use semver.
|
||||||
# Only patch versions are guaranteed to not introduce breaking changes.
|
# Only patch versions are guaranteed to not introduce breaking changes.
|
||||||
"django~=5.2.5",
|
"django~=5.2.10",
|
||||||
"django-allauth[mfa,socialaccount]~=65.13.1",
|
"django-allauth[mfa,socialaccount]~=65.13.1",
|
||||||
"django-auditlog~=3.4.1",
|
"django-auditlog~=3.4.1",
|
||||||
"django-cachalot~=2.8.0",
|
"django-cachalot~=2.8.0",
|
||||||
@@ -80,7 +79,7 @@ dependencies = [
|
|||||||
"torch~=2.9.1",
|
"torch~=2.9.1",
|
||||||
"tqdm~=4.67.1",
|
"tqdm~=4.67.1",
|
||||||
"watchfiles>=1.1.1",
|
"watchfiles>=1.1.1",
|
||||||
"whitenoise~=6.9",
|
"whitenoise~=6.11",
|
||||||
"whoosh-reloaded>=2.7.5",
|
"whoosh-reloaded>=2.7.5",
|
||||||
"zxing-cpp~=2.3.0",
|
"zxing-cpp~=2.3.0",
|
||||||
]
|
]
|
||||||
@@ -89,13 +88,13 @@ optional-dependencies.mariadb = [
|
|||||||
"mysqlclient~=2.2.7",
|
"mysqlclient~=2.2.7",
|
||||||
]
|
]
|
||||||
optional-dependencies.postgres = [
|
optional-dependencies.postgres = [
|
||||||
"psycopg[c,pool]==3.2.12",
|
"psycopg[c,pool]==3.3",
|
||||||
# Direct dependency for proper resolution of the pre-built wheels
|
# Direct dependency for proper resolution of the pre-built wheels
|
||||||
"psycopg-c==3.2.12",
|
"psycopg-c==3.3",
|
||||||
"psycopg-pool==3.3",
|
"psycopg-pool==3.3",
|
||||||
]
|
]
|
||||||
optional-dependencies.webserver = [
|
optional-dependencies.webserver = [
|
||||||
"granian[uvloop]~=2.5.1",
|
"granian[uvloop]~=2.6.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[dependency-groups]
|
[dependency-groups]
|
||||||
@@ -153,7 +152,7 @@ typing = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[tool.uv]
|
[tool.uv]
|
||||||
required-version = ">=0.5.14"
|
required-version = ">=0.9.0"
|
||||||
package = false
|
package = false
|
||||||
environments = [
|
environments = [
|
||||||
"sys_platform == 'darwin'",
|
"sys_platform == 'darwin'",
|
||||||
@@ -163,8 +162,8 @@ environments = [
|
|||||||
[tool.uv.sources]
|
[tool.uv.sources]
|
||||||
# Markers are chosen to select these almost exclusively when building the Docker image
|
# Markers are chosen to select these almost exclusively when building the Docker image
|
||||||
psycopg-c = [
|
psycopg-c = [
|
||||||
{ url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-bookworm-3.2.12/psycopg_c-3.2.12-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux' and platform_machine == 'x86_64' and python_version == '3.12'" },
|
{ url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-trixie-3.3.0/psycopg_c-3.3.0-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux' and platform_machine == 'x86_64' and python_version == '3.12'" },
|
||||||
{ url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-bookworm-3.2.12/psycopg_c-3.2.12-cp312-cp312-linux_aarch64.whl", marker = "sys_platform == 'linux' and platform_machine == 'aarch64' and python_version == '3.12'" },
|
{ url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-trixie-3.3.0/psycopg_c-3.3.0-cp312-cp312-linux_aarch64.whl", marker = "sys_platform == 'linux' and platform_machine == 'aarch64' and python_version == '3.12'" },
|
||||||
]
|
]
|
||||||
zxing-cpp = [
|
zxing-cpp = [
|
||||||
{ url = "https://github.com/paperless-ngx/builder/releases/download/zxing-2.3.0/zxing_cpp-2.3.0-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux' and platform_machine == 'x86_64' and python_version == '3.12'" },
|
{ url = "https://github.com/paperless-ngx/builder/releases/download/zxing-2.3.0/zxing_cpp-2.3.0-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux' and platform_machine == 'x86_64' and python_version == '3.12'" },
|
||||||
@@ -307,6 +306,7 @@ markers = [
|
|||||||
"gotenberg: Tests requiring Gotenberg service",
|
"gotenberg: Tests requiring Gotenberg service",
|
||||||
"tika: Tests requiring Tika service",
|
"tika: Tests requiring Tika service",
|
||||||
"greenmail: Tests requiring Greenmail service",
|
"greenmail: Tests requiring Greenmail service",
|
||||||
|
"date_parsing: Tests which cover date parsing from content or filename",
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.pytest_env]
|
[tool.pytest_env]
|
||||||
@@ -333,6 +333,10 @@ exclude_also = [
|
|||||||
|
|
||||||
[tool.mypy]
|
[tool.mypy]
|
||||||
mypy_path = "src"
|
mypy_path = "src"
|
||||||
|
files = [
|
||||||
|
"src/documents/plugins/date_parsing",
|
||||||
|
"src/documents/tests/date_parsing",
|
||||||
|
]
|
||||||
plugins = [
|
plugins = [
|
||||||
"mypy_django_plugin.main",
|
"mypy_django_plugin.main",
|
||||||
"mypy_drf_plugin.main",
|
"mypy_drf_plugin.main",
|
||||||
@@ -344,5 +348,28 @@ disallow_untyped_defs = true
|
|||||||
warn_redundant_casts = true
|
warn_redundant_casts = true
|
||||||
warn_unused_ignores = true
|
warn_unused_ignores = true
|
||||||
|
|
||||||
|
# This prevents errors from imports, but allows type-checking logic to work
|
||||||
|
follow_imports = "silent"
|
||||||
|
|
||||||
|
[[tool.mypy.overrides]]
|
||||||
|
module = [
|
||||||
|
"documents.*",
|
||||||
|
"paperless.*",
|
||||||
|
"paperless_ai.*",
|
||||||
|
"paperless_mail.*",
|
||||||
|
"paperless_tesseract.*",
|
||||||
|
"paperless_remote.*",
|
||||||
|
"paperless_text.*",
|
||||||
|
"paperless_tika.*",
|
||||||
|
]
|
||||||
|
ignore_errors = true
|
||||||
|
|
||||||
|
[[tool.mypy.overrides]]
|
||||||
|
module = [
|
||||||
|
"documents.plugins.date_parsing.*",
|
||||||
|
"documents.tests.date_parsing.*",
|
||||||
|
]
|
||||||
|
ignore_errors = false
|
||||||
|
|
||||||
[tool.django-stubs]
|
[tool.django-stubs]
|
||||||
django_settings_module = "paperless.settings"
|
django_settings_module = "paperless.settings"
|
||||||
|
|||||||
@@ -32,12 +32,12 @@ from documents.models import WorkflowTrigger
|
|||||||
from documents.parsers import DocumentParser
|
from documents.parsers import DocumentParser
|
||||||
from documents.parsers import ParseError
|
from documents.parsers import ParseError
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
from documents.parsers import get_parser_class_for_mime_type
|
||||||
from documents.parsers import parse_date
|
|
||||||
from documents.permissions import set_permissions_for_object
|
from documents.permissions import set_permissions_for_object
|
||||||
from documents.plugins.base import AlwaysRunPluginMixin
|
from documents.plugins.base import AlwaysRunPluginMixin
|
||||||
from documents.plugins.base import ConsumeTaskPlugin
|
from documents.plugins.base import ConsumeTaskPlugin
|
||||||
from documents.plugins.base import NoCleanupPluginMixin
|
from documents.plugins.base import NoCleanupPluginMixin
|
||||||
from documents.plugins.base import NoSetupPluginMixin
|
from documents.plugins.base import NoSetupPluginMixin
|
||||||
|
from documents.plugins.date_parsing import get_date_parser
|
||||||
from documents.plugins.helpers import ProgressManager
|
from documents.plugins.helpers import ProgressManager
|
||||||
from documents.plugins.helpers import ProgressStatusOptions
|
from documents.plugins.helpers import ProgressStatusOptions
|
||||||
from documents.signals import document_consumption_finished
|
from documents.signals import document_consumption_finished
|
||||||
@@ -426,7 +426,8 @@ class ConsumerPlugin(
|
|||||||
ProgressStatusOptions.WORKING,
|
ProgressStatusOptions.WORKING,
|
||||||
ConsumerStatusShortMessage.PARSE_DATE,
|
ConsumerStatusShortMessage.PARSE_DATE,
|
||||||
)
|
)
|
||||||
date = parse_date(self.filename, text)
|
with get_date_parser() as date_parser:
|
||||||
|
date = next(date_parser.parse(self.filename, text), None)
|
||||||
archive_path = document_parser.get_archive_path()
|
archive_path = document_parser.get_archive_path()
|
||||||
page_count = document_parser.get_page_count(self.working_copy, mime_type)
|
page_count = document_parser.get_page_count(self.working_copy, mime_type)
|
||||||
|
|
||||||
|
|||||||
@@ -9,22 +9,17 @@ import subprocess
|
|||||||
import tempfile
|
import tempfile
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from re import Match
|
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.utils import timezone
|
|
||||||
|
|
||||||
from documents.loggers import LoggingMixin
|
from documents.loggers import LoggingMixin
|
||||||
from documents.signals import document_consumer_declaration
|
from documents.signals import document_consumer_declaration
|
||||||
from documents.utils import copy_file_with_basic_stats
|
from documents.utils import copy_file_with_basic_stats
|
||||||
from documents.utils import run_subprocess
|
from documents.utils import run_subprocess
|
||||||
from paperless.config import OcrConfig
|
|
||||||
from paperless.utils import ocr_to_dateparser_languages
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
import datetime
|
import datetime
|
||||||
from collections.abc import Iterator
|
|
||||||
|
|
||||||
# This regular expression will try to find dates in the document at
|
# This regular expression will try to find dates in the document at
|
||||||
# hand and will match the following formats:
|
# hand and will match the following formats:
|
||||||
@@ -259,75 +254,6 @@ def make_thumbnail_from_pdf(in_path: Path, temp_dir: Path, logging_group=None) -
|
|||||||
return out_path
|
return out_path
|
||||||
|
|
||||||
|
|
||||||
def parse_date(filename, text) -> datetime.datetime | None:
|
|
||||||
return next(parse_date_generator(filename, text), None)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
|
|
||||||
"""
|
|
||||||
Returns the date of the document.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __parser(ds: str, date_order: str) -> datetime.datetime:
|
|
||||||
"""
|
|
||||||
Call dateparser.parse with a particular date ordering
|
|
||||||
"""
|
|
||||||
import dateparser
|
|
||||||
|
|
||||||
ocr_config = OcrConfig()
|
|
||||||
languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
|
|
||||||
ocr_config.language,
|
|
||||||
)
|
|
||||||
|
|
||||||
return dateparser.parse(
|
|
||||||
ds,
|
|
||||||
settings={
|
|
||||||
"DATE_ORDER": date_order,
|
|
||||||
"PREFER_DAY_OF_MONTH": "first",
|
|
||||||
"RETURN_AS_TIMEZONE_AWARE": True,
|
|
||||||
"TIMEZONE": settings.TIME_ZONE,
|
|
||||||
},
|
|
||||||
locales=languages,
|
|
||||||
)
|
|
||||||
|
|
||||||
def __filter(date: datetime.datetime) -> datetime.datetime | None:
|
|
||||||
if (
|
|
||||||
date is not None
|
|
||||||
and date.year > 1900
|
|
||||||
and date <= timezone.now()
|
|
||||||
and date.date() not in settings.IGNORE_DATES
|
|
||||||
):
|
|
||||||
return date
|
|
||||||
return None
|
|
||||||
|
|
||||||
def __process_match(
|
|
||||||
match: Match[str],
|
|
||||||
date_order: str,
|
|
||||||
) -> datetime.datetime | None:
|
|
||||||
date_string = match.group(0)
|
|
||||||
|
|
||||||
try:
|
|
||||||
date = __parser(date_string, date_order)
|
|
||||||
except Exception:
|
|
||||||
# Skip all matches that do not parse to a proper date
|
|
||||||
date = None
|
|
||||||
|
|
||||||
return __filter(date)
|
|
||||||
|
|
||||||
def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
|
|
||||||
for m in re.finditer(DATE_REGEX, content):
|
|
||||||
date = __process_match(m, date_order)
|
|
||||||
if date is not None:
|
|
||||||
yield date
|
|
||||||
|
|
||||||
# if filename date parsing is enabled, search there first:
|
|
||||||
if settings.FILENAME_DATE_ORDER:
|
|
||||||
yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
|
|
||||||
|
|
||||||
# Iterate through all regex matches in text and try to parse the date
|
|
||||||
yield from __process_content(text, settings.DATE_ORDER)
|
|
||||||
|
|
||||||
|
|
||||||
class ParseError(Exception):
|
class ParseError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
92
src/documents/plugins/date_parsing/__init__.py
Normal file
92
src/documents/plugins/date_parsing/__init__.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
import logging
|
||||||
|
from functools import lru_cache
|
||||||
|
from importlib.metadata import EntryPoint
|
||||||
|
from importlib.metadata import entry_points
|
||||||
|
from typing import Final
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
from django.utils import timezone
|
||||||
|
|
||||||
|
from documents.plugins.date_parsing.base import DateParserConfig
|
||||||
|
from documents.plugins.date_parsing.base import DateParserPluginBase
|
||||||
|
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||||
|
from paperless.utils import ocr_to_dateparser_languages
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
DATE_PARSER_ENTRY_POINT_GROUP: Final = "paperless_ngx.date_parsers"
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _discover_parser_class() -> type[DateParserPluginBase]:
    """
    Discovers the date parser plugin class to use.

    Entry points registered under ``DATE_PARSER_ENTRY_POINT_GROUP`` are loaded
    one by one. Entries that fail to load, or that do not resolve to a
    subclass of ``DateParserPluginBase``, are logged and skipped.

    - If one or more plugins are found, sorts them by name and returns the first.
    - If no plugins are found, returns the default RegexDateParserPlugin.

    The result is memoized by ``lru_cache``; call ``cache_clear()`` on this
    function to force a new discovery.

    Returns:
        The parser class (not an instance) that should be instantiated.
    """

    eps: tuple[EntryPoint, ...]
    try:
        eps = entry_points(group=DATE_PARSER_ENTRY_POINT_GROUP)
    except Exception as e:
        # Discovery is best-effort: fall back to the built-in parser
        logger.warning("Could not query entry points for date parsers: %s", e)
        eps = ()

    # Keep each loaded class next to its entry point name so every plugin is
    # loaded exactly once instead of being re-loaded when it is returned.
    valid_plugins: list[tuple[str, type[DateParserPluginBase]]] = []
    for ep in eps:
        try:
            plugin_class = ep.load()
        except Exception as e:
            logger.error("Unable to load date parser plugin %s: %s", ep.name, e)
            continue

        # isinstance guard: issubclass() raises TypeError for non-class objects,
        # which would otherwise be misreported as a load failure.
        if isinstance(plugin_class, type) and issubclass(
            plugin_class,
            DateParserPluginBase,
        ):
            valid_plugins.append((ep.name, plugin_class))
        else:
            logger.warning("Plugin %s does not subclass DateParser.", ep.name)

    if not valid_plugins:
        return RegexDateParserPlugin

    # Sort on the name only so the (stable) ordering matches entry point names
    valid_plugins.sort(key=lambda item: item[0])
    chosen_name, chosen_class = valid_plugins[0]

    if len(valid_plugins) > 1:
        logger.warning(
            "Multiple date parsers found: %s. Using the first one by name: '%s'.",
            [name for name, _ in valid_plugins],
            chosen_name,
        )

    return chosen_class
|
||||||
|
|
||||||
|
|
||||||
|
def get_date_parser() -> DateParserPluginBase:
    """
    Factory function to get an initialized date parser instance.

    This function is responsible for:
    1. Discovering the correct parser class (plugin or default).
    2. Loading configuration from Django settings.
    3. Instantiating the parser with the configuration.

    Returns:
        A new parser instance. A fresh ``DateParserConfig`` is built on every
        call, so ``reference_time`` is the time of the call.
    """
    # 1. Discover the class (this is cached)
    parser_class = _discover_parser_class()

    # 2. Load configuration from settings
    # TODO: Get the OCR language from the settings and/or the application
    # configuration object, depending on where it is configured.
    # An explicit DATE_PARSER_LANGUAGES takes precedence; otherwise the
    # languages are derived from the OCR language setting.
    languages = (
        settings.DATE_PARSER_LANGUAGES
        or ocr_to_dateparser_languages(settings.OCR_LANGUAGE)
    )

    config = DateParserConfig(
        languages=languages,
        timezone_str=settings.TIME_ZONE,
        ignore_dates=settings.IGNORE_DATES,
        reference_time=timezone.now(),
        filename_date_order=settings.FILENAME_DATE_ORDER,
        content_date_order=settings.DATE_ORDER,
    )

    # 3. Instantiate the discovered class with the config
    return parser_class(config=config)
|
||||||
124
src/documents/plugins/date_parsing/base.py
Normal file
124
src/documents/plugins/date_parsing/base.py
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
import datetime
|
||||||
|
import logging
|
||||||
|
from abc import ABC
|
||||||
|
from abc import abstractmethod
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from types import TracebackType
|
||||||
|
|
||||||
|
try:
|
||||||
|
from typing import Self
|
||||||
|
except ImportError:
|
||||||
|
from typing_extensions import Self
|
||||||
|
|
||||||
|
import dateparser
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True, slots=True)
|
||||||
|
class DateParserConfig:
|
||||||
|
"""
|
||||||
|
Configuration for a DateParser instance.
|
||||||
|
|
||||||
|
This object is created by the factory and passed to the
|
||||||
|
parser's constructor, decoupling the parser from settings.
|
||||||
|
"""
|
||||||
|
|
||||||
|
languages: list[str]
|
||||||
|
timezone_str: str
|
||||||
|
ignore_dates: set[datetime.date]
|
||||||
|
|
||||||
|
# A "now" timestamp for filtering future dates.
|
||||||
|
# Passed in by the factory.
|
||||||
|
reference_time: datetime.datetime
|
||||||
|
|
||||||
|
# Settings for the default RegexDateParser
|
||||||
|
# Other plugins should use or consider these, but it is not required
|
||||||
|
filename_date_order: str | None
|
||||||
|
content_date_order: str
|
||||||
|
|
||||||
|
|
||||||
|
class DateParserPluginBase(ABC):
    """
    Common base for every date parsing strategy.

    A parser receives its complete configuration as a DateParserConfig
    and can be used as a context manager.
    """

    def __init__(self, config: DateParserConfig):
        """
        Stores the configuration for later use by the helpers.
        """
        self.config = config

    def __enter__(self) -> Self:
        """
        Start of the ``with`` block; hands back the parser itself.

        Override in a subclass to acquire resources (connections, handles).
        """
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """
        End of the ``with`` block.

        Override in a subclass to release resources.
        """
        # Nothing to clean up by default. Returning None lets any
        # exception raised inside the block propagate.
        return None

    def _parse_string(
        self,
        date_string: str,
        date_order: str,
    ) -> datetime.datetime | None:
        """
        Runs dateparser on one candidate string.

        The timezone and locales come from `self.config`. Any exception
        raised while parsing is logged and turned into None.
        """
        try:
            parser_settings = {
                "DATE_ORDER": date_order,
                "PREFER_DAY_OF_MONTH": "first",
                "RETURN_AS_TIMEZONE_AWARE": True,
                "TIMEZONE": self.config.timezone_str,
            }
            return dateparser.parse(
                date_string,
                settings=parser_settings,
                locales=self.config.languages,
            )
        except Exception as e:
            logger.error(f"Error while parsing date string '{date_string}': {e}")
            return None

    def _filter_date(
        self,
        date: datetime.datetime | None,
    ) -> datetime.datetime | None:
        """
        Decides whether a parsed datetime is acceptable.

        Returns the datetime unchanged when it is acceptable according to
        `self.config`, otherwise None.
        """
        # Nothing was parsed
        if date is None:
            return None
        # Year 1900 and earlier is rejected
        if date.year <= 1900:
            return None
        # Later than the reference time
        if date > self.config.reference_time:
            return None
        # Explicitly ignored calendar date
        if date.date() in self.config.ignore_dates:
            return None
        return date

    @abstractmethod
    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
        """
        Yields the valid datetimes found in a document's filename and content.
        """
|
||||||
65
src/documents/plugins/date_parsing/regex_parser.py
Normal file
65
src/documents/plugins/date_parsing/regex_parser.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
import datetime
|
||||||
|
import re
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from re import Match
|
||||||
|
|
||||||
|
from documents.plugins.date_parsing.base import DateParserPluginBase
|
||||||
|
|
||||||
|
|
||||||
|
class RegexDateParserPlugin(DateParserPluginBase):
    """
    Built-in date parser which locates date candidates with one combined
    regular expression.

    All of its behaviour is driven by the DateParserConfig object given
    to the constructor.
    """

    # Alternation of the supported date layouts; matched case-insensitively
    DATE_REGEX = re.compile(
        r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
        re.IGNORECASE,
    )

    def _process_match(
        self,
        match: Match[str],
        date_order: str,
    ) -> datetime.datetime | None:
        """
        Turns one regex match into a validated datetime, or None.
        """
        # The whole matched text is the candidate, then parse and validate it
        parsed = self._parse_string(match.group(0), date_order)
        return self._filter_date(parsed)

    def _process_content(
        self,
        content: str,
        date_order: str,
    ) -> Iterator[datetime.datetime]:
        """
        Scans content for date candidates and yields those which are valid.
        """
        for candidate in self.DATE_REGEX.finditer(content):
            if (found := self._process_match(candidate, date_order)) is not None:
                yield found

    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
        """
        Yields dates from the filename first (when enabled), then the content.

        Both date orders are taken from `self.config`.
        """
        filename_order = self.config.filename_date_order
        # A falsy filename order disables looking at the filename
        if filename_order:
            yield from self._process_content(filename, filename_order)

        yield from self._process_content(content, self.config.content_date_order)
|
||||||
0
src/documents/tests/date_parsing/__init__.py
Normal file
0
src/documents/tests/date_parsing/__init__.py
Normal file
82
src/documents/tests/date_parsing/conftest.py
Normal file
82
src/documents/tests/date_parsing/conftest.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
import datetime
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import pytest_django
|
||||||
|
|
||||||
|
from documents.plugins.date_parsing import _discover_parser_class
|
||||||
|
from documents.plugins.date_parsing.base import DateParserConfig
|
||||||
|
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def base_config() -> DateParserConfig:
    """English-only, UTC configuration with a fixed reference time."""
    # Fixed "now" so results do not depend on when the tests run
    fixed_now = datetime.datetime(
        2024,
        1,
        15,
        12,
        0,
        0,
        tzinfo=datetime.timezone.utc,
    )
    return DateParserConfig(
        languages=["en"],
        timezone_str="UTC",
        ignore_dates=set(),
        reference_time=fixed_now,
        filename_date_order="YMD",
        content_date_order="DMY",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def config_with_ignore_dates() -> DateParserConfig:
    """English/German configuration which ignores two specific dates."""
    ignored = {datetime.date(2024, 1, 1), datetime.date(2024, 12, 25)}
    # Fixed "now" so results do not depend on when the tests run
    fixed_now = datetime.datetime(
        2024,
        1,
        15,
        12,
        0,
        0,
        tzinfo=datetime.timezone.utc,
    )
    return DateParserConfig(
        languages=["en", "de"],
        timezone_str="America/New_York",
        ignore_dates=ignored,
        reference_time=fixed_now,
        filename_date_order="DMY",
        content_date_order="MDY",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def regex_parser(base_config: DateParserConfig) -> RegexDateParserPlugin:
    """A RegexDateParserPlugin built from the base_config fixture."""
    parser = RegexDateParserPlugin(base_config)
    return parser
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def clear_lru_cache() -> Generator[None, None, None]:
    """
    Reset the cached result of _discover_parser_class around a test,
    so a test neither sees nor leaves behind a cached parser class.
    """
    # Setup: drop whatever an earlier test cached
    _discover_parser_class.cache_clear()
    yield None
    # Teardown: drop whatever this test cached
    _discover_parser_class.cache_clear()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_date_parser_settings(settings: pytest_django.fixtures.SettingsWrapper) -> Any:
    """
    Apply date parser related overrides to the Django settings fixture
    and return that same settings object.
    """
    overrides = {
        "DATE_PARSER_LANGUAGES": ["en", "de"],
        "TIME_ZONE": "UTC",
        "IGNORE_DATES": [datetime.date(1900, 1, 1)],
        "FILENAME_DATE_ORDER": "YMD",
        "DATE_ORDER": "DMY",
    }
    # Assign through the settings fixture, one attribute at a time
    for setting_name, setting_value in overrides.items():
        setattr(settings, setting_name, setting_value)
    return settings
|
||||||
@@ -0,0 +1,228 @@
|
|||||||
|
import datetime
|
||||||
|
import logging
|
||||||
|
from collections.abc import Iterator
|
||||||
|
from importlib.metadata import EntryPoint
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import pytest_mock
|
||||||
|
from django.utils import timezone
|
||||||
|
|
||||||
|
from documents.plugins.date_parsing import DATE_PARSER_ENTRY_POINT_GROUP
|
||||||
|
from documents.plugins.date_parsing import _discover_parser_class
|
||||||
|
from documents.plugins.date_parsing import get_date_parser
|
||||||
|
from documents.plugins.date_parsing.base import DateParserConfig
|
||||||
|
from documents.plugins.date_parsing.base import DateParserPluginBase
|
||||||
|
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||||
|
|
||||||
|
|
||||||
|
class AlphaParser(DateParserPluginBase):
    """Minimal concrete parser used as a stand-in plugin class in tests."""

    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
        """Yield the current time once; both arguments are unused."""
        current_time = timezone.now()
        yield current_time
|
||||||
|
|
||||||
|
|
||||||
|
class BetaParser(DateParserPluginBase):
    """Second minimal concrete parser used as a stand-in plugin class in tests."""

    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
        """Yield the current time once; both arguments are unused."""
        current_time = timezone.now()
        yield current_time
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.date_parsing
|
||||||
|
@pytest.mark.usefixtures("clear_lru_cache")
|
||||||
|
class TestDiscoverParserClass:
|
||||||
|
"""Tests for the _discover_parser_class() function."""
|
||||||
|
|
||||||
|
def test_returns_default_when_no_plugins_found(
|
||||||
|
self,
|
||||||
|
mocker: pytest_mock.MockerFixture,
|
||||||
|
) -> None:
|
||||||
|
mocker.patch(
|
||||||
|
"documents.plugins.date_parsing.entry_points",
|
||||||
|
return_value=(),
|
||||||
|
)
|
||||||
|
result = _discover_parser_class()
|
||||||
|
assert result is RegexDateParserPlugin
|
||||||
|
|
||||||
|
def test_returns_default_when_entrypoint_query_fails(
|
||||||
|
self,
|
||||||
|
mocker: pytest_mock.MockerFixture,
|
||||||
|
caplog: pytest.LogCaptureFixture,
|
||||||
|
) -> None:
|
||||||
|
mocker.patch(
|
||||||
|
"documents.plugins.date_parsing.entry_points",
|
||||||
|
side_effect=RuntimeError("boom"),
|
||||||
|
)
|
||||||
|
result = _discover_parser_class()
|
||||||
|
assert result is RegexDateParserPlugin
|
||||||
|
assert "Could not query entry points" in caplog.text
|
||||||
|
|
||||||
|
def test_filters_out_invalid_plugins(
|
||||||
|
self,
|
||||||
|
mocker: pytest_mock.MockerFixture,
|
||||||
|
caplog: pytest.LogCaptureFixture,
|
||||||
|
) -> None:
|
||||||
|
fake_ep = mocker.MagicMock(spec=EntryPoint)
|
||||||
|
fake_ep.name = "bad_plugin"
|
||||||
|
fake_ep.load.return_value = object # not subclass of DateParser
|
||||||
|
|
||||||
|
mocker.patch(
|
||||||
|
"documents.plugins.date_parsing.entry_points",
|
||||||
|
return_value=(fake_ep,),
|
||||||
|
)
|
||||||
|
|
||||||
|
result = _discover_parser_class()
|
||||||
|
assert result is RegexDateParserPlugin
|
||||||
|
assert "does not subclass DateParser" in caplog.text
|
||||||
|
|
||||||
|
def test_skips_plugins_that_fail_to_load(
|
||||||
|
self,
|
||||||
|
mocker: pytest_mock.MockerFixture,
|
||||||
|
caplog: pytest.LogCaptureFixture,
|
||||||
|
) -> None:
|
||||||
|
fake_ep = mocker.MagicMock(spec=EntryPoint)
|
||||||
|
fake_ep.name = "failing_plugin"
|
||||||
|
fake_ep.load.side_effect = ImportError("cannot import")
|
||||||
|
|
||||||
|
mocker.patch(
|
||||||
|
"documents.plugins.date_parsing.entry_points",
|
||||||
|
return_value=(fake_ep,),
|
||||||
|
)
|
||||||
|
|
||||||
|
result = _discover_parser_class()
|
||||||
|
assert result is RegexDateParserPlugin
|
||||||
|
assert "Unable to load date parser plugin failing_plugin" in caplog.text
|
||||||
|
|
||||||
|
def test_returns_single_valid_plugin_without_warning(
    self,
    mocker: pytest_mock.MockerFixture,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """A lone valid plugin is returned and no "multiple parsers" warning is logged."""
    only_entry = mocker.MagicMock(spec=EntryPoint)
    only_entry.name = "alpha"
    only_entry.load.return_value = AlphaParser

    entry_points_mock = mocker.patch(
        "documents.plugins.date_parsing.entry_points",
        return_value=(only_entry,),
    )

    with caplog.at_level(
        logging.WARNING,
        logger="documents.plugins.date_parsing",
    ):
        discovered = _discover_parser_class()

    # The lookup must be restricted to the date parser entry point group
    entry_points_mock.assert_called_once_with(group=DATE_PARSER_ENTRY_POINT_GROUP)

    # Exactly the class exposed by the single entry point comes back
    assert discovered is AlphaParser

    # None of the captured messages may be the multiple-plugins warning
    assert all(
        "Multiple date parsers found" not in message for message in caplog.messages
    ), "Unexpected warning logged when only one plugin was found"
|
||||||
|
|
||||||
|
def test_returns_first_valid_plugin_by_name(
    self,
    mocker: pytest_mock.MockerFixture,
) -> None:
    """The plugin named "alpha" must be chosen even though "beta" is listed first."""
    entries = {}
    for plugin_name, parser_class in (("alpha", AlphaParser), ("beta", BetaParser)):
        entry = mocker.MagicMock(spec=EntryPoint)
        entry.name = plugin_name
        entry.load.return_value = parser_class
        entries[plugin_name] = entry

    # Hand the entry points over in reverse name order
    mocker.patch(
        "documents.plugins.date_parsing.entry_points",
        return_value=(entries["beta"], entries["alpha"]),
    )

    assert _discover_parser_class() is AlphaParser
|
||||||
|
|
||||||
|
def test_logs_warning_if_multiple_plugins_found(
    self,
    mocker: pytest_mock.MockerFixture,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Two valid plugins yield the one named "a" plus a warning about the ambiguity."""
    first_entry = mocker.MagicMock(spec=EntryPoint)
    first_entry.name = "a"
    first_entry.load.return_value = AlphaParser

    second_entry = mocker.MagicMock(spec=EntryPoint)
    second_entry.name = "b"
    second_entry.load.return_value = BetaParser

    mocker.patch(
        "documents.plugins.date_parsing.entry_points",
        return_value=(first_entry, second_entry),
    )

    with caplog.at_level(
        logging.WARNING,
        logger="documents.plugins.date_parsing",
    ):
        discovered = _discover_parser_class()

    # The plugin registered under the alphabetically first name ("a") is expected
    assert discovered is AlphaParser

    # At least one captured message has to mention the multiple parsers
    assert any(
        "Multiple date parsers found" in message for message in caplog.messages
    ), "Expected a warning about multiple date parsers"
|
||||||
|
|
||||||
|
def test_cache_behavior_only_runs_once(
    self,
    mocker: pytest_mock.MockerFixture,
) -> None:
    """Calling discovery twice must query the entry points only a single time."""
    entry_points_mock = mocker.patch(
        "documents.plugins.date_parsing.entry_points",
        return_value=(),
    )

    # The first call is expected to populate the cache, the second to reuse it
    for _ in range(2):
        _discover_parser_class()

    entry_points_mock.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.date_parsing
@pytest.mark.usefixtures("mock_date_parser_settings")
class TestGetDateParser:
    """Exercise the ``get_date_parser()`` factory with discovery patched out."""

    def test_returns_instance_of_discovered_class(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """The factory instantiates the discovered class with a populated config."""
        mocker.patch(
            "documents.plugins.date_parsing._discover_parser_class",
            return_value=AlphaParser,
        )

        parser = get_date_parser()
        assert isinstance(parser, AlphaParser)

        config = parser.config
        assert isinstance(config, DateParserConfig)
        assert config.languages == ["en", "de"]
        assert config.timezone_str == "UTC"
        assert config.ignore_dates == [datetime.date(1900, 1, 1)]
        assert config.filename_date_order == "YMD"
        assert config.content_date_order == "DMY"

        # reference_time should be within two seconds of the current time
        seconds_from_now = abs((config.reference_time - timezone.now()).total_seconds())
        assert seconds_from_now < 2

    def test_uses_default_regex_parser_when_no_plugins(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """When discovery yields the regex plugin, the factory returns an instance of it."""
        mocker.patch(
            "documents.plugins.date_parsing._discover_parser_class",
            return_value=RegexDateParserPlugin,
        )

        assert isinstance(get_date_parser(), RegexDateParserPlugin)
|
||||||
433
src/documents/tests/date_parsing/test_date_parsing.py
Normal file
433
src/documents/tests/date_parsing/test_date_parsing.py
Normal file
@@ -0,0 +1,433 @@
|
|||||||
|
import datetime
|
||||||
|
import logging
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import pytest_mock
|
||||||
|
|
||||||
|
from documents.plugins.date_parsing.base import DateParserConfig
|
||||||
|
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.date_parsing
class TestParseString:
    """Tests for ``_parse_string`` as exercised through RegexDateParserPlugin."""

    @pytest.mark.parametrize(
        ("date_string", "date_order", "expected_year"),
        [
            pytest.param("15/01/2024", "DMY", 2024, id="dmy_slash"),
            pytest.param("01/15/2024", "MDY", 2024, id="mdy_slash"),
            pytest.param("2024/01/15", "YMD", 2024, id="ymd_slash"),
            pytest.param("January 15, 2024", "DMY", 2024, id="month_name_comma"),
            pytest.param("15 Jan 2024", "DMY", 2024, id="day_abbr_month_year"),
            pytest.param("15.01.2024", "DMY", 2024, id="dmy_dot"),
            pytest.param("2024-01-15", "YMD", 2024, id="ymd_dash"),
        ],
    )
    def test_parse_string_valid_formats(
        self,
        regex_parser: RegexDateParserPlugin,
        date_string: str,
        date_order: str,
        expected_year: int,
    ) -> None:
        """Each supported format parses to a value carrying the expected year."""
        parsed = regex_parser._parse_string(date_string, date_order)

        assert parsed is not None
        assert parsed.year == expected_year

    @pytest.mark.parametrize(
        "invalid_string",
        [
            pytest.param("not a date", id="plain_text"),
            pytest.param("32/13/2024", id="invalid_day_month"),
            pytest.param("", id="empty_string"),
            pytest.param("abc123xyz", id="alphanumeric_gibberish"),
            pytest.param("99/99/9999", id="out_of_range"),
        ],
    )
    def test_parse_string_invalid_input(
        self,
        regex_parser: RegexDateParserPlugin,
        invalid_string: str,
    ) -> None:
        """Strings that are not dates must parse to ``None``."""
        assert regex_parser._parse_string(invalid_string, "DMY") is None

    def test_parse_string_handles_exceptions(
        self,
        caplog: pytest.LogCaptureFixture,
        mocker: pytest_mock.MockerFixture,
        regex_parser: RegexDateParserPlugin,
    ) -> None:
        """An exception raised by dateparser is swallowed, logged and turned into ``None``."""
        # dateparser.parse has to be mocked to force the exception
        mocker.patch(
            "documents.plugins.date_parsing.base.dateparser.parse",
            side_effect=ValueError(
                "Parsing error: 01/01/2024",
            ),
        )

        with caplog.at_level(
            logging.ERROR,
            logger="documents.plugins.date_parsing.base",
        ):
            parsed = regex_parser._parse_string("01/01/2024", "DMY")

        assert parsed is None

        # Exactly one record, at ERROR level, must have been captured
        assert len(caplog.records) == 1
        assert caplog.records[0].levelname == "ERROR"

        # The log carries both the generic text and the original exception message
        assert "Error while parsing date string" in caplog.text
        assert "Parsing error: 01/01/2024" in caplog.text
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.date_parsing
class TestFilterDate:
    """Tests for ``_filter_date`` as exercised through RegexDateParserPlugin."""

    # Shorthand used by the parametrized cases and the tests below
    _UTC = datetime.timezone.utc

    @pytest.mark.parametrize(
        ("date", "expected_output"),
        [
            # Accepted dates come back unchanged
            pytest.param(
                datetime.datetime(2024, 1, 10, tzinfo=_UTC),
                datetime.datetime(2024, 1, 10, tzinfo=_UTC),
                id="valid_past_date",
            ),
            pytest.param(
                datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=_UTC),
                datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=_UTC),
                id="exactly_at_reference",
            ),
            pytest.param(
                datetime.datetime(1901, 1, 1, tzinfo=_UTC),
                datetime.datetime(1901, 1, 1, tzinfo=_UTC),
                id="year_1901_valid",
            ),
            # Later than reference_time
            pytest.param(
                datetime.datetime(2024, 1, 16, tzinfo=_UTC),
                None,
                id="future_date_day_after",
            ),
            # Calendar day listed in ignore_dates
            pytest.param(
                datetime.datetime(2024, 1, 1, 0, 0, 0, tzinfo=_UTC),
                None,
                id="ignored_date_midnight_jan1",
            ),
            pytest.param(
                datetime.datetime(2024, 1, 1, 10, 30, 0, tzinfo=_UTC),
                None,
                id="ignored_date_midday_jan1",
            ),
            pytest.param(
                datetime.datetime(2024, 12, 25, 15, 0, 0, tzinfo=_UTC),
                None,
                id="ignored_date_dec25_future",
            ),
            # Year of 1900 or earlier
            pytest.param(
                datetime.datetime(1899, 12, 31, tzinfo=_UTC),
                None,
                id="year_1899",
            ),
            pytest.param(
                datetime.datetime(1900, 1, 1, tzinfo=_UTC),
                None,
                id="year_1900_boundary",
            ),
            # No date at all
            pytest.param(None, None, id="none_input"),
        ],
    )
    def test_filter_date_validation_rules(
        self,
        config_with_ignore_dates: DateParserConfig,
        date: datetime.datetime | None,
        expected_output: datetime.datetime | None,
    ) -> None:
        """Every case must be either passed through unchanged or rejected as ``None``."""
        parser = RegexDateParserPlugin(config_with_ignore_dates)

        assert parser._filter_date(date) == expected_output

    def test_filter_date_respects_ignore_dates(
        self,
        config_with_ignore_dates: DateParserConfig,
    ) -> None:
        """Dates on an ignored day are dropped; a neighbouring day is kept."""
        parser = RegexDateParserPlugin(config_with_ignore_dates)

        ignored_date = datetime.datetime(2024, 1, 1, 12, 0, tzinfo=self._UTC)
        another_ignored = datetime.datetime(2024, 12, 25, 15, 30, tzinfo=self._UTC)
        allowed_date = datetime.datetime(2024, 1, 2, 12, 0, tzinfo=self._UTC)

        assert parser._filter_date(ignored_date) is None
        assert parser._filter_date(another_ignored) is None
        assert parser._filter_date(allowed_date) == allowed_date

    def test_filter_date_timezone_aware(
        self,
        regex_parser: RegexDateParserPlugin,
    ) -> None:
        """A timezone-aware input is accepted and keeps its tzinfo."""
        aware_input = datetime.datetime(2024, 1, 10, 12, 0, tzinfo=self._UTC)

        filtered = regex_parser._filter_date(aware_input)

        assert filtered is not None
        assert filtered.tzinfo is not None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.date_parsing
class TestRegexDateParser:
    """High-level tests for ``RegexDateParserPlugin.parse`` with dateparser mocked."""

    @pytest.mark.parametrize(
        ("filename", "content", "expected"),
        [
            pytest.param(
                "report-2023-12-25.txt",
                "Event recorded on 25/12/2022.",
                [
                    datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc),
                    datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
                ],
                id="filename-y-m-d_and_content-d-m-y",
            ),
            pytest.param(
                "img_2023.01.02.jpg",
                "Taken on 01/02/2023",
                [
                    datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc),
                    datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc),
                ],
                id="ambiguous-dates-respect-orders",
            ),
            pytest.param(
                "notes.txt",
                "bad date 99/99/9999 and 25/12/2022",
                [
                    datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
                ],
                id="parse-exception-skips-bad-and-yields-good",
            ),
        ],
    )
    def test_parse_returns_expected_dates(
        self,
        base_config: DateParserConfig,
        mocker: pytest_mock.MockerFixture,
        filename: str,
        content: str,
        expected: list[datetime.datetime],
    ) -> None:
        """
        High-level tests that exercise RegexDateParserPlugin.parse only.
        dateparser.parse is mocked so tests are deterministic.
        """
        parser = RegexDateParserPlugin(base_config)

        # Patch the dateparser.parse
        target = "documents.plugins.date_parsing.base.dateparser.parse"

        def fake_parse(
            date_string: str,
            settings: dict[str, Any] | None = None,
            locales: None = None,
        ) -> datetime.datetime | None:
            date_order = settings.get("DATE_ORDER") if settings else None

            # Filename-style YYYY-MM-DD / YYYY.MM.DD
            if "2023-12-25" in date_string or "2023.12.25" in date_string:
                return datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc)

            # content DMY 25/12/2022
            if "25/12/2022" in date_string or "25-12-2022" in date_string:
                return datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc)

            # filename YMD 2023.01.02
            if "2023.01.02" in date_string or "2023-01-02" in date_string:
                return datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc)

            # ambiguous 01/02/2023 -> respect DATE_ORDER setting
            if "01/02/2023" in date_string:
                if date_order == "DMY":
                    return datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc)
                if date_order == "YMD":
                    return datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc)
                # fallback
                return datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc)

            # simulate parse failure for malformed input
            if "99/99/9999" in date_string or "bad date" in date_string:
                raise Exception("parse failed for malformed date")

            return None

        mocker.patch(target, side_effect=fake_parse)

        results = list(parser.parse(filename, content))

        assert results == expected
        for dt in results:
            assert dt.tzinfo is not None

    def test_parse_filters_future_and_ignored_dates(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """
        Ensure parser filters out:
          - dates after reference_time
          - dates whose .date() are in ignore_dates
        """
        cfg = DateParserConfig(
            languages=["en"],
            timezone_str="UTC",
            ignore_dates={datetime.date(2023, 12, 10)},
            reference_time=datetime.datetime(
                2024,
                1,
                15,
                12,
                0,
                0,
                tzinfo=datetime.timezone.utc,
            ),
            filename_date_order="YMD",
            content_date_order="DMY",
        )
        parser = RegexDateParserPlugin(cfg)

        target = "documents.plugins.date_parsing.base.dateparser.parse"

        def fake_parse(
            date_string: str,
            settings: dict[str, Any] | None = None,
            locales: None = None,
        ) -> datetime.datetime | None:
            if "10/12/2023" in date_string or "10-12-2023" in date_string:
                # ignored date
                return datetime.datetime(2023, 12, 10, tzinfo=datetime.timezone.utc)
            if "01/02/2024" in date_string or "01-02-2024" in date_string:
                # future relative to reference_time -> filtered
                return datetime.datetime(2024, 2, 1, tzinfo=datetime.timezone.utc)
            if "05/01/2023" in date_string or "05-01-2023" in date_string:
                # valid
                return datetime.datetime(2023, 1, 5, tzinfo=datetime.timezone.utc)
            return None

        mocker.patch(target, side_effect=fake_parse)

        content = "Ignored: 10/12/2023, Future: 01/02/2024, Keep: 05/01/2023"
        results = list(parser.parse("whatever.txt", content))

        assert results == [datetime.datetime(2023, 1, 5, tzinfo=datetime.timezone.utc)]

    def test_parse_handles_no_matches_and_returns_empty_list(
        self,
        base_config: DateParserConfig,
    ) -> None:
        """
        When there are no matching date-like substrings, parse should yield nothing.
        """
        parser = RegexDateParserPlugin(base_config)
        results = list(
            parser.parse("no-dates.txt", "this has no dates whatsoever"),
        )
        assert results == []

    def test_parse_skips_filename_when_filename_date_order_none(
        self,
        mocker: pytest_mock.MockerFixture,
    ) -> None:
        """
        When filename_date_order is None the parser must not attempt to parse the filename.
        Only dates found in the content should be passed to dateparser.parse.
        """
        cfg = DateParserConfig(
            languages=["en"],
            timezone_str="UTC",
            ignore_dates=set(),
            reference_time=datetime.datetime(
                2024,
                1,
                15,
                12,
                0,
                0,
                tzinfo=datetime.timezone.utc,
            ),
            filename_date_order=None,
            content_date_order="DMY",
        )
        parser = RegexDateParserPlugin(cfg)

        # Patch the module's dateparser.parse so we can inspect calls
        target = "documents.plugins.date_parsing.base.dateparser.parse"

        def fake_parse(
            date_string: str,
            settings: dict[str, Any] | None = None,
            locales: None = None,
        ) -> datetime.datetime | None:
            # return distinct datetimes so we can tell which source was parsed
            if "25/12/2022" in date_string:
                return datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc)
            if "2023-12-25" in date_string:
                return datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc)
            return None

        mock_parse = mocker.patch(target, side_effect=fake_parse)

        filename = "report-2023-12-25.txt"
        content = "Event recorded on 25/12/2022."

        results = list(parser.parse(filename, content))

        # Only the content date should have been parsed -> one call
        assert mock_parse.call_count == 1

        # First call, first positional argument
        called_date_string = mock_parse.call_args_list[0].args[0]
        assert "25/12/2022" in called_date_string
        # And the parser should have yielded the corresponding datetime
        assert results == [
            datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
        ]
|
||||||
@@ -1978,11 +1978,11 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
|||||||
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
||||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||||
|
|
||||||
@mock.patch("documents.parsers.parse_date_generator")
|
@mock.patch("documents.views.get_date_parser")
|
||||||
@override_settings(NUMBER_OF_SUGGESTED_DATES=0)
|
@override_settings(NUMBER_OF_SUGGESTED_DATES=0)
|
||||||
def test_get_suggestions_dates_disabled(
|
def test_get_suggestions_dates_disabled(
|
||||||
self,
|
self,
|
||||||
parse_date_generator,
|
mock_get_date_parser: mock.MagicMock,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
@@ -1999,7 +1999,8 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
||||||
self.assertFalse(parse_date_generator.called)
|
|
||||||
|
mock_get_date_parser.assert_not_called()
|
||||||
|
|
||||||
def test_saved_views(self):
|
def test_saved_views(self):
|
||||||
u1 = User.objects.create_superuser("user1")
|
u1 = User.objects.create_superuser("user1")
|
||||||
|
|||||||
@@ -1,538 +0,0 @@
|
|||||||
import datetime
|
|
||||||
from zoneinfo import ZoneInfo
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from pytest_django.fixtures import SettingsWrapper
|
|
||||||
|
|
||||||
from documents.parsers import parse_date
|
|
||||||
from documents.parsers import parse_date_generator
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db()
|
|
||||||
class TestDate:
|
|
||||||
def test_date_format_1(self):
|
|
||||||
text = "lorem ipsum 130218 lorem ipsum"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_2(self):
|
|
||||||
text = "lorem ipsum 2018 lorem ipsum"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_3(self):
|
|
||||||
text = "lorem ipsum 20180213 lorem ipsum"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_4(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "lorem ipsum 13.02.2018 lorem ipsum"
|
|
||||||
date = parse_date("", text)
|
|
||||||
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
|
|
||||||
|
|
||||||
def test_date_format_5(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum"
|
|
||||||
date = parse_date("", text)
|
|
||||||
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
|
|
||||||
|
|
||||||
def test_date_format_6(self):
|
|
||||||
text = (
|
|
||||||
"lorem ipsum\n"
|
|
||||||
"Wohnort\n"
|
|
||||||
"3100\n"
|
|
||||||
"IBAN\n"
|
|
||||||
"AT87 4534\n"
|
|
||||||
"1234\n"
|
|
||||||
"1234 5678\n"
|
|
||||||
"BIC\n"
|
|
||||||
"lorem ipsum"
|
|
||||||
)
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_7(
|
|
||||||
self,
|
|
||||||
settings: SettingsWrapper,
|
|
||||||
settings_timezone: ZoneInfo,
|
|
||||||
):
|
|
||||||
settings.DATE_PARSER_LANGUAGES = ["de"]
|
|
||||||
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
|
|
||||||
date = parse_date("", text)
|
|
||||||
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
|
|
||||||
|
|
||||||
def test_date_format_8(
|
|
||||||
self,
|
|
||||||
settings: SettingsWrapper,
|
|
||||||
settings_timezone: ZoneInfo,
|
|
||||||
):
|
|
||||||
settings.DATE_PARSER_LANGUAGES = ["de"]
|
|
||||||
text = (
|
|
||||||
"lorem ipsum\n"
|
|
||||||
"Wohnort\n"
|
|
||||||
"3100\n"
|
|
||||||
"IBAN\n"
|
|
||||||
"AT87 4534\n"
|
|
||||||
"1234\n"
|
|
||||||
"1234 5678\n"
|
|
||||||
"BIC\n"
|
|
||||||
"lorem ipsum\n"
|
|
||||||
"März 2020"
|
|
||||||
)
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2020,
|
|
||||||
3,
|
|
||||||
1,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_9(
|
|
||||||
self,
|
|
||||||
settings: SettingsWrapper,
|
|
||||||
settings_timezone: ZoneInfo,
|
|
||||||
):
|
|
||||||
settings.DATE_PARSER_LANGUAGES = ["de"]
|
|
||||||
text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2020,
|
|
||||||
3,
|
|
||||||
1,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_10(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
22,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_11(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
22,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_12(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
22,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_13(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
22,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_14(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
22,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_15(self):
|
|
||||||
text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_16(self):
|
|
||||||
text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_17(self):
|
|
||||||
text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_18(self):
|
|
||||||
text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) is None
|
|
||||||
|
|
||||||
def test_date_format_19(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
21,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_20(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
22,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_21(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
2,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_22(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
23,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_23(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
24,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_24(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
21,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_25(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2022,
|
|
||||||
3,
|
|
||||||
25,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_date_format_26(self, settings_timezone: ZoneInfo):
|
|
||||||
text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. P0 Box 182051"
|
|
||||||
assert parse_date("", text) == datetime.datetime(
|
|
||||||
2019,
|
|
||||||
9,
|
|
||||||
25,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
tzinfo=settings_timezone,
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_crazy_date_past(self):
|
|
||||||
assert parse_date("", "01-07-0590 00:00:00") is None
|
|
||||||
|
|
||||||
def test_crazy_date_future(self):
|
|
||||||
assert parse_date("", "01-07-2350 00:00:00") is None
|
|
||||||
|
|
||||||
def test_crazy_date_with_spaces(self):
|
|
||||||
assert parse_date("", "20 408000l 2475") is None
|
|
||||||
|
|
||||||
def test_utf_month_names(
    self,
    settings: SettingsWrapper,
    settings_timezone: ZoneInfo,
):
    """
    GIVEN:
        - DATE_PARSER_LANGUAGES is set to fr, de, hr, cs, pl and tr
        - Content with month names that contain non-ASCII characters

    THEN:
        - Every sample is parsed to midnight of the expected day in the
          settings timezone
    """
    settings.DATE_PARSER_LANGUAGES = ["fr", "de", "hr", "cs", "pl", "tr"]

    # (content, (year, month, day)) pairs, checked in this order so the
    # first failing sample is the one reported.
    samples = (
        ("13 décembre 2023", (2023, 12, 13)),
        ("13 août 2022", (2022, 8, 13)),
        ("11 März 2020", (2020, 3, 11)),
        ("17. ožujka 2018.", (2018, 3, 17)),
        ("1. veljače 2016.", (2016, 2, 1)),
        ("15. února 1985", (1985, 2, 15)),
        ("30. září 2011", (2011, 9, 30)),
        ("28. května 1990", (1990, 5, 28)),
        ("1. grudzień 1997", (1997, 12, 1)),
        ("17 Şubat 2024", (2024, 2, 17)),
        ("30 Ağustos 2012", (2012, 8, 30)),
        ("17 Eylül 2000", (2000, 9, 17)),
        ("5. október 1992", (1992, 10, 5)),
    )

    for content, (year, month, day) in samples:
        assert parse_date("", content) == datetime.datetime(
            year,
            month,
            day,
            0,
            0,
            tzinfo=settings_timezone,
        )
|
|
||||||
|
|
||||||
def test_multiple_dates(self, settings_timezone: ZoneInfo):
    """
    GIVEN:
        - Content containing "02.02.2018", "22 July 2022", "December 2021"
          and "24-12-9999"

    THEN:
        - The generator yields the first three dates, in that order, at
          midnight in the settings timezone ("December 2021" as the 1st)
        - "24-12-9999" is not yielded
    """
    text = (
        "This text has multiple dates.\n"
        "For example 02.02.2018, 22 July 2022 and December 2021.\n"
        "But not 24-12-9999 because it's in the future..."
    )
    expected = [
        datetime.datetime(year, month, day, 0, 0, tzinfo=settings_timezone)
        for year, month, day in ((2018, 2, 2), (2022, 7, 22), (2021, 12, 1))
    ]

    found = [*parse_date_generator("", text)]

    assert found == expected
|
|
||||||
|
|
||||||
def test_filename_date_parse_valid_ymd(
    self,
    settings: SettingsWrapper,
    settings_timezone: ZoneInfo,
):
    """
    GIVEN:
        - FILENAME_DATE_ORDER is set to Year Month Day (YMD)
        - The filename contains "2022-04-01"
        - The content contains no date

    THEN:
        - The date is taken from the filename: 2022-04-01 at midnight
    """
    settings.FILENAME_DATE_ORDER = "YMD"
    expected = datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone)

    parsed = parse_date("/tmp/Scan-2022-04-01.pdf", "No date in here")

    assert parsed == expected
|
|
||||||
|
|
||||||
def test_filename_date_parse_valid_dmy(
    self,
    settings: SettingsWrapper,
    settings_timezone: ZoneInfo,
):
    """
    GIVEN:
        - FILENAME_DATE_ORDER is set to Day Month Year (DMY)
        - The filename contains "10.01.2021"
        - The content contains no date

    THEN:
        - The date is taken from the filename: 2021-01-10 at midnight
    """
    settings.FILENAME_DATE_ORDER = "DMY"
    expected = datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone)

    parsed = parse_date("/tmp/Scan-10.01.2021.pdf", "No date in here")

    assert parsed == expected
|
|
||||||
|
|
||||||
def test_filename_date_parse_invalid(self, settings: SettingsWrapper):
    """
    GIVEN:
        - FILENAME_DATE_ORDER is set to Year Month Day (YMD)
        - Neither the filename nor the content contains a date

    THEN:
        - Nothing is parsed and the result is None
    """
    settings.FILENAME_DATE_ORDER = "YMD"

    parsed = parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")

    assert parsed is None
|
|
||||||
|
|
||||||
def test_filename_date_ignored_use_content(
    self,
    settings: SettingsWrapper,
    settings_timezone: ZoneInfo,
):
    """
    GIVEN:
        - FILENAME_DATE_ORDER is set to Year Month Day (YMD)
        - The filename contains "2022-04-01"
        - 2022-04-01 is listed in IGNORE_DATES
        - The content contains "24.03.2022"

    THEN:
        - The filename date is skipped and the content date is returned:
          2022-03-24 at midnight
    """
    settings.FILENAME_DATE_ORDER = "YMD"
    settings.IGNORE_DATES = (datetime.date(2022, 4, 1),)
    expected = datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone)

    parsed = parse_date(
        "/tmp/Scan-2022-04-01.pdf",
        "The matching date is 24.03.2022",
    )

    assert parsed == expected
|
|
||||||
|
|
||||||
def test_ignored_dates_default_order(
    self,
    settings: SettingsWrapper,
    settings_timezone: ZoneInfo,
):
    """
    GIVEN:
        - IGNORE_DATES holds 2019-11-03 and 2020-01-17
        - The content contains "110319", "20200117" and "13.02.2018"

    THEN:
        - The result is 2018-02-13 at midnight in the settings timezone
    """
    ignored = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
    settings.IGNORE_DATES = ignored
    expected = datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
    content = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum"

    assert parse_date("", content) == expected
|
|
||||||
|
|
||||||
def test_ignored_dates_order_ymd(
    self,
    settings: SettingsWrapper,
    settings_timezone: ZoneInfo,
):
    """
    GIVEN:
        - FILENAME_DATE_ORDER is set to Year Month Day (YMD)
        - IGNORE_DATES holds 2019-11-03 and 2020-01-17
        - The content contains "190311", "20200117" and "13.02.2018"

    THEN:
        - The result is 2018-02-13 at midnight in the settings timezone
    """
    settings.FILENAME_DATE_ORDER = "YMD"
    ignored = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
    settings.IGNORE_DATES = ignored
    expected = datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
    content = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum"

    assert parse_date("", content) == expected
|
|
||||||
@@ -148,7 +148,6 @@ from documents.models import Workflow
|
|||||||
from documents.models import WorkflowAction
|
from documents.models import WorkflowAction
|
||||||
from documents.models import WorkflowTrigger
|
from documents.models import WorkflowTrigger
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
from documents.parsers import get_parser_class_for_mime_type
|
||||||
from documents.parsers import parse_date_generator
|
|
||||||
from documents.permissions import AcknowledgeTasksPermissions
|
from documents.permissions import AcknowledgeTasksPermissions
|
||||||
from documents.permissions import PaperlessAdminPermissions
|
from documents.permissions import PaperlessAdminPermissions
|
||||||
from documents.permissions import PaperlessNotePermissions
|
from documents.permissions import PaperlessNotePermissions
|
||||||
@@ -158,6 +157,7 @@ from documents.permissions import get_document_count_filter_for_user
|
|||||||
from documents.permissions import get_objects_for_user_owner_aware
|
from documents.permissions import get_objects_for_user_owner_aware
|
||||||
from documents.permissions import has_perms_owner_aware
|
from documents.permissions import has_perms_owner_aware
|
||||||
from documents.permissions import set_permissions_for_object
|
from documents.permissions import set_permissions_for_object
|
||||||
|
from documents.plugins.date_parsing import get_date_parser
|
||||||
from documents.schema import generate_object_with_permissions_schema
|
from documents.schema import generate_object_with_permissions_schema
|
||||||
from documents.serialisers import AcknowledgeTasksViewSerializer
|
from documents.serialisers import AcknowledgeTasksViewSerializer
|
||||||
from documents.serialisers import BulkDownloadSerializer
|
from documents.serialisers import BulkDownloadSerializer
|
||||||
@@ -1023,16 +1023,17 @@ class DocumentViewSet(
|
|||||||
|
|
||||||
dates = []
|
dates = []
|
||||||
if settings.NUMBER_OF_SUGGESTED_DATES > 0:
|
if settings.NUMBER_OF_SUGGESTED_DATES > 0:
|
||||||
gen = parse_date_generator(doc.filename, doc.content)
|
with get_date_parser() as date_parser:
|
||||||
dates = sorted(
|
gen = date_parser.parse(doc.filename, doc.content)
|
||||||
{
|
dates = sorted(
|
||||||
i
|
{
|
||||||
for i in itertools.islice(
|
i
|
||||||
gen,
|
for i in itertools.islice(
|
||||||
settings.NUMBER_OF_SUGGESTED_DATES,
|
gen,
|
||||||
)
|
settings.NUMBER_OF_SUGGESTED_DATES,
|
||||||
},
|
)
|
||||||
)
|
},
|
||||||
|
)
|
||||||
|
|
||||||
resp_data = {
|
resp_data = {
|
||||||
"correspondents": [
|
"correspondents": [
|
||||||
|
|||||||
@@ -1,12 +1,7 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from adrf.views import APIView
|
|
||||||
from adrf.viewsets import ModelViewSet
|
|
||||||
from adrf.viewsets import ReadOnlyModelViewSet
|
|
||||||
from asgiref.sync import sync_to_async
|
|
||||||
from django.http import HttpResponseBadRequest
|
from django.http import HttpResponseBadRequest
|
||||||
from django.http import HttpResponseForbidden
|
from django.http import HttpResponseForbidden
|
||||||
from django.http import HttpResponseRedirect
|
from django.http import HttpResponseRedirect
|
||||||
@@ -20,9 +15,11 @@ from httpx_oauth.oauth2 import GetAccessTokenError
|
|||||||
from rest_framework import serializers
|
from rest_framework import serializers
|
||||||
from rest_framework.decorators import action
|
from rest_framework.decorators import action
|
||||||
from rest_framework.filters import OrderingFilter
|
from rest_framework.filters import OrderingFilter
|
||||||
|
from rest_framework.generics import GenericAPIView
|
||||||
from rest_framework.permissions import IsAuthenticated
|
from rest_framework.permissions import IsAuthenticated
|
||||||
from rest_framework.request import Request
|
|
||||||
from rest_framework.response import Response
|
from rest_framework.response import Response
|
||||||
|
from rest_framework.viewsets import ModelViewSet
|
||||||
|
from rest_framework.viewsets import ReadOnlyModelViewSet
|
||||||
|
|
||||||
from documents.filters import ObjectOwnedOrGrantedPermissionsFilter
|
from documents.filters import ObjectOwnedOrGrantedPermissionsFilter
|
||||||
from documents.permissions import PaperlessObjectPermissions
|
from documents.permissions import PaperlessObjectPermissions
|
||||||
@@ -42,8 +39,6 @@ from paperless_mail.serialisers import MailRuleSerializer
|
|||||||
from paperless_mail.serialisers import ProcessedMailSerializer
|
from paperless_mail.serialisers import ProcessedMailSerializer
|
||||||
from paperless_mail.tasks import process_mail_accounts
|
from paperless_mail.tasks import process_mail_accounts
|
||||||
|
|
||||||
logger: logging.Logger = logging.getLogger("paperless_mail")
|
|
||||||
|
|
||||||
|
|
||||||
@extend_schema_view(
|
@extend_schema_view(
|
||||||
test=extend_schema(
|
test=extend_schema(
|
||||||
@@ -71,75 +66,71 @@ logger: logging.Logger = logging.getLogger("paperless_mail")
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
class MailAccountViewSet(ModelViewSet, PassUserMixin):
|
class MailAccountViewSet(ModelViewSet, PassUserMixin):
|
||||||
|
model = MailAccount
|
||||||
|
|
||||||
queryset = MailAccount.objects.all().order_by("pk")
|
queryset = MailAccount.objects.all().order_by("pk")
|
||||||
serializer_class = MailAccountSerializer
|
serializer_class = MailAccountSerializer
|
||||||
pagination_class = StandardPagination
|
pagination_class = StandardPagination
|
||||||
permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
|
permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
|
||||||
filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,)
|
filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,)
|
||||||
|
|
||||||
def get_permissions(self) -> list[Any]:
|
def get_permissions(self):
|
||||||
if self.action == "test":
|
if self.action == "test":
|
||||||
return [IsAuthenticated()]
|
# Test action does not require object level permissions
|
||||||
|
self.permission_classes = (IsAuthenticated,)
|
||||||
return super().get_permissions()
|
return super().get_permissions()
|
||||||
|
|
||||||
@action(methods=["post"], detail=False)
|
@action(methods=["post"], detail=False)
|
||||||
async def test(self, request: Request) -> Response | HttpResponseBadRequest:
|
def test(self, request):
|
||||||
|
logger = logging.getLogger("paperless_mail")
|
||||||
request.data["name"] = datetime.datetime.now().isoformat()
|
request.data["name"] = datetime.datetime.now().isoformat()
|
||||||
serializer = self.get_serializer(data=request.data)
|
serializer = self.get_serializer(data=request.data)
|
||||||
|
serializer.is_valid(raise_exception=True)
|
||||||
|
|
||||||
# Validation must be wrapped because of sync DB validators
|
# account exists, use the password from there instead of *** and refresh_token / expiration
|
||||||
await sync_to_async(serializer.is_valid)(raise_exception=True)
|
|
||||||
|
|
||||||
validated_data: dict[str, Any] = serializer.validated_data
|
|
||||||
|
|
||||||
if (
|
if (
|
||||||
len(str(validated_data.get("password", "")).replace("*", "")) == 0
|
len(serializer.validated_data.get("password").replace("*", "")) == 0
|
||||||
and request.data.get("id") is not None
|
and request.data["id"] is not None
|
||||||
):
|
):
|
||||||
existing_account = await MailAccount.objects.aget(pk=request.data["id"])
|
existing_account = MailAccount.objects.get(pk=request.data["id"])
|
||||||
validated_data.update(
|
serializer.validated_data["password"] = existing_account.password
|
||||||
{
|
serializer.validated_data["account_type"] = existing_account.account_type
|
||||||
"password": existing_account.password,
|
serializer.validated_data["refresh_token"] = existing_account.refresh_token
|
||||||
"account_type": existing_account.account_type,
|
serializer.validated_data["expiration"] = existing_account.expiration
|
||||||
"refresh_token": existing_account.refresh_token,
|
|
||||||
"expiration": existing_account.expiration,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
account = MailAccount(**validated_data)
|
account = MailAccount(**serializer.validated_data)
|
||||||
|
with get_mailbox(
|
||||||
def _blocking_imap_test() -> bool:
|
account.imap_server,
|
||||||
with get_mailbox(
|
account.imap_port,
|
||||||
account.imap_server,
|
account.imap_security,
|
||||||
account.imap_port,
|
) as M:
|
||||||
account.imap_security,
|
try:
|
||||||
) as m_box:
|
|
||||||
if (
|
if (
|
||||||
account.is_token
|
account.is_token
|
||||||
and account.expiration
|
and account.expiration is not None
|
||||||
and account.expiration < timezone.now()
|
and account.expiration < timezone.now()
|
||||||
):
|
):
|
||||||
oauth_manager = PaperlessMailOAuth2Manager()
|
oauth_manager = PaperlessMailOAuth2Manager()
|
||||||
if oauth_manager.refresh_account_oauth_token(existing_account):
|
if oauth_manager.refresh_account_oauth_token(existing_account):
|
||||||
# User is not changing password and token needs to be refreshed
|
# User is not changing password and token needs to be refreshed
|
||||||
|
existing_account.refresh_from_db()
|
||||||
account.password = existing_account.password
|
account.password = existing_account.password
|
||||||
else:
|
else:
|
||||||
raise MailError("Unable to refresh oauth token")
|
raise MailError("Unable to refresh oauth token")
|
||||||
mailbox_login(m_box, account)
|
|
||||||
return True
|
|
||||||
|
|
||||||
try:
|
mailbox_login(M, account)
|
||||||
await sync_to_async(_blocking_imap_test, thread_sensitive=False)()
|
return Response({"success": True})
|
||||||
return Response({"success": True})
|
except MailError as e:
|
||||||
except MailError as e:
|
logger.error(
|
||||||
logger.error(f"Mail account {account} test failed: {e}")
|
f"Mail account {account} test failed: {e}",
|
||||||
return HttpResponseBadRequest("Unable to connect to server")
|
)
|
||||||
|
return HttpResponseBadRequest("Unable to connect to server")
|
||||||
|
|
||||||
@action(methods=["post"], detail=True)
|
@action(methods=["post"], detail=True)
|
||||||
async def process(self, request: Request, pk: int | None = None) -> Response:
|
def process(self, request, pk=None):
|
||||||
# FIX: Use aget_object() provided by adrf to avoid SynchronousOnlyOperation
|
account = self.get_object()
|
||||||
account = await self.aget_object()
|
|
||||||
process_mail_accounts.delay([account.pk])
|
process_mail_accounts.delay([account.pk])
|
||||||
|
|
||||||
return Response({"result": "OK"})
|
return Response({"result": "OK"})
|
||||||
|
|
||||||
|
|
||||||
@@ -153,38 +144,21 @@ class ProcessedMailViewSet(ReadOnlyModelViewSet, PassUserMixin):
|
|||||||
ObjectOwnedOrGrantedPermissionsFilter,
|
ObjectOwnedOrGrantedPermissionsFilter,
|
||||||
)
|
)
|
||||||
filterset_class = ProcessedMailFilterSet
|
filterset_class = ProcessedMailFilterSet
|
||||||
|
|
||||||
queryset = ProcessedMail.objects.all().order_by("-processed")
|
queryset = ProcessedMail.objects.all().order_by("-processed")
|
||||||
|
|
||||||
@action(methods=["post"], detail=False)
|
@action(methods=["post"], detail=False)
|
||||||
async def bulk_delete(
|
def bulk_delete(self, request):
|
||||||
self,
|
mail_ids = request.data.get("mail_ids", [])
|
||||||
request: Request,
|
|
||||||
) -> Response | HttpResponseBadRequest | HttpResponseForbidden:
|
|
||||||
mail_ids: list[int] = request.data.get("mail_ids", [])
|
|
||||||
if not isinstance(mail_ids, list) or not all(
|
if not isinstance(mail_ids, list) or not all(
|
||||||
isinstance(i, int) for i in mail_ids
|
isinstance(i, int) for i in mail_ids
|
||||||
):
|
):
|
||||||
return HttpResponseBadRequest("mail_ids must be a list of integers")
|
return HttpResponseBadRequest("mail_ids must be a list of integers")
|
||||||
|
mails = ProcessedMail.objects.filter(id__in=mail_ids)
|
||||||
# Store objects to delete after verification
|
for mail in mails:
|
||||||
to_delete: list[ProcessedMail] = []
|
if not has_perms_owner_aware(request.user, "delete_processedmail", mail):
|
||||||
|
|
||||||
# We must verify permissions for every requested ID
|
|
||||||
async for mail in ProcessedMail.objects.filter(id__in=mail_ids):
|
|
||||||
can_delete = await sync_to_async(has_perms_owner_aware)(
|
|
||||||
request.user,
|
|
||||||
"delete_processedmail",
|
|
||||||
mail,
|
|
||||||
)
|
|
||||||
if not can_delete:
|
|
||||||
# This is what the test is looking for: 403 on permission failure
|
|
||||||
return HttpResponseForbidden("Insufficient permissions")
|
return HttpResponseForbidden("Insufficient permissions")
|
||||||
to_delete.append(mail)
|
mail.delete()
|
||||||
|
|
||||||
# Only perform deletions if all items passed the permission check
|
|
||||||
for mail in to_delete:
|
|
||||||
await mail.adelete()
|
|
||||||
|
|
||||||
return Response({"result": "OK", "deleted_mail_ids": mail_ids})
|
return Response({"result": "OK", "deleted_mail_ids": mail_ids})
|
||||||
|
|
||||||
|
|
||||||
@@ -204,74 +178,77 @@ class MailRuleViewSet(ModelViewSet, PassUserMixin):
|
|||||||
responses={200: None},
|
responses={200: None},
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
class OauthCallbackView(APIView):
|
class OauthCallbackView(GenericAPIView):
|
||||||
permission_classes = (IsAuthenticated,)
|
permission_classes = (IsAuthenticated,)
|
||||||
|
|
||||||
async def get(
|
def get(self, request, format=None):
|
||||||
self,
|
if not (
|
||||||
request: Request,
|
request.user and request.user.has_perms(["paperless_mail.add_mailaccount"])
|
||||||
) -> Response | HttpResponseBadRequest | HttpResponseRedirect:
|
):
|
||||||
has_perm = await sync_to_async(request.user.has_perm)(
|
|
||||||
"paperless_mail.add_mailaccount",
|
|
||||||
)
|
|
||||||
if not has_perm:
|
|
||||||
return HttpResponseBadRequest(
|
return HttpResponseBadRequest(
|
||||||
"You do not have permission to add mail accounts",
|
"You do not have permission to add mail accounts",
|
||||||
)
|
)
|
||||||
|
|
||||||
code: str | None = request.query_params.get("code")
|
logger = logging.getLogger("paperless_mail")
|
||||||
state: str | None = request.query_params.get("state")
|
code = request.query_params.get("code")
|
||||||
scope: str | None = request.query_params.get("scope")
|
# Gmail passes scope as a query param, Outlook does not
|
||||||
|
scope = request.query_params.get("scope")
|
||||||
|
|
||||||
if not code or not state:
|
if code is None:
|
||||||
return HttpResponseBadRequest("Invalid request parameters")
|
logger.error(
|
||||||
|
f"Invalid oauth callback request, code: {code}, scope: {scope}",
|
||||||
|
)
|
||||||
|
return HttpResponseBadRequest("Invalid request, see logs for more detail")
|
||||||
|
|
||||||
oauth_manager = PaperlessMailOAuth2Manager(
|
oauth_manager = PaperlessMailOAuth2Manager(
|
||||||
state=request.session.get("oauth_state"),
|
state=request.session.get("oauth_state"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
state = request.query_params.get("state", "")
|
||||||
if not oauth_manager.validate_state(state):
|
if not oauth_manager.validate_state(state):
|
||||||
return HttpResponseBadRequest("Invalid OAuth state")
|
logger.error(
|
||||||
|
f"Invalid oauth callback request received state: {state}, expected: {oauth_manager.state}",
|
||||||
|
)
|
||||||
|
return HttpResponseBadRequest("Invalid request, see logs for more detail")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
defaults: dict[str, Any] = {
|
if scope is not None and "google" in scope:
|
||||||
"username": "",
|
# Google
|
||||||
"imap_security": MailAccount.ImapSecurity.SSL,
|
|
||||||
"imap_port": 993,
|
|
||||||
}
|
|
||||||
|
|
||||||
if scope and "google" in scope:
|
|
||||||
account_type = MailAccount.MailAccountType.GMAIL_OAUTH
|
account_type = MailAccount.MailAccountType.GMAIL_OAUTH
|
||||||
imap_server = "imap.gmail.com"
|
imap_server = "imap.gmail.com"
|
||||||
defaults.update(
|
defaults = {
|
||||||
{
|
"name": f"Gmail OAuth {timezone.now()}",
|
||||||
"name": f"Gmail OAuth {timezone.now()}",
|
"username": "",
|
||||||
"account_type": account_type,
|
"imap_security": MailAccount.ImapSecurity.SSL,
|
||||||
},
|
"imap_port": 993,
|
||||||
)
|
"account_type": account_type,
|
||||||
result = await sync_to_async(oauth_manager.get_gmail_access_token)(code)
|
}
|
||||||
else:
|
result = oauth_manager.get_gmail_access_token(code)
|
||||||
|
|
||||||
|
elif scope is None:
|
||||||
|
# Outlook
|
||||||
account_type = MailAccount.MailAccountType.OUTLOOK_OAUTH
|
account_type = MailAccount.MailAccountType.OUTLOOK_OAUTH
|
||||||
imap_server = "outlook.office365.com"
|
imap_server = "outlook.office365.com"
|
||||||
defaults.update(
|
defaults = {
|
||||||
{
|
"name": f"Outlook OAuth {timezone.now()}",
|
||||||
"name": f"Outlook OAuth {timezone.now()}",
|
"username": "",
|
||||||
"account_type": account_type,
|
"imap_security": MailAccount.ImapSecurity.SSL,
|
||||||
},
|
"imap_port": 993,
|
||||||
)
|
"account_type": account_type,
|
||||||
result = await sync_to_async(oauth_manager.get_outlook_access_token)(
|
}
|
||||||
code,
|
|
||||||
)
|
|
||||||
|
|
||||||
account, _ = await MailAccount.objects.aupdate_or_create(
|
result = oauth_manager.get_outlook_access_token(code)
|
||||||
|
|
||||||
|
access_token = result["access_token"]
|
||||||
|
refresh_token = result["refresh_token"]
|
||||||
|
expires_in = result["expires_in"]
|
||||||
|
account, _ = MailAccount.objects.update_or_create(
|
||||||
|
password=access_token,
|
||||||
|
is_token=True,
|
||||||
imap_server=imap_server,
|
imap_server=imap_server,
|
||||||
refresh_token=result["refresh_token"],
|
refresh_token=refresh_token,
|
||||||
defaults={
|
expiration=timezone.now() + timedelta(seconds=expires_in),
|
||||||
**defaults,
|
defaults=defaults,
|
||||||
"password": result["access_token"],
|
|
||||||
"is_token": True,
|
|
||||||
"expiration": timezone.now()
|
|
||||||
+ timedelta(seconds=result["expires_in"]),
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
return HttpResponseRedirect(
|
return HttpResponseRedirect(
|
||||||
f"{oauth_manager.oauth_redirect_url}?oauth_success=1&account_id={account.pk}",
|
f"{oauth_manager.oauth_redirect_url}?oauth_success=1&account_id={account.pk}",
|
||||||
|
|||||||
Reference in New Issue
Block a user