mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-02-24 00:59:35 -06:00
Feature: Enable users to customize date parsing via plugins (#11931)
This commit is contained in:
101
src/documents/plugins/date_parsing/__init__.py
Normal file
101
src/documents/plugins/date_parsing/__init__.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import logging
|
||||
from functools import lru_cache
|
||||
from importlib.metadata import EntryPoint
|
||||
from importlib.metadata import entry_points
|
||||
from typing import Final
|
||||
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
|
||||
from documents.plugins.date_parsing.base import DateParserConfig
|
||||
from documents.plugins.date_parsing.base import DateParserPluginBase
|
||||
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.utils import ocr_to_dateparser_languages
|
||||
|
||||
# Module-level logger for the date parsing plugin package.
logger = logging.getLogger(__name__)

# Entry point group that is queried when looking for date parser plugins.
DATE_PARSER_ENTRY_POINT_GROUP: Final[str] = "paperless_ngx.date_parsers"
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def _discover_parser_class() -> type[DateParserPluginBase]:
    """
    Discover the date parser plugin class to use.

    Entry points registered under ``DATE_PARSER_ENTRY_POINT_GROUP`` are loaded
    and kept only when they resolve to a subclass of ``DateParserPluginBase``.

    - If one or more valid plugins are found, they are sorted by entry point
      name and the class of the first one is returned.
    - If no valid plugins are found, the default ``RegexDateParserPlugin`` is
      returned.

    Failures while querying or loading entry points are logged and never
    propagate to the caller. The result is memoized by ``lru_cache``.

    Returns:
        The parser class that should be instantiated.
    """
    eps: tuple[EntryPoint, ...]
    try:
        eps = entry_points(group=DATE_PARSER_ENTRY_POINT_GROUP)
    except Exception as e:
        # Discovery is best-effort: fall through to the default parser.
        logger.warning(f"Could not query entry points for date parsers: {e}")
        eps = ()

    # Keep the class object returned by load() next to the entry point name,
    # so the selected plugin does not have to be loaded a second time (and
    # outside of any error handling) once the winner is known.
    valid_plugins: list[tuple[str, type[DateParserPluginBase]]] = []
    for ep in eps:
        try:
            plugin_class = ep.load()
        except Exception as e:
            logger.error(f"Unable to load date parser plugin {ep.name}: {e}")
            continue

        # issubclass() raises TypeError for non-class objects, so make sure
        # the entry point actually resolved to a class before checking.
        if isinstance(plugin_class, type) and issubclass(
            plugin_class,
            DateParserPluginBase,
        ):
            valid_plugins.append((ep.name, plugin_class))
        else:
            logger.warning(f"Plugin {ep.name} does not subclass DateParser.")

    if not valid_plugins:
        return RegexDateParserPlugin

    # Sorting by name makes the choice deterministic when several plugins
    # are installed.
    valid_plugins.sort(key=lambda plugin: plugin[0])

    if len(valid_plugins) > 1:
        logger.warning(
            f"Multiple date parsers found: "
            f"{[name for name, _ in valid_plugins]}. "
            f"Using the first one by name: '{valid_plugins[0][0]}'.",
        )

    return valid_plugins[0][1]
|
||||
|
||||
|
||||
def get_date_parser() -> DateParserPluginBase:
    """
    Build and return an initialized date parser instance.

    Steps performed:
      1. Resolve the parser class (plugin or default) via
         ``_discover_parser_class()``.
      2. Collect the parser configuration from Django settings and
         ``OcrConfig``.
      3. Construct the parser class with that configuration.

    Returns:
        A new instance of the discovered parser class.
    """
    # Step 1: resolve the class to build (the discovery helper is cached).
    parser_cls = _discover_parser_class()

    # Step 2: gather the configuration values.
    # TODO: Get the language from the settings and/or configuration object
    ocr_settings = OcrConfig()
    parser_languages = settings.DATE_PARSER_LANGUAGES
    if not parser_languages:
        # Nothing configured explicitly: derive the languages from the
        # OCR language instead.
        parser_languages = ocr_to_dateparser_languages(ocr_settings.language)

    parser_config = DateParserConfig(
        languages=parser_languages,
        timezone_str=settings.TIME_ZONE,
        ignore_dates=settings.IGNORE_DATES,
        reference_time=timezone.now(),
        filename_date_order=settings.FILENAME_DATE_ORDER,
        content_date_order=settings.DATE_ORDER,
    )

    # Step 3: hand the configuration to the discovered class.
    return parser_cls(config=parser_config)
|
||||
|
||||
|
||||
# Names exported by this package.
__all__ = [
    "DateParserConfig",
    "DateParserPluginBase",
    "RegexDateParserPlugin",
    "get_date_parser",
]
|
||||
124
src/documents/plugins/date_parsing/base.py
Normal file
124
src/documents/plugins/date_parsing/base.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import datetime
|
||||
import logging
|
||||
from abc import ABC
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Iterator
|
||||
from dataclasses import dataclass
|
||||
from types import TracebackType
|
||||
|
||||
try:
|
||||
from typing import Self
|
||||
except ImportError:
|
||||
from typing_extensions import Self
|
||||
|
||||
import dateparser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class DateParserConfig:
|
||||
"""
|
||||
Configuration for a DateParser instance.
|
||||
|
||||
This object is created by the factory and passed to the
|
||||
parser's constructor, decoupling the parser from settings.
|
||||
"""
|
||||
|
||||
languages: list[str]
|
||||
timezone_str: str
|
||||
ignore_dates: set[datetime.date]
|
||||
|
||||
# A "now" timestamp for filtering future dates.
|
||||
# Passed in by the factory.
|
||||
reference_time: datetime.datetime
|
||||
|
||||
# Settings for the default RegexDateParser
|
||||
# Other plugins should use or consider these, but it is not required
|
||||
filename_date_order: str | None
|
||||
content_date_order: str
|
||||
|
||||
|
||||
class DateParserPluginBase(ABC):
    """
    Abstract base class every date parsing plugin derives from.

    An instance receives all of its settings through a DateParserConfig
    object and can be used as a context manager. Subclasses must implement
    ``parse``.
    """

    def __init__(self, config: DateParserConfig):
        """
        Store the configuration for later use by the helper methods.
        """
        self.config = config

    def __enter__(self) -> Self:
        """
        Enter the runtime context and return the parser itself.

        Subclasses can override this to acquire resources (connections,
        handles).
        """
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """
        Leave the runtime context.

        Subclasses can override this to release resources.
        """
        # Nothing to clean up by default. Returning None means an exception
        # raised inside the ``with`` block is propagated.
        return None

    def _parse_string(
        self,
        date_string: str,
        date_order: str,
    ) -> datetime.datetime | None:
        """
        Parse a single date string with dateparser.

        The timezone and languages are taken from ``self.config``; the date
        order is supplied by the caller. Any exception raised while parsing
        is logged and turned into ``None``.
        """
        try:
            parser_settings = {
                "DATE_ORDER": date_order,
                "PREFER_DAY_OF_MONTH": "first",
                "RETURN_AS_TIMEZONE_AWARE": True,
                "TIMEZONE": self.config.timezone_str,
            }
            parsed = dateparser.parse(
                date_string,
                settings=parser_settings,
                locales=self.config.languages,
            )
        except Exception as e:
            logger.error(f"Error while parsing date string '{date_string}': {e}")
            return None
        return parsed

    def _filter_date(
        self,
        date: datetime.datetime | None,
    ) -> datetime.datetime | None:
        """
        Validate a parsed datetime against ``self.config``.

        Returns the datetime unchanged when its year is after 1900, it is
        not later than the configured reference time and its calendar date
        is not in the ignore list. Returns ``None`` otherwise, including
        when ``None`` was passed in.
        """
        if date is None:
            return None
        if not date.year > 1900:
            return None
        # Dates after the reference time ("now") are rejected.
        if not date <= self.config.reference_time:
            return None
        if date.date() in self.config.ignore_dates:
            return None
        return date

    @abstractmethod
    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
        """
        Parse a document's filename and content, yielding valid datetime objects.
        """
|
||||
65
src/documents/plugins/date_parsing/regex_parser.py
Normal file
65
src/documents/plugins/date_parsing/regex_parser.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import datetime
|
||||
import re
|
||||
from collections.abc import Iterator
|
||||
from re import Match
|
||||
|
||||
from documents.plugins.date_parsing.base import DateParserPluginBase
|
||||
|
||||
|
||||
class RegexDateParserPlugin(DateParserPluginBase):
    """
    Default date parser, driven by a set of regular expressions.

    All of its settings come from the DateParserConfig object given to the
    constructor.
    """

    # One alternative per line, tried in order (case-insensitive):
    #   1. 1-2 digits, 1-2 digits, then 4 or 2 digits, separated by . / -
    #   2. 4 or 2 digits, 1-2 digits, 1-2 digits, separated by . / -
    #   3. "<1-2 digits> <word> <4 digits>" or "<word> <1-2 digits>, <4 digits>"
    #   4. "<letters> <1-2 digits>, <4 digits>"
    #   5. "<letters> <4 digits>"
    #   6. 1-2 digits followed by two non-space, non-digit characters, a word
    #      and 4 digits
    #   7. 1-2 digits, three letters and 4 digits
    # NOTE(review): the leading `(?!=([_-]))` is a negative lookahead for a
    # literal "=" followed by "_" or "-". It reads like a mistyped lookbehind
    # `(?<=[_-])` - confirm the intent before touching it, since changing it
    # alters which strings match.
    DATE_REGEX = re.compile(
        r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
        r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
        re.IGNORECASE,
    )

    def _process_match(
        self,
        match: Match[str],
        date_order: str,
    ) -> datetime.datetime | None:
        """
        Turn one regex match into a validated datetime, or ``None``.

        The full matched text is parsed and then filtered with the helpers
        inherited from the base class.
        """
        parsed = self._parse_string(match.group(0), date_order)
        return self._filter_date(parsed)

    def _process_content(
        self,
        content: str,
        date_order: str,
    ) -> Iterator[datetime.datetime]:
        """
        Yield a valid date for every regex match found in ``content``.

        Matches that cannot be parsed or are filtered out are skipped.
        """
        candidates = (
            self._process_match(found, date_order)
            for found in re.finditer(self.DATE_REGEX, content)
        )
        yield from (
            candidate for candidate in candidates if candidate is not None
        )

    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
        """
        Yield dates found in the filename first, then in the content.

        The filename is only searched when ``filename_date_order`` is set in
        ``self.config``; the content is always searched using
        ``content_date_order``.
        """
        filename_order = self.config.filename_date_order
        if filename_order:
            yield from self._process_content(filename, filename_order)

        yield from self._process_content(content, self.config.content_date_order)
|
||||
Reference in New Issue
Block a user