mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-01-28 22:59:03 -06:00
Copy over the code and tests, to see if this even works
This commit is contained in:
66
src/documents/plugins/date_parsing/regex_parser.py
Normal file
66
src/documents/plugins/date_parsing/regex_parser.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import datetime
|
||||
import re
|
||||
from collections.abc import Iterator
|
||||
from pathlib import Path
|
||||
from re import Match
|
||||
|
||||
from documents.plugins.date_parsing.base import DateParserPluginBase
|
||||
|
||||
|
||||
class RegexDateParserPlugin(DateParserPluginBase):
|
||||
"""
|
||||
The default date parser, using a series of regular expressions.
|
||||
|
||||
It is configured entirely by the DateParserConfig object
|
||||
passed to its constructor.
|
||||
"""
|
||||
|
||||
DATE_REGEX = re.compile(
|
||||
r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
|
||||
r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
def _process_match(
|
||||
self,
|
||||
match: Match[str],
|
||||
date_order: str,
|
||||
) -> datetime.datetime | None:
|
||||
"""
|
||||
Processes a single regex match using the base class helpers.
|
||||
"""
|
||||
date_string = match.group(0)
|
||||
date = self._parse_string(date_string, date_order)
|
||||
return self._filter_date(date)
|
||||
|
||||
def _process_content(
|
||||
self,
|
||||
content: str,
|
||||
date_order: str,
|
||||
) -> Iterator[datetime.datetime]:
|
||||
"""
|
||||
Finds all regex matches in content and yields valid dates.
|
||||
"""
|
||||
for m in re.finditer(self.DATE_REGEX, content):
|
||||
date = self._process_match(m, date_order)
|
||||
if date is not None:
|
||||
yield date
|
||||
|
||||
def parse(self, filename: Path, content: str) -> Iterator[datetime.datetime]:
|
||||
"""
|
||||
Implementation of the abstract parse method.
|
||||
|
||||
Reads its configuration from `self.config`.
|
||||
"""
|
||||
if self.config.filename_date_order:
|
||||
yield from self._process_content(
|
||||
filename.name,
|
||||
self.config.filename_date_order,
|
||||
)
|
||||
|
||||
yield from self._process_content(content, self.config.content_date_order)
|
||||
Reference in New Issue
Block a user