mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-01-28 22:59:03 -06:00
Hooks up the class and fixes up the old testing. Includes ocr to date parser conversion we now do
This commit is contained in:
@@ -9,22 +9,17 @@ import subprocess
|
||||
import tempfile
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from re import Match
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
|
||||
from documents.loggers import LoggingMixin
|
||||
from documents.signals import document_consumer_declaration
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.utils import ocr_to_dateparser_languages
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from collections.abc import Iterator
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
@@ -259,75 +254,6 @@ def make_thumbnail_from_pdf(in_path: Path, temp_dir: Path, logging_group=None) -
|
||||
return out_path
|
||||
|
||||
|
||||
def parse_date(filename, text) -> datetime.datetime | None:
|
||||
return next(parse_date_generator(filename, text), None)
|
||||
|
||||
|
||||
def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
|
||||
"""
|
||||
Returns the date of the document.
|
||||
"""
|
||||
|
||||
def __parser(ds: str, date_order: str) -> datetime.datetime:
|
||||
"""
|
||||
Call dateparser.parse with a particular date ordering
|
||||
"""
|
||||
import dateparser
|
||||
|
||||
ocr_config = OcrConfig()
|
||||
languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
|
||||
ocr_config.language,
|
||||
)
|
||||
|
||||
return dateparser.parse(
|
||||
ds,
|
||||
settings={
|
||||
"DATE_ORDER": date_order,
|
||||
"PREFER_DAY_OF_MONTH": "first",
|
||||
"RETURN_AS_TIMEZONE_AWARE": True,
|
||||
"TIMEZONE": settings.TIME_ZONE,
|
||||
},
|
||||
locales=languages,
|
||||
)
|
||||
|
||||
def __filter(date: datetime.datetime) -> datetime.datetime | None:
|
||||
if (
|
||||
date is not None
|
||||
and date.year > 1900
|
||||
and date <= timezone.now()
|
||||
and date.date() not in settings.IGNORE_DATES
|
||||
):
|
||||
return date
|
||||
return None
|
||||
|
||||
def __process_match(
|
||||
match: Match[str],
|
||||
date_order: str,
|
||||
) -> datetime.datetime | None:
|
||||
date_string = match.group(0)
|
||||
|
||||
try:
|
||||
date = __parser(date_string, date_order)
|
||||
except Exception:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
date = None
|
||||
|
||||
return __filter(date)
|
||||
|
||||
def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
|
||||
for m in re.finditer(DATE_REGEX, content):
|
||||
date = __process_match(m, date_order)
|
||||
if date is not None:
|
||||
yield date
|
||||
|
||||
# if filename date parsing is enabled, search there first:
|
||||
if settings.FILENAME_DATE_ORDER:
|
||||
yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
|
||||
|
||||
# Iterate through all regex matches in text and try to parse the date
|
||||
yield from __process_content(text, settings.DATE_ORDER)
|
||||
|
||||
|
||||
class ParseError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
Reference in New Issue
Block a user