Fix: include application config language settings for dateparser auto-detection (#10722)

This commit is contained in:
shamoon
2025-08-31 15:22:39 -07:00
committed by GitHub
parent 107374af71
commit cb927c5b22
6 changed files with 223 additions and 186 deletions

View File

@@ -19,6 +19,8 @@ from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless.config import OcrConfig
from paperless.utils import ocr_to_dateparser_languages
if TYPE_CHECKING:
import datetime
@@ -272,6 +274,11 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
"""
import dateparser
ocr_config = OcrConfig()
languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
ocr_config.language,
)
return dateparser.parse(
ds,
settings={
@@ -280,7 +287,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
"RETURN_AS_TIMEZONE_AWARE": True,
"TIMEZONE": settings.TIME_ZONE,
},
locales=settings.DATE_PARSER_LANGUAGES,
locales=languages,
)
def __filter(date: datetime.datetime) -> datetime.datetime | None:

View File

@@ -1,12 +1,14 @@
import datetime
from zoneinfo import ZoneInfo
import pytest
from pytest_django.fixtures import SettingsWrapper
from documents.parsers import parse_date
from documents.parsers import parse_date_generator
@pytest.mark.django_db()
class TestDate:
def test_date_format_1(self):
text = "lorem ipsum 130218 lorem ipsum"
@@ -49,7 +51,7 @@ class TestDate:
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
settings.DATE_PARSER_LANGUAGES = []
settings.DATE_PARSER_LANGUAGES = ["de"]
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
date = parse_date("", text)
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)