mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-09-01 01:46:16 +00:00
Fix: include application config language settings for dateparser auto-detection (#10722)
This commit is contained in:
@@ -19,6 +19,8 @@ from documents.loggers import LoggingMixin
|
||||
from documents.signals import document_consumer_declaration
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.utils import ocr_to_dateparser_languages
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
@@ -272,6 +274,11 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
|
||||
"""
|
||||
import dateparser
|
||||
|
||||
ocr_config = OcrConfig()
|
||||
languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
|
||||
ocr_config.language,
|
||||
)
|
||||
|
||||
return dateparser.parse(
|
||||
ds,
|
||||
settings={
|
||||
@@ -280,7 +287,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
|
||||
"RETURN_AS_TIMEZONE_AWARE": True,
|
||||
"TIMEZONE": settings.TIME_ZONE,
|
||||
},
|
||||
locales=settings.DATE_PARSER_LANGUAGES,
|
||||
locales=languages,
|
||||
)
|
||||
|
||||
def __filter(date: datetime.datetime) -> datetime.datetime | None:
|
||||
|
@@ -1,12 +1,14 @@
|
||||
import datetime
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
import pytest
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
|
||||
from documents.parsers import parse_date
|
||||
from documents.parsers import parse_date_generator
|
||||
|
||||
|
||||
@pytest.mark.django_db()
|
||||
class TestDate:
|
||||
def test_date_format_1(self):
|
||||
text = "lorem ipsum 130218 lorem ipsum"
|
||||
@@ -49,7 +51,7 @@ class TestDate:
|
||||
settings: SettingsWrapper,
|
||||
settings_timezone: ZoneInfo,
|
||||
):
|
||||
settings.DATE_PARSER_LANGUAGES = []
|
||||
settings.DATE_PARSER_LANGUAGES = ["de"]
|
||||
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
|
||||
date = parse_date("", text)
|
||||
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
|
||||
|
@@ -17,8 +17,6 @@ from dateparser.languages.loader import LocaleDataLoader
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from paperless.utils import ocr_to_dateparser_languages
|
||||
|
||||
logger = logging.getLogger("paperless.settings")
|
||||
|
||||
# Tap paperless.conf if it's available
|
||||
@@ -1184,61 +1182,6 @@ DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
|
||||
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
||||
|
||||
|
||||
def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
    """
    Translate a Tesseract OCR_LANGUAGE value into locales usable by `dateparser`.

    The input is a "+"-separated list of OCR language codes (e.g. "eng+fra"),
    each optionally followed by "_<script>" (e.g. "aze_Cyrl").

    - Codes missing from the OCR-to-dateparser map are skipped (logged at debug level).
    - When a script is given, the "<language>-<Script>" locale (e.g. "az-Cyrl") is
      used if dateparser can load it; otherwise the base language (e.g. "az") is used.
    - Each locale appears at most once in the result.
    - Any unexpected error is logged as a warning and an empty list is returned.

    Returns the list of locales, which is empty when nothing could be converted.
    """
    code_map = ocr_to_dateparser_languages()
    loader = LocaleDataLoader()
    locales = []
    try:
        for entry in ocr_languages.split("+"):
            # First piece is the language code, second (if any) is the script
            pieces = entry.split("_")
            ocr_code = pieces[0]
            script_code = pieces[1] if len(pieces) > 1 else None

            base_locale = code_map.get(ocr_code)
            if base_locale is None:
                logger.debug(
                    f'Unable to map OCR language "{ocr_code}" to dateparser locale. ',
                )
                continue

            # Raises if dateparser does not know the base language;
            # the outer handler then aborts the whole conversion
            loader.get_locale_map(locales=[base_locale])

            # Start from the base language and upgrade to the script variant
            # only when dateparser is able to load it
            chosen = base_locale
            if script_code:
                variant = f"{base_locale}-{script_code.title()}"
                try:
                    loader.get_locale_map(locales=[variant])
                except Exception:
                    logger.info(
                        f"Language variant '{variant}' not supported by dateparser; falling back to base language '{base_locale}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
                    )
                else:
                    chosen = variant

            if chosen not in locales:
                locales.append(chosen)
    except Exception as e:
        logger.warning(
            f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
        )
        return []
    if not locales:
        logger.info(
            "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
        )
    return locales
|
||||
|
||||
|
||||
def _parse_dateparser_languages(languages: str | None):
|
||||
language_list = languages.split("+") if languages else []
|
||||
# There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib.
|
||||
@@ -1253,12 +1196,14 @@ def _parse_dateparser_languages(languages: str | None):
|
||||
return list(LocaleDataLoader().get_locale_map(locales=language_list))
|
||||
|
||||
|
||||
if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"):
|
||||
DATE_PARSER_LANGUAGES = _parse_dateparser_languages(
|
||||
# If not set, we will infer it at runtime
|
||||
DATE_PARSER_LANGUAGES = (
|
||||
_parse_dateparser_languages(
|
||||
os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"),
|
||||
)
|
||||
else:
|
||||
DATE_PARSER_LANGUAGES = _ocr_to_dateparser_languages(OCR_LANGUAGE)
|
||||
if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES")
|
||||
else None
|
||||
)
|
||||
|
||||
|
||||
# Maximum number of dates taken from document start to end to show as suggestions for
|
||||
|
@@ -6,7 +6,6 @@ from unittest import mock
|
||||
import pytest
|
||||
from celery.schedules import crontab
|
||||
|
||||
from paperless.settings import _ocr_to_dateparser_languages
|
||||
from paperless.settings import _parse_base_paths
|
||||
from paperless.settings import _parse_beat_schedule
|
||||
from paperless.settings import _parse_dateparser_languages
|
||||
@@ -476,33 +475,6 @@ class TestPathSettings(TestCase):
|
||||
self.assertEqual("/foobar/", base_paths[4]) # LOGOUT_REDIRECT_URL
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("ocr_language", "expected"),
    [
        # Single OCR language
        ("eng", ["en"]),
        # Several OCR languages joined with "+"
        ("fra+ita+lao", ["fr", "it", "lo"]),
        # Language whose dateparser code is not a two-letter one
        ("fil", ["fil"]),
        # Script suffix that dateparser supports
        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
        # Script suffix that dateparser does not support:
        # the plain language is expected instead
        ("deu_frak", ["de"]),
        # Traditional and simplified Chinese are named differently in dateparser,
        # so both collapse to the general Chinese language
        ("chi_tra+chi_sim", ["zh"]),
        # Unsupported entries are dropped, supported ones are kept
        ("eng+unsupported_language+por", ["en", "pt"]),
        # Nothing supported at all: empty list
        ("unsupported1+unsupported2", []),
    ],
)
def test_ocr_to_dateparser_languages(ocr_language, expected):
    """The converted locales match the expected ones, regardless of order."""
    converted = _ocr_to_dateparser_languages(ocr_language)
    assert sorted(converted) == sorted(expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("languages", "expected"),
|
||||
[
|
||||
|
52
src/paperless/tests/test_utils.py
Normal file
52
src/paperless/tests/test_utils.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import logging
|
||||
|
||||
import pytest
|
||||
|
||||
from paperless import utils
|
||||
from paperless.utils import ocr_to_dateparser_languages
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("ocr_language", "expected"),
    [
        # Single OCR language
        ("eng", ["en"]),
        # Several OCR languages joined with "+"
        ("fra+ita+lao", ["fr", "it", "lo"]),
        # Language whose dateparser code is not a two-letter one
        ("fil", ["fil"]),
        # Script suffix that dateparser supports
        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
        # Script suffix that dateparser does not support:
        # the plain language is expected instead
        ("deu_frak", ["de"]),
        # Traditional and simplified Chinese are named differently in dateparser,
        # so both collapse to the general Chinese language
        ("chi_tra+chi_sim", ["zh"]),
        # Unsupported entries are dropped, supported ones are kept
        ("eng+unsupported_language+por", ["en", "pt"]),
        # Nothing supported at all: empty list
        ("unsupported1+unsupported2", []),
        # The same language twice must yield a single entry
        ("eng+eng", ["en"]),
        # Known language combined with an unknown script
        ("ita_unknownscript", ["it"]),
    ],
)
def test_ocr_to_dateparser_languages(ocr_language, expected):
    """The converted locales match the expected ones, regardless of order."""
    converted = ocr_to_dateparser_languages(ocr_language)
    assert sorted(converted) == sorted(expected)
|
||||
|
||||
|
||||
def test_ocr_to_dateparser_languages_exception(monkeypatch, caplog):
    """A failing locale loader gives an empty result and a warning naming the override setting."""

    class DummyLoader:
        # Replacement loader whose locale lookup always fails
        def get_locale_map(self, locales=None):
            raise RuntimeError("Simulated error")

    expected_hint = "Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this"

    with caplog.at_level(logging.WARNING):
        # Calling the class produces an instance, so it can stand in for
        # the loader factory directly
        monkeypatch.setattr(utils, "LocaleDataLoader", DummyLoader)
        result = utils.ocr_to_dateparser_languages("eng+fra")
        assert result == []
        assert expected_hint in caplog.text
|
@@ -1,4 +1,10 @@
|
||||
def ocr_to_dateparser_languages() -> dict[str, str]:
|
||||
import logging
|
||||
|
||||
from dateparser.languages.loader import LocaleDataLoader
|
||||
|
||||
logger = logging.getLogger("paperless.utils")
|
||||
|
||||
OCR_TO_DATEPARSER_LANGUAGES = {
|
||||
"""
|
||||
Translation map from languages supported by Tesseract OCR
|
||||
to languages supported by dateparser.
|
||||
@@ -14,97 +20,150 @@ def ocr_to_dateparser_languages() -> dict[str, str]:
|
||||
# agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
|
||||
# ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
|
||||
# rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
|
||||
return {
|
||||
"afr": "af",
|
||||
"amh": "am",
|
||||
"ara": "ar",
|
||||
"asm": "as",
|
||||
"ast": "ast",
|
||||
"aze": "az",
|
||||
"bel": "be",
|
||||
"bul": "bg",
|
||||
"ben": "bn",
|
||||
"bod": "bo",
|
||||
"bre": "br",
|
||||
"bos": "bs",
|
||||
"cat": "ca",
|
||||
"cher": "chr",
|
||||
"ces": "cs",
|
||||
"cym": "cy",
|
||||
"dan": "da",
|
||||
"deu": "de",
|
||||
"dzo": "dz",
|
||||
"ell": "el",
|
||||
"eng": "en",
|
||||
"epo": "eo",
|
||||
"spa": "es",
|
||||
"est": "et",
|
||||
"eus": "eu",
|
||||
"fas": "fa",
|
||||
"fin": "fi",
|
||||
"fil": "fil",
|
||||
"fao": "fo", # codespell:ignore
|
||||
"fra": "fr",
|
||||
"fry": "fy",
|
||||
"gle": "ga",
|
||||
"gla": "gd",
|
||||
"glg": "gl",
|
||||
"guj": "gu",
|
||||
"heb": "he",
|
||||
"hin": "hi",
|
||||
"hrv": "hr",
|
||||
"hun": "hu",
|
||||
"hye": "hy",
|
||||
"ind": "id",
|
||||
"isl": "is",
|
||||
"ita": "it",
|
||||
"jpn": "ja",
|
||||
"kat": "ka",
|
||||
"kaz": "kk",
|
||||
"khm": "km",
|
||||
"knda": "kn",
|
||||
"kor": "ko",
|
||||
"kir": "ky",
|
||||
"ltz": "lb",
|
||||
"lao": "lo",
|
||||
"lit": "lt",
|
||||
"lav": "lv",
|
||||
"mal": "ml",
|
||||
"mon": "mn",
|
||||
"mar": "mr",
|
||||
"msa": "ms",
|
||||
"mlt": "mt",
|
||||
"mya": "my",
|
||||
"nep": "ne",
|
||||
"nld": "nl",
|
||||
"ori": "or",
|
||||
"pan": "pa",
|
||||
"pol": "pl",
|
||||
"pus": "ps",
|
||||
"por": "pt",
|
||||
"que": "qu",
|
||||
"ron": "ro",
|
||||
"rus": "ru",
|
||||
"sin": "si",
|
||||
"slk": "sk",
|
||||
"slv": "sl",
|
||||
"sqi": "sq",
|
||||
"srp": "sr",
|
||||
"swe": "sv",
|
||||
"swa": "sw",
|
||||
"tam": "ta",
|
||||
"tel": "te", # codespell:ignore
|
||||
"tha": "th", # codespell:ignore
|
||||
"tir": "ti",
|
||||
"tgl": "tl",
|
||||
"ton": "to",
|
||||
"tur": "tr",
|
||||
"uig": "ug",
|
||||
"ukr": "uk",
|
||||
"urd": "ur",
|
||||
"uzb": "uz",
|
||||
"via": "vi",
|
||||
"yid": "yi",
|
||||
"yor": "yo",
|
||||
"chi": "zh",
|
||||
}
|
||||
"afr": "af",
|
||||
"amh": "am",
|
||||
"ara": "ar",
|
||||
"asm": "as",
|
||||
"ast": "ast",
|
||||
"aze": "az",
|
||||
"bel": "be",
|
||||
"bul": "bg",
|
||||
"ben": "bn",
|
||||
"bod": "bo",
|
||||
"bre": "br",
|
||||
"bos": "bs",
|
||||
"cat": "ca",
|
||||
"cher": "chr",
|
||||
"ces": "cs",
|
||||
"cym": "cy",
|
||||
"dan": "da",
|
||||
"deu": "de",
|
||||
"dzo": "dz",
|
||||
"ell": "el",
|
||||
"eng": "en",
|
||||
"epo": "eo",
|
||||
"spa": "es",
|
||||
"est": "et",
|
||||
"eus": "eu",
|
||||
"fas": "fa",
|
||||
"fin": "fi",
|
||||
"fil": "fil",
|
||||
"fao": "fo", # codespell:ignore
|
||||
"fra": "fr",
|
||||
"fry": "fy",
|
||||
"gle": "ga",
|
||||
"gla": "gd",
|
||||
"glg": "gl",
|
||||
"guj": "gu",
|
||||
"heb": "he",
|
||||
"hin": "hi",
|
||||
"hrv": "hr",
|
||||
"hun": "hu",
|
||||
"hye": "hy",
|
||||
"ind": "id",
|
||||
"isl": "is",
|
||||
"ita": "it",
|
||||
"jpn": "ja",
|
||||
"kat": "ka",
|
||||
"kaz": "kk",
|
||||
"khm": "km",
|
||||
"knda": "kn",
|
||||
"kor": "ko",
|
||||
"kir": "ky",
|
||||
"ltz": "lb",
|
||||
"lao": "lo",
|
||||
"lit": "lt",
|
||||
"lav": "lv",
|
||||
"mal": "ml",
|
||||
"mon": "mn",
|
||||
"mar": "mr",
|
||||
"msa": "ms",
|
||||
"mlt": "mt",
|
||||
"mya": "my",
|
||||
"nep": "ne",
|
||||
"nld": "nl",
|
||||
"ori": "or",
|
||||
"pan": "pa",
|
||||
"pol": "pl",
|
||||
"pus": "ps",
|
||||
"por": "pt",
|
||||
"que": "qu",
|
||||
"ron": "ro",
|
||||
"rus": "ru",
|
||||
"sin": "si",
|
||||
"slk": "sk",
|
||||
"slv": "sl",
|
||||
"sqi": "sq",
|
||||
"srp": "sr",
|
||||
"swe": "sv",
|
||||
"swa": "sw",
|
||||
"tam": "ta",
|
||||
"tel": "te", # codespell:ignore
|
||||
"tha": "th", # codespell:ignore
|
||||
"tir": "ti",
|
||||
"tgl": "tl",
|
||||
"ton": "to",
|
||||
"tur": "tr",
|
||||
"uig": "ug",
|
||||
"ukr": "uk",
|
||||
"urd": "ur",
|
||||
"uzb": "uz",
|
||||
"via": "vi",
|
||||
"yid": "yi",
|
||||
"yor": "yo",
|
||||
"chi": "zh",
|
||||
}
|
||||
|
||||
|
||||
def ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
    """
    Translate a Tesseract OCR_LANGUAGE value into locales usable by `dateparser`.

    The input is a "+"-separated list of OCR language codes (e.g. "eng+fra"),
    each optionally followed by "_<script>" (e.g. "aze_Cyrl").

    - Codes missing from OCR_TO_DATEPARSER_LANGUAGES are skipped (logged at debug level).
    - When a script is given, the "<language>-<Script>" locale (e.g. "az-Cyrl") is
      used if dateparser can load it; otherwise the base language (e.g. "az") is used.
    - Each locale appears at most once in the result.
    - Any unexpected error is logged as a warning and an empty list is returned.

    Returns the list of locales, which is empty when nothing could be converted.
    """
    loader = LocaleDataLoader()
    locales = []
    try:
        for entry in ocr_languages.split("+"):
            # First piece is the language code, second (if any) is the script
            pieces = entry.split("_")
            ocr_code = pieces[0]
            script_code = pieces[1] if len(pieces) > 1 else None

            base_locale = OCR_TO_DATEPARSER_LANGUAGES.get(ocr_code)
            if base_locale is None:
                logger.debug(
                    f'Unable to map OCR language "{ocr_code}" to dateparser locale. ',
                )
                continue

            # Raises if dateparser does not know the base language;
            # the outer handler then aborts the whole conversion
            loader.get_locale_map(locales=[base_locale])

            # Start from the base language and upgrade to the script variant
            # only when dateparser is able to load it
            chosen = base_locale
            if script_code:
                variant = f"{base_locale}-{script_code.title()}"
                try:
                    loader.get_locale_map(locales=[variant])
                except Exception:
                    logger.info(
                        f"Language variant '{variant}' not supported by dateparser; falling back to base language '{base_locale}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
                    )
                else:
                    chosen = variant

            if chosen not in locales:
                locales.append(chosen)
    except Exception as e:
        logger.warning(
            f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
        )
        return []
    if not locales:
        logger.info(
            "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
        )
    return locales
|
||||
|
Reference in New Issue
Block a user