mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-09-06 21:13:43 -05:00
Fix: include application config language settings for dateparser auto-detection (#10722)
This commit is contained in:
@@ -17,8 +17,6 @@ from dateparser.languages.loader import LocaleDataLoader
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from paperless.utils import ocr_to_dateparser_languages
|
||||
|
||||
logger = logging.getLogger("paperless.settings")
|
||||
|
||||
# Tap paperless.conf if it's available
|
||||
@@ -1184,61 +1182,6 @@ DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
|
||||
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
||||
|
||||
|
||||
def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
|
||||
"""
|
||||
Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")
|
||||
into a list of locales compatible with the `dateparser` library.
|
||||
|
||||
- If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").
|
||||
Falls back to the base language (e.g., "az") if needed.
|
||||
- If a language cannot be mapped or validated, it is skipped with a warning.
|
||||
- Returns a list of valid locales, or an empty list if none could be converted.
|
||||
"""
|
||||
ocr_to_dateparser = ocr_to_dateparser_languages()
|
||||
loader = LocaleDataLoader()
|
||||
result = []
|
||||
try:
|
||||
for ocr_language in ocr_languages.split("+"):
|
||||
# Split into language and optional script
|
||||
ocr_lang_part, *script = ocr_language.split("_")
|
||||
ocr_script_part = script[0] if script else None
|
||||
|
||||
language_part = ocr_to_dateparser.get(ocr_lang_part)
|
||||
if language_part is None:
|
||||
logger.debug(
|
||||
f'Unable to map OCR language "{ocr_lang_part}" to dateparser locale. ',
|
||||
)
|
||||
continue
|
||||
|
||||
# Ensure base language is supported by dateparser
|
||||
loader.get_locale_map(locales=[language_part])
|
||||
|
||||
# Try to add the script part if it's supported by dateparser
|
||||
if ocr_script_part:
|
||||
dateparser_language = f"{language_part}-{ocr_script_part.title()}"
|
||||
try:
|
||||
loader.get_locale_map(locales=[dateparser_language])
|
||||
except Exception:
|
||||
logger.info(
|
||||
f"Language variant '{dateparser_language}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
|
||||
)
|
||||
dateparser_language = language_part
|
||||
else:
|
||||
dateparser_language = language_part
|
||||
if dateparser_language not in result:
|
||||
result.append(dateparser_language)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
|
||||
)
|
||||
return []
|
||||
if not result:
|
||||
logger.info(
|
||||
"Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
def _parse_dateparser_languages(languages: str | None):
|
||||
language_list = languages.split("+") if languages else []
|
||||
# There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib.
|
||||
@@ -1253,12 +1196,14 @@ def _parse_dateparser_languages(languages: str | None):
|
||||
return list(LocaleDataLoader().get_locale_map(locales=language_list))
|
||||
|
||||
|
||||
if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"):
|
||||
DATE_PARSER_LANGUAGES = _parse_dateparser_languages(
|
||||
# If not set, we will infer it at runtime
|
||||
DATE_PARSER_LANGUAGES = (
|
||||
_parse_dateparser_languages(
|
||||
os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"),
|
||||
)
|
||||
else:
|
||||
DATE_PARSER_LANGUAGES = _ocr_to_dateparser_languages(OCR_LANGUAGE)
|
||||
if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES")
|
||||
else None
|
||||
)
|
||||
|
||||
|
||||
# Maximum number of dates taken from document start to end to show as suggestions for
|
||||
|
Reference in New Issue
Block a user