Compare commits

..

2 Commits

Author SHA1 Message Date
GitHub Actions
a6e41b4145 Auto translate strings 2025-08-31 22:25:05 +00:00
shamoon
cb927c5b22 Fix: include application config language settings for dateparser auto-detection (#10722) 2025-08-31 15:22:39 -07:00
9 changed files with 307 additions and 333 deletions

View File

@@ -4,13 +4,10 @@ import logging
import pickle
import re
import warnings
from functools import lru_cache
from hashlib import sha256
from pathlib import Path
from typing import TYPE_CHECKING
import joblib
if TYPE_CHECKING:
from collections.abc import Iterator
from datetime import datetime
@@ -53,24 +50,7 @@ class ClassifierModelCorruptError(Exception):
pass
def _model_cache_token() -> tuple[str, int, int]:
    """
    Build a cache key describing the current state of the model file.

    Returns a ``(path, mtime, size)`` tuple for ``settings.MODEL_FILE``, with
    mtime and size truncated to ints. When the file does not exist, or
    ``stat()`` fails with an ``OSError``, mtime and size are both ``0``.
    """
    model_path = Path(settings.MODEL_FILE)
    missing_token = (str(model_path), 0, 0)

    if not model_path.exists():
        return missing_token

    try:
        info = model_path.stat()
    except OSError:
        # File vanished or became unreadable between exists() and stat()
        return missing_token

    return (str(model_path), int(info.st_mtime), int(info.st_size))
@lru_cache(maxsize=1)
def _load_classifier_cached(
token: tuple[str, int, int],
*,
raise_exception: bool = False,
) -> DocumentClassifier | None:
# token used only for cache key; logic depends on current settings
def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None:
if not settings.MODEL_FILE.is_file():
logger.debug(
"Document classification model does not exist (yet), not "
@@ -81,23 +61,20 @@ def _load_classifier_cached(
classifier = DocumentClassifier()
try:
classifier.load()
except IncompatibleClassifierVersionError as e:
logger.info(f"Classifier version incompatible: {e.message}, will re-train")
try:
Path(settings.MODEL_FILE).unlink()
except Exception:
pass
Path(settings.MODEL_FILE).unlink()
classifier = None
if raise_exception:
raise e
except ClassifierModelCorruptError as e:
# there's something wrong with the model file.
logger.exception(
"Unrecoverable error while loading document classification model, deleting model file.",
"Unrecoverable error while loading document "
"classification model, deleting model file.",
)
try:
Path(settings.MODEL_FILE).unlink()
except Exception:
pass
Path(settings.MODEL_FILE).unlink
classifier = None
if raise_exception:
raise e
@@ -115,17 +92,11 @@ def _load_classifier_cached(
return classifier
def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None:
    """
    Load the document classifier via the cached loader.

    The current ``_model_cache_token()`` is passed to
    ``_load_classifier_cached`` as its cache key, and ``raise_exception`` is
    forwarded unchanged.
    """
    return _load_classifier_cached(
        _model_cache_token(),
        raise_exception=raise_exception,
    )
class DocumentClassifier:
# v7 - Updated scikit-learn package version
# v8 - Added storage path classifier
# v9 - Changed from hashing to time/ids for re-train check
# v10 - Switch persistence to joblib with memory-mapping to reduce load-time memory spikes
FORMAT_VERSION = 10
FORMAT_VERSION = 9
def __init__(self) -> None:
# last time a document changed and therefore training might be required
@@ -161,57 +132,28 @@ class DocumentClassifier:
# Catch warnings for processing
with warnings.catch_warnings(record=True) as w:
state = None
try:
state = joblib.load(settings.MODEL_FILE, mmap_mode="r")
except ValueError:
# Some environments may fail to mmap small files; fall back to normal load
state = joblib.load(settings.MODEL_FILE, mmap_mode=None)
except Exception as err:
# Fallback to old pickle-based format. Try to read the version and a field to
# distinguish truly corrupt files from incompatible versions.
try:
with Path(settings.MODEL_FILE).open("rb") as f:
_version = pickle.load(f)
try:
_ = pickle.load(f)
except Exception as inner:
raise ClassifierModelCorruptError from inner
# Old, incompatible format
raise IncompatibleClassifierVersionError(
"Cannot load classifier, incompatible versions.",
) from err
except (
IncompatibleClassifierVersionError,
ClassifierModelCorruptError,
):
raise
except Exception:
# Not even a readable pickle header
raise ClassifierModelCorruptError from err
with Path(settings.MODEL_FILE).open("rb") as f:
schema_version = pickle.load(f)
if (
not isinstance(state, dict)
or state.get("format_version") != self.FORMAT_VERSION
):
raise IncompatibleClassifierVersionError(
"Cannot load classifier, incompatible versions.",
)
if schema_version != self.FORMAT_VERSION:
raise IncompatibleClassifierVersionError(
"Cannot load classifier, incompatible versions.",
)
else:
try:
self.last_doc_change_time = pickle.load(f)
self.last_auto_type_hash = pickle.load(f)
try:
self.last_doc_change_time = state.get("last_doc_change_time")
self.last_auto_type_hash = state.get("last_auto_type_hash")
self.data_vectorizer = pickle.load(f)
self._update_data_vectorizer_hash()
self.tags_binarizer = pickle.load(f)
self.data_vectorizer = state.get("data_vectorizer")
self._update_data_vectorizer_hash()
self.tags_binarizer = state.get("tags_binarizer")
self.tags_classifier = state.get("tags_classifier")
self.correspondent_classifier = state.get("correspondent_classifier")
self.document_type_classifier = state.get("document_type_classifier")
self.storage_path_classifier = state.get("storage_path_classifier")
except Exception as err:
raise ClassifierModelCorruptError from err
self.tags_classifier = pickle.load(f)
self.correspondent_classifier = pickle.load(f)
self.document_type_classifier = pickle.load(f)
self.storage_path_classifier = pickle.load(f)
except Exception as err:
raise ClassifierModelCorruptError from err
# Check for the warning about unpickling from differing versions
# and consider it incompatible
@@ -230,28 +172,24 @@ class DocumentClassifier:
def save(self) -> None:
target_file: Path = settings.MODEL_FILE
target_file_temp: Path = target_file.with_suffix(".joblib.part")
target_file_temp: Path = target_file.with_suffix(".pickle.part")
state = {
"format_version": self.FORMAT_VERSION,
"last_doc_change_time": self.last_doc_change_time,
"last_auto_type_hash": self.last_auto_type_hash,
"data_vectorizer": self.data_vectorizer,
"tags_binarizer": self.tags_binarizer,
"tags_classifier": self.tags_classifier,
"correspondent_classifier": self.correspondent_classifier,
"document_type_classifier": self.document_type_classifier,
"storage_path_classifier": self.storage_path_classifier,
}
with target_file_temp.open("wb") as f:
pickle.dump(self.FORMAT_VERSION, f)
joblib.dump(state, target_file_temp, compress=3)
pickle.dump(self.last_doc_change_time, f)
pickle.dump(self.last_auto_type_hash, f)
pickle.dump(self.data_vectorizer, f)
pickle.dump(self.tags_binarizer, f)
pickle.dump(self.tags_classifier, f)
pickle.dump(self.correspondent_classifier, f)
pickle.dump(self.document_type_classifier, f)
pickle.dump(self.storage_path_classifier, f)
target_file_temp.rename(target_file)
# Invalidate cached classifier loader so subsequent calls see the new file
try:
_load_classifier_cached.cache_clear()
except Exception:
pass
def train(self) -> bool:
# Get non-inbox documents

View File

@@ -19,6 +19,8 @@ from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless.config import OcrConfig
from paperless.utils import ocr_to_dateparser_languages
if TYPE_CHECKING:
import datetime
@@ -272,6 +274,11 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
"""
import dateparser
ocr_config = OcrConfig()
languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
ocr_config.language,
)
return dateparser.parse(
ds,
settings={
@@ -280,7 +287,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
"RETURN_AS_TIMEZONE_AWARE": True,
"TIMEZONE": settings.TIME_ZONE,
},
locales=settings.DATE_PARSER_LANGUAGES,
locales=languages,
)
def __filter(date: datetime.datetime) -> datetime.datetime | None:

View File

@@ -370,7 +370,7 @@ class TestClassifier(DirectoriesMixin, TestCase):
def test_load_corrupt_file(self, patched_pickle_load: mock.MagicMock):
"""
GIVEN:
- Corrupted legacy classifier pickle file
- Corrupted classifier pickle file
WHEN:
- An attempt is made to load the classifier
THEN:
@@ -381,10 +381,9 @@ class TestClassifier(DirectoriesMixin, TestCase):
# First load is the schema version, allow it
patched_pickle_load.side_effect = [DocumentClassifier.FORMAT_VERSION, OSError()]
# Force the loader down the legacy path by making joblib.load fail
with mock.patch("joblib.load", side_effect=Exception("bad joblib")):
with self.assertRaises(ClassifierModelCorruptError):
self.classifier.load()
with self.assertRaises(ClassifierModelCorruptError):
self.classifier.load()
patched_pickle_load.assert_called()
patched_pickle_load.reset_mock()
patched_pickle_load.side_effect = [
@@ -392,8 +391,8 @@ class TestClassifier(DirectoriesMixin, TestCase):
ClassifierModelCorruptError(),
]
with mock.patch("joblib.load", side_effect=Exception("bad joblib")):
self.assertIsNone(load_classifier())
self.assertIsNone(load_classifier())
patched_pickle_load.assert_called()
def test_load_new_scikit_learn_version(self):
"""

View File

@@ -1,12 +1,14 @@
import datetime
from zoneinfo import ZoneInfo
import pytest
from pytest_django.fixtures import SettingsWrapper
from documents.parsers import parse_date
from documents.parsers import parse_date_generator
@pytest.mark.django_db()
class TestDate:
def test_date_format_1(self):
text = "lorem ipsum 130218 lorem ipsum"
@@ -49,7 +51,7 @@ class TestDate:
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
settings.DATE_PARSER_LANGUAGES = []
settings.DATE_PARSER_LANGUAGES = ["de"]
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
date = parse_date("", text)
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)

View File

@@ -2,7 +2,7 @@ msgid ""
msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2025-08-16 14:34+0000\n"
"POT-Creation-Date: 2025-08-31 22:24+0000\n"
"PO-Revision-Date: 2022-02-17 04:17\n"
"Last-Translator: \n"
"Language-Team: English\n"
@@ -1645,147 +1645,147 @@ msgstr ""
msgid "paperless application settings"
msgstr ""
#: paperless/settings.py:774
#: paperless/settings.py:772
msgid "English (US)"
msgstr ""
#: paperless/settings.py:775
#: paperless/settings.py:773
msgid "Arabic"
msgstr ""
#: paperless/settings.py:776
#: paperless/settings.py:774
msgid "Afrikaans"
msgstr ""
#: paperless/settings.py:777
#: paperless/settings.py:775
msgid "Belarusian"
msgstr ""
#: paperless/settings.py:778
#: paperless/settings.py:776
msgid "Bulgarian"
msgstr ""
#: paperless/settings.py:779
#: paperless/settings.py:777
msgid "Catalan"
msgstr ""
#: paperless/settings.py:780
#: paperless/settings.py:778
msgid "Czech"
msgstr ""
#: paperless/settings.py:781
#: paperless/settings.py:779
msgid "Danish"
msgstr ""
#: paperless/settings.py:782
#: paperless/settings.py:780
msgid "German"
msgstr ""
#: paperless/settings.py:783
#: paperless/settings.py:781
msgid "Greek"
msgstr ""
#: paperless/settings.py:784
#: paperless/settings.py:782
msgid "English (GB)"
msgstr ""
#: paperless/settings.py:785
#: paperless/settings.py:783
msgid "Spanish"
msgstr ""
#: paperless/settings.py:786
#: paperless/settings.py:784
msgid "Persian"
msgstr ""
#: paperless/settings.py:787
#: paperless/settings.py:785
msgid "Finnish"
msgstr ""
#: paperless/settings.py:788
#: paperless/settings.py:786
msgid "French"
msgstr ""
#: paperless/settings.py:789
#: paperless/settings.py:787
msgid "Hungarian"
msgstr ""
#: paperless/settings.py:790
#: paperless/settings.py:788
msgid "Italian"
msgstr ""
#: paperless/settings.py:791
#: paperless/settings.py:789
msgid "Japanese"
msgstr ""
#: paperless/settings.py:792
#: paperless/settings.py:790
msgid "Korean"
msgstr ""
#: paperless/settings.py:793
#: paperless/settings.py:791
msgid "Luxembourgish"
msgstr ""
#: paperless/settings.py:794
#: paperless/settings.py:792
msgid "Norwegian"
msgstr ""
#: paperless/settings.py:795
#: paperless/settings.py:793
msgid "Dutch"
msgstr ""
#: paperless/settings.py:796
#: paperless/settings.py:794
msgid "Polish"
msgstr ""
#: paperless/settings.py:797
#: paperless/settings.py:795
msgid "Portuguese (Brazil)"
msgstr ""
#: paperless/settings.py:798
#: paperless/settings.py:796
msgid "Portuguese"
msgstr ""
#: paperless/settings.py:799
#: paperless/settings.py:797
msgid "Romanian"
msgstr ""
#: paperless/settings.py:800
#: paperless/settings.py:798
msgid "Russian"
msgstr ""
#: paperless/settings.py:801
#: paperless/settings.py:799
msgid "Slovak"
msgstr ""
#: paperless/settings.py:802
#: paperless/settings.py:800
msgid "Slovenian"
msgstr ""
#: paperless/settings.py:803
#: paperless/settings.py:801
msgid "Serbian"
msgstr ""
#: paperless/settings.py:804
#: paperless/settings.py:802
msgid "Swedish"
msgstr ""
#: paperless/settings.py:805
#: paperless/settings.py:803
msgid "Turkish"
msgstr ""
#: paperless/settings.py:806
#: paperless/settings.py:804
msgid "Ukrainian"
msgstr ""
#: paperless/settings.py:807
#: paperless/settings.py:805
msgid "Vietnamese"
msgstr ""
#: paperless/settings.py:808
#: paperless/settings.py:806
msgid "Chinese Simplified"
msgstr ""
#: paperless/settings.py:809
#: paperless/settings.py:807
msgid "Chinese Traditional"
msgstr ""

View File

@@ -17,8 +17,6 @@ from dateparser.languages.loader import LocaleDataLoader
from django.utils.translation import gettext_lazy as _
from dotenv import load_dotenv
from paperless.utils import ocr_to_dateparser_languages
logger = logging.getLogger("paperless.settings")
# Tap paperless.conf if it's available
@@ -1184,61 +1182,6 @@ DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
    """
    Translate a Tesseract OCR_LANGUAGE string into `dateparser` locales.

    The input is a "+"-separated list of OCR codes (e.g. "eng+fra"), each
    optionally carrying a "_script" suffix (e.g. "aze_Cyrl").

    - Codes with no entry in the OCR-to-dateparser map are logged at debug
      level and skipped.
    - When a script suffix is present, the "<language>-<Script>" locale
      (e.g. "az-Cyrl") is tried first; if validating it raises, the base
      language (e.g. "az") is used instead.
    - Each locale appears at most once in the result.
    - Any other error is logged as a warning and an empty list is returned.
    """
    language_map = ocr_to_dateparser_languages()
    locale_loader = LocaleDataLoader()
    locales = []

    try:
        for ocr_code in ocr_languages.split("+"):
            # "aze_Cyrl" -> base code "aze", script suffix "Cyrl"
            base_code, *suffixes = ocr_code.split("_")
            script = suffixes[0] if suffixes else None

            base_locale = language_map.get(base_code)
            if base_locale is None:
                logger.debug(
                    f'Unable to map OCR language "{base_code}" to dateparser locale. ',
                )
                continue

            # Validate the base language; an exception raised here is
            # handled by the outer except below
            locale_loader.get_locale_map(locales=[base_locale])

            chosen = base_locale
            if script:
                scripted = f"{base_locale}-{script.title()}"
                try:
                    locale_loader.get_locale_map(locales=[scripted])
                except Exception:
                    logger.info(
                        f"Language variant '{scripted}' not supported by dateparser; falling back to base language '{base_locale}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
                    )
                else:
                    chosen = scripted

            if chosen not in locales:
                locales.append(chosen)
    except Exception as e:
        logger.warning(
            f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
        )
        return []

    if not locales:
        logger.info(
            "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
        )
    return locales
def _parse_dateparser_languages(languages: str | None):
language_list = languages.split("+") if languages else []
# There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib.
@@ -1253,12 +1196,14 @@ def _parse_dateparser_languages(languages: str | None):
return list(LocaleDataLoader().get_locale_map(locales=language_list))
if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"):
DATE_PARSER_LANGUAGES = _parse_dateparser_languages(
# If not set, we will infer it at runtime
DATE_PARSER_LANGUAGES = (
_parse_dateparser_languages(
os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"),
)
else:
DATE_PARSER_LANGUAGES = _ocr_to_dateparser_languages(OCR_LANGUAGE)
if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES")
else None
)
# Maximum number of dates taken from document start to end to show as suggestions for

View File

@@ -6,7 +6,6 @@ from unittest import mock
import pytest
from celery.schedules import crontab
from paperless.settings import _ocr_to_dateparser_languages
from paperless.settings import _parse_base_paths
from paperless.settings import _parse_beat_schedule
from paperless.settings import _parse_dateparser_languages
@@ -476,33 +475,6 @@ class TestPathSettings(TestCase):
self.assertEqual("/foobar/", base_paths[4]) # LOGOUT_REDIRECT_URL
@pytest.mark.parametrize(
    ("ocr_language", "expected"),
    [
        # One language
        ("eng", ["en"]),
        # Multiple languages
        ("fra+ita+lao", ["fr", "it", "lo"]),
        # Languages that don't have a two-letter equivalent
        ("fil", ["fil"]),
        # Languages with a script part supported by dateparser
        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
        # Languages with a script part not supported by dateparser
        # In this case, default to the language without script
        ("deu_frak", ["de"]),
        # Traditional and simplified chinese don't have the same name in dateparser,
        # so they're converted to the general chinese language
        ("chi_tra+chi_sim", ["zh"]),
        # If a language is not supported by dateparser, fallback to the supported ones
        ("eng+unsupported_language+por", ["en", "pt"]),
        # If no language is supported, fallback to default
        ("unsupported1+unsupported2", []),
    ],
)
def test_ocr_to_dateparser_languages(ocr_language, expected):
    """Each OCR language string maps to the expected locales, ignoring order."""
    converted = _ocr_to_dateparser_languages(ocr_language)
    assert sorted(converted) == sorted(expected)
@pytest.mark.parametrize(
("languages", "expected"),
[

View File

@@ -0,0 +1,52 @@
import logging
import pytest
from paperless import utils
from paperless.utils import ocr_to_dateparser_languages
@pytest.mark.parametrize(
    ("ocr_language", "expected"),
    [
        # One language
        ("eng", ["en"]),
        # Multiple languages
        ("fra+ita+lao", ["fr", "it", "lo"]),
        # Languages that don't have a two-letter equivalent
        ("fil", ["fil"]),
        # Languages with a script part supported by dateparser
        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
        # Languages with a script part not supported by dateparser
        # In this case, default to the language without script
        ("deu_frak", ["de"]),
        # Traditional and simplified chinese don't have the same name in dateparser,
        # so they're converted to the general chinese language
        ("chi_tra+chi_sim", ["zh"]),
        # If a language is not supported by dateparser, fallback to the supported ones
        ("eng+unsupported_language+por", ["en", "pt"]),
        # If no language is supported, fallback to default
        ("unsupported1+unsupported2", []),
        # Duplicate languages, should not duplicate in result
        ("eng+eng", ["en"]),
        # Language with script, but script is not mapped
        ("ita_unknownscript", ["it"]),
    ],
)
def test_ocr_to_dateparser_languages(ocr_language, expected):
    """Each OCR language string maps to the expected locales, ignoring order."""
    converted = ocr_to_dateparser_languages(ocr_language)
    assert sorted(converted) == sorted(expected)
def test_ocr_to_dateparser_languages_exception(monkeypatch, caplog):
    """A failing locale loader yields an empty list and a warning log entry."""

    # Stand-in for LocaleDataLoader whose get_locale_map always raises
    class FailingLoader:
        def get_locale_map(self, locales=None):
            raise RuntimeError("Simulated error")

    with caplog.at_level(logging.WARNING):
        # The class itself is the factory: calling it returns a FailingLoader
        monkeypatch.setattr(utils, "LocaleDataLoader", FailingLoader)
        converted = utils.ocr_to_dateparser_languages("eng+fra")
        assert converted == []
        expected_hint = "Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this"
        assert expected_hint in caplog.text

View File

@@ -1,4 +1,10 @@
def ocr_to_dateparser_languages() -> dict[str, str]:
import logging
from dateparser.languages.loader import LocaleDataLoader
logger = logging.getLogger("paperless.utils")
OCR_TO_DATEPARSER_LANGUAGES = {
"""
Translation map from languages supported by Tesseract OCR
to languages supported by dateparser.
@@ -14,97 +20,150 @@ def ocr_to_dateparser_languages() -> dict[str, str]:
# agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
# ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
# rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
return {
"afr": "af",
"amh": "am",
"ara": "ar",
"asm": "as",
"ast": "ast",
"aze": "az",
"bel": "be",
"bul": "bg",
"ben": "bn",
"bod": "bo",
"bre": "br",
"bos": "bs",
"cat": "ca",
"cher": "chr",
"ces": "cs",
"cym": "cy",
"dan": "da",
"deu": "de",
"dzo": "dz",
"ell": "el",
"eng": "en",
"epo": "eo",
"spa": "es",
"est": "et",
"eus": "eu",
"fas": "fa",
"fin": "fi",
"fil": "fil",
"fao": "fo", # codespell:ignore
"fra": "fr",
"fry": "fy",
"gle": "ga",
"gla": "gd",
"glg": "gl",
"guj": "gu",
"heb": "he",
"hin": "hi",
"hrv": "hr",
"hun": "hu",
"hye": "hy",
"ind": "id",
"isl": "is",
"ita": "it",
"jpn": "ja",
"kat": "ka",
"kaz": "kk",
"khm": "km",
"knda": "kn",
"kor": "ko",
"kir": "ky",
"ltz": "lb",
"lao": "lo",
"lit": "lt",
"lav": "lv",
"mal": "ml",
"mon": "mn",
"mar": "mr",
"msa": "ms",
"mlt": "mt",
"mya": "my",
"nep": "ne",
"nld": "nl",
"ori": "or",
"pan": "pa",
"pol": "pl",
"pus": "ps",
"por": "pt",
"que": "qu",
"ron": "ro",
"rus": "ru",
"sin": "si",
"slk": "sk",
"slv": "sl",
"sqi": "sq",
"srp": "sr",
"swe": "sv",
"swa": "sw",
"tam": "ta",
"tel": "te", # codespell:ignore
"tha": "th", # codespell:ignore
"tir": "ti",
"tgl": "tl",
"ton": "to",
"tur": "tr",
"uig": "ug",
"ukr": "uk",
"urd": "ur",
"uzb": "uz",
"via": "vi",
"yid": "yi",
"yor": "yo",
"chi": "zh",
}
"afr": "af",
"amh": "am",
"ara": "ar",
"asm": "as",
"ast": "ast",
"aze": "az",
"bel": "be",
"bul": "bg",
"ben": "bn",
"bod": "bo",
"bre": "br",
"bos": "bs",
"cat": "ca",
"cher": "chr",
"ces": "cs",
"cym": "cy",
"dan": "da",
"deu": "de",
"dzo": "dz",
"ell": "el",
"eng": "en",
"epo": "eo",
"spa": "es",
"est": "et",
"eus": "eu",
"fas": "fa",
"fin": "fi",
"fil": "fil",
"fao": "fo", # codespell:ignore
"fra": "fr",
"fry": "fy",
"gle": "ga",
"gla": "gd",
"glg": "gl",
"guj": "gu",
"heb": "he",
"hin": "hi",
"hrv": "hr",
"hun": "hu",
"hye": "hy",
"ind": "id",
"isl": "is",
"ita": "it",
"jpn": "ja",
"kat": "ka",
"kaz": "kk",
"khm": "km",
"knda": "kn",
"kor": "ko",
"kir": "ky",
"ltz": "lb",
"lao": "lo",
"lit": "lt",
"lav": "lv",
"mal": "ml",
"mon": "mn",
"mar": "mr",
"msa": "ms",
"mlt": "mt",
"mya": "my",
"nep": "ne",
"nld": "nl",
"ori": "or",
"pan": "pa",
"pol": "pl",
"pus": "ps",
"por": "pt",
"que": "qu",
"ron": "ro",
"rus": "ru",
"sin": "si",
"slk": "sk",
"slv": "sl",
"sqi": "sq",
"srp": "sr",
"swe": "sv",
"swa": "sw",
"tam": "ta",
"tel": "te", # codespell:ignore
"tha": "th", # codespell:ignore
"tir": "ti",
"tgl": "tl",
"ton": "to",
"tur": "tr",
"uig": "ug",
"ukr": "uk",
"urd": "ur",
"uzb": "uz",
"via": "vi",
"yid": "yi",
"yor": "yo",
"chi": "zh",
}
def ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
    """
    Translate a Tesseract OCR_LANGUAGE string into `dateparser` locales.

    The input is a "+"-separated list of OCR codes (e.g. "eng+fra"), each
    optionally carrying a "_script" suffix (e.g. "aze_Cyrl").

    - Codes missing from OCR_TO_DATEPARSER_LANGUAGES are logged at debug
      level and skipped.
    - When a script suffix is present, the "<language>-<Script>" locale
      (e.g. "az-Cyrl") is tried first; if validating it raises, the base
      language (e.g. "az") is used instead.
    - Each locale appears at most once in the result.
    - Any other error is logged as a warning and an empty list is returned.
    """
    locale_loader = LocaleDataLoader()
    locales = []

    try:
        for ocr_code in ocr_languages.split("+"):
            # "aze_Cyrl" -> base code "aze", script suffix "Cyrl"
            base_code, *suffixes = ocr_code.split("_")
            script = suffixes[0] if suffixes else None

            base_locale = OCR_TO_DATEPARSER_LANGUAGES.get(base_code)
            if base_locale is None:
                logger.debug(
                    f'Unable to map OCR language "{base_code}" to dateparser locale. ',
                )
                continue

            # Validate the base language; an exception raised here is
            # handled by the outer except below
            locale_loader.get_locale_map(locales=[base_locale])

            chosen = base_locale
            if script:
                scripted = f"{base_locale}-{script.title()}"
                try:
                    locale_loader.get_locale_map(locales=[scripted])
                except Exception:
                    logger.info(
                        f"Language variant '{scripted}' not supported by dateparser; falling back to base language '{base_locale}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
                    )
                else:
                    chosen = scripted

            if chosen not in locales:
                locales.append(chosen)
    except Exception as e:
        logger.warning(
            f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
        )
        return []

    if not locales:
        logger.info(
            "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
        )
    return locales