import logging

import pytest

from paperless import utils
from paperless.utils import ocr_to_dateparser_languages


@pytest.mark.parametrize(
    ("ocr_language", "expected"),
    [
        # A single language
        ("eng", ["en"]),
        # Several languages joined with "+"
        ("fra+ita+lao", ["fr", "it", "lo"]),
        # A language without a two-letter equivalent keeps its code
        ("fil", ["fil"]),
        # Script suffixes that dateparser supports are carried over
        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
        # A script suffix that dateparser does not support is dropped,
        # leaving only the plain language
        ("deu_frak", ["de"]),
        # Traditional and simplified Chinese are named differently in
        # dateparser, so both collapse to the general Chinese language
        ("chi_tra+chi_sim", ["zh"]),
        # Unsupported languages are skipped; the supported ones remain
        ("eng+unsupported_language+por", ["en", "pt"]),
        # Nothing supported at all yields an empty result
        ("unsupported1+unsupported2", []),
        # A repeated language appears only once in the result
        ("eng+eng", ["en"]),
        # A script suffix with no mapping falls back to the plain language
        ("ita_unknownscript", ["it"]),
    ],
)
def test_ocr_to_dateparser_languages(ocr_language, expected):
    """Check the OCR-to-dateparser language conversion, ignoring order."""
    actual = ocr_to_dateparser_languages(ocr_language)
    assert sorted(actual) == sorted(expected)


def test_ocr_to_dateparser_languages_exception(monkeypatch, caplog):
    """Check that a failing locale lookup yields [] and logs a hint."""

    class FailingLoader:
        """Stand-in for LocaleDataLoader whose lookup always raises."""

        def get_locale_map(self, locales=None):
            raise RuntimeError("Simulated error")

    # The class itself serves as the zero-argument factory that replaces
    # utils.LocaleDataLoader for the duration of this test.
    monkeypatch.setattr(utils, "LocaleDataLoader", FailingLoader)

    with caplog.at_level(logging.WARNING):
        languages = utils.ocr_to_dateparser_languages("eng+fra")

    assert languages == []
    expected_hint = "Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this"
    assert expected_hint in caplog.text