diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 526c131d0..b1f7061f8 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -19,6 +19,8 @@ from documents.loggers import LoggingMixin from documents.signals import document_consumer_declaration from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess +from paperless.config import OcrConfig +from paperless.utils import ocr_to_dateparser_languages if TYPE_CHECKING: import datetime @@ -272,6 +274,11 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]: """ import dateparser + ocr_config = OcrConfig() + languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages( + ocr_config.language, + ) + return dateparser.parse( ds, settings={ @@ -280,7 +287,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]: "RETURN_AS_TIMEZONE_AWARE": True, "TIMEZONE": settings.TIME_ZONE, }, - locales=settings.DATE_PARSER_LANGUAGES, + locales=languages, ) def __filter(date: datetime.datetime) -> datetime.datetime | None: diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py index 1bad27266..f565a9544 100644 --- a/src/documents/tests/test_date_parsing.py +++ b/src/documents/tests/test_date_parsing.py @@ -1,12 +1,14 @@ import datetime from zoneinfo import ZoneInfo +import pytest from pytest_django.fixtures import SettingsWrapper from documents.parsers import parse_date from documents.parsers import parse_date_generator +@pytest.mark.django_db() class TestDate: def test_date_format_1(self): text = "lorem ipsum 130218 lorem ipsum" @@ -49,7 +51,7 @@ class TestDate: settings: SettingsWrapper, settings_timezone: ZoneInfo, ): - settings.DATE_PARSER_LANGUAGES = [] + settings.DATE_PARSER_LANGUAGES = ["de"] text = "lorem ipsum\nMärz 2019\nlorem ipsum" date = parse_date("", text) assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 24d297d06..5e6e2a14e 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -17,8 +17,6 @@ from dateparser.languages.loader import LocaleDataLoader from django.utils.translation import gettext_lazy as _ from dotenv import load_dotenv -from paperless.utils import ocr_to_dateparser_languages - logger = logging.getLogger("paperless.settings") # Tap paperless.conf if it's available @@ -1184,61 +1182,6 @@ DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY") FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") -def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]: - """ - Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl") - into a list of locales compatible with the `dateparser` library. - - - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl"). - Falls back to the base language (e.g., "az") if needed. - - If a language cannot be mapped or validated, it is skipped with a warning. - - Returns a list of valid locales, or an empty list if none could be converted. - """ - ocr_to_dateparser = ocr_to_dateparser_languages() - loader = LocaleDataLoader() - result = [] - try: - for ocr_language in ocr_languages.split("+"): - # Split into language and optional script - ocr_lang_part, *script = ocr_language.split("_") - ocr_script_part = script[0] if script else None - - language_part = ocr_to_dateparser.get(ocr_lang_part) - if language_part is None: - logger.debug( - f'Unable to map OCR language "{ocr_lang_part}" to dateparser locale. ', - ) - continue - - # Ensure base language is supported by dateparser - loader.get_locale_map(locales=[language_part]) - - # Try to add the script part if it's supported by dateparser - if ocr_script_part: - dateparser_language = f"{language_part}-{ocr_script_part.title()}" - try: - loader.get_locale_map(locales=[dateparser_language]) - except Exception: - logger.info( - f"Language variant '{dateparser_language}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.", - ) - dateparser_language = language_part - else: - dateparser_language = language_part - if dateparser_language not in result: - result.append(dateparser_language) - except Exception as e: - logger.warning( - f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}", - ) - return [] - if not result: - logger.info( - "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.", - ) - return result - - def _parse_dateparser_languages(languages: str | None): language_list = languages.split("+") if languages else [] # There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib. @@ -1253,12 +1196,14 @@ def _parse_dateparser_languages(languages: str | None): return list(LocaleDataLoader().get_locale_map(locales=language_list)) -if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"): - DATE_PARSER_LANGUAGES = _parse_dateparser_languages( +# If not set, we will infer it at runtime +DATE_PARSER_LANGUAGES = ( + _parse_dateparser_languages( os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"), ) -else: - DATE_PARSER_LANGUAGES = _ocr_to_dateparser_languages(OCR_LANGUAGE) + if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES") + else None +) # Maximum number of dates taken from document start to end to show as suggestions for diff --git a/src/paperless/tests/test_settings.py b/src/paperless/tests/test_settings.py index 8a191f209..10995291e 100644 --- a/src/paperless/tests/test_settings.py +++ b/src/paperless/tests/test_settings.py @@ -6,7 +6,6 @@ from unittest import mock import pytest from celery.schedules import crontab -from paperless.settings import _ocr_to_dateparser_languages from paperless.settings import _parse_base_paths from paperless.settings import _parse_beat_schedule from paperless.settings import _parse_dateparser_languages @@ -476,33 +475,6 @@ class TestPathSettings(TestCase): self.assertEqual("/foobar/", base_paths[4]) # LOGOUT_REDIRECT_URL -@pytest.mark.parametrize( - ("ocr_language", "expected"), - [ - # One language - ("eng", ["en"]), - # Multiple languages - ("fra+ita+lao", ["fr", "it", "lo"]), - # Languages that don't have a two-letter equivalent - ("fil", ["fil"]), - # Languages with a script part supported by dateparser - ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]), - # Languages with a script part not supported by dateparser - # In this case, default to the language without script - ("deu_frak", ["de"]), - # Traditional and simplified chinese don't have the same name in dateparser, - # so they're converted to the general chinese language - ("chi_tra+chi_sim", ["zh"]), - # If a language is not supported by dateparser, fallback to the supported ones - ("eng+unsupported_language+por", ["en", "pt"]), - # If no language is supported, fallback to default - ("unsupported1+unsupported2", []), - ], -) -def test_ocr_to_dateparser_languages(ocr_language, expected): - assert sorted(_ocr_to_dateparser_languages(ocr_language)) == sorted(expected) - - @pytest.mark.parametrize( ("languages", "expected"), [ diff --git a/src/paperless/tests/test_utils.py b/src/paperless/tests/test_utils.py new file mode 100644 index 000000000..215498c29 --- /dev/null +++ b/src/paperless/tests/test_utils.py @@ -0,0 +1,52 @@ +import logging + +import pytest + +from paperless import utils +from paperless.utils import ocr_to_dateparser_languages + + +@pytest.mark.parametrize( + ("ocr_language", "expected"), + [ + # One language + ("eng", ["en"]), + # Multiple languages + ("fra+ita+lao", ["fr", "it", "lo"]), + # Languages that don't have a two-letter equivalent + ("fil", ["fil"]), + # Languages with a script part supported by dateparser + ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]), + # Languages with a script part not supported by dateparser + # In this case, default to the language without script + ("deu_frak", ["de"]), + # Traditional and simplified chinese don't have the same name in dateparser, + # so they're converted to the general chinese language + ("chi_tra+chi_sim", ["zh"]), + # If a language is not supported by dateparser, fallback to the supported ones + ("eng+unsupported_language+por", ["en", "pt"]), + # If no language is supported, fallback to default + ("unsupported1+unsupported2", []), + # Duplicate languages, should not duplicate in result + ("eng+eng", ["en"]), + # Language with script, but script is not mapped + ("ita_unknownscript", ["it"]), + ], +) +def test_ocr_to_dateparser_languages(ocr_language, expected): + assert sorted(ocr_to_dateparser_languages(ocr_language)) == sorted(expected) + + +def test_ocr_to_dateparser_languages_exception(monkeypatch, caplog): + # Patch LocaleDataLoader.get_locale_map to raise an exception + class DummyLoader: + def get_locale_map(self, locales=None): + raise RuntimeError("Simulated error") + + with caplog.at_level(logging.WARNING): + monkeypatch.setattr(utils, "LocaleDataLoader", lambda: DummyLoader()) + result = utils.ocr_to_dateparser_languages("eng+fra") + assert result == [] + assert ( + "Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this" in caplog.text + ) diff --git a/src/paperless/utils.py b/src/paperless/utils.py index 965b862f7..46269588c 100644 --- a/src/paperless/utils.py +++ b/src/paperless/utils.py @@ -1,4 +1,10 @@ -def ocr_to_dateparser_languages() -> dict[str, str]: +import logging + +from dateparser.languages.loader import LocaleDataLoader + +logger = logging.getLogger("paperless.utils") + +OCR_TO_DATEPARSER_LANGUAGES = { """ Translation map from languages supported by Tesseract OCR to languages supported by dateparser. @@ -14,97 +20,150 @@ def ocr_to_dateparser_languages() -> dict[str, str]: # agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln, # ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus, # rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue - return { - "afr": "af", - "amh": "am", - "ara": "ar", - "asm": "as", - "ast": "ast", - "aze": "az", - "bel": "be", - "bul": "bg", - "ben": "bn", - "bod": "bo", - "bre": "br", - "bos": "bs", - "cat": "ca", - "cher": "chr", - "ces": "cs", - "cym": "cy", - "dan": "da", - "deu": "de", - "dzo": "dz", - "ell": "el", - "eng": "en", - "epo": "eo", - "spa": "es", - "est": "et", - "eus": "eu", - "fas": "fa", - "fin": "fi", - "fil": "fil", - "fao": "fo", # codespell:ignore - "fra": "fr", - "fry": "fy", - "gle": "ga", - "gla": "gd", - "glg": "gl", - "guj": "gu", - "heb": "he", - "hin": "hi", - "hrv": "hr", - "hun": "hu", - "hye": "hy", - "ind": "id", - "isl": "is", - "ita": "it", - "jpn": "ja", - "kat": "ka", - "kaz": "kk", - "khm": "km", - "knda": "kn", - "kor": "ko", - "kir": "ky", - "ltz": "lb", - "lao": "lo", - "lit": "lt", - "lav": "lv", - "mal": "ml", - "mon": "mn", - "mar": "mr", - "msa": "ms", - "mlt": "mt", - "mya": "my", - "nep": "ne", - "nld": "nl", - "ori": "or", - "pan": "pa", - "pol": "pl", - "pus": "ps", - "por": "pt", - "que": "qu", - "ron": "ro", - "rus": "ru", - "sin": "si", - "slk": "sk", - "slv": "sl", - "sqi": "sq", - "srp": "sr", - "swe": "sv", - "swa": "sw", - "tam": "ta", - "tel": "te", # codespell:ignore - "tha": "th", # codespell:ignore - "tir": "ti", - "tgl": "tl", - "ton": "to", - "tur": "tr", - "uig": "ug", - "ukr": "uk", - "urd": "ur", - "uzb": "uz", - "via": "vi", - "yid": "yi", - "yor": "yo", - "chi": "zh", - } + "afr": "af", + "amh": "am", + "ara": "ar", + "asm": "as", + "ast": "ast", + "aze": "az", + "bel": "be", + "bul": "bg", + "ben": "bn", + "bod": "bo", + "bre": "br", + "bos": "bs", + "cat": "ca", + "cher": "chr", + "ces": "cs", + "cym": "cy", + "dan": "da", + "deu": "de", + "dzo": "dz", + "ell": "el", + "eng": "en", + "epo": "eo", + "spa": "es", + "est": "et", + "eus": "eu", + "fas": "fa", + "fin": "fi", + "fil": "fil", + "fao": "fo", # codespell:ignore + "fra": "fr", + "fry": "fy", + "gle": "ga", + "gla": "gd", + "glg": "gl", + "guj": "gu", + "heb": "he", + "hin": "hi", + "hrv": "hr", + "hun": "hu", + "hye": "hy", + "ind": "id", + "isl": "is", + "ita": "it", + "jpn": "ja", + "kat": "ka", + "kaz": "kk", + "khm": "km", + "knda": "kn", + "kor": "ko", + "kir": "ky", + "ltz": "lb", + "lao": "lo", + "lit": "lt", + "lav": "lv", + "mal": "ml", + "mon": "mn", + "mar": "mr", + "msa": "ms", + "mlt": "mt", + "mya": "my", + "nep": "ne", + "nld": "nl", + "ori": "or", + "pan": "pa", + "pol": "pl", + "pus": "ps", + "por": "pt", + "que": "qu", + "ron": "ro", + "rus": "ru", + "sin": "si", + "slk": "sk", + "slv": "sl", + "sqi": "sq", + "srp": "sr", + "swe": "sv", + "swa": "sw", + "tam": "ta", + "tel": "te", # codespell:ignore + "tha": "th", # codespell:ignore + "tir": "ti", + "tgl": "tl", + "ton": "to", + "tur": "tr", + "uig": "ug", + "ukr": "uk", + "urd": "ur", + "uzb": "uz", + "via": "vi", + "yid": "yi", + "yor": "yo", + "chi": "zh", +} + + +def ocr_to_dateparser_languages(ocr_languages: str) -> list[str]: + """ + Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl") + into a list of locales compatible with the `dateparser` library. + + - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl"). + Falls back to the base language (e.g., "az") if needed. + - If a language cannot be mapped or validated, it is skipped with a warning. + - Returns a list of valid locales, or an empty list if none could be converted. + """ + loader = LocaleDataLoader() + result = [] + try: + for ocr_language in ocr_languages.split("+"): + # Split into language and optional script + ocr_lang_part, *script = ocr_language.split("_") + ocr_script_part = script[0] if script else None + + language_part = OCR_TO_DATEPARSER_LANGUAGES.get(ocr_lang_part) + if language_part is None: + logger.debug( + f'Unable to map OCR language "{ocr_lang_part}" to dateparser locale. ', + ) + continue + + # Ensure base language is supported by dateparser + loader.get_locale_map(locales=[language_part]) + + # Try to add the script part if it's supported by dateparser + if ocr_script_part: + dateparser_language = f"{language_part}-{ocr_script_part.title()}" + try: + loader.get_locale_map(locales=[dateparser_language]) + except Exception: + logger.info( + f"Language variant '{dateparser_language}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.", + ) + dateparser_language = language_part + else: + dateparser_language = language_part + if dateparser_language not in result: + result.append(dateparser_language) + except Exception as e: + logger.warning( + f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}", + ) + return [] + if not result: + logger.info( + "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.", + ) + return result