Performance: Add support for configuring date parser languages (#10181)

--------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2025-08-18 00:46:25 +00:00 · 2025-07-01 07:57:38 +02:00
parent c974dc9400
commit 6591d5da63
6 changed files with 290 additions and 4 deletions
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1003,6 +1003,22 @@ still perform some basic text pre-processing before matching.

    Defaults to 1.

+#### [`PAPERLESS_DATE_PARSER_LANGUAGES=<lang>`](#PAPERLESS_DATE_PARSER_LANGUAGES) {#PAPERLESS_DATE_PARSER_LANGUAGES}
+
+: Specifies which language(s) Paperless should use when parsing dates from documents.
+
+    This should be a language code supported by the dateparser library,
+    for example: "en", or a combination such as "en+de".
+    Locales are also supported (e.g., "en-AU").
+    Multiple languages can be combined using "+", for example: "en+de" or "en-AU+de".
+    For valid values, refer to the list of supported languages and locales in the [dateparser documentation](https://dateparser.readthedocs.io/en/latest/supported_locales.html).
+
+    Set this to match the languages in which most of your documents are written.
+    If not set, Paperless will attempt to infer the language(s) from the OCR configuration (`PAPERLESS_OCR_LANGUAGE`).
+
+!!! note
+
+    This format differs from the `PAPERLESS_OCR_LANGUAGE` setting, which uses ISO 639-2 codes (3 letters, e.g., "eng+deu" for Tesseract OCR).
+
 #### [`PAPERLESS_EMAIL_TASK_CRON=<cron expression>`](#PAPERLESS_EMAIL_TASK_CRON) {#PAPERLESS_EMAIL_TASK_CRON}

 : Configures the scheduled email fetching frequency. The value
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -280,6 +280,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
                "RETURN_AS_TIMEZONE_AWARE": True,
                "TIMEZONE": settings.TIME_ZONE,
            },
+            locales=settings.DATE_PARSER_LANGUAGES,
        )

    def __filter(date: datetime.datetime) -> datetime.datetime | None:
--- a/src/documents/tests/test_date_parsing.py
+++ b/src/documents/tests/test_date_parsing.py
@@ -44,12 +44,22 @@ class TestDate:
        )
        assert parse_date("", text) is None

-    def test_date_format_7(self, settings_timezone: ZoneInfo):
+    def test_date_format_7(
+        self,
+        settings: SettingsWrapper,
+        settings_timezone: ZoneInfo,
+    ):
+        settings.DATE_PARSER_LANGUAGES = []
        text = "lorem ipsum\nMärz 2019\nlorem ipsum"
        date = parse_date("", text)
        assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)

-    def test_date_format_8(self, settings_timezone: ZoneInfo):
+    def test_date_format_8(
+        self,
+        settings: SettingsWrapper,
+        settings_timezone: ZoneInfo,
+    ):
+        settings.DATE_PARSER_LANGUAGES = ["de"]
        text = (
            "lorem ipsum\n"
            "Wohnort\n"
@@ -71,7 +81,12 @@ class TestDate:
            tzinfo=settings_timezone,
        )

-    def test_date_format_9(self, settings_timezone: ZoneInfo):
+    def test_date_format_9(
+        self,
+        settings: SettingsWrapper,
+        settings_timezone: ZoneInfo,
+    ):
+        settings.DATE_PARSER_LANGUAGES = ["de"]
        text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
        assert parse_date("", text) == datetime.datetime(
            2020,
@@ -250,7 +265,12 @@ class TestDate:
    def test_crazy_date_with_spaces(self):
        assert parse_date("", "20 408000l 2475") is None

-    def test_utf_month_names(self, settings_timezone: ZoneInfo):
+    def test_utf_month_names(
+        self,
+        settings: SettingsWrapper,
+        settings_timezone: ZoneInfo,
+    ):
+        settings.DATE_PARSER_LANGUAGES = ["fr", "de", "hr", "cs", "pl", "tr"]
        assert parse_date("", "13 décembre 2023") == datetime.datetime(
            2023,
            12,
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -1,5 +1,7 @@
 import datetime
 import json
+import logging
+import logging.config
 import math
 import multiprocessing
 import os
@@ -12,9 +14,14 @@ from urllib.parse import urlparse

 from celery.schedules import crontab
 from concurrent_log_handler.queue import setup_logging_queues
+from dateparser.languages.loader import LocaleDataLoader
 from django.utils.translation import gettext_lazy as _
 from dotenv import load_dotenv

+from paperless.utils import ocr_to_dateparser_languages
+
+logger = logging.getLogger("paperless.settings")
+
 # Tap paperless.conf if it's available
 for path in [
    os.getenv("PAPERLESS_CONFIGURATION_PATH"),
@@ -864,6 +871,10 @@ LOGGING = {
    },
 }

+# Configure logging before calling any logger in settings.py so it will respect the log format, even if Django has not parsed the settings yet.
+logging.config.dictConfig(LOGGING)
+
+
 ###############################################################################
 # Task queue                                                                  #
 ###############################################################################
@@ -1166,6 +1177,84 @@ POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT")
 DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
 FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")

+
+def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
+    """
+    Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")
+    into a list of locales compatible with the `dateparser` library.
+
+    - Entries are separated by "+"; each entry is split on "_" into a language code and an optional script.
+    - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").
+    Falls back to the base language (e.g., "az") if the loader rejects the full locale.
+    - If a language cannot be mapped or validated, it is skipped with a warning;
+    the languages that were converted successfully are still returned.
+    - Returns a de-duplicated list of valid locales in input order, or an empty list if none could be converted.
+    """
+    ocr_to_dateparser = ocr_to_dateparser_languages()
+    loader = LocaleDataLoader()
+    result = []
+    try:
+        for ocr_language in ocr_languages.split("+"):
+            # Split into language and optional script
+            ocr_lang_part, *script = ocr_language.split("_")
+            ocr_script_part = script[0] if script else None
+
+            language_part = ocr_to_dateparser.get(ocr_lang_part)
+            if language_part is None:
+                logger.warning(
+                    f'Skipping unknown OCR language "{ocr_language}" — no dateparser equivalent.',
+                )
+                continue
+
+            # Ensure base language is supported by dateparser. This is checked per entry so
+            # that one rejected language is skipped instead of discarding all the others.
+            try:
+                loader.get_locale_map(locales=[language_part])
+            except Exception:
+                logger.warning(
+                    f'Skipping OCR language "{ocr_language}" — "{language_part}" is not supported by dateparser.',
+                )
+                continue
+
+            # Start from the base language and upgrade to the script variant
+            # only if dateparser accepts it.
+            dateparser_language = language_part
+            if ocr_script_part:
+                script_locale = f"{language_part}-{ocr_script_part.title()}"
+                try:
+                    loader.get_locale_map(locales=[script_locale])
+                except Exception:
+                    logger.warning(
+                        f"Language variant '{script_locale}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
+                    )
+                else:
+                    dateparser_language = script_locale
+            if dateparser_language not in result:
+                result.append(dateparser_language)
+    except Exception as e:
+        # Last-resort guard (e.g. a non-string OCR_LANGUAGE): settings import must not fail.
+        logger.warning(
+            f"Could not configure dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
+        )
+        return []
+    if not result:
+        logger.warning(
+            "Could not configure any dateparser languages from OCR_LANGUAGE — fallback to autodetection.",
+        )
+    return result
+
+
+def _parse_dateparser_languages(languages: str | None) -> list[str]:
+    """
+    Parse a "+"-separated PAPERLESS_DATE_PARSER_LANGUAGES value (e.g. "en+de" or "en-AU+de")
+    into the list of locales reported by dateparser's `LocaleDataLoader`.
+
+    - `None` or an empty string is passed to the loader as an empty language list.
+    - If a "zh-*" locale is present and plain "zh" is not, "zh" is appended as a fallback
+    and a warning naming the first such locale is logged.
+    - Validation is delegated to `LocaleDataLoader.get_locale_map`; its errors are not caught here.
+    """
+    language_list = languages.split("+") if languages else []
+    # There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib.
+    # See: https://github.com/scrapinghub/dateparser/issues/875
+    # Look the variant up first so the list is not modified while it is being iterated.
+    chinese_variant = next(
+        (language for language in language_list if language.startswith("zh-")),
+        None,
+    )
+    if chinese_variant is not None and "zh" not in language_list:
+        logger.warning(
+            f'Chinese locale detected: {chinese_variant}. dateparser might fail to parse some dates with this locale, so Chinese ("zh") will be used as a fallback.',
+        )
+        language_list.append("zh")
+
+    return list(LocaleDataLoader().get_locale_map(locales=language_list))
+
+
+# An explicit PAPERLESS_DATE_PARSER_LANGUAGES wins; otherwise derive the languages from OCR_LANGUAGE.
+DATE_PARSER_LANGUAGES = (
+    _parse_dateparser_languages(os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"))
+    if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES")
+    else _ocr_to_dateparser_languages(OCR_LANGUAGE)
+)
+
+
 # Maximum number of dates taken from document start to end to show as suggestions for
 # `created` date in the frontend. Duplicates are removed, which can result in
 # fewer dates shown.
--- a/src/paperless/tests/test_settings.py
+++ b/src/paperless/tests/test_settings.py
@@ -3,10 +3,13 @@ import os
 from unittest import TestCase
 from unittest import mock

+import pytest
 from celery.schedules import crontab

+from paperless.settings import _ocr_to_dateparser_languages
 from paperless.settings import _parse_base_paths
 from paperless.settings import _parse_beat_schedule
+from paperless.settings import _parse_dateparser_languages
 from paperless.settings import _parse_db_settings
 from paperless.settings import _parse_ignore_dates
 from paperless.settings import _parse_paperless_url
@@ -471,3 +474,50 @@ class TestPathSettings(TestCase):
        base_paths = _parse_base_paths()
        self.assertEqual("/paperless/", base_paths[1])  # BASE_URL
        self.assertEqual("/foobar/", base_paths[4])  # LOGOUT_REDIRECT_URL
+
+
+@pytest.mark.parametrize(
+    ("ocr_language", "expected"),
+    [
+        # Single language
+        ("eng", ["en"]),
+        # Several languages joined with "+"
+        ("fra+ita+lao", ["fr", "it", "lo"]),
+        # Language whose dateparser code is not a two-letter code
+        ("fil", ["fil"]),
+        # Script suffix that dateparser knows: the full locale is kept
+        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
+        # Script suffix that dateparser does not know:
+        # the base language is used instead
+        ("deu_frak", ["de"]),
+        # Traditional and simplified Chinese are named differently in dateparser,
+        # so both collapse to the generic Chinese language
+        ("chi_tra+chi_sim", ["zh"]),
+        # Unknown entries are dropped, the supported ones are kept
+        ("eng+unsupported_language+por", ["en", "pt"]),
+        # Nothing supported at all: empty result
+        ("unsupported1+unsupported2", []),
+    ],
+)
+def test_ocr_to_dateparser_languages(ocr_language, expected):
+    """The OCR language string converts to the expected dateparser locales, order ignored."""
+    converted = _ocr_to_dateparser_languages(ocr_language)
+    assert sorted(converted) == sorted(expected)
+
+
+@pytest.mark.parametrize(
+    ("languages", "expected"),
+    [
+        ("de", ["de"]),
+        ("zh", ["zh"]),
+        ("fr+en", ["fr", "en"]),
+        # Region/locale variants are accepted too
+        ("en-001+fr-CA", ["en-001", "fr-CA"]),
+        ("en-001+fr", ["en-001", "fr"]),
+        # Chinese variants seem to miss some dates,
+        # so plain "zh" is always added as a fallback.
+        ("en+zh-Hans-HK", ["en", "zh-Hans-HK", "zh"]),
+        ("en+zh-Hans", ["en", "zh-Hans", "zh"]),
+        ("en+zh-Hans+zh-Hant", ["en", "zh-Hans", "zh-Hant", "zh"]),
+    ],
+)
+def test_parser_date_parser_languages(languages, expected):
+    """The configured language string parses to the expected dateparser locales, order ignored."""
+    parsed = _parse_dateparser_languages(languages)
+    assert sorted(parsed) == sorted(expected)
--- a/src/paperless/utils.py
+++ b/src/paperless/utils.py
@@ -0,0 +1,110 @@
+def ocr_to_dateparser_languages() -> dict[str, str]:
+    """
+    Translation map from languages supported by Tesseract OCR
+    to languages supported by dateparser.
+
+    Keys are Tesseract language codes (the part before any "_script" suffix,
+    e.g. "eng", "chi"); values are the matching dateparser language codes
+    (e.g. "en", "zh"). A new dict is built on every call.
+
+    To add a language, make sure it is supported by both libraries.
+    The ISO 639-2 will help you link a 3-char to 2-char language code.
+    Links:
+    - Tesseract languages: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
+    - Python dateparser languages: https://dateparser.readthedocs.io/en/latest/supported_locales.html
+    - ISO 639-2: https://www.loc.gov/standards/iso639-2/php/code_list.php
+    """
+    # TODO check these Dateparser languages as they are not referenced on the ISO639-2 standard,
+    # so we didn't find the equivalent in Tesseract:
+    # agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
+    # ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
+    # rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
+    return {
+        "afr": "af",
+        "amh": "am",
+        "ara": "ar",
+        "asm": "as",
+        "ast": "ast",
+        "aze": "az",
+        "bel": "be",
+        "bul": "bg",
+        "ben": "bn",
+        "bod": "bo",
+        "bre": "br",
+        "bos": "bs",
+        "cat": "ca",
+        "chr": "chr",  # Cherokee: the Tesseract code is "chr"
+        "ces": "cs",
+        "cym": "cy",
+        "dan": "da",
+        "deu": "de",
+        "dzo": "dz",
+        "ell": "el",
+        "eng": "en",
+        "epo": "eo",
+        "spa": "es",
+        "est": "et",
+        "eus": "eu",
+        "fas": "fa",
+        "fin": "fi",
+        "fil": "fil",
+        "fao": "fo",  # codespell:ignore
+        "fra": "fr",
+        "fry": "fy",
+        "gle": "ga",
+        "gla": "gd",
+        "glg": "gl",
+        "guj": "gu",
+        "heb": "he",
+        "hin": "hi",
+        "hrv": "hr",
+        "hun": "hu",
+        "hye": "hy",
+        "ind": "id",
+        "isl": "is",
+        "ita": "it",
+        "jpn": "ja",
+        "kat": "ka",
+        "kaz": "kk",
+        "khm": "km",
+        "kan": "kn",  # Kannada: the Tesseract code is "kan"
+        "kor": "ko",
+        "kir": "ky",
+        "ltz": "lb",
+        "lao": "lo",
+        "lit": "lt",
+        "lav": "lv",
+        "mal": "ml",
+        "mon": "mn",
+        "mar": "mr",
+        "msa": "ms",
+        "mlt": "mt",
+        "mya": "my",
+        "nep": "ne",
+        "nld": "nl",
+        "ori": "or",
+        "pan": "pa",
+        "pol": "pl",
+        "pus": "ps",
+        "por": "pt",
+        "que": "qu",
+        "ron": "ro",
+        "rus": "ru",
+        "sin": "si",
+        "slk": "sk",
+        "slv": "sl",
+        "sqi": "sq",
+        "srp": "sr",
+        "swe": "sv",
+        "swa": "sw",
+        "tam": "ta",
+        "tel": "te",  # codespell:ignore
+        "tha": "th",  # codespell:ignore
+        "tir": "ti",
+        "tgl": "tl",
+        "ton": "to",
+        "tur": "tr",
+        "uig": "ug",
+        "ukr": "uk",
+        "urd": "ur",
+        "uzb": "uz",
+        "vie": "vi",  # Vietnamese: the Tesseract code is "vie"
+        "yid": "yi",
+        "yor": "yo",
+        "chi": "zh",
+    }