From 6591d5da633abcb82c70bf9a585beb66db3bcd45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antoine=20M=C3=A9rino?= Date: Tue, 1 Jul 2025 07:57:38 +0200 Subject: [PATCH] Performance: Add support for configuring date parser languages (#10181) --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com> --- docs/configuration.md | 16 ++++ src/documents/parsers.py | 1 + src/documents/tests/test_date_parsing.py | 28 +++++- src/paperless/settings.py | 89 ++++++++++++++++++ src/paperless/tests/test_settings.py | 50 +++++++++++ src/paperless/utils.py | 110 +++++++++++++++++++++++ 6 files changed, 290 insertions(+), 4 deletions(-) create mode 100644 src/paperless/utils.py diff --git a/docs/configuration.md b/docs/configuration.md index 5da5b8e3e..7ad235376 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1003,6 +1003,22 @@ still perform some basic text pre-processing before matching. Defaults to 1. +#### [`PAPERLESS_DATE_PARSER_LANGUAGES=`](#PAPERLESS_DATE_PARSER_LANGUAGES) {#PAPERLESS_DATE_PARSER_LANGUAGES} + +Specifies which language Paperless should use when parsing dates from documents. + + This should be a language code supported by the dateparser library, + for example: "en", or a combination such as "en+de". + Locales are also supported (e.g., "en-AU"). + Multiple languages can be combined using "+", for example: "en+de" or "en-AU+de". + For valid values, refer to the list of supported languages and locales in the [dateparser documentation](https://dateparser.readthedocs.io/en/latest/supported_locales.html). + + Set this to match the languages in which most of your documents are written. + If not set, Paperless will attempt to infer the language(s) from the OCR configuration (`PAPERLESS_OCR_LANGUAGE`). + +!!! note +This format differs from the `PAPERLESS_OCR_LANGUAGE` setting, which uses ISO 639-2 codes (3 letters, e.g., "eng+deu" for Tesseract OCR). 
+ #### [`PAPERLESS_EMAIL_TASK_CRON=`](#PAPERLESS_EMAIL_TASK_CRON) {#PAPERLESS_EMAIL_TASK_CRON} : Configures the scheduled email fetching frequency. The value diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 1465234a9..526c131d0 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -280,6 +280,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]: "RETURN_AS_TIMEZONE_AWARE": True, "TIMEZONE": settings.TIME_ZONE, }, + locales=settings.DATE_PARSER_LANGUAGES, ) def __filter(date: datetime.datetime) -> datetime.datetime | None: diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py index f0afae543..1bad27266 100644 --- a/src/documents/tests/test_date_parsing.py +++ b/src/documents/tests/test_date_parsing.py @@ -44,12 +44,22 @@ class TestDate: ) assert parse_date("", text) is None - def test_date_format_7(self, settings_timezone: ZoneInfo): + def test_date_format_7( + self, + settings: SettingsWrapper, + settings_timezone: ZoneInfo, + ): + settings.DATE_PARSER_LANGUAGES = [] text = "lorem ipsum\nMärz 2019\nlorem ipsum" date = parse_date("", text) assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone) - def test_date_format_8(self, settings_timezone: ZoneInfo): + def test_date_format_8( + self, + settings: SettingsWrapper, + settings_timezone: ZoneInfo, + ): + settings.DATE_PARSER_LANGUAGES = ["de"] text = ( "lorem ipsum\n" "Wohnort\n" @@ -71,7 +81,12 @@ class TestDate: tzinfo=settings_timezone, ) - def test_date_format_9(self, settings_timezone: ZoneInfo): + def test_date_format_9( + self, + settings: SettingsWrapper, + settings_timezone: ZoneInfo, + ): + settings.DATE_PARSER_LANGUAGES = ["de"] text = "lorem ipsum\n27. 
Nullmonth 2020\nMärz 2020\nlorem ipsum" assert parse_date("", text) == datetime.datetime( 2020, @@ -250,7 +265,12 @@ class TestDate: def test_crazy_date_with_spaces(self): assert parse_date("", "20 408000l 2475") is None - def test_utf_month_names(self, settings_timezone: ZoneInfo): + def test_utf_month_names( + self, + settings: SettingsWrapper, + settings_timezone: ZoneInfo, + ): + settings.DATE_PARSER_LANGUAGES = ["fr", "de", "hr", "cs", "pl", "tr"] assert parse_date("", "13 décembre 2023") == datetime.datetime( 2023, 12, diff --git a/src/paperless/settings.py b/src/paperless/settings.py index b140bc17e..e77fceb96 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -1,5 +1,7 @@ import datetime import json +import logging +import logging.config import math import multiprocessing import os @@ -12,9 +14,14 @@ from urllib.parse import urlparse from celery.schedules import crontab from concurrent_log_handler.queue import setup_logging_queues +from dateparser.languages.loader import LocaleDataLoader from django.utils.translation import gettext_lazy as _ from dotenv import load_dotenv +from paperless.utils import ocr_to_dateparser_languages + +logger = logging.getLogger("paperless.settings") + # Tap paperless.conf if it's available for path in [ os.getenv("PAPERLESS_CONFIGURATION_PATH"), @@ -864,6 +871,10 @@ LOGGING = { }, } +# Configure logging before calling any logger in settings.py so it will respect the log format, even if Django has not parsed the settings yet. 
+logging.config.dictConfig(LOGGING) + + ############################################################################### # Task queue # ############################################################################### @@ -1166,6 +1177,84 @@ POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT") DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY") FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") + +def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]: + """ + Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl") + into a list of locales compatible with the `dateparser` library. + + - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl"). + Falls back to the base language (e.g., "az") if needed. + - If a language cannot be mapped or validated, it is skipped with a warning. + - Returns a list of valid locales, or an empty list if none could be converted. + """ + ocr_to_dateparser = ocr_to_dateparser_languages() + loader = LocaleDataLoader() + result = [] + try: + for ocr_language in ocr_languages.split("+"): + # Split into language and optional script + ocr_lang_part, *script = ocr_language.split("_") + ocr_script_part = script[0] if script else None + + language_part = ocr_to_dateparser.get(ocr_lang_part) + if language_part is None: + logger.warning( + f'Skipping unknown OCR language "{ocr_language}" — no dateparser equivalent.', + ) + continue + + # Ensure base language is supported by dateparser + loader.get_locale_map(locales=[language_part]) + + # Try to add the script part if it's supported by dateparser + if ocr_script_part: + dateparser_language = f"{language_part}-{ocr_script_part.title()}" + try: + loader.get_locale_map(locales=[dateparser_language]) + except Exception: + logger.warning( + f"Language variant '{dateparser_language}' not supported by dateparser; falling back to base language '{language_part}'. 
You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.", + ) + dateparser_language = language_part + else: + dateparser_language = language_part + if dateparser_language not in result: + result.append(dateparser_language) + except Exception as e: + logger.warning( + f"Could not configure dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}", + ) + return [] + if not result: + logger.warning( + "Could not configure any dateparser languages from OCR_LANGUAGE — fallback to autodetection.", + ) + return result + + +def _parse_dateparser_languages(languages: str | None): + language_list = languages.split("+") if languages else [] + # There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib. + # See: https://github.com/scrapinghub/dateparser/issues/875 + for index, language in enumerate(language_list): + if language.startswith("zh-") and "zh" not in language_list: + logger.warning( + f'Chinese locale detected: {language}. dateparser might fail to parse some dates with this locale, so Chinese ("zh") will be used as a fallback.', + ) + language_list.append("zh") + + return list(LocaleDataLoader().get_locale_map(locales=language_list)) + + +if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"): + DATE_PARSER_LANGUAGES = _parse_dateparser_languages( + os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"), + ) +else: + DATE_PARSER_LANGUAGES = _ocr_to_dateparser_languages(OCR_LANGUAGE) + + # Maximum number of dates taken from document start to end to show as suggestions for # `created` date in the frontend. Duplicates are removed, which can result in # fewer dates shown. 
diff --git a/src/paperless/tests/test_settings.py b/src/paperless/tests/test_settings.py index fe7356947..8a191f209 100644 --- a/src/paperless/tests/test_settings.py +++ b/src/paperless/tests/test_settings.py @@ -3,10 +3,13 @@ import os from unittest import TestCase from unittest import mock +import pytest from celery.schedules import crontab +from paperless.settings import _ocr_to_dateparser_languages from paperless.settings import _parse_base_paths from paperless.settings import _parse_beat_schedule +from paperless.settings import _parse_dateparser_languages from paperless.settings import _parse_db_settings from paperless.settings import _parse_ignore_dates from paperless.settings import _parse_paperless_url @@ -471,3 +474,50 @@ class TestPathSettings(TestCase): base_paths = _parse_base_paths() self.assertEqual("/paperless/", base_paths[1]) # BASE_URL self.assertEqual("/foobar/", base_paths[4]) # LOGOUT_REDIRECT_URL + + +@pytest.mark.parametrize( + ("ocr_language", "expected"), + [ + # One language + ("eng", ["en"]), + # Multiple languages + ("fra+ita+lao", ["fr", "it", "lo"]), + # Languages that don't have a two-letter equivalent + ("fil", ["fil"]), + # Languages with a script part supported by dateparser + ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]), + # Languages with a script part not supported by dateparser + # In this case, default to the language without script + ("deu_frak", ["de"]), + # Traditional and simplified chinese don't have the same name in dateparser, + # so they're converted to the general chinese language + ("chi_tra+chi_sim", ["zh"]), + # If a language is not supported by dateparser, fallback to the supported ones + ("eng+unsupported_language+por", ["en", "pt"]), + # If no language is supported, fallback to default + ("unsupported1+unsupported2", []), + ], +) +def test_ocr_to_dateparser_languages(ocr_language, expected): + assert sorted(_ocr_to_dateparser_languages(ocr_language)) == sorted(expected) + + +@pytest.mark.parametrize( + 
("languages", "expected"), + [ + ("de", ["de"]), + ("zh", ["zh"]), + ("fr+en", ["fr", "en"]), + # Locales must be supported + ("en-001+fr-CA", ["en-001", "fr-CA"]), + ("en-001+fr", ["en-001", "fr"]), + # Special case for Chinese: variants seem to miss some dates, + # so we always add "zh" as a fallback. + ("en+zh-Hans-HK", ["en", "zh-Hans-HK", "zh"]), + ("en+zh-Hans", ["en", "zh-Hans", "zh"]), + ("en+zh-Hans+zh-Hant", ["en", "zh-Hans", "zh-Hant", "zh"]), + ], +) +def test_parser_date_parser_languages(languages, expected): + assert sorted(_parse_dateparser_languages(languages)) == sorted(expected) diff --git a/src/paperless/utils.py b/src/paperless/utils.py new file mode 100644 index 000000000..965b862f7 --- /dev/null +++ b/src/paperless/utils.py @@ -0,0 +1,110 @@ +def ocr_to_dateparser_languages() -> dict[str, str]: + """ + Translation map from languages supported by Tesseract OCR + to languages supported by dateparser. + To add a language, make sure it is supported by both libraries. + The ISO 639-2 will help you link a 3-char to 2-char language code. 
+ Links: + - Tesseract languages: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html + - Python dateparser languages: https://dateparser.readthedocs.io/en/latest/supported_locales.html + - ISO 639-2: https://www.loc.gov/standards/iso639-2/php/code_list.php + """ + # TODO check these Dateparser languages as they are not referenced on the ISO639-2 standard, + # so we didn't find the equivalent in Tesseract: + # agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln, + # ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus, + # rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue + return { + "afr": "af", + "amh": "am", + "ara": "ar", + "asm": "as", + "ast": "ast", + "aze": "az", + "bel": "be", + "bul": "bg", + "ben": "bn", + "bod": "bo", + "bre": "br", + "bos": "bs", + "cat": "ca", + "chr": "chr", + "ces": "cs", + "cym": "cy", + "dan": "da", + "deu": "de", + "dzo": "dz", + "ell": "el", + "eng": "en", + "epo": "eo", + "spa": "es", + "est": "et", + "eus": "eu", + "fas": "fa", + "fin": "fi", + "fil": "fil", + "fao": "fo", # codespell:ignore + "fra": "fr", + "fry": "fy", + "gle": "ga", + "gla": "gd", + "glg": "gl", + "guj": "gu", + "heb": "he", + "hin": "hi", + "hrv": "hr", + "hun": "hu", + "hye": "hy", + "ind": "id", + "isl": "is", + "ita": "it", + "jpn": "ja", + "kat": "ka", + "kaz": "kk", + "khm": "km", + "kan": "kn", + "kor": "ko", + "kir": "ky", + "ltz": "lb", + "lao": "lo", + "lit": "lt", + "lav": "lv", + "mal": "ml", + "mon": "mn", + "mar": "mr", + "msa": "ms", + "mlt": "mt", + "mya": "my", + "nep": "ne", + "nld": "nl", + "ori": "or", + "pan": "pa", + "pol": "pl", + "pus": "ps", + "por": "pt", + "que": "qu", + "ron": "ro", + "rus": "ru", + "sin": "si", + "slk": "sk", + "slv": "sl", + "sqi": "sq", + "srp": "sr", + "swe": "sv", + "swa": "sw", + "tam": "ta", + "tel": "te", # codespell:ignore + "tha": "th", # codespell:ignore + "tir": "ti", + "tgl": "tl", + 
"ton": "to", + "tur": "tr", + "uig": "ug", + "ukr": "uk", + "urd": "ur", + "uzb": "uz", + "vie": "vi", # codespell:ignore + "yid": "yi", + "yor": "yo", + "chi": "zh", + }