Performance: Add support for configuring date parser languages (#10181)

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
This commit is contained in:
Antoine Mérino
2025-07-01 07:57:38 +02:00
committed by GitHub
parent c974dc9400
commit 6591d5da63
6 changed files with 290 additions and 4 deletions

View File

@@ -1,5 +1,7 @@
import datetime
import json
import logging
import logging.config
import math
import multiprocessing
import os
@@ -12,9 +14,14 @@ from urllib.parse import urlparse
from celery.schedules import crontab
from concurrent_log_handler.queue import setup_logging_queues
from dateparser.languages.loader import LocaleDataLoader
from django.utils.translation import gettext_lazy as _
from dotenv import load_dotenv
from paperless.utils import ocr_to_dateparser_languages
logger = logging.getLogger("paperless.settings")
# Tap paperless.conf if it's available
for path in [
os.getenv("PAPERLESS_CONFIGURATION_PATH"),
@@ -864,6 +871,10 @@ LOGGING = {
},
}
# Configure logging before calling any logger in settings.py so it will respect the log format, even if Django has not parsed the settings yet.
logging.config.dictConfig(LOGGING)
###############################################################################
# Task queue #
###############################################################################
@@ -1166,6 +1177,84 @@ POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT")
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
"""
Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")
into a list of locales compatible with the `dateparser` library.
- If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").
Falls back to the base language (e.g., "az") if needed.
- If a language cannot be mapped or validated, it is skipped with a warning.
- Returns a list of valid locales, or an empty list if none could be converted.
"""
ocr_to_dateparser = ocr_to_dateparser_languages()
loader = LocaleDataLoader()
result = []
try:
for ocr_language in ocr_languages.split("+"):
# Split into language and optional script
ocr_lang_part, *script = ocr_language.split("_")
ocr_script_part = script[0] if script else None
language_part = ocr_to_dateparser.get(ocr_lang_part)
if language_part is None:
logger.warning(
f'Skipping unknown OCR language "{ocr_language}" — no dateparser equivalent.',
)
continue
# Ensure base language is supported by dateparser
loader.get_locale_map(locales=[language_part])
# Try to add the script part if it's supported by dateparser
if ocr_script_part:
dateparser_language = f"{language_part}-{ocr_script_part.title()}"
try:
loader.get_locale_map(locales=[dateparser_language])
except Exception:
logger.warning(
f"Language variant '{dateparser_language}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
)
dateparser_language = language_part
else:
dateparser_language = language_part
if dateparser_language not in result:
result.append(dateparser_language)
except Exception as e:
logger.warning(
f"Could not configure dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
)
return []
if not result:
logger.warning(
"Could not configure any dateparser languages from OCR_LANGUAGE — fallback to autodetection.",
)
return result
def _parse_dateparser_languages(languages: str | None):
language_list = languages.split("+") if languages else []
# There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib.
# See: https://github.com/scrapinghub/dateparser/issues/875
for index, language in enumerate(language_list):
if language.startswith("zh-") and "zh" not in language_list:
logger.warning(
f'Chinese locale detected: {language}. dateparser might fail to parse some dates with this locale, so Chinese ("zh") will be used as a fallback.',
)
language_list.append("zh")
return list(LocaleDataLoader().get_locale_map(locales=language_list))
if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"):
DATE_PARSER_LANGUAGES = _parse_dateparser_languages(
os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"),
)
else:
DATE_PARSER_LANGUAGES = _ocr_to_dateparser_languages(OCR_LANGUAGE)
# Maximum number of dates taken from document start to end to show as suggestions for
# `created` date in the frontend. Duplicates are removed, which can result in
# fewer dates shown.

View File

@@ -3,10 +3,13 @@ import os
from unittest import TestCase
from unittest import mock
import pytest
from celery.schedules import crontab
from paperless.settings import _ocr_to_dateparser_languages
from paperless.settings import _parse_base_paths
from paperless.settings import _parse_beat_schedule
from paperless.settings import _parse_dateparser_languages
from paperless.settings import _parse_db_settings
from paperless.settings import _parse_ignore_dates
from paperless.settings import _parse_paperless_url
@@ -471,3 +474,50 @@ class TestPathSettings(TestCase):
base_paths = _parse_base_paths()
self.assertEqual("/paperless/", base_paths[1]) # BASE_URL
self.assertEqual("/foobar/", base_paths[4]) # LOGOUT_REDIRECT_URL
@pytest.mark.parametrize(
("ocr_language", "expected"),
[
# One language
("eng", ["en"]),
# Multiple languages
("fra+ita+lao", ["fr", "it", "lo"]),
# Languages that don't have a two-letter equivalent
("fil", ["fil"]),
# Languages with a script part supported by dateparser
("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
# Languages with a script part not supported by dateparser
# In this case, default to the language without script
("deu_frak", ["de"]),
# Traditional and simplified chinese don't have the same name in dateparser,
# so they're converted to the general chinese language
("chi_tra+chi_sim", ["zh"]),
# If a language is not supported by dateparser, fallback to the supported ones
("eng+unsupported_language+por", ["en", "pt"]),
# If no language is supported, fallback to default
("unsupported1+unsupported2", []),
],
)
def test_ocr_to_dateparser_languages(ocr_language, expected):
assert sorted(_ocr_to_dateparser_languages(ocr_language)) == sorted(expected)
@pytest.mark.parametrize(
("languages", "expected"),
[
("de", ["de"]),
("zh", ["zh"]),
("fr+en", ["fr", "en"]),
# Locales must be supported
("en-001+fr-CA", ["en-001", "fr-CA"]),
("en-001+fr", ["en-001", "fr"]),
# Special case for Chinese: variants seem to miss some dates,
# so we always add "zh" as a fallback.
("en+zh-Hans-HK", ["en", "zh-Hans-HK", "zh"]),
("en+zh-Hans", ["en", "zh-Hans", "zh"]),
("en+zh-Hans+zh-Hant", ["en", "zh-Hans", "zh-Hant", "zh"]),
],
)
def test_parser_date_parser_languages(languages, expected):
assert sorted(_parse_dateparser_languages(languages)) == sorted(expected)

110
src/paperless/utils.py Normal file
View File

@@ -0,0 +1,110 @@
def ocr_to_dateparser_languages() -> dict[str, str]:
"""
Translation map from languages supported by Tesseract OCR
to languages supported by dateparser.
To add a language, make sure it is supported by both libraries.
The ISO 639-2 will help you link a 3-char to 2-char language code.
Links:
- Tesseract languages: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
- Python dateparser languages: https://dateparser.readthedocs.io/en/latest/supported_locales.html
- ISO 639-2: https://www.loc.gov/standards/iso639-2/php/code_list.php
"""
# TODO check these Dateparser languages as they are not referenced on the ISO639-2 standard,
# so we didn't find the equivalent in Tesseract:
# agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
# ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
# rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
return {
"afr": "af",
"amh": "am",
"ara": "ar",
"asm": "as",
"ast": "ast",
"aze": "az",
"bel": "be",
"bul": "bg",
"ben": "bn",
"bod": "bo",
"bre": "br",
"bos": "bs",
"cat": "ca",
"cher": "chr",
"ces": "cs",
"cym": "cy",
"dan": "da",
"deu": "de",
"dzo": "dz",
"ell": "el",
"eng": "en",
"epo": "eo",
"spa": "es",
"est": "et",
"eus": "eu",
"fas": "fa",
"fin": "fi",
"fil": "fil",
"fao": "fo", # codespell:ignore
"fra": "fr",
"fry": "fy",
"gle": "ga",
"gla": "gd",
"glg": "gl",
"guj": "gu",
"heb": "he",
"hin": "hi",
"hrv": "hr",
"hun": "hu",
"hye": "hy",
"ind": "id",
"isl": "is",
"ita": "it",
"jpn": "ja",
"kat": "ka",
"kaz": "kk",
"khm": "km",
"knda": "kn",
"kor": "ko",
"kir": "ky",
"ltz": "lb",
"lao": "lo",
"lit": "lt",
"lav": "lv",
"mal": "ml",
"mon": "mn",
"mar": "mr",
"msa": "ms",
"mlt": "mt",
"mya": "my",
"nep": "ne",
"nld": "nl",
"ori": "or",
"pan": "pa",
"pol": "pl",
"pus": "ps",
"por": "pt",
"que": "qu",
"ron": "ro",
"rus": "ru",
"sin": "si",
"slk": "sk",
"slv": "sl",
"sqi": "sq",
"srp": "sr",
"swe": "sv",
"swa": "sw",
"tam": "ta",
"tel": "te", # codespell:ignore
"tha": "th", # codespell:ignore
"tir": "ti",
"tgl": "tl",
"ton": "to",
"tur": "tr",
"uig": "ug",
"ukr": "uk",
"urd": "ur",
"uzb": "uz",
"via": "vi",
"yid": "yi",
"yor": "yo",
"chi": "zh",
}