Performance: Add support for configuring date parser languages (#10181)

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
This commit is contained in:
Antoine Mérino 2025-07-01 07:57:38 +02:00 committed by GitHub
parent c974dc9400
commit 6591d5da63
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 290 additions and 4 deletions

View File

@ -1003,6 +1003,22 @@ still perform some basic text pre-processing before matching.
Defaults to 1. Defaults to 1.
#### [`PAPERLESS_DATE_PARSER_LANGUAGES=<lang>`](#PAPERLESS_DATE_PARSER_LANGUAGES) {#PAPERLESS_DATE_PARSER_LANGUAGES}
Specifies which language(s) Paperless should use when parsing dates from documents.
This should be a language code supported by the dateparser library,
for example: "en", or a combination such as "en+de".
Locales are also supported (e.g., "en-AU").
Multiple languages can be combined using "+", for example: "en+de" or "en-AU+de".
For valid values, refer to the list of supported languages and locales in the [dateparser documentation](https://dateparser.readthedocs.io/en/latest/supported_locales.html).
Set this to match the languages in which most of your documents are written.
If not set, Paperless will attempt to infer the language(s) from the OCR configuration (`PAPERLESS_OCR_LANGUAGE`).
!!! note
This format differs from the `PAPERLESS_OCR_LANGUAGE` setting, which uses ISO 639-2 codes (3 letters, e.g., "eng+deu" for Tesseract OCR).
#### [`PAPERLESS_EMAIL_TASK_CRON=<cron expression>`](#PAPERLESS_EMAIL_TASK_CRON) {#PAPERLESS_EMAIL_TASK_CRON} #### [`PAPERLESS_EMAIL_TASK_CRON=<cron expression>`](#PAPERLESS_EMAIL_TASK_CRON) {#PAPERLESS_EMAIL_TASK_CRON}
: Configures the scheduled email fetching frequency. The value : Configures the scheduled email fetching frequency. The value

View File

@ -280,6 +280,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
"RETURN_AS_TIMEZONE_AWARE": True, "RETURN_AS_TIMEZONE_AWARE": True,
"TIMEZONE": settings.TIME_ZONE, "TIMEZONE": settings.TIME_ZONE,
}, },
locales=settings.DATE_PARSER_LANGUAGES,
) )
def __filter(date: datetime.datetime) -> datetime.datetime | None: def __filter(date: datetime.datetime) -> datetime.datetime | None:

View File

@ -44,12 +44,22 @@ class TestDate:
) )
assert parse_date("", text) is None assert parse_date("", text) is None
def test_date_format_7(
    self,
    settings: SettingsWrapper,
    settings_timezone: ZoneInfo,
):
    """A bare German "month year" line is parsed with no languages configured."""
    # An empty language list leaves language selection to dateparser.
    settings.DATE_PARSER_LANGUAGES = []
    content = "lorem ipsum\nMärz 2019\nlorem ipsum"
    expected = datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
    assert parse_date("", content) == expected
def test_date_format_8(self, settings_timezone: ZoneInfo): def test_date_format_8(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
settings.DATE_PARSER_LANGUAGES = ["de"]
text = ( text = (
"lorem ipsum\n" "lorem ipsum\n"
"Wohnort\n" "Wohnort\n"
@ -71,7 +81,12 @@ class TestDate:
tzinfo=settings_timezone, tzinfo=settings_timezone,
) )
def test_date_format_9(self, settings_timezone: ZoneInfo): def test_date_format_9(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
settings.DATE_PARSER_LANGUAGES = ["de"]
text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum" text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
assert parse_date("", text) == datetime.datetime( assert parse_date("", text) == datetime.datetime(
2020, 2020,
@ -250,7 +265,12 @@ class TestDate:
def test_crazy_date_with_spaces(self): def test_crazy_date_with_spaces(self):
assert parse_date("", "20 408000l 2475") is None assert parse_date("", "20 408000l 2475") is None
def test_utf_month_names(self, settings_timezone: ZoneInfo): def test_utf_month_names(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
settings.DATE_PARSER_LANGUAGES = ["fr", "de", "hr", "cs", "pl", "tr"]
assert parse_date("", "13 décembre 2023") == datetime.datetime( assert parse_date("", "13 décembre 2023") == datetime.datetime(
2023, 2023,
12, 12,

View File

@ -1,5 +1,7 @@
import datetime import datetime
import json import json
import logging
import logging.config
import math import math
import multiprocessing import multiprocessing
import os import os
@ -12,9 +14,14 @@ from urllib.parse import urlparse
from celery.schedules import crontab from celery.schedules import crontab
from concurrent_log_handler.queue import setup_logging_queues from concurrent_log_handler.queue import setup_logging_queues
from dateparser.languages.loader import LocaleDataLoader
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
from dotenv import load_dotenv from dotenv import load_dotenv
from paperless.utils import ocr_to_dateparser_languages
logger = logging.getLogger("paperless.settings")
# Tap paperless.conf if it's available # Tap paperless.conf if it's available
for path in [ for path in [
os.getenv("PAPERLESS_CONFIGURATION_PATH"), os.getenv("PAPERLESS_CONFIGURATION_PATH"),
@ -864,6 +871,10 @@ LOGGING = {
}, },
} }
# Configure logging before calling any logger in settings.py so it will respect the log format, even if Django has not parsed the settings yet.
logging.config.dictConfig(LOGGING)
############################################################################### ###############################################################################
# Task queue # # Task queue #
############################################################################### ###############################################################################
@ -1166,6 +1177,84 @@ POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT")
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY") DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
    """
    Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")
    into a list of locales compatible with the `dateparser` library.

    - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").
      Falls back to the base language (e.g., "az") if needed.
    - If a language cannot be mapped or validated, it is skipped with a warning;
      the other languages of the setting are still returned.
    - Returns a list of valid locales (without duplicates, in input order),
      or an empty list if none could be converted or an unexpected error occurred.

    Validation is done by asking dateparser's LocaleDataLoader to load the
    locale; any exception raised by that call is treated as "not supported".
    """
    ocr_to_dateparser = ocr_to_dateparser_languages()
    loader = LocaleDataLoader()
    result: list[str] = []
    try:
        for ocr_language in ocr_languages.split("+"):
            # Split into language and optional script
            ocr_lang_part, *script = ocr_language.split("_")
            ocr_script_part = script[0] if script else None

            language_part = ocr_to_dateparser.get(ocr_lang_part)
            if language_part is None:
                logger.warning(
                    f'Skipping unknown OCR language "{ocr_language}" — no dateparser equivalent.',
                )
                continue

            # Ensure base language is supported by dateparser. A failure here
            # must only drop this one language, not the ones already resolved.
            try:
                loader.get_locale_map(locales=[language_part])
            except Exception:
                logger.warning(
                    f'Skipping OCR language "{ocr_language}" — "{language_part}" is not supported by dateparser.',
                )
                continue

            # Try to add the script part if it's supported by dateparser
            dateparser_language = language_part
            if ocr_script_part:
                candidate = f"{language_part}-{ocr_script_part.title()}"
                try:
                    loader.get_locale_map(locales=[candidate])
                except Exception:
                    logger.warning(
                        f"Language variant '{candidate}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
                    )
                else:
                    dateparser_language = candidate

            if dateparser_language not in result:
                result.append(dateparser_language)
    except Exception as e:
        # Last-resort guard: settings loading must not crash because of this
        # best-effort conversion (e.g. an unexpected OCR_LANGUAGE value).
        logger.warning(
            f"Could not configure dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
        )
        return []

    if not result:
        logger.warning(
            "Could not configure any dateparser languages from OCR_LANGUAGE — fallback to autodetection.",
        )
    return result
def _parse_dateparser_languages(languages: str | None) -> list[str]:
    """
    Parse a "+"-separated list of dateparser languages/locales (e.g. "en+de"
    or "en-AU+de") into the list of locales handed to dateparser.

    When a Chinese variant ("zh-...") is requested without plain "zh", "zh" is
    appended as a fallback and a warning is logged.

    The list is passed through dateparser's LocaleDataLoader and the keys of
    the resulting locale map are returned.
    NOTE(review): unknown locales presumably make LocaleDataLoader raise —
    confirm against the dateparser documentation.
    """
    language_list = languages.split("+") if languages else []

    # There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib.
    # See: https://github.com/scrapinghub/dateparser/issues/875
    if "zh" not in language_list:
        # Look the variant up first instead of appending to the list while
        # iterating over it.
        chinese_variant = next(
            (language for language in language_list if language.startswith("zh-")),
            None,
        )
        if chinese_variant is not None:
            logger.warning(
                f'Chinese locale detected: {chinese_variant}. dateparser might fail to parse some dates with this locale, so Chinese ("zh") will be used as a fallback.',
            )
            language_list.append("zh")

    return list(LocaleDataLoader().get_locale_map(locales=language_list))
# An explicit PAPERLESS_DATE_PARSER_LANGUAGES takes precedence; otherwise the
# languages are derived from the OCR language configuration.
DATE_PARSER_LANGUAGES = (
    _parse_dateparser_languages(os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"))
    if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES")
    else _ocr_to_dateparser_languages(OCR_LANGUAGE)
)
# Maximum number of dates taken from document start to end to show as suggestions for # Maximum number of dates taken from document start to end to show as suggestions for
# `created` date in the frontend. Duplicates are removed, which can result in # `created` date in the frontend. Duplicates are removed, which can result in
# fewer dates shown. # fewer dates shown.

View File

@ -3,10 +3,13 @@ import os
from unittest import TestCase from unittest import TestCase
from unittest import mock from unittest import mock
import pytest
from celery.schedules import crontab from celery.schedules import crontab
from paperless.settings import _ocr_to_dateparser_languages
from paperless.settings import _parse_base_paths from paperless.settings import _parse_base_paths
from paperless.settings import _parse_beat_schedule from paperless.settings import _parse_beat_schedule
from paperless.settings import _parse_dateparser_languages
from paperless.settings import _parse_db_settings from paperless.settings import _parse_db_settings
from paperless.settings import _parse_ignore_dates from paperless.settings import _parse_ignore_dates
from paperless.settings import _parse_paperless_url from paperless.settings import _parse_paperless_url
@ -471,3 +474,50 @@ class TestPathSettings(TestCase):
base_paths = _parse_base_paths() base_paths = _parse_base_paths()
self.assertEqual("/paperless/", base_paths[1]) # BASE_URL self.assertEqual("/paperless/", base_paths[1]) # BASE_URL
self.assertEqual("/foobar/", base_paths[4]) # LOGOUT_REDIRECT_URL self.assertEqual("/foobar/", base_paths[4]) # LOGOUT_REDIRECT_URL
@pytest.mark.parametrize(
    ("ocr_language", "expected"),
    [
        # A single language
        ("eng", ["en"]),
        # Several languages joined with "+"
        ("fra+ita+lao", ["fr", "it", "lo"]),
        # A language without a two-letter code keeps its three-letter code
        ("fil", ["fil"]),
        # Script suffixes that dateparser knows are kept as part of the locale
        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
        # Script suffixes that dateparser does not know are dropped,
        # leaving only the base language
        ("deu_frak", ["de"]),
        # Traditional and Simplified Chinese are named differently in dateparser,
        # so both collapse to the generic Chinese language
        ("chi_tra+chi_sim", ["zh"]),
        # Unsupported entries are ignored; the supported ones are still returned
        ("eng+unsupported_language+por", ["en", "pt"]),
        # Nothing supported at all yields an empty list
        ("unsupported1+unsupported2", []),
    ],
)
def test_ocr_to_dateparser_languages(ocr_language, expected):
    """The OCR language setting is converted to the expected dateparser locales."""
    converted = _ocr_to_dateparser_languages(ocr_language)
    # Order is not part of the check, so both sides are sorted first.
    assert sorted(converted) == sorted(expected)
@pytest.mark.parametrize(
    ("languages", "expected"),
    [
        ("de", ["de"]),
        ("zh", ["zh"]),
        ("fr+en", ["fr", "en"]),
        # Locales (language plus region) are accepted as well
        ("en-001+fr-CA", ["en-001", "fr-CA"]),
        ("en-001+fr", ["en-001", "fr"]),
        # Chinese variants appear to miss some dates, so plain "zh"
        # is always added as a fallback.
        ("en+zh-Hans-HK", ["en", "zh-Hans-HK", "zh"]),
        ("en+zh-Hans", ["en", "zh-Hans", "zh"]),
        ("en+zh-Hans+zh-Hant", ["en", "zh-Hans", "zh-Hant", "zh"]),
    ],
)
def test_parser_date_parser_languages(languages, expected):
    """The configured language string is parsed into the expected locale list."""
    parsed = _parse_dateparser_languages(languages)
    # Order is not part of the check, so both sides are sorted first.
    assert sorted(parsed) == sorted(expected)

110
src/paperless/utils.py Normal file
View File

@ -0,0 +1,110 @@
def ocr_to_dateparser_languages() -> dict[str, str]:
    """
    Translation map from languages supported by Tesseract OCR
    to languages supported by dateparser.

    Keys are Tesseract language codes (the part before any "_script" suffix),
    values are the matching dateparser language codes. A new dict is returned
    on every call.

    To add a language, make sure it is supported by both libraries.
    The ISO 639-2 list helps to link a 3-char code to its 2-char equivalent.

    Links:
    - Tesseract languages: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
    - Python dateparser languages: https://dateparser.readthedocs.io/en/latest/supported_locales.html
    - ISO 639-2: https://www.loc.gov/standards/iso639-2/php/code_list.php
    """
    # TODO check these Dateparser languages as they are not referenced on the ISO639-2 standard,
    # so we didn't find the equivalent in Tesseract:
    # agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
    # ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
    # rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
    return {
        "afr": "af",
        "amh": "am",
        "ara": "ar",
        "asm": "as",
        "ast": "ast",
        "aze": "az",
        "bel": "be",
        "bul": "bg",
        "ben": "bn",
        "bod": "bo",
        "bre": "br",
        "bos": "bs",
        "cat": "ca",
        "chr": "chr",
        "ces": "cs",
        "cym": "cy",
        "dan": "da",
        "deu": "de",
        "dzo": "dz",
        "ell": "el",
        "eng": "en",
        "epo": "eo",
        "spa": "es",
        "est": "et",
        "eus": "eu",
        "fas": "fa",
        "fin": "fi",
        "fil": "fil",
        "fao": "fo",  # codespell:ignore
        "fra": "fr",
        "fry": "fy",
        "gle": "ga",
        "gla": "gd",
        "glg": "gl",
        "guj": "gu",
        "heb": "he",
        "hin": "hi",
        "hrv": "hr",
        "hun": "hu",
        "hye": "hy",
        "ind": "id",
        "isl": "is",
        "ita": "it",
        "jpn": "ja",
        "kat": "ka",
        "kaz": "kk",
        "khm": "km",
        "kan": "kn",
        "kor": "ko",
        "kir": "ky",
        "ltz": "lb",
        "lao": "lo",
        "lit": "lt",
        "lav": "lv",
        "mal": "ml",
        "mon": "mn",
        "mar": "mr",
        "msa": "ms",
        "mlt": "mt",
        "mya": "my",
        "nep": "ne",
        "nld": "nl",
        "ori": "or",
        "pan": "pa",
        "pol": "pl",
        "pus": "ps",
        "por": "pt",
        "que": "qu",
        "ron": "ro",
        "rus": "ru",
        "sin": "si",
        "slk": "sk",
        "slv": "sl",
        "sqi": "sq",
        "srp": "sr",
        "swe": "sv",
        "swa": "sw",
        "tam": "ta",
        "tel": "te",  # codespell:ignore
        "tha": "th",  # codespell:ignore
        "tir": "ti",
        "tgl": "tl",
        "ton": "to",
        "tur": "tr",
        "uig": "ug",
        "ukr": "uk",
        "urd": "ur",
        "uzb": "uz",
        "vie": "vi",
        "yid": "yi",
        "yor": "yo",
        "chi": "zh",
        # Legacy aliases kept for backward compatibility. These are not
        # Tesseract language codes; the correct codes "chr", "kan" and "vie"
        # are listed above.
        "cher": "chr",
        "knda": "kn",
        "via": "vi",
    }