mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-02 16:14:39 -05:00
Performance: Add support for configuring date parser languages (#10181)
--------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
This commit is contained in:
parent
c974dc9400
commit
6591d5da63
@ -1003,6 +1003,22 @@ still perform some basic text pre-processing before matching.
|
|||||||
|
|
||||||
Defaults to 1.
|
Defaults to 1.
|
||||||
|
|
||||||
|
#### [`PAPERLESS_DATE_PARSER_LANGUAGES=<lang>`](#PAPERLESS_DATE_PARSER_LANGUAGES) {#PAPERLESS_DATE_PARSER_LANGUAGES}
|
||||||
|
|
||||||
|
Specifies which language(s) Paperless should use when parsing dates from documents.
|
||||||
|
|
||||||
|
This should be a language code supported by the dateparser library,
|
||||||
|
for example: "en", or a combination such as "en+de".
|
||||||
|
Locales are also supported (e.g., "en-AU").
|
||||||
|
Multiple languages can be combined using "+", for example: "en+de" or "en-AU+de".
|
||||||
|
For valid values, refer to the list of supported languages and locales in the [dateparser documentation](https://dateparser.readthedocs.io/en/latest/supported_locales.html).
|
||||||
|
|
||||||
|
Set this to match the languages in which most of your documents are written.
|
||||||
|
If not set, Paperless will attempt to infer the language(s) from the OCR configuration (`PAPERLESS_OCR_LANGUAGE`).
|
||||||
|
|
||||||
|
!!! note
|
||||||
|
This format differs from the `PAPERLESS_OCR_LANGUAGE` setting, which uses ISO 639-2 codes (3 letters, e.g., "eng+deu" for Tesseract OCR).
|
||||||
|
|
||||||
#### [`PAPERLESS_EMAIL_TASK_CRON=<cron expression>`](#PAPERLESS_EMAIL_TASK_CRON) {#PAPERLESS_EMAIL_TASK_CRON}
|
#### [`PAPERLESS_EMAIL_TASK_CRON=<cron expression>`](#PAPERLESS_EMAIL_TASK_CRON) {#PAPERLESS_EMAIL_TASK_CRON}
|
||||||
|
|
||||||
: Configures the scheduled email fetching frequency. The value
|
: Configures the scheduled email fetching frequency. The value
|
||||||
|
@ -280,6 +280,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
|
|||||||
"RETURN_AS_TIMEZONE_AWARE": True,
|
"RETURN_AS_TIMEZONE_AWARE": True,
|
||||||
"TIMEZONE": settings.TIME_ZONE,
|
"TIMEZONE": settings.TIME_ZONE,
|
||||||
},
|
},
|
||||||
|
locales=settings.DATE_PARSER_LANGUAGES,
|
||||||
)
|
)
|
||||||
|
|
||||||
def __filter(date: datetime.datetime) -> datetime.datetime | None:
|
def __filter(date: datetime.datetime) -> datetime.datetime | None:
|
||||||
|
@ -44,12 +44,22 @@ class TestDate:
|
|||||||
)
|
)
|
||||||
assert parse_date("", text) is None
|
assert parse_date("", text) is None
|
||||||
|
|
||||||
def test_date_format_7(self, settings_timezone: ZoneInfo):
|
def test_date_format_7(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
settings_timezone: ZoneInfo,
|
||||||
|
):
|
||||||
|
settings.DATE_PARSER_LANGUAGES = []
|
||||||
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
|
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
|
||||||
date = parse_date("", text)
|
date = parse_date("", text)
|
||||||
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
|
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
|
||||||
|
|
||||||
def test_date_format_8(self, settings_timezone: ZoneInfo):
|
def test_date_format_8(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
settings_timezone: ZoneInfo,
|
||||||
|
):
|
||||||
|
settings.DATE_PARSER_LANGUAGES = ["de"]
|
||||||
text = (
|
text = (
|
||||||
"lorem ipsum\n"
|
"lorem ipsum\n"
|
||||||
"Wohnort\n"
|
"Wohnort\n"
|
||||||
@ -71,7 +81,12 @@ class TestDate:
|
|||||||
tzinfo=settings_timezone,
|
tzinfo=settings_timezone,
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_date_format_9(self, settings_timezone: ZoneInfo):
|
def test_date_format_9(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
settings_timezone: ZoneInfo,
|
||||||
|
):
|
||||||
|
settings.DATE_PARSER_LANGUAGES = ["de"]
|
||||||
text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
|
text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
|
||||||
assert parse_date("", text) == datetime.datetime(
|
assert parse_date("", text) == datetime.datetime(
|
||||||
2020,
|
2020,
|
||||||
@ -250,7 +265,12 @@ class TestDate:
|
|||||||
def test_crazy_date_with_spaces(self):
|
def test_crazy_date_with_spaces(self):
|
||||||
assert parse_date("", "20 408000l 2475") is None
|
assert parse_date("", "20 408000l 2475") is None
|
||||||
|
|
||||||
def test_utf_month_names(self, settings_timezone: ZoneInfo):
|
def test_utf_month_names(
|
||||||
|
self,
|
||||||
|
settings: SettingsWrapper,
|
||||||
|
settings_timezone: ZoneInfo,
|
||||||
|
):
|
||||||
|
settings.DATE_PARSER_LANGUAGES = ["fr", "de", "hr", "cs", "pl", "tr"]
|
||||||
assert parse_date("", "13 décembre 2023") == datetime.datetime(
|
assert parse_date("", "13 décembre 2023") == datetime.datetime(
|
||||||
2023,
|
2023,
|
||||||
12,
|
12,
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
|
import logging.config
|
||||||
import math
|
import math
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
@ -12,9 +14,14 @@ from urllib.parse import urlparse
|
|||||||
|
|
||||||
from celery.schedules import crontab
|
from celery.schedules import crontab
|
||||||
from concurrent_log_handler.queue import setup_logging_queues
|
from concurrent_log_handler.queue import setup_logging_queues
|
||||||
|
from dateparser.languages.loader import LocaleDataLoader
|
||||||
from django.utils.translation import gettext_lazy as _
|
from django.utils.translation import gettext_lazy as _
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
from paperless.utils import ocr_to_dateparser_languages
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.settings")
|
||||||
|
|
||||||
# Tap paperless.conf if it's available
|
# Tap paperless.conf if it's available
|
||||||
for path in [
|
for path in [
|
||||||
os.getenv("PAPERLESS_CONFIGURATION_PATH"),
|
os.getenv("PAPERLESS_CONFIGURATION_PATH"),
|
||||||
@ -864,6 +871,10 @@ LOGGING = {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Configure logging before calling any logger in settings.py so it will respect the log format, even if Django has not parsed the settings yet.
|
||||||
|
logging.config.dictConfig(LOGGING)
|
||||||
|
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
# Task queue #
|
# Task queue #
|
||||||
###############################################################################
|
###############################################################################
|
||||||
@ -1166,6 +1177,84 @@ POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT")
|
|||||||
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
|
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
|
||||||
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
||||||
|
|
||||||
|
|
||||||
|
def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
    """
    Convert Tesseract OCR_LANGUAGE codes (e.g. "eng+fra", optionally carrying a
    script suffix such as "aze_Cyrl") into a list of locales compatible with the
    `dateparser` library.

    - Each "+"-separated entry is split on "_" into a language and an optional script.
    - The language is translated through `ocr_to_dateparser_languages()`; entries
      without a translation are skipped with a warning.
    - A translated language that dateparser rejects is skipped with a warning as
      well, so one bad entry no longer discards the languages already accepted.
    - If a script is present, the full locale (e.g. "az-Cyrl") is tried first and
      the base language (e.g. "az") is used when dateparser rejects the variant.
    - Duplicates are dropped; the order of first appearance is preserved.

    Returns the list of valid locales, or an empty list if none could be converted
    or an unexpected error occurred (a warning is logged in both cases).
    """
    ocr_to_dateparser = ocr_to_dateparser_languages()
    loader = LocaleDataLoader()
    result: list[str] = []
    try:
        for ocr_language in ocr_languages.split("+"):
            # Split into language and optional script
            ocr_lang_part, *script = ocr_language.split("_")
            ocr_script_part = script[0] if script else None

            language_part = ocr_to_dateparser.get(ocr_lang_part)
            if language_part is None:
                logger.warning(
                    f'Skipping unknown OCR language "{ocr_language}" — no dateparser equivalent.',
                )
                continue

            # Ensure base language is supported by dateparser. A failure here only
            # concerns this entry, so skip it instead of aborting the whole conversion.
            try:
                loader.get_locale_map(locales=[language_part])
            except Exception:
                logger.warning(
                    f'Skipping OCR language "{ocr_language}" — "{language_part}" is not supported by dateparser.',
                )
                continue

            dateparser_language = language_part
            # Try to add the script part if it's supported by dateparser
            if ocr_script_part:
                language_variant = f"{language_part}-{ocr_script_part.title()}"
                try:
                    loader.get_locale_map(locales=[language_variant])
                except Exception:
                    logger.warning(
                        f"Language variant '{language_variant}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
                    )
                else:
                    dateparser_language = language_variant

            if dateparser_language not in result:
                result.append(dateparser_language)
    except Exception as e:
        # Best effort: settings must stay importable even when the conversion
        # breaks in an unforeseen way (e.g. a non-string OCR_LANGUAGE).
        logger.warning(
            f"Could not configure dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
        )
        return []
    if not result:
        logger.warning(
            "Could not configure any dateparser languages from OCR_LANGUAGE — fallback to autodetection.",
        )
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_dateparser_languages(languages: str | None) -> list[str]:
    """
    Parse a "+"-separated PAPERLESS_DATE_PARSER_LANGUAGES value into the list of
    locales handed to `dateparser`.

    Surrounding whitespace of each entry is ignored. When a Chinese variant
    ("zh-...") is requested without plain "zh", "zh" is appended as a fallback
    and a warning is logged.

    The resulting entries are passed to `LocaleDataLoader.get_locale_map`, whose
    keys are returned as a list; any error raised by that call for an entry it
    does not accept propagates to the caller.
    """
    language_list = (
        [language.strip() for language in languages.split("+")] if languages else []
    )
    # There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib.
    # See: https://github.com/scrapinghub/dateparser/issues/875
    if "zh" not in language_list:
        # Look the variant up first and append afterwards, so the list is never
        # modified while it is being iterated.
        chinese_variant = next(
            (language for language in language_list if language.startswith("zh-")),
            None,
        )
        if chinese_variant is not None:
            logger.warning(
                f'Chinese locale detected: {chinese_variant}. dateparser might fail to parse some dates with this locale, so Chinese ("zh") will be used as a fallback.',
            )
            language_list.append("zh")

    return list(LocaleDataLoader().get_locale_map(locales=language_list))
|
||||||
|
|
||||||
|
|
||||||
|
# An explicitly configured PAPERLESS_DATE_PARSER_LANGUAGES takes precedence;
# otherwise the languages are derived from the OCR language setting.
DATE_PARSER_LANGUAGES = (
    _parse_dateparser_languages(os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"))
    if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES")
    else _ocr_to_dateparser_languages(OCR_LANGUAGE)
)
|
||||||
|
|
||||||
|
|
||||||
# Maximum number of dates taken from document start to end to show as suggestions for
|
# Maximum number of dates taken from document start to end to show as suggestions for
|
||||||
# `created` date in the frontend. Duplicates are removed, which can result in
|
# `created` date in the frontend. Duplicates are removed, which can result in
|
||||||
# fewer dates shown.
|
# fewer dates shown.
|
||||||
|
@ -3,10 +3,13 @@ import os
|
|||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
|
import pytest
|
||||||
from celery.schedules import crontab
|
from celery.schedules import crontab
|
||||||
|
|
||||||
|
from paperless.settings import _ocr_to_dateparser_languages
|
||||||
from paperless.settings import _parse_base_paths
|
from paperless.settings import _parse_base_paths
|
||||||
from paperless.settings import _parse_beat_schedule
|
from paperless.settings import _parse_beat_schedule
|
||||||
|
from paperless.settings import _parse_dateparser_languages
|
||||||
from paperless.settings import _parse_db_settings
|
from paperless.settings import _parse_db_settings
|
||||||
from paperless.settings import _parse_ignore_dates
|
from paperless.settings import _parse_ignore_dates
|
||||||
from paperless.settings import _parse_paperless_url
|
from paperless.settings import _parse_paperless_url
|
||||||
@ -471,3 +474,50 @@ class TestPathSettings(TestCase):
|
|||||||
base_paths = _parse_base_paths()
|
base_paths = _parse_base_paths()
|
||||||
self.assertEqual("/paperless/", base_paths[1]) # BASE_URL
|
self.assertEqual("/paperless/", base_paths[1]) # BASE_URL
|
||||||
self.assertEqual("/foobar/", base_paths[4]) # LOGOUT_REDIRECT_URL
|
self.assertEqual("/foobar/", base_paths[4]) # LOGOUT_REDIRECT_URL
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("ocr_language", "expected"),
    [
        # Single language
        ("eng", ["en"]),
        # Several languages joined with "+"
        ("fra+ita+lao", ["fr", "it", "lo"]),
        # Language whose dateparser code is not a two-letter one
        ("fil", ["fil"]),
        # Script suffix that dateparser knows as a locale
        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
        # Script suffix unknown to dateparser: the bare language is used instead
        ("deu_frak", ["de"]),
        # Traditional and simplified Chinese are named differently in dateparser,
        # so both collapse into the general Chinese language
        ("chi_tra+chi_sim", ["zh"]),
        # Entries without a dateparser equivalent are dropped, the rest is kept
        ("eng+unsupported_language+por", ["en", "pt"]),
        # Nothing convertible at all: empty list
        ("unsupported1+unsupported2", []),
    ],
)
def test_ocr_to_dateparser_languages(ocr_language, expected):
    """The OCR language string converts to the expected dateparser locales (order-insensitive)."""
    converted = _ocr_to_dateparser_languages(ocr_language)
    assert sorted(converted) == sorted(expected)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("languages", "expected"),
    [
        ("de", ["de"]),
        ("zh", ["zh"]),
        ("fr+en", ["fr", "en"]),
        # Locales are accepted as well, alone or mixed with plain languages
        ("en-001+fr-CA", ["en-001", "fr-CA"]),
        ("en-001+fr", ["en-001", "fr"]),
        # Chinese variants seem to miss some dates, hence "zh" is always
        # expected as an additional fallback entry.
        ("en+zh-Hans-HK", ["en", "zh-Hans-HK", "zh"]),
        ("en+zh-Hans", ["en", "zh-Hans", "zh"]),
        ("en+zh-Hans+zh-Hant", ["en", "zh-Hans", "zh-Hant", "zh"]),
    ],
)
def test_parser_date_parser_languages(languages, expected):
    """The configured language string parses to the expected locales (order-insensitive)."""
    parsed = _parse_dateparser_languages(languages)
    assert sorted(parsed) == sorted(expected)
|
||||||
|
110
src/paperless/utils.py
Normal file
110
src/paperless/utils.py
Normal file
@ -0,0 +1,110 @@
|
|||||||
|
def ocr_to_dateparser_languages() -> dict[str, str]:
    """
    Translation map from languages supported by Tesseract OCR
    to languages supported by dateparser.

    Keys are Tesseract language codes (three letters, without any script
    suffix); values are the matching dateparser language codes. A fresh dict
    is returned on every call.

    To add a language, make sure it is supported by both libraries.
    The ISO 639-2 code list helps to link a three-letter code to its
    two-letter counterpart.
    Links:
    - Tesseract languages: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
    - Python dateparser languages: https://dateparser.readthedocs.io/en/latest/supported_locales.html
    - ISO 639-2: https://www.loc.gov/standards/iso639-2/php/code_list.php
    """
    # TODO check these Dateparser languages as they are not referenced on the ISO639-2 standard,
    # so we didn't find the equivalent in Tesseract:
    # agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
    # ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
    # rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
    return {
        "afr": "af",
        "amh": "am",
        "ara": "ar",
        "asm": "as",
        "ast": "ast",
        "aze": "az",
        "bel": "be",
        "bul": "bg",
        "ben": "bn",
        "bod": "bo",
        "bre": "br",
        "bos": "bs",
        "cat": "ca",
        # Tesseract names Cherokee "chr"; "cher" is the script code, not a language code.
        "chr": "chr",
        "ces": "cs",
        "cym": "cy",
        "dan": "da",
        "deu": "de",
        "dzo": "dz",
        "ell": "el",
        "eng": "en",
        "epo": "eo",
        "spa": "es",
        "est": "et",
        "eus": "eu",
        "fas": "fa",
        "fin": "fi",
        "fil": "fil",
        "fao": "fo",  # codespell:ignore
        "fra": "fr",
        "fry": "fy",
        "gle": "ga",
        "gla": "gd",
        "glg": "gl",
        "guj": "gu",
        "heb": "he",
        "hin": "hi",
        "hrv": "hr",
        "hun": "hu",
        "hye": "hy",
        "ind": "id",
        "isl": "is",
        "ita": "it",
        "jpn": "ja",
        "kat": "ka",
        "kaz": "kk",
        "khm": "km",
        # Tesseract names Kannada "kan"; "knda" is the script code, not a language code.
        "kan": "kn",
        "kor": "ko",
        "kir": "ky",
        "ltz": "lb",
        "lao": "lo",
        "lit": "lt",
        "lav": "lv",
        "mal": "ml",
        "mon": "mn",
        "mar": "mr",
        "msa": "ms",
        "mlt": "mt",
        "mya": "my",
        "nep": "ne",
        "nld": "nl",
        "ori": "or",
        "pan": "pa",
        "pol": "pl",
        "pus": "ps",
        "por": "pt",
        "que": "qu",
        "ron": "ro",
        "rus": "ru",
        "sin": "si",
        "slk": "sk",
        "slv": "sl",
        "sqi": "sq",
        "srp": "sr",
        "swe": "sv",
        "swa": "sw",
        "tam": "ta",
        "tel": "te",  # codespell:ignore
        "tha": "th",  # codespell:ignore
        "tir": "ti",
        "tgl": "tl",
        "ton": "to",
        "tur": "tr",
        "uig": "ug",
        "ukr": "uk",
        "urd": "ur",
        "uzb": "uz",
        # Vietnamese is "vie" in Tesseract; keep codespell from "correcting" it.
        "vie": "vi",  # codespell:ignore
        "yid": "yi",
        "yor": "yo",
        "chi": "zh",
    }
|
Loading…
x
Reference in New Issue
Block a user