Compare commits


2 Commits

Author          SHA1        Message                                                                                     Date
GitHub Actions  a6e41b4145  Auto translate strings                                                                     2025-08-31 22:25:05 +00:00
shamoon         cb927c5b22  Fix: include application config language settings for dateparser auto-detection (#10722)  2025-08-31 15:22:39 -07:00
13 changed files with 797 additions and 911 deletions
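The second commit moves dateparser locale selection from Django start-up to parse time, so a change to the OCR language in the application configuration is picked up without a restart; the compare also backs out the joblib-based classifier persistence and the psutil/resource memory diagnostics. A minimal sketch of the new lookup order, using names introduced in the diffs below:

```python
from django.conf import settings

from paperless.config import OcrConfig
from paperless.utils import ocr_to_dateparser_languages


def dateparser_locales() -> list[str]:
    # An explicit PAPERLESS_DATE_PARSER_LANGUAGES wins; otherwise derive
    # locales from the OCR language currently set in the application config.
    return settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
        OcrConfig().language,
    )
```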

.github/workflows/ci.yml (vendored; 1008 changed lines)

File diff suppressed because it is too large.

pyproject.toml

@@ -53,7 +53,6 @@ dependencies = [
     "ocrmypdf~=16.10.0",
     "pathvalidate~=3.3.1",
     "pdf2image~=1.17.0",
-    "psutil>=7",
     "psycopg-pool",
     "python-dateutil~=2.9.0",
     "python-dotenv~=1.1.0",

src/documents/classifier.py

@@ -4,7 +4,6 @@ import logging
 import pickle
 import re
 import warnings
-from functools import lru_cache
 from hashlib import sha256
 from pathlib import Path
 from typing import TYPE_CHECKING
@@ -51,7 +50,6 @@ class ClassifierModelCorruptError(Exception):
     pass


-@lru_cache(maxsize=1)
 def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None:
     if not settings.MODEL_FILE.is_file():
         logger.debug(
@@ -63,11 +61,6 @@ def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None:
     classifier = DocumentClassifier()
     try:
         classifier.load()
-        logger.debug("classifier_id=%s", id(classifier))
-        logger.debug(
-            "classifier_data_vectorizer_hash=%s",
-            classifier.data_vectorizer_hash,
-        )
     except IncompatibleClassifierVersionError as e:
         logger.info(f"Classifier version incompatible: {e.message}, will re-train")
@@ -103,8 +96,7 @@ class DocumentClassifier:
     # v7 - Updated scikit-learn package version
     # v8 - Added storage path classifier
     # v9 - Changed from hashing to time/ids for re-train check
-    # v10 - Switch persistence to joblib with memory-mapping to reduce load-time memory spikes
-    FORMAT_VERSION = 10
+    FORMAT_VERSION = 9

     def __init__(self) -> None:
         # last time a document changed and therefore training might be required
@@ -136,51 +128,32 @@ class DocumentClassifier:
         ).hexdigest()

     def load(self) -> None:
-        import joblib
         from sklearn.exceptions import InconsistentVersionWarning

         # Catch warnings for processing
         with warnings.catch_warnings(record=True) as w:
-            try:
-                state = joblib.load(settings.MODEL_FILE, mmap_mode="r")
-            except Exception as err:
-                # As a fallback, try to detect old pickle-based and mark incompatible
-                try:
-                    with Path(settings.MODEL_FILE).open("rb") as f:
-                        _ = pickle.load(f)
-                    raise IncompatibleClassifierVersionError(
-                        "Cannot load classifier, incompatible versions.",
-                    ) from err
-                except IncompatibleClassifierVersionError:
-                    raise
-                except Exception:
-                    # Not even a readable pickle header
-                    raise ClassifierModelCorruptError from err
-
-            try:
-                if (
-                    not isinstance(state, dict)
-                    or state.get("format_version") != self.FORMAT_VERSION
-                ):
-                    raise IncompatibleClassifierVersionError(
-                        "Cannot load classifier, incompatible versions.",
-                    )
-
-                self.last_doc_change_time = state.get("last_doc_change_time")
-                self.last_auto_type_hash = state.get("last_auto_type_hash")
-
-                self.data_vectorizer = state.get("data_vectorizer")
-                self._update_data_vectorizer_hash()
-                self.tags_binarizer = state.get("tags_binarizer")
-
-                self.tags_classifier = state.get("tags_classifier")
-                self.correspondent_classifier = state.get("correspondent_classifier")
-                self.document_type_classifier = state.get("document_type_classifier")
-                self.storage_path_classifier = state.get("storage_path_classifier")
-            except IncompatibleClassifierVersionError:
-                raise
-            except Exception as err:
-                raise ClassifierModelCorruptError from err
+            with Path(settings.MODEL_FILE).open("rb") as f:
+                schema_version = pickle.load(f)
+
+                if schema_version != self.FORMAT_VERSION:
+                    raise IncompatibleClassifierVersionError(
+                        "Cannot load classifier, incompatible versions.",
+                    )
+                else:
+                    try:
+                        self.last_doc_change_time = pickle.load(f)
+                        self.last_auto_type_hash = pickle.load(f)
+
+                        self.data_vectorizer = pickle.load(f)
+                        self._update_data_vectorizer_hash()
+                        self.tags_binarizer = pickle.load(f)
+
+                        self.tags_classifier = pickle.load(f)
+                        self.correspondent_classifier = pickle.load(f)
+                        self.document_type_classifier = pickle.load(f)
+                        self.storage_path_classifier = pickle.load(f)
+                    except Exception as err:
+                        raise ClassifierModelCorruptError from err

         # Check for the warning about unpickling from differing versions
         # and consider it incompatible
@@ -198,24 +171,23 @@ class DocumentClassifier:
             raise IncompatibleClassifierVersionError("sklearn version update")

     def save(self) -> None:
-        import joblib
-
         target_file: Path = settings.MODEL_FILE
-        target_file_temp: Path = target_file.with_suffix(".joblib.part")
+        target_file_temp: Path = target_file.with_suffix(".pickle.part")

-        state = {
-            "format_version": self.FORMAT_VERSION,
-            "last_doc_change_time": self.last_doc_change_time,
-            "last_auto_type_hash": self.last_auto_type_hash,
-            "data_vectorizer": self.data_vectorizer,
-            "tags_binarizer": self.tags_binarizer,
-            "tags_classifier": self.tags_classifier,
-            "correspondent_classifier": self.correspondent_classifier,
-            "document_type_classifier": self.document_type_classifier,
-            "storage_path_classifier": self.storage_path_classifier,
-        }
-        joblib.dump(state, target_file_temp, compress=3)
+        with target_file_temp.open("wb") as f:
+            pickle.dump(self.FORMAT_VERSION, f)
+            pickle.dump(self.last_doc_change_time, f)
+            pickle.dump(self.last_auto_type_hash, f)
+
+            pickle.dump(self.data_vectorizer, f)
+            pickle.dump(self.tags_binarizer, f)
+            pickle.dump(self.tags_classifier, f)
+            pickle.dump(self.correspondent_classifier, f)
+            pickle.dump(self.document_type_classifier, f)
+            pickle.dump(self.storage_path_classifier, f)

         target_file_temp.rename(target_file)

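The restored on-disk layout is a sequence of pickles in a single file: the schema version first, then each component in a fixed order, so load() must read them back in exactly the order save() wrote them. A standalone sketch of that round-trip (function names illustrative, not the project's API):

```python
import pickle
from pathlib import Path


def save_sequentially(path: Path, version: int, parts: list[object]) -> None:
    tmp = path.with_suffix(".pickle.part")
    with tmp.open("wb") as f:
        pickle.dump(version, f)  # schema version marker first
        for part in parts:       # then each component, in a fixed order
            pickle.dump(part, f)
    tmp.rename(path)             # swap into place only once fully written


def load_sequentially(path: Path, expected_version: int) -> list[object]:
    with path.open("rb") as f:
        if pickle.load(f) != expected_version:
            raise ValueError("incompatible classifier version")
        parts = []
        while True:
            try:
                parts.append(pickle.load(f))
            except EOFError:  # pickle.load raises EOFError at end of stream
                return parts
```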
src/documents/parsers.py

@@ -19,6 +19,8 @@ from documents.loggers import LoggingMixin
 from documents.signals import document_consumer_declaration
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
+from paperless.config import OcrConfig
+from paperless.utils import ocr_to_dateparser_languages

 if TYPE_CHECKING:
     import datetime
@@ -272,6 +274,11 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
         """
         import dateparser

+        ocr_config = OcrConfig()
+        languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
+            ocr_config.language,
+        )
+
         return dateparser.parse(
             ds,
             settings={
@@ -280,7 +287,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
                 "RETURN_AS_TIMEZONE_AWARE": True,
                 "TIMEZONE": settings.TIME_ZONE,
             },
-            locales=settings.DATE_PARSER_LANGUAGES,
+            locales=languages,
         )

     def __filter(date: datetime.datetime) -> datetime.datetime | None:

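For intuition on why the locales argument matters: dateparser only considers the listed locales, so a German month name resolves only when a German locale is in play. A quick check (dateparser is already a dependency here; exact results depend on the installed version):

```python
import dateparser

# "März" is German; with a matching locale the string parses.
print(dateparser.parse("März 2019", locales=["de"]))  # a datetime in March 2019
# With an unrelated locale list, detection can fail and return None.
print(dateparser.parse("März 2019", locales=["en"]))  # likely None
```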
src/documents/tests/test_date_parsing.py

@@ -1,12 +1,14 @@
 import datetime
 from zoneinfo import ZoneInfo

+import pytest
 from pytest_django.fixtures import SettingsWrapper

 from documents.parsers import parse_date
 from documents.parsers import parse_date_generator


+@pytest.mark.django_db()
 class TestDate:
     def test_date_format_1(self):
         text = "lorem ipsum 130218 lorem ipsum"
@@ -49,7 +51,7 @@ class TestDate:
         settings: SettingsWrapper,
         settings_timezone: ZoneInfo,
     ):
-        settings.DATE_PARSER_LANGUAGES = []
+        settings.DATE_PARSER_LANGUAGES = ["de"]
         text = "lorem ipsum\nMärz 2019\nlorem ipsum"
         date = parse_date("", text)
         assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)

src/documents/views.py

@@ -3,9 +3,7 @@ import logging
 import os
 import platform
 import re
-import resource
 import tempfile
-import time
 import zipfile
 from datetime import datetime
 from pathlib import Path
@@ -192,33 +190,6 @@ if settings.AUDIT_LOG_ENABLED:
 logger = logging.getLogger("paperless.api")

-try:
-    import psutil
-
-    _PS = psutil.Process(os.getpid())
-except Exception:
-    _PS = None
-
-_diag_log = logging.getLogger("paperless")
-
-
-def _mem_mb():
-    rss = _PS.memory_info().rss if _PS else 0
-    peak_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
-    return rss / (1024 * 1024), peak_kb / 1024.0
-
-
-def _mark(phase, doc_id, t0):
-    rss, peak = _mem_mb()
-    _diag_log.debug(
-        "sugg doc=%s phase=%s rss=%.1fMB peak=%.1fMB t=%.1fms",
-        doc_id,
-        phase,
-        rss,
-        peak,
-        (time.perf_counter() - t0) * 1000,
-    )
-

 class IndexView(TemplateView):
     template_name = "index.html"
@@ -787,16 +758,7 @@ class DocumentViewSet(
         ),
     )
     def suggestions(self, request, pk=None):
-        t0 = time.perf_counter()
-        # Don't fetch content here
-        doc = get_object_or_404(
-            Document.objects.select_related("owner").only(
-                "id",
-                "owner_id",
-            ),
-            pk=pk,
-        )
-        _mark("start", doc.pk, t0)
+        doc = get_object_or_404(Document.objects.select_related("owner"), pk=pk)

         if request.user is not None and not has_perms_owner_aware(
             request.user,
             "view_document",
@@ -807,23 +769,18 @@ class DocumentViewSet(
         document_suggestions = get_suggestion_cache(doc.pk)

         if document_suggestions is not None:
-            _mark("cache_hit_return", doc.pk, t0)
             refresh_suggestions_cache(doc.pk)
             return Response(document_suggestions.suggestions)

         classifier = load_classifier()
-        _mark("loaded_classifier", doc.pk, t0)

         dates = []
         if settings.NUMBER_OF_SUGGESTED_DATES > 0:
             gen = parse_date_generator(doc.filename, doc.content)
-            _mark("before_dates", doc.pk, t0)
             dates = sorted(
                 {i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)},
             )
-            _mark("after_dates", doc.pk, t0)

-        _mark("before_match", doc.pk, t0)
         resp_data = {
             "correspondents": [
                 c.id for c in match_correspondents(doc, classifier, request.user)
@@ -837,11 +794,9 @@ class DocumentViewSet(
             ],
             "dates": [date.strftime("%Y-%m-%d") for date in dates if date is not None],
         }
-        _mark("assembled_resp", doc.pk, t0)

         # Cache the suggestions and the classifier hash for later
         set_suggestions_cache(doc.pk, resp_data, classifier)
-        _mark("cached", doc.pk, t0)

         return Response(resp_data)

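The date-suggestion path kept by this revert bounds work with itertools.islice before deduplicating and sorting. A tiny standalone illustration of the pattern:

```python
import itertools


def first_unique_sorted(candidates, limit):
    # Take at most `limit` items from a (possibly lazy) generator,
    # dedupe via a set, then sort for stable presentation.
    return sorted(set(itertools.islice(candidates, limit)))


print(first_unique_sorted(iter([3, 1, 3, 2, 5]), 4))  # -> [1, 2, 3]
```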
src/locale/en_US/LC_MESSAGES/django.po

@@ -2,7 +2,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: paperless-ngx\n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2025-08-16 14:34+0000\n"
+"POT-Creation-Date: 2025-08-31 22:24+0000\n"
 "PO-Revision-Date: 2022-02-17 04:17\n"
 "Last-Translator: \n"
 "Language-Team: English\n"
@@ -1645,147 +1645,147 @@ msgstr ""
 msgid "paperless application settings"
 msgstr ""

-#: paperless/settings.py:774
+#: paperless/settings.py:772
 msgid "English (US)"
 msgstr ""

-#: paperless/settings.py:775
+#: paperless/settings.py:773
 msgid "Arabic"
 msgstr ""

-#: paperless/settings.py:776
+#: paperless/settings.py:774
 msgid "Afrikaans"
 msgstr ""

-#: paperless/settings.py:777
+#: paperless/settings.py:775
 msgid "Belarusian"
 msgstr ""

-#: paperless/settings.py:778
+#: paperless/settings.py:776
 msgid "Bulgarian"
 msgstr ""

-#: paperless/settings.py:779
+#: paperless/settings.py:777
 msgid "Catalan"
 msgstr ""

-#: paperless/settings.py:780
+#: paperless/settings.py:778
 msgid "Czech"
 msgstr ""

-#: paperless/settings.py:781
+#: paperless/settings.py:779
 msgid "Danish"
 msgstr ""

-#: paperless/settings.py:782
+#: paperless/settings.py:780
 msgid "German"
 msgstr ""

-#: paperless/settings.py:783
+#: paperless/settings.py:781
 msgid "Greek"
 msgstr ""

-#: paperless/settings.py:784
+#: paperless/settings.py:782
 msgid "English (GB)"
 msgstr ""

-#: paperless/settings.py:785
+#: paperless/settings.py:783
 msgid "Spanish"
 msgstr ""

-#: paperless/settings.py:786
+#: paperless/settings.py:784
 msgid "Persian"
 msgstr ""

-#: paperless/settings.py:787
+#: paperless/settings.py:785
 msgid "Finnish"
 msgstr ""

-#: paperless/settings.py:788
+#: paperless/settings.py:786
 msgid "French"
 msgstr ""

-#: paperless/settings.py:789
+#: paperless/settings.py:787
 msgid "Hungarian"
 msgstr ""

-#: paperless/settings.py:790
+#: paperless/settings.py:788
 msgid "Italian"
 msgstr ""

-#: paperless/settings.py:791
+#: paperless/settings.py:789
 msgid "Japanese"
 msgstr ""

-#: paperless/settings.py:792
+#: paperless/settings.py:790
 msgid "Korean"
 msgstr ""

-#: paperless/settings.py:793
+#: paperless/settings.py:791
 msgid "Luxembourgish"
 msgstr ""

-#: paperless/settings.py:794
+#: paperless/settings.py:792
 msgid "Norwegian"
 msgstr ""

-#: paperless/settings.py:795
+#: paperless/settings.py:793
 msgid "Dutch"
 msgstr ""

-#: paperless/settings.py:796
+#: paperless/settings.py:794
 msgid "Polish"
 msgstr ""

-#: paperless/settings.py:797
+#: paperless/settings.py:795
 msgid "Portuguese (Brazil)"
 msgstr ""

-#: paperless/settings.py:798
+#: paperless/settings.py:796
 msgid "Portuguese"
 msgstr ""

-#: paperless/settings.py:799
+#: paperless/settings.py:797
 msgid "Romanian"
 msgstr ""

-#: paperless/settings.py:800
+#: paperless/settings.py:798
 msgid "Russian"
 msgstr ""

-#: paperless/settings.py:801
+#: paperless/settings.py:799
 msgid "Slovak"
 msgstr ""

-#: paperless/settings.py:802
+#: paperless/settings.py:800
 msgid "Slovenian"
 msgstr ""

-#: paperless/settings.py:803
+#: paperless/settings.py:801
 msgid "Serbian"
 msgstr ""

-#: paperless/settings.py:804
+#: paperless/settings.py:802
 msgid "Swedish"
 msgstr ""

-#: paperless/settings.py:805
+#: paperless/settings.py:803
 msgid "Turkish"
 msgstr ""

-#: paperless/settings.py:806
+#: paperless/settings.py:804
 msgid "Ukrainian"
 msgstr ""

-#: paperless/settings.py:807
+#: paperless/settings.py:805
 msgid "Vietnamese"
 msgstr ""

-#: paperless/settings.py:808
+#: paperless/settings.py:806
 msgid "Chinese Simplified"
 msgstr ""

-#: paperless/settings.py:809
+#: paperless/settings.py:807
 msgid "Chinese Traditional"
 msgstr ""

src/paperless/middleware.py

@@ -1,14 +1,7 @@
-import logging
-import os
-import resource
-import time
-
 from django.conf import settings

 from paperless import version

-logger = logging.getLogger("middleware")
-

 class ApiVersionMiddleware:
     def __init__(self, get_response):
@@ -22,56 +15,3 @@ class ApiVersionMiddleware:
         response["X-Version"] = version.__full_version_str__
         return response
-
-
-try:
-    import psutil
-
-    _PSUTIL = True
-except Exception:
-    _PSUTIL = False
-
-
-class MemLogMiddleware:
-    def __init__(self, get_response):
-        self.get_response = get_response
-
-    def __call__(self, request):
-        # capture baseline
-        ru_before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
-        if _PSUTIL:
-            p = psutil.Process()
-            rss_before = p.memory_info().rss
-        else:
-            rss_before = 0
-        t0 = time.perf_counter()
-        try:
-            return self.get_response(request)
-        finally:
-            dur_ms = (time.perf_counter() - t0) * 1000.0
-            ru_after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
-            # ru_maxrss is KB on Linux; convert to MB
-            peak_mb = (ru_after) / 1024.0
-            peak_delta_mb = (ru_after - ru_before) / 1024.0
-            if _PSUTIL:
-                rss_after = p.memory_info().rss
-                delta_mb = (rss_after - rss_before) / (1024 * 1024)
-                rss_mb = rss_after / (1024 * 1024)
-            else:
-                delta_mb = 0.0
-                rss_mb = 0.0
-            logger.debug(
-                "pid=%s mem rss=%.1fMB Δend=%.1fMB peak=%.1fMB Δpeak=%.1fMB dur=%.1fms %s %s",
-                os.getpid(),
-                rss_mb,
-                delta_mb,
-                peak_mb,
-                peak_delta_mb,
-                dur_ms,
-                request.method,
-                request.path,
-            )

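One caveat the removed MemLogMiddleware glossed over: its comment notes that ru_maxrss is in kilobytes on Linux, but on macOS the same field is reported in bytes. A hedged, portable variant of the peak-RSS reading would be:

```python
import resource
import sys


def peak_rss_mb() -> float:
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # Linux reports ru_maxrss in KiB; macOS reports it in bytes.
    return peak / (1024 * 1024) if sys.platform == "darwin" else peak / 1024
```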
src/paperless/settings.py

@@ -17,8 +17,6 @@ from dateparser.languages.loader import LocaleDataLoader
 from django.utils.translation import gettext_lazy as _
 from dotenv import load_dotenv

-from paperless.utils import ocr_to_dateparser_languages
-
 logger = logging.getLogger("paperless.settings")

 # Tap paperless.conf if it's available
@@ -363,7 +361,6 @@ if DEBUG:
     )

 MIDDLEWARE = [
-    "paperless.middleware.MemLogMiddleware",
     "django.middleware.security.SecurityMiddleware",
     "whitenoise.middleware.WhiteNoiseMiddleware",
     "django.contrib.sessions.middleware.SessionMiddleware",
@@ -834,7 +831,7 @@ LOGGING = {
     "disable_existing_loggers": False,
     "formatters": {
         "verbose": {
-            "format": "[{asctime}] [{levelname}] [{name}] pid={process} {message}",
+            "format": "[{asctime}] [{levelname}] [{name}] {message}",
             "style": "{",
         },
         "simple": {
@@ -879,7 +876,6 @@ LOGGING = {
         "kombu": {"handlers": ["file_celery"], "level": "DEBUG"},
         "_granian": {"handlers": ["file_paperless"], "level": "DEBUG"},
         "granian.access": {"handlers": ["file_paperless"], "level": "DEBUG"},
-        "middleware": {"handlers": ["console"], "level": "DEBUG"},
     },
 }
@@ -1186,61 +1182,6 @@ DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
 FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")

-def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
-    """
-    Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")
-    into a list of locales compatible with the `dateparser` library.
-    - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").
-      Falls back to the base language (e.g., "az") if needed.
-    - If a language cannot be mapped or validated, it is skipped with a warning.
-    - Returns a list of valid locales, or an empty list if none could be converted.
-    """
-    ocr_to_dateparser = ocr_to_dateparser_languages()
-    loader = LocaleDataLoader()
-    result = []
-    try:
-        for ocr_language in ocr_languages.split("+"):
-            # Split into language and optional script
-            ocr_lang_part, *script = ocr_language.split("_")
-            ocr_script_part = script[0] if script else None
-            language_part = ocr_to_dateparser.get(ocr_lang_part)
-            if language_part is None:
-                logger.debug(
-                    f'Unable to map OCR language "{ocr_lang_part}" to dateparser locale. ',
-                )
-                continue
-            # Ensure base language is supported by dateparser
-            loader.get_locale_map(locales=[language_part])
-            # Try to add the script part if it's supported by dateparser
-            if ocr_script_part:
-                dateparser_language = f"{language_part}-{ocr_script_part.title()}"
-                try:
-                    loader.get_locale_map(locales=[dateparser_language])
-                except Exception:
-                    logger.info(
-                        f"Language variant '{dateparser_language}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
-                    )
-                    dateparser_language = language_part
-            else:
-                dateparser_language = language_part
-            if dateparser_language not in result:
-                result.append(dateparser_language)
-    except Exception as e:
-        logger.warning(
-            f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
-        )
-        return []
-    if not result:
-        logger.info(
-            "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
-        )
-    return result
-

 def _parse_dateparser_languages(languages: str | None):
     language_list = languages.split("+") if languages else []
     # There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib.
@@ -1255,12 +1196,14 @@ def _parse_dateparser_languages(languages: str | None):
     return list(LocaleDataLoader().get_locale_map(locales=language_list))

-if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"):
-    DATE_PARSER_LANGUAGES = _parse_dateparser_languages(
-        os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"),
-    )
-else:
-    DATE_PARSER_LANGUAGES = _ocr_to_dateparser_languages(OCR_LANGUAGE)
+# If not set, we will infer it at runtime
+DATE_PARSER_LANGUAGES = (
+    _parse_dateparser_languages(
+        os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"),
+    )
+    if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES")
+    else None
+)

 # Maximum number of dates taken from document start to end to show as suggestions for

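Behaviour of the reworked setting, for reference: with the environment variable set, the value is validated and expanded through dateparser's locale loader (the same call the project makes); unset, DATE_PARSER_LANGUAGES stays None and inference happens per parse. An illustrative session, with hypothetical values:

```python
from dateparser.languages.loader import LocaleDataLoader

# PAPERLESS_DATE_PARSER_LANGUAGES="de+en" would resolve roughly to:
print(list(LocaleDataLoader().get_locale_map(locales=["de", "en"])))  # ['de', 'en']
# With the variable unset, parsers.py falls back at request time to
# ocr_to_dateparser_languages(OcrConfig().language).
```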
src/paperless/tests/test_settings.py

@@ -6,7 +6,6 @@ from unittest import mock
 import pytest
 from celery.schedules import crontab

-from paperless.settings import _ocr_to_dateparser_languages
 from paperless.settings import _parse_base_paths
 from paperless.settings import _parse_beat_schedule
 from paperless.settings import _parse_dateparser_languages
@@ -476,33 +475,6 @@ class TestPathSettings(TestCase):
         self.assertEqual("/foobar/", base_paths[4])  # LOGOUT_REDIRECT_URL

-@pytest.mark.parametrize(
-    ("ocr_language", "expected"),
-    [
-        # One language
-        ("eng", ["en"]),
-        # Multiple languages
-        ("fra+ita+lao", ["fr", "it", "lo"]),
-        # Languages that don't have a two-letter equivalent
-        ("fil", ["fil"]),
-        # Languages with a script part supported by dateparser
-        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
-        # Languages with a script part not supported by dateparser
-        # In this case, default to the language without script
-        ("deu_frak", ["de"]),
-        # Traditional and simplified chinese don't have the same name in dateparser,
-        # so they're converted to the general chinese language
-        ("chi_tra+chi_sim", ["zh"]),
-        # If a language is not supported by dateparser, fallback to the supported ones
-        ("eng+unsupported_language+por", ["en", "pt"]),
-        # If no language is supported, fallback to default
-        ("unsupported1+unsupported2", []),
-    ],
-)
-def test_ocr_to_dateparser_languages(ocr_language, expected):
-    assert sorted(_ocr_to_dateparser_languages(ocr_language)) == sorted(expected)
-

 @pytest.mark.parametrize(
     ("languages", "expected"),
     [

src/paperless/tests/test_utils.py (new file)

@@ -0,0 +1,52 @@
+import logging
+
+import pytest
+
+from paperless import utils
+from paperless.utils import ocr_to_dateparser_languages
+
+
+@pytest.mark.parametrize(
+    ("ocr_language", "expected"),
+    [
+        # One language
+        ("eng", ["en"]),
+        # Multiple languages
+        ("fra+ita+lao", ["fr", "it", "lo"]),
+        # Languages that don't have a two-letter equivalent
+        ("fil", ["fil"]),
+        # Languages with a script part supported by dateparser
+        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
+        # Languages with a script part not supported by dateparser
+        # In this case, default to the language without script
+        ("deu_frak", ["de"]),
+        # Traditional and simplified chinese don't have the same name in dateparser,
+        # so they're converted to the general chinese language
+        ("chi_tra+chi_sim", ["zh"]),
+        # If a language is not supported by dateparser, fallback to the supported ones
+        ("eng+unsupported_language+por", ["en", "pt"]),
+        # If no language is supported, fallback to default
+        ("unsupported1+unsupported2", []),
+        # Duplicate languages, should not duplicate in result
+        ("eng+eng", ["en"]),
+        # Language with script, but script is not mapped
+        ("ita_unknownscript", ["it"]),
+    ],
+)
+def test_ocr_to_dateparser_languages(ocr_language, expected):
+    assert sorted(ocr_to_dateparser_languages(ocr_language)) == sorted(expected)
+
+
+def test_ocr_to_dateparser_languages_exception(monkeypatch, caplog):
+    # Patch LocaleDataLoader.get_locale_map to raise an exception
+    class DummyLoader:
+        def get_locale_map(self, locales=None):
+            raise RuntimeError("Simulated error")
+
+    with caplog.at_level(logging.WARNING):
+        monkeypatch.setattr(utils, "LocaleDataLoader", lambda: DummyLoader())
+        result = utils.ocr_to_dateparser_languages("eng+fra")
+    assert result == []
+    assert (
+        "Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this" in caplog.text
+    )

src/paperless/utils.py

@@ -1,4 +1,10 @@
-def ocr_to_dateparser_languages() -> dict[str, str]:
+import logging
+
+from dateparser.languages.loader import LocaleDataLoader
+
+logger = logging.getLogger("paperless.utils")
+
+OCR_TO_DATEPARSER_LANGUAGES = {
     """
     Translation map from languages supported by Tesseract OCR
     to languages supported by dateparser.
@@ -14,97 +20,150 @@
     # agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
     # ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
     # rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
-    return {
-        "afr": "af",
-        "amh": "am",
-        "ara": "ar",
-        "asm": "as",
-        "ast": "ast",
-        "aze": "az",
-        "bel": "be",
-        "bul": "bg",
-        "ben": "bn",
-        "bod": "bo",
-        "bre": "br",
-        "bos": "bs",
-        "cat": "ca",
-        "cher": "chr",
-        "ces": "cs",
-        "cym": "cy",
-        "dan": "da",
-        "deu": "de",
-        "dzo": "dz",
-        "ell": "el",
-        "eng": "en",
-        "epo": "eo",
-        "spa": "es",
-        "est": "et",
-        "eus": "eu",
-        "fas": "fa",
-        "fin": "fi",
-        "fil": "fil",
-        "fao": "fo",  # codespell:ignore
-        "fra": "fr",
-        "fry": "fy",
-        "gle": "ga",
-        "gla": "gd",
-        "glg": "gl",
-        "guj": "gu",
-        "heb": "he",
-        "hin": "hi",
-        "hrv": "hr",
-        "hun": "hu",
-        "hye": "hy",
-        "ind": "id",
-        "isl": "is",
-        "ita": "it",
-        "jpn": "ja",
-        "kat": "ka",
-        "kaz": "kk",
-        "khm": "km",
-        "knda": "kn",
-        "kor": "ko",
-        "kir": "ky",
-        "ltz": "lb",
-        "lao": "lo",
-        "lit": "lt",
-        "lav": "lv",
-        "mal": "ml",
-        "mon": "mn",
-        "mar": "mr",
-        "msa": "ms",
-        "mlt": "mt",
-        "mya": "my",
-        "nep": "ne",
-        "nld": "nl",
-        "ori": "or",
-        "pan": "pa",
-        "pol": "pl",
-        "pus": "ps",
-        "por": "pt",
-        "que": "qu",
-        "ron": "ro",
-        "rus": "ru",
-        "sin": "si",
-        "slk": "sk",
-        "slv": "sl",
-        "sqi": "sq",
-        "srp": "sr",
-        "swe": "sv",
-        "swa": "sw",
-        "tam": "ta",
-        "tel": "te",  # codespell:ignore
-        "tha": "th",  # codespell:ignore
-        "tir": "ti",
-        "tgl": "tl",
-        "ton": "to",
-        "tur": "tr",
-        "uig": "ug",
-        "ukr": "uk",
-        "urd": "ur",
-        "uzb": "uz",
-        "via": "vi",
-        "yid": "yi",
-        "yor": "yo",
-        "chi": "zh",
-    }
+    "afr": "af",
+    "amh": "am",
+    "ara": "ar",
+    "asm": "as",
+    "ast": "ast",
+    "aze": "az",
+    "bel": "be",
+    "bul": "bg",
+    "ben": "bn",
+    "bod": "bo",
+    "bre": "br",
+    "bos": "bs",
+    "cat": "ca",
+    "cher": "chr",
+    "ces": "cs",
+    "cym": "cy",
+    "dan": "da",
+    "deu": "de",
+    "dzo": "dz",
+    "ell": "el",
+    "eng": "en",
+    "epo": "eo",
+    "spa": "es",
+    "est": "et",
+    "eus": "eu",
+    "fas": "fa",
+    "fin": "fi",
+    "fil": "fil",
+    "fao": "fo",  # codespell:ignore
+    "fra": "fr",
+    "fry": "fy",
+    "gle": "ga",
+    "gla": "gd",
+    "glg": "gl",
+    "guj": "gu",
+    "heb": "he",
+    "hin": "hi",
+    "hrv": "hr",
+    "hun": "hu",
+    "hye": "hy",
+    "ind": "id",
+    "isl": "is",
+    "ita": "it",
+    "jpn": "ja",
+    "kat": "ka",
+    "kaz": "kk",
+    "khm": "km",
+    "knda": "kn",
+    "kor": "ko",
+    "kir": "ky",
+    "ltz": "lb",
+    "lao": "lo",
+    "lit": "lt",
+    "lav": "lv",
+    "mal": "ml",
+    "mon": "mn",
+    "mar": "mr",
+    "msa": "ms",
+    "mlt": "mt",
+    "mya": "my",
+    "nep": "ne",
+    "nld": "nl",
+    "ori": "or",
+    "pan": "pa",
+    "pol": "pl",
+    "pus": "ps",
+    "por": "pt",
+    "que": "qu",
+    "ron": "ro",
+    "rus": "ru",
+    "sin": "si",
+    "slk": "sk",
+    "slv": "sl",
+    "sqi": "sq",
+    "srp": "sr",
+    "swe": "sv",
+    "swa": "sw",
+    "tam": "ta",
+    "tel": "te",  # codespell:ignore
+    "tha": "th",  # codespell:ignore
+    "tir": "ti",
+    "tgl": "tl",
+    "ton": "to",
+    "tur": "tr",
+    "uig": "ug",
+    "ukr": "uk",
+    "urd": "ur",
+    "uzb": "uz",
+    "via": "vi",
+    "yid": "yi",
+    "yor": "yo",
+    "chi": "zh",
+}
+
+
+def ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
+    """
+    Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")
+    into a list of locales compatible with the `dateparser` library.
+    - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").
+      Falls back to the base language (e.g., "az") if needed.
+    - If a language cannot be mapped or validated, it is skipped with a warning.
+    - Returns a list of valid locales, or an empty list if none could be converted.
+    """
+    loader = LocaleDataLoader()
+    result = []
+    try:
+        for ocr_language in ocr_languages.split("+"):
+            # Split into language and optional script
+            ocr_lang_part, *script = ocr_language.split("_")
+            ocr_script_part = script[0] if script else None
+            language_part = OCR_TO_DATEPARSER_LANGUAGES.get(ocr_lang_part)
+            if language_part is None:
+                logger.debug(
+                    f'Unable to map OCR language "{ocr_lang_part}" to dateparser locale. ',
+                )
+                continue
+            # Ensure base language is supported by dateparser
+            loader.get_locale_map(locales=[language_part])
+            # Try to add the script part if it's supported by dateparser
+            if ocr_script_part:
+                dateparser_language = f"{language_part}-{ocr_script_part.title()}"
+                try:
+                    loader.get_locale_map(locales=[dateparser_language])
+                except Exception:
+                    logger.info(
+                        f"Language variant '{dateparser_language}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
+                    )
+                    dateparser_language = language_part
+            else:
+                dateparser_language = language_part
+            if dateparser_language not in result:
+                result.append(dateparser_language)
+    except Exception as e:
+        logger.warning(
+            f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
+        )
+        return []
+    if not result:
+        logger.info(
+            "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
+        )
+    return result

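Expected input/output behaviour of the relocated helper, taken from the parametrized tests above:

```python
from paperless.utils import ocr_to_dateparser_languages

assert ocr_to_dateparser_languages("eng") == ["en"]
assert sorted(ocr_to_dateparser_languages("aze_cyrl+srp_latn")) == ["az-Cyrl", "sr-Latn"]
assert ocr_to_dateparser_languages("deu_frak") == ["de"]  # unknown script falls back
assert ocr_to_dateparser_languages("unsupported1+unsupported2") == []
```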
uv.lock (generated; 15 changed lines)

@@ -2046,7 +2046,6 @@ dependencies = [
     { name = "ocrmypdf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "pathvalidate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "pdf2image", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
-    { name = "psutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "psycopg-pool", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "python-dateutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "python-dotenv", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -2183,7 +2182,6 @@ requires-dist = [
     { name = "ocrmypdf", specifier = "~=16.10.0" },
     { name = "pathvalidate", specifier = "~=3.3.1" },
     { name = "pdf2image", specifier = "~=1.17.0" },
-    { name = "psutil", specifier = ">=7.0.0" },
     { name = "psycopg", extras = ["c", "pool"], marker = "extra == 'postgres'", specifier = "==3.2.9" },
     { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_aarch64.whl" },
     { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_x86_64.whl" },
@@ -2550,19 +2548,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198", size = 387816, upload-time = "2025-01-20T15:55:29.98Z" },
 ]

-[[package]]
-name = "psutil"
-version = "7.0.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/2a/80/336820c1ad9286a4ded7e845b2eccfcb27851ab8ac6abece774a6ff4d3de/psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456", size = 497003, upload-time = "2025-02-13T21:54:07.946Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/ed/e6/2d26234410f8b8abdbf891c9da62bee396583f713fb9f3325a4760875d22/psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25", size = 238051, upload-time = "2025-02-13T21:54:12.36Z" },
-    { url = "https://files.pythonhosted.org/packages/04/8b/30f930733afe425e3cbfc0e1468a30a18942350c1a8816acfade80c005c4/psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da", size = 239535, upload-time = "2025-02-13T21:54:16.07Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/ed/d362e84620dd22876b55389248e522338ed1bf134a5edd3b8231d7207f6d/psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91", size = 275004, upload-time = "2025-02-13T21:54:18.662Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34", size = 277986, upload-time = "2025-02-13T21:54:21.811Z" },
-    { url = "https://files.pythonhosted.org/packages/eb/a2/709e0fe2f093556c17fbafda93ac032257242cabcc7ff3369e2cb76a97aa/psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993", size = 279544, upload-time = "2025-02-13T21:54:24.68Z" },
-]

 [[package]]
 name = "psycopg"
 version = "3.2.9"