mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-09-01 01:46:16 +00:00
Compare commits
2 Commits
fix-sugges
...
dev
Author | SHA1 | Date | |
---|---|---|---|
![]() |
a6e41b4145 | ||
![]() |
cb927c5b22 |
1008
.github/workflows/ci.yml
vendored
1008
.github/workflows/ci.yml
vendored
File diff suppressed because it is too large
Load Diff
@@ -53,7 +53,6 @@ dependencies = [
|
|||||||
"ocrmypdf~=16.10.0",
|
"ocrmypdf~=16.10.0",
|
||||||
"pathvalidate~=3.3.1",
|
"pathvalidate~=3.3.1",
|
||||||
"pdf2image~=1.17.0",
|
"pdf2image~=1.17.0",
|
||||||
"psutil>=7",
|
|
||||||
"psycopg-pool",
|
"psycopg-pool",
|
||||||
"python-dateutil~=2.9.0",
|
"python-dateutil~=2.9.0",
|
||||||
"python-dotenv~=1.1.0",
|
"python-dotenv~=1.1.0",
|
||||||
|
@@ -4,7 +4,6 @@ import logging
|
|||||||
import pickle
|
import pickle
|
||||||
import re
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
from functools import lru_cache
|
|
||||||
from hashlib import sha256
|
from hashlib import sha256
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
@@ -51,7 +50,6 @@ class ClassifierModelCorruptError(Exception):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=1)
|
|
||||||
def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None:
|
def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None:
|
||||||
if not settings.MODEL_FILE.is_file():
|
if not settings.MODEL_FILE.is_file():
|
||||||
logger.debug(
|
logger.debug(
|
||||||
@@ -63,11 +61,6 @@ def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | No
|
|||||||
classifier = DocumentClassifier()
|
classifier = DocumentClassifier()
|
||||||
try:
|
try:
|
||||||
classifier.load()
|
classifier.load()
|
||||||
logger.debug("classifier_id=%s", id(classifier))
|
|
||||||
logger.debug(
|
|
||||||
"classifier_data_vectorizer_hash=%s",
|
|
||||||
classifier.data_vectorizer_hash,
|
|
||||||
)
|
|
||||||
|
|
||||||
except IncompatibleClassifierVersionError as e:
|
except IncompatibleClassifierVersionError as e:
|
||||||
logger.info(f"Classifier version incompatible: {e.message}, will re-train")
|
logger.info(f"Classifier version incompatible: {e.message}, will re-train")
|
||||||
@@ -103,8 +96,7 @@ class DocumentClassifier:
|
|||||||
# v7 - Updated scikit-learn package version
|
# v7 - Updated scikit-learn package version
|
||||||
# v8 - Added storage path classifier
|
# v8 - Added storage path classifier
|
||||||
# v9 - Changed from hashing to time/ids for re-train check
|
# v9 - Changed from hashing to time/ids for re-train check
|
||||||
# v10 - Switch persistence to joblib with memory-mapping to reduce load-time memory spikes
|
FORMAT_VERSION = 9
|
||||||
FORMAT_VERSION = 10
|
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
# last time a document changed and therefore training might be required
|
# last time a document changed and therefore training might be required
|
||||||
@@ -136,49 +128,30 @@ class DocumentClassifier:
|
|||||||
).hexdigest()
|
).hexdigest()
|
||||||
|
|
||||||
def load(self) -> None:
|
def load(self) -> None:
|
||||||
import joblib
|
|
||||||
from sklearn.exceptions import InconsistentVersionWarning
|
from sklearn.exceptions import InconsistentVersionWarning
|
||||||
|
|
||||||
# Catch warnings for processing
|
# Catch warnings for processing
|
||||||
with warnings.catch_warnings(record=True) as w:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
try:
|
|
||||||
state = joblib.load(settings.MODEL_FILE, mmap_mode="r")
|
|
||||||
except Exception as err:
|
|
||||||
# As a fallback, try to detect old pickle-based and mark incompatible
|
|
||||||
try:
|
|
||||||
with Path(settings.MODEL_FILE).open("rb") as f:
|
with Path(settings.MODEL_FILE).open("rb") as f:
|
||||||
_ = pickle.load(f)
|
schema_version = pickle.load(f)
|
||||||
raise IncompatibleClassifierVersionError(
|
|
||||||
"Cannot load classifier, incompatible versions.",
|
|
||||||
) from err
|
|
||||||
except IncompatibleClassifierVersionError:
|
|
||||||
raise
|
|
||||||
except Exception:
|
|
||||||
# Not even a readable pickle header
|
|
||||||
raise ClassifierModelCorruptError from err
|
|
||||||
|
|
||||||
try:
|
if schema_version != self.FORMAT_VERSION:
|
||||||
if (
|
|
||||||
not isinstance(state, dict)
|
|
||||||
or state.get("format_version") != self.FORMAT_VERSION
|
|
||||||
):
|
|
||||||
raise IncompatibleClassifierVersionError(
|
raise IncompatibleClassifierVersionError(
|
||||||
"Cannot load classifier, incompatible versions.",
|
"Cannot load classifier, incompatible versions.",
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
self.last_doc_change_time = pickle.load(f)
|
||||||
|
self.last_auto_type_hash = pickle.load(f)
|
||||||
|
|
||||||
self.last_doc_change_time = state.get("last_doc_change_time")
|
self.data_vectorizer = pickle.load(f)
|
||||||
self.last_auto_type_hash = state.get("last_auto_type_hash")
|
|
||||||
|
|
||||||
self.data_vectorizer = state.get("data_vectorizer")
|
|
||||||
self._update_data_vectorizer_hash()
|
self._update_data_vectorizer_hash()
|
||||||
self.tags_binarizer = state.get("tags_binarizer")
|
self.tags_binarizer = pickle.load(f)
|
||||||
|
|
||||||
self.tags_classifier = state.get("tags_classifier")
|
self.tags_classifier = pickle.load(f)
|
||||||
self.correspondent_classifier = state.get("correspondent_classifier")
|
self.correspondent_classifier = pickle.load(f)
|
||||||
self.document_type_classifier = state.get("document_type_classifier")
|
self.document_type_classifier = pickle.load(f)
|
||||||
self.storage_path_classifier = state.get("storage_path_classifier")
|
self.storage_path_classifier = pickle.load(f)
|
||||||
except IncompatibleClassifierVersionError:
|
|
||||||
raise
|
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
raise ClassifierModelCorruptError from err
|
raise ClassifierModelCorruptError from err
|
||||||
|
|
||||||
@@ -198,24 +171,23 @@ class DocumentClassifier:
|
|||||||
raise IncompatibleClassifierVersionError("sklearn version update")
|
raise IncompatibleClassifierVersionError("sklearn version update")
|
||||||
|
|
||||||
def save(self) -> None:
|
def save(self) -> None:
|
||||||
import joblib
|
|
||||||
|
|
||||||
target_file: Path = settings.MODEL_FILE
|
target_file: Path = settings.MODEL_FILE
|
||||||
target_file_temp: Path = target_file.with_suffix(".joblib.part")
|
target_file_temp: Path = target_file.with_suffix(".pickle.part")
|
||||||
|
|
||||||
state = {
|
with target_file_temp.open("wb") as f:
|
||||||
"format_version": self.FORMAT_VERSION,
|
pickle.dump(self.FORMAT_VERSION, f)
|
||||||
"last_doc_change_time": self.last_doc_change_time,
|
|
||||||
"last_auto_type_hash": self.last_auto_type_hash,
|
|
||||||
"data_vectorizer": self.data_vectorizer,
|
|
||||||
"tags_binarizer": self.tags_binarizer,
|
|
||||||
"tags_classifier": self.tags_classifier,
|
|
||||||
"correspondent_classifier": self.correspondent_classifier,
|
|
||||||
"document_type_classifier": self.document_type_classifier,
|
|
||||||
"storage_path_classifier": self.storage_path_classifier,
|
|
||||||
}
|
|
||||||
|
|
||||||
joblib.dump(state, target_file_temp, compress=3)
|
pickle.dump(self.last_doc_change_time, f)
|
||||||
|
pickle.dump(self.last_auto_type_hash, f)
|
||||||
|
|
||||||
|
pickle.dump(self.data_vectorizer, f)
|
||||||
|
|
||||||
|
pickle.dump(self.tags_binarizer, f)
|
||||||
|
pickle.dump(self.tags_classifier, f)
|
||||||
|
|
||||||
|
pickle.dump(self.correspondent_classifier, f)
|
||||||
|
pickle.dump(self.document_type_classifier, f)
|
||||||
|
pickle.dump(self.storage_path_classifier, f)
|
||||||
|
|
||||||
target_file_temp.rename(target_file)
|
target_file_temp.rename(target_file)
|
||||||
|
|
||||||
|
@@ -19,6 +19,8 @@ from documents.loggers import LoggingMixin
|
|||||||
from documents.signals import document_consumer_declaration
|
from documents.signals import document_consumer_declaration
|
||||||
from documents.utils import copy_file_with_basic_stats
|
from documents.utils import copy_file_with_basic_stats
|
||||||
from documents.utils import run_subprocess
|
from documents.utils import run_subprocess
|
||||||
|
from paperless.config import OcrConfig
|
||||||
|
from paperless.utils import ocr_to_dateparser_languages
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
import datetime
|
import datetime
|
||||||
@@ -272,6 +274,11 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
|
|||||||
"""
|
"""
|
||||||
import dateparser
|
import dateparser
|
||||||
|
|
||||||
|
ocr_config = OcrConfig()
|
||||||
|
languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
|
||||||
|
ocr_config.language,
|
||||||
|
)
|
||||||
|
|
||||||
return dateparser.parse(
|
return dateparser.parse(
|
||||||
ds,
|
ds,
|
||||||
settings={
|
settings={
|
||||||
@@ -280,7 +287,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
|
|||||||
"RETURN_AS_TIMEZONE_AWARE": True,
|
"RETURN_AS_TIMEZONE_AWARE": True,
|
||||||
"TIMEZONE": settings.TIME_ZONE,
|
"TIMEZONE": settings.TIME_ZONE,
|
||||||
},
|
},
|
||||||
locales=settings.DATE_PARSER_LANGUAGES,
|
locales=languages,
|
||||||
)
|
)
|
||||||
|
|
||||||
def __filter(date: datetime.datetime) -> datetime.datetime | None:
|
def __filter(date: datetime.datetime) -> datetime.datetime | None:
|
||||||
|
@@ -1,12 +1,14 @@
|
|||||||
import datetime
|
import datetime
|
||||||
from zoneinfo import ZoneInfo
|
from zoneinfo import ZoneInfo
|
||||||
|
|
||||||
|
import pytest
|
||||||
from pytest_django.fixtures import SettingsWrapper
|
from pytest_django.fixtures import SettingsWrapper
|
||||||
|
|
||||||
from documents.parsers import parse_date
|
from documents.parsers import parse_date
|
||||||
from documents.parsers import parse_date_generator
|
from documents.parsers import parse_date_generator
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db()
|
||||||
class TestDate:
|
class TestDate:
|
||||||
def test_date_format_1(self):
|
def test_date_format_1(self):
|
||||||
text = "lorem ipsum 130218 lorem ipsum"
|
text = "lorem ipsum 130218 lorem ipsum"
|
||||||
@@ -49,7 +51,7 @@ class TestDate:
|
|||||||
settings: SettingsWrapper,
|
settings: SettingsWrapper,
|
||||||
settings_timezone: ZoneInfo,
|
settings_timezone: ZoneInfo,
|
||||||
):
|
):
|
||||||
settings.DATE_PARSER_LANGUAGES = []
|
settings.DATE_PARSER_LANGUAGES = ["de"]
|
||||||
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
|
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
|
||||||
date = parse_date("", text)
|
date = parse_date("", text)
|
||||||
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
|
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
|
||||||
|
@@ -3,9 +3,7 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
import re
|
import re
|
||||||
import resource
|
|
||||||
import tempfile
|
import tempfile
|
||||||
import time
|
|
||||||
import zipfile
|
import zipfile
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -192,33 +190,6 @@ if settings.AUDIT_LOG_ENABLED:
|
|||||||
|
|
||||||
logger = logging.getLogger("paperless.api")
|
logger = logging.getLogger("paperless.api")
|
||||||
|
|
||||||
try:
|
|
||||||
import psutil
|
|
||||||
|
|
||||||
_PS = psutil.Process(os.getpid())
|
|
||||||
except Exception:
|
|
||||||
_PS = None
|
|
||||||
|
|
||||||
_diag_log = logging.getLogger("paperless")
|
|
||||||
|
|
||||||
|
|
||||||
def _mem_mb():
|
|
||||||
rss = _PS.memory_info().rss if _PS else 0
|
|
||||||
peak_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
||||||
return rss / (1024 * 1024), peak_kb / 1024.0
|
|
||||||
|
|
||||||
|
|
||||||
def _mark(phase, doc_id, t0):
|
|
||||||
rss, peak = _mem_mb()
|
|
||||||
_diag_log.debug(
|
|
||||||
"sugg doc=%s phase=%s rss=%.1fMB peak=%.1fMB t=%.1fms",
|
|
||||||
doc_id,
|
|
||||||
phase,
|
|
||||||
rss,
|
|
||||||
peak,
|
|
||||||
(time.perf_counter() - t0) * 1000,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class IndexView(TemplateView):
|
class IndexView(TemplateView):
|
||||||
template_name = "index.html"
|
template_name = "index.html"
|
||||||
@@ -787,16 +758,7 @@ class DocumentViewSet(
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
def suggestions(self, request, pk=None):
|
def suggestions(self, request, pk=None):
|
||||||
t0 = time.perf_counter()
|
doc = get_object_or_404(Document.objects.select_related("owner"), pk=pk)
|
||||||
# Don't fetch content here
|
|
||||||
doc = get_object_or_404(
|
|
||||||
Document.objects.select_related("owner").only(
|
|
||||||
"id",
|
|
||||||
"owner_id",
|
|
||||||
),
|
|
||||||
pk=pk,
|
|
||||||
)
|
|
||||||
_mark("start", doc.pk, t0)
|
|
||||||
if request.user is not None and not has_perms_owner_aware(
|
if request.user is not None and not has_perms_owner_aware(
|
||||||
request.user,
|
request.user,
|
||||||
"view_document",
|
"view_document",
|
||||||
@@ -807,23 +769,18 @@ class DocumentViewSet(
|
|||||||
document_suggestions = get_suggestion_cache(doc.pk)
|
document_suggestions = get_suggestion_cache(doc.pk)
|
||||||
|
|
||||||
if document_suggestions is not None:
|
if document_suggestions is not None:
|
||||||
_mark("cache_hit_return", doc.pk, t0)
|
|
||||||
refresh_suggestions_cache(doc.pk)
|
refresh_suggestions_cache(doc.pk)
|
||||||
return Response(document_suggestions.suggestions)
|
return Response(document_suggestions.suggestions)
|
||||||
|
|
||||||
classifier = load_classifier()
|
classifier = load_classifier()
|
||||||
_mark("loaded_classifier", doc.pk, t0)
|
|
||||||
|
|
||||||
dates = []
|
dates = []
|
||||||
if settings.NUMBER_OF_SUGGESTED_DATES > 0:
|
if settings.NUMBER_OF_SUGGESTED_DATES > 0:
|
||||||
gen = parse_date_generator(doc.filename, doc.content)
|
gen = parse_date_generator(doc.filename, doc.content)
|
||||||
_mark("before_dates", doc.pk, t0)
|
|
||||||
dates = sorted(
|
dates = sorted(
|
||||||
{i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)},
|
{i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)},
|
||||||
)
|
)
|
||||||
_mark("after_dates", doc.pk, t0)
|
|
||||||
|
|
||||||
_mark("before_match", doc.pk, t0)
|
|
||||||
resp_data = {
|
resp_data = {
|
||||||
"correspondents": [
|
"correspondents": [
|
||||||
c.id for c in match_correspondents(doc, classifier, request.user)
|
c.id for c in match_correspondents(doc, classifier, request.user)
|
||||||
@@ -837,11 +794,9 @@ class DocumentViewSet(
|
|||||||
],
|
],
|
||||||
"dates": [date.strftime("%Y-%m-%d") for date in dates if date is not None],
|
"dates": [date.strftime("%Y-%m-%d") for date in dates if date is not None],
|
||||||
}
|
}
|
||||||
_mark("assembled_resp", doc.pk, t0)
|
|
||||||
|
|
||||||
# Cache the suggestions and the classifier hash for later
|
# Cache the suggestions and the classifier hash for later
|
||||||
set_suggestions_cache(doc.pk, resp_data, classifier)
|
set_suggestions_cache(doc.pk, resp_data, classifier)
|
||||||
_mark("cached", doc.pk, t0)
|
|
||||||
|
|
||||||
return Response(resp_data)
|
return Response(resp_data)
|
||||||
|
|
||||||
|
@@ -2,7 +2,7 @@ msgid ""
|
|||||||
msgstr ""
|
msgstr ""
|
||||||
"Project-Id-Version: paperless-ngx\n"
|
"Project-Id-Version: paperless-ngx\n"
|
||||||
"Report-Msgid-Bugs-To: \n"
|
"Report-Msgid-Bugs-To: \n"
|
||||||
"POT-Creation-Date: 2025-08-16 14:34+0000\n"
|
"POT-Creation-Date: 2025-08-31 22:24+0000\n"
|
||||||
"PO-Revision-Date: 2022-02-17 04:17\n"
|
"PO-Revision-Date: 2022-02-17 04:17\n"
|
||||||
"Last-Translator: \n"
|
"Last-Translator: \n"
|
||||||
"Language-Team: English\n"
|
"Language-Team: English\n"
|
||||||
@@ -1645,147 +1645,147 @@ msgstr ""
|
|||||||
msgid "paperless application settings"
|
msgid "paperless application settings"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:774
|
#: paperless/settings.py:772
|
||||||
msgid "English (US)"
|
msgid "English (US)"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:775
|
#: paperless/settings.py:773
|
||||||
msgid "Arabic"
|
msgid "Arabic"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:776
|
#: paperless/settings.py:774
|
||||||
msgid "Afrikaans"
|
msgid "Afrikaans"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:777
|
#: paperless/settings.py:775
|
||||||
msgid "Belarusian"
|
msgid "Belarusian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:778
|
#: paperless/settings.py:776
|
||||||
msgid "Bulgarian"
|
msgid "Bulgarian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:779
|
#: paperless/settings.py:777
|
||||||
msgid "Catalan"
|
msgid "Catalan"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:780
|
#: paperless/settings.py:778
|
||||||
msgid "Czech"
|
msgid "Czech"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:781
|
#: paperless/settings.py:779
|
||||||
msgid "Danish"
|
msgid "Danish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:782
|
#: paperless/settings.py:780
|
||||||
msgid "German"
|
msgid "German"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:783
|
#: paperless/settings.py:781
|
||||||
msgid "Greek"
|
msgid "Greek"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:784
|
#: paperless/settings.py:782
|
||||||
msgid "English (GB)"
|
msgid "English (GB)"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:785
|
#: paperless/settings.py:783
|
||||||
msgid "Spanish"
|
msgid "Spanish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:786
|
#: paperless/settings.py:784
|
||||||
msgid "Persian"
|
msgid "Persian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:787
|
#: paperless/settings.py:785
|
||||||
msgid "Finnish"
|
msgid "Finnish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:788
|
#: paperless/settings.py:786
|
||||||
msgid "French"
|
msgid "French"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:789
|
#: paperless/settings.py:787
|
||||||
msgid "Hungarian"
|
msgid "Hungarian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:790
|
#: paperless/settings.py:788
|
||||||
msgid "Italian"
|
msgid "Italian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:791
|
#: paperless/settings.py:789
|
||||||
msgid "Japanese"
|
msgid "Japanese"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:792
|
#: paperless/settings.py:790
|
||||||
msgid "Korean"
|
msgid "Korean"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:793
|
#: paperless/settings.py:791
|
||||||
msgid "Luxembourgish"
|
msgid "Luxembourgish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:794
|
#: paperless/settings.py:792
|
||||||
msgid "Norwegian"
|
msgid "Norwegian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:795
|
#: paperless/settings.py:793
|
||||||
msgid "Dutch"
|
msgid "Dutch"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:796
|
#: paperless/settings.py:794
|
||||||
msgid "Polish"
|
msgid "Polish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:797
|
#: paperless/settings.py:795
|
||||||
msgid "Portuguese (Brazil)"
|
msgid "Portuguese (Brazil)"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:798
|
#: paperless/settings.py:796
|
||||||
msgid "Portuguese"
|
msgid "Portuguese"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:799
|
#: paperless/settings.py:797
|
||||||
msgid "Romanian"
|
msgid "Romanian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:800
|
#: paperless/settings.py:798
|
||||||
msgid "Russian"
|
msgid "Russian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:801
|
#: paperless/settings.py:799
|
||||||
msgid "Slovak"
|
msgid "Slovak"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:802
|
#: paperless/settings.py:800
|
||||||
msgid "Slovenian"
|
msgid "Slovenian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:803
|
#: paperless/settings.py:801
|
||||||
msgid "Serbian"
|
msgid "Serbian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:804
|
#: paperless/settings.py:802
|
||||||
msgid "Swedish"
|
msgid "Swedish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:805
|
#: paperless/settings.py:803
|
||||||
msgid "Turkish"
|
msgid "Turkish"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:806
|
#: paperless/settings.py:804
|
||||||
msgid "Ukrainian"
|
msgid "Ukrainian"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:807
|
#: paperless/settings.py:805
|
||||||
msgid "Vietnamese"
|
msgid "Vietnamese"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:808
|
#: paperless/settings.py:806
|
||||||
msgid "Chinese Simplified"
|
msgid "Chinese Simplified"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
#: paperless/settings.py:809
|
#: paperless/settings.py:807
|
||||||
msgid "Chinese Traditional"
|
msgid "Chinese Traditional"
|
||||||
msgstr ""
|
msgstr ""
|
||||||
|
|
||||||
|
@@ -1,14 +1,7 @@
|
|||||||
import logging
|
|
||||||
import os
|
|
||||||
import resource
|
|
||||||
import time
|
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from paperless import version
|
from paperless import version
|
||||||
|
|
||||||
logger = logging.getLogger("middleware")
|
|
||||||
|
|
||||||
|
|
||||||
class ApiVersionMiddleware:
|
class ApiVersionMiddleware:
|
||||||
def __init__(self, get_response):
|
def __init__(self, get_response):
|
||||||
@@ -22,56 +15,3 @@ class ApiVersionMiddleware:
|
|||||||
response["X-Version"] = version.__full_version_str__
|
response["X-Version"] = version.__full_version_str__
|
||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
import psutil
|
|
||||||
|
|
||||||
_PSUTIL = True
|
|
||||||
except Exception:
|
|
||||||
_PSUTIL = False
|
|
||||||
|
|
||||||
|
|
||||||
class MemLogMiddleware:
|
|
||||||
def __init__(self, get_response):
|
|
||||||
self.get_response = get_response
|
|
||||||
|
|
||||||
def __call__(self, request):
|
|
||||||
# capture baseline
|
|
||||||
ru_before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
||||||
if _PSUTIL:
|
|
||||||
p = psutil.Process()
|
|
||||||
rss_before = p.memory_info().rss
|
|
||||||
else:
|
|
||||||
rss_before = 0
|
|
||||||
|
|
||||||
t0 = time.perf_counter()
|
|
||||||
try:
|
|
||||||
return self.get_response(request)
|
|
||||||
finally:
|
|
||||||
dur_ms = (time.perf_counter() - t0) * 1000.0
|
|
||||||
|
|
||||||
ru_after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
||||||
# ru_maxrss is KB on Linux; convert to MB
|
|
||||||
peak_mb = (ru_after) / 1024.0
|
|
||||||
peak_delta_mb = (ru_after - ru_before) / 1024.0
|
|
||||||
|
|
||||||
if _PSUTIL:
|
|
||||||
rss_after = p.memory_info().rss
|
|
||||||
delta_mb = (rss_after - rss_before) / (1024 * 1024)
|
|
||||||
rss_mb = rss_after / (1024 * 1024)
|
|
||||||
else:
|
|
||||||
delta_mb = 0.0
|
|
||||||
rss_mb = 0.0
|
|
||||||
|
|
||||||
logger.debug(
|
|
||||||
"pid=%s mem rss=%.1fMB Δend=%.1fMB peak=%.1fMB Δpeak=%.1fMB dur=%.1fms %s %s",
|
|
||||||
os.getpid(),
|
|
||||||
rss_mb,
|
|
||||||
delta_mb,
|
|
||||||
peak_mb,
|
|
||||||
peak_delta_mb,
|
|
||||||
dur_ms,
|
|
||||||
request.method,
|
|
||||||
request.path,
|
|
||||||
)
|
|
||||||
|
@@ -17,8 +17,6 @@ from dateparser.languages.loader import LocaleDataLoader
|
|||||||
from django.utils.translation import gettext_lazy as _
|
from django.utils.translation import gettext_lazy as _
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
from paperless.utils import ocr_to_dateparser_languages
|
|
||||||
|
|
||||||
logger = logging.getLogger("paperless.settings")
|
logger = logging.getLogger("paperless.settings")
|
||||||
|
|
||||||
# Tap paperless.conf if it's available
|
# Tap paperless.conf if it's available
|
||||||
@@ -363,7 +361,6 @@ if DEBUG:
|
|||||||
)
|
)
|
||||||
|
|
||||||
MIDDLEWARE = [
|
MIDDLEWARE = [
|
||||||
"paperless.middleware.MemLogMiddleware",
|
|
||||||
"django.middleware.security.SecurityMiddleware",
|
"django.middleware.security.SecurityMiddleware",
|
||||||
"whitenoise.middleware.WhiteNoiseMiddleware",
|
"whitenoise.middleware.WhiteNoiseMiddleware",
|
||||||
"django.contrib.sessions.middleware.SessionMiddleware",
|
"django.contrib.sessions.middleware.SessionMiddleware",
|
||||||
@@ -834,7 +831,7 @@ LOGGING = {
|
|||||||
"disable_existing_loggers": False,
|
"disable_existing_loggers": False,
|
||||||
"formatters": {
|
"formatters": {
|
||||||
"verbose": {
|
"verbose": {
|
||||||
"format": "[{asctime}] [{levelname}] [{name}] pid={process} {message}",
|
"format": "[{asctime}] [{levelname}] [{name}] {message}",
|
||||||
"style": "{",
|
"style": "{",
|
||||||
},
|
},
|
||||||
"simple": {
|
"simple": {
|
||||||
@@ -879,7 +876,6 @@ LOGGING = {
|
|||||||
"kombu": {"handlers": ["file_celery"], "level": "DEBUG"},
|
"kombu": {"handlers": ["file_celery"], "level": "DEBUG"},
|
||||||
"_granian": {"handlers": ["file_paperless"], "level": "DEBUG"},
|
"_granian": {"handlers": ["file_paperless"], "level": "DEBUG"},
|
||||||
"granian.access": {"handlers": ["file_paperless"], "level": "DEBUG"},
|
"granian.access": {"handlers": ["file_paperless"], "level": "DEBUG"},
|
||||||
"middleware": {"handlers": ["console"], "level": "DEBUG"},
|
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1186,61 +1182,6 @@ DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
|
|||||||
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
|
||||||
|
|
||||||
|
|
||||||
def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
|
|
||||||
"""
|
|
||||||
Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")
|
|
||||||
into a list of locales compatible with the `dateparser` library.
|
|
||||||
|
|
||||||
- If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").
|
|
||||||
Falls back to the base language (e.g., "az") if needed.
|
|
||||||
- If a language cannot be mapped or validated, it is skipped with a warning.
|
|
||||||
- Returns a list of valid locales, or an empty list if none could be converted.
|
|
||||||
"""
|
|
||||||
ocr_to_dateparser = ocr_to_dateparser_languages()
|
|
||||||
loader = LocaleDataLoader()
|
|
||||||
result = []
|
|
||||||
try:
|
|
||||||
for ocr_language in ocr_languages.split("+"):
|
|
||||||
# Split into language and optional script
|
|
||||||
ocr_lang_part, *script = ocr_language.split("_")
|
|
||||||
ocr_script_part = script[0] if script else None
|
|
||||||
|
|
||||||
language_part = ocr_to_dateparser.get(ocr_lang_part)
|
|
||||||
if language_part is None:
|
|
||||||
logger.debug(
|
|
||||||
f'Unable to map OCR language "{ocr_lang_part}" to dateparser locale. ',
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Ensure base language is supported by dateparser
|
|
||||||
loader.get_locale_map(locales=[language_part])
|
|
||||||
|
|
||||||
# Try to add the script part if it's supported by dateparser
|
|
||||||
if ocr_script_part:
|
|
||||||
dateparser_language = f"{language_part}-{ocr_script_part.title()}"
|
|
||||||
try:
|
|
||||||
loader.get_locale_map(locales=[dateparser_language])
|
|
||||||
except Exception:
|
|
||||||
logger.info(
|
|
||||||
f"Language variant '{dateparser_language}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
|
|
||||||
)
|
|
||||||
dateparser_language = language_part
|
|
||||||
else:
|
|
||||||
dateparser_language = language_part
|
|
||||||
if dateparser_language not in result:
|
|
||||||
result.append(dateparser_language)
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(
|
|
||||||
f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
|
|
||||||
)
|
|
||||||
return []
|
|
||||||
if not result:
|
|
||||||
logger.info(
|
|
||||||
"Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
|
|
||||||
)
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_dateparser_languages(languages: str | None):
|
def _parse_dateparser_languages(languages: str | None):
|
||||||
language_list = languages.split("+") if languages else []
|
language_list = languages.split("+") if languages else []
|
||||||
# There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib.
|
# There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib.
|
||||||
@@ -1255,12 +1196,14 @@ def _parse_dateparser_languages(languages: str | None):
|
|||||||
return list(LocaleDataLoader().get_locale_map(locales=language_list))
|
return list(LocaleDataLoader().get_locale_map(locales=language_list))
|
||||||
|
|
||||||
|
|
||||||
if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"):
|
# If not set, we will infer it at runtime
|
||||||
DATE_PARSER_LANGUAGES = _parse_dateparser_languages(
|
DATE_PARSER_LANGUAGES = (
|
||||||
|
_parse_dateparser_languages(
|
||||||
os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"),
|
os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"),
|
||||||
)
|
)
|
||||||
else:
|
if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES")
|
||||||
DATE_PARSER_LANGUAGES = _ocr_to_dateparser_languages(OCR_LANGUAGE)
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Maximum number of dates taken from document start to end to show as suggestions for
|
# Maximum number of dates taken from document start to end to show as suggestions for
|
||||||
|
@@ -6,7 +6,6 @@ from unittest import mock
|
|||||||
import pytest
|
import pytest
|
||||||
from celery.schedules import crontab
|
from celery.schedules import crontab
|
||||||
|
|
||||||
from paperless.settings import _ocr_to_dateparser_languages
|
|
||||||
from paperless.settings import _parse_base_paths
|
from paperless.settings import _parse_base_paths
|
||||||
from paperless.settings import _parse_beat_schedule
|
from paperless.settings import _parse_beat_schedule
|
||||||
from paperless.settings import _parse_dateparser_languages
|
from paperless.settings import _parse_dateparser_languages
|
||||||
@@ -476,33 +475,6 @@ class TestPathSettings(TestCase):
|
|||||||
self.assertEqual("/foobar/", base_paths[4]) # LOGOUT_REDIRECT_URL
|
self.assertEqual("/foobar/", base_paths[4]) # LOGOUT_REDIRECT_URL
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    ("ocr_language", "expected"),
    [
        # A single Tesseract code
        ("eng", ["en"]),
        # Several codes joined with "+"
        ("fra+ita+lao", ["fr", "it", "lo"]),
        # A code that has no two-letter counterpart
        ("fil", ["fil"]),
        # Script suffixes that dateparser knows about
        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
        # Script suffix unknown to dateparser:
        # the plain language (without script) is expected instead
        ("deu_frak", ["de"]),
        # Traditional and simplified Chinese are named differently in dateparser,
        # so both collapse to the generic Chinese language
        ("chi_tra+chi_sim", ["zh"]),
        # Unsupported entries are dropped, supported ones are kept
        ("eng+unsupported_language+por", ["en", "pt"]),
        # Nothing supported at all: empty result (caller falls back to default)
        ("unsupported1+unsupported2", []),
    ],
)
def test_ocr_to_dateparser_languages(ocr_language, expected):
    """Check the OCR-language to dateparser-language conversion, ignoring order."""
    converted = _ocr_to_dateparser_languages(ocr_language)
    assert sorted(converted) == sorted(expected)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("languages", "expected"),
|
("languages", "expected"),
|
||||||
[
|
[
|
||||||
|
52
src/paperless/tests/test_utils.py
Normal file
52
src/paperless/tests/test_utils.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from paperless import utils
|
||||||
|
from paperless.utils import ocr_to_dateparser_languages
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("ocr_language", "expected"),
    [
        # A single Tesseract code
        ("eng", ["en"]),
        # Several codes joined with "+"
        ("fra+ita+lao", ["fr", "it", "lo"]),
        # A code that has no two-letter counterpart
        ("fil", ["fil"]),
        # Script suffixes that dateparser knows about
        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
        # Script suffix unknown to dateparser:
        # the plain language (without script) is expected instead
        ("deu_frak", ["de"]),
        # Traditional and simplified Chinese are named differently in dateparser,
        # so both collapse to the generic Chinese language
        ("chi_tra+chi_sim", ["zh"]),
        # Unsupported entries are dropped, supported ones are kept
        ("eng+unsupported_language+por", ["en", "pt"]),
        # Nothing supported at all: empty result (caller falls back to default)
        ("unsupported1+unsupported2", []),
        # The same language given twice must appear only once
        ("eng+eng", ["en"]),
        # Known language combined with a script that is not mapped
        ("ita_unknownscript", ["it"]),
    ],
)
def test_ocr_to_dateparser_languages(ocr_language, expected):
    """Check the OCR-language to dateparser-locale conversion, ignoring order."""
    converted = ocr_to_dateparser_languages(ocr_language)
    assert sorted(converted) == sorted(expected)
|
||||||
|
|
||||||
|
|
||||||
|
def test_ocr_to_dateparser_languages_exception(monkeypatch, caplog):
    """A failing locale loader must yield an empty list plus a warning hint."""

    class _FailingLoader:
        # Stand-in for LocaleDataLoader whose lookup always blows up.
        def get_locale_map(self, locales=None):
            raise RuntimeError("Simulated error")

    # Calling utils.LocaleDataLoader() now produces the failing stand-in.
    monkeypatch.setattr(utils, "LocaleDataLoader", _FailingLoader)

    with caplog.at_level(logging.WARNING):
        converted = utils.ocr_to_dateparser_languages("eng+fra")

    expected_hint = "Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this"
    assert converted == []
    assert expected_hint in caplog.text
|
@@ -1,4 +1,10 @@
|
|||||||
def ocr_to_dateparser_languages() -> dict[str, str]:
|
import logging
|
||||||
|
|
||||||
|
from dateparser.languages.loader import LocaleDataLoader
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.utils")
|
||||||
|
|
||||||
|
OCR_TO_DATEPARSER_LANGUAGES = {
|
||||||
"""
|
"""
|
||||||
Translation map from languages supported by Tesseract OCR
|
Translation map from languages supported by Tesseract OCR
|
||||||
to languages supported by dateparser.
|
to languages supported by dateparser.
|
||||||
@@ -14,7 +20,6 @@ def ocr_to_dateparser_languages() -> dict[str, str]:
|
|||||||
# agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
|
# agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
|
||||||
# ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
|
# ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
|
||||||
# rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
|
# rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
|
||||||
return {
|
|
||||||
"afr": "af",
|
"afr": "af",
|
||||||
"amh": "am",
|
"amh": "am",
|
||||||
"ara": "ar",
|
"ara": "ar",
|
||||||
@@ -108,3 +113,57 @@ def ocr_to_dateparser_languages() -> dict[str, str]:
|
|||||||
"yor": "yo",
|
"yor": "yo",
|
||||||
"chi": "zh",
|
"chi": "zh",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
    """
    Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional
    scripts like "aze_Cyrl") into a list of locales compatible with the
    `dateparser` library.

    - Codes are separated by "+"; surrounding whitespace and empty entries are ignored.
    - A code with no entry in OCR_TO_DATEPARSER_LANGUAGES is skipped (logged at
      debug level).
    - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale
      (e.g., "az-Cyrl"). Falls back to the base language (e.g., "az") when
      dateparser rejects the variant (logged at info level).
    - If dateparser rejects a mapped base language, or any other unexpected error
      occurs, a warning is logged and an empty list is returned for the whole input.
    - Duplicates are dropped; the order of first appearance is kept.
    - Returns a list of valid locales, or an empty list if none could be converted.
    """
    result: list[str] = []
    try:
        # Created inside the try block so that a failure while setting up
        # dateparser is handled like any other auto-configuration error.
        loader = LocaleDataLoader()
        for ocr_language in ocr_languages.split("+"):
            ocr_language = ocr_language.strip()
            if not ocr_language:
                # Nothing to convert for blank entries such as a trailing "+"
                continue

            # Split into language and optional script
            ocr_lang_part, *script = ocr_language.split("_")
            ocr_script_part = script[0] if script else None

            language_part = OCR_TO_DATEPARSER_LANGUAGES.get(ocr_lang_part)
            if language_part is None:
                logger.debug(
                    'Unable to map OCR language "%s" to dateparser locale. ',
                    ocr_lang_part,
                )
                continue

            # Ensure base language is supported by dateparser; an exception here
            # is deliberately left to the outer handler.
            loader.get_locale_map(locales=[language_part])

            dateparser_language = language_part
            # Try to add the script part if it's supported by dateparser
            if ocr_script_part:
                variant = f"{language_part}-{ocr_script_part.title()}"
                try:
                    loader.get_locale_map(locales=[variant])
                except Exception:
                    logger.info(
                        "Language variant '%s' not supported by dateparser; falling back to base language '%s'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
                        variant,
                        language_part,
                    )
                else:
                    dateparser_language = variant

            if dateparser_language not in result:
                result.append(dateparser_language)
    except Exception as e:
        logger.warning(
            "Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: %s",
            e,
        )
        return []

    if not result:
        logger.info(
            "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
        )
    return result
|
||||||
|
15
uv.lock
generated
15
uv.lock
generated
@@ -2046,7 +2046,6 @@ dependencies = [
|
|||||||
{ name = "ocrmypdf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "ocrmypdf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "pathvalidate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "pathvalidate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "pdf2image", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "pdf2image", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "psutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
|
||||||
{ name = "psycopg-pool", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "psycopg-pool", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "python-dateutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "python-dateutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "python-dotenv", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "python-dotenv", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
@@ -2183,7 +2182,6 @@ requires-dist = [
|
|||||||
{ name = "ocrmypdf", specifier = "~=16.10.0" },
|
{ name = "ocrmypdf", specifier = "~=16.10.0" },
|
||||||
{ name = "pathvalidate", specifier = "~=3.3.1" },
|
{ name = "pathvalidate", specifier = "~=3.3.1" },
|
||||||
{ name = "pdf2image", specifier = "~=1.17.0" },
|
{ name = "pdf2image", specifier = "~=1.17.0" },
|
||||||
{ name = "psutil", specifier = ">=7.0.0" },
|
|
||||||
{ name = "psycopg", extras = ["c", "pool"], marker = "extra == 'postgres'", specifier = "==3.2.9" },
|
{ name = "psycopg", extras = ["c", "pool"], marker = "extra == 'postgres'", specifier = "==3.2.9" },
|
||||||
{ name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_aarch64.whl" },
|
{ name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_aarch64.whl" },
|
||||||
{ name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_x86_64.whl" },
|
{ name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_x86_64.whl" },
|
||||||
@@ -2550,19 +2548,6 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198", size = 387816, upload-time = "2025-01-20T15:55:29.98Z" },
|
{ url = "https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198", size = 387816, upload-time = "2025-01-20T15:55:29.98Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "psutil"
|
|
||||||
version = "7.0.0"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/2a/80/336820c1ad9286a4ded7e845b2eccfcb27851ab8ac6abece774a6ff4d3de/psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456", size = 497003, upload-time = "2025-02-13T21:54:07.946Z" }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/ed/e6/2d26234410f8b8abdbf891c9da62bee396583f713fb9f3325a4760875d22/psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25", size = 238051, upload-time = "2025-02-13T21:54:12.36Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/04/8b/30f930733afe425e3cbfc0e1468a30a18942350c1a8816acfade80c005c4/psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da", size = 239535, upload-time = "2025-02-13T21:54:16.07Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/2a/ed/d362e84620dd22876b55389248e522338ed1bf134a5edd3b8231d7207f6d/psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91", size = 275004, upload-time = "2025-02-13T21:54:18.662Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34", size = 277986, upload-time = "2025-02-13T21:54:21.811Z" },
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/eb/a2/709e0fe2f093556c17fbafda93ac032257242cabcc7ff3369e2cb76a97aa/psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993", size = 279544, upload-time = "2025-02-13T21:54:24.68Z" },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "psycopg"
|
name = "psycopg"
|
||||||
version = "3.2.9"
|
version = "3.2.9"
|
||||||
|
Reference in New Issue
Block a user