mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Compare commits
	
		
			7 Commits
		
	
	
		
			a6e41b4145
			...
			fix-sugges
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					7ea4893e42 | ||
| 
						 | 
					78255d0a99 | ||
| 
						 | 
					fc4cb08bda | ||
| 
						 | 
					875dc6602b | ||
| 
						 | 
					8084ece274 | ||
| 
						 | 
					70b24c056b | ||
| 
						 | 
					26d2d63c26 | 
							
								
								
									
										1008
									
								
								.github/workflows/ci.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1008
									
								
								.github/workflows/ci.yml
									
									
									
									
										vendored
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -53,6 +53,7 @@ dependencies = [
 | 
			
		||||
  "ocrmypdf~=16.10.0",
 | 
			
		||||
  "pathvalidate~=3.3.1",
 | 
			
		||||
  "pdf2image~=1.17.0",
 | 
			
		||||
  "psutil>=7",
 | 
			
		||||
  "psycopg-pool",
 | 
			
		||||
  "python-dateutil~=2.9.0",
 | 
			
		||||
  "python-dotenv~=1.1.0",
 | 
			
		||||
 
 | 
			
		||||
@@ -4,6 +4,7 @@ import logging
 | 
			
		||||
import pickle
 | 
			
		||||
import re
 | 
			
		||||
import warnings
 | 
			
		||||
from functools import lru_cache
 | 
			
		||||
from hashlib import sha256
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from typing import TYPE_CHECKING
 | 
			
		||||
@@ -50,6 +51,7 @@ class ClassifierModelCorruptError(Exception):
 | 
			
		||||
    pass
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@lru_cache(maxsize=1)
 | 
			
		||||
def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None:
 | 
			
		||||
    if not settings.MODEL_FILE.is_file():
 | 
			
		||||
        logger.debug(
 | 
			
		||||
@@ -61,6 +63,11 @@ def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | No
 | 
			
		||||
    classifier = DocumentClassifier()
 | 
			
		||||
    try:
 | 
			
		||||
        classifier.load()
 | 
			
		||||
        logger.debug("classifier_id=%s", id(classifier))
 | 
			
		||||
        logger.debug(
 | 
			
		||||
            "classifier_data_vectorizer_hash=%s",
 | 
			
		||||
            classifier.data_vectorizer_hash,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    except IncompatibleClassifierVersionError as e:
 | 
			
		||||
        logger.info(f"Classifier version incompatible: {e.message}, will re-train")
 | 
			
		||||
@@ -96,7 +103,8 @@ class DocumentClassifier:
 | 
			
		||||
    # v7 - Updated scikit-learn package version
 | 
			
		||||
    # v8 - Added storage path classifier
 | 
			
		||||
    # v9 - Changed from hashing to time/ids for re-train check
 | 
			
		||||
    FORMAT_VERSION = 9
 | 
			
		||||
    # v10 - Switch persistence to joblib with memory-mapping to reduce load-time memory spikes
 | 
			
		||||
    FORMAT_VERSION = 10
 | 
			
		||||
 | 
			
		||||
    def __init__(self) -> None:
 | 
			
		||||
        # last time a document changed and therefore training might be required
 | 
			
		||||
@@ -128,32 +136,51 @@ class DocumentClassifier:
 | 
			
		||||
        ).hexdigest()
 | 
			
		||||
 | 
			
		||||
    def load(self) -> None:
 | 
			
		||||
        import joblib
 | 
			
		||||
        from sklearn.exceptions import InconsistentVersionWarning
 | 
			
		||||
 | 
			
		||||
        # Catch warnings for processing
 | 
			
		||||
        with warnings.catch_warnings(record=True) as w:
 | 
			
		||||
            with Path(settings.MODEL_FILE).open("rb") as f:
 | 
			
		||||
                schema_version = pickle.load(f)
 | 
			
		||||
            try:
 | 
			
		||||
                state = joblib.load(settings.MODEL_FILE, mmap_mode="r")
 | 
			
		||||
            except Exception as err:
 | 
			
		||||
                # As a fallback, try to detect old pickle-based and mark incompatible
 | 
			
		||||
                try:
 | 
			
		||||
                    with Path(settings.MODEL_FILE).open("rb") as f:
 | 
			
		||||
                        _ = pickle.load(f)
 | 
			
		||||
                    raise IncompatibleClassifierVersionError(
 | 
			
		||||
                        "Cannot load classifier, incompatible versions.",
 | 
			
		||||
                    ) from err
 | 
			
		||||
                except IncompatibleClassifierVersionError:
 | 
			
		||||
                    raise
 | 
			
		||||
                except Exception:
 | 
			
		||||
                    # Not even a readable pickle header
 | 
			
		||||
                    raise ClassifierModelCorruptError from err
 | 
			
		||||
 | 
			
		||||
                if schema_version != self.FORMAT_VERSION:
 | 
			
		||||
            try:
 | 
			
		||||
                if (
 | 
			
		||||
                    not isinstance(state, dict)
 | 
			
		||||
                    or state.get("format_version") != self.FORMAT_VERSION
 | 
			
		||||
                ):
 | 
			
		||||
                    raise IncompatibleClassifierVersionError(
 | 
			
		||||
                        "Cannot load classifier, incompatible versions.",
 | 
			
		||||
                    )
 | 
			
		||||
                else:
 | 
			
		||||
                    try:
 | 
			
		||||
                        self.last_doc_change_time = pickle.load(f)
 | 
			
		||||
                        self.last_auto_type_hash = pickle.load(f)
 | 
			
		||||
 | 
			
		||||
                        self.data_vectorizer = pickle.load(f)
 | 
			
		||||
                        self._update_data_vectorizer_hash()
 | 
			
		||||
                        self.tags_binarizer = pickle.load(f)
 | 
			
		||||
                self.last_doc_change_time = state.get("last_doc_change_time")
 | 
			
		||||
                self.last_auto_type_hash = state.get("last_auto_type_hash")
 | 
			
		||||
 | 
			
		||||
                        self.tags_classifier = pickle.load(f)
 | 
			
		||||
                        self.correspondent_classifier = pickle.load(f)
 | 
			
		||||
                        self.document_type_classifier = pickle.load(f)
 | 
			
		||||
                        self.storage_path_classifier = pickle.load(f)
 | 
			
		||||
                    except Exception as err:
 | 
			
		||||
                        raise ClassifierModelCorruptError from err
 | 
			
		||||
                self.data_vectorizer = state.get("data_vectorizer")
 | 
			
		||||
                self._update_data_vectorizer_hash()
 | 
			
		||||
                self.tags_binarizer = state.get("tags_binarizer")
 | 
			
		||||
 | 
			
		||||
                self.tags_classifier = state.get("tags_classifier")
 | 
			
		||||
                self.correspondent_classifier = state.get("correspondent_classifier")
 | 
			
		||||
                self.document_type_classifier = state.get("document_type_classifier")
 | 
			
		||||
                self.storage_path_classifier = state.get("storage_path_classifier")
 | 
			
		||||
            except IncompatibleClassifierVersionError:
 | 
			
		||||
                raise
 | 
			
		||||
            except Exception as err:
 | 
			
		||||
                raise ClassifierModelCorruptError from err
 | 
			
		||||
 | 
			
		||||
            # Check for the warning about unpickling from differing versions
 | 
			
		||||
            # and consider it incompatible
 | 
			
		||||
@@ -171,23 +198,24 @@ class DocumentClassifier:
 | 
			
		||||
                    raise IncompatibleClassifierVersionError("sklearn version update")
 | 
			
		||||
 | 
			
		||||
    def save(self) -> None:
 | 
			
		||||
        import joblib
 | 
			
		||||
 | 
			
		||||
        target_file: Path = settings.MODEL_FILE
 | 
			
		||||
        target_file_temp: Path = target_file.with_suffix(".pickle.part")
 | 
			
		||||
        target_file_temp: Path = target_file.with_suffix(".joblib.part")
 | 
			
		||||
 | 
			
		||||
        with target_file_temp.open("wb") as f:
 | 
			
		||||
            pickle.dump(self.FORMAT_VERSION, f)
 | 
			
		||||
        state = {
 | 
			
		||||
            "format_version": self.FORMAT_VERSION,
 | 
			
		||||
            "last_doc_change_time": self.last_doc_change_time,
 | 
			
		||||
            "last_auto_type_hash": self.last_auto_type_hash,
 | 
			
		||||
            "data_vectorizer": self.data_vectorizer,
 | 
			
		||||
            "tags_binarizer": self.tags_binarizer,
 | 
			
		||||
            "tags_classifier": self.tags_classifier,
 | 
			
		||||
            "correspondent_classifier": self.correspondent_classifier,
 | 
			
		||||
            "document_type_classifier": self.document_type_classifier,
 | 
			
		||||
            "storage_path_classifier": self.storage_path_classifier,
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
            pickle.dump(self.last_doc_change_time, f)
 | 
			
		||||
            pickle.dump(self.last_auto_type_hash, f)
 | 
			
		||||
 | 
			
		||||
            pickle.dump(self.data_vectorizer, f)
 | 
			
		||||
 | 
			
		||||
            pickle.dump(self.tags_binarizer, f)
 | 
			
		||||
            pickle.dump(self.tags_classifier, f)
 | 
			
		||||
 | 
			
		||||
            pickle.dump(self.correspondent_classifier, f)
 | 
			
		||||
            pickle.dump(self.document_type_classifier, f)
 | 
			
		||||
            pickle.dump(self.storage_path_classifier, f)
 | 
			
		||||
        joblib.dump(state, target_file_temp, compress=3)
 | 
			
		||||
 | 
			
		||||
        target_file_temp.rename(target_file)
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -19,8 +19,6 @@ from documents.loggers import LoggingMixin
 | 
			
		||||
from documents.signals import document_consumer_declaration
 | 
			
		||||
from documents.utils import copy_file_with_basic_stats
 | 
			
		||||
from documents.utils import run_subprocess
 | 
			
		||||
from paperless.config import OcrConfig
 | 
			
		||||
from paperless.utils import ocr_to_dateparser_languages
 | 
			
		||||
 | 
			
		||||
if TYPE_CHECKING:
 | 
			
		||||
    import datetime
 | 
			
		||||
@@ -274,11 +272,6 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
 | 
			
		||||
        """
 | 
			
		||||
        import dateparser
 | 
			
		||||
 | 
			
		||||
        ocr_config = OcrConfig()
 | 
			
		||||
        languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
 | 
			
		||||
            ocr_config.language,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        return dateparser.parse(
 | 
			
		||||
            ds,
 | 
			
		||||
            settings={
 | 
			
		||||
@@ -287,7 +280,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
 | 
			
		||||
                "RETURN_AS_TIMEZONE_AWARE": True,
 | 
			
		||||
                "TIMEZONE": settings.TIME_ZONE,
 | 
			
		||||
            },
 | 
			
		||||
            locales=languages,
 | 
			
		||||
            locales=settings.DATE_PARSER_LANGUAGES,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def __filter(date: datetime.datetime) -> datetime.datetime | None:
 | 
			
		||||
 
 | 
			
		||||
@@ -1,14 +1,12 @@
 | 
			
		||||
import datetime
 | 
			
		||||
from zoneinfo import ZoneInfo
 | 
			
		||||
 | 
			
		||||
import pytest
 | 
			
		||||
from pytest_django.fixtures import SettingsWrapper
 | 
			
		||||
 | 
			
		||||
from documents.parsers import parse_date
 | 
			
		||||
from documents.parsers import parse_date_generator
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.django_db()
 | 
			
		||||
class TestDate:
 | 
			
		||||
    def test_date_format_1(self):
 | 
			
		||||
        text = "lorem ipsum 130218 lorem ipsum"
 | 
			
		||||
@@ -51,7 +49,7 @@ class TestDate:
 | 
			
		||||
        settings: SettingsWrapper,
 | 
			
		||||
        settings_timezone: ZoneInfo,
 | 
			
		||||
    ):
 | 
			
		||||
        settings.DATE_PARSER_LANGUAGES = ["de"]
 | 
			
		||||
        settings.DATE_PARSER_LANGUAGES = []
 | 
			
		||||
        text = "lorem ipsum\nMärz 2019\nlorem ipsum"
 | 
			
		||||
        date = parse_date("", text)
 | 
			
		||||
        assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
 | 
			
		||||
 
 | 
			
		||||
@@ -3,7 +3,9 @@ import logging
 | 
			
		||||
import os
 | 
			
		||||
import platform
 | 
			
		||||
import re
 | 
			
		||||
import resource
 | 
			
		||||
import tempfile
 | 
			
		||||
import time
 | 
			
		||||
import zipfile
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
@@ -190,6 +192,33 @@ if settings.AUDIT_LOG_ENABLED:
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger("paperless.api")
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
    import psutil
 | 
			
		||||
 | 
			
		||||
    _PS = psutil.Process(os.getpid())
 | 
			
		||||
except Exception:
 | 
			
		||||
    _PS = None
 | 
			
		||||
 | 
			
		||||
_diag_log = logging.getLogger("paperless")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _mem_mb():
 | 
			
		||||
    rss = _PS.memory_info().rss if _PS else 0
 | 
			
		||||
    peak_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
 | 
			
		||||
    return rss / (1024 * 1024), peak_kb / 1024.0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _mark(phase, doc_id, t0):
 | 
			
		||||
    rss, peak = _mem_mb()
 | 
			
		||||
    _diag_log.debug(
 | 
			
		||||
        "sugg doc=%s phase=%s rss=%.1fMB peak=%.1fMB t=%.1fms",
 | 
			
		||||
        doc_id,
 | 
			
		||||
        phase,
 | 
			
		||||
        rss,
 | 
			
		||||
        peak,
 | 
			
		||||
        (time.perf_counter() - t0) * 1000,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class IndexView(TemplateView):
 | 
			
		||||
    template_name = "index.html"
 | 
			
		||||
@@ -758,7 +787,16 @@ class DocumentViewSet(
 | 
			
		||||
        ),
 | 
			
		||||
    )
 | 
			
		||||
    def suggestions(self, request, pk=None):
 | 
			
		||||
        doc = get_object_or_404(Document.objects.select_related("owner"), pk=pk)
 | 
			
		||||
        t0 = time.perf_counter()
 | 
			
		||||
        # Don't fetch content here
 | 
			
		||||
        doc = get_object_or_404(
 | 
			
		||||
            Document.objects.select_related("owner").only(
 | 
			
		||||
                "id",
 | 
			
		||||
                "owner_id",
 | 
			
		||||
            ),
 | 
			
		||||
            pk=pk,
 | 
			
		||||
        )
 | 
			
		||||
        _mark("start", doc.pk, t0)
 | 
			
		||||
        if request.user is not None and not has_perms_owner_aware(
 | 
			
		||||
            request.user,
 | 
			
		||||
            "view_document",
 | 
			
		||||
@@ -769,18 +807,23 @@ class DocumentViewSet(
 | 
			
		||||
        document_suggestions = get_suggestion_cache(doc.pk)
 | 
			
		||||
 | 
			
		||||
        if document_suggestions is not None:
 | 
			
		||||
            _mark("cache_hit_return", doc.pk, t0)
 | 
			
		||||
            refresh_suggestions_cache(doc.pk)
 | 
			
		||||
            return Response(document_suggestions.suggestions)
 | 
			
		||||
 | 
			
		||||
        classifier = load_classifier()
 | 
			
		||||
        _mark("loaded_classifier", doc.pk, t0)
 | 
			
		||||
 | 
			
		||||
        dates = []
 | 
			
		||||
        if settings.NUMBER_OF_SUGGESTED_DATES > 0:
 | 
			
		||||
            gen = parse_date_generator(doc.filename, doc.content)
 | 
			
		||||
            _mark("before_dates", doc.pk, t0)
 | 
			
		||||
            dates = sorted(
 | 
			
		||||
                {i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)},
 | 
			
		||||
            )
 | 
			
		||||
            _mark("after_dates", doc.pk, t0)
 | 
			
		||||
 | 
			
		||||
        _mark("before_match", doc.pk, t0)
 | 
			
		||||
        resp_data = {
 | 
			
		||||
            "correspondents": [
 | 
			
		||||
                c.id for c in match_correspondents(doc, classifier, request.user)
 | 
			
		||||
@@ -794,9 +837,11 @@ class DocumentViewSet(
 | 
			
		||||
            ],
 | 
			
		||||
            "dates": [date.strftime("%Y-%m-%d") for date in dates if date is not None],
 | 
			
		||||
        }
 | 
			
		||||
        _mark("assembled_resp", doc.pk, t0)
 | 
			
		||||
 | 
			
		||||
        # Cache the suggestions and the classifier hash for later
 | 
			
		||||
        set_suggestions_cache(doc.pk, resp_data, classifier)
 | 
			
		||||
        _mark("cached", doc.pk, t0)
 | 
			
		||||
 | 
			
		||||
        return Response(resp_data)
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -2,7 +2,7 @@ msgid ""
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Project-Id-Version: paperless-ngx\n"
 | 
			
		||||
"Report-Msgid-Bugs-To: \n"
 | 
			
		||||
"POT-Creation-Date: 2025-08-31 22:24+0000\n"
 | 
			
		||||
"POT-Creation-Date: 2025-08-16 14:34+0000\n"
 | 
			
		||||
"PO-Revision-Date: 2022-02-17 04:17\n"
 | 
			
		||||
"Last-Translator: \n"
 | 
			
		||||
"Language-Team: English\n"
 | 
			
		||||
@@ -1645,147 +1645,147 @@ msgstr ""
 | 
			
		||||
msgid "paperless application settings"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:772
 | 
			
		||||
#: paperless/settings.py:774
 | 
			
		||||
msgid "English (US)"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:773
 | 
			
		||||
#: paperless/settings.py:775
 | 
			
		||||
msgid "Arabic"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:774
 | 
			
		||||
#: paperless/settings.py:776
 | 
			
		||||
msgid "Afrikaans"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:775
 | 
			
		||||
#: paperless/settings.py:777
 | 
			
		||||
msgid "Belarusian"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:776
 | 
			
		||||
#: paperless/settings.py:778
 | 
			
		||||
msgid "Bulgarian"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:777
 | 
			
		||||
#: paperless/settings.py:779
 | 
			
		||||
msgid "Catalan"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:778
 | 
			
		||||
#: paperless/settings.py:780
 | 
			
		||||
msgid "Czech"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:779
 | 
			
		||||
#: paperless/settings.py:781
 | 
			
		||||
msgid "Danish"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:780
 | 
			
		||||
#: paperless/settings.py:782
 | 
			
		||||
msgid "German"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:781
 | 
			
		||||
#: paperless/settings.py:783
 | 
			
		||||
msgid "Greek"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:782
 | 
			
		||||
#: paperless/settings.py:784
 | 
			
		||||
msgid "English (GB)"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:783
 | 
			
		||||
#: paperless/settings.py:785
 | 
			
		||||
msgid "Spanish"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:784
 | 
			
		||||
#: paperless/settings.py:786
 | 
			
		||||
msgid "Persian"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:785
 | 
			
		||||
#: paperless/settings.py:787
 | 
			
		||||
msgid "Finnish"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:786
 | 
			
		||||
#: paperless/settings.py:788
 | 
			
		||||
msgid "French"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:787
 | 
			
		||||
#: paperless/settings.py:789
 | 
			
		||||
msgid "Hungarian"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:788
 | 
			
		||||
#: paperless/settings.py:790
 | 
			
		||||
msgid "Italian"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:789
 | 
			
		||||
#: paperless/settings.py:791
 | 
			
		||||
msgid "Japanese"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:790
 | 
			
		||||
#: paperless/settings.py:792
 | 
			
		||||
msgid "Korean"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:791
 | 
			
		||||
#: paperless/settings.py:793
 | 
			
		||||
msgid "Luxembourgish"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:792
 | 
			
		||||
#: paperless/settings.py:794
 | 
			
		||||
msgid "Norwegian"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:793
 | 
			
		||||
#: paperless/settings.py:795
 | 
			
		||||
msgid "Dutch"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:794
 | 
			
		||||
#: paperless/settings.py:796
 | 
			
		||||
msgid "Polish"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:795
 | 
			
		||||
#: paperless/settings.py:797
 | 
			
		||||
msgid "Portuguese (Brazil)"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:796
 | 
			
		||||
#: paperless/settings.py:798
 | 
			
		||||
msgid "Portuguese"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:797
 | 
			
		||||
#: paperless/settings.py:799
 | 
			
		||||
msgid "Romanian"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:798
 | 
			
		||||
#: paperless/settings.py:800
 | 
			
		||||
msgid "Russian"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:799
 | 
			
		||||
#: paperless/settings.py:801
 | 
			
		||||
msgid "Slovak"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:800
 | 
			
		||||
#: paperless/settings.py:802
 | 
			
		||||
msgid "Slovenian"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:801
 | 
			
		||||
#: paperless/settings.py:803
 | 
			
		||||
msgid "Serbian"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:802
 | 
			
		||||
#: paperless/settings.py:804
 | 
			
		||||
msgid "Swedish"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:803
 | 
			
		||||
#: paperless/settings.py:805
 | 
			
		||||
msgid "Turkish"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:804
 | 
			
		||||
#: paperless/settings.py:806
 | 
			
		||||
msgid "Ukrainian"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:805
 | 
			
		||||
#: paperless/settings.py:807
 | 
			
		||||
msgid "Vietnamese"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:806
 | 
			
		||||
#: paperless/settings.py:808
 | 
			
		||||
msgid "Chinese Simplified"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: paperless/settings.py:807
 | 
			
		||||
#: paperless/settings.py:809
 | 
			
		||||
msgid "Chinese Traditional"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,7 +1,14 @@
 | 
			
		||||
import logging
 | 
			
		||||
import os
 | 
			
		||||
import resource
 | 
			
		||||
import time
 | 
			
		||||
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
 | 
			
		||||
from paperless import version
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger("middleware")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ApiVersionMiddleware:
 | 
			
		||||
    def __init__(self, get_response):
 | 
			
		||||
@@ -15,3 +22,56 @@ class ApiVersionMiddleware:
 | 
			
		||||
            response["X-Version"] = version.__full_version_str__
 | 
			
		||||
 | 
			
		||||
        return response
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
    import psutil
 | 
			
		||||
 | 
			
		||||
    _PSUTIL = True
 | 
			
		||||
except Exception:
 | 
			
		||||
    _PSUTIL = False
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MemLogMiddleware:
 | 
			
		||||
    def __init__(self, get_response):
 | 
			
		||||
        self.get_response = get_response
 | 
			
		||||
 | 
			
		||||
    def __call__(self, request):
 | 
			
		||||
        # capture baseline
 | 
			
		||||
        ru_before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
 | 
			
		||||
        if _PSUTIL:
 | 
			
		||||
            p = psutil.Process()
 | 
			
		||||
            rss_before = p.memory_info().rss
 | 
			
		||||
        else:
 | 
			
		||||
            rss_before = 0
 | 
			
		||||
 | 
			
		||||
        t0 = time.perf_counter()
 | 
			
		||||
        try:
 | 
			
		||||
            return self.get_response(request)
 | 
			
		||||
        finally:
 | 
			
		||||
            dur_ms = (time.perf_counter() - t0) * 1000.0
 | 
			
		||||
 | 
			
		||||
            ru_after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
 | 
			
		||||
            # ru_maxrss is KB on Linux; convert to MB
 | 
			
		||||
            peak_mb = (ru_after) / 1024.0
 | 
			
		||||
            peak_delta_mb = (ru_after - ru_before) / 1024.0
 | 
			
		||||
 | 
			
		||||
            if _PSUTIL:
 | 
			
		||||
                rss_after = p.memory_info().rss
 | 
			
		||||
                delta_mb = (rss_after - rss_before) / (1024 * 1024)
 | 
			
		||||
                rss_mb = rss_after / (1024 * 1024)
 | 
			
		||||
            else:
 | 
			
		||||
                delta_mb = 0.0
 | 
			
		||||
                rss_mb = 0.0
 | 
			
		||||
 | 
			
		||||
            logger.debug(
 | 
			
		||||
                "pid=%s mem rss=%.1fMB Δend=%.1fMB peak=%.1fMB Δpeak=%.1fMB dur=%.1fms %s %s",
 | 
			
		||||
                os.getpid(),
 | 
			
		||||
                rss_mb,
 | 
			
		||||
                delta_mb,
 | 
			
		||||
                peak_mb,
 | 
			
		||||
                peak_delta_mb,
 | 
			
		||||
                dur_ms,
 | 
			
		||||
                request.method,
 | 
			
		||||
                request.path,
 | 
			
		||||
            )
 | 
			
		||||
 
 | 
			
		||||
@@ -17,6 +17,8 @@ from dateparser.languages.loader import LocaleDataLoader
 | 
			
		||||
from django.utils.translation import gettext_lazy as _
 | 
			
		||||
from dotenv import load_dotenv
 | 
			
		||||
 | 
			
		||||
from paperless.utils import ocr_to_dateparser_languages
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger("paperless.settings")
 | 
			
		||||
 | 
			
		||||
# Tap paperless.conf if it's available
 | 
			
		||||
@@ -361,6 +363,7 @@ if DEBUG:
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
MIDDLEWARE = [
 | 
			
		||||
    "paperless.middleware.MemLogMiddleware",
 | 
			
		||||
    "django.middleware.security.SecurityMiddleware",
 | 
			
		||||
    "whitenoise.middleware.WhiteNoiseMiddleware",
 | 
			
		||||
    "django.contrib.sessions.middleware.SessionMiddleware",
 | 
			
		||||
@@ -831,7 +834,7 @@ LOGGING = {
 | 
			
		||||
    "disable_existing_loggers": False,
 | 
			
		||||
    "formatters": {
 | 
			
		||||
        "verbose": {
 | 
			
		||||
            "format": "[{asctime}] [{levelname}] [{name}] {message}",
 | 
			
		||||
            "format": "[{asctime}] [{levelname}] [{name}] pid={process} {message}",
 | 
			
		||||
            "style": "{",
 | 
			
		||||
        },
 | 
			
		||||
        "simple": {
 | 
			
		||||
@@ -876,6 +879,7 @@ LOGGING = {
 | 
			
		||||
        "kombu": {"handlers": ["file_celery"], "level": "DEBUG"},
 | 
			
		||||
        "_granian": {"handlers": ["file_paperless"], "level": "DEBUG"},
 | 
			
		||||
        "granian.access": {"handlers": ["file_paperless"], "level": "DEBUG"},
 | 
			
		||||
        "middleware": {"handlers": ["console"], "level": "DEBUG"},
 | 
			
		||||
    },
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -1182,6 +1186,61 @@ DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
 | 
			
		||||
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
 | 
			
		||||
    """
 | 
			
		||||
    Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")
 | 
			
		||||
    into a list of locales compatible with the `dateparser` library.
 | 
			
		||||
 | 
			
		||||
    - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").
 | 
			
		||||
    Falls back to the base language (e.g., "az") if needed.
 | 
			
		||||
    - If a language cannot be mapped or validated, it is skipped with a warning.
 | 
			
		||||
    - Returns a list of valid locales, or an empty list if none could be converted.
 | 
			
		||||
    """
 | 
			
		||||
    ocr_to_dateparser = ocr_to_dateparser_languages()
 | 
			
		||||
    loader = LocaleDataLoader()
 | 
			
		||||
    result = []
 | 
			
		||||
    try:
 | 
			
		||||
        for ocr_language in ocr_languages.split("+"):
 | 
			
		||||
            # Split into language and optional script
 | 
			
		||||
            ocr_lang_part, *script = ocr_language.split("_")
 | 
			
		||||
            ocr_script_part = script[0] if script else None
 | 
			
		||||
 | 
			
		||||
            language_part = ocr_to_dateparser.get(ocr_lang_part)
 | 
			
		||||
            if language_part is None:
 | 
			
		||||
                logger.debug(
 | 
			
		||||
                    f'Unable to map OCR language "{ocr_lang_part}" to dateparser locale. ',
 | 
			
		||||
                )
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            # Ensure base language is supported by dateparser
 | 
			
		||||
            loader.get_locale_map(locales=[language_part])
 | 
			
		||||
 | 
			
		||||
            # Try to add the script part if it's supported by dateparser
 | 
			
		||||
            if ocr_script_part:
 | 
			
		||||
                dateparser_language = f"{language_part}-{ocr_script_part.title()}"
 | 
			
		||||
                try:
 | 
			
		||||
                    loader.get_locale_map(locales=[dateparser_language])
 | 
			
		||||
                except Exception:
 | 
			
		||||
                    logger.info(
 | 
			
		||||
                        f"Language variant '{dateparser_language}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
 | 
			
		||||
                    )
 | 
			
		||||
                    dateparser_language = language_part
 | 
			
		||||
            else:
 | 
			
		||||
                dateparser_language = language_part
 | 
			
		||||
            if dateparser_language not in result:
 | 
			
		||||
                result.append(dateparser_language)
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logger.warning(
 | 
			
		||||
            f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
 | 
			
		||||
        )
 | 
			
		||||
        return []
 | 
			
		||||
    if not result:
 | 
			
		||||
        logger.info(
 | 
			
		||||
            "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
 | 
			
		||||
        )
 | 
			
		||||
    return result
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _parse_dateparser_languages(languages: str | None):
 | 
			
		||||
    language_list = languages.split("+") if languages else []
 | 
			
		||||
    # There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib.
 | 
			
		||||
@@ -1196,14 +1255,12 @@ def _parse_dateparser_languages(languages: str | None):
 | 
			
		||||
    return list(LocaleDataLoader().get_locale_map(locales=language_list))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# If not set, we will infer it at runtime
 | 
			
		||||
DATE_PARSER_LANGUAGES = (
 | 
			
		||||
    _parse_dateparser_languages(
 | 
			
		||||
if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"):
 | 
			
		||||
    DATE_PARSER_LANGUAGES = _parse_dateparser_languages(
 | 
			
		||||
        os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES"),
 | 
			
		||||
    )
 | 
			
		||||
    if os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES")
 | 
			
		||||
    else None
 | 
			
		||||
)
 | 
			
		||||
else:
 | 
			
		||||
    DATE_PARSER_LANGUAGES = _ocr_to_dateparser_languages(OCR_LANGUAGE)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Maximum number of dates taken from document start to end to show as suggestions for
 | 
			
		||||
 
 | 
			
		||||
@@ -6,6 +6,7 @@ from unittest import mock
 | 
			
		||||
import pytest
 | 
			
		||||
from celery.schedules import crontab
 | 
			
		||||
 | 
			
		||||
from paperless.settings import _ocr_to_dateparser_languages
 | 
			
		||||
from paperless.settings import _parse_base_paths
 | 
			
		||||
from paperless.settings import _parse_beat_schedule
 | 
			
		||||
from paperless.settings import _parse_dateparser_languages
 | 
			
		||||
@@ -475,6 +476,33 @@ class TestPathSettings(TestCase):
 | 
			
		||||
        self.assertEqual("/foobar/", base_paths[4])  # LOGOUT_REDIRECT_URL
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    ("ocr_language", "expected"),
 | 
			
		||||
    [
 | 
			
		||||
        # One language
 | 
			
		||||
        ("eng", ["en"]),
 | 
			
		||||
        # Multiple languages
 | 
			
		||||
        ("fra+ita+lao", ["fr", "it", "lo"]),
 | 
			
		||||
        # Languages that don't have a two-letter equivalent
 | 
			
		||||
        ("fil", ["fil"]),
 | 
			
		||||
        # Languages with a script part supported by dateparser
 | 
			
		||||
        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
 | 
			
		||||
        # Languages with a script part not supported by dateparser
 | 
			
		||||
        # In this case, default to the language without script
 | 
			
		||||
        ("deu_frak", ["de"]),
 | 
			
		||||
        # Traditional and simplified chinese don't have the same name in dateparser,
 | 
			
		||||
        # so they're converted to the general chinese language
 | 
			
		||||
        ("chi_tra+chi_sim", ["zh"]),
 | 
			
		||||
        # If a language is not supported by dateparser, fallback to the supported ones
 | 
			
		||||
        ("eng+unsupported_language+por", ["en", "pt"]),
 | 
			
		||||
        # If no language is supported, fallback to default
 | 
			
		||||
        ("unsupported1+unsupported2", []),
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_ocr_to_dateparser_languages(ocr_language, expected):
 | 
			
		||||
    assert sorted(_ocr_to_dateparser_languages(ocr_language)) == sorted(expected)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    ("languages", "expected"),
 | 
			
		||||
    [
 | 
			
		||||
 
 | 
			
		||||
@@ -1,52 +0,0 @@
 | 
			
		||||
import logging
 | 
			
		||||
 | 
			
		||||
import pytest
 | 
			
		||||
 | 
			
		||||
from paperless import utils
 | 
			
		||||
from paperless.utils import ocr_to_dateparser_languages
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    ("ocr_language", "expected"),
 | 
			
		||||
    [
 | 
			
		||||
        # One language
 | 
			
		||||
        ("eng", ["en"]),
 | 
			
		||||
        # Multiple languages
 | 
			
		||||
        ("fra+ita+lao", ["fr", "it", "lo"]),
 | 
			
		||||
        # Languages that don't have a two-letter equivalent
 | 
			
		||||
        ("fil", ["fil"]),
 | 
			
		||||
        # Languages with a script part supported by dateparser
 | 
			
		||||
        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
 | 
			
		||||
        # Languages with a script part not supported by dateparser
 | 
			
		||||
        # In this case, default to the language without script
 | 
			
		||||
        ("deu_frak", ["de"]),
 | 
			
		||||
        # Traditional and simplified chinese don't have the same name in dateparser,
 | 
			
		||||
        # so they're converted to the general chinese language
 | 
			
		||||
        ("chi_tra+chi_sim", ["zh"]),
 | 
			
		||||
        # If a language is not supported by dateparser, fallback to the supported ones
 | 
			
		||||
        ("eng+unsupported_language+por", ["en", "pt"]),
 | 
			
		||||
        # If no language is supported, fallback to default
 | 
			
		||||
        ("unsupported1+unsupported2", []),
 | 
			
		||||
        # Duplicate languages, should not duplicate in result
 | 
			
		||||
        ("eng+eng", ["en"]),
 | 
			
		||||
        # Language with script, but script is not mapped
 | 
			
		||||
        ("ita_unknownscript", ["it"]),
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_ocr_to_dateparser_languages(ocr_language, expected):
 | 
			
		||||
    assert sorted(ocr_to_dateparser_languages(ocr_language)) == sorted(expected)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_ocr_to_dateparser_languages_exception(monkeypatch, caplog):
 | 
			
		||||
    # Patch LocaleDataLoader.get_locale_map to raise an exception
 | 
			
		||||
    class DummyLoader:
 | 
			
		||||
        def get_locale_map(self, locales=None):
 | 
			
		||||
            raise RuntimeError("Simulated error")
 | 
			
		||||
 | 
			
		||||
    with caplog.at_level(logging.WARNING):
 | 
			
		||||
        monkeypatch.setattr(utils, "LocaleDataLoader", lambda: DummyLoader())
 | 
			
		||||
        result = utils.ocr_to_dateparser_languages("eng+fra")
 | 
			
		||||
        assert result == []
 | 
			
		||||
        assert (
 | 
			
		||||
            "Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this" in caplog.text
 | 
			
		||||
        )
 | 
			
		||||
@@ -1,10 +1,4 @@
 | 
			
		||||
import logging
 | 
			
		||||
 | 
			
		||||
from dateparser.languages.loader import LocaleDataLoader
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger("paperless.utils")
 | 
			
		||||
 | 
			
		||||
OCR_TO_DATEPARSER_LANGUAGES = {
 | 
			
		||||
def ocr_to_dateparser_languages() -> dict[str, str]:
 | 
			
		||||
    """
 | 
			
		||||
    Translation map from languages supported by Tesseract OCR
 | 
			
		||||
    to languages supported by dateparser.
 | 
			
		||||
@@ -20,150 +14,97 @@ OCR_TO_DATEPARSER_LANGUAGES = {
 | 
			
		||||
    # agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
 | 
			
		||||
    # ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
 | 
			
		||||
    # rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
 | 
			
		||||
    "afr": "af",
 | 
			
		||||
    "amh": "am",
 | 
			
		||||
    "ara": "ar",
 | 
			
		||||
    "asm": "as",
 | 
			
		||||
    "ast": "ast",
 | 
			
		||||
    "aze": "az",
 | 
			
		||||
    "bel": "be",
 | 
			
		||||
    "bul": "bg",
 | 
			
		||||
    "ben": "bn",
 | 
			
		||||
    "bod": "bo",
 | 
			
		||||
    "bre": "br",
 | 
			
		||||
    "bos": "bs",
 | 
			
		||||
    "cat": "ca",
 | 
			
		||||
    "cher": "chr",
 | 
			
		||||
    "ces": "cs",
 | 
			
		||||
    "cym": "cy",
 | 
			
		||||
    "dan": "da",
 | 
			
		||||
    "deu": "de",
 | 
			
		||||
    "dzo": "dz",
 | 
			
		||||
    "ell": "el",
 | 
			
		||||
    "eng": "en",
 | 
			
		||||
    "epo": "eo",
 | 
			
		||||
    "spa": "es",
 | 
			
		||||
    "est": "et",
 | 
			
		||||
    "eus": "eu",
 | 
			
		||||
    "fas": "fa",
 | 
			
		||||
    "fin": "fi",
 | 
			
		||||
    "fil": "fil",
 | 
			
		||||
    "fao": "fo",  # codespell:ignore
 | 
			
		||||
    "fra": "fr",
 | 
			
		||||
    "fry": "fy",
 | 
			
		||||
    "gle": "ga",
 | 
			
		||||
    "gla": "gd",
 | 
			
		||||
    "glg": "gl",
 | 
			
		||||
    "guj": "gu",
 | 
			
		||||
    "heb": "he",
 | 
			
		||||
    "hin": "hi",
 | 
			
		||||
    "hrv": "hr",
 | 
			
		||||
    "hun": "hu",
 | 
			
		||||
    "hye": "hy",
 | 
			
		||||
    "ind": "id",
 | 
			
		||||
    "isl": "is",
 | 
			
		||||
    "ita": "it",
 | 
			
		||||
    "jpn": "ja",
 | 
			
		||||
    "kat": "ka",
 | 
			
		||||
    "kaz": "kk",
 | 
			
		||||
    "khm": "km",
 | 
			
		||||
    "knda": "kn",
 | 
			
		||||
    "kor": "ko",
 | 
			
		||||
    "kir": "ky",
 | 
			
		||||
    "ltz": "lb",
 | 
			
		||||
    "lao": "lo",
 | 
			
		||||
    "lit": "lt",
 | 
			
		||||
    "lav": "lv",
 | 
			
		||||
    "mal": "ml",
 | 
			
		||||
    "mon": "mn",
 | 
			
		||||
    "mar": "mr",
 | 
			
		||||
    "msa": "ms",
 | 
			
		||||
    "mlt": "mt",
 | 
			
		||||
    "mya": "my",
 | 
			
		||||
    "nep": "ne",
 | 
			
		||||
    "nld": "nl",
 | 
			
		||||
    "ori": "or",
 | 
			
		||||
    "pan": "pa",
 | 
			
		||||
    "pol": "pl",
 | 
			
		||||
    "pus": "ps",
 | 
			
		||||
    "por": "pt",
 | 
			
		||||
    "que": "qu",
 | 
			
		||||
    "ron": "ro",
 | 
			
		||||
    "rus": "ru",
 | 
			
		||||
    "sin": "si",
 | 
			
		||||
    "slk": "sk",
 | 
			
		||||
    "slv": "sl",
 | 
			
		||||
    "sqi": "sq",
 | 
			
		||||
    "srp": "sr",
 | 
			
		||||
    "swe": "sv",
 | 
			
		||||
    "swa": "sw",
 | 
			
		||||
    "tam": "ta",
 | 
			
		||||
    "tel": "te",  # codespell:ignore
 | 
			
		||||
    "tha": "th",  # codespell:ignore
 | 
			
		||||
    "tir": "ti",
 | 
			
		||||
    "tgl": "tl",
 | 
			
		||||
    "ton": "to",
 | 
			
		||||
    "tur": "tr",
 | 
			
		||||
    "uig": "ug",
 | 
			
		||||
    "ukr": "uk",
 | 
			
		||||
    "urd": "ur",
 | 
			
		||||
    "uzb": "uz",
 | 
			
		||||
    "via": "vi",
 | 
			
		||||
    "yid": "yi",
 | 
			
		||||
    "yor": "yo",
 | 
			
		||||
    "chi": "zh",
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
 | 
			
		||||
    """
 | 
			
		||||
    Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")
 | 
			
		||||
    into a list of locales compatible with the `dateparser` library.
 | 
			
		||||
 | 
			
		||||
    - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").
 | 
			
		||||
    Falls back to the base language (e.g., "az") if needed.
 | 
			
		||||
    - If a language cannot be mapped or validated, it is skipped with a warning.
 | 
			
		||||
    - Returns a list of valid locales, or an empty list if none could be converted.
 | 
			
		||||
    """
 | 
			
		||||
    loader = LocaleDataLoader()
 | 
			
		||||
    result = []
 | 
			
		||||
    try:
 | 
			
		||||
        for ocr_language in ocr_languages.split("+"):
 | 
			
		||||
            # Split into language and optional script
 | 
			
		||||
            ocr_lang_part, *script = ocr_language.split("_")
 | 
			
		||||
            ocr_script_part = script[0] if script else None
 | 
			
		||||
 | 
			
		||||
            language_part = OCR_TO_DATEPARSER_LANGUAGES.get(ocr_lang_part)
 | 
			
		||||
            if language_part is None:
 | 
			
		||||
                logger.debug(
 | 
			
		||||
                    f'Unable to map OCR language "{ocr_lang_part}" to dateparser locale. ',
 | 
			
		||||
                )
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            # Ensure base language is supported by dateparser
 | 
			
		||||
            loader.get_locale_map(locales=[language_part])
 | 
			
		||||
 | 
			
		||||
            # Try to add the script part if it's supported by dateparser
 | 
			
		||||
            if ocr_script_part:
 | 
			
		||||
                dateparser_language = f"{language_part}-{ocr_script_part.title()}"
 | 
			
		||||
                try:
 | 
			
		||||
                    loader.get_locale_map(locales=[dateparser_language])
 | 
			
		||||
                except Exception:
 | 
			
		||||
                    logger.info(
 | 
			
		||||
                        f"Language variant '{dateparser_language}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
 | 
			
		||||
                    )
 | 
			
		||||
                    dateparser_language = language_part
 | 
			
		||||
            else:
 | 
			
		||||
                dateparser_language = language_part
 | 
			
		||||
            if dateparser_language not in result:
 | 
			
		||||
                result.append(dateparser_language)
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logger.warning(
 | 
			
		||||
            f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
 | 
			
		||||
        )
 | 
			
		||||
        return []
 | 
			
		||||
    if not result:
 | 
			
		||||
        logger.info(
 | 
			
		||||
            "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
 | 
			
		||||
        )
 | 
			
		||||
    return result
 | 
			
		||||
    return {
 | 
			
		||||
        "afr": "af",
 | 
			
		||||
        "amh": "am",
 | 
			
		||||
        "ara": "ar",
 | 
			
		||||
        "asm": "as",
 | 
			
		||||
        "ast": "ast",
 | 
			
		||||
        "aze": "az",
 | 
			
		||||
        "bel": "be",
 | 
			
		||||
        "bul": "bg",
 | 
			
		||||
        "ben": "bn",
 | 
			
		||||
        "bod": "bo",
 | 
			
		||||
        "bre": "br",
 | 
			
		||||
        "bos": "bs",
 | 
			
		||||
        "cat": "ca",
 | 
			
		||||
        "cher": "chr",
 | 
			
		||||
        "ces": "cs",
 | 
			
		||||
        "cym": "cy",
 | 
			
		||||
        "dan": "da",
 | 
			
		||||
        "deu": "de",
 | 
			
		||||
        "dzo": "dz",
 | 
			
		||||
        "ell": "el",
 | 
			
		||||
        "eng": "en",
 | 
			
		||||
        "epo": "eo",
 | 
			
		||||
        "spa": "es",
 | 
			
		||||
        "est": "et",
 | 
			
		||||
        "eus": "eu",
 | 
			
		||||
        "fas": "fa",
 | 
			
		||||
        "fin": "fi",
 | 
			
		||||
        "fil": "fil",
 | 
			
		||||
        "fao": "fo",  # codespell:ignore
 | 
			
		||||
        "fra": "fr",
 | 
			
		||||
        "fry": "fy",
 | 
			
		||||
        "gle": "ga",
 | 
			
		||||
        "gla": "gd",
 | 
			
		||||
        "glg": "gl",
 | 
			
		||||
        "guj": "gu",
 | 
			
		||||
        "heb": "he",
 | 
			
		||||
        "hin": "hi",
 | 
			
		||||
        "hrv": "hr",
 | 
			
		||||
        "hun": "hu",
 | 
			
		||||
        "hye": "hy",
 | 
			
		||||
        "ind": "id",
 | 
			
		||||
        "isl": "is",
 | 
			
		||||
        "ita": "it",
 | 
			
		||||
        "jpn": "ja",
 | 
			
		||||
        "kat": "ka",
 | 
			
		||||
        "kaz": "kk",
 | 
			
		||||
        "khm": "km",
 | 
			
		||||
        "knda": "kn",
 | 
			
		||||
        "kor": "ko",
 | 
			
		||||
        "kir": "ky",
 | 
			
		||||
        "ltz": "lb",
 | 
			
		||||
        "lao": "lo",
 | 
			
		||||
        "lit": "lt",
 | 
			
		||||
        "lav": "lv",
 | 
			
		||||
        "mal": "ml",
 | 
			
		||||
        "mon": "mn",
 | 
			
		||||
        "mar": "mr",
 | 
			
		||||
        "msa": "ms",
 | 
			
		||||
        "mlt": "mt",
 | 
			
		||||
        "mya": "my",
 | 
			
		||||
        "nep": "ne",
 | 
			
		||||
        "nld": "nl",
 | 
			
		||||
        "ori": "or",
 | 
			
		||||
        "pan": "pa",
 | 
			
		||||
        "pol": "pl",
 | 
			
		||||
        "pus": "ps",
 | 
			
		||||
        "por": "pt",
 | 
			
		||||
        "que": "qu",
 | 
			
		||||
        "ron": "ro",
 | 
			
		||||
        "rus": "ru",
 | 
			
		||||
        "sin": "si",
 | 
			
		||||
        "slk": "sk",
 | 
			
		||||
        "slv": "sl",
 | 
			
		||||
        "sqi": "sq",
 | 
			
		||||
        "srp": "sr",
 | 
			
		||||
        "swe": "sv",
 | 
			
		||||
        "swa": "sw",
 | 
			
		||||
        "tam": "ta",
 | 
			
		||||
        "tel": "te",  # codespell:ignore
 | 
			
		||||
        "tha": "th",  # codespell:ignore
 | 
			
		||||
        "tir": "ti",
 | 
			
		||||
        "tgl": "tl",
 | 
			
		||||
        "ton": "to",
 | 
			
		||||
        "tur": "tr",
 | 
			
		||||
        "uig": "ug",
 | 
			
		||||
        "ukr": "uk",
 | 
			
		||||
        "urd": "ur",
 | 
			
		||||
        "uzb": "uz",
 | 
			
		||||
        "via": "vi",
 | 
			
		||||
        "yid": "yi",
 | 
			
		||||
        "yor": "yo",
 | 
			
		||||
        "chi": "zh",
 | 
			
		||||
    }
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										15
									
								
								uv.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										15
									
								
								uv.lock
									
									
									
										generated
									
									
									
								
							@@ -2046,6 +2046,7 @@ dependencies = [
 | 
			
		||||
    { name = "ocrmypdf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
    { name = "pathvalidate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
    { name = "pdf2image", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
    { name = "psutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
    { name = "psycopg-pool", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
    { name = "python-dateutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
    { name = "python-dotenv", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
@@ -2182,6 +2183,7 @@ requires-dist = [
 | 
			
		||||
    { name = "ocrmypdf", specifier = "~=16.10.0" },
 | 
			
		||||
    { name = "pathvalidate", specifier = "~=3.3.1" },
 | 
			
		||||
    { name = "pdf2image", specifier = "~=1.17.0" },
 | 
			
		||||
    { name = "psutil", specifier = ">=7.0.0" },
 | 
			
		||||
    { name = "psycopg", extras = ["c", "pool"], marker = "extra == 'postgres'", specifier = "==3.2.9" },
 | 
			
		||||
    { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_aarch64.whl" },
 | 
			
		||||
    { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_x86_64.whl" },
 | 
			
		||||
@@ -2548,6 +2550,19 @@ wheels = [
 | 
			
		||||
    { url = "https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198", size = 387816, upload-time = "2025-01-20T15:55:29.98Z" },
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "psutil"
 | 
			
		||||
version = "7.0.0"
 | 
			
		||||
source = { registry = "https://pypi.org/simple" }
 | 
			
		||||
sdist = { url = "https://files.pythonhosted.org/packages/2a/80/336820c1ad9286a4ded7e845b2eccfcb27851ab8ac6abece774a6ff4d3de/psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456", size = 497003, upload-time = "2025-02-13T21:54:07.946Z" }
 | 
			
		||||
wheels = [
 | 
			
		||||
    { url = "https://files.pythonhosted.org/packages/ed/e6/2d26234410f8b8abdbf891c9da62bee396583f713fb9f3325a4760875d22/psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25", size = 238051, upload-time = "2025-02-13T21:54:12.36Z" },
 | 
			
		||||
    { url = "https://files.pythonhosted.org/packages/04/8b/30f930733afe425e3cbfc0e1468a30a18942350c1a8816acfade80c005c4/psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da", size = 239535, upload-time = "2025-02-13T21:54:16.07Z" },
 | 
			
		||||
    { url = "https://files.pythonhosted.org/packages/2a/ed/d362e84620dd22876b55389248e522338ed1bf134a5edd3b8231d7207f6d/psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91", size = 275004, upload-time = "2025-02-13T21:54:18.662Z" },
 | 
			
		||||
    { url = "https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34", size = 277986, upload-time = "2025-02-13T21:54:21.811Z" },
 | 
			
		||||
    { url = "https://files.pythonhosted.org/packages/eb/a2/709e0fe2f093556c17fbafda93ac032257242cabcc7ff3369e2cb76a97aa/psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993", size = 279544, upload-time = "2025-02-13T21:54:24.68Z" },
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "psycopg"
 | 
			
		||||
version = "3.2.9"
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user