mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Compare commits
	
		
			2 Commits
		
	
	
		
			fix-sugges
			...
			a6e41b4145
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | a6e41b4145 | ||
|   | cb927c5b22 | 
							
								
								
									
										1008
									
								
								.github/workflows/ci.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1008
									
								
								.github/workflows/ci.yml
									
									
									
									
										vendored
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -53,7 +53,6 @@ dependencies = [ | |||||||
|   "ocrmypdf~=16.10.0", |   "ocrmypdf~=16.10.0", | ||||||
|   "pathvalidate~=3.3.1", |   "pathvalidate~=3.3.1", | ||||||
|   "pdf2image~=1.17.0", |   "pdf2image~=1.17.0", | ||||||
|   "psutil>=7", |  | ||||||
|   "psycopg-pool", |   "psycopg-pool", | ||||||
|   "python-dateutil~=2.9.0", |   "python-dateutil~=2.9.0", | ||||||
|   "python-dotenv~=1.1.0", |   "python-dotenv~=1.1.0", | ||||||
|   | |||||||
| @@ -4,7 +4,6 @@ import logging | |||||||
| import pickle | import pickle | ||||||
| import re | import re | ||||||
| import warnings | import warnings | ||||||
| from functools import lru_cache |  | ||||||
| from hashlib import sha256 | from hashlib import sha256 | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from typing import TYPE_CHECKING | from typing import TYPE_CHECKING | ||||||
| @@ -51,7 +50,6 @@ class ClassifierModelCorruptError(Exception): | |||||||
|     pass |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
| @lru_cache(maxsize=1) |  | ||||||
| def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None: | def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None: | ||||||
|     if not settings.MODEL_FILE.is_file(): |     if not settings.MODEL_FILE.is_file(): | ||||||
|         logger.debug( |         logger.debug( | ||||||
| @@ -63,11 +61,6 @@ def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | No | |||||||
|     classifier = DocumentClassifier() |     classifier = DocumentClassifier() | ||||||
|     try: |     try: | ||||||
|         classifier.load() |         classifier.load() | ||||||
|         logger.debug("classifier_id=%s", id(classifier)) |  | ||||||
|         logger.debug( |  | ||||||
|             "classifier_data_vectorizer_hash=%s", |  | ||||||
|             classifier.data_vectorizer_hash, |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     except IncompatibleClassifierVersionError as e: |     except IncompatibleClassifierVersionError as e: | ||||||
|         logger.info(f"Classifier version incompatible: {e.message}, will re-train") |         logger.info(f"Classifier version incompatible: {e.message}, will re-train") | ||||||
| @@ -103,8 +96,7 @@ class DocumentClassifier: | |||||||
|     # v7 - Updated scikit-learn package version |     # v7 - Updated scikit-learn package version | ||||||
|     # v8 - Added storage path classifier |     # v8 - Added storage path classifier | ||||||
|     # v9 - Changed from hashing to time/ids for re-train check |     # v9 - Changed from hashing to time/ids for re-train check | ||||||
|     # v10 - Switch persistence to joblib with memory-mapping to reduce load-time memory spikes |     FORMAT_VERSION = 9 | ||||||
|     FORMAT_VERSION = 10 |  | ||||||
|  |  | ||||||
|     def __init__(self) -> None: |     def __init__(self) -> None: | ||||||
|         # last time a document changed and therefore training might be required |         # last time a document changed and therefore training might be required | ||||||
| @@ -136,51 +128,32 @@ class DocumentClassifier: | |||||||
|         ).hexdigest() |         ).hexdigest() | ||||||
|  |  | ||||||
|     def load(self) -> None: |     def load(self) -> None: | ||||||
|         import joblib |  | ||||||
|         from sklearn.exceptions import InconsistentVersionWarning |         from sklearn.exceptions import InconsistentVersionWarning | ||||||
|  |  | ||||||
|         # Catch warnings for processing |         # Catch warnings for processing | ||||||
|         with warnings.catch_warnings(record=True) as w: |         with warnings.catch_warnings(record=True) as w: | ||||||
|             try: |             with Path(settings.MODEL_FILE).open("rb") as f: | ||||||
|                 state = joblib.load(settings.MODEL_FILE, mmap_mode="r") |                 schema_version = pickle.load(f) | ||||||
|             except Exception as err: |  | ||||||
|                 # As a fallback, try to detect old pickle-based and mark incompatible |  | ||||||
|                 try: |  | ||||||
|                     with Path(settings.MODEL_FILE).open("rb") as f: |  | ||||||
|                         _ = pickle.load(f) |  | ||||||
|                     raise IncompatibleClassifierVersionError( |  | ||||||
|                         "Cannot load classifier, incompatible versions.", |  | ||||||
|                     ) from err |  | ||||||
|                 except IncompatibleClassifierVersionError: |  | ||||||
|                     raise |  | ||||||
|                 except Exception: |  | ||||||
|                     # Not even a readable pickle header |  | ||||||
|                     raise ClassifierModelCorruptError from err |  | ||||||
|  |  | ||||||
|             try: |                 if schema_version != self.FORMAT_VERSION: | ||||||
|                 if ( |  | ||||||
|                     not isinstance(state, dict) |  | ||||||
|                     or state.get("format_version") != self.FORMAT_VERSION |  | ||||||
|                 ): |  | ||||||
|                     raise IncompatibleClassifierVersionError( |                     raise IncompatibleClassifierVersionError( | ||||||
|                         "Cannot load classifier, incompatible versions.", |                         "Cannot load classifier, incompatible versions.", | ||||||
|                     ) |                     ) | ||||||
|  |                 else: | ||||||
|  |                     try: | ||||||
|  |                         self.last_doc_change_time = pickle.load(f) | ||||||
|  |                         self.last_auto_type_hash = pickle.load(f) | ||||||
|  |  | ||||||
|                 self.last_doc_change_time = state.get("last_doc_change_time") |                         self.data_vectorizer = pickle.load(f) | ||||||
|                 self.last_auto_type_hash = state.get("last_auto_type_hash") |                         self._update_data_vectorizer_hash() | ||||||
|  |                         self.tags_binarizer = pickle.load(f) | ||||||
|  |  | ||||||
|                 self.data_vectorizer = state.get("data_vectorizer") |                         self.tags_classifier = pickle.load(f) | ||||||
|                 self._update_data_vectorizer_hash() |                         self.correspondent_classifier = pickle.load(f) | ||||||
|                 self.tags_binarizer = state.get("tags_binarizer") |                         self.document_type_classifier = pickle.load(f) | ||||||
|  |                         self.storage_path_classifier = pickle.load(f) | ||||||
|                 self.tags_classifier = state.get("tags_classifier") |                     except Exception as err: | ||||||
|                 self.correspondent_classifier = state.get("correspondent_classifier") |                         raise ClassifierModelCorruptError from err | ||||||
|                 self.document_type_classifier = state.get("document_type_classifier") |  | ||||||
|                 self.storage_path_classifier = state.get("storage_path_classifier") |  | ||||||
|             except IncompatibleClassifierVersionError: |  | ||||||
|                 raise |  | ||||||
|             except Exception as err: |  | ||||||
|                 raise ClassifierModelCorruptError from err |  | ||||||
|  |  | ||||||
|             # Check for the warning about unpickling from differing versions |             # Check for the warning about unpickling from differing versions | ||||||
|             # and consider it incompatible |             # and consider it incompatible | ||||||
| @@ -198,24 +171,23 @@ class DocumentClassifier: | |||||||
|                     raise IncompatibleClassifierVersionError("sklearn version update") |                     raise IncompatibleClassifierVersionError("sklearn version update") | ||||||
|  |  | ||||||
|     def save(self) -> None: |     def save(self) -> None: | ||||||
|         import joblib |  | ||||||
|  |  | ||||||
|         target_file: Path = settings.MODEL_FILE |         target_file: Path = settings.MODEL_FILE | ||||||
|         target_file_temp: Path = target_file.with_suffix(".joblib.part") |         target_file_temp: Path = target_file.with_suffix(".pickle.part") | ||||||
|  |  | ||||||
|         state = { |         with target_file_temp.open("wb") as f: | ||||||
|             "format_version": self.FORMAT_VERSION, |             pickle.dump(self.FORMAT_VERSION, f) | ||||||
|             "last_doc_change_time": self.last_doc_change_time, |  | ||||||
|             "last_auto_type_hash": self.last_auto_type_hash, |  | ||||||
|             "data_vectorizer": self.data_vectorizer, |  | ||||||
|             "tags_binarizer": self.tags_binarizer, |  | ||||||
|             "tags_classifier": self.tags_classifier, |  | ||||||
|             "correspondent_classifier": self.correspondent_classifier, |  | ||||||
|             "document_type_classifier": self.document_type_classifier, |  | ||||||
|             "storage_path_classifier": self.storage_path_classifier, |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         joblib.dump(state, target_file_temp, compress=3) |             pickle.dump(self.last_doc_change_time, f) | ||||||
|  |             pickle.dump(self.last_auto_type_hash, f) | ||||||
|  |  | ||||||
|  |             pickle.dump(self.data_vectorizer, f) | ||||||
|  |  | ||||||
|  |             pickle.dump(self.tags_binarizer, f) | ||||||
|  |             pickle.dump(self.tags_classifier, f) | ||||||
|  |  | ||||||
|  |             pickle.dump(self.correspondent_classifier, f) | ||||||
|  |             pickle.dump(self.document_type_classifier, f) | ||||||
|  |             pickle.dump(self.storage_path_classifier, f) | ||||||
|  |  | ||||||
|         target_file_temp.rename(target_file) |         target_file_temp.rename(target_file) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -19,6 +19,8 @@ from documents.loggers import LoggingMixin | |||||||
| from documents.signals import document_consumer_declaration | from documents.signals import document_consumer_declaration | ||||||
| from documents.utils import copy_file_with_basic_stats | from documents.utils import copy_file_with_basic_stats | ||||||
| from documents.utils import run_subprocess | from documents.utils import run_subprocess | ||||||
|  | from paperless.config import OcrConfig | ||||||
|  | from paperless.utils import ocr_to_dateparser_languages | ||||||
|  |  | ||||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||||
|     import datetime |     import datetime | ||||||
| @@ -272,6 +274,11 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]: | |||||||
|         """ |         """ | ||||||
|         import dateparser |         import dateparser | ||||||
|  |  | ||||||
|  |         ocr_config = OcrConfig() | ||||||
|  |         languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages( | ||||||
|  |             ocr_config.language, | ||||||
|  |         ) | ||||||
|  |  | ||||||
|         return dateparser.parse( |         return dateparser.parse( | ||||||
|             ds, |             ds, | ||||||
|             settings={ |             settings={ | ||||||
| @@ -280,7 +287,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]: | |||||||
|                 "RETURN_AS_TIMEZONE_AWARE": True, |                 "RETURN_AS_TIMEZONE_AWARE": True, | ||||||
|                 "TIMEZONE": settings.TIME_ZONE, |                 "TIMEZONE": settings.TIME_ZONE, | ||||||
|             }, |             }, | ||||||
|             locales=settings.DATE_PARSER_LANGUAGES, |             locales=languages, | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|     def __filter(date: datetime.datetime) -> datetime.datetime | None: |     def __filter(date: datetime.datetime) -> datetime.datetime | None: | ||||||
|   | |||||||
| @@ -1,12 +1,14 @@ | |||||||
| import datetime | import datetime | ||||||
| from zoneinfo import ZoneInfo | from zoneinfo import ZoneInfo | ||||||
|  |  | ||||||
|  | import pytest | ||||||
| from pytest_django.fixtures import SettingsWrapper | from pytest_django.fixtures import SettingsWrapper | ||||||
|  |  | ||||||
| from documents.parsers import parse_date | from documents.parsers import parse_date | ||||||
| from documents.parsers import parse_date_generator | from documents.parsers import parse_date_generator | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @pytest.mark.django_db() | ||||||
| class TestDate: | class TestDate: | ||||||
|     def test_date_format_1(self): |     def test_date_format_1(self): | ||||||
|         text = "lorem ipsum 130218 lorem ipsum" |         text = "lorem ipsum 130218 lorem ipsum" | ||||||
| @@ -49,7 +51,7 @@ class TestDate: | |||||||
|         settings: SettingsWrapper, |         settings: SettingsWrapper, | ||||||
|         settings_timezone: ZoneInfo, |         settings_timezone: ZoneInfo, | ||||||
|     ): |     ): | ||||||
|         settings.DATE_PARSER_LANGUAGES = [] |         settings.DATE_PARSER_LANGUAGES = ["de"] | ||||||
|         text = "lorem ipsum\nMärz 2019\nlorem ipsum" |         text = "lorem ipsum\nMärz 2019\nlorem ipsum" | ||||||
|         date = parse_date("", text) |         date = parse_date("", text) | ||||||
|         assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone) |         assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone) | ||||||
|   | |||||||
| @@ -3,9 +3,7 @@ import logging | |||||||
| import os | import os | ||||||
| import platform | import platform | ||||||
| import re | import re | ||||||
| import resource |  | ||||||
| import tempfile | import tempfile | ||||||
| import time |  | ||||||
| import zipfile | import zipfile | ||||||
| from datetime import datetime | from datetime import datetime | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| @@ -192,33 +190,6 @@ if settings.AUDIT_LOG_ENABLED: | |||||||
|  |  | ||||||
| logger = logging.getLogger("paperless.api") | logger = logging.getLogger("paperless.api") | ||||||
|  |  | ||||||
| try: |  | ||||||
|     import psutil |  | ||||||
|  |  | ||||||
|     _PS = psutil.Process(os.getpid()) |  | ||||||
| except Exception: |  | ||||||
|     _PS = None |  | ||||||
|  |  | ||||||
| _diag_log = logging.getLogger("paperless") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _mem_mb(): |  | ||||||
|     rss = _PS.memory_info().rss if _PS else 0 |  | ||||||
|     peak_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss |  | ||||||
|     return rss / (1024 * 1024), peak_kb / 1024.0 |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _mark(phase, doc_id, t0): |  | ||||||
|     rss, peak = _mem_mb() |  | ||||||
|     _diag_log.debug( |  | ||||||
|         "sugg doc=%s phase=%s rss=%.1fMB peak=%.1fMB t=%.1fms", |  | ||||||
|         doc_id, |  | ||||||
|         phase, |  | ||||||
|         rss, |  | ||||||
|         peak, |  | ||||||
|         (time.perf_counter() - t0) * 1000, |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class IndexView(TemplateView): | class IndexView(TemplateView): | ||||||
|     template_name = "index.html" |     template_name = "index.html" | ||||||
| @@ -787,16 +758,7 @@ class DocumentViewSet( | |||||||
|         ), |         ), | ||||||
|     ) |     ) | ||||||
|     def suggestions(self, request, pk=None): |     def suggestions(self, request, pk=None): | ||||||
|         t0 = time.perf_counter() |         doc = get_object_or_404(Document.objects.select_related("owner"), pk=pk) | ||||||
|         # Don't fetch content here |  | ||||||
|         doc = get_object_or_404( |  | ||||||
|             Document.objects.select_related("owner").only( |  | ||||||
|                 "id", |  | ||||||
|                 "owner_id", |  | ||||||
|             ), |  | ||||||
|             pk=pk, |  | ||||||
|         ) |  | ||||||
|         _mark("start", doc.pk, t0) |  | ||||||
|         if request.user is not None and not has_perms_owner_aware( |         if request.user is not None and not has_perms_owner_aware( | ||||||
|             request.user, |             request.user, | ||||||
|             "view_document", |             "view_document", | ||||||
| @@ -807,23 +769,18 @@ class DocumentViewSet( | |||||||
|         document_suggestions = get_suggestion_cache(doc.pk) |         document_suggestions = get_suggestion_cache(doc.pk) | ||||||
|  |  | ||||||
|         if document_suggestions is not None: |         if document_suggestions is not None: | ||||||
|             _mark("cache_hit_return", doc.pk, t0) |  | ||||||
|             refresh_suggestions_cache(doc.pk) |             refresh_suggestions_cache(doc.pk) | ||||||
|             return Response(document_suggestions.suggestions) |             return Response(document_suggestions.suggestions) | ||||||
|  |  | ||||||
|         classifier = load_classifier() |         classifier = load_classifier() | ||||||
|         _mark("loaded_classifier", doc.pk, t0) |  | ||||||
|  |  | ||||||
|         dates = [] |         dates = [] | ||||||
|         if settings.NUMBER_OF_SUGGESTED_DATES > 0: |         if settings.NUMBER_OF_SUGGESTED_DATES > 0: | ||||||
|             gen = parse_date_generator(doc.filename, doc.content) |             gen = parse_date_generator(doc.filename, doc.content) | ||||||
|             _mark("before_dates", doc.pk, t0) |  | ||||||
|             dates = sorted( |             dates = sorted( | ||||||
|                 {i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)}, |                 {i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)}, | ||||||
|             ) |             ) | ||||||
|             _mark("after_dates", doc.pk, t0) |  | ||||||
|  |  | ||||||
|         _mark("before_match", doc.pk, t0) |  | ||||||
|         resp_data = { |         resp_data = { | ||||||
|             "correspondents": [ |             "correspondents": [ | ||||||
|                 c.id for c in match_correspondents(doc, classifier, request.user) |                 c.id for c in match_correspondents(doc, classifier, request.user) | ||||||
| @@ -837,11 +794,9 @@ class DocumentViewSet( | |||||||
|             ], |             ], | ||||||
|             "dates": [date.strftime("%Y-%m-%d") for date in dates if date is not None], |             "dates": [date.strftime("%Y-%m-%d") for date in dates if date is not None], | ||||||
|         } |         } | ||||||
|         _mark("assembled_resp", doc.pk, t0) |  | ||||||
|  |  | ||||||
|         # Cache the suggestions and the classifier hash for later |         # Cache the suggestions and the classifier hash for later | ||||||
|         set_suggestions_cache(doc.pk, resp_data, classifier) |         set_suggestions_cache(doc.pk, resp_data, classifier) | ||||||
|         _mark("cached", doc.pk, t0) |  | ||||||
|  |  | ||||||
|         return Response(resp_data) |         return Response(resp_data) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -2,7 +2,7 @@ msgid "" | |||||||
| msgstr "" | msgstr "" | ||||||
| "Project-Id-Version: paperless-ngx\n" | "Project-Id-Version: paperless-ngx\n" | ||||||
| "Report-Msgid-Bugs-To: \n" | "Report-Msgid-Bugs-To: \n" | ||||||
| "POT-Creation-Date: 2025-08-16 14:34+0000\n" | "POT-Creation-Date: 2025-08-31 22:24+0000\n" | ||||||
| "PO-Revision-Date: 2022-02-17 04:17\n" | "PO-Revision-Date: 2022-02-17 04:17\n" | ||||||
| "Last-Translator: \n" | "Last-Translator: \n" | ||||||
| "Language-Team: English\n" | "Language-Team: English\n" | ||||||
| @@ -1645,147 +1645,147 @@ msgstr "" | |||||||
| msgid "paperless application settings" | msgid "paperless application settings" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:774 | #: paperless/settings.py:772 | ||||||
| msgid "English (US)" | msgid "English (US)" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:775 | #: paperless/settings.py:773 | ||||||
| msgid "Arabic" | msgid "Arabic" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:776 | #: paperless/settings.py:774 | ||||||
| msgid "Afrikaans" | msgid "Afrikaans" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:777 | #: paperless/settings.py:775 | ||||||
| msgid "Belarusian" | msgid "Belarusian" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:778 | #: paperless/settings.py:776 | ||||||
| msgid "Bulgarian" | msgid "Bulgarian" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:779 | #: paperless/settings.py:777 | ||||||
| msgid "Catalan" | msgid "Catalan" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:780 | #: paperless/settings.py:778 | ||||||
| msgid "Czech" | msgid "Czech" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:781 | #: paperless/settings.py:779 | ||||||
| msgid "Danish" | msgid "Danish" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:782 | #: paperless/settings.py:780 | ||||||
| msgid "German" | msgid "German" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:783 | #: paperless/settings.py:781 | ||||||
| msgid "Greek" | msgid "Greek" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:784 | #: paperless/settings.py:782 | ||||||
| msgid "English (GB)" | msgid "English (GB)" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:785 | #: paperless/settings.py:783 | ||||||
| msgid "Spanish" | msgid "Spanish" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:786 | #: paperless/settings.py:784 | ||||||
| msgid "Persian" | msgid "Persian" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:787 | #: paperless/settings.py:785 | ||||||
| msgid "Finnish" | msgid "Finnish" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:788 | #: paperless/settings.py:786 | ||||||
| msgid "French" | msgid "French" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:789 | #: paperless/settings.py:787 | ||||||
| msgid "Hungarian" | msgid "Hungarian" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:790 | #: paperless/settings.py:788 | ||||||
| msgid "Italian" | msgid "Italian" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:791 | #: paperless/settings.py:789 | ||||||
| msgid "Japanese" | msgid "Japanese" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:792 | #: paperless/settings.py:790 | ||||||
| msgid "Korean" | msgid "Korean" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:793 | #: paperless/settings.py:791 | ||||||
| msgid "Luxembourgish" | msgid "Luxembourgish" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:794 | #: paperless/settings.py:792 | ||||||
| msgid "Norwegian" | msgid "Norwegian" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:795 | #: paperless/settings.py:793 | ||||||
| msgid "Dutch" | msgid "Dutch" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:796 | #: paperless/settings.py:794 | ||||||
| msgid "Polish" | msgid "Polish" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:797 | #: paperless/settings.py:795 | ||||||
| msgid "Portuguese (Brazil)" | msgid "Portuguese (Brazil)" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:798 | #: paperless/settings.py:796 | ||||||
| msgid "Portuguese" | msgid "Portuguese" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:799 | #: paperless/settings.py:797 | ||||||
| msgid "Romanian" | msgid "Romanian" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:800 | #: paperless/settings.py:798 | ||||||
| msgid "Russian" | msgid "Russian" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:801 | #: paperless/settings.py:799 | ||||||
| msgid "Slovak" | msgid "Slovak" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:802 | #: paperless/settings.py:800 | ||||||
| msgid "Slovenian" | msgid "Slovenian" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:803 | #: paperless/settings.py:801 | ||||||
| msgid "Serbian" | msgid "Serbian" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:804 | #: paperless/settings.py:802 | ||||||
| msgid "Swedish" | msgid "Swedish" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:805 | #: paperless/settings.py:803 | ||||||
| msgid "Turkish" | msgid "Turkish" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:806 | #: paperless/settings.py:804 | ||||||
| msgid "Ukrainian" | msgid "Ukrainian" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:807 | #: paperless/settings.py:805 | ||||||
| msgid "Vietnamese" | msgid "Vietnamese" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:808 | #: paperless/settings.py:806 | ||||||
| msgid "Chinese Simplified" | msgid "Chinese Simplified" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
| #: paperless/settings.py:809 | #: paperless/settings.py:807 | ||||||
| msgid "Chinese Traditional" | msgid "Chinese Traditional" | ||||||
| msgstr "" | msgstr "" | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,14 +1,7 @@ | |||||||
| import logging |  | ||||||
| import os |  | ||||||
| import resource |  | ||||||
| import time |  | ||||||
|  |  | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
|  |  | ||||||
| from paperless import version | from paperless import version | ||||||
|  |  | ||||||
| logger = logging.getLogger("middleware") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class ApiVersionMiddleware: | class ApiVersionMiddleware: | ||||||
|     def __init__(self, get_response): |     def __init__(self, get_response): | ||||||
| @@ -22,56 +15,3 @@ class ApiVersionMiddleware: | |||||||
|             response["X-Version"] = version.__full_version_str__ |             response["X-Version"] = version.__full_version_str__ | ||||||
|  |  | ||||||
|         return response |         return response | ||||||
|  |  | ||||||
|  |  | ||||||
| try: |  | ||||||
|     import psutil |  | ||||||
|  |  | ||||||
|     _PSUTIL = True |  | ||||||
| except Exception: |  | ||||||
|     _PSUTIL = False |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class MemLogMiddleware: |  | ||||||
|     def __init__(self, get_response): |  | ||||||
|         self.get_response = get_response |  | ||||||
|  |  | ||||||
|     def __call__(self, request): |  | ||||||
|         # capture baseline |  | ||||||
|         ru_before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss |  | ||||||
|         if _PSUTIL: |  | ||||||
|             p = psutil.Process() |  | ||||||
|             rss_before = p.memory_info().rss |  | ||||||
|         else: |  | ||||||
|             rss_before = 0 |  | ||||||
|  |  | ||||||
|         t0 = time.perf_counter() |  | ||||||
|         try: |  | ||||||
|             return self.get_response(request) |  | ||||||
|         finally: |  | ||||||
|             dur_ms = (time.perf_counter() - t0) * 1000.0 |  | ||||||
|  |  | ||||||
|             ru_after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss |  | ||||||
|             # ru_maxrss is KB on Linux; convert to MB |  | ||||||
|             peak_mb = (ru_after) / 1024.0 |  | ||||||
|             peak_delta_mb = (ru_after - ru_before) / 1024.0 |  | ||||||
|  |  | ||||||
|             if _PSUTIL: |  | ||||||
|                 rss_after = p.memory_info().rss |  | ||||||
|                 delta_mb = (rss_after - rss_before) / (1024 * 1024) |  | ||||||
|                 rss_mb = rss_after / (1024 * 1024) |  | ||||||
|             else: |  | ||||||
|                 delta_mb = 0.0 |  | ||||||
|                 rss_mb = 0.0 |  | ||||||
|  |  | ||||||
|             logger.debug( |  | ||||||
|                 "pid=%s mem rss=%.1fMB Δend=%.1fMB peak=%.1fMB Δpeak=%.1fMB dur=%.1fms %s %s", |  | ||||||
|                 os.getpid(), |  | ||||||
|                 rss_mb, |  | ||||||
|                 delta_mb, |  | ||||||
|                 peak_mb, |  | ||||||
|                 peak_delta_mb, |  | ||||||
|                 dur_ms, |  | ||||||
|                 request.method, |  | ||||||
|                 request.path, |  | ||||||
|             ) |  | ||||||
|   | |||||||
| @@ -17,8 +17,6 @@ from dateparser.languages.loader import LocaleDataLoader | |||||||
| from django.utils.translation import gettext_lazy as _ | from django.utils.translation import gettext_lazy as _ | ||||||
| from dotenv import load_dotenv | from dotenv import load_dotenv | ||||||
|  |  | ||||||
| from paperless.utils import ocr_to_dateparser_languages |  | ||||||
|  |  | ||||||
| logger = logging.getLogger("paperless.settings") | logger = logging.getLogger("paperless.settings") | ||||||
|  |  | ||||||
| # Tap paperless.conf if it's available | # Tap paperless.conf if it's available | ||||||
| @@ -363,7 +361,6 @@ if DEBUG: | |||||||
|     ) |     ) | ||||||
|  |  | ||||||
| MIDDLEWARE = [ | MIDDLEWARE = [ | ||||||
|     "paperless.middleware.MemLogMiddleware", |  | ||||||
|     "django.middleware.security.SecurityMiddleware", |     "django.middleware.security.SecurityMiddleware", | ||||||
|     "whitenoise.middleware.WhiteNoiseMiddleware", |     "whitenoise.middleware.WhiteNoiseMiddleware", | ||||||
|     "django.contrib.sessions.middleware.SessionMiddleware", |     "django.contrib.sessions.middleware.SessionMiddleware", | ||||||
| @@ -834,7 +831,7 @@ LOGGING = { | |||||||
|     "disable_existing_loggers": False, |     "disable_existing_loggers": False, | ||||||
|     "formatters": { |     "formatters": { | ||||||
|         "verbose": { |         "verbose": { | ||||||
|             "format": "[{asctime}] [{levelname}] [{name}] pid={process} {message}", |             "format": "[{asctime}] [{levelname}] [{name}] {message}", | ||||||
|             "style": "{", |             "style": "{", | ||||||
|         }, |         }, | ||||||
|         "simple": { |         "simple": { | ||||||
| @@ -879,7 +876,6 @@ LOGGING = { | |||||||
|         "kombu": {"handlers": ["file_celery"], "level": "DEBUG"}, |         "kombu": {"handlers": ["file_celery"], "level": "DEBUG"}, | ||||||
|         "_granian": {"handlers": ["file_paperless"], "level": "DEBUG"}, |         "_granian": {"handlers": ["file_paperless"], "level": "DEBUG"}, | ||||||
|         "granian.access": {"handlers": ["file_paperless"], "level": "DEBUG"}, |         "granian.access": {"handlers": ["file_paperless"], "level": "DEBUG"}, | ||||||
|         "middleware": {"handlers": ["console"], "level": "DEBUG"}, |  | ||||||
|     }, |     }, | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -1186,61 +1182,6 @@ DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY") | |||||||
| FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") | FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") | ||||||
|  |  | ||||||
|  |  | ||||||
def _ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
    """
    Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")
    into a list of locales compatible with the `dateparser` library.

    - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").
    Falls back to the base language (e.g., "az") if needed.
    - If a language cannot be mapped or validated, it is skipped with a warning.
    - Returns a list of valid locales, or an empty list if none could be converted.
    """
    language_map = ocr_to_dateparser_languages()
    loader = LocaleDataLoader()
    locales: list[str] = []
    try:
        for code in ocr_languages.split("+"):
            # Separate the base language from an optional script suffix.
            ocr_lang_part, *script_parts = code.split("_")
            ocr_script_part = script_parts[0] if script_parts else None

            language_part = language_map.get(ocr_lang_part)
            if language_part is None:
                logger.debug(
                    f'Unable to map OCR language "{ocr_lang_part}" to dateparser locale. ',
                )
                continue

            # Raises (caught below) if the base language is unknown to dateparser.
            loader.get_locale_map(locales=[language_part])

            dateparser_language = language_part
            if ocr_script_part:
                # Prefer the script-qualified locale when dateparser supports it.
                candidate = f"{language_part}-{ocr_script_part.title()}"
                try:
                    loader.get_locale_map(locales=[candidate])
                    dateparser_language = candidate
                except Exception:
                    logger.info(
                        f"Language variant '{candidate}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
                    )
            if dateparser_language not in locales:
                locales.append(dateparser_language)
    except Exception as e:
        logger.warning(
            f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
        )
        return []
    if not locales:
        logger.info(
            "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
        )
    return locales
|  |  | ||||||
|  |  | ||||||
| def _parse_dateparser_languages(languages: str | None): | def _parse_dateparser_languages(languages: str | None): | ||||||
|     language_list = languages.split("+") if languages else [] |     language_list = languages.split("+") if languages else [] | ||||||
|     # There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib. |     # There is an unfixed issue in zh-Hant and zh-Hans locales in the dateparser lib. | ||||||
| @@ -1255,12 +1196,14 @@ def _parse_dateparser_languages(languages: str | None): | |||||||
|     return list(LocaleDataLoader().get_locale_map(locales=language_list)) |     return list(LocaleDataLoader().get_locale_map(locales=language_list)) | ||||||
|  |  | ||||||
|  |  | ||||||
# If not set, we will infer it at runtime
_date_parser_languages_env = os.getenv("PAPERLESS_DATE_PARSER_LANGUAGES")
DATE_PARSER_LANGUAGES = (
    _parse_dateparser_languages(_date_parser_languages_env)
    if _date_parser_languages_env
    else None
)
|  |  | ||||||
|  |  | ||||||
| # Maximum number of dates taken from document start to end to show as suggestions for | # Maximum number of dates taken from document start to end to show as suggestions for | ||||||
|   | |||||||
| @@ -6,7 +6,6 @@ from unittest import mock | |||||||
| import pytest | import pytest | ||||||
| from celery.schedules import crontab | from celery.schedules import crontab | ||||||
|  |  | ||||||
| from paperless.settings import _ocr_to_dateparser_languages |  | ||||||
| from paperless.settings import _parse_base_paths | from paperless.settings import _parse_base_paths | ||||||
| from paperless.settings import _parse_beat_schedule | from paperless.settings import _parse_beat_schedule | ||||||
| from paperless.settings import _parse_dateparser_languages | from paperless.settings import _parse_dateparser_languages | ||||||
| @@ -476,33 +475,6 @@ class TestPathSettings(TestCase): | |||||||
|         self.assertEqual("/foobar/", base_paths[4])  # LOGOUT_REDIRECT_URL |         self.assertEqual("/foobar/", base_paths[4])  # LOGOUT_REDIRECT_URL | ||||||
|  |  | ||||||
|  |  | ||||||
@pytest.mark.parametrize(
    ("ocr_language", "expected"),
    [
        # Single language
        ("eng", ["en"]),
        # Several languages joined with "+"
        ("fra+ita+lao", ["fr", "it", "lo"]),
        # Languages with no two-letter code keep their three-letter form
        ("fil", ["fil"]),
        # Script suffixes that dateparser understands are kept
        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
        # Script suffixes dateparser rejects fall back to the bare language
        ("deu_frak", ["de"]),
        # Traditional and simplified chinese don't have the same name in dateparser,
        # so they're converted to the general chinese language
        ("chi_tra+chi_sim", ["zh"]),
        # Unknown entries are dropped, keeping the mappable ones
        ("eng+unsupported_language+por", ["en", "pt"]),
        # Nothing mappable yields an empty result (caller falls back to default)
        ("unsupported1+unsupported2", []),
    ],
)
def test_ocr_to_dateparser_languages(ocr_language, expected):
    result = _ocr_to_dateparser_languages(ocr_language)
    assert sorted(result) == sorted(expected)
|  |  | ||||||
|  |  | ||||||
| @pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||||
|     ("languages", "expected"), |     ("languages", "expected"), | ||||||
|     [ |     [ | ||||||
|   | |||||||
							
								
								
									
										52
									
								
								src/paperless/tests/test_utils.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								src/paperless/tests/test_utils.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,52 @@ | |||||||
|  | import logging | ||||||
|  |  | ||||||
|  | import pytest | ||||||
|  |  | ||||||
|  | from paperless import utils | ||||||
|  | from paperless.utils import ocr_to_dateparser_languages | ||||||
|  |  | ||||||
|  |  | ||||||
@pytest.mark.parametrize(
    ("ocr_language", "expected"),
    [
        # Single language
        ("eng", ["en"]),
        # Several languages joined with "+"
        ("fra+ita+lao", ["fr", "it", "lo"]),
        # Languages with no two-letter code keep their three-letter form
        ("fil", ["fil"]),
        # Script suffixes that dateparser understands are kept
        ("aze_cyrl+srp_latn", ["az-Cyrl", "sr-Latn"]),
        # Script suffixes dateparser rejects fall back to the bare language
        ("deu_frak", ["de"]),
        # Traditional and simplified chinese don't have the same name in dateparser,
        # so they're converted to the general chinese language
        ("chi_tra+chi_sim", ["zh"]),
        # Unknown entries are dropped, keeping the mappable ones
        ("eng+unsupported_language+por", ["en", "pt"]),
        # Nothing mappable yields an empty result (caller falls back to default)
        ("unsupported1+unsupported2", []),
        # Repeated languages are deduplicated
        ("eng+eng", ["en"]),
        # Unmapped script suffixes fall back to the bare language
        ("ita_unknownscript", ["it"]),
    ],
)
def test_ocr_to_dateparser_languages(ocr_language, expected):
    result = ocr_to_dateparser_languages(ocr_language)
    assert sorted(result) == sorted(expected)
|  |  | ||||||
|  |  | ||||||
def test_ocr_to_dateparser_languages_exception(monkeypatch, caplog):
    """A failing locale loader must be swallowed: warn and return an empty list."""

    # Stand-in for LocaleDataLoader whose lookup always fails.
    class ExplodingLoader:
        def get_locale_map(self, locales=None):
            raise RuntimeError("Simulated error")

    monkeypatch.setattr(utils, "LocaleDataLoader", lambda: ExplodingLoader())
    with caplog.at_level(logging.WARNING):
        result = utils.ocr_to_dateparser_languages("eng+fra")
    assert result == []
    assert (
        "Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this" in caplog.text
    )
| @@ -1,4 +1,10 @@ | |||||||
| def ocr_to_dateparser_languages() -> dict[str, str]: | import logging | ||||||
|  |  | ||||||
|  | from dateparser.languages.loader import LocaleDataLoader | ||||||
|  |  | ||||||
|  | logger = logging.getLogger("paperless.utils") | ||||||
|  |  | ||||||
# Translation map from languages supported by Tesseract OCR
# to languages supported by dateparser.
# Tesseract languages with no dateparser equivalent are omitted, e.g.:
# agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
# ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
# rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
OCR_TO_DATEPARSER_LANGUAGES = {
    "afr": "af", "amh": "am", "ara": "ar", "asm": "as",
    "ast": "ast", "aze": "az", "bel": "be", "bul": "bg",
    "ben": "bn", "bod": "bo", "bre": "br", "bos": "bs",
    "cat": "ca", "cher": "chr", "ces": "cs", "cym": "cy",
    "dan": "da", "deu": "de", "dzo": "dz", "ell": "el",
    "eng": "en", "epo": "eo", "spa": "es", "est": "et",
    "eus": "eu", "fas": "fa", "fin": "fi", "fil": "fil",
    "fao": "fo",  # codespell:ignore
    "fra": "fr", "fry": "fy", "gle": "ga", "gla": "gd",
    "glg": "gl", "guj": "gu", "heb": "he", "hin": "hi",
    "hrv": "hr", "hun": "hu", "hye": "hy", "ind": "id",
    "isl": "is", "ita": "it", "jpn": "ja", "kat": "ka",
    "kaz": "kk", "khm": "km", "knda": "kn", "kor": "ko",
    "kir": "ky", "ltz": "lb", "lao": "lo", "lit": "lt",
    "lav": "lv", "mal": "ml", "mon": "mn", "mar": "mr",
    "msa": "ms", "mlt": "mt", "mya": "my", "nep": "ne",
    "nld": "nl", "ori": "or", "pan": "pa", "pol": "pl",
    "pus": "ps", "por": "pt", "que": "qu", "ron": "ro",
    "rus": "ru", "sin": "si", "slk": "sk", "slv": "sl",
    "sqi": "sq", "srp": "sr", "swe": "sv", "swa": "sw",
    "tam": "ta",
    "tel": "te",  # codespell:ignore
    "tha": "th",  # codespell:ignore
    "tir": "ti", "tgl": "tl", "ton": "to", "tur": "tr",
    "uig": "ug", "ukr": "uk", "urd": "ur", "uzb": "uz",
    "via": "vi", "yid": "yi", "yor": "yo", "chi": "zh",
}
|  |  | ||||||
def ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
    """
    Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")
    into a list of locales compatible with the `dateparser` library.

    - If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").
    Falls back to the base language (e.g., "az") if needed.
    - If a language cannot be mapped or validated, it is skipped with a warning.
    - Returns a list of valid locales, or an empty list if none could be converted.
    """
    loader = LocaleDataLoader()
    locales: list[str] = []
    try:
        for code in ocr_languages.split("+"):
            # Separate the base language from an optional script suffix.
            ocr_lang_part, *script_parts = code.split("_")
            ocr_script_part = script_parts[0] if script_parts else None

            language_part = OCR_TO_DATEPARSER_LANGUAGES.get(ocr_lang_part)
            if language_part is None:
                logger.debug(
                    f'Unable to map OCR language "{ocr_lang_part}" to dateparser locale. ',
                )
                continue

            # Raises (caught below) if the base language is unknown to dateparser.
            loader.get_locale_map(locales=[language_part])

            dateparser_language = language_part
            if ocr_script_part:
                # Prefer the script-qualified locale when dateparser supports it.
                candidate = f"{language_part}-{ocr_script_part.title()}"
                try:
                    loader.get_locale_map(locales=[candidate])
                    dateparser_language = candidate
                except Exception:
                    logger.info(
                        f"Language variant '{candidate}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
                    )
            if dateparser_language not in locales:
                locales.append(dateparser_language)
    except Exception as e:
        logger.warning(
            f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",
        )
        return []
    if not locales:
        logger.info(
            "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
        )
    return locales
|   | |||||||
							
								
								
									
										15
									
								
								uv.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										15
									
								
								uv.lock
									
									
									
										generated
									
									
									
								
							| @@ -2046,7 +2046,6 @@ dependencies = [ | |||||||
|     { name = "ocrmypdf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, |     { name = "ocrmypdf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||||
|     { name = "pathvalidate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, |     { name = "pathvalidate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||||
|     { name = "pdf2image", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, |     { name = "pdf2image", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||||
|     { name = "psutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, |  | ||||||
|     { name = "psycopg-pool", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, |     { name = "psycopg-pool", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||||
|     { name = "python-dateutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, |     { name = "python-dateutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||||
|     { name = "python-dotenv", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, |     { name = "python-dotenv", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, | ||||||
| @@ -2183,7 +2182,6 @@ requires-dist = [ | |||||||
|     { name = "ocrmypdf", specifier = "~=16.10.0" }, |     { name = "ocrmypdf", specifier = "~=16.10.0" }, | ||||||
|     { name = "pathvalidate", specifier = "~=3.3.1" }, |     { name = "pathvalidate", specifier = "~=3.3.1" }, | ||||||
|     { name = "pdf2image", specifier = "~=1.17.0" }, |     { name = "pdf2image", specifier = "~=1.17.0" }, | ||||||
|     { name = "psutil", specifier = ">=7.0.0" }, |  | ||||||
|     { name = "psycopg", extras = ["c", "pool"], marker = "extra == 'postgres'", specifier = "==3.2.9" }, |     { name = "psycopg", extras = ["c", "pool"], marker = "extra == 'postgres'", specifier = "==3.2.9" }, | ||||||
|     { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_aarch64.whl" }, |     { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_aarch64.whl" }, | ||||||
|     { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_x86_64.whl" }, |     { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_x86_64.whl" }, | ||||||
| @@ -2550,19 +2548,6 @@ wheels = [ | |||||||
|     { url = "https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198", size = 387816, upload-time = "2025-01-20T15:55:29.98Z" }, |     { url = "https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198", size = 387816, upload-time = "2025-01-20T15:55:29.98Z" }, | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "psutil" |  | ||||||
| version = "7.0.0" |  | ||||||
| source = { registry = "https://pypi.org/simple" } |  | ||||||
| sdist = { url = "https://files.pythonhosted.org/packages/2a/80/336820c1ad9286a4ded7e845b2eccfcb27851ab8ac6abece774a6ff4d3de/psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456", size = 497003, upload-time = "2025-02-13T21:54:07.946Z" } |  | ||||||
| wheels = [ |  | ||||||
|     { url = "https://files.pythonhosted.org/packages/ed/e6/2d26234410f8b8abdbf891c9da62bee396583f713fb9f3325a4760875d22/psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25", size = 238051, upload-time = "2025-02-13T21:54:12.36Z" }, |  | ||||||
|     { url = "https://files.pythonhosted.org/packages/04/8b/30f930733afe425e3cbfc0e1468a30a18942350c1a8816acfade80c005c4/psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da", size = 239535, upload-time = "2025-02-13T21:54:16.07Z" }, |  | ||||||
|     { url = "https://files.pythonhosted.org/packages/2a/ed/d362e84620dd22876b55389248e522338ed1bf134a5edd3b8231d7207f6d/psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91", size = 275004, upload-time = "2025-02-13T21:54:18.662Z" }, |  | ||||||
|     { url = "https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34", size = 277986, upload-time = "2025-02-13T21:54:21.811Z" }, |  | ||||||
|     { url = "https://files.pythonhosted.org/packages/eb/a2/709e0fe2f093556c17fbafda93ac032257242cabcc7ff3369e2cb76a97aa/psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993", size = 279544, upload-time = "2025-02-13T21:54:24.68Z" }, |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "psycopg" | name = "psycopg" | ||||||
| version = "3.2.9" | version = "3.2.9" | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user