Changes the NLTK language to be based on the Tesseract OCR language, with fallback to the default processing

2025-12-24 02:05:48 -06:00 · 2022-09-18 08:48:26 -07:00
parent 0bc13c2a72
commit a78d44ec5f
2 changed files with 61 additions and 23 deletions
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -709,4 +709,30 @@ ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default")
 if ENABLE_UPDATE_CHECK != "default":
    ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK")

-NLTK_LANGUAGE = os.getenv("PAPERLESS_NLTK_LANG", "english").lower()
+
+def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
+    """
+    Maps a three-letter ISO 639-2 language code, as used by Tesseract, to
+    an optional NLTK language name.  Only languages supported by all of the
+    NLTK data used are mapped; any other code yields None.
+    """
+    iso_code_to_nltk = {
+        "dan": "danish",
+        "nld": "dutch",
+        "eng": "english",
+        "fin": "finnish",
+        "fra": "french",
+        "deu": "german",
+        "ita": "italian",
+        "nor": "norwegian",
+        "por": "portuguese",
+        "rus": "russian",
+        "spa": "spanish",
+        "swe": "swedish",
+        "tur": "turkish",
+    }
+
+    return iso_code_to_nltk.get(ocr_lang, None)
+
+
+NLTK_LANGUAGE: Optional[str] = _get_nltk_language_setting(OCR_LANGUAGE)