From 6965165c76e501c09366850fb67f03d56724282f Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Sun, 14 Jul 2024 16:35:16 -0700
Subject: [PATCH] Removes Turkish from the NLTK languages (#7246)

---
 src/paperless/settings.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index fe7caebb8..000904aef 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -1126,6 +1126,10 @@ def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
 
     NLTK Languages:
       - https://www.nltk.org/api/nltk.stem.snowball.html#nltk.stem.snowball.SnowballStemmer
+      - https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip
+      - https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
+
+    Only languages supported by all three of the resources above are mapped here
 
     """
     ocr_lang = ocr_lang.split("+")[0]
@@ -1142,7 +1146,6 @@ def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
         "rus": "russian",
         "spa": "spanish",
         "swe": "swedish",
-        "tur": "turkish",
     }
 
     return iso_code_to_nltk.get(ocr_lang)