From 6965165c76e501c09366850fb67f03d56724282f Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Sun, 14 Jul 2024 16:35:16 -0700 Subject: [PATCH] Removes Turkish from the NLTK languages (#7246) --- src/paperless/settings.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index fe7caebb8..000904aef 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -1126,6 +1126,10 @@ def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]: NLTK Languages: - https://www.nltk.org/api/nltk.stem.snowball.html#nltk.stem.snowball.SnowballStemmer + - https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip + - https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip + + The common intersection between all languages in those 3 is handled here """ ocr_lang = ocr_lang.split("+")[0] @@ -1142,7 +1146,6 @@ def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]: "rus": "russian", "spa": "spanish", "swe": "swedish", - "tur": "turkish", } return iso_code_to_nltk.get(ocr_lang)