Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-04-09 09:58:20 -05:00)
Removes Turkish from the NLTK languages (#7246)
This commit is contained in:
Parent: 73d33ff25a
Commit: 6965165c76
@ -1126,6 +1126,10 @@ def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
|
|||||||
|
|
||||||
NLTK Languages:
|
NLTK Languages:
|
||||||
- https://www.nltk.org/api/nltk.stem.snowball.html#nltk.stem.snowball.SnowballStemmer
|
- https://www.nltk.org/api/nltk.stem.snowball.html#nltk.stem.snowball.SnowballStemmer
|
||||||
|
- https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip
|
||||||
|
- https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
|
||||||
|
|
||||||
|
The common intersection between all languages in those 3 is handled here
|
||||||
|
|
||||||
"""
|
"""
|
||||||
ocr_lang = ocr_lang.split("+")[0]
|
ocr_lang = ocr_lang.split("+")[0]
|
||||||
@ -1142,7 +1146,6 @@ def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
|
|||||||
"rus": "russian",
|
"rus": "russian",
|
||||||
"spa": "spanish",
|
"spa": "spanish",
|
||||||
"swe": "swedish",
|
"swe": "swedish",
|
||||||
"tur": "turkish",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return iso_code_to_nltk.get(ocr_lang)
|
return iso_code_to_nltk.get(ocr_lang)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user