Changes the NLTK language to be based on the Tesseract OCR language, with fallback to the default processing

This commit is contained in:
Trenton Holmes
2022-09-18 08:48:26 -07:00
committed by Trenton H
parent 0bc13c2a72
commit a78d44ec5f
2 changed files with 61 additions and 23 deletions

View File

@@ -709,4 +709,30 @@ ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default")
if ENABLE_UPDATE_CHECK != "default":
ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK")
NLTK_LANGUAGE = os.getenv("PAPERLESS_NLTK_LANG", "english").lower()
def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
"""
Maps an ISO-639-1 language code supported by Tesseract into
an optional NLTK language name. This is the set of common supported
languages for all the NLTK data used.
"""
iso_code_to_nltk = {
"dan": "danish",
"nld": "dutch",
"eng": "english",
"fin": "finnish",
"fra": "french",
"deu": "german",
"ita": "italian",
"nor": "norwegian",
"por": "portuguese",
"rus": "russian",
"spa": "spanish",
"swe": "swedish",
"tur": "turkish",
}
return iso_code_to_nltk.get(ocr_lang, None)
# NLTK language derived from the configured OCR language; None when the
# OCR language has no corresponding NLTK language name.
NLTK_LANGUAGE: Optional[str] = _get_nltk_language_setting(OCR_LANGUAGE)