Allows configuration of the NLTK processing language

This commit is contained in:
Trenton H 2022-09-16 13:57:37 -07:00
parent 6523cf0c4b
commit d10d2f5a54
2 changed files with 4 additions and 3 deletions

View File

@@ -320,10 +320,9 @@ class DocumentClassifier:
# Get only the letters (remove punctuation too)
content = re.sub(r"[^\w\s]", " ", content)
# Tokenize
-# TODO configurable language
-words: List[str] = word_tokenize(content, language="english")
+words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
# Remove stop words
-stops = set(stopwords.words("english"))
+stops = set(stopwords.words(settings.NLTK_LANGUAGE))
meaningful_words = [w for w in words if w not in stops]
# Stem words
meaningful_words = [self.stemmer.stem(w) for w in meaningful_words]

View File

@@ -708,3 +708,5 @@ if os.getenv("PAPERLESS_IGNORE_DATES") is not None:
ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default")
if ENABLE_UPDATE_CHECK != "default":
ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK")
NLTK_LANGUAGE = os.getenv("PAPERLESS_NLTK_LANG", "english").lower()