Allow configuring the NLTK processing language (tokenizer and stop words) via settings.NLTK_LANGUAGE

2026-02-03 23:22:42 -06:00 · 2022-09-16 13:57:37 -07:00
parent 70b1988a55
commit 0bc13c2a72
2 changed files with 4 additions and 3 deletions
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -320,10 +320,9 @@ class DocumentClassifier:
        # Get only the letters (remove punctuation too)
        content = re.sub(r"[^\w\s]", " ", content)
        # Tokenize
-        # TODO configurable language
-        words: List[str] = word_tokenize(content, language="english")
+        words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
        # Remove stop words
-        stops = set(stopwords.words("english"))
+        stops = set(stopwords.words(settings.NLTK_LANGUAGE))
        meaningful_words = [w for w in words if w not in stops]
        # Stem words
        meaningful_words = [self.stemmer.stem(w) for w in meaningful_words]