# Patch summary: make the NLTK language used by the document classifier
# configurable instead of hard-coding "english".
#
# src/documents/classifier.py (hunk inside class DocumentClassifier):
#   - word_tokenize() and stopwords.words() now receive
#     settings.NLTK_LANGUAGE instead of the literal "english"; the
#     "TODO configurable language" comment is removed.
#   - NOTE(review): this patch adds no import for `settings` in
#     classifier.py; presumably the module already has it in scope -- confirm.
#
# src/paperless/settings.py:
#   - Adds NLTK_LANGUAGE, read from the PAPERLESS_NLTK_LANG environment
#     variable, falling back to "english" when the variable is unset, and
#     lower-cased.
#   - NOTE(review): the value is not validated in these hunks. os.getenv()
#     applies its default only when the variable is unset, so a set-but-empty
#     or unsupported value is handed to NLTK as-is; presumably that surfaces
#     as an NLTK lookup error when a document is tokenized -- confirm, and
#     consider validating the value here.
diff --git a/src/documents/classifier.py b/src/documents/classifier.py
index 72a01dbee..5711c34af 100644
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -320,10 +320,9 @@ class DocumentClassifier:
         # Get only the letters (remove punctuation too)
         content = re.sub(r"[^\w\s]", " ", content)
         # Tokenize
-        # TODO configurable language
-        words: List[str] = word_tokenize(content, language="english")
+        words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
         # Remove stop words
-        stops = set(stopwords.words("english"))
+        stops = set(stopwords.words(settings.NLTK_LANGUAGE))
         meaningful_words = [w for w in words if w not in stops]
         # Stem words
         meaningful_words = [self.stemmer.stem(w) for w in meaningful_words]
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 7c2e21d00..cdbfbbfba 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -708,3 +708,5 @@ if os.getenv("PAPERLESS_IGNORE_DATES") is not None:
 ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default")
 if ENABLE_UPDATE_CHECK != "default":
     ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK")
+
+NLTK_LANGUAGE = os.getenv("PAPERLESS_NLTK_LANG", "english").lower()