mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Allows configuration of the NLTK processing language
This commit is contained in:
parent
6523cf0c4b
commit
d10d2f5a54
@ -320,10 +320,9 @@ class DocumentClassifier:
|
||||
# Keep only word characters and whitespace (punctuation is replaced by spaces)
|
||||
content = re.sub(r"[^\w\s]", " ", content)
|
||||
# Tokenize
|
||||
# TODO configurable language
|
||||
words: List[str] = word_tokenize(content, language="english")
|
||||
words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
|
||||
# Remove stop words
|
||||
stops = set(stopwords.words("english"))
|
||||
stops = set(stopwords.words(settings.NLTK_LANGUAGE))
|
||||
meaningful_words = [w for w in words if w not in stops]
|
||||
# Stem words
|
||||
meaningful_words = [self.stemmer.stem(w) for w in meaningful_words]
|
||||
|
@ -708,3 +708,5 @@ if os.getenv("PAPERLESS_IGNORE_DATES") is not None:
|
||||
ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default")
|
||||
if ENABLE_UPDATE_CHECK != "default":
|
||||
ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK")
|
||||
|
||||
NLTK_LANGUAGE = os.getenv("PAPERLESS_NLTK_LANG", "english").lower()
|
||||
|
Loading…
x
Reference in New Issue
Block a user