From c1641f6fb8a1cbe24d96de432a64217bec4a4936 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Wed, 24 May 2023 11:54:12 -0700
Subject: [PATCH] Just in case, catch a sometimes nltk error and return the
 basic processed content instead

---
 src/documents/classifier.py | 46 +++++++++++++++++++++++++++----------
 src/paperless/settings.py   |  4 ++++
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/src/documents/classifier.py b/src/documents/classifier.py
index 0848e0105..e4f92b9ea 100644
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -341,20 +341,42 @@ class DocumentClassifier:
             # set the search path for NLTK data to the single location it should be in
             nltk.data.path = [settings.NLTK_DIR]
 
-            # Do some one time setup
-            if self._stemmer is None:
-                self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
-            if self._stop_words is None:
-                self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
+            try:
+                # Preload the corpus early, to force the lazy loader to transform
+                stopwords.ensure_loaded()
 
-            # Tokenize
-            words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
-            # Remove stop words
-            meaningful_words = [w for w in words if w not in self._stop_words]
-            # Stem words
-            meaningful_words = [self._stemmer.stem(w) for w in meaningful_words]
+                # Do some one time setup
+                # Sometimes, somehow, there's multiple threads loading the corpus
+                # and it's not thread safe, raising an AttributeError
+                if self._stemmer is None:
+                    self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
+                if self._stop_words is None:
+                    self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
 
-            return " ".join(meaningful_words)
+                # Tokenize
+                # This splits the content into tokens, roughly words
+                words: List[str] = word_tokenize(
+                    content,
+                    language=settings.NLTK_LANGUAGE,
+                )
+
+                meaningful_words = []
+                for word in words:
+                    # Skip stop words
+                    # These are words like "a", "and", "the" which add little meaning
+                    if word in self._stop_words:
+                        continue
+                    # Stem the words
+                    # This reduces the words to their stems.
+                    # "amazement" returns "amaz"
+                    # "amaze" returns "amaz"
+                    # "amazed" returns "amaz"
+                    meaningful_words.append(self._stemmer.stem(word))
+
+                return " ".join(meaningful_words)
+
+            except AttributeError:
+                return content
 
         return content
 
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index c3e75e402..d3c239b43 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -921,6 +921,10 @@ def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
     languages for all the NLTK data used.
 
     Assumption: The primary language is first
+
+    NLTK Languages:
+      - https://www.nltk.org/api/nltk.stem.snowball.html#nltk.stem.snowball.SnowballStemmer
+
     """
     ocr_lang = ocr_lang.split("+")[0]
     iso_code_to_nltk = {