Changes the NLTK language to be based on the Tesseract OCR language, with fallback to the default processing
Commit c44c914d3d (parent d10d2f5a54) in https://github.com/paperless-ngx/paperless-ngx.git

@@ -75,7 +75,8 @@ class DocumentClassifier:
         self.document_type_classifier = None
         self.storage_path_classifier = None
 
-        self.stemmer = None
+        self._stemmer = None
+        self._stop_words = None
 
     def load(self):
         # Catch warnings for processing
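
Beyond the rename, the point of the two cached fields is that the old code rebuilt its stop-word set on every call to preprocess_content(), whereas the stemmer and stop words are now loaded once and reused. A minimal sketch of that lazy-caching pattern (hypothetical class and method names, not the project's code; assumes nltk and its corpora are installed):

    from typing import Optional, Set

    class LazyNlp:
        """Sketch: defer and cache expensive NLTK setup."""

        def __init__(self) -> None:
            # Placeholders instead of eager loading, as in the commit.
            self._stemmer = None
            self._stop_words: Optional[Set[str]] = None

        def ensure_loaded(self, language: str) -> None:
            # One-time setup, paid only by the first document processed.
            if self._stemmer is None:
                from nltk.stem import SnowballStemmer
                self._stemmer = SnowballStemmer(language)
            if self._stop_words is None:
                from nltk.corpus import stopwords
                self._stop_words = set(stopwords.words(language))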
@@ -302,33 +303,44 @@ class DocumentClassifier:
         Process to contents of a document, distilling it down into
         words which are meaningful to the content
         """
 
+        # Lower case the document
+        content = content.lower().strip()
+        # Reduce spaces
+        content = re.sub(r"\s+", " ", content)
+        # Get only the letters
+        content = re.sub(r"[^\w\s]", " ", content)
+
+        # If the NLTK language is supported, do further processing
+        if settings.NLTK_LANGUAGE is not None:
+
+            import nltk
+
-        from nltk.tokenize import word_tokenize
-        from nltk.corpus import stopwords
-        from nltk.stem import SnowballStemmer
-
-        import nltk
+            from nltk.tokenize import word_tokenize
+            from nltk.corpus import stopwords
+            from nltk.stem import SnowballStemmer
 
-        # Not really hacky, since it isn't private and is documented, but
-        # set the search path for NLTK data to the single location it should be in
-        nltk.data.path = [settings.NLTK_DIR]
+            # Not really hacky, since it isn't private and is documented, but
+            # set the search path for NLTK data to the single location it should be in
+            nltk.data.path = [settings.NLTK_DIR]
 
-        if self.stemmer is None:
-            self.stemmer = SnowballStemmer("english")
+            # Do some one time setup
+            if self._stemmer is None:
+                self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
+            if self._stop_words is None:
+                self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
 
-        # Lower case the document
-        content = content.lower().strip()
-        # Get only the letters (remove punctuation too)
-        content = re.sub(r"[^\w\s]", " ", content)
-        # Tokenize
-        words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
-        # Remove stop words
-        stops = set(stopwords.words(settings.NLTK_LANGUAGE))
-        meaningful_words = [w for w in words if w not in stops]
-        # Stem words
-        meaningful_words = [self.stemmer.stem(w) for w in meaningful_words]
-
-        return " ".join(meaningful_words)
+            # Tokenize
+            words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
+            # Remove stop words
+            meaningful_words = [w for w in words if w not in self._stop_words]
+            # Stem words
+            meaningful_words = [self._stemmer.stem(w) for w in meaningful_words]
+
+            return " ".join(meaningful_words)
+
+        return content
 
     def predict_correspondent(self, content):
         if self.correspondent_classifier:
             X = self.data_vectorizer.transform([self.preprocess_content(content)])
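Taken together, the rewritten method reduces to the pipeline below. This is a standalone sketch rather than the project's code: it assumes nltk is installed and that the required tokenizer and stop-word corpora are already on nltk's data path (the real code pins nltk.data.path to settings.NLTK_DIR), and it skips the caching shown above.

    import re
    from typing import List, Optional

    def preprocess(content: str, nltk_language: Optional[str]) -> str:
        # Always applied: lowercase, collapse whitespace, strip punctuation.
        content = content.lower().strip()
        content = re.sub(r"\s+", " ", content)
        content = re.sub(r"[^\w\s]", " ", content)

        # Language-aware path, taken only when the OCR language maps to NLTK.
        if nltk_language is not None:
            from nltk.corpus import stopwords
            from nltk.stem import SnowballStemmer
            from nltk.tokenize import word_tokenize

            words: List[str] = word_tokenize(content, language=nltk_language)
            stops = set(stopwords.words(nltk_language))
            meaningful = [w for w in words if w not in stops]
            stemmer = SnowballStemmer(nltk_language)
            return " ".join(stemmer.stem(w) for w in meaningful)

        # Fallback: no supported NLTK language, return the regex-cleaned text.
        return content
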
@@ -709,4 +709,30 @@ ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default")
 if ENABLE_UPDATE_CHECK != "default":
     ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK")
 
-NLTK_LANGUAGE = os.getenv("PAPERLESS_NLTK_LANG", "english").lower()
+
+def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
+    """
+    Maps an ISO 639-2 language code supported by Tesseract into
+    an optional NLTK language name. This is the set of common supported
+    languages for all the NLTK data used.
+    """
+    iso_code_to_nltk = {
+        "dan": "danish",
+        "nld": "dutch",
+        "eng": "english",
+        "fin": "finnish",
+        "fra": "french",
+        "deu": "german",
+        "ita": "italian",
+        "nor": "norwegian",
+        "por": "portuguese",
+        "rus": "russian",
+        "spa": "spanish",
+        "swe": "swedish",
+        "tur": "turkish",
+    }
+
+    return iso_code_to_nltk.get(ocr_lang, None)
+
+
+NLTK_LANGUAGE: Optional[str] = _get_nltk_language_setting(OCR_LANGUAGE)
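
The practical effect of the mapping (hypothetical demo with an abridged table, not part of the commit): Tesseract codes with NLTK data yield a language name, while anything else, including multi-language strings such as "deu+eng", yields None, which routes preprocess_content() to the plain regex fallback.

    from typing import Optional

    def get_nltk_language(ocr_lang: str) -> Optional[str]:
        # Abridged copy of the mapping above, for demonstration only.
        iso_code_to_nltk = {"eng": "english", "deu": "german"}
        return iso_code_to_nltk.get(ocr_lang, None)

    assert get_nltk_language("deu") == "german"      # supported code
    assert get_nltk_language("jpn") is None          # no NLTK data -> fallback
    assert get_nltk_language("deu+eng") is None      # multi-language OCR -> fallback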
|