From c44c914d3d2fef7ecbe2de21ab373fcc3452b709 Mon Sep 17 00:00:00 2001
From: Trenton Holmes <holmes.trenton@gmail.com>
Date: Sun, 18 Sep 2022 08:48:26 -0700
Subject: [PATCH] Change the NLTK language to be based on the Tesseract OCR
 language, with fallback to the default processing

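The NLTK language is no longer configured through its own
PAPERLESS_NLTK_LANG setting; it is instead derived from the configured
Tesseract OCR language. A rough sketch of the new helper's behaviour
(these calls are illustrative only, not part of the patch):

    _get_nltk_language_setting("deu")      # -> "german"
    _get_nltk_language_setting("jpn")      # -> None (no NLTK data)
    _get_nltk_language_setting("eng+deu")  # -> None (multi-language
                                           #    values also fall back)

When the helper returns None, preprocess_content() skips the NLTK
tokenize / stop word / stemming steps and returns the lower-cased,
whitespace- and punctuation-normalized content instead.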
---
 src/documents/classifier.py | 56 ++++++++++++++++++++++---------------
 src/paperless/settings.py   | 28 ++++++++++++++++++-
 2 files changed, 61 insertions(+), 23 deletions(-)

diff --git a/src/documents/classifier.py b/src/documents/classifier.py
index 5711c34af..666beffa7 100644
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -75,7 +75,8 @@ class DocumentClassifier:
         self.document_type_classifier = None
         self.storage_path_classifier = None
 
-        self.stemmer = None
+        self._stemmer = None
+        self._stop_words = None
 
     def load(self):
         # Catch warnings for processing
@@ -302,32 +303,43 @@ class DocumentClassifier:
         Process the contents of a document, distilling it down into
         words which are meaningful to the content
         """
-        from nltk.tokenize import word_tokenize
-        from nltk.corpus import stopwords
-        from nltk.stem import SnowballStemmer
-
-        import nltk
-
-        # Not really hacky, since it isn't private and is documented, but
-        # set the search path for NLTK data to the single location it should be in
-        nltk.data.path = [settings.NLTK_DIR]
-
-        if self.stemmer is None:
-            self.stemmer = SnowballStemmer("english")
 
         # Lower case the document
         content = content.lower().strip()
-        # Get only the letters (remove punctuation too)
+        # Collapse runs of whitespace into single spaces
+        content = re.sub(r"\s+", " ", content)
+        # Remove punctuation, replacing it with spaces
+        content = re.sub(r"[^\w\s]", " ", content)
-        # Tokenize
-        words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
-        # Remove stop words
-        stops = set(stopwords.words(settings.NLTK_LANGUAGE))
-        meaningful_words = [w for w in words if w not in stops]
-        # Stem words
-        meaningful_words = [self.stemmer.stem(w) for w in meaningful_words]
 
-        return " ".join(meaningful_words)
+        # If the NLTK language is supported, do further processing
+        if settings.NLTK_LANGUAGE is not None:
+
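+            # Import NLTK only when a supported language is configured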
+            import nltk
+
+            from nltk.tokenize import word_tokenize
+            from nltk.corpus import stopwords
+            from nltk.stem import SnowballStemmer
+
+            # nltk.data.path is public and documented, so setting it here is
+            # supported: point NLTK at the single location the data should be in
+            nltk.data.path = [settings.NLTK_DIR]
+
+            # Do some one-time setup of the stemmer and stop words
+            if self._stemmer is None:
+                self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
+            if self._stop_words is None:
+                self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
+
+            # Tokenize
+            words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
+            # Remove stop words
+            meaningful_words = [w for w in words if w not in self._stop_words]
+            # Stem words
+            meaningful_words = [self._stemmer.stem(w) for w in meaningful_words]
+
+            return " ".join(meaningful_words)
+
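+        # No NLTK support for this language; fall back to the basic processing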
+        return content
 
     def predict_correspondent(self, content):
         if self.correspondent_classifier:
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index cdbfbbfba..45544b5c2 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -709,4 +709,30 @@ ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default")
 if ENABLE_UPDATE_CHECK != "default":
     ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK")
 
-NLTK_LANGUAGE = os.getenv("PAPERLESS_NLTK_LANG", "english").lower()
+
+def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
+    """
+    Maps an ISO 639-2 (three letter) language code, as used by Tesseract,
+    to an NLTK language name, or None if unsupported.  This is the set of
+    languages for which all of the NLTK data used should be available.
+    """
+    iso_code_to_nltk = {
+        "dan": "danish",
+        "nld": "dutch",
+        "eng": "english",
+        "fin": "finnish",
+        "fra": "french",
+        "deu": "german",
+        "ita": "italian",
+        "nor": "norwegian",
+        "por": "portuguese",
+        "rus": "russian",
+        "spa": "spanish",
+        "swe": "swedish",
+        "tur": "turkish",  # NLTK's SnowballStemmer may not support Turkish
+    }
+
+    return iso_code_to_nltk.get(ocr_lang)
+
+
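+# When no mapping exists, NLTK_LANGUAGE is None and the classifier falls
+# back to basic preprocessing (lower-casing and punctuation removal) only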
+NLTK_LANGUAGE: Optional[str] = _get_nltk_language_setting(OCR_LANGUAGE)