Fixes the download and usage of the downloaded NLTK data

2026-02-03 23:22:42 -06:00 · 2022-09-16 06:55:42 -07:00
parent 1262c121f0
commit 6523cf0c4b
4 changed files with 25 additions and 20 deletions
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -306,6 +306,12 @@ class DocumentClassifier:
        from nltk.corpus import stopwords
        from nltk.stem import SnowballStemmer

+        import nltk
+
+        # Not a hack: nltk.data.path is public and documented. Restrict the
+        # NLTK data search path to the single directory the data should be in.
+        nltk.data.path = [settings.NLTK_DIR]
+
        if self.stemmer is None:
            self.stemmer = SnowballStemmer("english")

--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -84,6 +84,8 @@ THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")

 DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))

+NLTK_DIR = os.path.join(DATA_DIR, "nltk")
+
 TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")

 # Lock file for synchronizing changes to the MEDIA directory across multiple