Fixes the download and usage of the downloaded data

This commit is contained in:
Trenton Holmes 2022-09-16 06:55:42 -07:00 committed by Trenton H
parent 1262c121f0
commit 6523cf0c4b
4 changed files with 25 additions and 20 deletions

@@ -53,6 +53,21 @@ map_folders() {
export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}" export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
} }
nltk_data () {
	# Download (or refresh) the NLTK datasets the classifier needs,
	# storing them outside the Docker container so they persist in the
	# data volume.
	local nltk_data_dir="${DATA_DIR}/nltk"

	# One downloader invocation fetches all packages: the snowball
	# stemmer data, the stopwords corpus and the punkt tokenizer.
	# Running it once avoids starting three separate Python
	# interpreters for three sequential downloads.
	# -W ignore::RuntimeWarning silences the runpy warning emitted
	# when the nltk package is executed with -m.
	python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" snowball_data stopwords punkt
}
initialize() { initialize() {
# Setup environment from secrets before anything else # Setup environment from secrets before anything else
@@ -93,6 +108,8 @@ initialize() {
echo "Creating directory ${tmp_dir}" echo "Creating directory ${tmp_dir}"
mkdir -p "${tmp_dir}" mkdir -p "${tmp_dir}"
nltk_data
set +e set +e
echo "Adjusting permissions of paperless files. This may take a while." echo "Adjusting permissions of paperless files. This may take a while."
chown -R paperless:paperless ${tmp_dir} chown -R paperless:paperless ${tmp_dir}

@@ -89,24 +89,6 @@ superuser() {
fi fi
} }
# NOTE(review): this is the version REMOVED by this commit. It ran after the
# privilege drop (from do_work) and relied on the NLTK_DATA environment
# variable; the replacement runs during initialize() and points NLTK at the
# directory from Django settings instead.
nltk_data () {
# Store the NLTK data outside the Docker container
local nltk_data_dir="${DATA_DIR}/nltk"
# Download or update the snowball stemmer data
python3 -m nltk.downloader -d "${nltk_data_dir}" snowball_data
# Download or update the stopwords corpus
python3 -m nltk.downloader -d "${nltk_data_dir}" stopwords
# Download or update the punkt tokenizer data
python3 -m nltk.downloader -d "${nltk_data_dir}" punkt
# Set env so nltk can find the downloaded data
export NLTK_DATA="${nltk_data_dir}"
}
do_work() { do_work() {
if [[ "${PAPERLESS_DBENGINE}" == "mariadb" ]]; then if [[ "${PAPERLESS_DBENGINE}" == "mariadb" ]]; then
wait_for_mariadb wait_for_mariadb
@@ -118,8 +100,6 @@ do_work() {
migrations migrations
nltk_data
search_index search_index
superuser superuser

@@ -306,6 +306,12 @@ class DocumentClassifier:
from nltk.corpus import stopwords from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer from nltk.stem import SnowballStemmer
import nltk
# Not really hacky, since it isn't private and is documented, but
# set the search path for NLTK data to the single location it should be in
nltk.data.path = [settings.NLTK_DIR]
if self.stemmer is None: if self.stemmer is None:
self.stemmer = SnowballStemmer("english") self.stemmer = SnowballStemmer("english")

@@ -84,6 +84,8 @@ THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")
DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data")) DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))
NLTK_DIR = os.path.join(DATA_DIR, "nltk")
TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR") TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")
# Lock file for synchronizing changes to the MEDIA directory across multiple # Lock file for synchronizing changes to the MEDIA directory across multiple