Updates the pre-processing of document content to be much more robust, with tokenization, stemming, and stop word removal

This commit is contained in:
Trenton Holmes
2022-09-15 08:39:47 -07:00
committed by Trenton H
parent 14d82bd8ff
commit d856e48045
4 changed files with 76 additions and 19 deletions

View File

@@ -53,6 +53,24 @@ map_folders() {
export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
}
nltk_data () {
  # Download the NLTK resources needed for document pre-processing
  # (tokenization, stemming, stop word removal) and point NLTK at them.
  #
  # Globals:
  #   DATA_DIR  (read)    - base data directory; assumed set by map_folders/env
  #   NLTK_DATA (written) - exported so NLTK in child processes finds the data
  # Outputs: downloader progress on stdout/stderr
  # Returns: exit status of the downloader invocation

  # Store the NLTK data outside the Docker container so it persists
  # across container recreation instead of being re-fetched every time.
  local -r nltk_data_dir="${DATA_DIR}/nltk"

  # Download or update all required resources in a single downloader
  # invocation (one interpreter spawn instead of three):
  #   snowball_data - Snowball stemmer models
  #   stopwords     - stop word corpus
  #   punkt         - sentence/word tokenizer models
  python3 -m nltk.downloader -d "${nltk_data_dir}" snowball_data stopwords punkt

  # Set env so nltk can find the downloaded data
  export NLTK_DATA="${nltk_data_dir}"
}
initialize() {
# Setup environment from secrets before anything else
@@ -105,6 +123,8 @@ initialize() {
done
set -e
nltk_data
"${gosu_cmd[@]}" /sbin/docker-prepare.sh
}