Fixes the download and usage of the downloaded NLTK data

This commit is contained in:
Trenton Holmes
2022-09-16 06:55:42 -07:00
committed by Trenton H
parent 1262c121f0
commit 6523cf0c4b
4 changed files with 25 additions and 20 deletions

View File

@@ -53,6 +53,21 @@ map_folders() {
export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
}
nltk_data() {
  # Fetch (or refresh) the NLTK resources into a directory under DATA_DIR,
  # i.e. outside the Docker container image itself.
  #
  # Globals:   DATA_DIR (read) - base directory the "nltk" folder lives under
  # Arguments: none
  # Returns:   exit status of the final downloader invocation
  local -r target_dir="${DATA_DIR}/nltk"

  # Resources are fetched one per interpreter run, in this order:
  # the snowball stemmer data, the stopwords corpus, the punkt tokenizer data.
  local -ra resources=(snowball_data stopwords punkt)

  local resource
  for resource in "${resources[@]}"; do
    # RuntimeWarning output from the interpreter is silenced via -W.
    python3 -W ignore::RuntimeWarning \
      -m nltk.downloader \
      -d "${target_dir}" \
      "${resource}"
  done
}
initialize() {
# Setup environment from secrets before anything else
@@ -93,6 +108,8 @@ initialize() {
echo "Creating directory ${tmp_dir}"
mkdir -p "${tmp_dir}"
nltk_data
set +e
echo "Adjusting permissions of paperless files. This may take a while."
chown -R paperless:paperless ${tmp_dir}