Fixes the download and usage of the downloaded data

This commit is contained in:
Trenton Holmes 2022-09-16 06:55:42 -07:00 committed by Trenton H
parent 1262c121f0
commit 6523cf0c4b
4 changed files with 25 additions and 20 deletions

@@ -53,6 +53,21 @@ map_folders() {
export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}" export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
} }
nltk_data () {
	# Download (or refresh) the NLTK datasets the classifier needs,
	# storing them outside the Docker container so they persist in the
	# data volume.
	local nltk_data_dir="${DATA_DIR}/nltk"

	# One downloader invocation fetches all packages: the snowball
	# stemmer data, the stopwords corpus and the punkt tokenizer.
	# Running it once avoids starting three separate Python
	# interpreters for three sequential downloads.
	# -W ignore::RuntimeWarning silences the runpy warning emitted
	# when the nltk package is executed with -m.
	python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" snowball_data stopwords punkt
}
initialize() { initialize() {
# Setup environment from secrets before anything else # Setup environment from secrets before anything else
@@ -93,6 +108,8 @@ initialize() {
echo "Creating directory ${tmp_dir}" echo "Creating directory ${tmp_dir}"
mkdir -p "${tmp_dir}" mkdir -p "${tmp_dir}"
nltk_data
set +e set +e
echo "Adjusting permissions of paperless files. This may take a while." echo "Adjusting permissions of paperless files. This may take a while."
chown -R paperless:paperless ${tmp_dir} chown -R paperless:paperless ${tmp_dir}

@@ -89,24 +89,6 @@ superuser() {
fi fi
} }
# NOTE(review): this is the version REMOVED by this commit. It ran after the
# privilege drop (from do_work) and relied on the NLTK_DATA environment
# variable; the replacement runs during initialize() and points NLTK at the
# directory from Django settings instead.
nltk_data () {
# Store the NLTK data outside the Docker container
local nltk_data_dir="${DATA_DIR}/nltk"
# Download or update the snowball stemmer data
python3 -m nltk.downloader -d "${nltk_data_dir}" snowball_data
# Download or update the stopwords corpus
python3 -m nltk.downloader -d "${nltk_data_dir}" stopwords
# Download or update the punkt tokenizer data
python3 -m nltk.downloader -d "${nltk_data_dir}" punkt
# Set env so nltk can find the downloaded data
export NLTK_DATA="${nltk_data_dir}"
}
do_work() { do_work() {
if [[ "${PAPERLESS_DBENGINE}" == "mariadb" ]]; then if [[ "${PAPERLESS_DBENGINE}" == "mariadb" ]]; then
wait_for_mariadb wait_for_mariadb
@@ -118,8 +100,6 @@ do_work() {
migrations migrations
nltk_data
search_index search_index
superuser superuser

@@ -306,6 +306,12 @@ class DocumentClassifier:
from nltk.corpus import stopwords from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer from nltk.stem import SnowballStemmer
import nltk
# Not really hacky, since it isn't private and is documented, but
# set the search path for NLTK data to the single location it should be in
nltk.data.path = [settings.NLTK_DIR]
if self.stemmer is None: if self.stemmer is None:
self.stemmer = SnowballStemmer("english") self.stemmer = SnowballStemmer("english")

@@ -84,6 +84,8 @@ THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")
DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data")) DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))
NLTK_DIR = os.path.join(DATA_DIR, "nltk")
TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR") TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")
# Lock file for synchronizing changes to the MEDIA directory across multiple # Lock file for synchronizing changes to the MEDIA directory across multiple