Updates the pre-processing of document content to be much more robust, with tokenization, stemming, and stop word removal

This commit is contained in:
Trenton Holmes
2022-09-15 08:39:47 -07:00
committed by Trenton H
parent 14d82bd8ff
commit d856e48045
4 changed files with 76 additions and 19 deletions

View File

@@ -53,6 +53,24 @@ map_folders() {
export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
}
nltk_data () {
  # Download the NLTK resources needed for document pre-processing
  # (tokenization, stemming, stop word removal) and point NLTK at them.
  #
  # Globals:
  #   DATA_DIR  (read)    - base data directory; assumed set by map_folders/env
  #   NLTK_DATA (written) - exported so NLTK in child processes finds the data
  # Outputs: downloader progress on stdout/stderr
  # Returns: exit status of the downloader invocation

  # Store the NLTK data outside the Docker container so it persists
  # across container recreation instead of being re-fetched every time.
  local -r nltk_data_dir="${DATA_DIR}/nltk"

  # Download or update all required resources in a single downloader
  # invocation (one interpreter spawn instead of three):
  #   snowball_data - Snowball stemmer models
  #   stopwords     - stop word corpus
  #   punkt         - sentence/word tokenizer models
  python3 -m nltk.downloader -d "${nltk_data_dir}" snowball_data stopwords punkt

  # Set env so nltk can find the downloaded data
  export NLTK_DATA="${nltk_data_dir}"
}
initialize() {
# Setup environment from secrets before anything else
@@ -105,6 +123,8 @@ initialize() {
done
set -e
nltk_data
"${gosu_cmd[@]}" /sbin/docker-prepare.sh
}