diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ae2e30d6e..49efc7ef4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -125,6 +125,9 @@ jobs:
         name: Install Python dependencies
         run: |
           pipenv sync --dev
+          pipenv run python3 -m nltk.downloader snowball_data
+          pipenv run python3 -m nltk.downloader stopwords
+          pipenv run python3 -m nltk.downloader punkt
       -
         name: List installed Python dependencies
         run: |
diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh
index c97225007..14a0650f0 100755
--- a/docker/docker-entrypoint.sh
+++ b/docker/docker-entrypoint.sh
@@ -53,24 +53,6 @@ map_folders() {
 	export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
 }
 
-nltk_data () {
-	# Store the NLTK data outside the Docker container
-	local nltk_data_dir="${DATA_DIR}/nltk"
-
-	# Download or update the snowball stemmer data
-	python3 -m nltk.downloader -d "${nltk_data_dir}" snowball_data
-
-	# Download or update the stopwords corpus
-	python3 -m nltk.downloader -d "${nltk_data_dir}" stopwords
-
-	# Download or update the punkt tokenizer data
-	python3 -m nltk.downloader -d "${nltk_data_dir}" punkt
-
-	# Set env so nltk can find the downloaded data
-	export NLTK_DATA="${nltk_data_dir}"
-
-}
-
 initialize() {
 
 	# Setup environment from secrets before anything else
@@ -123,8 +105,6 @@ initialize() {
 	done
 	set -e
 
-	nltk_data
-
 	"${gosu_cmd[@]}" /sbin/docker-prepare.sh
 }
 
diff --git a/docker/docker-prepare.sh b/docker/docker-prepare.sh
index c4e45c032..75504ee8c 100755
--- a/docker/docker-prepare.sh
+++ b/docker/docker-prepare.sh
@@ -89,6 +89,24 @@ superuser() {
 	fi
 }
 
+nltk_data () {
+	# Store the NLTK data outside the Docker container
+	local nltk_data_dir="${DATA_DIR}/nltk"
+
+	# Download or update the snowball stemmer data
+	python3 -m nltk.downloader -d "${nltk_data_dir}" snowball_data
+
+	# Download or update the stopwords corpus
+	python3 -m nltk.downloader -d "${nltk_data_dir}" stopwords
+
+	# Download or update the punkt tokenizer data
+	python3 -m nltk.downloader -d "${nltk_data_dir}" punkt
+
+	# Set env so nltk can find the downloaded data
+	export NLTK_DATA="${nltk_data_dir}"
+
+}
+
 do_work() {
 	if [[ "${PAPERLESS_DBENGINE}" == "mariadb" ]]; then
 		wait_for_mariadb
@@ -100,6 +118,8 @@ do_work() {
 	migrations
 
+	nltk_data
+
 	search_index
 
 	superuser
 