diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh index 14a0650f0..81536f5ce 100755 --- a/docker/docker-entrypoint.sh +++ b/docker/docker-entrypoint.sh @@ -53,6 +53,21 @@ map_folders() { export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}" } +nltk_data () { + # Store the NLTK data outside the Docker container + local nltk_data_dir="${DATA_DIR}/nltk" + + # Download or update the snowball stemmer data + python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" snowball_data + + # Download or update the stopwords corpus + python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" stopwords + + # Download or update the punkt tokenizer data + python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" punkt + +} + initialize() { # Setup environment from secrets before anything else @@ -93,6 +108,8 @@ initialize() { echo "Creating directory ${tmp_dir}" mkdir -p "${tmp_dir}" + nltk_data + set +e echo "Adjusting permissions of paperless files. This may take a while." chown -R paperless:paperless ${tmp_dir} diff --git a/docker/docker-prepare.sh b/docker/docker-prepare.sh index 75504ee8c..c4e45c032 100755 --- a/docker/docker-prepare.sh +++ b/docker/docker-prepare.sh @@ -89,24 +89,6 @@ superuser() { fi } -nltk_data () { - # Store the NLTK data outside the Docker container - local nltk_data_dir="${DATA_DIR}/nltk" - - # Download or update the snowball stemmer data - python3 -m nltk.downloader -d "${nltk_data_dir}" snowball_data - - # Download or update the stopwords corpus - python3 -m nltk.downloader -d "${nltk_data_dir}" stopwords - - # Download or update the punkt tokenizer data - python3 -m nltk.downloader -d "${nltk_data_dir}" punkt - - # Set env so nltk can find the downloaded data - export NLTK_DATA="${nltk_data_dir}" - -} - do_work() { if [[ "${PAPERLESS_DBENGINE}" == "mariadb" ]]; then wait_for_mariadb @@ -118,8 +100,6 @@ do_work() { migrations - nltk_data - search_index superuser diff --git a/src/documents/classifier.py b/src/documents/classifier.py index 27964d0f8..72a01dbee 100644 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -306,6 +306,12 @@ class DocumentClassifier: from nltk.corpus import stopwords from nltk.stem import SnowballStemmer + import nltk + + # Not really hacky, since it isn't private and is documented, but + # set the search path for NLTK data to the single location it should be in + nltk.data.path = [settings.NLTK_DIR] + if self.stemmer is None: self.stemmer = SnowballStemmer("english") diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 6836f4ea0..7c2e21d00 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -84,6 +84,8 @@ THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data")) +NLTK_DIR = os.path.join(DATA_DIR, "nltk") + TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR") # Lock file for synchronizing changes to the MEDIA directory across multiple