diff --git a/Dockerfile b/Dockerfile index 2cb25c0c3..a1f577243 100644 --- a/Dockerfile +++ b/Dockerfile @@ -228,6 +228,10 @@ RUN set -eux \ && python3 -m pip install --no-cache-dir --upgrade wheel \ && echo "Installing Python requirements" \ && python3 -m pip install --default-timeout=1000 --no-cache-dir --requirement requirements.txt \ + && echo "Installing NLTK data" \ + && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" snowball_data \ + && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" stopwords \ + && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" punkt \ && echo "Cleaning up image" \ && apt-get -y purge ${BUILD_PACKAGES} \ && apt-get -y autoremove --purge \ diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh index 74e080671..00be59add 100755 --- a/docker/docker-entrypoint.sh +++ b/docker/docker-entrypoint.sh @@ -53,30 +53,6 @@ map_folders() { export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}" } -nltk_data () { - # Store the NLTK data outside the Docker container - local -r nltk_data_dir="${DATA_DIR}/nltk" - local -r truthy_things=("yes y 1 t true") - - # If not set, or it looks truthy - if [[ -z "${PAPERLESS_ENABLE_NLTK}" ]] || [[ "${truthy_things[*]}" =~ ${PAPERLESS_ENABLE_NLTK,} ]]; then - - # Download or update the snowball stemmer data - python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" snowball_data - - # Download or update the stopwords corpus - python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" stopwords - - # Download or update the punkt tokenizer data - python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" punkt - - else - echo "Skipping NLTK data download" - - fi - -} - custom_container_init() { # Mostly borrowed from the LinuxServer.io base image # https://github.com/linuxserver/docker-baseimage-ubuntu/tree/bionic/root/etc/cont-init.d @@ -157,8 +133,6 @@ initialize() { echo "Creating directory ${tmp_dir}" mkdir -p "${tmp_dir}" - nltk_data - set +e echo "Adjusting permissions of paperless files. This may take a while." chown -R paperless:paperless ${tmp_dir} @@ -191,10 +165,6 @@ install_languages() { for lang in "${langs[@]}"; do pkg="tesseract-ocr-$lang" - # English is installed by default - #if [[ "$lang" == "eng" ]]; then - # continue - #fi if dpkg -s "$pkg" &>/dev/null; then echo "Package $pkg already installed!" diff --git a/src/paperless/settings.py b/src/paperless/settings.py index c11e43489..40c7a5c3b 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -123,7 +123,7 @@ THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data")) -NLTK_DIR = os.path.join(DATA_DIR, "nltk") +NLTK_DIR = __get_path("PAPERLESS_NLTK_DIR", "/usr/local/share/nltk_data") TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")