mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Bakes the NLTK data into the image (~60 MB)
This commit is contained in:
parent
3f6e3a2750
commit
8da3ae2c53
@ -228,6 +228,10 @@ RUN set -eux \
|
|||||||
&& python3 -m pip install --no-cache-dir --upgrade wheel \
|
&& python3 -m pip install --no-cache-dir --upgrade wheel \
|
||||||
&& echo "Installing Python requirements" \
|
&& echo "Installing Python requirements" \
|
||||||
&& python3 -m pip install --default-timeout=1000 --no-cache-dir --requirement requirements.txt \
|
&& python3 -m pip install --default-timeout=1000 --no-cache-dir --requirement requirements.txt \
|
||||||
|
&& echo "Installing NLTK data" \
|
||||||
|
&& python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" snowball_data \
|
||||||
|
&& python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" stopwords \
|
||||||
|
&& python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" punkt \
|
||||||
&& echo "Cleaning up image" \
|
&& echo "Cleaning up image" \
|
||||||
&& apt-get -y purge ${BUILD_PACKAGES} \
|
&& apt-get -y purge ${BUILD_PACKAGES} \
|
||||||
&& apt-get -y autoremove --purge \
|
&& apt-get -y autoremove --purge \
|
||||||
|
@ -53,30 +53,6 @@ map_folders() {
|
|||||||
export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
|
export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
|
||||||
}
|
}
|
||||||
|
|
||||||
nltk_data () {
|
|
||||||
# Store the NLTK data outside the Docker container
|
|
||||||
local -r nltk_data_dir="${DATA_DIR}/nltk"
|
|
||||||
local -r truthy_things=("yes y 1 t true")
|
|
||||||
|
|
||||||
# If not set, or it looks truthy
|
|
||||||
if [[ -z "${PAPERLESS_ENABLE_NLTK}" ]] || [[ "${truthy_things[*]}" =~ ${PAPERLESS_ENABLE_NLTK,} ]]; then
|
|
||||||
|
|
||||||
# Download or update the snowball stemmer data
|
|
||||||
python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" snowball_data
|
|
||||||
|
|
||||||
# Download or update the stopwords corpus
|
|
||||||
python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" stopwords
|
|
||||||
|
|
||||||
# Download or update the punkt tokenizer data
|
|
||||||
python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" punkt
|
|
||||||
|
|
||||||
else
|
|
||||||
echo "Skipping NLTK data download"
|
|
||||||
|
|
||||||
fi
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
custom_container_init() {
|
custom_container_init() {
|
||||||
# Mostly borrowed from the LinuxServer.io base image
|
# Mostly borrowed from the LinuxServer.io base image
|
||||||
# https://github.com/linuxserver/docker-baseimage-ubuntu/tree/bionic/root/etc/cont-init.d
|
# https://github.com/linuxserver/docker-baseimage-ubuntu/tree/bionic/root/etc/cont-init.d
|
||||||
@ -157,8 +133,6 @@ initialize() {
|
|||||||
echo "Creating directory ${tmp_dir}"
|
echo "Creating directory ${tmp_dir}"
|
||||||
mkdir -p "${tmp_dir}"
|
mkdir -p "${tmp_dir}"
|
||||||
|
|
||||||
nltk_data
|
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
echo "Adjusting permissions of paperless files. This may take a while."
|
echo "Adjusting permissions of paperless files. This may take a while."
|
||||||
chown -R paperless:paperless ${tmp_dir}
|
chown -R paperless:paperless ${tmp_dir}
|
||||||
@ -191,10 +165,6 @@ install_languages() {
|
|||||||
|
|
||||||
for lang in "${langs[@]}"; do
|
for lang in "${langs[@]}"; do
|
||||||
pkg="tesseract-ocr-$lang"
|
pkg="tesseract-ocr-$lang"
|
||||||
# English is installed by default
|
|
||||||
#if [[ "$lang" == "eng" ]]; then
|
|
||||||
# continue
|
|
||||||
#fi
|
|
||||||
|
|
||||||
if dpkg -s "$pkg" &>/dev/null; then
|
if dpkg -s "$pkg" &>/dev/null; then
|
||||||
echo "Package $pkg already installed!"
|
echo "Package $pkg already installed!"
|
||||||
|
@ -123,7 +123,7 @@ THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")
|
|||||||
|
|
||||||
DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))
|
DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))
|
||||||
|
|
||||||
NLTK_DIR = os.path.join(DATA_DIR, "nltk")
|
NLTK_DIR = __get_path("PAPERLESS_NLTK_DIR", "/usr/local/share/nltk_data")
|
||||||
|
|
||||||
TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")
|
TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user