Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-10-30 03:56:23 -05:00)
			
		
		
		
	Bakes the NLTK data into the image (~60 MB)
This commit is contained in:
		| @@ -228,6 +228,10 @@ RUN set -eux \ | |||||||
|     && python3 -m pip install --no-cache-dir --upgrade wheel \ |     && python3 -m pip install --no-cache-dir --upgrade wheel \ | ||||||
|   && echo "Installing Python requirements" \ |   && echo "Installing Python requirements" \ | ||||||
|     && python3 -m pip install --default-timeout=1000 --no-cache-dir --requirement requirements.txt \ |     && python3 -m pip install --default-timeout=1000 --no-cache-dir --requirement requirements.txt \ | ||||||
|  |   && echo "Installing NLTK data" \ | ||||||
|  |     && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" snowball_data \ | ||||||
|  |     && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" stopwords \ | ||||||
|  |     && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" punkt \ | ||||||
|   && echo "Cleaning up image" \ |   && echo "Cleaning up image" \ | ||||||
|     && apt-get -y purge ${BUILD_PACKAGES} \ |     && apt-get -y purge ${BUILD_PACKAGES} \ | ||||||
|     && apt-get -y autoremove --purge \ |     && apt-get -y autoremove --purge \ | ||||||
|   | |||||||
| @@ -53,30 +53,6 @@ map_folders() { | |||||||
| 	export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}" | 	export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}" | ||||||
| } | } | ||||||
|  |  | ||||||
| nltk_data () { |  | ||||||
| 	# Store the NLTK data outside the Docker container |  | ||||||
| 	local -r nltk_data_dir="${DATA_DIR}/nltk" |  | ||||||
| 	local -r truthy_things=("yes y 1 t true") |  | ||||||
|  |  | ||||||
| 	# If not set, or it looks truthy |  | ||||||
| 	if [[ -z "${PAPERLESS_ENABLE_NLTK}" ]] || [[ "${truthy_things[*]}" =~ ${PAPERLESS_ENABLE_NLTK,} ]]; then |  | ||||||
|  |  | ||||||
| 		# Download or update the snowball stemmer data |  | ||||||
| 		python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" snowball_data |  | ||||||
|  |  | ||||||
| 		# Download or update the stopwords corpus |  | ||||||
| 		python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" stopwords |  | ||||||
|  |  | ||||||
| 		# Download or update the punkt tokenizer data |  | ||||||
| 		python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" punkt |  | ||||||
|  |  | ||||||
| 	else |  | ||||||
| 		echo "Skipping NLTK data download" |  | ||||||
|  |  | ||||||
| 	fi |  | ||||||
|  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| custom_container_init() { | custom_container_init() { | ||||||
| 	# Mostly borrowed from the LinuxServer.io base image | 	# Mostly borrowed from the LinuxServer.io base image | ||||||
| 	# https://github.com/linuxserver/docker-baseimage-ubuntu/tree/bionic/root/etc/cont-init.d | 	# https://github.com/linuxserver/docker-baseimage-ubuntu/tree/bionic/root/etc/cont-init.d | ||||||
| @@ -157,8 +133,6 @@ initialize() { | |||||||
| 	echo "Creating directory ${tmp_dir}" | 	echo "Creating directory ${tmp_dir}" | ||||||
| 	mkdir -p "${tmp_dir}" | 	mkdir -p "${tmp_dir}" | ||||||
|  |  | ||||||
| 	nltk_data |  | ||||||
|  |  | ||||||
| 	set +e | 	set +e | ||||||
| 	echo "Adjusting permissions of paperless files. This may take a while." | 	echo "Adjusting permissions of paperless files. This may take a while." | ||||||
| 	chown -R paperless:paperless ${tmp_dir} | 	chown -R paperless:paperless ${tmp_dir} | ||||||
| @@ -191,10 +165,6 @@ install_languages() { | |||||||
|  |  | ||||||
| 	for lang in "${langs[@]}"; do | 	for lang in "${langs[@]}"; do | ||||||
| 		pkg="tesseract-ocr-$lang" | 		pkg="tesseract-ocr-$lang" | ||||||
| 		# English is installed by default |  | ||||||
| 		#if [[ "$lang" ==  "eng" ]]; then |  | ||||||
| 		#    continue |  | ||||||
| 		#fi |  | ||||||
|  |  | ||||||
| 		if dpkg -s "$pkg" &>/dev/null; then | 		if dpkg -s "$pkg" &>/dev/null; then | ||||||
| 			echo "Package $pkg already installed!" | 			echo "Package $pkg already installed!" | ||||||
|   | |||||||
| @@ -123,7 +123,7 @@ THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") | |||||||
|  |  | ||||||
| DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data")) | DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data")) | ||||||
|  |  | ||||||
| NLTK_DIR = os.path.join(DATA_DIR, "nltk") | NLTK_DIR = __get_path("PAPERLESS_NLTK_DIR", "/usr/local/share/nltk_data") | ||||||
|  |  | ||||||
| TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR") | TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR") | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Trenton H
					Trenton H