mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Fixes the download and usage of the downloaded data
This commit is contained in:
parent
1262c121f0
commit
6523cf0c4b
@ -53,6 +53,21 @@ map_folders() {
|
|||||||
export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
|
export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
nltk_data () {
|
||||||
|
# Store the NLTK data outside the Docker container
|
||||||
|
local nltk_data_dir="${DATA_DIR}/nltk"
|
||||||
|
|
||||||
|
# Download or update the snowball stemmer data
|
||||||
|
python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" snowball_data
|
||||||
|
|
||||||
|
# Download or update the stopwords corpus
|
||||||
|
python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" stopwords
|
||||||
|
|
||||||
|
# Download or update the punkt tokenizer data
|
||||||
|
python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" punkt
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
initialize() {
|
initialize() {
|
||||||
|
|
||||||
# Setup environment from secrets before anything else
|
# Setup environment from secrets before anything else
|
||||||
@ -93,6 +108,8 @@ initialize() {
|
|||||||
echo "Creating directory ${tmp_dir}"
|
echo "Creating directory ${tmp_dir}"
|
||||||
mkdir -p "${tmp_dir}"
|
mkdir -p "${tmp_dir}"
|
||||||
|
|
||||||
|
nltk_data
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
echo "Adjusting permissions of paperless files. This may take a while."
|
echo "Adjusting permissions of paperless files. This may take a while."
|
||||||
chown -R paperless:paperless ${tmp_dir}
|
chown -R paperless:paperless ${tmp_dir}
|
||||||
|
@ -89,24 +89,6 @@ superuser() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
nltk_data () {
|
|
||||||
# Store the NLTK data outside the Docker container
|
|
||||||
local nltk_data_dir="${DATA_DIR}/nltk"
|
|
||||||
|
|
||||||
# Download or update the snowball stemmer data
|
|
||||||
python3 -m nltk.downloader -d "${nltk_data_dir}" snowball_data
|
|
||||||
|
|
||||||
# Download or update the stopwords corpus
|
|
||||||
python3 -m nltk.downloader -d "${nltk_data_dir}" stopwords
|
|
||||||
|
|
||||||
# Download or update the punkt tokenizer data
|
|
||||||
python3 -m nltk.downloader -d "${nltk_data_dir}" punkt
|
|
||||||
|
|
||||||
# Set env so nltk can find the downloaded data
|
|
||||||
export NLTK_DATA="${nltk_data_dir}"
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
do_work() {
|
do_work() {
|
||||||
if [[ "${PAPERLESS_DBENGINE}" == "mariadb" ]]; then
|
if [[ "${PAPERLESS_DBENGINE}" == "mariadb" ]]; then
|
||||||
wait_for_mariadb
|
wait_for_mariadb
|
||||||
@ -118,8 +100,6 @@ do_work() {
|
|||||||
|
|
||||||
migrations
|
migrations
|
||||||
|
|
||||||
nltk_data
|
|
||||||
|
|
||||||
search_index
|
search_index
|
||||||
|
|
||||||
superuser
|
superuser
|
||||||
|
@ -306,6 +306,12 @@ class DocumentClassifier:
|
|||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords
|
||||||
from nltk.stem import SnowballStemmer
|
from nltk.stem import SnowballStemmer
|
||||||
|
|
||||||
|
import nltk
|
||||||
|
|
||||||
|
# Not really hacky, since it isn't private and is documented, but
|
||||||
|
# set the search path for NLTK data to the single location it should be in
|
||||||
|
nltk.data.path = [settings.NLTK_DIR]
|
||||||
|
|
||||||
if self.stemmer is None:
|
if self.stemmer is None:
|
||||||
self.stemmer = SnowballStemmer("english")
|
self.stemmer = SnowballStemmer("english")
|
||||||
|
|
||||||
|
@ -84,6 +84,8 @@ THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")
|
|||||||
|
|
||||||
DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))
|
DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))
|
||||||
|
|
||||||
|
NLTK_DIR = os.path.join(DATA_DIR, "nltk")
|
||||||
|
|
||||||
TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")
|
TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")
|
||||||
|
|
||||||
# Lock file for synchronizing changes to the MEDIA directory across multiple
|
# Lock file for synchronizing changes to the MEDIA directory across multiple
|
||||||
|
Loading…
x
Reference in New Issue
Block a user