diff --git a/docker-compose.env.example b/docker-compose.env.example index d5339db1f..175f65afe 100644 --- a/docker-compose.env.example +++ b/docker-compose.env.example @@ -1,35 +1,52 @@ +# Database settings for paperless +# If you want to use sqlite instead, remove these settings. PAPERLESS_DBENGINE="django.db.backends.postgresql_psycopg2" PAPERLESS_DBHOST="db" PAPERLESS_DBNAME="paperless" PAPERLESS_DBUSER="paperless" PAPERLESS_DBPASS="paperless" +# DON'T EDIT. Consumption directory. This is the location of the consumption +# directory inside the container. If you want to modify the location of the +# consumption folder on the host, edit the docker-compose.yml file instead. PAPERLESS_CONSUMPTION_DIR="../consume" -# Environment variables to set for Paperless -# Commented out variables will be replaced with a default within Paperless. -# -# In addition to what you see here, you can also define any values you find in -# paperless.conf.example here. Values like: -# -# * PAPERLESS_PASSPHRASE -# * PAPERLESS_CONSUME_MAIL_HOST -# -# ...are all explained in that file but can be defined here, since the Docker -# installation doesn't make use of paperless.conf. - -# Use this variable to set a timezone for the Paperless Docker containers. If not specified, defaults to UTC. -#TZ=America/Los_Angeles - -# Additional languages to install for text recognition. Note that this is -# different from PAPERLESS_OCR_LANGUAGE (default=eng), which defines the -# default language used when guessing the language from the OCR output. -# The container installs English, German, Italian, Spanish and French by -# default. -#PAPERLESS_OCR_LANGUAGES=deu ita spa fra - # The UID and GID of the user used to run paperless in the container. Set this # to your UID and GID on the host so that you have write access to the # consumption directory. #USERMAP_UID=1000 #USERMAP_GID=1000 + +# Additional languages to install for text recognition, separated by +# whitespace. 
Note that this is +# different from PAPERLESS_OCR_LANGUAGE (default=eng), which defines the +# default language used when guessing the language from the OCR output. +# The container installs English, German, Italian, Spanish and French by +# default. +# See https://packages.debian.org/search?keywords=tesseract-ocr-&searchon=names&suite=buster +# for available languages. +#PAPERLESS_OCR_LANGUAGES=tur ces + +############################################################################### +# Paperless-specific settings # +############################################################################### + +# All settings defined in the paperless.conf.example can be used here. The +# Docker setup does not use the configuration file. +# A few commonly adjusted settings are provided below. + +# Adjust this key if you plan to make paperless available publicly. It should +# be a very long sequence of random characters. You don't need to remember it. +#PAPERLESS_SECRET_KEY="change-me" + +# Use this variable to set a timezone for the Paperless Docker containers. If not specified, defaults to UTC. +#PAPERLESS_TIME_ZONE=America/Los_Angeles + +# The default language to use for OCR. Set this to the language most of your +# documents are written in. +#PAPERLESS_OCR_LANGUAGE="eng" + +# By default Paperless does not OCR a document if the text can be retrieved from +# the document directly. Set to true to always OCR documents. (e.g., if you +# know that some of your documents have faulty/bad OCR data) +#PAPERLESS_OCR_ALWAYS="true" diff --git a/paperless.conf.example b/paperless.conf.example index 5c50acf8e..c1bf62cd9 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -23,7 +23,7 @@ # This where your documents should go to be consumed. Make sure that it exists # and that the user running the paperless service can read/write its contents # before you start Paperless. 
-#PAPERLESS_CONSUMPTION_DIR="" +PAPERLESS_CONSUMPTION_DIR="../consume" # This is where paperless stores all its data (search index, sqlite database, # classification model, etc). @@ -165,7 +165,10 @@ # Customize the default language that tesseract will attempt to use when -# parsing documents. It should be a 3-letter language code consistent with ISO +# parsing documents. The default language is used whenever +# - No language could be detected on a document +# - No tesseract data files are available for the detected language +# It should be a 3-letter language code consistent with ISO # 639: https://www.loc.gov/standards/iso639-2/php/code_list.php #PAPERLESS_OCR_LANGUAGE=eng @@ -203,16 +206,6 @@ # with little impact to OCR accuracy. #PAPERLESS_CONVERT_DENSITY=300 - -# (This setting is ignored on Linux where inotify is used instead of a -# polling loop.) -# The number of seconds that Paperless will wait between checking -# PAPERLESS_CONSUMPTION_DIR. If you tend to write documents to this directory -# rarely, you may want to use a higher value than the default (10). -#PAPERLESS_CONSUMER_LOOP_TIME=10 - - - # By default Paperless does not OCR a document if the text can be retrieved from # the document directly. Set to true to always OCR documents. #PAPERLESS_OCR_ALWAYS="false" diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 8f918eb4f..737a0a3d5 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -1,22 +1,10 @@ -""" -Django settings for paperless project. - -Generated by 'django-admin startproject' using Django 1.9. 
- -For more information on this file, see -https://docs.djangoproject.com/en/1.10/topics/settings/ - -For the full list of settings and their values, see -https://docs.djangoproject.com/en/1.10/ref/settings/ -""" - import json +import multiprocessing import os import re from dotenv import load_dotenv - # Tap paperless.conf if it's available if os.path.exists("../paperless.conf"): load_dotenv("../paperless.conf") @@ -33,45 +21,24 @@ def __get_boolean(key, default="NO"): """ return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true")) +############################################################################### +# Directories # +############################################################################### -# Build paths inside the project like this: os.path.join(BASE_DIR, ...) BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data")) +STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "static")) MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media")) - -INDEX_DIR = os.path.join(DATA_DIR, "index") ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals") THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") + +DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data")) +INDEX_DIR = os.path.join(DATA_DIR, "index") MODEL_FILE = os.path.join(DATA_DIR, "classification_model.pickle") - -# Quick-start development settings - unsuitable for production -# See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/ - -# The secret key has a default that should be fine so long as you're hosting -# Paperless on a closed network. However, if you're putting this anywhere -# public, you should change the key to something unique and verbose. 
-SECRET_KEY = os.getenv( - "PAPERLESS_SECRET_KEY", - "e11fl1oa-*ytql8p)(06fbj4ukrlo+n7k&q5+$1md7i+mge=ee" -) - - -# SECURITY WARNING: don't run with debug turned on in production! -DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO") - -LOGIN_URL = "admin:login" - -_allowed_hosts = os.getenv("PAPERLESS_ALLOWED_HOSTS") -if _allowed_hosts: - ALLOWED_HOSTS = _allowed_hosts.split(",") -else: - ALLOWED_HOSTS = ["*"] - -FORCE_SCRIPT_NAME = os.getenv("PAPERLESS_FORCE_SCRIPT_NAME") - -# Application definition +############################################################################### +# Application Definition # +############################################################################### INSTALLED_APPS = [ "whitenoise.runserver_nostatic", @@ -118,18 +85,17 @@ MIDDLEWARE = [ 'django.middleware.clickjacking.XFrameOptionsMiddleware', ] -X_FRAME_OPTIONS = 'SAMEORIGIN' - -# We allow CORS from localhost:8080 -CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8080,https://localhost:8080,http://localhost:4200").split(",")) - -# If auth is disabled, we just use our "bypass" authentication middleware -if bool(os.getenv("PAPERLESS_DISABLE_LOGIN", "false").lower() in ("yes", "y", "1", "t", "true")): - _index = MIDDLEWARE.index("django.contrib.auth.middleware.AuthenticationMiddleware") - MIDDLEWARE[_index] = "paperless.middleware.Middleware" - ROOT_URLCONF = 'paperless.urls' +LOGIN_URL = "admin:login" + +FORCE_SCRIPT_NAME = os.getenv("PAPERLESS_FORCE_SCRIPT_NAME") + +WSGI_APPLICATION = 'paperless.wsgi.application' + +STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", "/static/") + +# what is this used for? 
TEMPLATES = [ { 'BACKEND': 'django.template.backends.django.DjangoTemplates', @@ -146,38 +112,40 @@ TEMPLATES = [ }, ] -WSGI_APPLICATION = 'paperless.wsgi.application' +############################################################################### +# Security # +############################################################################### +# NEVER RUN WITH DEBUG IN PRODUCTION. +DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO") -# Database -# https://docs.djangoproject.com/en/1.10/ref/settings/#databases +X_FRAME_OPTIONS = 'SAMEORIGIN' -DATABASES = { - "default": { - "ENGINE": "django.db.backends.sqlite3", - "NAME": os.path.join( - DATA_DIR, - "db.sqlite3" - ) - } -} +# We allow CORS from localhost:8080 +CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8080,https://localhost:8080").split(",")) -if os.getenv("PAPERLESS_DBENGINE"): - DATABASES["default"] = { - "ENGINE": os.getenv("PAPERLESS_DBENGINE"), - "NAME": os.getenv("PAPERLESS_DBNAME", "paperless"), - "USER": os.getenv("PAPERLESS_DBUSER"), - } - if os.getenv("PAPERLESS_DBPASS"): - DATABASES["default"]["PASSWORD"] = os.getenv("PAPERLESS_DBPASS") - if os.getenv("PAPERLESS_DBHOST"): - DATABASES["default"]["HOST"] = os.getenv("PAPERLESS_DBHOST") - if os.getenv("PAPERLESS_DBPORT"): - DATABASES["default"]["PORT"] = os.getenv("PAPERLESS_DBPORT") +if DEBUG: + # Allow access from the angular development server during debugging + CORS_ORIGIN_WHITELIST += ('http://localhost:4200',) +# If auth is disabled, we just use our "bypass" authentication middleware +if bool(os.getenv("PAPERLESS_DISABLE_LOGIN", "false").lower() in ("yes", "y", "1", "t", "true")): + _index = MIDDLEWARE.index("django.contrib.auth.middleware.AuthenticationMiddleware") + MIDDLEWARE[_index] = "paperless.middleware.Middleware" -# Password validation -# https://docs.djangoproject.com/en/1.10/ref/settings/#auth-password-validators +# The secret key has a default that should be fine so long as you're hosting +# 
Paperless on a closed network. However, if you're putting this anywhere +# public, you should change the key to something unique and verbose. +SECRET_KEY = os.getenv( + "PAPERLESS_SECRET_KEY", + "e11fl1oa-*ytql8p)(06fbj4ukrlo+n7k&q5+$1md7i+mge=ee" +) + +_allowed_hosts = os.getenv("PAPERLESS_ALLOWED_HOSTS") +if _allowed_hosts: + ALLOWED_HOSTS = _allowed_hosts.split(",") +else: + ALLOWED_HOSTS = ["*"] AUTH_PASSWORD_VALIDATORS = [ { @@ -194,9 +162,47 @@ AUTH_PASSWORD_VALIDATORS = [ }, ] +# Disable Django's artificial limit on the number of form fields to submit at +# once. This is a protection against overloading the server, but since this is +# a self-hosted sort of gig, the benefits of being able to mass-delete a tonne +# of log entries outweigh the benefits of such a safeguard. -# Internationalization -# https://docs.djangoproject.com/en/1.10/topics/i18n/ +DATA_UPLOAD_MAX_NUMBER_FIELDS = None + +############################################################################### +# Database # +############################################################################### + +DATABASES = { + "default": { + "ENGINE": "django.db.backends.sqlite3", + "NAME": os.path.join( + DATA_DIR, + "db.sqlite3" + ) + } +} + +# Always have sqlite available as a second option for management commands +# This is important when migrating to/from sqlite +DATABASES['sqlite'] = DATABASES['default'].copy() + +if os.getenv("PAPERLESS_DBENGINE"): + DATABASES["default"] = { + "ENGINE": os.getenv("PAPERLESS_DBENGINE"), + "NAME": os.getenv("PAPERLESS_DBNAME", "paperless"), + "USER": os.getenv("PAPERLESS_DBUSER"), + } + if os.getenv("PAPERLESS_DBPASS"): + DATABASES["default"]["PASSWORD"] = os.getenv("PAPERLESS_DBPASS") + if os.getenv("PAPERLESS_DBHOST"): + DATABASES["default"]["HOST"] = os.getenv("PAPERLESS_DBHOST") + if os.getenv("PAPERLESS_DBPORT"): + DATABASES["default"]["PORT"] = os.getenv("PAPERLESS_DBPORT") + +############################################################################### +# 
Internationalization # +############################################################################### LANGUAGE_CODE = 'en-us' @@ -208,32 +214,9 @@ USE_L10N = True USE_TZ = True - -# Static files (CSS, JavaScript, Images) -# https://docs.djangoproject.com/en/1.10/howto/static-files/ - -STATIC_ROOT = os.getenv( - "PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "static")) - -STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", "/static/") - - -# Other - -# Disable Django's artificial limit on the number of form fields to submit at -# once. This is a protection against overloading the server, but since this is -# a self-hosted sort of gig, the benefits of being able to mass-delete a tonne -# of log entries outweight the benefits of such a safeguard. - -DATA_UPLOAD_MAX_NUMBER_FIELDS = None - - -# Paperless-specific stuff -# You shouldn't have to edit any of these values. Rather, you can set these -# values in /etc/paperless.conf instead. -# ---------------------------------------------------------------------------- - -# Logging +############################################################################### +# Logging # +############################################################################### LOGGING = { "version": 1, @@ -254,18 +237,20 @@ LOGGING = { }, } +############################################################################### +# Paperless Specific Settings # +############################################################################### # The default language that tesseract will attempt to use when parsing # documents. It should be a 3-letter language code consistent with ISO 639. OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") # The amount of threads to use for OCR -OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", 4)) +OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", multiprocessing.cpu_count())) # OCR all documents? 
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false") - # GNUPG needs a home directory for some reason GNUPG_HOME = os.getenv("HOME", "/tmp") @@ -275,13 +260,8 @@ CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300)) -# Ghostscript GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") - -# OptiPNG OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng") - -# Unpaper UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") # This will be created if it doesn't exist @@ -290,14 +270,6 @@ SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless") # This is where Paperless will look for PDFs to index CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR") - -# (This setting is ignored on Linux where inotify is used instead of a -# polling loop.) -# The number of seconds that Paperless will wait between checking -# CONSUMPTION_DIR. If you tend to write documents to this directory very -# slowly, you may want to use a higher value than the default. -CONSUMER_LOOP_TIME = int(os.getenv("PAPERLESS_CONSUMER_LOOP_TIME", 10)) - # Pre-2.x versions of Paperless stored your documents locally with GPG # encryption, but that is no longer the default. This behaviour is still # available, but it must be explicitly enabled by setting