mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-08-01 18:37:42 -05:00
Merge branch 'dev' into celery-tasks
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import json
|
||||
import math
|
||||
import multiprocessing
|
||||
import os
|
||||
import re
|
||||
@@ -13,6 +14,18 @@ elif os.path.exists("/etc/paperless.conf"):
|
||||
elif os.path.exists("/usr/local/etc/paperless.conf"):
|
||||
load_dotenv("/usr/local/etc/paperless.conf")
|
||||
|
||||
# There are multiple levels of concurrency in paperless:
|
||||
# - Multiple consumers may be run in parallel.
|
||||
# - Each consumer may process multiple pages in parallel.
|
||||
# - Each Tesseract OCR run may spawn multiple threads to process a single page
|
||||
# slightly faster.
|
||||
# The performance gains from having tesseract use multiple threads are minimal.
|
||||
# However, when multiple pages are processed in parallel, the total number of
|
||||
# OCR threads may exceed the number of available cpu cores, which will
|
||||
# dramatically slow down the consumption process. This settings limits each
|
||||
# Tesseract process to one thread.
|
||||
os.environ['OMP_THREAD_LIMIT'] = "1"
|
||||
|
||||
|
||||
def __get_boolean(key, default="NO"):
|
||||
"""
|
||||
@@ -21,9 +34,11 @@ def __get_boolean(key, default="NO"):
|
||||
"""
|
||||
return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))
|
||||
|
||||
|
||||
# NEVER RUN WITH DEBUG IN PRODUCTION.
|
||||
DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Directories #
|
||||
###############################################################################
|
||||
@@ -65,6 +80,7 @@ INSTALLED_APPS = [
|
||||
"documents.apps.DocumentsConfig",
|
||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
"paperless_text.apps.PaperlessTextConfig",
|
||||
"paperless_mail.apps.PaperlessMailConfig",
|
||||
|
||||
"django.contrib.admin",
|
||||
|
||||
@@ -139,11 +155,11 @@ else:
|
||||
X_FRAME_OPTIONS = 'SAMEORIGIN'
|
||||
|
||||
# We allow CORS from localhost:8080
|
||||
CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8080,https://localhost:8080").split(","))
|
||||
CORS_ALLOWED_ORIGINS = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8000").split(","))
|
||||
|
||||
if DEBUG:
|
||||
# Allow access from the angular development server during debugging
|
||||
CORS_ORIGIN_WHITELIST += ('http://localhost:4200',)
|
||||
CORS_ALLOWED_ORIGINS += ('http://localhost:4200',)
|
||||
|
||||
# The secret key has a default that should be fine so long as you're hosting
|
||||
# Paperless on a closed network. However, if you're putting this anywhere
|
||||
@@ -195,11 +211,11 @@ DATABASES = {
|
||||
}
|
||||
}
|
||||
|
||||
# Always have sqlite available as a second option for management commands
|
||||
# This is important when migrating to/from sqlite
|
||||
DATABASES['sqlite'] = DATABASES['default'].copy()
|
||||
|
||||
if os.getenv("PAPERLESS_DBHOST"):
|
||||
# Have sqlite available as a second option for management commands
|
||||
# This is important when migrating to/from sqlite
|
||||
DATABASES['sqlite'] = DATABASES['default'].copy()
|
||||
|
||||
DATABASES["default"] = {
|
||||
"ENGINE": "django.db.backends.postgresql_psycopg2",
|
||||
"HOST": os.getenv("PAPERLESS_DBHOST"),
|
||||
@@ -244,6 +260,14 @@ LOGGING = {
|
||||
"handlers": ["dbhandler", "streamhandler"],
|
||||
"level": "DEBUG"
|
||||
},
|
||||
"paperless_mail": {
|
||||
"handlers": ["dbhandler", "streamhandler"],
|
||||
"level": "DEBUG"
|
||||
},
|
||||
"paperless_tesseract": {
|
||||
"handlers": ["dbhandler", "streamhandler"],
|
||||
"level": "DEBUG"
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -251,22 +275,60 @@ LOGGING = {
|
||||
# Task queue #
|
||||
###############################################################################
|
||||
|
||||
|
||||
# Sensible defaults for multitasking:
|
||||
# use a fair balance between worker processes and threads epr worker so that
|
||||
# both consuming many documents in parallel and consuming large documents is
|
||||
# reasonably fast.
|
||||
# Favors threads per worker on smaller systems and never exceeds cpu_count()
|
||||
# in total.
|
||||
|
||||
def default_task_workers():
|
||||
try:
|
||||
return max(
|
||||
math.floor(math.sqrt(multiprocessing.cpu_count())),
|
||||
1
|
||||
)
|
||||
except NotImplementedError:
|
||||
return 1
|
||||
|
||||
|
||||
TASK_WORKERS = int(os.getenv("PAPERLESS_TASK_WORKERS", default_task_workers()))
|
||||
|
||||
Q_CLUSTER = {
|
||||
'name': 'paperless',
|
||||
'catch_up': False,
|
||||
'workers': TASK_WORKERS,
|
||||
'redis': os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
|
||||
}
|
||||
|
||||
|
||||
def default_threads_per_worker():
|
||||
try:
|
||||
return max(
|
||||
math.floor(multiprocessing.cpu_count() / TASK_WORKERS),
|
||||
1
|
||||
)
|
||||
except NotImplementedError:
|
||||
return 1
|
||||
|
||||
|
||||
THREADS_PER_WORKER = os.getenv("PAPERLESS_THREADS_PER_WORKER", default_threads_per_worker())
|
||||
|
||||
###############################################################################
|
||||
# Paperless Specific Settings #
|
||||
###############################################################################
|
||||
|
||||
CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))
|
||||
|
||||
CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
|
||||
|
||||
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
|
||||
|
||||
# The default language that tesseract will attempt to use when parsing
|
||||
# documents. It should be a 3-letter language code consistent with ISO 639.
|
||||
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
||||
|
||||
# The amount of threads to use for OCR
|
||||
OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", multiprocessing.cpu_count()))
|
||||
|
||||
# OCR all documents?
|
||||
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
|
||||
@@ -311,6 +373,7 @@ FILENAME_PARSE_TRANSFORMS = []
|
||||
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
|
||||
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
|
||||
|
||||
# TODO: this should not have a prefix.
|
||||
# Specify the filename format for out files
|
||||
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")
|
||||
|
||||
|
Reference in New Issue
Block a user