Merge branch 'dev' into celery-tasks

2026-01-10 21:34:20 -06:00 · 2020-11-19 22:10:57 +01:00
parent 9e81c82452 1655d85a53
commit 17430210a1
145 changed files with 5228 additions and 11538 deletions
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -1,4 +1,5 @@
 import json
+import math
 import multiprocessing
 import os
 import re
@@ -13,6 +14,18 @@ elif os.path.exists("/etc/paperless.conf"):
 elif os.path.exists("/usr/local/etc/paperless.conf"):
    load_dotenv("/usr/local/etc/paperless.conf")

+# There are multiple levels of concurrency in paperless:
+#  - Multiple consumers may be run in parallel.
+#  - Each consumer may process multiple pages in parallel.
+#  - Each Tesseract OCR run may spawn multiple threads to process a single page
+#    slightly faster.
+# The performance gains from having tesseract use multiple threads are minimal.
+# However, when multiple pages are processed in parallel, the total number of
+# OCR threads may exceed the number of available cpu cores, which will
+# dramatically slow down the consumption process. This settings limits each
+# Tesseract process to one thread.
+os.environ['OMP_THREAD_LIMIT'] = "1"
+

 def __get_boolean(key, default="NO"):
    """
@@ -21,9 +34,11 @@ def __get_boolean(key, default="NO"):
    """
    return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))

+
 # NEVER RUN WITH DEBUG IN PRODUCTION.
 DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")

+
 ###############################################################################
 # Directories                                                                 #
 ###############################################################################
@@ -65,6 +80,7 @@ INSTALLED_APPS = [
    "documents.apps.DocumentsConfig",
    "paperless_tesseract.apps.PaperlessTesseractConfig",
    "paperless_text.apps.PaperlessTextConfig",
+    "paperless_mail.apps.PaperlessMailConfig",

    "django.contrib.admin",

@@ -139,11 +155,11 @@ else:
    X_FRAME_OPTIONS = 'SAMEORIGIN'

 # We allow CORS from localhost:8080
-CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8080,https://localhost:8080").split(","))
+CORS_ALLOWED_ORIGINS = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8000").split(","))

 if DEBUG:
    # Allow access from the angular development server during debugging
-    CORS_ORIGIN_WHITELIST += ('http://localhost:4200',)
+    CORS_ALLOWED_ORIGINS += ('http://localhost:4200',)

 # The secret key has a default that should be fine so long as you're hosting
 # Paperless on a closed network.  However, if you're putting this anywhere
@@ -195,11 +211,11 @@ DATABASES = {
    }
 }

-# Always have sqlite available as a second option for management commands
-# This is important when migrating to/from sqlite
-DATABASES['sqlite'] = DATABASES['default'].copy()
-
 if os.getenv("PAPERLESS_DBHOST"):
+    # Have sqlite available as a second option for management commands
+    # This is important when migrating to/from sqlite
+    DATABASES['sqlite'] = DATABASES['default'].copy()
+
    DATABASES["default"] = {
        "ENGINE": "django.db.backends.postgresql_psycopg2",
        "HOST": os.getenv("PAPERLESS_DBHOST"),
@@ -244,6 +260,14 @@ LOGGING = {
            "handlers": ["dbhandler", "streamhandler"],
            "level": "DEBUG"
        },
+        "paperless_mail": {
+            "handlers": ["dbhandler", "streamhandler"],
+            "level": "DEBUG"
+        },
+        "paperless_tesseract": {
+            "handlers": ["dbhandler", "streamhandler"],
+            "level": "DEBUG"
+        },
    },
 }

@@ -251,22 +275,60 @@ LOGGING = {
 # Task queue                                                                  #
 ###############################################################################

+
+# Sensible defaults for multitasking:
+# use a fair balance between worker processes and threads epr worker so that
+# both consuming many documents in parallel and consuming large documents is
+# reasonably fast.
+# Favors threads per worker on smaller systems and never exceeds cpu_count()
+# in total.
+
+def default_task_workers():
+    try:
+        return max(
+            math.floor(math.sqrt(multiprocessing.cpu_count())),
+            1
+        )
+    except NotImplementedError:
+        return 1
+
+
+TASK_WORKERS = int(os.getenv("PAPERLESS_TASK_WORKERS", default_task_workers()))
+
 Q_CLUSTER = {
    'name': 'paperless',
    'catch_up': False,
+    'workers': TASK_WORKERS,
    'redis': os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
 }

+
+def default_threads_per_worker():
+    try:
+        return max(
+            math.floor(multiprocessing.cpu_count() / TASK_WORKERS),
+            1
+        )
+    except NotImplementedError:
+        return 1
+
+
+THREADS_PER_WORKER = os.getenv("PAPERLESS_THREADS_PER_WORKER", default_threads_per_worker())
+
 ###############################################################################
 # Paperless Specific Settings                                                 #
 ###############################################################################

+CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))
+
+CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
+
+OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
+
 # The default language that tesseract will attempt to use when parsing
 # documents.  It should be a 3-letter language code consistent with ISO 639.
 OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")

-# The amount of threads to use for OCR
-OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", multiprocessing.cpu_count()))

 # OCR all documents?
 OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
@@ -311,6 +373,7 @@ FILENAME_PARSE_TRANSFORMS = []
 for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
    FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))

+# TODO: this should not have a prefix.
 # Specify the filename format for out files
 PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")