Merge branch 'dev' into celery-tasks

2025-12-24 02:05:48 -06:00 · 2020-11-19 22:10:57 +01:00
parent 4253f4aca7 cbee56ae8c
commit 196faa8fdc
145 changed files with 5228 additions and 11538 deletions
--- a/src/paperless/checks.py
+++ b/src/paperless/checks.py
@@ -11,6 +11,8 @@ writeable_hint = (
    "Set the permissions of {} to be writeable by the user running the "
    "Paperless services"
 )
+
+
 def path_check(env_var):
    messages = []
    directory = os.getenv(env_var)
@@ -27,6 +29,7 @@ def path_check(env_var):
            ))
    return messages

+
@register()
 def paths_check(app_configs, **kwargs):
    """
@@ -34,9 +37,9 @@ def paths_check(app_configs, **kwargs):
    """

    check_messages = path_check("PAPERLESS_DATA_DIR") + \
-                     path_check("PAPERLESS_MEDIA_ROOT") + \
-                     path_check("PAPERLESS_CONSUMPTION_DIR") + \
-                     path_check("PAPERLESS_STATICDIR")
+        path_check("PAPERLESS_MEDIA_ROOT") + \
+        path_check("PAPERLESS_CONSUMPTION_DIR") + \
+        path_check("PAPERLESS_STATICDIR")

    return check_messages

@@ -64,3 +67,16 @@ def binaries_check(app_configs, **kwargs):
            check_messages.append(Warning(error.format(binary), hint))

    return check_messages
+
+
+@register()
+def debug_mode_check(app_configs, **kwargs):
+    if settings.DEBUG:
+        return [Warning(
+            "DEBUG mode is enabled. Disable Debug mode. This is a serious "
+            "security issue, since it puts security overides in place which "
+            "are meant to be only used during development. This "
+            "also means that paperless will tell anyone various "
+            "debugging information when something goes wrong.")]
+    else:
+        return []
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -1,4 +1,5 @@
 import json
+import math
 import multiprocessing
 import os
 import re
@@ -13,6 +14,18 @@ elif os.path.exists("/etc/paperless.conf"):
 elif os.path.exists("/usr/local/etc/paperless.conf"):
    load_dotenv("/usr/local/etc/paperless.conf")

+# There are multiple levels of concurrency in paperless:
+#  - Multiple consumers may be run in parallel.
+#  - Each consumer may process multiple pages in parallel.
+#  - Each Tesseract OCR run may spawn multiple threads to process a single page
+#    slightly faster.
+# The performance gains from having tesseract use multiple threads are minimal.
+# However, when multiple pages are processed in parallel, the total number of
+# OCR threads may exceed the number of available cpu cores, which will
+# dramatically slow down the consumption process. This setting limits each
+# Tesseract process to one thread.
+os.environ['OMP_THREAD_LIMIT'] = "1"
+

 def __get_boolean(key, default="NO"):
    """
@@ -21,9 +34,11 @@ def __get_boolean(key, default="NO"):
    """
    return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))

+
 # NEVER RUN WITH DEBUG IN PRODUCTION.
 DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")

+
 ###############################################################################
 # Directories                                                                 #
 ###############################################################################
@@ -65,6 +80,7 @@ INSTALLED_APPS = [
    "documents.apps.DocumentsConfig",
    "paperless_tesseract.apps.PaperlessTesseractConfig",
    "paperless_text.apps.PaperlessTextConfig",
+    "paperless_mail.apps.PaperlessMailConfig",

    "django.contrib.admin",

@@ -139,11 +155,11 @@ else:
    X_FRAME_OPTIONS = 'SAMEORIGIN'

 # We allow CORS from localhost:8080
-CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8080,https://localhost:8080").split(","))
+CORS_ALLOWED_ORIGINS = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8000").split(","))

 if DEBUG:
    # Allow access from the angular development server during debugging
-    CORS_ORIGIN_WHITELIST += ('http://localhost:4200',)
+    CORS_ALLOWED_ORIGINS += ('http://localhost:4200',)

 # The secret key has a default that should be fine so long as you're hosting
 # Paperless on a closed network.  However, if you're putting this anywhere
@@ -195,11 +211,11 @@ DATABASES = {
    }
 }

-# Always have sqlite available as a second option for management commands
-# This is important when migrating to/from sqlite
-DATABASES['sqlite'] = DATABASES['default'].copy()
-
 if os.getenv("PAPERLESS_DBHOST"):
+    # Have sqlite available as a second option for management commands
+    # This is important when migrating to/from sqlite
+    DATABASES['sqlite'] = DATABASES['default'].copy()
+
    DATABASES["default"] = {
        "ENGINE": "django.db.backends.postgresql_psycopg2",
        "HOST": os.getenv("PAPERLESS_DBHOST"),
@@ -244,6 +260,14 @@ LOGGING = {
            "handlers": ["dbhandler", "streamhandler"],
            "level": "DEBUG"
        },
+        "paperless_mail": {
+            "handlers": ["dbhandler", "streamhandler"],
+            "level": "DEBUG"
+        },
+        "paperless_tesseract": {
+            "handlers": ["dbhandler", "streamhandler"],
+            "level": "DEBUG"
+        },
    },
 }

@@ -251,22 +275,60 @@ LOGGING = {
 # Task queue                                                                  #
 ###############################################################################

+
+# Sensible defaults for multitasking:
+# use a fair balance between worker processes and threads per worker so that
+# both consuming many documents in parallel and consuming large documents is
+# reasonably fast.
+# Favors threads per worker on smaller systems and never exceeds cpu_count()
+# in total.
+
+def default_task_workers():
+    try:
+        return max(
+            math.floor(math.sqrt(multiprocessing.cpu_count())),
+            1
+        )
+    except NotImplementedError:
+        return 1
+
+
+TASK_WORKERS = int(os.getenv("PAPERLESS_TASK_WORKERS", default_task_workers()))
+
 Q_CLUSTER = {
    'name': 'paperless',
    'catch_up': False,
+    'workers': TASK_WORKERS,
    'redis': os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
 }

+
+def default_threads_per_worker():
+    try:
+        return max(
+            math.floor(multiprocessing.cpu_count() / TASK_WORKERS),
+            1
+        )
+    except NotImplementedError:
+        return 1
+
+
+THREADS_PER_WORKER = int(os.getenv("PAPERLESS_THREADS_PER_WORKER", default_threads_per_worker()))
+
 ###############################################################################
 # Paperless Specific Settings                                                 #
 ###############################################################################

+CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))
+
+CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
+
+OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
+
 # The default language that tesseract will attempt to use when parsing
 # documents.  It should be a 3-letter language code consistent with ISO 639.
 OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")

-# The amount of threads to use for OCR
-OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", multiprocessing.cpu_count()))

 # OCR all documents?
 OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
@@ -311,6 +373,7 @@ FILENAME_PARSE_TRANSFORMS = []
 for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
    FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))

+# TODO: this should not have a prefix.
 # Specify the filename format for out files
 PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")

--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -1,4 +1,4 @@
-from django.conf.urls import include, url
+from django.conf.urls import include
 from django.contrib import admin
 from django.contrib.auth.decorators import login_required
 from django.urls import path, re_path
@@ -7,7 +7,6 @@ from django.views.generic import RedirectView
 from rest_framework.routers import DefaultRouter

 from paperless.consumers import StatusConsumer
-from paperless.views import FaviconView
 from documents.views import (
    CorrespondentViewSet,
    DocumentViewSet,
@@ -19,6 +18,7 @@ from documents.views import (
    SearchAutoCompleteView,
    StatisticsView
 )
+from paperless.views import FaviconView

 api_router = DefaultRouter()
 api_router.register(r"correspondents", CorrespondentViewSet)
@@ -31,32 +31,32 @@ api_router.register(r"tags", TagViewSet)
 urlpatterns = [

    # API
-    url(r"^api/auth/",include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
-    url(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
-    url(r"^api/search/", SearchView.as_view(), name="search"),
-    url(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
-    url(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
+    re_path(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
+    re_path(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
+    re_path(r"^api/search/", SearchView.as_view(), name="search"),
+    re_path(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
+    re_path(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),

    # Favicon
-    url(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
+    re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),

    # The Django admin
-    url(r"admin/", admin.site.urls),
+    re_path(r"admin/", admin.site.urls),

    # These redirects are here to support clients that use the old FetchView.
-    url(
+    re_path(
        r"^fetch/doc/(?P<pk>\d+)$",
        RedirectView.as_view(url='/api/documents/%(pk)s/download/'),
    ),
-    url(
+    re_path(
        r"^fetch/thumb/(?P<pk>\d+)$",
        RedirectView.as_view(url='/api/documents/%(pk)s/thumb/'),
    ),
-    url(
+    re_path(
        r"^fetch/preview/(?P<pk>\d+)$",
        RedirectView.as_view(url='/api/documents/%(pk)s/preview/'),
    ),
-    url(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
+    re_path(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),

    # Frontend assets TODO: this is pretty bad.
    path('assets/<path:path>', RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
@@ -64,7 +64,7 @@ urlpatterns = [
    path('accounts/', include('django.contrib.auth.urls')),

    # Root of the Frontent
-    url(r".*", login_required(IndexView.as_view())),
+    re_path(r".*", login_required(IndexView.as_view())),

 ]

@@ -74,8 +74,8 @@ websocket_urlpatterns = [
 ]

 # Text in each page's <h1> (and above login form).
-admin.site.site_header = 'Paperless'
+admin.site.site_header = 'Paperless-ng'
 # Text at the end of each page's <title>.
-admin.site.site_title = 'Paperless'
+admin.site.site_title = 'Paperless-ng'
 # Text at the top of the admin index page.
-admin.site.index_title = 'Paperless administration'
+admin.site.index_title = 'Paperless-ng administration'
--- a/src/paperless/version.py
+++ b/src/paperless/version.py
@@ -1 +1 @@
-__version__ = (1, 0, 0)
+__version__ = (0, 9, 1)