Merge branch 'dev' into celery-tasks

This commit is contained in:
Jonas Winkler
2020-11-19 22:10:57 +01:00
145 changed files with 5228 additions and 11538 deletions

View File

@@ -11,6 +11,8 @@ writeable_hint = (
"Set the permissions of {} to be writeable by the user running the "
"Paperless services"
)
def path_check(env_var):
messages = []
directory = os.getenv(env_var)
@@ -27,6 +29,7 @@ def path_check(env_var):
))
return messages
@register()
def paths_check(app_configs, **kwargs):
"""
@@ -34,9 +37,9 @@ def paths_check(app_configs, **kwargs):
"""
check_messages = path_check("PAPERLESS_DATA_DIR") + \
path_check("PAPERLESS_MEDIA_ROOT") + \
path_check("PAPERLESS_CONSUMPTION_DIR") + \
path_check("PAPERLESS_STATICDIR")
path_check("PAPERLESS_MEDIA_ROOT") + \
path_check("PAPERLESS_CONSUMPTION_DIR") + \
path_check("PAPERLESS_STATICDIR")
return check_messages
@@ -64,3 +67,16 @@ def binaries_check(app_configs, **kwargs):
check_messages.append(Warning(error.format(binary), hint))
return check_messages
@register()
def debug_mode_check(app_configs, **kwargs):
    """
    Check registered via ``register()`` that complains when DEBUG is on.

    Returns a list holding a single ``Warning`` while ``settings.DEBUG`` is
    truthy, and an empty list otherwise. ``app_configs`` and ``kwargs`` are
    accepted but not used.
    """
    # Nothing to report when debug mode is off.
    if not settings.DEBUG:
        return []

    debug_warning = Warning(
        "DEBUG mode is enabled. Disable Debug mode. This is a serious "
        "security issue, since it puts security overides in place which "
        "are meant to be only used during development. This "
        "also means that paperless will tell anyone various "
        "debugging information when something goes wrong.")
    return [debug_warning]

View File

@@ -1,4 +1,5 @@
import json
import math
import multiprocessing
import os
import re
@@ -13,6 +14,18 @@ elif os.path.exists("/etc/paperless.conf"):
elif os.path.exists("/usr/local/etc/paperless.conf"):
load_dotenv("/usr/local/etc/paperless.conf")
# There are multiple levels of concurrency in paperless:
# - Multiple consumers may be run in parallel.
# - Each consumer may process multiple pages in parallel.
# - Each Tesseract OCR run may spawn multiple threads to process a single page
# slightly faster.
# The performance gains from having tesseract use multiple threads are minimal.
# However, when multiple pages are processed in parallel, the total number of
# OCR threads may exceed the number of available cpu cores, which will
# dramatically slow down the consumption process. This setting limits each
# Tesseract process to one thread.
os.environ['OMP_THREAD_LIMIT'] = "1"
def __get_boolean(key, default="NO"):
"""
@@ -21,9 +34,11 @@ def __get_boolean(key, default="NO"):
"""
return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))
# NEVER RUN WITH DEBUG IN PRODUCTION.
DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")
###############################################################################
# Directories #
###############################################################################
@@ -65,6 +80,7 @@ INSTALLED_APPS = [
"documents.apps.DocumentsConfig",
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig",
"django.contrib.admin",
@@ -139,11 +155,11 @@ else:
X_FRAME_OPTIONS = 'SAMEORIGIN'
# We allow CORS from localhost:8080
CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8080,https://localhost:8080").split(","))
CORS_ALLOWED_ORIGINS = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8000").split(","))
if DEBUG:
# Allow access from the angular development server during debugging
CORS_ORIGIN_WHITELIST += ('http://localhost:4200',)
CORS_ALLOWED_ORIGINS += ('http://localhost:4200',)
# The secret key has a default that should be fine so long as you're hosting
# Paperless on a closed network. However, if you're putting this anywhere
@@ -195,11 +211,11 @@ DATABASES = {
}
}
# Always have sqlite available as a second option for management commands
# This is important when migrating to/from sqlite
DATABASES['sqlite'] = DATABASES['default'].copy()
if os.getenv("PAPERLESS_DBHOST"):
# Have sqlite available as a second option for management commands
# This is important when migrating to/from sqlite
DATABASES['sqlite'] = DATABASES['default'].copy()
DATABASES["default"] = {
"ENGINE": "django.db.backends.postgresql_psycopg2",
"HOST": os.getenv("PAPERLESS_DBHOST"),
@@ -244,6 +260,14 @@ LOGGING = {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
},
"paperless_mail": {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
},
"paperless_tesseract": {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
},
},
}
@@ -251,22 +275,60 @@ LOGGING = {
# Task queue #
###############################################################################
# Sensible defaults for multitasking:
# use a fair balance between worker processes and threads per worker so that
# both consuming many documents in parallel and consuming large documents is
# reasonably fast.
# Favors threads per worker on smaller systems and never exceeds cpu_count()
# in total.
def default_task_workers():
    """
    Compute the default number of task worker processes.

    The result is the square root of the CPU count, rounded down, and never
    less than 1. If the CPU count cannot be determined
    (``NotImplementedError``), 1 is returned.
    """
    try:
        cpu_total = multiprocessing.cpu_count()
    except NotImplementedError:
        # CPU count unavailable on this platform: use a single worker.
        return 1

    workers = math.floor(math.sqrt(cpu_total))
    return workers if workers > 1 else 1
# Number of task queue worker processes; can be overridden through the
# PAPERLESS_TASK_WORKERS environment variable.
TASK_WORKERS = int(
    os.environ.get("PAPERLESS_TASK_WORKERS", default_task_workers())
)

# Task queue cluster configuration.
Q_CLUSTER = dict(
    name='paperless',
    catch_up=False,
    workers=TASK_WORKERS,
    redis=os.environ.get("PAPERLESS_REDIS", "redis://localhost:6379"),
)
def default_threads_per_worker():
    """
    Compute the default number of threads each task worker may use.

    The result is the CPU count divided by ``TASK_WORKERS``, rounded down,
    and never less than 1. If the CPU count cannot be determined
    (``NotImplementedError``), 1 is returned.
    """
    try:
        cpu_total = multiprocessing.cpu_count()
    except NotImplementedError:
        # CPU count unavailable on this platform: use a single thread.
        return 1

    share = math.floor(cpu_total / TASK_WORKERS)
    return share if share > 1 else 1
# Threads available to each task worker; can be overridden through the
# PAPERLESS_THREADS_PER_WORKER environment variable. Environment values are
# always strings, so cast to int to keep the type identical to the computed
# default (same treatment as TASK_WORKERS).
THREADS_PER_WORKER = int(
    os.getenv("PAPERLESS_THREADS_PER_WORKER", default_threads_per_worker())
)
###############################################################################
# Paperless Specific Settings #
###############################################################################
CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))
CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
# The default language that tesseract will attempt to use when parsing
# documents. It should be a 3-letter language code consistent with ISO 639.
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
# The amount of threads to use for OCR
OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", multiprocessing.cpu_count()))
# OCR all documents?
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
@@ -311,6 +373,7 @@ FILENAME_PARSE_TRANSFORMS = []
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
# TODO: this should not have a prefix.
# Specify the filename format for out files
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")

View File

@@ -1,4 +1,4 @@
from django.conf.urls import include, url
from django.conf.urls import include
from django.contrib import admin
from django.contrib.auth.decorators import login_required
from django.urls import path, re_path
@@ -7,7 +7,6 @@ from django.views.generic import RedirectView
from rest_framework.routers import DefaultRouter
from paperless.consumers import StatusConsumer
from paperless.views import FaviconView
from documents.views import (
CorrespondentViewSet,
DocumentViewSet,
@@ -19,6 +18,7 @@ from documents.views import (
SearchAutoCompleteView,
StatisticsView
)
from paperless.views import FaviconView
api_router = DefaultRouter()
api_router.register(r"correspondents", CorrespondentViewSet)
@@ -31,32 +31,32 @@ api_router.register(r"tags", TagViewSet)
urlpatterns = [
# API
url(r"^api/auth/",include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
url(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
url(r"^api/search/", SearchView.as_view(), name="search"),
url(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
url(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
re_path(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
re_path(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
re_path(r"^api/search/", SearchView.as_view(), name="search"),
re_path(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
re_path(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
# Favicon
url(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
# The Django admin
url(r"admin/", admin.site.urls),
re_path(r"admin/", admin.site.urls),
# These redirects are here to support clients that use the old FetchView.
url(
re_path(
r"^fetch/doc/(?P<pk>\d+)$",
RedirectView.as_view(url='/api/documents/%(pk)s/download/'),
),
url(
re_path(
r"^fetch/thumb/(?P<pk>\d+)$",
RedirectView.as_view(url='/api/documents/%(pk)s/thumb/'),
),
url(
re_path(
r"^fetch/preview/(?P<pk>\d+)$",
RedirectView.as_view(url='/api/documents/%(pk)s/preview/'),
),
url(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
re_path(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
# Frontend assets TODO: this is pretty bad.
path('assets/<path:path>', RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
@@ -64,7 +64,7 @@ urlpatterns = [
path('accounts/', include('django.contrib.auth.urls')),
# Root of the Frontend
url(r".*", login_required(IndexView.as_view())),
re_path(r".*", login_required(IndexView.as_view())),
]
@@ -74,8 +74,8 @@ websocket_urlpatterns = [
]
# Text in each page's <h1> (and above login form).
admin.site.site_header = 'Paperless'
admin.site.site_header = 'Paperless-ng'
# Text at the end of each page's <title>.
admin.site.site_title = 'Paperless'
admin.site.site_title = 'Paperless-ng'
# Text at the top of the admin index page.
admin.site.index_title = 'Paperless administration'
admin.site.index_title = 'Paperless-ng administration'

View File

@@ -1 +1 @@
__version__ = (1, 0, 0)
__version__ = (0, 9, 1)