Feature: Paperless AI (#10319)
@@ -169,3 +169,37 @@ class GeneralConfig(BaseConfig):

        self.app_title = app_config.app_title or None
        self.app_logo = app_config.app_logo.url if app_config.app_logo else None


@dataclasses.dataclass
class AIConfig(BaseConfig):
    """
    AI-related settings that require global scope
    """

    ai_enabled: bool = dataclasses.field(init=False)
    llm_embedding_backend: str = dataclasses.field(init=False)
    llm_embedding_model: str = dataclasses.field(init=False)
    llm_backend: str = dataclasses.field(init=False)
    llm_model: str = dataclasses.field(init=False)
    llm_api_key: str = dataclasses.field(init=False)
    llm_endpoint: str = dataclasses.field(init=False)

    def __post_init__(self) -> None:
        app_config = self._get_config_instance()

        self.ai_enabled = app_config.ai_enabled or settings.AI_ENABLED
        self.llm_embedding_backend = (
            app_config.llm_embedding_backend or settings.LLM_EMBEDDING_BACKEND
        )
        self.llm_embedding_model = (
            app_config.llm_embedding_model or settings.LLM_EMBEDDING_MODEL
        )
        self.llm_backend = app_config.llm_backend or settings.LLM_BACKEND
        self.llm_model = app_config.llm_model or settings.LLM_MODEL
        self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY
        self.llm_endpoint = app_config.llm_endpoint or settings.LLM_ENDPOINT

    @property
    def llm_index_enabled(self) -> bool:
        return bool(self.ai_enabled and self.llm_embedding_backend)
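The dataclass reads the database-backed ApplicationConfiguration first and falls back to the PAPERLESS_AI_* environment settings via `or`. A minimal usage sketch (assuming, as with GeneralConfig, that the class lives in paperless.config):

    from paperless.config import AIConfig

    ai_config = AIConfig()
    # Values saved through the UI win; unset fields fall back to settings.*
    if ai_config.llm_index_enabled:
        print(ai_config.llm_embedding_backend, ai_config.llm_embedding_model)

One consequence of the `or` chaining: a value explicitly cleared in the UI falls through to the environment variable, so AI features cannot be force-disabled from the UI while PAPERLESS_AI_ENABLED is set.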
@@ -0,0 +1,84 @@
# Generated by Django 5.2.6 on 2025-09-30 17:43

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("paperless", "0004_applicationconfiguration_barcode_asn_prefix_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="applicationconfiguration",
            name="ai_enabled",
            field=models.BooleanField(
                default=False,
                null=True,
                verbose_name="Enables AI features",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_api_key",
            field=models.CharField(
                blank=True,
                max_length=1024,
                null=True,
                verbose_name="Sets the LLM API key",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_backend",
            field=models.CharField(
                blank=True,
                choices=[("openai", "OpenAI"), ("ollama", "Ollama")],
                max_length=128,
                null=True,
                verbose_name="Sets the LLM backend",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_embedding_backend",
            field=models.CharField(
                blank=True,
                choices=[("openai", "OpenAI"), ("huggingface", "Huggingface")],
                max_length=128,
                null=True,
                verbose_name="Sets the LLM embedding backend",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_embedding_model",
            field=models.CharField(
                blank=True,
                max_length=128,
                null=True,
                verbose_name="Sets the LLM embedding model",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_endpoint",
            field=models.CharField(
                blank=True,
                max_length=256,
                null=True,
                verbose_name="Sets the LLM endpoint, optional",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_model",
            field=models.CharField(
                blank=True,
                max_length=128,
                null=True,
                verbose_name="Sets the LLM model",
            ),
        ),
    ]
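Every new column is nullable, and ai_enabled additionally carries default=False, so the migration backfills the existing singleton row without prompting. A hedged check in the Django shell after migrating:

    from paperless.models import ApplicationConfiguration

    config = ApplicationConfiguration.objects.first()
    assert config.ai_enabled is False   # backfilled from the field default
    assert config.llm_backend is None   # unset until configured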
@@ -74,6 +74,20 @@ class ColorConvertChoices(models.TextChoices):
    CMYK = ("CMYK", _("CMYK"))


class LLMEmbeddingBackend(models.TextChoices):
    OPENAI = ("openai", _("OpenAI"))
    HUGGINGFACE = ("huggingface", _("Huggingface"))


class LLMBackend(models.TextChoices):
    """
    Matches to --llm-backend
    """

    OPENAI = ("openai", _("OpenAI"))
    OLLAMA = ("ollama", _("Ollama"))


class ApplicationConfiguration(AbstractSingletonModel):
    """
    Settings which are common across more than one parser
    """
@@ -265,6 +279,60 @@ class ApplicationConfiguration(AbstractSingletonModel):
        null=True,
    )

    """
    AI-related settings
    """

    ai_enabled = models.BooleanField(
        verbose_name=_("Enables AI features"),
        null=True,
        default=False,
    )

    llm_embedding_backend = models.CharField(
        verbose_name=_("Sets the LLM embedding backend"),
        blank=True,
        null=True,
        max_length=128,
        choices=LLMEmbeddingBackend.choices,
    )

    llm_embedding_model = models.CharField(
        verbose_name=_("Sets the LLM embedding model"),
        blank=True,
        null=True,
        max_length=128,
    )

    llm_backend = models.CharField(
        verbose_name=_("Sets the LLM backend"),
        blank=True,
        null=True,
        max_length=128,
        choices=LLMBackend.choices,
    )

    llm_model = models.CharField(
        verbose_name=_("Sets the LLM model"),
        blank=True,
        null=True,
        max_length=128,
    )

    llm_api_key = models.CharField(
        verbose_name=_("Sets the LLM API key"),
        blank=True,
        null=True,
        max_length=1024,
    )

    llm_endpoint = models.CharField(
        verbose_name=_("Sets the LLM endpoint, optional"),
        blank=True,
        null=True,
        max_length=256,
    )

    class Meta:
        verbose_name = _("paperless application settings")
@@ -206,6 +206,10 @@ class ProfileSerializer(PasswordValidationMixin, serializers.ModelSerializer):
class ApplicationConfigurationSerializer(serializers.ModelSerializer):
    user_args = serializers.JSONField(binary=True, allow_null=True)
    barcode_tag_mapping = serializers.JSONField(binary=True, allow_null=True)
    llm_api_key = ObfuscatedPasswordField(
        required=False,
        allow_null=True,
    )

    def run_validation(self, data):
        # Empty strings are treated as None to avoid unexpected behavior
@@ -215,6 +219,11 @@ class ApplicationConfigurationSerializer(serializers.ModelSerializer):
            data["barcode_tag_mapping"] = None
        if "language" in data and data["language"] == "":
            data["language"] = None
        if "llm_api_key" in data and data["llm_api_key"] is not None:
            if data["llm_api_key"] == "":
                data["llm_api_key"] = None
            elif len(data["llm_api_key"].replace("*", "")) == 0:
                # All-asterisk value is the obfuscated placeholder; drop it
                # so the stored key is left unchanged.
                del data["llm_api_key"]
        return super().run_validation(data)

    def update(self, instance, validated_data):
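For llm_api_key the serializer distinguishes three inputs: an empty string clears the stored key, an all-asterisk string (the obfuscated placeholder that ObfuscatedPasswordField echoes back to clients) is dropped so the stored key survives a read-modify-write round trip, and any other value replaces the key. A standalone sketch of just that normalization, with its expected outcomes:

    def normalize_llm_api_key(data: dict) -> dict:
        # Mirrors the run_validation branch above (sketch, not the serializer itself)
        if "llm_api_key" in data and data["llm_api_key"] is not None:
            if data["llm_api_key"] == "":
                data["llm_api_key"] = None      # clear the stored key
            elif len(data["llm_api_key"].replace("*", "")) == 0:
                del data["llm_api_key"]         # placeholder: keep the existing key
        return data

    assert normalize_llm_api_key({"llm_api_key": "****"}) == {}
    assert normalize_llm_api_key({"llm_api_key": ""}) == {"llm_api_key": None}
    assert normalize_llm_api_key({"llm_api_key": "sk-new"}) == {"llm_api_key": "sk-new"}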
@@ -12,6 +12,7 @@ from typing import Final
from urllib.parse import urlparse

from celery.schedules import crontab
from compression_middleware.middleware import CompressionMiddleware
from dateparser.languages.loader import LocaleDataLoader
from django.utils.translation import gettext_lazy as _
from dotenv import load_dotenv
@@ -229,6 +230,17 @@ def _parse_beat_schedule() -> dict:
                "expires": 59.0 * 60.0,
            },
        },
        {
            "name": "Rebuild LLM index",
            "env_key": "PAPERLESS_LLM_INDEX_TASK_CRON",
            # Default is daily at 02:10
            "env_default": "10 2 * * *",
            "task": "documents.tasks.llmindex_index",
            "options": {
                # Expire 1 hour before the default schedule sends again
                "expires": 23.0 * 60.0 * 60.0,
            },
        },
    ]
    for task in tasks:
        # Either get the environment setting or use the default
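The new entry follows the same pattern as the other periodic tasks: PAPERLESS_LLM_INDEX_TASK_CRON overrides the default, and (per the loop above and the tests later in this commit) the value "disable" removes the task from the schedule entirely. The default expands to the crontab the tests assert:

    from celery.schedules import crontab

    default = crontab(minute=10, hour=2)   # "10 2 * * *", daily at 02:10
    expires = 23.0 * 60.0 * 60.0           # 82800 s, one hour short of the 24 h
                                           # cadence, so a stale run lapses
                                           # before the next one is queued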
@@ -287,6 +299,7 @@ MODEL_FILE = __get_path(
    "PAPERLESS_MODEL_FILE",
    DATA_DIR / "classification_model.pickle",
)
LLM_INDEX_DIR = DATA_DIR / "llm_index"

LOGGING_DIR = __get_path("PAPERLESS_LOGGING_DIR", DATA_DIR / "log")
@@ -380,6 +393,19 @@ MIDDLEWARE = [
if __get_boolean("PAPERLESS_ENABLE_COMPRESSION", "yes"):  # pragma: no cover
    MIDDLEWARE.insert(0, "compression_middleware.middleware.CompressionMiddleware")

# Workaround to not compress streaming responses (e.g. chat).
# See https://github.com/friedelwolff/django-compression-middleware/pull/7
original_process_response = CompressionMiddleware.process_response


def patched_process_response(self, request, response):
    if getattr(request, "compress_exempt", False):
        return response
    return original_process_response(self, request, response)


CompressionMiddleware.process_response = patched_process_response

ROOT_URLCONF = "paperless.urls"
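The patch consults a per-request flag before deferring to the original handler, so a streaming endpoint can exempt itself from compression. A hedged sketch of a view opting out (the view and generator are illustrative; only the compress_exempt attribute comes from the patch above):

    from django.http import StreamingHttpResponse

    def chat(request):
        request.compress_exempt = True  # skip CompressionMiddleware for this response
        return StreamingHttpResponse(
            token_stream(),              # illustrative generator of response chunks
            content_type="text/event-stream",
        )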
@@ -585,6 +611,10 @@ X_FRAME_OPTIONS = "SAMEORIGIN"
# The next 3 settings can also be set using just PAPERLESS_URL
CSRF_TRUSTED_ORIGINS = __get_list("PAPERLESS_CSRF_TRUSTED_ORIGINS")

if DEBUG:
    # Allow access from the angular development server during debugging
    CSRF_TRUSTED_ORIGINS.append("http://localhost:4200")

# We allow CORS from localhost:8000
CORS_ALLOWED_ORIGINS = __get_list(
    "PAPERLESS_CORS_ALLOWED_HOSTS",
@@ -595,6 +625,8 @@ if DEBUG:
    # Allow access from the angular development server during debugging
    CORS_ALLOWED_ORIGINS.append("http://localhost:4200")

CORS_ALLOW_CREDENTIALS = True

CORS_EXPOSE_HEADERS = [
    "Content-Disposition",
]
@@ -868,6 +900,7 @@ LOGGING = {
    "loggers": {
        "paperless": {"handlers": ["file_paperless"], "level": "DEBUG"},
        "paperless_mail": {"handlers": ["file_mail"], "level": "DEBUG"},
        "paperless_ai": {"handlers": ["file_paperless"], "level": "DEBUG"},
        "ocrmypdf": {"handlers": ["file_paperless"], "level": "INFO"},
        "celery": {"handlers": ["file_celery"], "level": "DEBUG"},
        "kombu": {"handlers": ["file_celery"], "level": "DEBUG"},
@@ -1404,3 +1437,16 @@ WEBHOOKS_ALLOW_INTERNAL_REQUESTS = __get_boolean(
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")

################################################################################
# AI Settings                                                                  #
################################################################################
AI_ENABLED = __get_boolean("PAPERLESS_AI_ENABLED", "NO")
LLM_EMBEDDING_BACKEND = os.getenv(
    "PAPERLESS_AI_LLM_EMBEDDING_BACKEND",
)  # "huggingface" or "openai"
LLM_EMBEDDING_MODEL = os.getenv("PAPERLESS_AI_LLM_EMBEDDING_MODEL")
LLM_BACKEND = os.getenv("PAPERLESS_AI_LLM_BACKEND")  # "ollama" or "openai"
LLM_MODEL = os.getenv("PAPERLESS_AI_LLM_MODEL")
LLM_API_KEY = os.getenv("PAPERLESS_AI_LLM_API_KEY")
LLM_ENDPOINT = os.getenv("PAPERLESS_AI_LLM_ENDPOINT")
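A hedged example of wiring these up for a local Ollama setup (variable names come from the getenv calls above; all values are illustrative, not defaults):

    import os

    os.environ["PAPERLESS_AI_ENABLED"] = "true"
    os.environ["PAPERLESS_AI_LLM_BACKEND"] = "ollama"            # or "openai"
    os.environ["PAPERLESS_AI_LLM_MODEL"] = "llama3"              # illustrative
    os.environ["PAPERLESS_AI_LLM_ENDPOINT"] = "http://localhost:11434"
    os.environ["PAPERLESS_AI_LLM_EMBEDDING_BACKEND"] = "huggingface"
    os.environ["PAPERLESS_AI_LLM_EMBEDDING_MODEL"] = "all-MiniLM-L6-v2"  # illustrative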
@@ -160,6 +160,7 @@ class TestCeleryScheduleParsing(TestCase):
    SANITY_EXPIRE_TIME = ((7.0 * 24.0) - 1.0) * 60.0 * 60.0
    EMPTY_TRASH_EXPIRE_TIME = 23.0 * 60.0 * 60.0
    RUN_SCHEDULED_WORKFLOWS_EXPIRE_TIME = 59.0 * 60.0
    LLM_INDEX_EXPIRE_TIME = 23.0 * 60.0 * 60.0

    def test_schedule_configuration_default(self):
        """
@@ -204,6 +205,13 @@ class TestCeleryScheduleParsing(TestCase):
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.RUN_SCHEDULED_WORKFLOWS_EXPIRE_TIME},
                },
                "Rebuild LLM index": {
                    "task": "documents.tasks.llmindex_index",
                    "schedule": crontab(minute=10, hour=2),
                    "options": {
                        "expires": self.LLM_INDEX_EXPIRE_TIME,
                    },
                },
            },
            schedule,
        )
@@ -256,6 +264,13 @@ class TestCeleryScheduleParsing(TestCase):
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.RUN_SCHEDULED_WORKFLOWS_EXPIRE_TIME},
                },
                "Rebuild LLM index": {
                    "task": "documents.tasks.llmindex_index",
                    "schedule": crontab(minute=10, hour=2),
                    "options": {
                        "expires": self.LLM_INDEX_EXPIRE_TIME,
                    },
                },
            },
            schedule,
        )
@@ -300,6 +315,13 @@ class TestCeleryScheduleParsing(TestCase):
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.RUN_SCHEDULED_WORKFLOWS_EXPIRE_TIME},
                },
                "Rebuild LLM index": {
                    "task": "documents.tasks.llmindex_index",
                    "schedule": crontab(minute=10, hour=2),
                    "options": {
                        "expires": self.LLM_INDEX_EXPIRE_TIME,
                    },
                },
            },
            schedule,
        )
@@ -322,6 +344,7 @@ class TestCeleryScheduleParsing(TestCase):
                "PAPERLESS_INDEX_TASK_CRON": "disable",
                "PAPERLESS_EMPTY_TRASH_TASK_CRON": "disable",
                "PAPERLESS_WORKFLOW_SCHEDULED_TASK_CRON": "disable",
                "PAPERLESS_LLM_INDEX_TASK_CRON": "disable",
            },
        ):
            schedule = _parse_beat_schedule()
@@ -18,6 +18,7 @@ from rest_framework.routers import DefaultRouter
from documents.views import BulkDownloadView
from documents.views import BulkEditObjectsView
from documents.views import BulkEditView
from documents.views import ChatStreamingView
from documents.views import CorrespondentViewSet
from documents.views import CustomFieldViewSet
from documents.views import DocumentTypeViewSet
@@ -139,6 +140,11 @@ urlpatterns = [
                SelectionDataView.as_view(),
                name="selection_data",
            ),
            re_path(
                "^chat/",
                ChatStreamingView.as_view(),
                name="chat_streaming_view",
            ),
        ],
    ),
),
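The route is registered inside the API urlpatterns, so the view should answer at /api/chat/ (prefix assumed from the surrounding routes). A heavily hedged client sketch; the request payload accepted by ChatStreamingView is not shown in this diff:

    import requests

    with requests.post(
        "http://localhost:8000/api/chat/",
        headers={"Authorization": "Token <token>"},
        json={"q": "Summarize my newest document"},  # payload shape assumed
        stream=True,
    ) as response:
        for chunk in response.iter_content(chunk_size=None):
            print(chunk.decode(), end="")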
@@ -35,6 +35,7 @@ from rest_framework.viewsets import ModelViewSet

from documents.index import DelayedQuery
from documents.permissions import PaperlessObjectPermissions
from documents.tasks import llmindex_index
from paperless.filters import GroupFilterSet
from paperless.filters import UserFilterSet
from paperless.models import ApplicationConfiguration
@@ -43,6 +44,7 @@ from paperless.serialisers import GroupSerializer
from paperless.serialisers import PaperlessAuthTokenSerializer
from paperless.serialisers import ProfileSerializer
from paperless.serialisers import UserSerializer
from paperless_ai.indexing import vector_store_file_exists


class PaperlessObtainAuthTokenView(ObtainAuthToken):
@@ -358,6 +360,30 @@ class ApplicationConfigurationViewSet(ModelViewSet):
    def create(self, request, *args, **kwargs):
        return Response(status=405)  # Not Allowed

    def perform_update(self, serializer):
        old_instance = ApplicationConfiguration.objects.all().first()
        old_ai_index_enabled = (
            old_instance.ai_enabled and old_instance.llm_embedding_backend
        )

        new_instance: ApplicationConfiguration = serializer.save()
        new_ai_index_enabled = (
            new_instance.ai_enabled and new_instance.llm_embedding_backend
        )

        if (
            not old_ai_index_enabled
            and new_ai_index_enabled
            and not vector_store_file_exists()
        ):
            # AI index was just enabled and vector store file does not exist
            llmindex_index.delay(
                progress_bar_disable=True,
                rebuild=True,
                scheduled=False,
                auto=True,
            )
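The rebuild is queued only on the off-to-on transition and only when no vector store exists on disk, so unrelated settings saves never re-trigger a full index build. The condition as a truth table, verified in a minimal sketch:

    # (old_enabled, new_enabled, store_exists) -> queue rebuild?
    cases = [
        (False, True, False, True),    # just enabled, no index yet: build now
        (False, True, True, False),    # just enabled, index already present: skip
        (True, True, False, False),    # already enabled before: leave to the cron task
        (False, False, False, False),  # still disabled: nothing to do
    ]
    for old, new, exists, expected in cases:
        assert (not old and new and not exists) == expected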


@extend_schema_view(
    post=extend_schema(