Feature: Paperless AI (#10319)

2026-01-18 22:14:22 -06:00 · 2026-01-13 08:24:42 -08:00
parent 4347ba1f9c
commit e940764fe0
78 changed files with 5429 additions and 106 deletions
--- a/src/paperless/config.py
+++ b/src/paperless/config.py
@@ -169,3 +169,37 @@ class GeneralConfig(BaseConfig):

        self.app_title = app_config.app_title or None
        self.app_logo = app_config.app_logo.url if app_config.app_logo else None
+
+
+@dataclasses.dataclass
+class AIConfig(BaseConfig):
+    """
+    Globally scoped AI / LLM settings.
+
+    Every value is read from the configuration instance returned by
+    ``_get_config_instance()``; when that value is falsy, the Django setting
+    of the same (upper-cased) name is used instead.
+    """
+
+    ai_enabled: bool = dataclasses.field(init=False)
+    llm_embedding_backend: str = dataclasses.field(init=False)
+    llm_embedding_model: str = dataclasses.field(init=False)
+    llm_backend: str = dataclasses.field(init=False)
+    llm_model: str = dataclasses.field(init=False)
+    llm_api_key: str = dataclasses.field(init=False)
+    llm_endpoint: str = dataclasses.field(init=False)
+
+    def __post_init__(self) -> None:
+        stored = self._get_config_instance()
+
+        # Each attribute shares its name with the stored field and, once
+        # upper-cased, with the fallback setting (ai_enabled -> AI_ENABLED).
+        for attr in (
+            "ai_enabled",
+            "llm_embedding_backend",
+            "llm_embedding_model",
+            "llm_backend",
+            "llm_model",
+            "llm_api_key",
+            "llm_endpoint",
+        ):
+            fallback_name = attr.upper()
+            value = getattr(stored, attr) or getattr(settings, fallback_name)
+            setattr(self, attr, value)
+
+    @property
+    def llm_index_enabled(self) -> bool:
+        """True when AI is enabled and an embedding backend is configured."""
+        if not self.ai_enabled:
+            return False
+        return bool(self.llm_embedding_backend)
--- a/src/paperless/migrations/0005_applicationconfiguration_ai_enabled_and_more.py
+++ b/src/paperless/migrations/0005_applicationconfiguration_ai_enabled_and_more.py
@@ -0,0 +1,84 @@
+# Generated by Django 5.2.6 on 2025-09-30 17:43
+
+from django.db import migrations
+from django.db import models
+
+
+class Migration(migrations.Migration):
+    # Adds the AI / LLM configuration columns to the
+    # ``applicationconfiguration`` model. Every added field is nullable.
+    dependencies = [
+        ("paperless", "0004_applicationconfiguration_barcode_asn_prefix_and_more"),
+    ]
+
+    operations = [
+        # The only added field that carries a default value (False)
+        migrations.AddField(
+            model_name="applicationconfiguration",
+            name="ai_enabled",
+            field=models.BooleanField(
+                default=False,
+                null=True,
+                verbose_name="Enables AI features",
+            ),
+        ),
+        migrations.AddField(
+            model_name="applicationconfiguration",
+            name="llm_api_key",
+            field=models.CharField(
+                blank=True,
+                max_length=1024,
+                null=True,
+                verbose_name="Sets the LLM API key",
+            ),
+        ),
+        # Value is limited to the listed choices: "openai" or "ollama"
+        migrations.AddField(
+            model_name="applicationconfiguration",
+            name="llm_backend",
+            field=models.CharField(
+                blank=True,
+                choices=[("openai", "OpenAI"), ("ollama", "Ollama")],
+                max_length=128,
+                null=True,
+                verbose_name="Sets the LLM backend",
+            ),
+        ),
+        # Value is limited to the listed choices: "openai" or "huggingface"
+        migrations.AddField(
+            model_name="applicationconfiguration",
+            name="llm_embedding_backend",
+            field=models.CharField(
+                blank=True,
+                choices=[("openai", "OpenAI"), ("huggingface", "Huggingface")],
+                max_length=128,
+                null=True,
+                verbose_name="Sets the LLM embedding backend",
+            ),
+        ),
+        migrations.AddField(
+            model_name="applicationconfiguration",
+            name="llm_embedding_model",
+            field=models.CharField(
+                blank=True,
+                max_length=128,
+                null=True,
+                verbose_name="Sets the LLM embedding model",
+            ),
+        ),
+        migrations.AddField(
+            model_name="applicationconfiguration",
+            name="llm_endpoint",
+            field=models.CharField(
+                blank=True,
+                max_length=256,
+                null=True,
+                verbose_name="Sets the LLM endpoint, optional",
+            ),
+        ),
+        migrations.AddField(
+            model_name="applicationconfiguration",
+            name="llm_model",
+            field=models.CharField(
+                blank=True,
+                max_length=128,
+                null=True,
+                verbose_name="Sets the LLM model",
+            ),
+        ),
+    ]
--- a/src/paperless/models.py
+++ b/src/paperless/models.py
@@ -74,6 +74,20 @@ class ColorConvertChoices(models.TextChoices):
    CMYK = ("CMYK", _("CMYK"))


+class LLMEmbeddingBackend(models.TextChoices):
+    """
+    Choices for ApplicationConfiguration.llm_embedding_backend
+    """
+
+    OPENAI = ("openai", _("OpenAI"))
+    HUGGINGFACE = ("huggingface", _("Huggingface"))
+
+
+class LLMBackend(models.TextChoices):
+    """
+    Choices for ApplicationConfiguration.llm_backend
+
+    Matches to --llm-backend
+    """
+
+    OPENAI = ("openai", _("OpenAI"))
+    OLLAMA = ("ollama", _("Ollama"))
+
+
 class ApplicationConfiguration(AbstractSingletonModel):
    """
    Settings which are common across more than 1 parser
@@ -265,6 +279,60 @@ class ApplicationConfiguration(AbstractSingletonModel):
        null=True,
    )

+    """
+    AI related settings
+    """
+
+    # Switch for the AI features as a whole; nullable, False by default
+    ai_enabled = models.BooleanField(
+        verbose_name=_("Enables AI features"),
+        null=True,
+        default=False,
+    )
+
+    # Embedding backend, restricted to the LLMEmbeddingBackend choices
+    llm_embedding_backend = models.CharField(
+        verbose_name=_("Sets the LLM embedding backend"),
+        blank=True,
+        null=True,
+        max_length=128,
+        choices=LLMEmbeddingBackend.choices,
+    )
+
+    # Free-text name of the embedding model (no choices enforced)
+    llm_embedding_model = models.CharField(
+        verbose_name=_("Sets the LLM embedding model"),
+        blank=True,
+        null=True,
+        max_length=128,
+    )
+
+    # LLM backend, restricted to the LLMBackend choices
+    llm_backend = models.CharField(
+        verbose_name=_("Sets the LLM backend"),
+        blank=True,
+        null=True,
+        max_length=128,
+        choices=LLMBackend.choices,
+    )
+
+    # Free-text name of the LLM model (no choices enforced)
+    llm_model = models.CharField(
+        verbose_name=_("Sets the LLM model"),
+        blank=True,
+        null=True,
+        max_length=128,
+    )
+
+    # API key for the LLM backend; optional, up to 1024 characters
+    llm_api_key = models.CharField(
+        verbose_name=_("Sets the LLM API key"),
+        blank=True,
+        null=True,
+        max_length=1024,
+    )
+
+    # Optional endpoint for the LLM backend, up to 256 characters
+    llm_endpoint = models.CharField(
+        verbose_name=_("Sets the LLM endpoint, optional"),
+        blank=True,
+        null=True,
+        max_length=256,
+    )
+
    class Meta:
        verbose_name = _("paperless application settings")

--- a/src/paperless/serialisers.py
+++ b/src/paperless/serialisers.py
@@ -206,6 +206,10 @@ class ProfileSerializer(PasswordValidationMixin, serializers.ModelSerializer):
 class ApplicationConfigurationSerializer(serializers.ModelSerializer):
    user_args = serializers.JSONField(binary=True, allow_null=True)
    barcode_tag_mapping = serializers.JSONField(binary=True, allow_null=True)
+    llm_api_key = ObfuscatedPasswordField(
+        required=False,
+        allow_null=True,
+    )

    def run_validation(self, data):
        # Empty strings treated as None to avoid unexpected behavior
@@ -215,6 +219,11 @@ class ApplicationConfigurationSerializer(serializers.ModelSerializer):
            data["barcode_tag_mapping"] = None
        if "language" in data and data["language"] == "":
            data["language"] = None
+        if "llm_api_key" in data and data["llm_api_key"] is not None:
+            if data["llm_api_key"] == "":
+                data["llm_api_key"] = None
+            elif len(data["llm_api_key"].replace("*", "")) == 0:
+                del data["llm_api_key"]
        return super().run_validation(data)

    def update(self, instance, validated_data):
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -12,6 +12,7 @@ from typing import Final
 from urllib.parse import urlparse

 from celery.schedules import crontab
+from compression_middleware.middleware import CompressionMiddleware
 from dateparser.languages.loader import LocaleDataLoader
 from django.utils.translation import gettext_lazy as _
 from dotenv import load_dotenv
@@ -229,6 +230,17 @@ def _parse_beat_schedule() -> dict:
                "expires": 59.0 * 60.0,
            },
        },
+        {
+            "name": "Rebuild LLM index",
+            "env_key": "PAPERLESS_LLM_INDEX_TASK_CRON",
+            # Default daily at 02:10
+            "env_default": "10 2 * * *",
+            "task": "documents.tasks.llmindex_index",
+            "options": {
+                # 1 hour before default schedule sends again
+                "expires": 23.0 * 60.0 * 60.0,
+            },
+        },
    ]
    for task in tasks:
        # Either get the environment setting or use the default
@@ -287,6 +299,7 @@ MODEL_FILE = __get_path(
    "PAPERLESS_MODEL_FILE",
    DATA_DIR / "classification_model.pickle",
 )
+LLM_INDEX_DIR = DATA_DIR / "llm_index"

 LOGGING_DIR = __get_path("PAPERLESS_LOGGING_DIR", DATA_DIR / "log")

@@ -380,6 +393,19 @@ MIDDLEWARE = [
 if __get_boolean("PAPERLESS_ENABLE_COMPRESSION", "yes"):  # pragma: no cover
    MIDDLEWARE.insert(0, "compression_middleware.middleware.CompressionMiddleware")

+# Workaround so that streaming responses (e.g. chat) are not compressed:
+# requests flagged with ``compress_exempt`` skip the compression middleware.
+# See https://github.com/friedelwolff/django-compression-middleware/pull/7
+original_process_response = CompressionMiddleware.process_response
+
+
+def patched_process_response(self, request, response):
+    """
+    Return ``response`` untouched when ``request`` has a truthy
+    ``compress_exempt`` attribute; otherwise defer to the original
+    ``CompressionMiddleware.process_response``.
+    """
+    exempt = getattr(request, "compress_exempt", False)
+    if not exempt:
+        return original_process_response(self, request, response)
+    return response
+
+
+CompressionMiddleware.process_response = patched_process_response
+
 ROOT_URLCONF = "paperless.urls"


@@ -585,6 +611,10 @@ X_FRAME_OPTIONS = "SAMEORIGIN"
 # The next 3 settings can also be set using just PAPERLESS_URL
 CSRF_TRUSTED_ORIGINS = __get_list("PAPERLESS_CSRF_TRUSTED_ORIGINS")

+if DEBUG:
+    # Allow access from the angular development server during debugging
+    CSRF_TRUSTED_ORIGINS.append("http://localhost:4200")
+
 # We allow CORS from localhost:8000
 CORS_ALLOWED_ORIGINS = __get_list(
    "PAPERLESS_CORS_ALLOWED_HOSTS",
@@ -595,6 +625,8 @@ if DEBUG:
    # Allow access from the angular development server during debugging
    CORS_ALLOWED_ORIGINS.append("http://localhost:4200")

+CORS_ALLOW_CREDENTIALS = True
+
 CORS_EXPOSE_HEADERS = [
    "Content-Disposition",
 ]
@@ -868,6 +900,7 @@ LOGGING = {
    "loggers": {
        "paperless": {"handlers": ["file_paperless"], "level": "DEBUG"},
        "paperless_mail": {"handlers": ["file_mail"], "level": "DEBUG"},
+        "paperless_ai": {"handlers": ["file_paperless"], "level": "DEBUG"},
        "ocrmypdf": {"handlers": ["file_paperless"], "level": "INFO"},
        "celery": {"handlers": ["file_celery"], "level": "DEBUG"},
        "kombu": {"handlers": ["file_celery"], "level": "DEBUG"},
@@ -1404,3 +1437,16 @@ WEBHOOKS_ALLOW_INTERNAL_REQUESTS = __get_boolean(
 REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
 REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
 REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")
+
+################################################################################
+# AI Settings                                                                  #
+################################################################################
+# Environment-level switch for the AI features; "NO" is the default handed to
+# __get_boolean when PAPERLESS_AI_ENABLED is not set
+AI_ENABLED = __get_boolean("PAPERLESS_AI_ENABLED", "NO")
+# The values below are read verbatim with os.getenv, so each one is None when
+# its environment variable is unset
+LLM_EMBEDDING_BACKEND = os.getenv(
+    "PAPERLESS_AI_LLM_EMBEDDING_BACKEND",
+)  # "huggingface" or "openai"
+LLM_EMBEDDING_MODEL = os.getenv("PAPERLESS_AI_LLM_EMBEDDING_MODEL")
+LLM_BACKEND = os.getenv("PAPERLESS_AI_LLM_BACKEND")  # "ollama" or "openai"
+LLM_MODEL = os.getenv("PAPERLESS_AI_LLM_MODEL")
+LLM_API_KEY = os.getenv("PAPERLESS_AI_LLM_API_KEY")
+LLM_ENDPOINT = os.getenv("PAPERLESS_AI_LLM_ENDPOINT")
--- a/src/paperless/tests/test_settings.py
+++ b/src/paperless/tests/test_settings.py
@@ -160,6 +160,7 @@ class TestCeleryScheduleParsing(TestCase):
    SANITY_EXPIRE_TIME = ((7.0 * 24.0) - 1.0) * 60.0 * 60.0
    EMPTY_TRASH_EXPIRE_TIME = 23.0 * 60.0 * 60.0
    RUN_SCHEDULED_WORKFLOWS_EXPIRE_TIME = 59.0 * 60.0
+    LLM_INDEX_EXPIRE_TIME = 23.0 * 60.0 * 60.0

    def test_schedule_configuration_default(self):
        """
@@ -204,6 +205,13 @@ class TestCeleryScheduleParsing(TestCase):
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.RUN_SCHEDULED_WORKFLOWS_EXPIRE_TIME},
                },
+                "Rebuild LLM index": {
+                    "task": "documents.tasks.llmindex_index",
+                    "schedule": crontab(minute=10, hour=2),
+                    "options": {
+                        "expires": self.LLM_INDEX_EXPIRE_TIME,
+                    },
+                },
            },
            schedule,
        )
@@ -256,6 +264,13 @@ class TestCeleryScheduleParsing(TestCase):
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.RUN_SCHEDULED_WORKFLOWS_EXPIRE_TIME},
                },
+                "Rebuild LLM index": {
+                    "task": "documents.tasks.llmindex_index",
+                    "schedule": crontab(minute=10, hour=2),
+                    "options": {
+                        "expires": self.LLM_INDEX_EXPIRE_TIME,
+                    },
+                },
            },
            schedule,
        )
@@ -300,6 +315,13 @@ class TestCeleryScheduleParsing(TestCase):
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.RUN_SCHEDULED_WORKFLOWS_EXPIRE_TIME},
                },
+                "Rebuild LLM index": {
+                    "task": "documents.tasks.llmindex_index",
+                    "schedule": crontab(minute=10, hour=2),
+                    "options": {
+                        "expires": self.LLM_INDEX_EXPIRE_TIME,
+                    },
+                },
            },
            schedule,
        )
@@ -322,6 +344,7 @@ class TestCeleryScheduleParsing(TestCase):
                "PAPERLESS_INDEX_TASK_CRON": "disable",
                "PAPERLESS_EMPTY_TRASH_TASK_CRON": "disable",
                "PAPERLESS_WORKFLOW_SCHEDULED_TASK_CRON": "disable",
+                "PAPERLESS_LLM_INDEX_TASK_CRON": "disable",
            },
        ):
            schedule = _parse_beat_schedule()
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -18,6 +18,7 @@ from rest_framework.routers import DefaultRouter
 from documents.views import BulkDownloadView
 from documents.views import BulkEditObjectsView
 from documents.views import BulkEditView
+from documents.views import ChatStreamingView
 from documents.views import CorrespondentViewSet
 from documents.views import CustomFieldViewSet
 from documents.views import DocumentTypeViewSet
@@ -139,6 +140,11 @@ urlpatterns = [
                                SelectionDataView.as_view(),
                                name="selection_data",
                            ),
+                            re_path(
+                                "^chat/",
+                                ChatStreamingView.as_view(),
+                                name="chat_streaming_view",
+                            ),
                        ],
                    ),
                ),
--- a/src/paperless/views.py
+++ b/src/paperless/views.py
@@ -35,6 +35,7 @@ from rest_framework.viewsets import ModelViewSet

 from documents.index import DelayedQuery
 from documents.permissions import PaperlessObjectPermissions
+from documents.tasks import llmindex_index
 from paperless.filters import GroupFilterSet
 from paperless.filters import UserFilterSet
 from paperless.models import ApplicationConfiguration
@@ -43,6 +44,7 @@ from paperless.serialisers import GroupSerializer
 from paperless.serialisers import PaperlessAuthTokenSerializer
 from paperless.serialisers import ProfileSerializer
 from paperless.serialisers import UserSerializer
+from paperless_ai.indexing import vector_store_file_exists


 class PaperlessObtainAuthTokenView(ObtainAuthToken):
@@ -358,6 +360,30 @@ class ApplicationConfigurationViewSet(ModelViewSet):
    def create(self, request, *args, **kwargs):
        return Response(status=405)  # Not Allowed

+    def perform_update(self, serializer):
+        """
+        Save the configuration and queue an LLM index rebuild when needed.
+
+        The rebuild task is queued only when all of the following hold:
+        the AI index was not enabled before the save, it is enabled after
+        the save, and no vector store file exists yet. "Enabled" means
+        ``ai_enabled`` is set together with an ``llm_embedding_backend``.
+        """
+
+        def _ai_index_enabled(config) -> bool:
+            # QuerySet.first() returns None when there is no row; treat a
+            # missing configuration as "index not enabled" instead of
+            # failing with AttributeError.
+            return bool(
+                config is not None
+                and config.ai_enabled
+                and config.llm_embedding_backend
+            )
+
+        # Read the stored state before saving so the off -> on transition
+        # can be detected afterwards.
+        old_instance = ApplicationConfiguration.objects.all().first()
+        was_enabled = _ai_index_enabled(old_instance)
+
+        new_instance: ApplicationConfiguration = serializer.save()
+
+        if (
+            not was_enabled
+            and _ai_index_enabled(new_instance)
+            and not vector_store_file_exists()
+        ):
+            # AI index was just enabled and vector store file does not exist
+            llmindex_index.delay(
+                progress_bar_disable=True,
+                rebuild=True,
+                scheduled=False,
+                auto=True,
+            )
+

@extend_schema_view(
    post=extend_schema(