llamaindex vector index, llmindex management command

2025-08-10 00:18:57 +00:00 · 2025-04-24 20:51:06 -07:00
parent edeb9a7534
commit 0a19a5500c
12 changed files with 1868 additions and 10 deletions
--- a/src/paperless/ai/embedding.py
+++ b/src/paperless/ai/embedding.py
@@ -0,0 +1,67 @@
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.embeddings.openai import OpenAIEmbedding
+
+from documents.models import Document
+from documents.models import Note
+from paperless.config import AIConfig
+
+# Output vector size of each embedding model this module knows about.
+# get_embedding_dim() raises ValueError for any model missing from this
+# table, so every model that should be configurable needs an entry here.
+EMBEDDING_DIMENSIONS = {
+    # OpenAI
+    "text-embedding-3-small": 1536,
+    "text-embedding-3-large": 3072,
+    "text-embedding-ada-002": 1536,
+    # Hugging Face sentence-transformers (used by the "local" backend)
+    "sentence-transformers/all-MiniLM-L6-v2": 384,
+    "sentence-transformers/all-MiniLM-L12-v2": 384,
+    "sentence-transformers/all-mpnet-base-v2": 768,
+}
+
+
+def get_embedding_model():
+    """
+    Build the embedding model selected by the AI configuration.
+
+    The "openai" backend yields an OpenAIEmbedding (model defaults to
+    "text-embedding-3-small", API key taken from the configuration); the
+    "local" backend yields a HuggingFaceEmbedding (model defaults to
+    "sentence-transformers/all-MiniLM-L6-v2").
+
+    Raises:
+        ValueError: if the configured backend is neither "openai" nor "local".
+    """
+    ai_config = AIConfig()
+    backend = ai_config.llm_embedding_backend
+    configured_model = ai_config.llm_embedding_model
+
+    if backend == "openai":
+        return OpenAIEmbedding(
+            model=configured_model or "text-embedding-3-small",
+            api_key=ai_config.llm_api_key,
+        )
+
+    if backend == "local":
+        return HuggingFaceEmbedding(
+            model_name=configured_model or "sentence-transformers/all-MiniLM-L6-v2",
+        )
+
+    raise ValueError(f"Unsupported embedding backend: {backend}")
+
+
+def get_embedding_dim() -> int:
+    """
+    Return the vector dimension of the configured embedding model.
+
+    When no model is configured, the default is "text-embedding-3-small"
+    for the "openai" backend and "sentence-transformers/all-MiniLM-L6-v2"
+    for every other backend value.
+
+    Raises:
+        ValueError: if the resolved model is not in EMBEDDING_DIMENSIONS.
+    """
+    ai_config = AIConfig()
+
+    model_name = ai_config.llm_embedding_model
+    if not model_name:
+        if ai_config.llm_embedding_backend == "openai":
+            model_name = "text-embedding-3-small"
+        else:
+            model_name = "sentence-transformers/all-MiniLM-L6-v2"
+
+    dimension = EMBEDDING_DIMENSIONS.get(model_name)
+    if dimension is None:
+        raise ValueError(f"Unknown embedding model: {model_name}")
+    return dimension
+
+
+def build_llm_index_text(doc: Document) -> str:
+    """
+    Render a document as the plain text that is fed to the LLM index.
+
+    The result is one "Label: value" line per metadata attribute, followed
+    by one line per custom field instance, a "Content:" marker and finally
+    the document content (empty string when the document has none).
+    """
+
+    def name_or_blank(related) -> str:
+        # Optional relations render as an empty value when unset.
+        return related.name if related else ""
+
+    tag_names = ", ".join(tag.name for tag in doc.tags.all())
+    document_type = name_or_blank(doc.document_type)
+    correspondent = name_or_blank(doc.correspondent)
+    storage_path = name_or_blank(doc.storage_path)
+    serial_number = doc.archive_serial_number or ""
+    note_texts = ",".join(
+        str(note.note) for note in Note.objects.filter(document=doc)
+    )
+
+    parts = [
+        f"Title: {doc.title}",
+        f"Filename: {doc.filename}",
+        f"Created: {doc.created}",
+        f"Added: {doc.added}",
+        f"Modified: {doc.modified}",
+        f"Tags: {tag_names}",
+        f"Document Type: {document_type}",
+        f"Correspondent: {correspondent}",
+        f"Storage Path: {storage_path}",
+        f"Archive Serial Number: {serial_number}",
+        f"Notes: {note_texts}",
+    ]
+    parts.extend(
+        f"Custom Field - {instance.field.name}: {instance}"
+        for instance in doc.custom_fields.all()
+    )
+    parts += ["\nContent:\n", doc.content or ""]
+
+    return "\n".join(parts)
--- a/src/paperless/ai/indexing.py
+++ b/src/paperless/ai/indexing.py
@@ -0,0 +1,52 @@
+import logging
+
+import llama_index.core.settings as llama_settings
+from django.conf import settings
+from llama_index.core import StorageContext
+from llama_index.core import VectorStoreIndex
+from llama_index.core import load_index_from_storage
+from llama_index.core.retrievers import VectorIndexRetriever
+from llama_index.vector_stores.faiss import FaissVectorStore
+
+from documents.models import Document
+from paperless.ai.embedding import get_embedding_model
+
+logger = logging.getLogger("paperless.ai.indexing")
+
+
+def load_index() -> VectorStoreIndex:
+    """
+    Load the persisted LlamaIndex from settings.LLM_INDEX_DIR.
+
+    Side effect: sets the global LlamaIndex embed model to the configured
+    embedding model and the global chunk size to 512 before loading.
+    """
+    index_dir = settings.LLM_INDEX_DIR
+    faiss_store = FaissVectorStore.from_persist_dir(index_dir)
+
+    llama_settings.Settings.embed_model = get_embedding_model()
+    llama_settings.Settings.chunk_size = 512
+
+    return load_index_from_storage(
+        StorageContext.from_defaults(
+            vector_store=faiss_store,
+            persist_dir=index_dir,
+        ),
+    )
+
+
+def query_similar_documents(document: Document, top_k: int = 5) -> list[Document]:
+    """
+    Run a similarity query and return the matching Document objects.
+
+    The query text is the document's title and content joined by a newline.
+    Documents are returned in the order the retriever produced their nodes
+    (presumably most similar first), each document at most once. Retrieved
+    nodes without a "document_id" metadata entry are ignored, as are ids
+    that no longer exist in the database.
+
+    NOTE(review): the queried document itself is not excluded from the
+    results - confirm whether callers expect that.
+    """
+    index = load_index()
+    retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)
+
+    query_text = (document.title or "") + "\n" + (document.content or "")
+    results = retriever.retrieve(query_text)
+
+    # More than one retrieved node may carry the same document_id;
+    # dict.fromkeys drops the repeats while keeping first-seen order.
+    ranked_ids = list(
+        dict.fromkeys(
+            int(node.metadata["document_id"])
+            for node in results
+            if "document_id" in node.metadata
+        ),
+    )
+
+    # A pk__in filter does not return rows in the order of the supplied
+    # ids, so put the fetched documents back into retrieval order.
+    documents_by_pk = {
+        doc.pk: doc for doc in Document.objects.filter(pk__in=ranked_ids)
+    }
+    return [documents_by_pk[pk] for pk in ranked_ids if pk in documents_by_pk]
--- a/src/paperless/config.py
+++ b/src/paperless/config.py
@@ -178,6 +178,8 @@ class AIConfig(BaseConfig):
    """

    ai_enabled: bool = dataclasses.field(init=False)
+    llm_embedding_backend: str = dataclasses.field(init=False)
+    llm_embedding_model: str = dataclasses.field(init=False)
    llm_backend: str = dataclasses.field(init=False)
    llm_model: str = dataclasses.field(init=False)
    llm_api_key: str = dataclasses.field(init=False)
@@ -187,6 +189,12 @@ class AIConfig(BaseConfig):
        app_config = self._get_config_instance()

        self.ai_enabled = app_config.ai_enabled or settings.AI_ENABLED
+        self.llm_embedding_backend = (
+            app_config.llm_embedding_backend or settings.LLM_EMBEDDING_BACKEND
+        )
+        self.llm_embedding_model = (
+            app_config.llm_embedding_model or settings.LLM_EMBEDDING_MODEL
+        )
        self.llm_backend = app_config.llm_backend or settings.LLM_BACKEND
        self.llm_model = app_config.llm_model or settings.LLM_MODEL
        self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY
--- a/src/paperless/migrations/0004_applicationconfiguration_ai_enabled_and_more.py
+++ b/src/paperless/migrations/0004_applicationconfiguration_ai_enabled_and_more.py
@@ -19,6 +19,27 @@ class Migration(migrations.Migration):
                verbose_name="Enables AI features",
            ),
        ),
+        # The verbose_name values below must match the field definitions in
+        # models.py exactly; any difference (even in letter case) makes the
+        # migration autodetector report a pending AlterField.
+        migrations.AddField(
+            model_name="applicationconfiguration",
+            name="llm_embedding_backend",
+            field=models.CharField(
+                blank=True,
+                choices=[("openai", "OpenAI"), ("local", "Local")],
+                max_length=32,
+                null=True,
+                verbose_name="Sets the LLM embedding backend",
+            ),
+        ),
+        migrations.AddField(
+            model_name="applicationconfiguration",
+            name="llm_embedding_model",
+            field=models.CharField(
+                blank=True,
+                max_length=32,
+                null=True,
+                verbose_name="Sets the LLM embedding model",
+            ),
+        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_api_key",
--- a/src/paperless/models.py
+++ b/src/paperless/models.py
@@ -74,6 +74,11 @@ class ColorConvertChoices(models.TextChoices):
    CMYK = ("CMYK", _("CMYK"))


+class LLMEmbeddingBackend(models.TextChoices):
+    """
+    Choices offered for the llm_embedding_backend configuration field
+    """
+
+    OPENAI = ("openai", _("OpenAI"))
+    LOCAL = ("local", _("Local"))
+
+
 class LLMBackend(models.TextChoices):
    """
    Matches to --llm-backend
@@ -284,6 +289,21 @@ class ApplicationConfiguration(AbstractSingletonModel):
        default=False,
    )

+    # Embedding backend override, limited to the LLMEmbeddingBackend choices.
+    # When left empty, AIConfig falls back to settings.LLM_EMBEDDING_BACKEND.
+    llm_embedding_backend = models.CharField(
+        verbose_name=_("Sets the LLM embedding backend"),
+        null=True,
+        blank=True,
+        max_length=32,
+        choices=LLMEmbeddingBackend.choices,
+    )
+
+    # Embedding model name override. When left empty, AIConfig falls back to
+    # settings.LLM_EMBEDDING_MODEL.
+    # NOTE(review): max_length=32 cannot hold the default local model name
+    # "sentence-transformers/all-MiniLM-L6-v2" (38 characters) - confirm the
+    # limit here and in the matching migration.
+    llm_embedding_model = models.CharField(
+        verbose_name=_("Sets the LLM embedding model"),
+        null=True,
+        blank=True,
+        max_length=32,
+    )
+
    llm_backend = models.CharField(
        verbose_name=_("Sets the LLM backend"),
        null=True,
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -284,6 +284,7 @@ MODEL_FILE = __get_path(
    "PAPERLESS_MODEL_FILE",
    DATA_DIR / "classification_model.pickle",
 )
+LLM_INDEX_DIR = DATA_DIR / "llm_index"

 LOGGING_DIR = __get_path("PAPERLESS_LOGGING_DIR", DATA_DIR / "log")

@@ -1283,7 +1284,12 @@ OUTLOOK_OAUTH_ENABLED = bool(
 # AI Settings                                                                  #
 ################################################################################
 AI_ENABLED = __get_boolean("PAPERLESS_AI_ENABLED", "NO")
-LLM_BACKEND = os.getenv("PAPERLESS_LLM_BACKEND", "openai")  # or "ollama"
+LLM_EMBEDDING_BACKEND = os.getenv(
+    "PAPERLESS_LLM_EMBEDDING_BACKEND",
+    "local",
+)  # or "openai"
+LLM_EMBEDDING_MODEL = os.getenv("PAPERLESS_LLM_EMBEDDING_MODEL")
+LLM_BACKEND = os.getenv("PAPERLESS_LLM_BACKEND", "ollama")  # or "openai"
 LLM_MODEL = os.getenv("PAPERLESS_LLM_MODEL")
 LLM_API_KEY = os.getenv("PAPERLESS_LLM_API_KEY")
 LLM_URL = os.getenv("PAPERLESS_LLM_URL")