llamaindex vector index, llmindex management command

shamoon 2025-04-24 20:51:06 -07:00
parent 02c8221a50
commit 58f3b7be0a
No known key found for this signature in database
12 changed files with 1868 additions and 10 deletions

View File

@@ -11,6 +11,7 @@ for command in decrypt_documents \
    mail_fetcher \
    document_create_classifier \
    document_index \
    document_llmindex \
    document_renamer \
    document_retagger \
    document_thumbnails \

View File

@@ -0,0 +1,14 @@
#!/command/with-contenv /usr/bin/bash
# shellcheck shell=bash

set -e

cd "${PAPERLESS_SRC_DIR}"

if [[ $(id -u) == 0 ]]; then
    s6-setuidgid paperless python3 manage.py document_llmindex "$@"
elif [[ $(id -un) == "paperless" ]]; then
    python3 manage.py document_llmindex "$@"
else
    echo "Unknown user."
fi

View File

@@ -39,6 +39,7 @@ dependencies = [
  "drf-spectacular~=0.28",
  "drf-spectacular-sidecar~=2025.4.1",
  "drf-writable-nested~=0.7.1",
  "faiss-cpu>=1.10",
  "filelock~=3.18.0",
  "flower~=2.0.1",
  "gotenberg-client~=0.10.0",
@@ -47,8 +48,12 @@ dependencies = [
  "inotifyrecursive~=0.3",
  "jinja2~=3.1.5",
  "langdetect~=1.0.9",
  "llama-index>=0.12.33",
  "llama-index-embeddings-huggingface>=0.5.3",
  "llama-index-vector-stores-faiss>=0.3",
  "nltk~=3.9.1",
  "ocrmypdf~=16.10.0",
  "openai>=1.76",
  "pathvalidate~=3.2.3",
  "pdf2image~=1.17.0",
  "python-dateutil~=2.9.0",
@@ -60,6 +65,7 @@ dependencies = [
  "rapidfuzz~=3.13.0",
  "redis[hiredis]~=5.2.1",
  "scikit-learn~=1.6.1",
  "sentence-transformers>=4.1",
  "setproctitle~=1.3.4",
  "tika-client~=0.9.0",
  "tqdm~=4.67.1",

View File

@@ -0,0 +1,19 @@
from django.core.management import BaseCommand
from django.db import transaction

from documents.management.commands.mixins import ProgressBarMixin
from documents.tasks import llm_index_rebuild


class Command(ProgressBarMixin, BaseCommand):
    help = "Manages the LLM-based vector index for Paperless."

    def add_arguments(self, parser):
        parser.add_argument("command", choices=["rebuild"])
        self.add_argument_progress_bar_mixin(parser)

    def handle(self, *args, **options):
        self.handle_progress_bar_mixin(**options)
        with transaction.atomic():
            if options["command"] == "rebuild":
                llm_index_rebuild(progress_bar_disable=self.no_progress_bar)
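For reference, a minimal sketch (not part of the commit) of invoking the new command from Python rather than the CLI; it assumes a configured paperless environment and is equivalent to running python3 manage.py document_llmindex rebuild, or the wrapper script above, inside the container:

# Sketch only: programmatic invocation of the management command defined above.
from django.core.management import call_command

call_command("document_llmindex", "rebuild")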

View File

@@ -6,6 +6,7 @@ import uuid
from pathlib import Path
from tempfile import TemporaryDirectory

import faiss
import tqdm
from celery import Task
from celery import shared_task
@@ -17,6 +18,11 @@ from django.db import transaction
from django.db.models.signals import post_save
from django.utils import timezone
from filelock import FileLock
from llama_index.core import Document as LlamaDocument
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.settings import Settings
from llama_index.vector_stores.faiss import FaissVectorStore
from whoosh.writing import AsyncWriter

from documents import index
@@ -52,6 +58,9 @@ from documents.sanity_checker import SanityCheckFailedException
from documents.signals import document_updated
from documents.signals.handlers import cleanup_document_deletion
from documents.signals.handlers import run_workflows
from paperless.ai.embedding import build_llm_index_text
from paperless.ai.embedding import get_embedding_dim
from paperless.ai.embedding import get_embedding_model

if settings.AUDIT_LOG_ENABLED:
    from auditlog.models import LogEntry
@@ -496,3 +505,52 @@ def check_scheduled_workflows():
                workflow_to_run=workflow,
                document=document,
            )
def llm_index_rebuild(*, progress_bar_disable=False, rebuild=False):
    if rebuild:
        shutil.rmtree(settings.LLM_INDEX_DIR, ignore_errors=True)
        settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True)

    documents = Document.objects.all()
    embed_model = get_embedding_model()

    if rebuild or not settings.LLM_INDEX_DIR.exists():
        embedding_dim = get_embedding_dim()
        faiss_index = faiss.IndexFlatL2(embedding_dim)
        vector_store = FaissVectorStore(faiss_index)
    else:
        vector_store = FaissVectorStore.from_persist_dir(settings.LLM_INDEX_DIR)

    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    Settings.embed_model = embed_model

    llm_docs = []
    for document in tqdm.tqdm(documents, disable=progress_bar_disable):
        if not document.content:
            continue
        llm_docs.append(
            LlamaDocument(
                text=build_llm_index_text(document),
                metadata={
                    "id": document.id,
                    "title": document.title,
                    "tags": [t.name for t in document.tags.all()],
                    "correspondent": document.correspondent.name
                    if document.correspondent
                    else None,
                    "document_type": document.document_type.name
                    if document.document_type
                    else None,
                    "created": document.created.isoformat(),
                    "added": document.added.isoformat(),
                },
            ),
        )

    index = VectorStoreIndex.from_documents(
        llm_docs,
        storage_context=storage_context,
    )
    settings.LLM_INDEX_DIR.mkdir(exist_ok=True)
    index.storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
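A minimal usage sketch (not part of the commit) of the task defined above; rebuild=True removes LLM_INDEX_DIR and builds a fresh FAISS store, while the default reuses a previously persisted store:

# Sketch only: assumes Django is set up and documents have content.
from documents.tasks import llm_index_rebuild

# Full rebuild of the LLM vector index, with the progress bar disabled.
llm_index_rebuild(progress_bar_disable=True, rebuild=True)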

View File

@@ -0,0 +1,67 @@
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

from documents.models import Document
from documents.models import Note
from paperless.config import AIConfig

EMBEDDING_DIMENSIONS = {
    "text-embedding-3-small": 1536,
    "sentence-transformers/all-MiniLM-L6-v2": 384,
}


def get_embedding_model():
    config = AIConfig()

    match config.llm_embedding_backend:
        case "openai":
            return OpenAIEmbedding(
                model=config.llm_embedding_model or "text-embedding-3-small",
                api_key=config.llm_api_key,
            )
        case "local":
            return HuggingFaceEmbedding(
                model_name=config.llm_embedding_model
                or "sentence-transformers/all-MiniLM-L6-v2",
            )
        case _:
            raise ValueError(
                f"Unsupported embedding backend: {config.llm_embedding_backend}",
            )


def get_embedding_dim() -> int:
    config = AIConfig()
    model = config.llm_embedding_model or (
        "text-embedding-3-small"
        if config.llm_embedding_backend == "openai"
        else "sentence-transformers/all-MiniLM-L6-v2"
    )
    if model not in EMBEDDING_DIMENSIONS:
        raise ValueError(f"Unknown embedding model: {model}")
    return EMBEDDING_DIMENSIONS[model]


def build_llm_index_text(doc: Document) -> str:
    lines = [
        f"Title: {doc.title}",
        f"Filename: {doc.filename}",
        f"Created: {doc.created}",
        f"Added: {doc.added}",
        f"Modified: {doc.modified}",
        f"Tags: {', '.join(tag.name for tag in doc.tags.all())}",
        f"Document Type: {doc.document_type.name if doc.document_type else ''}",
        f"Correspondent: {doc.correspondent.name if doc.correspondent else ''}",
        f"Storage Path: {doc.storage_path.name if doc.storage_path else ''}",
        f"Archive Serial Number: {doc.archive_serial_number or ''}",
        f"Notes: {','.join([str(c.note) for c in Note.objects.filter(document=doc)])}",
    ]

    for instance in doc.custom_fields.all():
        lines.append(f"Custom Field - {instance.field.name}: {instance}")

    lines.append("\nContent:\n")
    lines.append(doc.content or "")

    return "\n".join(lines)
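As used by llm_index_rebuild above, the dimension returned by get_embedding_dim has to match the FAISS index geometry; a small illustrative sketch (not part of the commit), assuming the faiss-cpu dependency added in pyproject.toml:

# Sketch only: 384 for sentence-transformers/all-MiniLM-L6-v2 (local default),
# 1536 for text-embedding-3-small (OpenAI default).
import faiss

from paperless.ai.embedding import get_embedding_dim

dim = get_embedding_dim()
faiss_index = faiss.IndexFlatL2(dim)  # flat L2 index, as created in tasks.py
print(faiss_index.d, faiss_index.ntotal)  # dimension, and 0 vectors until documents are added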

View File

@@ -0,0 +1,52 @@
import logging

import llama_index.core.settings as llama_settings
from django.conf import settings
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core import load_index_from_storage
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.vector_stores.faiss import FaissVectorStore

from documents.models import Document
from paperless.ai.embedding import get_embedding_model

logger = logging.getLogger("paperless.ai.indexing")


def load_index() -> VectorStoreIndex:
    """Loads the persisted LlamaIndex from disk."""
    vector_store = FaissVectorStore.from_persist_dir(settings.LLM_INDEX_DIR)
    embed_model = get_embedding_model()

    llama_settings.Settings.embed_model = embed_model
    llama_settings.Settings.chunk_size = 512

    storage_context = StorageContext.from_defaults(
        vector_store=vector_store,
        persist_dir=settings.LLM_INDEX_DIR,
    )
    return load_index_from_storage(storage_context)


def query_similar_documents(document: Document, top_k: int = 5) -> list[Document]:
    """Runs a similarity query and returns top-k similar Document objects."""
    # Load index
    index = load_index()
    retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)

    # Build query from the document text
    query_text = (document.title or "") + "\n" + (document.content or "")

    # Query
    results = retriever.retrieve(query_text)
    # Each result node's metadata["id"] should match a stored document
    # (llm_index_rebuild stores the primary key under the "id" metadata key)
    document_ids = [
        int(node.metadata["id"])
        for node in results
        if "id" in node.metadata
    ]

    return list(Document.objects.filter(pk__in=document_ids))
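A minimal usage sketch (not part of the commit) of the retrieval helper; it assumes the vector index has already been built with document_llmindex rebuild, and the document id is a placeholder:

# Sketch only.
from documents.models import Document
from paperless.ai.indexing import query_similar_documents

doc = Document.objects.get(pk=123)  # placeholder id
for similar in query_similar_documents(doc, top_k=3):
    print(similar.pk, similar.title)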

View File

@@ -178,6 +178,8 @@ class AIConfig(BaseConfig):
    """

    ai_enabled: bool = dataclasses.field(init=False)
    llm_embedding_backend: str = dataclasses.field(init=False)
    llm_embedding_model: str = dataclasses.field(init=False)
    llm_backend: str = dataclasses.field(init=False)
    llm_model: str = dataclasses.field(init=False)
    llm_api_key: str = dataclasses.field(init=False)
@@ -187,6 +189,12 @@ class AIConfig(BaseConfig):
        app_config = self._get_config_instance()

        self.ai_enabled = app_config.ai_enabled or settings.AI_ENABLED
        self.llm_embedding_backend = (
            app_config.llm_embedding_backend or settings.LLM_EMBEDDING_BACKEND
        )
        self.llm_embedding_model = (
            app_config.llm_embedding_model or settings.LLM_EMBEDDING_MODEL
        )
        self.llm_backend = app_config.llm_backend or settings.LLM_BACKEND
        self.llm_model = app_config.llm_model or settings.LLM_MODEL
        self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY
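A short sketch (not part of the commit) of the resolution order implied above: values stored in ApplicationConfiguration win, otherwise the PAPERLESS_LLM_* settings shown further down apply:

# Sketch only: inspect the resolved embedding configuration.
from paperless.config import AIConfig

config = AIConfig()
if config.ai_enabled:
    # "local" unless overridden (see LLM_EMBEDDING_BACKEND in settings.py);
    # model is None until set, so get_embedding_model() falls back to a per-backend default.
    print(config.llm_embedding_backend, config.llm_embedding_model)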

View File

@@ -19,6 +19,27 @@ class Migration(migrations.Migration):
                verbose_name="Enables AI features",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_embedding_backend",
            field=models.CharField(
                blank=True,
                choices=[("openai", "OpenAI"), ("local", "Local")],
                max_length=32,
                null=True,
                verbose_name="Sets the LLM Embedding backend",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_embedding_model",
            field=models.CharField(
                blank=True,
                max_length=32,
                null=True,
                verbose_name="Sets the LLM Embedding model",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_api_key",

View File

@@ -74,6 +74,11 @@ class ColorConvertChoices(models.TextChoices):
    CMYK = ("CMYK", _("CMYK"))


class LLMEmbeddingBackend(models.TextChoices):
    OPENAI = ("openai", _("OpenAI"))
    LOCAL = ("local", _("Local"))


class LLMBackend(models.TextChoices):
    """
    Matches to --llm-backend
@@ -284,6 +289,21 @@ class ApplicationConfiguration(AbstractSingletonModel):
        default=False,
    )

    llm_embedding_backend = models.CharField(
        verbose_name=_("Sets the LLM embedding backend"),
        null=True,
        blank=True,
        max_length=32,
        choices=LLMEmbeddingBackend.choices,
    )

    llm_embedding_model = models.CharField(
        verbose_name=_("Sets the LLM embedding model"),
        null=True,
        blank=True,
        max_length=32,
    )

    llm_backend = models.CharField(
        verbose_name=_("Sets the LLM backend"),
        null=True,

View File

@@ -283,6 +283,7 @@ MODEL_FILE = __get_path(
    "PAPERLESS_MODEL_FILE",
    DATA_DIR / "classification_model.pickle",
)

LLM_INDEX_DIR = DATA_DIR / "llm_index"

LOGGING_DIR = __get_path("PAPERLESS_LOGGING_DIR", DATA_DIR / "log")
@@ -1281,7 +1282,12 @@ OUTLOOK_OAUTH_ENABLED = bool(
# AI Settings                                                                  #
################################################################################
AI_ENABLED = __get_boolean("PAPERLESS_AI_ENABLED", "NO")
LLM_EMBEDDING_BACKEND = os.getenv(
    "PAPERLESS_LLM_EMBEDDING_BACKEND",
    "local",
)  # or "openai"
LLM_EMBEDDING_MODEL = os.getenv("PAPERLESS_LLM_EMBEDDING_MODEL")
LLM_BACKEND = os.getenv("PAPERLESS_LLM_BACKEND", "ollama")  # or "openai"
LLM_MODEL = os.getenv("PAPERLESS_LLM_MODEL")
LLM_API_KEY = os.getenv("PAPERLESS_LLM_API_KEY")
LLM_URL = os.getenv("PAPERLESS_LLM_URL")

uv.lock (generated, 1604 changed lines): file diff suppressed because it is too large.