llamaindex vector index, llmindex management command

shamoon 2025-04-24 20:51:06 -07:00
parent 02c8221a50
commit 58f3b7be0a
No known key found for this signature in database
12 changed files with 1868 additions and 10 deletions

View File

@@ -11,6 +11,7 @@ for command in decrypt_documents \
    mail_fetcher \
    document_create_classifier \
    document_index \
    document_llmindex \
    document_renamer \
    document_retagger \
    document_thumbnails \

View File

@@ -0,0 +1,14 @@
#!/command/with-contenv /usr/bin/bash
# shellcheck shell=bash

set -e

cd "${PAPERLESS_SRC_DIR}"

if [[ $(id -u) == 0 ]]; then
    s6-setuidgid paperless python3 manage.py document_llmindex "$@"
elif [[ $(id -un) == "paperless" ]]; then
    python3 manage.py document_llmindex "$@"
else
    echo "Unknown user."
fi

View File

@@ -39,6 +39,7 @@ dependencies = [
  "drf-spectacular~=0.28",
  "drf-spectacular-sidecar~=2025.4.1",
  "drf-writable-nested~=0.7.1",
  "faiss-cpu>=1.10",
  "filelock~=3.18.0",
  "flower~=2.0.1",
  "gotenberg-client~=0.10.0",
@@ -47,8 +48,12 @@ dependencies = [
  "inotifyrecursive~=0.3",
  "jinja2~=3.1.5",
  "langdetect~=1.0.9",
  "llama-index>=0.12.33",
  "llama-index-embeddings-huggingface>=0.5.3",
  "llama-index-vector-stores-faiss>=0.3",
  "nltk~=3.9.1",
  "ocrmypdf~=16.10.0",
  "openai>=1.76",
  "pathvalidate~=3.2.3",
  "pdf2image~=1.17.0",
  "python-dateutil~=2.9.0",
@@ -60,6 +65,7 @@ dependencies = [
  "rapidfuzz~=3.13.0",
  "redis[hiredis]~=5.2.1",
  "scikit-learn~=1.6.1",
  "sentence-transformers>=4.1",
  "setproctitle~=1.3.4",
  "tika-client~=0.9.0",
  "tqdm~=4.67.1",

View File

@@ -0,0 +1,19 @@
from django.core.management import BaseCommand
from django.db import transaction

from documents.management.commands.mixins import ProgressBarMixin
from documents.tasks import llm_index_rebuild


class Command(ProgressBarMixin, BaseCommand):
    help = "Manages the LLM-based vector index for Paperless."

    def add_arguments(self, parser):
        parser.add_argument("command", choices=["rebuild"])
        self.add_argument_progress_bar_mixin(parser)

    def handle(self, *args, **options):
        self.handle_progress_bar_mixin(**options)
        with transaction.atomic():
            if options["command"] == "rebuild":
                llm_index_rebuild(progress_bar_disable=self.no_progress_bar)
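For reference, a minimal sketch (not part of the commit) of invoking the new command from Python rather than the CLI; it assumes a configured paperless environment and is equivalent to running python3 manage.py document_llmindex rebuild, or the wrapper script above, inside the container:

# Sketch only: programmatic invocation of the management command defined above.
from django.core.management import call_command

call_command("document_llmindex", "rebuild")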

View File

@@ -6,6 +6,7 @@ import uuid
from pathlib import Path
from tempfile import TemporaryDirectory

import faiss
import tqdm
from celery import Task
from celery import shared_task
@@ -17,6 +18,11 @@ from django.db import transaction
from django.db.models.signals import post_save
from django.utils import timezone
from filelock import FileLock
from llama_index.core import Document as LlamaDocument
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.settings import Settings
from llama_index.vector_stores.faiss import FaissVectorStore
from whoosh.writing import AsyncWriter

from documents import index
@@ -52,6 +58,9 @@ from documents.sanity_checker import SanityCheckFailedException
from documents.signals import document_updated
from documents.signals.handlers import cleanup_document_deletion
from documents.signals.handlers import run_workflows
from paperless.ai.embedding import build_llm_index_text
from paperless.ai.embedding import get_embedding_dim
from paperless.ai.embedding import get_embedding_model

if settings.AUDIT_LOG_ENABLED:
    from auditlog.models import LogEntry
@@ -496,3 +505,52 @@ def check_scheduled_workflows():
                workflow_to_run=workflow,
                document=document,
            )
def llm_index_rebuild(*, progress_bar_disable=False, rebuild=False):
    if rebuild:
        shutil.rmtree(settings.LLM_INDEX_DIR, ignore_errors=True)
        settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True)

    documents = Document.objects.all()
    embed_model = get_embedding_model()

    if rebuild or not settings.LLM_INDEX_DIR.exists():
        embedding_dim = get_embedding_dim()
        faiss_index = faiss.IndexFlatL2(embedding_dim)
        vector_store = FaissVectorStore(faiss_index)
    else:
        vector_store = FaissVectorStore.from_persist_dir(settings.LLM_INDEX_DIR)

    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    Settings.embed_model = embed_model

    llm_docs = []
    for document in tqdm.tqdm(documents, disable=progress_bar_disable):
        if not document.content:
            continue
        llm_docs.append(
            LlamaDocument(
                text=build_llm_index_text(document),
                metadata={
                    "id": document.id,
                    "title": document.title,
                    "tags": [t.name for t in document.tags.all()],
                    "correspondent": document.correspondent.name
                    if document.correspondent
                    else None,
                    "document_type": document.document_type.name
                    if document.document_type
                    else None,
                    "created": document.created.isoformat(),
                    "added": document.added.isoformat(),
                },
            ),
        )

    index = VectorStoreIndex.from_documents(
        llm_docs,
        storage_context=storage_context,
    )
    settings.LLM_INDEX_DIR.mkdir(exist_ok=True)
    index.storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
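A minimal usage sketch (not part of the commit) of the task defined above; rebuild=True removes LLM_INDEX_DIR and builds a fresh FAISS store, while the default reuses a previously persisted store:

# Sketch only: assumes Django is set up and documents have content.
from documents.tasks import llm_index_rebuild

# Full rebuild of the LLM vector index, with the progress bar disabled.
llm_index_rebuild(progress_bar_disable=True, rebuild=True)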

View File

@@ -0,0 +1,67 @@
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

from documents.models import Document
from documents.models import Note
from paperless.config import AIConfig

EMBEDDING_DIMENSIONS = {
    "text-embedding-3-small": 1536,
    "sentence-transformers/all-MiniLM-L6-v2": 384,
}


def get_embedding_model():
    config = AIConfig()

    match config.llm_embedding_backend:
        case "openai":
            return OpenAIEmbedding(
                model=config.llm_embedding_model or "text-embedding-3-small",
                api_key=config.llm_api_key,
            )
        case "local":
            return HuggingFaceEmbedding(
                model_name=config.llm_embedding_model
                or "sentence-transformers/all-MiniLM-L6-v2",
            )
        case _:
            raise ValueError(
                f"Unsupported embedding backend: {config.llm_embedding_backend}",
            )


def get_embedding_dim() -> int:
    config = AIConfig()
    model = config.llm_embedding_model or (
        "text-embedding-3-small"
        if config.llm_embedding_backend == "openai"
        else "sentence-transformers/all-MiniLM-L6-v2"
    )
    if model not in EMBEDDING_DIMENSIONS:
        raise ValueError(f"Unknown embedding model: {model}")
    return EMBEDDING_DIMENSIONS[model]


def build_llm_index_text(doc: Document) -> str:
    lines = [
        f"Title: {doc.title}",
        f"Filename: {doc.filename}",
        f"Created: {doc.created}",
        f"Added: {doc.added}",
        f"Modified: {doc.modified}",
        f"Tags: {', '.join(tag.name for tag in doc.tags.all())}",
        f"Document Type: {doc.document_type.name if doc.document_type else ''}",
        f"Correspondent: {doc.correspondent.name if doc.correspondent else ''}",
        f"Storage Path: {doc.storage_path.name if doc.storage_path else ''}",
        f"Archive Serial Number: {doc.archive_serial_number or ''}",
        f"Notes: {','.join([str(c.note) for c in Note.objects.filter(document=doc)])}",
    ]

    for instance in doc.custom_fields.all():
        lines.append(f"Custom Field - {instance.field.name}: {instance}")

    lines.append("\nContent:\n")
    lines.append(doc.content or "")

    return "\n".join(lines)
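As used by llm_index_rebuild above, the dimension returned by get_embedding_dim has to match the FAISS index geometry; a small illustrative sketch (not part of the commit), assuming the faiss-cpu dependency added in pyproject.toml:

# Sketch only: 384 for sentence-transformers/all-MiniLM-L6-v2 (local default),
# 1536 for text-embedding-3-small (OpenAI default).
import faiss

from paperless.ai.embedding import get_embedding_dim

dim = get_embedding_dim()
faiss_index = faiss.IndexFlatL2(dim)  # flat L2 index, as created in tasks.py
print(faiss_index.d, faiss_index.ntotal)  # dimension, and 0 vectors until documents are added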

View File

@@ -0,0 +1,52 @@
import logging

import llama_index.core.settings as llama_settings
from django.conf import settings
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core import load_index_from_storage
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.vector_stores.faiss import FaissVectorStore

from documents.models import Document
from paperless.ai.embedding import get_embedding_model

logger = logging.getLogger("paperless.ai.indexing")


def load_index() -> VectorStoreIndex:
    """Loads the persisted LlamaIndex from disk."""
    vector_store = FaissVectorStore.from_persist_dir(settings.LLM_INDEX_DIR)
    embed_model = get_embedding_model()

    llama_settings.Settings.embed_model = embed_model
    llama_settings.Settings.chunk_size = 512

    storage_context = StorageContext.from_defaults(
        vector_store=vector_store,
        persist_dir=settings.LLM_INDEX_DIR,
    )
    return load_index_from_storage(storage_context)


def query_similar_documents(document: Document, top_k: int = 5) -> list[Document]:
    """Runs a similarity query and returns top-k similar Document objects."""
    # Load index
    index = load_index()
    retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)

    # Build query from the document text
    query_text = (document.title or "") + "\n" + (document.content or "")

    # Query
    results = retriever.retrieve(query_text)
    # Each result node's metadata["id"] should match a stored document
    # (llm_index_rebuild stores the primary key under the "id" metadata key)
    document_ids = [
        int(node.metadata["id"])
        for node in results
        if "id" in node.metadata
    ]

    return list(Document.objects.filter(pk__in=document_ids))
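A minimal usage sketch (not part of the commit) of the retrieval helper; it assumes the vector index has already been built with document_llmindex rebuild, and the document id is a placeholder:

# Sketch only.
from documents.models import Document
from paperless.ai.indexing import query_similar_documents

doc = Document.objects.get(pk=123)  # placeholder id
for similar in query_similar_documents(doc, top_k=3):
    print(similar.pk, similar.title)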

View File

@@ -178,6 +178,8 @@ class AIConfig(BaseConfig):
    """

    ai_enabled: bool = dataclasses.field(init=False)
    llm_embedding_backend: str = dataclasses.field(init=False)
    llm_embedding_model: str = dataclasses.field(init=False)
    llm_backend: str = dataclasses.field(init=False)
    llm_model: str = dataclasses.field(init=False)
    llm_api_key: str = dataclasses.field(init=False)
@@ -187,6 +189,12 @@ class AIConfig(BaseConfig):
        app_config = self._get_config_instance()

        self.ai_enabled = app_config.ai_enabled or settings.AI_ENABLED
        self.llm_embedding_backend = (
            app_config.llm_embedding_backend or settings.LLM_EMBEDDING_BACKEND
        )
        self.llm_embedding_model = (
            app_config.llm_embedding_model or settings.LLM_EMBEDDING_MODEL
        )
        self.llm_backend = app_config.llm_backend or settings.LLM_BACKEND
        self.llm_model = app_config.llm_model or settings.LLM_MODEL
        self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY
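A short sketch (not part of the commit) of the resolution order implied above: values stored in ApplicationConfiguration win, otherwise the PAPERLESS_LLM_* settings shown further down apply:

# Sketch only: inspect the resolved embedding configuration.
from paperless.config import AIConfig

config = AIConfig()
if config.ai_enabled:
    # "local" unless overridden (see LLM_EMBEDDING_BACKEND in settings.py);
    # model is None until set, so get_embedding_model() falls back to a per-backend default.
    print(config.llm_embedding_backend, config.llm_embedding_model)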

View File

@@ -19,6 +19,27 @@ class Migration(migrations.Migration):
                verbose_name="Enables AI features",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_embedding_backend",
            field=models.CharField(
                blank=True,
                choices=[("openai", "OpenAI"), ("local", "Local")],
                max_length=32,
                null=True,
                verbose_name="Sets the LLM Embedding backend",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_embedding_model",
            field=models.CharField(
                blank=True,
                max_length=32,
                null=True,
                verbose_name="Sets the LLM Embedding model",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_api_key",

View File

@@ -74,6 +74,11 @@ class ColorConvertChoices(models.TextChoices):
    CMYK = ("CMYK", _("CMYK"))


class LLMEmbeddingBackend(models.TextChoices):
    OPENAI = ("openai", _("OpenAI"))
    LOCAL = ("local", _("Local"))


class LLMBackend(models.TextChoices):
    """
    Matches to --llm-backend
@@ -284,6 +289,21 @@ class ApplicationConfiguration(AbstractSingletonModel):
        default=False,
    )

    llm_embedding_backend = models.CharField(
        verbose_name=_("Sets the LLM embedding backend"),
        null=True,
        blank=True,
        max_length=32,
        choices=LLMEmbeddingBackend.choices,
    )

    llm_embedding_model = models.CharField(
        verbose_name=_("Sets the LLM embedding model"),
        null=True,
        blank=True,
        max_length=32,
    )

    llm_backend = models.CharField(
        verbose_name=_("Sets the LLM backend"),
        null=True,

View File

@@ -283,6 +283,7 @@ MODEL_FILE = __get_path(
    "PAPERLESS_MODEL_FILE",
    DATA_DIR / "classification_model.pickle",
)

LLM_INDEX_DIR = DATA_DIR / "llm_index"

LOGGING_DIR = __get_path("PAPERLESS_LOGGING_DIR", DATA_DIR / "log")
@@ -1281,7 +1282,12 @@ OUTLOOK_OAUTH_ENABLED = bool(
# AI Settings                                                                  #
################################################################################
AI_ENABLED = __get_boolean("PAPERLESS_AI_ENABLED", "NO")
LLM_EMBEDDING_BACKEND = os.getenv(
    "PAPERLESS_LLM_EMBEDDING_BACKEND",
    "local",
)  # or "openai"
LLM_EMBEDDING_MODEL = os.getenv("PAPERLESS_LLM_EMBEDDING_MODEL")
LLM_BACKEND = os.getenv("PAPERLESS_LLM_BACKEND", "ollama")  # or "openai"
LLM_MODEL = os.getenv("PAPERLESS_LLM_MODEL")
LLM_API_KEY = os.getenv("PAPERLESS_LLM_API_KEY")
LLM_URL = os.getenv("PAPERLESS_LLM_URL")

uv.lock (generated, 1604 changed lines): file diff suppressed because it is too large.