mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-08-10 00:18:57 +00:00
llamaindex vector index, llmindex management command
This commit is contained in:
67
src/paperless/ai/embedding.py
Normal file
67
src/paperless/ai/embedding.py
Normal file
@@ -0,0 +1,67 @@
|
||||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||
from llama_index.embeddings.openai import OpenAIEmbedding
|
||||
|
||||
from documents.models import Document
|
||||
from documents.models import Note
|
||||
from paperless.config import AIConfig
|
||||
|
||||
# Embedding vector size for each model name this module knows about.
# get_embedding_dim() looks models up here and rejects any name not listed.
EMBEDDING_DIMENSIONS: dict[str, int] = {
    "text-embedding-3-small": 1536,
    "sentence-transformers/all-MiniLM-L6-v2": 384,
}
|
||||
|
||||
|
||||
def get_embedding_model():
    """
    Build the embedding model selected by the current AI configuration.

    The "openai" backend yields an OpenAIEmbedding (using the configured API
    key) and the "local" backend a HuggingFaceEmbedding. Each falls back to a
    built-in default model name when no embedding model is configured.

    Raises:
        ValueError: if the configured embedding backend is neither "openai"
            nor "local".
    """
    ai_config = AIConfig()
    backend = ai_config.llm_embedding_backend

    if backend == "openai":
        chosen_model = ai_config.llm_embedding_model or "text-embedding-3-small"
        return OpenAIEmbedding(model=chosen_model, api_key=ai_config.llm_api_key)

    if backend == "local":
        chosen_model = (
            ai_config.llm_embedding_model or "sentence-transformers/all-MiniLM-L6-v2"
        )
        return HuggingFaceEmbedding(model_name=chosen_model)

    # Anything else is a configuration error; surface the offending value.
    raise ValueError(
        f"Unsupported embedding backend: {ai_config.llm_embedding_backend}",
    )
|
||||
|
||||
|
||||
def get_embedding_dim() -> int:
    """
    Return the vector size of the embedding model currently in effect.

    The model name comes from the AI configuration. When none is configured,
    the default for the configured backend is used: the OpenAI default for
    "openai", the sentence-transformers default for anything else.

    Raises:
        ValueError: if the resulting model name has no entry in
            EMBEDDING_DIMENSIONS.
    """
    ai_config = AIConfig()

    model_name = ai_config.llm_embedding_model
    if not model_name:
        # No explicit model: pick the backend's default model name.
        if ai_config.llm_embedding_backend == "openai":
            model_name = "text-embedding-3-small"
        else:
            model_name = "sentence-transformers/all-MiniLM-L6-v2"

    dimension = EMBEDDING_DIMENSIONS.get(model_name)
    if dimension is None:
        raise ValueError(f"Unknown embedding model: {model_name}")
    return dimension
|
||||
|
||||
|
||||
def build_llm_index_text(doc: Document) -> str:
    """
    Render a document as the plain-text blob that gets indexed.

    The text is one "Label: value" line per metadata item (title, filename,
    dates, tags, document type, correspondent, storage path, archive serial
    number, notes), then one line per custom field instance, then a
    "Content:" marker followed by the document content. Unset optional
    values render as empty strings.
    """

    def name_or_blank(related) -> str:
        # Optional relations contribute an empty string when unset.
        return related.name if related else ""

    parts = [
        f"Title: {doc.title}",
        f"Filename: {doc.filename}",
        f"Created: {doc.created}",
        f"Added: {doc.added}",
        f"Modified: {doc.modified}",
        f"Tags: {', '.join(tag.name for tag in doc.tags.all())}",
        f"Document Type: {name_or_blank(doc.document_type)}",
        f"Correspondent: {name_or_blank(doc.correspondent)}",
        f"Storage Path: {name_or_blank(doc.storage_path)}",
        f"Archive Serial Number: {doc.archive_serial_number or ''}",
        f"Notes: {','.join(str(entry.note) for entry in Note.objects.filter(document=doc))}",
    ]

    parts.extend(
        f"Custom Field - {instance.field.name}: {instance}"
        for instance in doc.custom_fields.all()
    )

    # The marker carries its own newlines, so joining with "\n" leaves a
    # blank line on either side of "Content:".
    parts.extend(("\nContent:\n", doc.content or ""))

    return "\n".join(parts)
|
52
src/paperless/ai/indexing.py
Normal file
52
src/paperless/ai/indexing.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import logging
|
||||
|
||||
import llama_index.core.settings as llama_settings
|
||||
from django.conf import settings
|
||||
from llama_index.core import StorageContext
|
||||
from llama_index.core import VectorStoreIndex
|
||||
from llama_index.core import load_index_from_storage
|
||||
from llama_index.core.retrievers import VectorIndexRetriever
|
||||
from llama_index.vector_stores.faiss import FaissVectorStore
|
||||
|
||||
from documents.models import Document
|
||||
from paperless.ai.embedding import get_embedding_model
|
||||
|
||||
logger = logging.getLogger("paperless.ai.indexing")
|
||||
|
||||
|
||||
def load_index() -> VectorStoreIndex:
    """
    Load the persisted LlamaIndex index from settings.LLM_INDEX_DIR.

    Side effect: sets the global LlamaIndex settings, assigning embed_model
    from get_embedding_model() and chunk_size to 512, before the index is
    loaded.
    """
    index_dir = settings.LLM_INDEX_DIR
    faiss_store = FaissVectorStore.from_persist_dir(index_dir)

    global_settings = llama_settings.Settings
    global_settings.embed_model = get_embedding_model()
    global_settings.chunk_size = 512

    return load_index_from_storage(
        StorageContext.from_defaults(
            vector_store=faiss_store,
            persist_dir=index_dir,
        ),
    )
|
||||
|
||||
|
||||
def query_similar_documents(document: Document, top_k: int = 5) -> list[Document]:
    """
    Run a similarity query and return the matching Document objects.

    The query text is the document's title and content joined by a newline.
    The persisted index is loaded, up to ``top_k`` nodes are retrieved, and
    the documents referenced by each node's "document_id" metadata are
    fetched from the database.

    Documents are returned in the order the retriever produced their nodes,
    each at most once even when several retrieved nodes point at the same
    document. Ids with no matching database row are skipped.

    Args:
        document: The document to find similar documents for.
        top_k: Maximum number of nodes to retrieve from the index.

    Returns:
        The matched documents, in retrieval order; possibly fewer than
        ``top_k``.
    """
    retriever = VectorIndexRetriever(index=load_index(), similarity_top_k=top_k)

    # Build the query from the document text
    query_text = (document.title or "") + "\n" + (document.content or "")
    results = retriever.retrieve(query_text)

    # Each result's metadata["document_id"] should match a stored document.
    # dict.fromkeys de-duplicates while keeping first-seen (retrieval) order.
    ranked_ids = list(
        dict.fromkeys(
            int(hit.metadata["document_id"])
            for hit in results
            if "document_id" in hit.metadata
        ),
    )

    # A pk__in filter gives no ordering guarantee, so re-apply retrieval order
    # after fetching instead of returning the queryset order directly.
    documents_by_pk = {
        doc.pk: doc for doc in Document.objects.filter(pk__in=ranked_ids)
    }
    return [documents_by_pk[pk] for pk in ranked_ids if pk in documents_by_pk]
|
@@ -178,6 +178,8 @@ class AIConfig(BaseConfig):
|
||||
"""
|
||||
|
||||
ai_enabled: bool = dataclasses.field(init=False)
|
||||
llm_embedding_backend: str = dataclasses.field(init=False)
|
||||
llm_embedding_model: str = dataclasses.field(init=False)
|
||||
llm_backend: str = dataclasses.field(init=False)
|
||||
llm_model: str = dataclasses.field(init=False)
|
||||
llm_api_key: str = dataclasses.field(init=False)
|
||||
@@ -187,6 +189,12 @@ class AIConfig(BaseConfig):
|
||||
app_config = self._get_config_instance()
|
||||
|
||||
self.ai_enabled = app_config.ai_enabled or settings.AI_ENABLED
|
||||
self.llm_embedding_backend = (
|
||||
app_config.llm_embedding_backend or settings.LLM_EMBEDDING_BACKEND
|
||||
)
|
||||
self.llm_embedding_model = (
|
||||
app_config.llm_embedding_model or settings.LLM_EMBEDDING_MODEL
|
||||
)
|
||||
self.llm_backend = app_config.llm_backend or settings.LLM_BACKEND
|
||||
self.llm_model = app_config.llm_model or settings.LLM_MODEL
|
||||
self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY
|
||||
|
@@ -19,6 +19,27 @@ class Migration(migrations.Migration):
|
||||
verbose_name="Enables AI features",
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name="applicationconfiguration",
|
||||
name="llm_embedding_backend",
|
||||
field=models.CharField(
|
||||
blank=True,
|
||||
choices=[("openai", "OpenAI"), ("local", "Local")],
|
||||
max_length=32,
|
||||
null=True,
|
||||
verbose_name="Sets the LLM Embedding backend",
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name="applicationconfiguration",
|
||||
name="llm_embedding_model",
|
||||
field=models.CharField(
|
||||
blank=True,
|
||||
max_length=32,
|
||||
null=True,
|
||||
verbose_name="Sets the LLM Embedding model",
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name="applicationconfiguration",
|
||||
name="llm_api_key",
|
||||
|
@@ -74,6 +74,11 @@ class ColorConvertChoices(models.TextChoices):
|
||||
CMYK = ("CMYK", _("CMYK"))
|
||||
|
||||
|
||||
class LLMEmbeddingBackend(models.TextChoices):
    """
    Choices for the LLM embedding backend setting
    """

    OPENAI = "openai", _("OpenAI")
    LOCAL = "local", _("Local")
|
||||
|
||||
|
||||
class LLMBackend(models.TextChoices):
|
||||
"""
|
||||
Matches to --llm-backend
|
||||
@@ -284,6 +289,21 @@ class ApplicationConfiguration(AbstractSingletonModel):
|
||||
default=False,
|
||||
)
|
||||
|
||||
llm_embedding_backend = models.CharField(
|
||||
verbose_name=_("Sets the LLM embedding backend"),
|
||||
null=True,
|
||||
blank=True,
|
||||
max_length=32,
|
||||
choices=LLMEmbeddingBackend.choices,
|
||||
)
|
||||
|
||||
llm_embedding_model = models.CharField(
|
||||
verbose_name=_("Sets the LLM embedding model"),
|
||||
null=True,
|
||||
blank=True,
|
||||
max_length=32,
|
||||
)
|
||||
|
||||
llm_backend = models.CharField(
|
||||
verbose_name=_("Sets the LLM backend"),
|
||||
null=True,
|
||||
|
@@ -284,6 +284,7 @@ MODEL_FILE = __get_path(
|
||||
"PAPERLESS_MODEL_FILE",
|
||||
DATA_DIR / "classification_model.pickle",
|
||||
)
|
||||
LLM_INDEX_DIR = DATA_DIR / "llm_index"
|
||||
|
||||
LOGGING_DIR = __get_path("PAPERLESS_LOGGING_DIR", DATA_DIR / "log")
|
||||
|
||||
@@ -1283,7 +1284,12 @@ OUTLOOK_OAUTH_ENABLED = bool(
|
||||
# AI Settings #
|
||||
################################################################################
|
||||
AI_ENABLED = __get_boolean("PAPERLESS_AI_ENABLED", "NO")
|
||||
LLM_BACKEND = os.getenv("PAPERLESS_LLM_BACKEND", "openai") # or "ollama"
|
||||
LLM_EMBEDDING_BACKEND = os.getenv(
|
||||
"PAPERLESS_LLM_EMBEDDING_BACKEND",
|
||||
"local",
|
||||
) # or "openai"
|
||||
LLM_EMBEDDING_MODEL = os.getenv("PAPERLESS_LLM_EMBEDDING_MODEL")
|
||||
LLM_BACKEND = os.getenv("PAPERLESS_LLM_BACKEND", "ollama") # or "openai"
|
||||
LLM_MODEL = os.getenv("PAPERLESS_LLM_MODEL")
|
||||
LLM_API_KEY = os.getenv("PAPERLESS_LLM_API_KEY")
|
||||
LLM_URL = os.getenv("PAPERLESS_LLM_URL")
|
||||
|
Reference in New Issue
Block a user