mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-05-23 12:58:18 -05:00
llamaindex vector index, llmindex management command
This commit is contained in:
parent
02c8221a50
commit
58f3b7be0a
@ -11,6 +11,7 @@ for command in decrypt_documents \
|
|||||||
mail_fetcher \
|
mail_fetcher \
|
||||||
document_create_classifier \
|
document_create_classifier \
|
||||||
document_index \
|
document_index \
|
||||||
|
document_llmindex \
|
||||||
document_renamer \
|
document_renamer \
|
||||||
document_retagger \
|
document_retagger \
|
||||||
document_thumbnails \
|
document_thumbnails \
|
||||||
|
14
docker/rootfs/usr/local/bin/document_llmindex
Executable file
14
docker/rootfs/usr/local/bin/document_llmindex
Executable file
@ -0,0 +1,14 @@
|
|||||||
|
#!/command/with-contenv /usr/bin/bash
# shellcheck shell=bash

# Wrapper around the `document_llmindex` management command. All arguments
# are forwarded unchanged to manage.py.

set -e

cd "${PAPERLESS_SRC_DIR}"

case "$(id -u)" in
	0)
		# Started as root: run the command as the paperless user instead.
		s6-setuidgid paperless python3 manage.py document_llmindex "$@"
		;;
	*)
		if [[ $(id -un) == "paperless" ]]; then
			python3 manage.py document_llmindex "$@"
		else
			echo "Unknown user."
		fi
		;;
esac
|
@ -39,6 +39,7 @@ dependencies = [
|
|||||||
"drf-spectacular~=0.28",
|
"drf-spectacular~=0.28",
|
||||||
"drf-spectacular-sidecar~=2025.4.1",
|
"drf-spectacular-sidecar~=2025.4.1",
|
||||||
"drf-writable-nested~=0.7.1",
|
"drf-writable-nested~=0.7.1",
|
||||||
|
"faiss-cpu>=1.10",
|
||||||
"filelock~=3.18.0",
|
"filelock~=3.18.0",
|
||||||
"flower~=2.0.1",
|
"flower~=2.0.1",
|
||||||
"gotenberg-client~=0.10.0",
|
"gotenberg-client~=0.10.0",
|
||||||
@ -47,8 +48,12 @@ dependencies = [
|
|||||||
"inotifyrecursive~=0.3",
|
"inotifyrecursive~=0.3",
|
||||||
"jinja2~=3.1.5",
|
"jinja2~=3.1.5",
|
||||||
"langdetect~=1.0.9",
|
"langdetect~=1.0.9",
|
||||||
|
"llama-index>=0.12.33",
|
||||||
|
"llama-index-embeddings-huggingface>=0.5.3",
|
||||||
|
"llama-index-vector-stores-faiss>=0.3",
|
||||||
"nltk~=3.9.1",
|
"nltk~=3.9.1",
|
||||||
"ocrmypdf~=16.10.0",
|
"ocrmypdf~=16.10.0",
|
||||||
|
"openai>=1.76",
|
||||||
"pathvalidate~=3.2.3",
|
"pathvalidate~=3.2.3",
|
||||||
"pdf2image~=1.17.0",
|
"pdf2image~=1.17.0",
|
||||||
"python-dateutil~=2.9.0",
|
"python-dateutil~=2.9.0",
|
||||||
@ -60,6 +65,7 @@ dependencies = [
|
|||||||
"rapidfuzz~=3.13.0",
|
"rapidfuzz~=3.13.0",
|
||||||
"redis[hiredis]~=5.2.1",
|
"redis[hiredis]~=5.2.1",
|
||||||
"scikit-learn~=1.6.1",
|
"scikit-learn~=1.6.1",
|
||||||
|
"sentence-transformers>=4.1",
|
||||||
"setproctitle~=1.3.4",
|
"setproctitle~=1.3.4",
|
||||||
"tika-client~=0.9.0",
|
"tika-client~=0.9.0",
|
||||||
"tqdm~=4.67.1",
|
"tqdm~=4.67.1",
|
||||||
|
19
src/documents/management/commands/document_llmindex.py
Normal file
19
src/documents/management/commands/document_llmindex.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
from django.core.management import BaseCommand
|
||||||
|
from django.db import transaction
|
||||||
|
|
||||||
|
from documents.management.commands.mixins import ProgressBarMixin
|
||||||
|
from documents.tasks import llm_index_rebuild
|
||||||
|
|
||||||
|
|
||||||
|
class Command(ProgressBarMixin, BaseCommand):
    """Management command for maintaining the LLM vector index.

    Takes a single positional sub-command; ``rebuild`` is currently the only
    accepted value.
    """

    help = "Manages the LLM-based vector index for Paperless."

    def add_arguments(self, parser):
        """Register the positional sub-command and the progress bar options."""
        parser.add_argument("command", choices=["rebuild"])
        self.add_argument_progress_bar_mixin(parser)

    def handle(self, *args, **options):
        """Run the requested sub-command."""
        self.handle_progress_bar_mixin(**options)
        with transaction.atomic():
            if options["command"] == "rebuild":
                # Ask for a real rebuild. Without ``rebuild=True`` the task
                # loads any index already on disk and inserts every document
                # into it again instead of starting from an empty index.
                llm_index_rebuild(
                    progress_bar_disable=self.no_progress_bar,
                    rebuild=True,
                )
|
@ -6,6 +6,7 @@ import uuid
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from tempfile import TemporaryDirectory
|
from tempfile import TemporaryDirectory
|
||||||
|
|
||||||
|
import faiss
|
||||||
import tqdm
|
import tqdm
|
||||||
from celery import Task
|
from celery import Task
|
||||||
from celery import shared_task
|
from celery import shared_task
|
||||||
@ -17,6 +18,11 @@ from django.db import transaction
|
|||||||
from django.db.models.signals import post_save
|
from django.db.models.signals import post_save
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
from filelock import FileLock
|
from filelock import FileLock
|
||||||
|
from llama_index.core import Document as LlamaDocument
|
||||||
|
from llama_index.core import StorageContext
|
||||||
|
from llama_index.core import VectorStoreIndex
|
||||||
|
from llama_index.core.settings import Settings
|
||||||
|
from llama_index.vector_stores.faiss import FaissVectorStore
|
||||||
from whoosh.writing import AsyncWriter
|
from whoosh.writing import AsyncWriter
|
||||||
|
|
||||||
from documents import index
|
from documents import index
|
||||||
@ -52,6 +58,9 @@ from documents.sanity_checker import SanityCheckFailedException
|
|||||||
from documents.signals import document_updated
|
from documents.signals import document_updated
|
||||||
from documents.signals.handlers import cleanup_document_deletion
|
from documents.signals.handlers import cleanup_document_deletion
|
||||||
from documents.signals.handlers import run_workflows
|
from documents.signals.handlers import run_workflows
|
||||||
|
from paperless.ai.embedding import build_llm_index_text
|
||||||
|
from paperless.ai.embedding import get_embedding_dim
|
||||||
|
from paperless.ai.embedding import get_embedding_model
|
||||||
|
|
||||||
if settings.AUDIT_LOG_ENABLED:
|
if settings.AUDIT_LOG_ENABLED:
|
||||||
from auditlog.models import LogEntry
|
from auditlog.models import LogEntry
|
||||||
@ -496,3 +505,52 @@ def check_scheduled_workflows():
|
|||||||
workflow_to_run=workflow,
|
workflow_to_run=workflow,
|
||||||
document=document,
|
document=document,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def llm_index_rebuild(*, progress_bar_disable=False, rebuild=False):
    """Build the LlamaIndex/FAISS vector index and persist it to disk.

    Args:
        progress_bar_disable: When true, the tqdm progress bar is disabled.
        rebuild: When true, the index directory is removed and an empty FAISS
            index is created. When false, a previously persisted vector store
            is loaded if one is present; otherwise an empty one is created.

    Documents without content are skipped.

    NOTE(review): when an existing store is loaded, every document is inserted
    into it again, so callers wanting a clean index should pass
    ``rebuild=True``.
    """
    index_dir = settings.LLM_INDEX_DIR

    if rebuild:
        shutil.rmtree(index_dir, ignore_errors=True)
        index_dir.mkdir(parents=True, exist_ok=True)

    # Load the relations read in the loop below (and by build_llm_index_text)
    # up front, so they are not fetched with extra queries per document.
    documents = Document.objects.select_related(
        "correspondent",
        "document_type",
        "storage_path",
    ).prefetch_related("tags")

    embed_model = get_embedding_model()

    # A missing *or empty* directory contains no persisted store to load, so
    # both cases start from a fresh FAISS index.
    has_persisted_index = (
        not rebuild and index_dir.is_dir() and any(index_dir.iterdir())
    )
    if has_persisted_index:
        vector_store = FaissVectorStore.from_persist_dir(index_dir)
    else:
        faiss_index = faiss.IndexFlatL2(get_embedding_dim())
        vector_store = FaissVectorStore(faiss_index)

    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    Settings.embed_model = embed_model

    llm_docs = []
    for document in tqdm.tqdm(documents, disable=progress_bar_disable):
        if not document.content:
            continue
        correspondent = document.correspondent
        document_type = document.document_type
        llm_docs.append(
            LlamaDocument(
                text=build_llm_index_text(document),
                metadata={
                    "id": document.id,
                    "title": document.title,
                    "tags": [t.name for t in document.tags.all()],
                    "correspondent": correspondent.name if correspondent else None,
                    "document_type": document_type.name if document_type else None,
                    "created": document.created.isoformat(),
                    "added": document.added.isoformat(),
                },
            ),
        )

    # Deliberately not named ``index``: that would shadow the module-level
    # ``from documents import index`` import inside this function.
    vector_index = VectorStoreIndex.from_documents(
        llm_docs,
        storage_context=storage_context,
    )
    # ``parents=True`` so persisting also works when the parent directory has
    # not been created yet.
    index_dir.mkdir(parents=True, exist_ok=True)
    vector_index.storage_context.persist(persist_dir=index_dir)
|
||||||
|
67
src/paperless/ai/embedding.py
Normal file
67
src/paperless/ai/embedding.py
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||||
|
from llama_index.embeddings.openai import OpenAIEmbedding
|
||||||
|
|
||||||
|
from documents.models import Document
|
||||||
|
from documents.models import Note
|
||||||
|
from paperless.config import AIConfig
|
||||||
|
|
||||||
|
EMBEDDING_DIMENSIONS = {
|
||||||
|
"text-embedding-3-small": 1536,
|
||||||
|
"sentence-transformers/all-MiniLM-L6-v2": 384,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_embedding_model():
    """Instantiate the embedding model selected by the AI configuration.

    The "openai" backend yields an ``OpenAIEmbedding`` and the "local" backend
    a ``HuggingFaceEmbedding``. Each falls back to a built-in default model
    name when no embedding model is configured.

    Raises:
        ValueError: If the configured backend is neither "openai" nor "local".
    """
    ai_config = AIConfig()
    backend = ai_config.llm_embedding_backend

    if backend == "openai":
        return OpenAIEmbedding(
            model=ai_config.llm_embedding_model or "text-embedding-3-small",
            api_key=ai_config.llm_api_key,
        )

    if backend == "local":
        local_model = (
            ai_config.llm_embedding_model
            or "sentence-transformers/all-MiniLM-L6-v2"
        )
        return HuggingFaceEmbedding(model_name=local_model)

    raise ValueError(
        f"Unsupported embedding backend: {backend}",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_embedding_dim() -> int:
    """Return the vector dimension of the configured embedding model.

    When no model is configured, the default for the backend is used:
    "text-embedding-3-small" for "openai", otherwise
    "sentence-transformers/all-MiniLM-L6-v2".

    Raises:
        ValueError: If the model has no entry in ``EMBEDDING_DIMENSIONS``.
    """
    ai_config = AIConfig()

    model_name = ai_config.llm_embedding_model
    if not model_name:
        if ai_config.llm_embedding_backend == "openai":
            model_name = "text-embedding-3-small"
        else:
            model_name = "sentence-transformers/all-MiniLM-L6-v2"

    dimension = EMBEDDING_DIMENSIONS.get(model_name)
    if dimension is None:
        raise ValueError(f"Unknown embedding model: {model_name}")
    return dimension
|
||||||
|
|
||||||
|
|
||||||
|
def build_llm_index_text(doc: Document) -> str:
    """Render a document's metadata and content as one newline-joined text.

    The result lists one ``Label: value`` line per metadata attribute, then
    one line per custom field instance, and ends with a ``Content:`` section
    holding the document content (empty when the document has none).
    """

    def name_or_blank(related) -> str:
        # Optional relations contribute an empty value rather than "None".
        return related.name if related else ""

    tag_names = ", ".join(tag.name for tag in doc.tags.all())
    header = [
        f"Title: {doc.title}",
        f"Filename: {doc.filename}",
        f"Created: {doc.created}",
        f"Added: {doc.added}",
        f"Modified: {doc.modified}",
        f"Tags: {tag_names}",
        f"Document Type: {name_or_blank(doc.document_type)}",
        f"Correspondent: {name_or_blank(doc.correspondent)}",
        f"Storage Path: {name_or_blank(doc.storage_path)}",
        f"Archive Serial Number: {doc.archive_serial_number or ''}",
    ]

    note_texts = [str(entry.note) for entry in Note.objects.filter(document=doc)]
    header.append(f"Notes: {','.join(note_texts)}")

    custom_field_lines = [
        f"Custom Field - {instance.field.name}: {instance}"
        for instance in doc.custom_fields.all()
    ]

    body = ["\nContent:\n", doc.content or ""]

    return "\n".join(header + custom_field_lines + body)
|
52
src/paperless/ai/indexing.py
Normal file
52
src/paperless/ai/indexing.py
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
import llama_index.core.settings as llama_settings
|
||||||
|
from django.conf import settings
|
||||||
|
from llama_index.core import StorageContext
|
||||||
|
from llama_index.core import VectorStoreIndex
|
||||||
|
from llama_index.core import load_index_from_storage
|
||||||
|
from llama_index.core.retrievers import VectorIndexRetriever
|
||||||
|
from llama_index.vector_stores.faiss import FaissVectorStore
|
||||||
|
|
||||||
|
from documents.models import Document
|
||||||
|
from paperless.ai.embedding import get_embedding_model
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.ai.indexing")
|
||||||
|
|
||||||
|
|
||||||
|
def load_index() -> VectorStoreIndex:
    """Load the persisted LlamaIndex from ``settings.LLM_INDEX_DIR``.

    As a side effect, the global LlamaIndex settings are updated with the
    configured embedding model and a chunk size of 512.
    """
    index_dir = settings.LLM_INDEX_DIR
    faiss_store = FaissVectorStore.from_persist_dir(index_dir)

    llama_config = llama_settings.Settings
    llama_config.embed_model = get_embedding_model()
    llama_config.chunk_size = 512

    context = StorageContext.from_defaults(
        vector_store=faiss_store,
        persist_dir=index_dir,
    )
    return load_index_from_storage(context)
|
||||||
|
|
||||||
|
|
||||||
|
def query_similar_documents(document: Document, top_k: int = 5) -> list[Document]:
    """Run a similarity query and return the documents it matches.

    Args:
        document: Document whose title and content form the query text.
        top_k: Maximum number of nodes to retrieve from the index.

    Returns:
        The distinct documents referenced by the retrieved nodes, in the
        order the retriever returned them. Nodes whose document no longer
        exists in the database are dropped.
    """
    retriever = VectorIndexRetriever(index=load_index(), similarity_top_k=top_k)

    # Build the query from the document text
    query_text = (document.title or "") + "\n" + (document.content or "")
    results = retriever.retrieve(query_text)

    # The index stores the primary key under "id"; "document_id" is accepted
    # as a fallback. Several nodes may belong to the same document, so ids
    # are de-duplicated while keeping their first-seen order.
    ranked_ids: list[int] = []
    for result in results:
        metadata = result.metadata
        raw_id = metadata.get("id", metadata.get("document_id"))
        if raw_id is None:
            continue
        doc_id = int(raw_id)
        if doc_id not in ranked_ids:
            ranked_ids.append(doc_id)

    # ``pk__in`` returns rows in database order, so restore retrieval order.
    found = {doc.pk: doc for doc in Document.objects.filter(pk__in=ranked_ids)}
    return [found[pk] for pk in ranked_ids if pk in found]
|
@ -178,6 +178,8 @@ class AIConfig(BaseConfig):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
ai_enabled: bool = dataclasses.field(init=False)
|
ai_enabled: bool = dataclasses.field(init=False)
|
||||||
|
llm_embedding_backend: str = dataclasses.field(init=False)
|
||||||
|
llm_embedding_model: str = dataclasses.field(init=False)
|
||||||
llm_backend: str = dataclasses.field(init=False)
|
llm_backend: str = dataclasses.field(init=False)
|
||||||
llm_model: str = dataclasses.field(init=False)
|
llm_model: str = dataclasses.field(init=False)
|
||||||
llm_api_key: str = dataclasses.field(init=False)
|
llm_api_key: str = dataclasses.field(init=False)
|
||||||
@ -187,6 +189,12 @@ class AIConfig(BaseConfig):
|
|||||||
app_config = self._get_config_instance()
|
app_config = self._get_config_instance()
|
||||||
|
|
||||||
self.ai_enabled = app_config.ai_enabled or settings.AI_ENABLED
|
self.ai_enabled = app_config.ai_enabled or settings.AI_ENABLED
|
||||||
|
self.llm_embedding_backend = (
|
||||||
|
app_config.llm_embedding_backend or settings.LLM_EMBEDDING_BACKEND
|
||||||
|
)
|
||||||
|
self.llm_embedding_model = (
|
||||||
|
app_config.llm_embedding_model or settings.LLM_EMBEDDING_MODEL
|
||||||
|
)
|
||||||
self.llm_backend = app_config.llm_backend or settings.LLM_BACKEND
|
self.llm_backend = app_config.llm_backend or settings.LLM_BACKEND
|
||||||
self.llm_model = app_config.llm_model or settings.LLM_MODEL
|
self.llm_model = app_config.llm_model or settings.LLM_MODEL
|
||||||
self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY
|
self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY
|
||||||
|
@ -19,6 +19,27 @@ class Migration(migrations.Migration):
|
|||||||
verbose_name="Enables AI features",
|
verbose_name="Enables AI features",
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name="applicationconfiguration",
|
||||||
|
name="llm_embedding_backend",
|
||||||
|
field=models.CharField(
|
||||||
|
blank=True,
|
||||||
|
choices=[("openai", "OpenAI"), ("local", "Local")],
|
||||||
|
max_length=32,
|
||||||
|
null=True,
|
||||||
|
verbose_name="Sets the LLM Embedding backend",
|
||||||
|
),
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name="applicationconfiguration",
|
||||||
|
name="llm_embedding_model",
|
||||||
|
field=models.CharField(
|
||||||
|
blank=True,
|
||||||
|
max_length=32,
|
||||||
|
null=True,
|
||||||
|
verbose_name="Sets the LLM Embedding model",
|
||||||
|
),
|
||||||
|
),
|
||||||
migrations.AddField(
|
migrations.AddField(
|
||||||
model_name="applicationconfiguration",
|
model_name="applicationconfiguration",
|
||||||
name="llm_api_key",
|
name="llm_api_key",
|
||||||
|
@ -74,6 +74,11 @@ class ColorConvertChoices(models.TextChoices):
|
|||||||
CMYK = ("CMYK", _("CMYK"))
|
CMYK = ("CMYK", _("CMYK"))
|
||||||
|
|
||||||
|
|
||||||
|
class LLMEmbeddingBackend(models.TextChoices):
    """
    Choices for the LLM embedding backend setting
    """

    OPENAI = "openai", _("OpenAI")
    LOCAL = "local", _("Local")
|
||||||
|
|
||||||
|
|
||||||
class LLMBackend(models.TextChoices):
|
class LLMBackend(models.TextChoices):
|
||||||
"""
|
"""
|
||||||
Matches to --llm-backend
|
Matches to --llm-backend
|
||||||
@ -284,6 +289,21 @@ class ApplicationConfiguration(AbstractSingletonModel):
|
|||||||
default=False,
|
default=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
llm_embedding_backend = models.CharField(
|
||||||
|
verbose_name=_("Sets the LLM embedding backend"),
|
||||||
|
null=True,
|
||||||
|
blank=True,
|
||||||
|
max_length=32,
|
||||||
|
choices=LLMEmbeddingBackend.choices,
|
||||||
|
)
|
||||||
|
|
||||||
|
llm_embedding_model = models.CharField(
|
||||||
|
verbose_name=_("Sets the LLM embedding model"),
|
||||||
|
null=True,
|
||||||
|
blank=True,
|
||||||
|
max_length=32,
|
||||||
|
)
|
||||||
|
|
||||||
llm_backend = models.CharField(
|
llm_backend = models.CharField(
|
||||||
verbose_name=_("Sets the LLM backend"),
|
verbose_name=_("Sets the LLM backend"),
|
||||||
null=True,
|
null=True,
|
||||||
|
@ -283,6 +283,7 @@ MODEL_FILE = __get_path(
|
|||||||
"PAPERLESS_MODEL_FILE",
|
"PAPERLESS_MODEL_FILE",
|
||||||
DATA_DIR / "classification_model.pickle",
|
DATA_DIR / "classification_model.pickle",
|
||||||
)
|
)
|
||||||
|
LLM_INDEX_DIR = DATA_DIR / "llm_index"
|
||||||
|
|
||||||
LOGGING_DIR = __get_path("PAPERLESS_LOGGING_DIR", DATA_DIR / "log")
|
LOGGING_DIR = __get_path("PAPERLESS_LOGGING_DIR", DATA_DIR / "log")
|
||||||
|
|
||||||
@ -1281,7 +1282,12 @@ OUTLOOK_OAUTH_ENABLED = bool(
|
|||||||
# AI Settings #
|
# AI Settings #
|
||||||
################################################################################
|
################################################################################
|
||||||
AI_ENABLED = __get_boolean("PAPERLESS_AI_ENABLED", "NO")
|
AI_ENABLED = __get_boolean("PAPERLESS_AI_ENABLED", "NO")
|
||||||
LLM_BACKEND = os.getenv("PAPERLESS_LLM_BACKEND", "openai") # or "ollama"
|
LLM_EMBEDDING_BACKEND = os.getenv(
|
||||||
|
"PAPERLESS_LLM_EMBEDDING_BACKEND",
|
||||||
|
"local",
|
||||||
|
) # or "openai"
|
||||||
|
LLM_EMBEDDING_MODEL = os.getenv("PAPERLESS_LLM_EMBEDDING_MODEL")
|
||||||
|
LLM_BACKEND = os.getenv("PAPERLESS_LLM_BACKEND", "ollama") # or "openai"
|
||||||
LLM_MODEL = os.getenv("PAPERLESS_LLM_MODEL")
|
LLM_MODEL = os.getenv("PAPERLESS_LLM_MODEL")
|
||||||
LLM_API_KEY = os.getenv("PAPERLESS_LLM_API_KEY")
|
LLM_API_KEY = os.getenv("PAPERLESS_LLM_API_KEY")
|
||||||
LLM_URL = os.getenv("PAPERLESS_LLM_URL")
|
LLM_URL = os.getenv("PAPERLESS_LLM_URL")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user