mirror of https://github.com/paperless-ngx/paperless-ngx.git
Add LLM index update queuing and improve error handling
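Adds a queue_llm_index_update_if_needed() helper that enqueues the llmindex_index Celery task when the LLM vector index is missing or fails to load, skipping the enqueue while an LLMINDEX_UPDATE task is already pending/started or was created within the last five minutes. query_similar_documents() now fails soft, returning an empty list instead of raising while the index is rebuilt in the background.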
@@ -1,11 +1,14 @@
 import logging
 import shutil
+from datetime import timedelta
 from pathlib import Path
 
 import faiss
 import llama_index.core.settings as llama_settings
 import tqdm
+from celery import states
 from django.conf import settings
+from django.utils import timezone
 from llama_index.core import Document as LlamaDocument
 from llama_index.core import StorageContext
 from llama_index.core import VectorStoreIndex
@@ -21,6 +24,8 @@ from llama_index.core.text_splitter import TokenTextSplitter
 from llama_index.vector_stores.faiss import FaissVectorStore
 
 from documents.models import Document
+from documents.models import PaperlessTask
+from documents.tasks import llmindex_index
 from paperless_ai.embedding import build_llm_index_text
 from paperless_ai.embedding import get_embedding_dim
 from paperless_ai.embedding import get_embedding_model
@@ -28,6 +33,27 @@ from paperless_ai.embedding import get_embedding_model
 logger = logging.getLogger("paperless_ai.indexing")
 
 
+def queue_llm_index_update_if_needed(*, rebuild: bool, reason: str) -> bool:
+    has_running = PaperlessTask.objects.filter(
+        task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
+        status__in=[states.PENDING, states.STARTED],
+    ).exists()
+    has_recent = PaperlessTask.objects.filter(
+        task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
+        date_created__gte=(timezone.now() - timedelta(minutes=5)),
+    ).exists()
+    if has_running or has_recent:
+        return False
+
+    llmindex_index.delay(rebuild=rebuild, scheduled=False, auto=True)
+    logger.warning(
+        "Queued LLM index update%s: %s",
+        " (rebuild)" if rebuild else "",
+        reason,
+    )
+    return True
+
+
 def get_or_create_storage_context(*, rebuild=False):
     """
     Loads or creates the StorageContext (vector store, docstore, index store).
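The new helper is a debounce guard around the Celery task: it enqueues llmindex_index only when no LLMINDEX_UPDATE task is pending or started and none was created within the last five minutes, and its return value reports whether anything was queued. A minimal sketch of how that guard could be exercised in isolation, assuming the helper lives in paperless_ai.indexing (the module the test hunk further down patches); the test name and mocks are illustrative, not part of this commit:

from unittest.mock import patch


def test_queue_llm_index_update_debounce():
    from paperless_ai import indexing  # assumed module path

    with (
        patch("paperless_ai.indexing.PaperlessTask") as mock_task_model,
        patch("paperless_ai.indexing.llmindex_index") as mock_celery_task,
    ):
        # No pending/started task and nothing created recently:
        # the helper enqueues the Celery task and returns True.
        mock_task_model.objects.filter.return_value.exists.return_value = False
        assert indexing.queue_llm_index_update_if_needed(
            rebuild=False,
            reason="example",
        )
        mock_celery_task.delay.assert_called_once_with(
            rebuild=False,
            scheduled=False,
            auto=True,
        )

        # A matching task already exists (running or recent):
        # the helper is a no-op and returns False.
        mock_task_model.objects.filter.return_value.exists.return_value = True
        assert not indexing.queue_llm_index_update_if_needed(
            rebuild=True,
            reason="example",
        )
        mock_celery_task.delay.assert_called_once()  # still just the one call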
@@ -93,6 +119,10 @@ def load_or_build_index(nodes=None):
     except ValueError as e:
         logger.warning("Failed to load index from storage: %s", e)
         if not nodes:
+            queue_llm_index_update_if_needed(
+                rebuild=vector_store_file_exists(),
+                reason="LLM index missing or invalid while loading.",
+            )
             logger.info("No nodes provided for index creation.")
             raise
         return VectorStoreIndex(
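Note the rebuild flag in this error path: when a FAISS vector store file is already on disk but the index still fails to load, the queued task is asked to rebuild (presumably replacing the stale store); when no file exists yet, a plain non-rebuild index run is queued. The original ValueError is still re-raised, so callers that cannot work without an index keep their existing failure mode.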
@@ -250,7 +280,21 @@ def query_similar_documents(
     """
     Runs a similarity query and returns top-k similar Document objects.
     """
-    index = load_or_build_index()
+    if not vector_store_file_exists():
+        queue_llm_index_update_if_needed(
+            rebuild=False,
+            reason="LLM index not found for similarity query.",
+        )
+        return []
+
+    try:
+        index = load_or_build_index()
+    except ValueError:
+        queue_llm_index_update_if_needed(
+            rebuild=True,
+            reason="LLM index failed to load for similarity query.",
+        )
+        return []
 
     # constrain only the node(s) that match the document IDs, if given
     doc_node_ids = (
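With this change query_similar_documents() degrades gracefully instead of raising: a missing vector store or a load failure queues a background index update and yields an empty result. From a caller's point of view that looks roughly like the sketch below (a hedged illustration; the import path is assumed from this diff and the full signature of query_similar_documents() is not shown here):

from paperless_ai.indexing import query_similar_documents  # assumed path


def related_documents(document):
    # Returns similar documents, or [] while the LLM index is missing,
    # broken, or still being (re)built in the background.
    similar = query_similar_documents(document)
    if not similar:
        # Either nothing matched, or the index was unavailable and an
        # update task has just been queued; either way there is nothing
        # to show right now.
        return []
    return similar

The remaining hunk below belongs to the corresponding test module (its file path is not visible in this view): the existing happy-path test now patches vector_store_file_exists so the query stays on the guarded code path.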
@@ -299,11 +299,15 @@ def test_query_similar_documents(
     with (
         patch("paperless_ai.indexing.get_or_create_storage_context") as mock_storage,
         patch("paperless_ai.indexing.load_or_build_index") as mock_load_or_build_index,
+        patch(
+            "paperless_ai.indexing.vector_store_file_exists",
+        ) as mock_vector_store_exists,
         patch("paperless_ai.indexing.VectorIndexRetriever") as mock_retriever_cls,
         patch("paperless_ai.indexing.Document.objects.filter") as mock_filter,
     ):
         mock_storage.return_value = MagicMock()
         mock_storage.return_value.persist_dir = temp_llm_index_dir
+        mock_vector_store_exists.return_value = True
 
         mock_index = MagicMock()
         mock_load_or_build_index.return_value = mock_index
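Setting mock_vector_store_exists.return_value = True keeps this existing test on the new guarded branch. A complementary sketch for the fail-soft branch (hypothetical test; it assumes the same module path as above and patches the queuing helper so no Celery task or database is touched):

from unittest.mock import patch


def test_query_similar_documents_missing_index_returns_empty(real_document):
    # "real_document" stands in for whatever document fixture the real
    # test module provides; it is only passed through to the query.
    from paperless_ai import indexing

    with (
        patch("paperless_ai.indexing.vector_store_file_exists") as mock_exists,
        patch(
            "paperless_ai.indexing.queue_llm_index_update_if_needed",
        ) as mock_queue,
    ):
        mock_exists.return_value = False

        result = indexing.query_similar_documents(real_document)

        assert result == []
        mock_queue.assert_called_once_with(
            rebuild=False,
            reason="LLM index not found for similarity query.",
        )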