diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py
index 03c8aa9be..40493c0f0 100644
--- a/src/paperless_ai/indexing.py
+++ b/src/paperless_ai/indexing.py
@@ -1,11 +1,14 @@
 import logging
 import shutil
+from datetime import timedelta
 from pathlib import Path
 
 import faiss
 import llama_index.core.settings as llama_settings
 import tqdm
+from celery import states
 from django.conf import settings
+from django.utils import timezone
 from llama_index.core import Document as LlamaDocument
 from llama_index.core import StorageContext
 from llama_index.core import VectorStoreIndex
@@ -21,6 +24,8 @@ from llama_index.core.text_splitter import TokenTextSplitter
 from llama_index.vector_stores.faiss import FaissVectorStore
 
 from documents.models import Document
+from documents.models import PaperlessTask
+from documents.tasks import llmindex_index
 from paperless_ai.embedding import build_llm_index_text
 from paperless_ai.embedding import get_embedding_dim
 from paperless_ai.embedding import get_embedding_model
@@ -28,6 +33,27 @@ from paperless_ai.embedding import get_embedding_model
 logger = logging.getLogger("paperless_ai.indexing")
 
 
+def queue_llm_index_update_if_needed(*, rebuild: bool, reason: str) -> bool:
+    has_running = PaperlessTask.objects.filter(
+        task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
+        status__in=[states.PENDING, states.STARTED],
+    ).exists()
+    has_recent = PaperlessTask.objects.filter(
+        task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
+        date_created__gte=(timezone.now() - timedelta(minutes=5)),
+    ).exists()
+    if has_running or has_recent:
+        return False
+
+    llmindex_index.delay(rebuild=rebuild, scheduled=False, auto=True)
+    logger.warning(
+        "Queued LLM index update%s: %s",
+        " (rebuild)" if rebuild else "",
+        reason,
+    )
+    return True
+
+
 def get_or_create_storage_context(*, rebuild=False):
     """
     Loads or creates the StorageContext (vector store, docstore, index store).
@@ -93,6 +119,10 @@ def load_or_build_index(nodes=None):
     except ValueError as e:
         logger.warning("Failed to load index from storage: %s", e)
         if not nodes:
+            queue_llm_index_update_if_needed(
+                rebuild=vector_store_file_exists(),
+                reason="LLM index missing or invalid while loading.",
+            )
             logger.info("No nodes provided for index creation.")
             raise
         return VectorStoreIndex(
@@ -250,7 +280,21 @@ def query_similar_documents(
     """
     Runs a similarity query and returns top-k similar Document objects.
     """
-    index = load_or_build_index()
+    if not vector_store_file_exists():
+        queue_llm_index_update_if_needed(
+            rebuild=False,
+            reason="LLM index not found for similarity query.",
+        )
+        return []
+
+    try:
+        index = load_or_build_index()
+    except ValueError:
+        queue_llm_index_update_if_needed(
+            rebuild=True,
+            reason="LLM index failed to load for similarity query.",
+        )
+        return []
 
     # constrain only the node(s) that match the document IDs, if given
     doc_node_ids = (
diff --git a/src/paperless_ai/tests/test_ai_indexing.py b/src/paperless_ai/tests/test_ai_indexing.py
index bd217fb89..692b1697b 100644
--- a/src/paperless_ai/tests/test_ai_indexing.py
+++ b/src/paperless_ai/tests/test_ai_indexing.py
@@ -299,11 +299,15 @@ def test_query_similar_documents(
     with (
         patch("paperless_ai.indexing.get_or_create_storage_context") as mock_storage,
         patch("paperless_ai.indexing.load_or_build_index") as mock_load_or_build_index,
+        patch(
+            "paperless_ai.indexing.vector_store_file_exists",
+        ) as mock_vector_store_exists,
         patch("paperless_ai.indexing.VectorIndexRetriever") as mock_retriever_cls,
         patch("paperless_ai.indexing.Document.objects.filter") as mock_filter,
     ):
         mock_storage.return_value = MagicMock()
         mock_storage.return_value.persist_dir = temp_llm_index_dir
+        mock_vector_store_exists.return_value = True
 
         mock_index = MagicMock()
         mock_load_or_build_index.return_value = mock_index