From d294508982cbd17444f6214b6fea36075489cea4 Mon Sep 17 00:00:00 2001
From: shamoon <4887959+shamoon@users.noreply.github.com>
Date: Tue, 27 Jan 2026 13:48:17 -0800
Subject: [PATCH] Fixhancement: auto-queue llm index if needed (#11891)

---
 src/paperless_ai/indexing.py               | 38 +++++++++++++
 src/paperless_ai/tests/test_ai_indexing.py | 64 ++++++++++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py
index 03c8aa9be..654c56f3b 100644
--- a/src/paperless_ai/indexing.py
+++ b/src/paperless_ai/indexing.py
@@ -1,11 +1,14 @@
 import logging
 import shutil
+from datetime import timedelta
 from pathlib import Path
 
 import faiss
 import llama_index.core.settings as llama_settings
 import tqdm
+from celery import states
 from django.conf import settings
+from django.utils import timezone
 from llama_index.core import Document as LlamaDocument
 from llama_index.core import StorageContext
 from llama_index.core import VectorStoreIndex
@@ -21,6 +24,7 @@ from llama_index.core.text_splitter import TokenTextSplitter
 from llama_index.vector_stores.faiss import FaissVectorStore
 
 from documents.models import Document
+from documents.models import PaperlessTask
 from paperless_ai.embedding import build_llm_index_text
 from paperless_ai.embedding import get_embedding_dim
 from paperless_ai.embedding import get_embedding_model
@@ -28,6 +32,29 @@ from paperless_ai.embedding import get_embedding_model
 logger = logging.getLogger("paperless_ai.indexing")
 
 
+def queue_llm_index_update_if_needed(*, rebuild: bool, reason: str) -> bool:
+    from documents.tasks import llmindex_index
+
+    has_running = PaperlessTask.objects.filter(
+        task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
+        status__in=[states.PENDING, states.STARTED],
+    ).exists()
+    has_recent = PaperlessTask.objects.filter(
+        task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
+        date_created__gte=(timezone.now() - timedelta(minutes=5)),
+    ).exists()
+    if has_running or has_recent:
+        return False
+
+    llmindex_index.delay(rebuild=rebuild, scheduled=False, auto=True)
+    logger.warning(
+        "Queued LLM index update%s: %s",
+        " (rebuild)" if rebuild else "",
+        reason,
+    )
+    return True
+
+
 def get_or_create_storage_context(*, rebuild=False):
     """
     Loads or creates the StorageContext (vector store, docstore, index store).
@@ -93,6 +120,10 @@ def load_or_build_index(nodes=None):
     except ValueError as e:
         logger.warning("Failed to load index from storage: %s", e)
         if not nodes:
+            queue_llm_index_update_if_needed(
+                rebuild=vector_store_file_exists(),
+                reason="LLM index missing or invalid while loading.",
+            )
             logger.info("No nodes provided for index creation.")
             raise
         return VectorStoreIndex(
@@ -250,6 +281,13 @@ def query_similar_documents(
     """
     Runs a similarity query and returns top-k similar Document objects.
     """
+    if not vector_store_file_exists():
+        queue_llm_index_update_if_needed(
+            rebuild=False,
+            reason="LLM index not found for similarity query.",
+        )
+        return []
+
     index = load_or_build_index()
 
     # constrain only the node(s) that match the document IDs, if given
diff --git a/src/paperless_ai/tests/test_ai_indexing.py b/src/paperless_ai/tests/test_ai_indexing.py
index bd217fb89..7505d49b0 100644
--- a/src/paperless_ai/tests/test_ai_indexing.py
+++ b/src/paperless_ai/tests/test_ai_indexing.py
@@ -3,11 +3,13 @@ from unittest.mock import MagicMock
 from unittest.mock import patch
 
 import pytest
+from celery import states
 from django.test import override_settings
 from django.utils import timezone
 from llama_index.core.base.embeddings.base import BaseEmbedding
 
 from documents.models import Document
+from documents.models import PaperlessTask
 from paperless_ai import indexing
 
 
@@ -288,6 +290,36 @@ def test_update_llm_index_no_documents(
     )
 
 
+@pytest.mark.django_db
+def test_queue_llm_index_update_if_needed_enqueues_when_idle_or_skips_recent():
+    # No existing tasks
+    with patch("documents.tasks.llmindex_index") as mock_task:
+        result = indexing.queue_llm_index_update_if_needed(
+            rebuild=True,
+            reason="test enqueue",
+        )
+
+    assert result is True
+    mock_task.delay.assert_called_once_with(rebuild=True, scheduled=False, auto=True)
+
+    PaperlessTask.objects.create(
+        task_id="task-1",
+        task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
+        status=states.STARTED,
+        date_created=timezone.now(),
+    )
+
+    # Existing running task
+    with patch("documents.tasks.llmindex_index") as mock_task:
+        result = indexing.queue_llm_index_update_if_needed(
+            rebuild=False,
+            reason="should skip",
+        )
+
+    assert result is False
+    mock_task.delay.assert_not_called()
+
+
 @override_settings(
     LLM_EMBEDDING_BACKEND="huggingface",
     LLM_BACKEND="ollama",
 )
@@ -299,11 +331,15 @@ def test_query_similar_documents(
     temp_llm_index_dir,
     real_document,
 ):
     with (
         patch("paperless_ai.indexing.get_or_create_storage_context") as mock_storage,
         patch("paperless_ai.indexing.load_or_build_index") as mock_load_or_build_index,
+        patch(
+            "paperless_ai.indexing.vector_store_file_exists",
+        ) as mock_vector_store_exists,
         patch("paperless_ai.indexing.VectorIndexRetriever") as mock_retriever_cls,
         patch("paperless_ai.indexing.Document.objects.filter") as mock_filter,
     ):
         mock_storage.return_value = MagicMock()
         mock_storage.return_value.persist_dir = temp_llm_index_dir
+        mock_vector_store_exists.return_value = True
 
         mock_index = MagicMock()
         mock_load_or_build_index.return_value = mock_index
@@ -332,3 +368,31 @@ def test_query_similar_documents(
         mock_filter.assert_called_once_with(pk__in=[1, 2])
 
     assert result == mock_filtered_docs
+
+
+@pytest.mark.django_db
+def test_query_similar_documents_triggers_update_when_index_missing(
+    temp_llm_index_dir,
+    real_document,
+):
+    with (
+        patch(
+            "paperless_ai.indexing.vector_store_file_exists",
+            return_value=False,
+        ),
+        patch(
+            "paperless_ai.indexing.queue_llm_index_update_if_needed",
+        ) as mock_queue,
+        patch("paperless_ai.indexing.load_or_build_index") as mock_load,
+    ):
+        result = indexing.query_similar_documents(
+            real_document,
+            top_k=2,
+        )
+
+    mock_queue.assert_called_once_with(
+        rebuild=False,
+        reason="LLM index not found for similarity query.",
+    )
+    mock_load.assert_not_called()
+    assert result == []