mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-01-28 22:59:03 -06:00
Fixhancement: auto-queue llm index if needed (#11891)
This commit is contained in:
@@ -1,11 +1,14 @@
|
|||||||
import logging
|
import logging
|
||||||
import shutil
|
import shutil
|
||||||
|
from datetime import timedelta
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import faiss
|
import faiss
|
||||||
import llama_index.core.settings as llama_settings
|
import llama_index.core.settings as llama_settings
|
||||||
import tqdm
|
import tqdm
|
||||||
|
from celery import states
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
from django.utils import timezone
|
||||||
from llama_index.core import Document as LlamaDocument
|
from llama_index.core import Document as LlamaDocument
|
||||||
from llama_index.core import StorageContext
|
from llama_index.core import StorageContext
|
||||||
from llama_index.core import VectorStoreIndex
|
from llama_index.core import VectorStoreIndex
|
||||||
@@ -21,6 +24,7 @@ from llama_index.core.text_splitter import TokenTextSplitter
|
|||||||
from llama_index.vector_stores.faiss import FaissVectorStore
|
from llama_index.vector_stores.faiss import FaissVectorStore
|
||||||
|
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
|
from documents.models import PaperlessTask
|
||||||
from paperless_ai.embedding import build_llm_index_text
|
from paperless_ai.embedding import build_llm_index_text
|
||||||
from paperless_ai.embedding import get_embedding_dim
|
from paperless_ai.embedding import get_embedding_dim
|
||||||
from paperless_ai.embedding import get_embedding_model
|
from paperless_ai.embedding import get_embedding_model
|
||||||
@@ -28,6 +32,29 @@ from paperless_ai.embedding import get_embedding_model
|
|||||||
logger = logging.getLogger("paperless_ai.indexing")
|
logger = logging.getLogger("paperless_ai.indexing")
|
||||||
|
|
||||||
|
|
||||||
|
def queue_llm_index_update_if_needed(*, rebuild: bool, reason: str) -> bool:
|
||||||
|
from documents.tasks import llmindex_index
|
||||||
|
|
||||||
|
has_running = PaperlessTask.objects.filter(
|
||||||
|
task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
|
||||||
|
status__in=[states.PENDING, states.STARTED],
|
||||||
|
).exists()
|
||||||
|
has_recent = PaperlessTask.objects.filter(
|
||||||
|
task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
|
||||||
|
date_created__gte=(timezone.now() - timedelta(minutes=5)),
|
||||||
|
).exists()
|
||||||
|
if has_running or has_recent:
|
||||||
|
return False
|
||||||
|
|
||||||
|
llmindex_index.delay(rebuild=rebuild, scheduled=False, auto=True)
|
||||||
|
logger.warning(
|
||||||
|
"Queued LLM index update%s: %s",
|
||||||
|
" (rebuild)" if rebuild else "",
|
||||||
|
reason,
|
||||||
|
)
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def get_or_create_storage_context(*, rebuild=False):
|
def get_or_create_storage_context(*, rebuild=False):
|
||||||
"""
|
"""
|
||||||
Loads or creates the StorageContext (vector store, docstore, index store).
|
Loads or creates the StorageContext (vector store, docstore, index store).
|
||||||
@@ -93,6 +120,10 @@ def load_or_build_index(nodes=None):
|
|||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
logger.warning("Failed to load index from storage: %s", e)
|
logger.warning("Failed to load index from storage: %s", e)
|
||||||
if not nodes:
|
if not nodes:
|
||||||
|
queue_llm_index_update_if_needed(
|
||||||
|
rebuild=vector_store_file_exists(),
|
||||||
|
reason="LLM index missing or invalid while loading.",
|
||||||
|
)
|
||||||
logger.info("No nodes provided for index creation.")
|
logger.info("No nodes provided for index creation.")
|
||||||
raise
|
raise
|
||||||
return VectorStoreIndex(
|
return VectorStoreIndex(
|
||||||
@@ -250,6 +281,13 @@ def query_similar_documents(
|
|||||||
"""
|
"""
|
||||||
Runs a similarity query and returns top-k similar Document objects.
|
Runs a similarity query and returns top-k similar Document objects.
|
||||||
"""
|
"""
|
||||||
|
if not vector_store_file_exists():
|
||||||
|
queue_llm_index_update_if_needed(
|
||||||
|
rebuild=False,
|
||||||
|
reason="LLM index not found for similarity query.",
|
||||||
|
)
|
||||||
|
return []
|
||||||
|
|
||||||
index = load_or_build_index()
|
index = load_or_build_index()
|
||||||
|
|
||||||
# constrain only the node(s) that match the document IDs, if given
|
# constrain only the node(s) that match the document IDs, if given
|
||||||
|
|||||||
@@ -3,11 +3,13 @@ from unittest.mock import MagicMock
|
|||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from celery import states
|
||||||
from django.test import override_settings
|
from django.test import override_settings
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
from llama_index.core.base.embeddings.base import BaseEmbedding
|
from llama_index.core.base.embeddings.base import BaseEmbedding
|
||||||
|
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
|
from documents.models import PaperlessTask
|
||||||
from paperless_ai import indexing
|
from paperless_ai import indexing
|
||||||
|
|
||||||
|
|
||||||
@@ -288,6 +290,36 @@ def test_update_llm_index_no_documents(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
|
||||||
|
def test_queue_llm_index_update_if_needed_enqueues_when_idle_or_skips_recent():
|
||||||
|
# No existing tasks
|
||||||
|
with patch("documents.tasks.llmindex_index") as mock_task:
|
||||||
|
result = indexing.queue_llm_index_update_if_needed(
|
||||||
|
rebuild=True,
|
||||||
|
reason="test enqueue",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result is True
|
||||||
|
mock_task.delay.assert_called_once_with(rebuild=True, scheduled=False, auto=True)
|
||||||
|
|
||||||
|
PaperlessTask.objects.create(
|
||||||
|
task_id="task-1",
|
||||||
|
task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
|
||||||
|
status=states.STARTED,
|
||||||
|
date_created=timezone.now(),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Existing running task
|
||||||
|
with patch("documents.tasks.llmindex_index") as mock_task:
|
||||||
|
result = indexing.queue_llm_index_update_if_needed(
|
||||||
|
rebuild=False,
|
||||||
|
reason="should skip",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result is False
|
||||||
|
mock_task.delay.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
@override_settings(
|
@override_settings(
|
||||||
LLM_EMBEDDING_BACKEND="huggingface",
|
LLM_EMBEDDING_BACKEND="huggingface",
|
||||||
LLM_BACKEND="ollama",
|
LLM_BACKEND="ollama",
|
||||||
@@ -299,11 +331,15 @@ def test_query_similar_documents(
|
|||||||
with (
|
with (
|
||||||
patch("paperless_ai.indexing.get_or_create_storage_context") as mock_storage,
|
patch("paperless_ai.indexing.get_or_create_storage_context") as mock_storage,
|
||||||
patch("paperless_ai.indexing.load_or_build_index") as mock_load_or_build_index,
|
patch("paperless_ai.indexing.load_or_build_index") as mock_load_or_build_index,
|
||||||
|
patch(
|
||||||
|
"paperless_ai.indexing.vector_store_file_exists",
|
||||||
|
) as mock_vector_store_exists,
|
||||||
patch("paperless_ai.indexing.VectorIndexRetriever") as mock_retriever_cls,
|
patch("paperless_ai.indexing.VectorIndexRetriever") as mock_retriever_cls,
|
||||||
patch("paperless_ai.indexing.Document.objects.filter") as mock_filter,
|
patch("paperless_ai.indexing.Document.objects.filter") as mock_filter,
|
||||||
):
|
):
|
||||||
mock_storage.return_value = MagicMock()
|
mock_storage.return_value = MagicMock()
|
||||||
mock_storage.return_value.persist_dir = temp_llm_index_dir
|
mock_storage.return_value.persist_dir = temp_llm_index_dir
|
||||||
|
mock_vector_store_exists.return_value = True
|
||||||
|
|
||||||
mock_index = MagicMock()
|
mock_index = MagicMock()
|
||||||
mock_load_or_build_index.return_value = mock_index
|
mock_load_or_build_index.return_value = mock_index
|
||||||
@@ -332,3 +368,31 @@ def test_query_similar_documents(
|
|||||||
mock_filter.assert_called_once_with(pk__in=[1, 2])
|
mock_filter.assert_called_once_with(pk__in=[1, 2])
|
||||||
|
|
||||||
assert result == mock_filtered_docs
|
assert result == mock_filtered_docs
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
|
||||||
|
def test_query_similar_documents_triggers_update_when_index_missing(
|
||||||
|
temp_llm_index_dir,
|
||||||
|
real_document,
|
||||||
|
):
|
||||||
|
with (
|
||||||
|
patch(
|
||||||
|
"paperless_ai.indexing.vector_store_file_exists",
|
||||||
|
return_value=False,
|
||||||
|
),
|
||||||
|
patch(
|
||||||
|
"paperless_ai.indexing.queue_llm_index_update_if_needed",
|
||||||
|
) as mock_queue,
|
||||||
|
patch("paperless_ai.indexing.load_or_build_index") as mock_load,
|
||||||
|
):
|
||||||
|
result = indexing.query_similar_documents(
|
||||||
|
real_document,
|
||||||
|
top_k=2,
|
||||||
|
)
|
||||||
|
|
||||||
|
mock_queue.assert_called_once_with(
|
||||||
|
rebuild=False,
|
||||||
|
reason="LLM index not found for similarity query.",
|
||||||
|
)
|
||||||
|
mock_load.assert_not_called()
|
||||||
|
assert result == []
|
||||||
|
|||||||
Reference in New Issue
Block a user