From cd4540412a64eda9e9b0897cd71e9142f7b27bbd Mon Sep 17 00:00:00 2001
From: shamoon <4887959+shamoon@users.noreply.github.com>
Date: Mon, 28 Apr 2025 14:39:31 -0700
Subject: [PATCH] indexing cleanup and tests

---
 src/paperless/ai/chat.py                |  2 +-
 src/paperless/ai/indexing.py            | 26 ++---------
 src/paperless/tests/test_ai_indexing.py | 58 ++++++++++++++++++++++---
 3 files changed, 57 insertions(+), 29 deletions(-)

diff --git a/src/paperless/ai/chat.py b/src/paperless/ai/chat.py
index 45d44db8c..84f0db060 100644
--- a/src/paperless/ai/chat.py
+++ b/src/paperless/ai/chat.py
@@ -26,7 +26,7 @@ def stream_chat_with_documents(query_str: str, documents: list[Document]):
     client = AIClient()
     index = load_or_build_index()
 
-    doc_ids = [doc.pk for doc in documents]
+    doc_ids = [str(doc.pk) for doc in documents]
 
     # Filter only the node(s) that match the document IDs
     nodes = [
diff --git a/src/paperless/ai/indexing.py b/src/paperless/ai/indexing.py
index 11b8179ee..95442e55b 100644
--- a/src/paperless/ai/indexing.py
+++ b/src/paperless/ai/indexing.py
@@ -52,23 +52,10 @@ def get_or_create_storage_context(*, rebuild=False):
     )
 
 
-def get_vector_store_index(storage_context, embed_model):
-    """
-    Returns a VectorStoreIndex given a storage context and embed model.
-    """
-    return VectorStoreIndex(
-        storage_context=storage_context,
-        embed_model=embed_model,
-    )
-
-
 def build_document_node(document: Document) -> list[BaseNode]:
     """
     Given a Document, returns parsed Nodes ready for indexing.
     """
-    if not document.content:
-        return []
-
     text = build_llm_index_text(document)
     metadata = {
         "document_id": str(document.id),
@@ -97,9 +84,10 @@ def load_or_build_index(storage_context: StorageContext, embed_model, nodes=None
     try:
         return load_index_from_storage(storage_context=storage_context)
     except ValueError as e:
-        logger.debug("Failed to load index from storage: %s", e)
+        logger.warning("Failed to load index from storage: %s", e)
         if not nodes:
-            return None
+            logger.info("No nodes provided for index creation.")
+            raise
         return VectorStoreIndex(
             nodes=nodes,
             storage_context=storage_context,
@@ -116,7 +104,7 @@ def remove_document_docstore_nodes(document: Document, index: VectorStoreIndex):
     existing_nodes = [
         node.node_id
         for node in index.docstore.get_nodes(all_node_ids)
-        if node.metadata.get("document_id") == document.id
+        if node.metadata.get("document_id") == str(document.id)
     ]
     for node_id in existing_nodes:
         # Delete from docstore, FAISS IndexFlatL2 are append-only
@@ -208,9 +196,6 @@ def llm_index_add_or_update_document(document: Document):
 
     index = load_or_build_index(storage_context, embed_model, nodes=new_nodes)
 
-    if index is None:
-        return
-
     remove_document_docstore_nodes(document, index)
 
     index.insert_nodes(new_nodes)
@@ -229,9 +214,6 @@ def llm_index_remove_document(document: Document):
 
     index = load_or_build_index(storage_context, embed_model)
 
-    if index is None:
-        return
-
     remove_document_docstore_nodes(document, index)
 
     storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
diff --git a/src/paperless/tests/test_ai_indexing.py b/src/paperless/tests/test_ai_indexing.py
index 970f8293d..73df742b1 100644
--- a/src/paperless/tests/test_ai_indexing.py
+++ b/src/paperless/tests/test_ai_indexing.py
@@ -28,7 +28,6 @@ def real_document(db):
 
 @pytest.fixture
 def mock_embed_model():
-    """Mocks the embedding model."""
     with patch("paperless.ai.indexing.get_embedding_model") as mock:
         mock.return_value = FakeEmbedding()
         yield mock
@@ -57,7 +56,7 @@ def test_build_document_node(real_document):
 
 
 @pytest.mark.django_db
-def test_rebuild_llm_index(
+def test_update_llm_index(
     temp_llm_index_dir,
     real_document,
     mock_embed_model,
@@ -72,6 +71,49 @@ def test_rebuild_llm_index(
     assert any(temp_llm_index_dir.glob("*.json"))
 
 
+def test_get_or_create_storage_context_raises_exception(
+    temp_llm_index_dir,
+    mock_embed_model,
+):
+    with pytest.raises(Exception):
+        indexing.get_or_create_storage_context(rebuild=False)
+
+
+def test_load_or_build_index_builds_when_nodes_given(
+    temp_llm_index_dir,
+    mock_embed_model,
+    real_document,
+):
+    storage_context = MagicMock()
+    with patch(
+        "paperless.ai.indexing.load_index_from_storage",
+        side_effect=ValueError("Index not found"),
+    ):
+        with patch(
+            "paperless.ai.indexing.VectorStoreIndex",
+            return_value=MagicMock(),
+        ) as mock_index_cls:
+            indexing.load_or_build_index(
+                storage_context,
+                mock_embed_model,
+                nodes=[indexing.build_document_node(real_document)],
+            )
+            mock_index_cls.assert_called_once()
+
+
+def test_load_or_build_index_raises_exception_when_no_nodes(
+    temp_llm_index_dir,
+    mock_embed_model,
+):
+    storage_context = MagicMock()
+    with patch(
+        "paperless.ai.indexing.load_index_from_storage",
+        side_effect=ValueError("Index not found"),
+    ):
+        with pytest.raises(Exception):
+            indexing.load_or_build_index(storage_context, mock_embed_model)
+
+
 @pytest.mark.django_db
 def test_add_or_update_document_updates_existing_entry(
     temp_llm_index_dir,
@@ -91,14 +133,18 @@ def test_remove_document_deletes_node_from_docstore(
     mock_embed_model,
 ):
     indexing.update_llm_index(rebuild=True)
-    indexing.llm_index_add_or_update_document(real_document)
-    indexing.llm_index_remove_document(real_document)
+    storage_context = indexing.get_or_create_storage_context()
+    index = indexing.load_or_build_index(storage_context, mock_embed_model)
+    assert len(index.docstore.docs) == 1
 
-    assert any(temp_llm_index_dir.glob("*.json"))
+    indexing.llm_index_remove_document(real_document)
+    storage_context = indexing.get_or_create_storage_context()
+    index = indexing.load_or_build_index(storage_context, mock_embed_model)
+    assert len(index.docstore.docs) == 0
 
 
 @pytest.mark.django_db
-def test_rebuild_llm_index_no_documents(
+def test_update_llm_index_no_documents(
     temp_llm_index_dir,
     mock_embed_model,
 ):