From 07a7d7b8154240f61f1af4d3961f896173c0c808 Mon Sep 17 00:00:00 2001
From: shamoon <4887959+shamoon@users.noreply.github.com>
Date: Mon, 28 Apr 2025 19:03:53 -0700
Subject: [PATCH] Cover partial indexing

---
 src/paperless/ai/indexing.py            | 15 +++-----
 src/paperless/tests/test_ai_indexing.py | 51 +++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 10 deletions(-)

diff --git a/src/paperless/ai/indexing.py b/src/paperless/ai/indexing.py
index bc275c83f..840d58f37 100644
--- a/src/paperless/ai/indexing.py
+++ b/src/paperless/ai/indexing.py
@@ -147,8 +147,6 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
             for node in index.docstore.get_nodes(all_node_ids)
         }
 
-        node_ids_to_remove = []
-
         for document in tqdm.tqdm(documents, disable=progress_bar_disable):
             doc_id = str(document.id)
             document_modified = document.modified.isoformat()
@@ -160,22 +158,19 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
                 if node_modified == document_modified:
                     continue
 
-                node_ids_to_remove.append(node.node_id)
+                # Again, delete from docstore; FAISS IndexFlatL2 is append-only
+                index.docstore.delete_document(node.node_id)
                 nodes.extend(build_document_node(document))
             else:
                 # New document, add it
                 nodes.extend(build_document_node(document))
 
-        if node_ids_to_remove or nodes:
+        if nodes:
             logger.info(
-                "Updating LLM index with %d new nodes and removing %d old nodes.",
+                "Updating %d nodes in LLM index.",
                 len(nodes),
-                len(node_ids_to_remove),
             )
-            if node_ids_to_remove:
-                index.delete_nodes(node_ids_to_remove)
-            if nodes:
-                index.insert_nodes(nodes)
+            index.insert_nodes(nodes)
         else:
             logger.info("No changes detected, skipping llm index rebuild.")
 
diff --git a/src/paperless/tests/test_ai_indexing.py b/src/paperless/tests/test_ai_indexing.py
index c0279171a..d7b83316d 100644
--- a/src/paperless/tests/test_ai_indexing.py
+++ b/src/paperless/tests/test_ai_indexing.py
@@ -72,6 +72,57 @@ def test_update_llm_index(
     assert any(temp_llm_index_dir.glob("*.json"))
 
 
+@pytest.mark.django_db
+def test_update_llm_index_partial_update(
+    temp_llm_index_dir,
+    real_document,
+    mock_embed_model,
+):
+    doc2 = Document.objects.create(
+        title="Test Document 2",
+        content="This is some test content 2.",
+        added=timezone.now(),
+        checksum="1234567890abcdef",
+    )
+    # Initial index
+    with patch("documents.models.Document.objects.all") as mock_all:
+        mock_queryset = MagicMock()
+        mock_queryset.exists.return_value = True
+        mock_queryset.__iter__.return_value = iter([real_document, doc2])
+        mock_all.return_value = mock_queryset
+
+        indexing.update_llm_index(rebuild=True)
+
+    # modify document
+    updated_document = real_document
+    updated_document.modified = timezone.now()  # simulate modification
+
+    # new doc
+    doc3 = Document.objects.create(
+        title="Test Document 3",
+        content="This is some test content 3.",
+        added=timezone.now(),
+        checksum="abcdef1234567890",
+    )
+
+    with patch("documents.models.Document.objects.all") as mock_all:
+        mock_queryset = MagicMock()
+        mock_queryset.exists.return_value = True
+        mock_queryset.__iter__.return_value = iter([updated_document, doc2, doc3])
+        mock_all.return_value = mock_queryset
+
+        # assert it logs "Updating %d nodes in LLM index."
+        with patch("paperless.ai.indexing.logger") as mock_logger:
+            indexing.update_llm_index(rebuild=False)
+            mock_logger.info.assert_called_once_with(
+                "Updating %d nodes in LLM index.",
+                2,
+            )
+        indexing.update_llm_index(rebuild=False)
+
+    assert any(temp_llm_index_dir.glob("*.json"))
+
+
 def test_get_or_create_storage_context_raises_exception(
     temp_llm_index_dir,
     mock_embed_model,
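
Reviewer note: below is a self-contained sketch, in plain Python, of the partial-update strategy this patch settles on. It is not the paperless or llama-index API; FakeNode, partial_update and the dict-based store are illustrative stand-ins. Only the control flow mirrors the diff: unchanged documents are skipped, stale nodes are dropped from the docstore only (the vector side is treated as append-only), and the rebuilt nodes are batched into a single insert.

from dataclasses import dataclass


@dataclass
class FakeNode:
    node_id: str
    document_id: str
    modified: str  # ISO timestamp captured when the node was built


def partial_update(docstore: dict[str, FakeNode], documents: list[dict]) -> list[FakeNode]:
    # Mirrors the patched update_llm_index(): stale entries are removed from
    # the docstore only, and fresh nodes are returned for one batched insert.
    existing = {node.document_id: node for node in docstore.values()}
    to_insert: list[FakeNode] = []

    for doc in documents:
        doc_id, modified = str(doc["id"]), doc["modified"]
        node = existing.get(doc_id)
        if node is not None:
            if node.modified == modified:
                continue  # unchanged document, nothing to do
            del docstore[node.node_id]  # stale node: drop it from the docstore
        # modified or brand-new document: build a replacement node
        to_insert.append(FakeNode(f"node-{doc_id}-{modified}", doc_id, modified))

    return to_insert


# One unchanged, one modified and one new document, as in the new test.
store = {
    "node-1-t0": FakeNode("node-1-t0", "1", "t0"),
    "node-2-t0": FakeNode("node-2-t0", "2", "t0"),
}
docs = [
    {"id": 1, "modified": "t0"},  # unchanged since the initial index
    {"id": 2, "modified": "t1"},  # modified afterwards
    {"id": 3, "modified": "t0"},  # newly added
]
assert len(partial_update(store, docs)) == 2  # matches the `2` the new test asserts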