Cover partial indexing

This commit is contained in:
shamoon 2025-04-28 19:03:53 -07:00
parent a1fb3ee7de
commit 07a7d7b815
No known key found for this signature in database
2 changed files with 56 additions and 10 deletions

View File

@ -147,8 +147,6 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
for node in index.docstore.get_nodes(all_node_ids)
}
node_ids_to_remove = []
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
doc_id = str(document.id)
document_modified = document.modified.isoformat()
@ -160,22 +158,19 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
if node_modified == document_modified:
continue
node_ids_to_remove.append(node.node_id)
# Again, delete from docstore, FAISS IndexFlatL2 are append-only
index.docstore.delete_document(node.node_id)
nodes.extend(build_document_node(document))
else:
# New document, add it
nodes.extend(build_document_node(document))
if node_ids_to_remove or nodes:
if nodes:
logger.info(
"Updating LLM index with %d new nodes and removing %d old nodes.",
"Updating %d nodes in LLM index.",
len(nodes),
len(node_ids_to_remove),
)
if node_ids_to_remove:
index.delete_nodes(node_ids_to_remove)
if nodes:
index.insert_nodes(nodes)
index.insert_nodes(nodes)
else:
logger.info("No changes detected, skipping llm index rebuild.")

View File

@ -72,6 +72,57 @@ def test_update_llm_index(
assert any(temp_llm_index_dir.glob("*.json"))
# Exercises update_llm_index(rebuild=False) after an initial full build:
# one already-indexed document gets a new `modified` timestamp and one new
# document is added, and the update is expected to log that 2 nodes are updated.
@pytest.mark.django_db
def test_update_llm_index_partial_update(
temp_llm_index_dir,
real_document,
mock_embed_model,
):
# Second document, indexed together with the real_document fixture below.
doc2 = Document.objects.create(
title="Test Document 2",
content="This is some test content 2.",
added=timezone.now(),
checksum="1234567890abcdef",
)
# Initial index: Document.objects.all is patched to return a mock queryset
# that reports exists() == True and iterates over the two documents.
with patch("documents.models.Document.objects.all") as mock_all:
mock_queryset = MagicMock()
mock_queryset.exists.return_value = True
mock_queryset.__iter__.return_value = iter([real_document, doc2])
mock_all.return_value = mock_queryset
indexing.update_llm_index(rebuild=True)
# Modify an already-indexed document. `updated_document` is the same object
# as `real_document` (an alias, not a copy); only `modified` is changed.
updated_document = real_document
updated_document.modified = timezone.now() # simulate modification
# New document that was not part of the initial index.
doc3 = Document.objects.create(
title="Test Document 3",
content="This is some test content 3.",
added=timezone.now(),
checksum="abcdef1234567890",
)
# Second pass: the mock queryset now yields the modified document, the
# untouched doc2 and the new doc3.
with patch("documents.models.Document.objects.all") as mock_all:
mock_queryset = MagicMock()
mock_queryset.exists.return_value = True
mock_queryset.__iter__.return_value = iter([updated_document, doc2, doc3])
mock_all.return_value = mock_queryset
# Assert that exactly one info message is logged:
# "Updating %d nodes in LLM index." with a node count of 2
# (presumably one node each for the modified and the new document — confirm
# against build_document_node).
with patch("paperless.ai.indexing.logger") as mock_logger:
indexing.update_llm_index(rebuild=False)
mock_logger.info.assert_called_once_with(
"Updating %d nodes in LLM index.",
2,
)
# NOTE(review): __iter__.return_value above is a one-shot iterator, so if this
# repeat call still runs against the same mock queryset it may see no
# documents — confirm that this is the intended scenario.
indexing.update_llm_index(rebuild=False)
# The temporary index directory must contain at least one persisted JSON file.
assert any(temp_llm_index_dir.glob("*.json"))
def test_get_or_create_storage_context_raises_exception(
temp_llm_index_dir,
mock_embed_model,