From e3d1ca77dcb0686ae4b599676ee18f7763417576 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Mon, 28 Apr 2025 08:37:02 -0700 Subject: [PATCH] Some cleanup, typing --- src/paperless/ai/indexing.py | 22 ++++++++++------------ src/paperless/config.py | 4 +--- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/paperless/ai/indexing.py b/src/paperless/ai/indexing.py index 4742ca0ab..2ec4f4925 100644 --- a/src/paperless/ai/indexing.py +++ b/src/paperless/ai/indexing.py @@ -61,7 +61,7 @@ def get_vector_store_index(storage_context, embed_model): ) -def build_document_node(document) -> list[BaseNode]: +def build_document_node(document: Document) -> list[BaseNode]: """ Given a Document, returns parsed Nodes ready for indexing. """ @@ -109,7 +109,7 @@ def load_or_build_index(storage_context, embed_model, nodes=None): raise -def remove_existing_document_nodes(document, index): +def remove_document_docstore_nodes(document: Document, index: VectorStoreIndex): """ Removes existing documents from docstore for a given document from the index. This is necessary because FAISS IndexFlatL2 is append-only. @@ -153,7 +153,7 @@ def rebuild_llm_index(*, progress_bar_disable=False, rebuild=False): storage_context.persist(persist_dir=settings.LLM_INDEX_DIR) -def llm_index_add_or_update_document(document): +def llm_index_add_or_update_document(document: Document): """ Adds or updates a document in the LLM index. If the document already exists, it will be replaced. @@ -168,18 +168,19 @@ def llm_index_add_or_update_document(document): index = load_or_build_index(storage_context, embed_model, nodes=new_nodes) if index is None: - # Nothing to index return - # Remove old nodes - remove_existing_document_nodes(document, index) + remove_document_docstore_nodes(document, index) index.insert_nodes(new_nodes) storage_context.persist(persist_dir=settings.LLM_INDEX_DIR) -def llm_index_remove_document(document): +def llm_index_remove_document(document: Document): + """ + Removes a document from the LLM index. + """ embed_model = get_embedding_model() llama_settings.embed_model = embed_model @@ -187,10 +188,9 @@ def llm_index_remove_document(document): index = load_or_build_index(storage_context, embed_model) if index is None: - return # Nothing to remove + return - # Remove old nodes - remove_existing_document_nodes(document, index) + remove_document_docstore_nodes(document, index) storage_context.persist(persist_dir=settings.LLM_INDEX_DIR) @@ -202,11 +202,9 @@ def query_similar_documents(document: Document, top_k: int = 5) -> list[Document index = load_or_build_index() retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k) - # Build query from the document text query_text = (document.title or "") + "\n" + (document.content or "") results = retriever.retrieve(query_text) - # Each result.node.metadata["document_id"] should match our stored doc document_ids = [ int(node.metadata["document_id"]) for node in results diff --git a/src/paperless/config.py b/src/paperless/config.py index fc4fe23cf..ca61e00c7 100644 --- a/src/paperless/config.py +++ b/src/paperless/config.py @@ -202,7 +202,5 @@ class AIConfig(BaseConfig): def llm_index_enabled(self) -> bool: return ( - self.ai_enabled - and self.llm_embedding_backend - and self.llm_embedding_backend + self.ai_enabled and self.llm_embedding_backend and self.llm_embedding_model )