mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-01-22 22:34:20 -06:00
Compare commits
2 Commits
feature-te
...
feature-au
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0886627aa8 | ||
|
|
65b47e86c3 |
@@ -1,11 +1,14 @@
|
|||||||
import logging
|
import logging
|
||||||
import shutil
|
import shutil
|
||||||
|
from datetime import timedelta
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import faiss
|
import faiss
|
||||||
import llama_index.core.settings as llama_settings
|
import llama_index.core.settings as llama_settings
|
||||||
import tqdm
|
import tqdm
|
||||||
|
from celery import states
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
from django.utils import timezone
|
||||||
from llama_index.core import Document as LlamaDocument
|
from llama_index.core import Document as LlamaDocument
|
||||||
from llama_index.core import StorageContext
|
from llama_index.core import StorageContext
|
||||||
from llama_index.core import VectorStoreIndex
|
from llama_index.core import VectorStoreIndex
|
||||||
@@ -21,6 +24,7 @@ from llama_index.core.text_splitter import TokenTextSplitter
|
|||||||
from llama_index.vector_stores.faiss import FaissVectorStore
|
from llama_index.vector_stores.faiss import FaissVectorStore
|
||||||
|
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
|
from documents.models import PaperlessTask
|
||||||
from paperless_ai.embedding import build_llm_index_text
|
from paperless_ai.embedding import build_llm_index_text
|
||||||
from paperless_ai.embedding import get_embedding_dim
|
from paperless_ai.embedding import get_embedding_dim
|
||||||
from paperless_ai.embedding import get_embedding_model
|
from paperless_ai.embedding import get_embedding_model
|
||||||
@@ -28,6 +32,29 @@ from paperless_ai.embedding import get_embedding_model
|
|||||||
logger = logging.getLogger("paperless_ai.indexing")
|
logger = logging.getLogger("paperless_ai.indexing")
|
||||||
|
|
||||||
|
|
||||||
|
def queue_llm_index_update_if_needed(*, rebuild: bool, reason: str) -> bool:
    """
    Queues an LLM index update task unless one is already pending/started or
    one was created within the last five minutes.

    Args:
        rebuild: Forwarded to the queued task; also adds " (rebuild)" to the
            log message.
        reason: Explanation of why the update was requested; appended to the
            log message.

    Returns:
        True if a task was queued, False if queuing was skipped.
    """
    # Imported at call time rather than at module level
    # NOTE(review): presumably to avoid a circular import with documents.tasks - confirm
    from documents.tasks import llmindex_index

    # Skip when an update is already waiting or running. Returning here avoids
    # the second query below whenever this one already settles the answer.
    if PaperlessTask.objects.filter(
        task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
        status__in=[states.PENDING, states.STARTED],
    ).exists():
        return False

    # Skip when any update task (whatever its status) was created in the last
    # five minutes, so repeated callers do not queue updates back to back.
    recent_cutoff = timezone.now() - timedelta(minutes=5)
    if PaperlessTask.objects.filter(
        task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
        date_created__gte=recent_cutoff,
    ).exists():
        return False

    llmindex_index.delay(rebuild=rebuild, scheduled=False, auto=True)
    logger.warning(
        "Queued LLM index update%s: %s",
        " (rebuild)" if rebuild else "",
        reason,
    )
    return True
|
||||||
|
|
||||||
|
|
||||||
def get_or_create_storage_context(*, rebuild=False):
|
def get_or_create_storage_context(*, rebuild=False):
|
||||||
"""
|
"""
|
||||||
Loads or creates the StorageContext (vector store, docstore, index store).
|
Loads or creates the StorageContext (vector store, docstore, index store).
|
||||||
@@ -93,6 +120,10 @@ def load_or_build_index(nodes=None):
|
|||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
logger.warning("Failed to load index from storage: %s", e)
|
logger.warning("Failed to load index from storage: %s", e)
|
||||||
if not nodes:
|
if not nodes:
|
||||||
|
queue_llm_index_update_if_needed(
|
||||||
|
rebuild=vector_store_file_exists(),
|
||||||
|
reason="LLM index missing or invalid while loading.",
|
||||||
|
)
|
||||||
logger.info("No nodes provided for index creation.")
|
logger.info("No nodes provided for index creation.")
|
||||||
raise
|
raise
|
||||||
return VectorStoreIndex(
|
return VectorStoreIndex(
|
||||||
@@ -250,7 +281,21 @@ def query_similar_documents(
|
|||||||
"""
|
"""
|
||||||
Runs a similarity query and returns top-k similar Document objects.
|
Runs a similarity query and returns top-k similar Document objects.
|
||||||
"""
|
"""
|
||||||
|
if not vector_store_file_exists():
|
||||||
|
queue_llm_index_update_if_needed(
|
||||||
|
rebuild=False,
|
||||||
|
reason="LLM index not found for similarity query.",
|
||||||
|
)
|
||||||
|
return []
|
||||||
|
|
||||||
|
try:
|
||||||
index = load_or_build_index()
|
index = load_or_build_index()
|
||||||
|
except ValueError:
|
||||||
|
queue_llm_index_update_if_needed(
|
||||||
|
rebuild=True,
|
||||||
|
reason="LLM index failed to load for similarity query.",
|
||||||
|
)
|
||||||
|
return []
|
||||||
|
|
||||||
# constrain only the node(s) that match the document IDs, if given
|
# constrain only the node(s) that match the document IDs, if given
|
||||||
doc_node_ids = (
|
doc_node_ids = (
|
||||||
|
|||||||
@@ -299,11 +299,15 @@ def test_query_similar_documents(
|
|||||||
with (
|
with (
|
||||||
patch("paperless_ai.indexing.get_or_create_storage_context") as mock_storage,
|
patch("paperless_ai.indexing.get_or_create_storage_context") as mock_storage,
|
||||||
patch("paperless_ai.indexing.load_or_build_index") as mock_load_or_build_index,
|
patch("paperless_ai.indexing.load_or_build_index") as mock_load_or_build_index,
|
||||||
|
patch(
|
||||||
|
"paperless_ai.indexing.vector_store_file_exists",
|
||||||
|
) as mock_vector_store_exists,
|
||||||
patch("paperless_ai.indexing.VectorIndexRetriever") as mock_retriever_cls,
|
patch("paperless_ai.indexing.VectorIndexRetriever") as mock_retriever_cls,
|
||||||
patch("paperless_ai.indexing.Document.objects.filter") as mock_filter,
|
patch("paperless_ai.indexing.Document.objects.filter") as mock_filter,
|
||||||
):
|
):
|
||||||
mock_storage.return_value = MagicMock()
|
mock_storage.return_value = MagicMock()
|
||||||
mock_storage.return_value.persist_dir = temp_llm_index_dir
|
mock_storage.return_value.persist_dir = temp_llm_index_dir
|
||||||
|
mock_vector_store_exists.return_value = True
|
||||||
|
|
||||||
mock_index = MagicMock()
|
mock_index = MagicMock()
|
||||||
mock_load_or_build_index.return_value = mock_index
|
mock_load_or_build_index.return_value = mock_index
|
||||||
|
|||||||
Reference in New Issue
Block a user