Use PaperlessTask for llmindex

This commit is contained in:
shamoon 2025-04-29 19:40:05 -07:00
parent 51a7581860
commit 374596b1bc
No known key found for this signature in database
9 changed files with 82 additions and 48 deletions

View File

@ -11,6 +11,7 @@ export enum PaperlessTaskName {
TrainClassifier = 'train_classifier',
SanityCheck = 'check_sanity',
IndexOptimize = 'index_optimize',
LLMIndexUpdate = 'llmindex_update',
}
export enum PaperlessTaskStatus {

View File

@ -18,4 +18,5 @@ class Command(ProgressBarMixin, BaseCommand):
llmindex_index(
progress_bar_disable=self.no_progress_bar,
rebuild=options["command"] == "rebuild",
scheduled=False,
)

View File

@ -0,0 +1,30 @@
# Generated by Django 5.1.8 on 2025-04-30 02:38
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
    """Add the 'llmindex_update' choice to PaperlessTask.task_name.

    This only widens the declared choices on an existing CharField, so it is
    a schema-metadata change; no data migration is required.
    """

    dependencies = [
        ("documents", "1065_workflowaction_assign_custom_fields_values"),
    ]

    operations = [
        migrations.AlterField(
            model_name="paperlesstask",
            name="task_name",
            field=models.CharField(
                # Must stay in sync with PaperlessTask.TaskName (and the
                # frontend PaperlessTaskName enum).
                choices=[
                    ("consume_file", "Consume File"),
                    ("train_classifier", "Train Classifier"),
                    ("check_sanity", "Check Sanity"),
                    ("index_optimize", "Index Optimize"),
                    ("llmindex_update", "LLM Index Update"),
                ],
                help_text="Name of the task that was run",
                max_length=255,
                # Nullable: pre-existing task rows have no task_name recorded.
                null=True,
                verbose_name="Task Name",
            ),
        ),
    ]

View File

@ -543,6 +543,7 @@ class PaperlessTask(ModelWithOwner):
TRAIN_CLASSIFIER = ("train_classifier", _("Train Classifier"))
CHECK_SANITY = ("check_sanity", _("Check Sanity"))
INDEX_OPTIMIZE = ("index_optimize", _("Index Optimize"))
LLMINDEX_UPDATE = ("llmindex_update", _("LLM Index Update"))
task_id = models.CharField(
max_length=255,

View File

@ -514,13 +514,29 @@ def check_scheduled_workflows():
@shared_task
def llmindex_index(*, progress_bar_disable=True, rebuild=False, scheduled=True):
    """Update (or rebuild) the LLM vector index, tracked as a PaperlessTask.

    Args:
        progress_bar_disable: Suppress the indexing progress bar (default True,
            since the usual caller is the scheduled Celery beat task).
        rebuild: Rebuild the index from scratch instead of updating it.
        scheduled: Whether this run was triggered by the scheduler (True) or
            manually, e.g. via the management command (False); recorded on the
            PaperlessTask row.

    No-op when the LLM index feature is disabled in the AI configuration.
    """
    ai_config = AIConfig()
    if not ai_config.llm_index_enabled():
        return
    task = PaperlessTask.objects.create(
        type=PaperlessTask.TaskType.SCHEDULED_TASK
        if scheduled
        else PaperlessTask.TaskType.MANUAL_TASK,
        task_id=uuid.uuid4(),
        task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
        status=states.STARTED,
        date_created=timezone.now(),
        date_started=timezone.now(),
    )
    # Imported lazily to avoid importing the AI stack at module load time.
    from paperless_ai.indexing import update_llm_index

    try:
        result = update_llm_index(
            progress_bar_disable=progress_bar_disable,
            rebuild=rebuild,
        )
        task.status = states.SUCCESS
        task.result = result
    except Exception as e:
        # Without this, a failed indexing run would leave the task row
        # stuck in STARTED forever. Record the failure, then re-raise so
        # Celery still sees the task as failed.
        task.status = states.FAILURE
        task.result = str(e)
        raise
    finally:
        task.date_done = timezone.now()
        task.save(update_fields=["status", "result", "date_done"])
@shared_task
@ -531,11 +547,3 @@ def update_document_in_llm_index(document):
@shared_task
def remove_document_from_llm_index(document):
llm_index_remove_document(document)
# TODO: schedule to run periodically
@shared_task
def rebuild_llm_index_task():
from paperless_ai.indexing import update_llm_index
update_llm_index(rebuild=True)

View File

@ -1,4 +1,4 @@
# Generated by Django 5.1.7 on 2025-04-24 02:09
# Generated by Django 5.1.8 on 2025-04-30 02:38
from django.db import migrations
from django.db import models
@ -19,27 +19,6 @@ class Migration(migrations.Migration):
verbose_name="Enables AI features",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_embedding_backend",
field=models.CharField(
blank=True,
choices=[("openai", "OpenAI"), ("local", "Local")],
max_length=32,
null=True,
verbose_name="Sets the LLM Embedding backend",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_embedding_model",
field=models.CharField(
blank=True,
max_length=32,
null=True,
verbose_name="Sets the LLM Embedding model",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_api_key",
@ -61,6 +40,27 @@ class Migration(migrations.Migration):
verbose_name="Sets the LLM backend",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_embedding_backend",
field=models.CharField(
blank=True,
choices=[("openai", "OpenAI"), ("huggingface", "Huggingface")],
max_length=32,
null=True,
verbose_name="Sets the LLM embedding backend",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_embedding_model",
field=models.CharField(
blank=True,
max_length=32,
null=True,
verbose_name="Sets the LLM embedding model",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_model",

View File

@ -236,9 +236,6 @@ def _parse_beat_schedule() -> dict:
"options": {
# 1 hour before default schedule sends again
"expires": 23.0 * 60.0 * 60.0,
"kwargs": {
"progress_bar_disable": True,
},
},
},
]

View File

@ -208,9 +208,6 @@ class TestCeleryScheduleParsing(TestCase):
"schedule": crontab(minute=10, hour=2),
"options": {
"expires": self.LLM_INDEX_EXPIRE_TIME,
"kwargs": {
"progress_bar_disable": True,
},
},
},
},
@ -270,9 +267,6 @@ class TestCeleryScheduleParsing(TestCase):
"schedule": crontab(minute=10, hour=2),
"options": {
"expires": self.LLM_INDEX_EXPIRE_TIME,
"kwargs": {
"progress_bar_disable": True,
},
},
},
},
@ -324,9 +318,6 @@ class TestCeleryScheduleParsing(TestCase):
"schedule": crontab(minute=10, hour=2),
"options": {
"expires": self.LLM_INDEX_EXPIRE_TIME,
"kwargs": {
"progress_bar_disable": True,
},
},
},
},

View File

@ -115,7 +115,7 @@ def remove_document_docstore_nodes(document: Document, index: VectorStoreIndex):
index.docstore.delete_document(node_id)
def update_llm_index(*, progress_bar_disable=False, rebuild=False):
def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
"""
Rebuild or update the LLM index.
"""
@ -123,8 +123,9 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
documents = Document.objects.all()
if not documents.exists():
logger.warning("No documents found to index.")
return
msg = "No documents found to index."
logger.warning(msg)
return msg
if (
rebuild
@ -145,6 +146,7 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
embed_model=embed_model,
show_progress=not progress_bar_disable,
)
msg = "LLM index rebuilt successfully."
else:
# Update existing index
index = load_or_build_index()
@ -173,15 +175,18 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
nodes.extend(build_document_node(document))
if nodes:
msg = "LLM index updated successfully."
logger.info(
"Updating %d nodes in LLM index.",
len(nodes),
)
index.insert_nodes(nodes)
else:
logger.info("No changes detected, skipping llm index rebuild.")
msg = "No changes detected in LLM index."
logger.info(msg)
index.storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
return msg
def llm_index_add_or_update_document(document: Document):