mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-05-23 12:58:18 -05:00
Use PaperlessTask for llmindex
This commit is contained in:
parent
51a7581860
commit
374596b1bc
@ -11,6 +11,7 @@ export enum PaperlessTaskName {
|
|||||||
TrainClassifier = 'train_classifier',
|
TrainClassifier = 'train_classifier',
|
||||||
SanityCheck = 'check_sanity',
|
SanityCheck = 'check_sanity',
|
||||||
IndexOptimize = 'index_optimize',
|
IndexOptimize = 'index_optimize',
|
||||||
|
LLMIndexUpdate = 'llmindex_update',
|
||||||
}
|
}
|
||||||
|
|
||||||
export enum PaperlessTaskStatus {
|
export enum PaperlessTaskStatus {
|
||||||
|
@ -18,4 +18,5 @@ class Command(ProgressBarMixin, BaseCommand):
|
|||||||
llmindex_index(
|
llmindex_index(
|
||||||
progress_bar_disable=self.no_progress_bar,
|
progress_bar_disable=self.no_progress_bar,
|
||||||
rebuild=options["command"] == "rebuild",
|
rebuild=options["command"] == "rebuild",
|
||||||
|
scheduled=False,
|
||||||
)
|
)
|
||||||
|
@ -0,0 +1,30 @@
|
|||||||
|
# Generated by Django 5.1.8 on 2025-04-30 02:38
|
||||||
|
|
||||||
|
from django.db import migrations
|
||||||
|
from django.db import models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """
    Alter ``PaperlessTask.task_name`` so its choices include ``llmindex_update``.

    Only the field definition is altered; the operation list contains a single
    ``AlterField`` and no data migration.
    """

    # Must run after the previous migration of the "documents" app.
    dependencies = [
        ("documents", "1065_workflowaction_assign_custom_fields_values"),
    ]

    operations = [
        migrations.AlterField(
            model_name="paperlesstask",
            name="task_name",
            field=models.CharField(
                # Full list of accepted task names; "llmindex_update" is the
                # last entry.
                choices=[
                    ("consume_file", "Consume File"),
                    ("train_classifier", "Train Classifier"),
                    ("check_sanity", "Check Sanity"),
                    ("index_optimize", "Index Optimize"),
                    ("llmindex_update", "LLM Index Update"),
                ],
                help_text="Name of the task that was run",
                max_length=255,
                null=True,
                verbose_name="Task Name",
            ),
        ),
    ]
|
@ -543,6 +543,7 @@ class PaperlessTask(ModelWithOwner):
|
|||||||
TRAIN_CLASSIFIER = ("train_classifier", _("Train Classifier"))
|
TRAIN_CLASSIFIER = ("train_classifier", _("Train Classifier"))
|
||||||
CHECK_SANITY = ("check_sanity", _("Check Sanity"))
|
CHECK_SANITY = ("check_sanity", _("Check Sanity"))
|
||||||
INDEX_OPTIMIZE = ("index_optimize", _("Index Optimize"))
|
INDEX_OPTIMIZE = ("index_optimize", _("Index Optimize"))
|
||||||
|
LLMINDEX_UPDATE = ("llmindex_update", _("LLM Index Update"))
|
||||||
|
|
||||||
task_id = models.CharField(
|
task_id = models.CharField(
|
||||||
max_length=255,
|
max_length=255,
|
||||||
|
@ -514,13 +514,29 @@ def check_scheduled_workflows():
|
|||||||
|
|
||||||
|
|
||||||
@shared_task
|
@shared_task
|
||||||
def llmindex_index(*, progress_bar_disable=True, rebuild=False, scheduled=True):
    """
    Update (or rebuild) the LLM index and record the run as a PaperlessTask.

    Returns immediately, without creating a task row, when
    ``AIConfig().llm_index_enabled()`` is falsy. Otherwise a ``PaperlessTask``
    is created in the ``STARTED`` state and finalised once the index update
    ends:

    * on success: status ``SUCCESS`` and the message returned by
      ``update_llm_index`` as the result;
    * on error: status ``FAILURE`` and the error text as the result, after
      which the exception is re-raised so the caller still sees the failure.

    Args:
        progress_bar_disable: Forwarded unchanged to ``update_llm_index``.
        rebuild: Forwarded unchanged to ``update_llm_index``.
        scheduled: When true the run is recorded as ``SCHEDULED_TASK``,
            otherwise as ``MANUAL_TASK``.
    """
    ai_config = AIConfig()
    if not ai_config.llm_index_enabled():
        return

    # One timestamp for both fields so "created" and "started" cannot drift.
    started_at = timezone.now()
    task = PaperlessTask.objects.create(
        type=(
            PaperlessTask.TaskType.SCHEDULED_TASK
            if scheduled
            else PaperlessTask.TaskType.MANUAL_TASK
        ),
        task_id=uuid.uuid4(),
        task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
        status=states.STARTED,
        date_created=started_at,
        date_started=started_at,
    )
    finished_fields = ["status", "result", "date_done"]

    try:
        # Imported inside the function, as before; doing it within the try
        # means an import failure is recorded on the task as well.
        from paperless_ai.indexing import update_llm_index

        outcome = update_llm_index(
            progress_bar_disable=progress_bar_disable,
            rebuild=rebuild,
        )
    except Exception as exc:
        # Without this the row would stay in STARTED forever after a crash.
        task.status = states.FAILURE
        task.result = str(exc)
        task.date_done = timezone.now()
        task.save(update_fields=finished_fields)
        raise

    task.status = states.SUCCESS
    task.result = outcome
    task.date_done = timezone.now()
    task.save(update_fields=finished_fields)
|
||||||
|
|
||||||
|
|
||||||
@shared_task
|
@shared_task
|
||||||
@ -531,11 +547,3 @@ def update_document_in_llm_index(document):
|
|||||||
@shared_task
|
@shared_task
|
||||||
def remove_document_from_llm_index(document):
    """
    Forward ``document`` unchanged to ``llm_index_remove_document``.

    Thin wrapper with no logic of its own; it returns ``None`` and lets any
    exception from the helper propagate.
    """
    llm_index_remove_document(document)
|
||||||
|
|
||||||
|
|
||||||
# TODO: schedule to run periodically
|
|
||||||
@shared_task
|
|
||||||
def rebuild_llm_index_task():
|
|
||||||
from paperless_ai.indexing import update_llm_index
|
|
||||||
|
|
||||||
update_llm_index(rebuild=True)
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
# Generated by Django 5.1.7 on 2025-04-24 02:09
|
# Generated by Django 5.1.8 on 2025-04-30 02:38
|
||||||
|
|
||||||
from django.db import migrations
|
from django.db import migrations
|
||||||
from django.db import models
|
from django.db import models
|
||||||
@ -19,27 +19,6 @@ class Migration(migrations.Migration):
|
|||||||
verbose_name="Enables AI features",
|
verbose_name="Enables AI features",
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
migrations.AddField(
|
|
||||||
model_name="applicationconfiguration",
|
|
||||||
name="llm_embedding_backend",
|
|
||||||
field=models.CharField(
|
|
||||||
blank=True,
|
|
||||||
choices=[("openai", "OpenAI"), ("local", "Local")],
|
|
||||||
max_length=32,
|
|
||||||
null=True,
|
|
||||||
verbose_name="Sets the LLM Embedding backend",
|
|
||||||
),
|
|
||||||
),
|
|
||||||
migrations.AddField(
|
|
||||||
model_name="applicationconfiguration",
|
|
||||||
name="llm_embedding_model",
|
|
||||||
field=models.CharField(
|
|
||||||
blank=True,
|
|
||||||
max_length=32,
|
|
||||||
null=True,
|
|
||||||
verbose_name="Sets the LLM Embedding model",
|
|
||||||
),
|
|
||||||
),
|
|
||||||
migrations.AddField(
|
migrations.AddField(
|
||||||
model_name="applicationconfiguration",
|
model_name="applicationconfiguration",
|
||||||
name="llm_api_key",
|
name="llm_api_key",
|
||||||
@ -61,6 +40,27 @@ class Migration(migrations.Migration):
|
|||||||
verbose_name="Sets the LLM backend",
|
verbose_name="Sets the LLM backend",
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name="applicationconfiguration",
|
||||||
|
name="llm_embedding_backend",
|
||||||
|
field=models.CharField(
|
||||||
|
blank=True,
|
||||||
|
choices=[("openai", "OpenAI"), ("huggingface", "Huggingface")],
|
||||||
|
max_length=32,
|
||||||
|
null=True,
|
||||||
|
verbose_name="Sets the LLM embedding backend",
|
||||||
|
),
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name="applicationconfiguration",
|
||||||
|
name="llm_embedding_model",
|
||||||
|
field=models.CharField(
|
||||||
|
blank=True,
|
||||||
|
max_length=32,
|
||||||
|
null=True,
|
||||||
|
verbose_name="Sets the LLM embedding model",
|
||||||
|
),
|
||||||
|
),
|
||||||
migrations.AddField(
|
migrations.AddField(
|
||||||
model_name="applicationconfiguration",
|
model_name="applicationconfiguration",
|
||||||
name="llm_model",
|
name="llm_model",
|
||||||
|
@ -236,9 +236,6 @@ def _parse_beat_schedule() -> dict:
|
|||||||
"options": {
|
"options": {
|
||||||
# 1 hour before default schedule sends again
|
# 1 hour before default schedule sends again
|
||||||
"expires": 23.0 * 60.0 * 60.0,
|
"expires": 23.0 * 60.0 * 60.0,
|
||||||
"kwargs": {
|
|
||||||
"progress_bar_disable": True,
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
@ -208,9 +208,6 @@ class TestCeleryScheduleParsing(TestCase):
|
|||||||
"schedule": crontab(minute=10, hour=2),
|
"schedule": crontab(minute=10, hour=2),
|
||||||
"options": {
|
"options": {
|
||||||
"expires": self.LLM_INDEX_EXPIRE_TIME,
|
"expires": self.LLM_INDEX_EXPIRE_TIME,
|
||||||
"kwargs": {
|
|
||||||
"progress_bar_disable": True,
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -270,9 +267,6 @@ class TestCeleryScheduleParsing(TestCase):
|
|||||||
"schedule": crontab(minute=10, hour=2),
|
"schedule": crontab(minute=10, hour=2),
|
||||||
"options": {
|
"options": {
|
||||||
"expires": self.LLM_INDEX_EXPIRE_TIME,
|
"expires": self.LLM_INDEX_EXPIRE_TIME,
|
||||||
"kwargs": {
|
|
||||||
"progress_bar_disable": True,
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -324,9 +318,6 @@ class TestCeleryScheduleParsing(TestCase):
|
|||||||
"schedule": crontab(minute=10, hour=2),
|
"schedule": crontab(minute=10, hour=2),
|
||||||
"options": {
|
"options": {
|
||||||
"expires": self.LLM_INDEX_EXPIRE_TIME,
|
"expires": self.LLM_INDEX_EXPIRE_TIME,
|
||||||
"kwargs": {
|
|
||||||
"progress_bar_disable": True,
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
@ -115,7 +115,7 @@ def remove_document_docstore_nodes(document: Document, index: VectorStoreIndex):
|
|||||||
index.docstore.delete_document(node_id)
|
index.docstore.delete_document(node_id)
|
||||||
|
|
||||||
|
|
||||||
def update_llm_index(*, progress_bar_disable=False, rebuild=False):
|
def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
|
||||||
"""
|
"""
|
||||||
Rebuild or update the LLM index.
|
Rebuild or update the LLM index.
|
||||||
"""
|
"""
|
||||||
@ -123,8 +123,9 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
|
|||||||
|
|
||||||
documents = Document.objects.all()
|
documents = Document.objects.all()
|
||||||
if not documents.exists():
|
if not documents.exists():
|
||||||
logger.warning("No documents found to index.")
|
msg = "No documents found to index."
|
||||||
return
|
logger.warning(msg)
|
||||||
|
return msg
|
||||||
|
|
||||||
if (
|
if (
|
||||||
rebuild
|
rebuild
|
||||||
@ -145,6 +146,7 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
|
|||||||
embed_model=embed_model,
|
embed_model=embed_model,
|
||||||
show_progress=not progress_bar_disable,
|
show_progress=not progress_bar_disable,
|
||||||
)
|
)
|
||||||
|
msg = "LLM index rebuilt successfully."
|
||||||
else:
|
else:
|
||||||
# Update existing index
|
# Update existing index
|
||||||
index = load_or_build_index()
|
index = load_or_build_index()
|
||||||
@ -173,15 +175,18 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
|
|||||||
nodes.extend(build_document_node(document))
|
nodes.extend(build_document_node(document))
|
||||||
|
|
||||||
if nodes:
|
if nodes:
|
||||||
|
msg = "LLM index updated successfully."
|
||||||
logger.info(
|
logger.info(
|
||||||
"Updating %d nodes in LLM index.",
|
"Updating %d nodes in LLM index.",
|
||||||
len(nodes),
|
len(nodes),
|
||||||
)
|
)
|
||||||
index.insert_nodes(nodes)
|
index.insert_nodes(nodes)
|
||||||
else:
|
else:
|
||||||
logger.info("No changes detected, skipping llm index rebuild.")
|
msg = "No changes detected in LLM index."
|
||||||
|
logger.info(msg)
|
||||||
|
|
||||||
index.storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
|
index.storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
|
||||||
|
return msg
|
||||||
|
|
||||||
|
|
||||||
def llm_index_add_or_update_document(document: Document):
|
def llm_index_add_or_update_document(document: Document):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user