Use PaperlessTask for llmindex

This commit is contained in:
shamoon 2025-04-29 19:40:05 -07:00
parent 51a7581860
commit 374596b1bc
No known key found for this signature in database
9 changed files with 82 additions and 48 deletions

View File

@ -11,6 +11,7 @@ export enum PaperlessTaskName {
TrainClassifier = 'train_classifier',
SanityCheck = 'check_sanity',
IndexOptimize = 'index_optimize',
LLMIndexUpdate = 'llmindex_update',
}
export enum PaperlessTaskStatus {

View File

@ -18,4 +18,5 @@ class Command(ProgressBarMixin, BaseCommand):
llmindex_index(
progress_bar_disable=self.no_progress_bar,
rebuild=options["command"] == "rebuild",
scheduled=False,
)

View File

@ -0,0 +1,30 @@
# Generated by Django 5.1.8 on 2025-04-30 02:38
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
    """Add the 'llmindex_update' choice to PaperlessTask.task_name.

    This only widens the declared choices on an existing CharField, so it is
    a schema-metadata change; no data migration is required.
    """

    dependencies = [
        ("documents", "1065_workflowaction_assign_custom_fields_values"),
    ]

    operations = [
        migrations.AlterField(
            model_name="paperlesstask",
            name="task_name",
            field=models.CharField(
                # Must stay in sync with PaperlessTask.TaskName (and the
                # frontend PaperlessTaskName enum).
                choices=[
                    ("consume_file", "Consume File"),
                    ("train_classifier", "Train Classifier"),
                    ("check_sanity", "Check Sanity"),
                    ("index_optimize", "Index Optimize"),
                    ("llmindex_update", "LLM Index Update"),
                ],
                help_text="Name of the task that was run",
                max_length=255,
                # Nullable: pre-existing task rows have no task_name recorded.
                null=True,
                verbose_name="Task Name",
            ),
        ),
    ]

View File

@ -543,6 +543,7 @@ class PaperlessTask(ModelWithOwner):
TRAIN_CLASSIFIER = ("train_classifier", _("Train Classifier"))
CHECK_SANITY = ("check_sanity", _("Check Sanity"))
INDEX_OPTIMIZE = ("index_optimize", _("Index Optimize"))
LLMINDEX_UPDATE = ("llmindex_update", _("LLM Index Update"))
task_id = models.CharField(
max_length=255,

View File

@ -514,13 +514,29 @@ def check_scheduled_workflows():
@shared_task
def llmindex_index(*, progress_bar_disable=True, rebuild=False, scheduled=True):
    """Update (or rebuild) the LLM vector index, tracked as a PaperlessTask.

    Args:
        progress_bar_disable: Suppress the indexing progress bar (default True,
            since the usual caller is the scheduled Celery beat task).
        rebuild: Rebuild the index from scratch instead of updating it.
        scheduled: Whether this run was triggered by the scheduler (True) or
            manually, e.g. via the management command (False); recorded on the
            PaperlessTask row.

    No-op when the LLM index feature is disabled in the AI configuration.
    """
    ai_config = AIConfig()
    if not ai_config.llm_index_enabled():
        return
    task = PaperlessTask.objects.create(
        type=PaperlessTask.TaskType.SCHEDULED_TASK
        if scheduled
        else PaperlessTask.TaskType.MANUAL_TASK,
        task_id=uuid.uuid4(),
        task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
        status=states.STARTED,
        date_created=timezone.now(),
        date_started=timezone.now(),
    )
    # Imported lazily to avoid importing the AI stack at module load time.
    from paperless_ai.indexing import update_llm_index

    try:
        result = update_llm_index(
            progress_bar_disable=progress_bar_disable,
            rebuild=rebuild,
        )
        task.status = states.SUCCESS
        task.result = result
    except Exception as e:
        # Without this, a failed indexing run would leave the task row
        # stuck in STARTED forever. Record the failure, then re-raise so
        # Celery still sees the task as failed.
        task.status = states.FAILURE
        task.result = str(e)
        raise
    finally:
        task.date_done = timezone.now()
        task.save(update_fields=["status", "result", "date_done"])
@shared_task
@ -531,11 +547,3 @@ def update_document_in_llm_index(document):
@shared_task
def remove_document_from_llm_index(document):
llm_index_remove_document(document)
# TODO: schedule to run periodically
@shared_task
def rebuild_llm_index_task():
from paperless_ai.indexing import update_llm_index
update_llm_index(rebuild=True)

View File

@ -1,4 +1,4 @@
# Generated by Django 5.1.7 on 2025-04-24 02:09
# Generated by Django 5.1.8 on 2025-04-30 02:38
from django.db import migrations
from django.db import models
@ -19,27 +19,6 @@ class Migration(migrations.Migration):
verbose_name="Enables AI features",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_embedding_backend",
field=models.CharField(
blank=True,
choices=[("openai", "OpenAI"), ("local", "Local")],
max_length=32,
null=True,
verbose_name="Sets the LLM Embedding backend",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_embedding_model",
field=models.CharField(
blank=True,
max_length=32,
null=True,
verbose_name="Sets the LLM Embedding model",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_api_key",
@ -61,6 +40,27 @@ class Migration(migrations.Migration):
verbose_name="Sets the LLM backend",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_embedding_backend",
field=models.CharField(
blank=True,
choices=[("openai", "OpenAI"), ("huggingface", "Huggingface")],
max_length=32,
null=True,
verbose_name="Sets the LLM embedding backend",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_embedding_model",
field=models.CharField(
blank=True,
max_length=32,
null=True,
verbose_name="Sets the LLM embedding model",
),
),
migrations.AddField(
model_name="applicationconfiguration",
name="llm_model",

View File

@ -236,9 +236,6 @@ def _parse_beat_schedule() -> dict:
"options": {
# 1 hour before default schedule sends again
"expires": 23.0 * 60.0 * 60.0,
"kwargs": {
"progress_bar_disable": True,
},
},
},
]

View File

@ -208,9 +208,6 @@ class TestCeleryScheduleParsing(TestCase):
"schedule": crontab(minute=10, hour=2),
"options": {
"expires": self.LLM_INDEX_EXPIRE_TIME,
"kwargs": {
"progress_bar_disable": True,
},
},
},
},
@ -270,9 +267,6 @@ class TestCeleryScheduleParsing(TestCase):
"schedule": crontab(minute=10, hour=2),
"options": {
"expires": self.LLM_INDEX_EXPIRE_TIME,
"kwargs": {
"progress_bar_disable": True,
},
},
},
},
@ -324,9 +318,6 @@ class TestCeleryScheduleParsing(TestCase):
"schedule": crontab(minute=10, hour=2),
"options": {
"expires": self.LLM_INDEX_EXPIRE_TIME,
"kwargs": {
"progress_bar_disable": True,
},
},
},
},

View File

@ -115,7 +115,7 @@ def remove_document_docstore_nodes(document: Document, index: VectorStoreIndex):
index.docstore.delete_document(node_id)
def update_llm_index(*, progress_bar_disable=False, rebuild=False):
def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
"""
Rebuild or update the LLM index.
"""
@ -123,8 +123,9 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
documents = Document.objects.all()
if not documents.exists():
logger.warning("No documents found to index.")
return
msg = "No documents found to index."
logger.warning(msg)
return msg
if (
rebuild
@ -145,6 +146,7 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
embed_model=embed_model,
show_progress=not progress_bar_disable,
)
msg = "LLM index rebuilt successfully."
else:
# Update existing index
index = load_or_build_index()
@ -173,15 +175,18 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
nodes.extend(build_document_node(document))
if nodes:
msg = "LLM index updated successfully."
logger.info(
"Updating %d nodes in LLM index.",
len(nodes),
)
index.insert_nodes(nodes)
else:
logger.info("No changes detected, skipping llm index rebuild.")
msg = "No changes detected in LLM index."
logger.info(msg)
index.storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
return msg
def llm_index_add_or_update_document(document: Document):