Feature: Paperless AI (#10319)
@@ -11,6 +11,7 @@ class DocumentsConfig(AppConfig):
        from documents.signals import document_consumption_finished
        from documents.signals import document_updated
        from documents.signals.handlers import add_inbox_tags
        from documents.signals.handlers import add_or_update_document_in_llm_index
        from documents.signals.handlers import add_to_index
        from documents.signals.handlers import run_workflows_added
        from documents.signals.handlers import run_workflows_updated
@@ -26,6 +27,7 @@ class DocumentsConfig(AppConfig):
        document_consumption_finished.connect(set_storage_path)
        document_consumption_finished.connect(add_to_index)
        document_consumption_finished.connect(run_workflows_added)
        document_consumption_finished.connect(add_or_update_document_in_llm_index)
        document_updated.connect(run_workflows_updated)

        import documents.schema  # noqa: F401
@@ -41,6 +41,7 @@ class SuggestionCacheData:
CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified"
LLM_CACHE_CLASSIFIER_VERSION: Final[int] = 1000  # Marker distinguishing LLM suggestions from classifier ones

CACHE_1_MINUTE: Final[int] = 60
CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
@@ -196,6 +197,54 @@ def refresh_suggestions_cache(
    cache.touch(doc_key, timeout)


def get_llm_suggestion_cache(
    document_id: int,
    backend: str,
) -> SuggestionCacheData | None:
    doc_key = get_suggestion_cache_key(document_id)
    data: SuggestionCacheData = cache.get(doc_key)

    if data and data.classifier_hash == backend:
        return data

    return None


def set_llm_suggestions_cache(
    document_id: int,
    suggestions: dict,
    *,
    backend: str,
    timeout: int = CACHE_50_MINUTES,
) -> None:
    """
    Cache LLM-generated suggestions using a backend-specific identifier (e.g. 'openai:gpt-4').
    """
    doc_key = get_suggestion_cache_key(document_id)
    cache.set(
        doc_key,
        SuggestionCacheData(
            classifier_version=LLM_CACHE_CLASSIFIER_VERSION,
            classifier_hash=backend,
            suggestions=suggestions,
        ),
        timeout,
    )


def invalidate_llm_suggestions_cache(
    document_id: int,
) -> None:
    """
    Invalidate the LLM suggestions cache for a specific document.
    """
    doc_key = get_suggestion_cache_key(document_id)
    data: SuggestionCacheData = cache.get(doc_key)

    if data:
        cache.delete(doc_key)


def get_metadata_cache_key(document_id: int) -> str:
    """
    Returns the basic key for a document's metadata
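A quick sketch of how these helpers fit together (assuming `doc_id` refers to an existing document): entries are keyed per backend string, so a cache hit requires the same `backend` value that wrote the entry.

```python
from documents.caching import (
    get_llm_suggestion_cache,
    invalidate_llm_suggestions_cache,
    set_llm_suggestions_cache,
)

set_llm_suggestions_cache(doc_id, {"tags": ["invoice"]}, backend="openai")
cached = get_llm_suggestion_cache(doc_id, backend="openai")  # hit: same backend
assert cached.suggestions == {"tags": ["invoice"]}
get_llm_suggestion_cache(doc_id, backend="ollama")  # None: backend mismatch
invalidate_llm_suggestions_cache(doc_id)  # dropped, e.g. after the document changes
```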
src/documents/management/commands/document_llmindex.py (new file, 22 lines)
@@ -0,0 +1,22 @@
from django.core.management import BaseCommand
from django.db import transaction

from documents.management.commands.mixins import ProgressBarMixin
from documents.tasks import llmindex_index


class Command(ProgressBarMixin, BaseCommand):
    help = "Manages the LLM-based vector index for Paperless."

    def add_arguments(self, parser):
        parser.add_argument("command", choices=["rebuild", "update"])
        self.add_argument_progress_bar_mixin(parser)

    def handle(self, *args, **options):
        self.handle_progress_bar_mixin(**options)
        with transaction.atomic():
            llmindex_index(
                progress_bar_disable=self.no_progress_bar,
                rebuild=options["command"] == "rebuild",
                scheduled=False,
            )
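For reference, a hedged sketch of driving the new command from code or a test via Django's standard `call_command` (assuming the progress-bar mixin registers its usual `--no-progress-bar` flag):

```python
from django.core.management import call_command

# "rebuild" recreates the vector index from scratch; "update" only syncs changes.
call_command("document_llmindex", "rebuild", "--no-progress-bar")
```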
@@ -0,0 +1,30 @@
# Generated by Django 5.1.8 on 2025-04-30 02:38

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1074_workflowrun_deleted_at_workflowrun_restored_at_and_more"),
    ]

    operations = [
        migrations.AlterField(
            model_name="paperlesstask",
            name="task_name",
            field=models.CharField(
                choices=[
                    ("consume_file", "Consume File"),
                    ("train_classifier", "Train Classifier"),
                    ("check_sanity", "Check Sanity"),
                    ("index_optimize", "Index Optimize"),
                    ("llmindex_update", "LLM Index Update"),
                ],
                help_text="Name of the task that was run",
                max_length=255,
                null=True,
                verbose_name="Task Name",
            ),
        ),
    ]
@@ -598,6 +598,7 @@ class PaperlessTask(ModelWithOwner):
        TRAIN_CLASSIFIER = ("train_classifier", _("Train Classifier"))
        CHECK_SANITY = ("check_sanity", _("Check Sanity"))
        INDEX_OPTIMIZE = ("index_optimize", _("Index Optimize"))
        LLMINDEX_UPDATE = ("llmindex_update", _("LLM Index Update"))

    task_id = models.CharField(
        max_length=255,
@@ -26,6 +26,8 @@ from filelock import FileLock

from documents import matching
from documents.caching import clear_document_caches
from documents.caching import invalidate_llm_suggestions_cache
from documents.data_models import ConsumableDocument
from documents.file_handling import create_source_path_directory
from documents.file_handling import delete_empty_directories
from documents.file_handling import generate_filename
@@ -52,6 +54,7 @@ from documents.workflows.mutations import apply_assignment_to_overrides
from documents.workflows.mutations import apply_removal_to_document
from documents.workflows.mutations import apply_removal_to_overrides
from documents.workflows.utils import get_workflows_for_trigger
from paperless.config import AIConfig

if TYPE_CHECKING:
    from documents.classifier import DocumentClassifier
@@ -638,6 +641,15 @@ def cleanup_custom_field_deletion(sender, instance: CustomField, **kwargs):
    )


@receiver(models.signals.post_save, sender=Document)
def update_llm_suggestions_cache(sender, instance, **kwargs):
    """
    Invalidate the LLM suggestions cache when a document is saved.
    """
    # Invalidate the cache for the document
    invalidate_llm_suggestions_cache(instance.pk)


@receiver(models.signals.post_delete, sender=User)
@receiver(models.signals.post_delete, sender=Group)
def cleanup_user_deletion(sender, instance: User | Group, **kwargs):
@@ -944,3 +956,26 @@ def close_connection_pool_on_worker_init(**kwargs):
    for conn in connections.all(initialized_only=True):
        if conn.alias == "default" and hasattr(conn, "pool") and conn.pool:
            conn.close_pool()


def add_or_update_document_in_llm_index(sender, document, **kwargs):
    """
    Add or update a document in the LLM index when it is created or updated.
    """
    ai_config = AIConfig()
    if ai_config.llm_index_enabled:
        from documents.tasks import update_document_in_llm_index

        update_document_in_llm_index.delay(document)


@receiver(models.signals.post_delete, sender=Document)
def delete_document_from_llm_index(sender, instance: Document, **kwargs):
    """
    Delete a document from the LLM index when it is deleted.
    """
    ai_config = AIConfig()
    if ai_config.llm_index_enabled:
        from documents.tasks import remove_document_from_llm_index

        remove_document_from_llm_index.delay(instance)
@@ -54,6 +54,10 @@ from documents.signals import document_updated
from documents.signals.handlers import cleanup_document_deletion
from documents.signals.handlers import run_workflows
from documents.workflows.utils import get_workflows_for_trigger
from paperless.config import AIConfig
from paperless_ai.indexing import llm_index_add_or_update_document
from paperless_ai.indexing import llm_index_remove_document
from paperless_ai.indexing import update_llm_index

if settings.AUDIT_LOG_ENABLED:
    from auditlog.models import LogEntry
@@ -242,6 +246,13 @@ def bulk_update_documents(document_ids):
        for doc in documents:
            index.update_document(writer, doc)

    ai_config = AIConfig()
    if ai_config.llm_index_enabled:
        update_llm_index(
            progress_bar_disable=True,
            rebuild=False,
        )


@shared_task
def update_document_content_maybe_archive_file(document_id):
@@ -341,6 +352,10 @@ def update_document_content_maybe_archive_file(document_id):
        with index.open_index_writer() as writer:
            index.update_document(writer, document)

        ai_config = AIConfig()
        if ai_config.llm_index_enabled:
            llm_index_add_or_update_document(document)

        clear_document_caches(document.pk)

    except Exception:
@@ -558,3 +573,55 @@ def update_document_parent_tags(tag: Tag, new_parent: Tag) -> None:

    if affected:
        bulk_update_documents.delay(document_ids=list(affected))


@shared_task
def llmindex_index(
    *,
    progress_bar_disable=True,
    rebuild=False,
    scheduled=True,
    auto=False,
):
    ai_config = AIConfig()
    if ai_config.llm_index_enabled:
        task = PaperlessTask.objects.create(
            type=PaperlessTask.TaskType.SCHEDULED_TASK
            if scheduled
            else PaperlessTask.TaskType.AUTO
            if auto
            else PaperlessTask.TaskType.MANUAL_TASK,
            task_id=uuid.uuid4(),
            task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
            status=states.STARTED,
            date_created=timezone.now(),
            date_started=timezone.now(),
        )
        from paperless_ai.indexing import update_llm_index

        try:
            result = update_llm_index(
                progress_bar_disable=progress_bar_disable,
                rebuild=rebuild,
            )
            task.status = states.SUCCESS
            task.result = result
        except Exception as e:
            logger.error("LLM index error: " + str(e))
            task.status = states.FAILURE
            task.result = str(e)

        task.date_done = timezone.now()
        task.save(update_fields=["status", "result", "date_done"])
    else:
        logger.info("LLM index is disabled, skipping update.")


@shared_task
def update_document_in_llm_index(document):
    llm_index_add_or_update_document(document)


@shared_task
def remove_document_from_llm_index(document):
    llm_index_remove_document(document)
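One readability note on the task-type selection above: the stacked conditional expression is equivalent to the following spelled-out form, which may be easier to scan.

```python
# Equivalent spelled-out form of the nested conditional (illustrative only):
if scheduled:
    task_type = PaperlessTask.TaskType.SCHEDULED_TASK
elif auto:
    task_type = PaperlessTask.TaskType.AUTO
else:
    task_type = PaperlessTask.TaskType.MANUAL_TASK
```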
@@ -1,6 +1,7 @@
import json
from io import BytesIO
from pathlib import Path
from unittest.mock import patch

from django.contrib.auth.models import User
from django.core.files.uploadedfile import SimpleUploadedFile
@@ -66,6 +67,13 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
                "barcode_max_pages": None,
                "barcode_enable_tag": None,
                "barcode_tag_mapping": None,
                "ai_enabled": False,
                "llm_embedding_backend": None,
                "llm_embedding_model": None,
                "llm_backend": None,
                "llm_model": None,
                "llm_api_key": None,
                "llm_endpoint": None,
            },
        )
@@ -611,3 +619,76 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
        )
        self.assertEqual(response.status_code, status.HTTP_405_METHOD_NOT_ALLOWED)
        self.assertEqual(ApplicationConfiguration.objects.count(), 1)

    def test_update_llm_api_key(self):
        """
        GIVEN:
            - Existing config with llm_api_key specified
        WHEN:
            - API to update llm_api_key is called with all *s
            - API to update llm_api_key is called with empty string
        THEN:
            - llm_api_key is unchanged
            - llm_api_key is set to None
        """
        config = ApplicationConfiguration.objects.first()
        config.llm_api_key = "1234567890"
        config.save()

        # Test with all *
        response = self.client.patch(
            f"{self.ENDPOINT}1/",
            json.dumps(
                {
                    "llm_api_key": "*" * 32,
                },
            ),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        config.refresh_from_db()
        self.assertEqual(config.llm_api_key, "1234567890")
        # Test with empty string
        response = self.client.patch(
            f"{self.ENDPOINT}1/",
            json.dumps(
                {
                    "llm_api_key": "",
                },
            ),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        config.refresh_from_db()
        self.assertEqual(config.llm_api_key, None)

    def test_enable_ai_index_triggers_update(self):
        """
        GIVEN:
            - Existing config with AI disabled
        WHEN:
            - Config is updated to enable AI with llm_embedding_backend
        THEN:
            - LLM index is triggered to update
        """
        config = ApplicationConfiguration.objects.first()
        config.ai_enabled = False
        config.llm_embedding_backend = None
        config.save()

        with (
            patch("documents.tasks.llmindex_index.delay") as mock_update,
            patch("paperless_ai.indexing.vector_store_file_exists") as mock_exists,
        ):
            mock_exists.return_value = False
            self.client.patch(
                f"{self.ENDPOINT}1/",
                json.dumps(
                    {
                        "ai_enabled": True,
                        "llm_embedding_backend": "openai",
                    },
                ),
                content_type="application/json",
            )
            mock_update.assert_called_once()
@@ -310,3 +310,69 @@ class TestSystemStatus(APITestCase):
            "ERROR",
        )
        self.assertIsNotNone(response.data["tasks"]["sanity_check_error"])

    def test_system_status_ai_disabled(self):
        """
        GIVEN:
            - The AI feature is disabled
        WHEN:
            - The user requests the system status
        THEN:
            - The response contains the correct AI status
        """
        with override_settings(AI_ENABLED=False):
            self.client.force_login(self.user)
            response = self.client.get(self.ENDPOINT)
            self.assertEqual(response.status_code, status.HTTP_200_OK)
            self.assertEqual(response.data["tasks"]["llmindex_status"], "DISABLED")
            self.assertIsNone(response.data["tasks"]["llmindex_error"])

    def test_system_status_ai_enabled(self):
        """
        GIVEN:
            - The AI index feature is enabled, but no tasks are found
            - The AI index feature is enabled and a task is found
        WHEN:
            - The user requests the system status
        THEN:
            - The response contains the correct AI status
        """
        with override_settings(AI_ENABLED=True, LLM_EMBEDDING_BACKEND="openai"):
            self.client.force_login(self.user)

            # No tasks found
            response = self.client.get(self.ENDPOINT)
            self.assertEqual(response.status_code, status.HTTP_200_OK)
            self.assertEqual(response.data["tasks"]["llmindex_status"], "WARNING")

            PaperlessTask.objects.create(
                type=PaperlessTask.TaskType.SCHEDULED_TASK,
                status=states.SUCCESS,
                task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
            )
            response = self.client.get(self.ENDPOINT)
            self.assertEqual(response.status_code, status.HTTP_200_OK)
            self.assertEqual(response.data["tasks"]["llmindex_status"], "OK")
            self.assertIsNone(response.data["tasks"]["llmindex_error"])

    def test_system_status_ai_error(self):
        """
        GIVEN:
            - The AI index feature is enabled and a task is found with an error
        WHEN:
            - The user requests the system status
        THEN:
            - The response contains the correct AI status
        """
        with override_settings(AI_ENABLED=True, LLM_EMBEDDING_BACKEND="openai"):
            PaperlessTask.objects.create(
                type=PaperlessTask.TaskType.SCHEDULED_TASK,
                status=states.FAILURE,
                task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
                result="AI index update failed",
            )
            self.client.force_login(self.user)
            response = self.client.get(self.ENDPOINT)
            self.assertEqual(response.status_code, status.HTTP_200_OK)
            self.assertEqual(response.data["tasks"]["llmindex_status"], "ERROR")
            self.assertIsNotNone(response.data["tasks"]["llmindex_error"])
@@ -49,6 +49,7 @@ class TestApiUiSettings(DirectoriesMixin, APITestCase):
                    "backend_setting": "default",
                },
                "email_enabled": False,
                "ai_enabled": False,
            },
        )
@@ -3,14 +3,17 @@ from datetime import timedelta
from pathlib import Path
from unittest import mock

from celery import states
from django.conf import settings
from django.test import TestCase
from django.test import override_settings
from django.utils import timezone

from documents import tasks
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import PaperlessTask
from documents.models import Tag
from documents.sanity_checker import SanityCheckFailedException
from documents.sanity_checker import SanityCheckMessages
@@ -270,3 +273,103 @@ class TestUpdateContent(DirectoriesMixin, TestCase):

        tasks.update_document_content_maybe_archive_file(doc.pk)
        self.assertNotEqual(Document.objects.get(pk=doc.pk).content, "test")


class TestAIIndex(DirectoriesMixin, TestCase):
    @override_settings(
        AI_ENABLED=True,
        LLM_EMBEDDING_BACKEND="huggingface",
    )
    def test_ai_index_success(self):
        """
        GIVEN:
            - Document exists, AI is enabled, llm index backend is set
        WHEN:
            - llmindex_index task is called
        THEN:
            - update_llm_index is called, and the task is marked as success
        """
        Document.objects.create(
            title="test",
            content="my document",
            checksum="wow",
        )
        # lazy-loaded so mock the actual function
        with mock.patch("paperless_ai.indexing.update_llm_index") as update_llm_index:
            update_llm_index.return_value = "LLM index updated successfully."
            tasks.llmindex_index()
            update_llm_index.assert_called_once()
            task = PaperlessTask.objects.get(
                task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
            )
            self.assertEqual(task.status, states.SUCCESS)
            self.assertEqual(task.result, "LLM index updated successfully.")

    @override_settings(
        AI_ENABLED=True,
        LLM_EMBEDDING_BACKEND="huggingface",
    )
    def test_ai_index_failure(self):
        """
        GIVEN:
            - Document exists, AI is enabled, llm index backend is set
        WHEN:
            - llmindex_index task is called
        THEN:
            - update_llm_index raises an exception, and the task is marked as failure
        """
        Document.objects.create(
            title="test",
            content="my document",
            checksum="wow",
        )
        # lazy-loaded so mock the actual function
        with mock.patch("paperless_ai.indexing.update_llm_index") as update_llm_index:
            update_llm_index.side_effect = Exception("LLM index update failed.")
            tasks.llmindex_index()
            update_llm_index.assert_called_once()
            task = PaperlessTask.objects.get(
                task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
            )
            self.assertEqual(task.status, states.FAILURE)
            self.assertIn("LLM index update failed.", task.result)

    def test_update_document_in_llm_index(self):
        """
        GIVEN:
            - Nothing
        WHEN:
            - update_document_in_llm_index task is called
        THEN:
            - llm_index_add_or_update_document is called
        """
        doc = Document.objects.create(
            title="test",
            content="my document",
            checksum="wow",
        )
        with mock.patch(
            "documents.tasks.llm_index_add_or_update_document",
        ) as llm_index_add_or_update_document:
            tasks.update_document_in_llm_index(doc)
            llm_index_add_or_update_document.assert_called_once_with(doc)

    def test_remove_document_from_llm_index(self):
        """
        GIVEN:
            - Nothing
        WHEN:
            - remove_document_from_llm_index task is called
        THEN:
            - llm_index_remove_document is called
        """
        doc = Document.objects.create(
            title="test",
            content="my document",
            checksum="wow",
        )
        with mock.patch(
            "documents.tasks.llm_index_remove_document",
        ) as llm_index_remove_document:
            tasks.remove_document_from_llm_index(doc)
            llm_index_remove_document.assert_called_once_with(doc)
@@ -2,6 +2,8 @@ import json
import tempfile
from datetime import timedelta
from pathlib import Path
from unittest.mock import MagicMock
from unittest.mock import patch

from django.conf import settings
from django.contrib.auth.models import Group
@@ -15,9 +17,15 @@ from django.utils import timezone
from guardian.shortcuts import assign_perm
from rest_framework import status

from documents.caching import get_llm_suggestion_cache
from documents.caching import set_llm_suggestions_cache
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import ShareLink
from documents.models import StoragePath
from documents.models import Tag
from documents.signals.handlers import update_llm_suggestions_cache
from documents.tests.utils import DirectoriesMixin
from paperless.models import ApplicationConfiguration
@@ -270,3 +278,176 @@ class TestViews(DirectoriesMixin, TestCase):
            f"Possible N+1 queries detected: {num_queries_small} queries for 2 tags, "
            f"but {num_queries_large} queries for 50 tags"
        )


class TestAISuggestions(DirectoriesMixin, TestCase):
    def setUp(self):
        self.user = User.objects.create_superuser(username="testuser")
        self.document = Document.objects.create(
            title="Test Document",
            filename="test.pdf",
            mime_type="application/pdf",
        )
        self.tag1 = Tag.objects.create(name="tag1")
        self.correspondent1 = Correspondent.objects.create(name="correspondent1")
        self.document_type1 = DocumentType.objects.create(name="type1")
        self.path1 = StoragePath.objects.create(name="path1")
        super().setUp()

    @patch("documents.views.get_llm_suggestion_cache")
    @patch("documents.views.refresh_suggestions_cache")
    @override_settings(
        AI_ENABLED=True,
        LLM_BACKEND="mock_backend",
    )
    def test_suggestions_with_cached_llm(self, mock_refresh_cache, mock_get_cache):
        mock_get_cache.return_value = MagicMock(suggestions={"tags": ["tag1", "tag2"]})

        self.client.force_login(user=self.user)
        response = self.client.get(f"/api/documents/{self.document.pk}/suggestions/")
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.json(), {"tags": ["tag1", "tag2"]})
        mock_refresh_cache.assert_called_once_with(self.document.pk)

    @patch("documents.views.get_ai_document_classification")
    @override_settings(
        AI_ENABLED=True,
        LLM_BACKEND="mock_backend",
    )
    def test_suggestions_with_ai_enabled(
        self,
        mock_get_ai_classification,
    ):
        mock_get_ai_classification.return_value = {
            "title": "AI Title",
            "tags": ["tag1", "tag2"],
            "correspondents": ["correspondent1"],
            "document_types": ["type1"],
            "storage_paths": ["path1"],
            "dates": ["2023-01-01"],
        }

        self.client.force_login(user=self.user)
        response = self.client.get(f"/api/documents/{self.document.pk}/suggestions/")
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(
            response.json(),
            {
                "title": "AI Title",
                "tags": [self.tag1.pk],
                "suggested_tags": ["tag2"],
                "correspondents": [self.correspondent1.pk],
                "suggested_correspondents": [],
                "document_types": [self.document_type1.pk],
                "suggested_document_types": [],
                "storage_paths": [self.path1.pk],
                "suggested_storage_paths": [],
                "dates": ["2023-01-01"],
            },
        )

    def test_invalidate_suggestions_cache(self):
        self.client.force_login(user=self.user)
        suggestions = {
            "title": "AI Title",
            "tags": ["tag1", "tag2"],
            "correspondents": ["correspondent1"],
            "document_types": ["type1"],
            "storage_paths": ["path1"],
            "dates": ["2023-01-01"],
        }
        set_llm_suggestions_cache(
            self.document.pk,
            suggestions,
            backend="mock_backend",
        )
        self.assertEqual(
            get_llm_suggestion_cache(
                self.document.pk,
                backend="mock_backend",
            ).suggestions,
            suggestions,
        )
        # post_save signal triggered
        update_llm_suggestions_cache(
            sender=None,
            instance=self.document,
        )
        self.assertIsNone(
            get_llm_suggestion_cache(
                self.document.pk,
                backend="mock_backend",
            ),
        )


class TestAIChatStreamingView(DirectoriesMixin, TestCase):
    ENDPOINT = "/api/documents/chat/"

    def setUp(self):
        self.user = User.objects.create_user(username="testuser", password="pass")
        self.client.force_login(user=self.user)
        self.document = Document.objects.create(
            title="Test Document",
            filename="test.pdf",
            mime_type="application/pdf",
        )
        super().setUp()

    @override_settings(AI_ENABLED=False)
    def test_post_ai_disabled(self):
        response = self.client.post(
            self.ENDPOINT,
            data='{"q": "question"}',
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 400)
        self.assertIn(b"AI is required for this feature", response.content)

    @patch("documents.views.stream_chat_with_documents")
    @patch("documents.views.get_objects_for_user_owner_aware")
    @override_settings(AI_ENABLED=True)
    def test_post_no_document_id(self, mock_get_objects, mock_stream_chat):
        mock_get_objects.return_value = [self.document]
        mock_stream_chat.return_value = iter([b"data"])
        response = self.client.post(
            self.ENDPOINT,
            data='{"q": "question"}',
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "text/event-stream")

    @patch("documents.views.stream_chat_with_documents")
    @override_settings(AI_ENABLED=True)
    def test_post_with_document_id(self, mock_stream_chat):
        mock_stream_chat.return_value = iter([b"data"])
        response = self.client.post(
            self.ENDPOINT,
            data=f'{{"q": "question", "document_id": {self.document.pk}}}',
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "text/event-stream")

    @override_settings(AI_ENABLED=True)
    def test_post_with_invalid_document_id(self):
        response = self.client.post(
            self.ENDPOINT,
            data='{"q": "question", "document_id": 999999}',
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 400)
        self.assertIn(b"Document not found", response.content)

    @patch("documents.views.has_perms_owner_aware")
    @override_settings(AI_ENABLED=True)
    def test_post_with_document_id_no_permission(self, mock_has_perms):
        mock_has_perms.return_value = False
        response = self.client.post(
            self.ENDPOINT,
            data=f'{{"q": "question", "document_id": {self.document.pk}}}',
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 403)
        self.assertIn(b"Insufficient permissions", response.content)
@@ -45,6 +45,7 @@ from django.http import HttpResponseBadRequest
from django.http import HttpResponseForbidden
from django.http import HttpResponseRedirect
from django.http import HttpResponseServerError
from django.http import StreamingHttpResponse
from django.shortcuts import get_object_or_404
from django.utils import timezone
from django.utils.decorators import method_decorator
@@ -52,6 +53,7 @@ from django.utils.timezone import make_aware
from django.utils.translation import get_language
from django.views import View
from django.views.decorators.cache import cache_control
from django.views.decorators.csrf import ensure_csrf_cookie
from django.views.decorators.http import condition
from django.views.decorators.http import last_modified
from django.views.generic import TemplateView
@@ -91,10 +93,12 @@ from documents import index
from documents.bulk_download import ArchiveOnlyStrategy
from documents.bulk_download import OriginalAndArchiveStrategy
from documents.bulk_download import OriginalsOnlyStrategy
from documents.caching import get_llm_suggestion_cache
from documents.caching import get_metadata_cache
from documents.caching import get_suggestion_cache
from documents.caching import refresh_metadata_cache
from documents.caching import refresh_suggestions_cache
from documents.caching import set_llm_suggestions_cache
from documents.caching import set_metadata_cache
from documents.caching import set_suggestions_cache
from documents.classifier import load_classifier
@@ -182,18 +186,27 @@ from documents.signals import document_updated
from documents.tasks import consume_file
from documents.tasks import empty_trash
from documents.tasks import index_optimize
from documents.tasks import llmindex_index
from documents.tasks import sanity_check
from documents.tasks import train_classifier
from documents.tasks import update_document_parent_tags
from documents.utils import get_boolean
from paperless import version
from paperless.celery import app as celery_app
from paperless.config import AIConfig
from paperless.config import GeneralConfig
from paperless.db import GnuPG
from paperless.models import ApplicationConfiguration
from paperless.serialisers import GroupSerializer
from paperless.serialisers import UserSerializer
from paperless.views import StandardPagination
from paperless_ai.ai_classifier import get_ai_document_classification
from paperless_ai.chat import stream_chat_with_documents
from paperless_ai.matching import extract_unmatched_names
from paperless_ai.matching import match_correspondents_by_name
from paperless_ai.matching import match_document_types_by_name
from paperless_ai.matching import match_storage_paths_by_name
from paperless_ai.matching import match_tags_by_name
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
from paperless_mail.oauth import PaperlessMailOAuth2Manager
@@ -934,37 +947,103 @@ class DocumentViewSet(
        ):
            return HttpResponseForbidden("Insufficient permissions")

        ai_config = AIConfig()

        if ai_config.ai_enabled:
            cached_llm_suggestions = get_llm_suggestion_cache(
                doc.pk,
                backend=ai_config.llm_backend,
            )

            if cached_llm_suggestions:
                refresh_suggestions_cache(doc.pk)
                return Response(cached_llm_suggestions.suggestions)

            llm_suggestions = get_ai_document_classification(doc, request.user)

            matched_tags = match_tags_by_name(
                llm_suggestions.get("tags", []),
                request.user,
            )
            matched_correspondents = match_correspondents_by_name(
                llm_suggestions.get("correspondents", []),
                request.user,
            )
            matched_types = match_document_types_by_name(
                llm_suggestions.get("document_types", []),
                request.user,
            )
            matched_paths = match_storage_paths_by_name(
                llm_suggestions.get("storage_paths", []),
                request.user,
            )

            resp_data = {
                "title": llm_suggestions.get("title"),
                "tags": [t.id for t in matched_tags],
                "suggested_tags": extract_unmatched_names(
                    llm_suggestions.get("tags", []),
                    matched_tags,
                ),
                "correspondents": [c.id for c in matched_correspondents],
                "suggested_correspondents": extract_unmatched_names(
                    llm_suggestions.get("correspondents", []),
                    matched_correspondents,
                ),
                "document_types": [d.id for d in matched_types],
                "suggested_document_types": extract_unmatched_names(
                    llm_suggestions.get("document_types", []),
                    matched_types,
                ),
                "storage_paths": [s.id for s in matched_paths],
                "suggested_storage_paths": extract_unmatched_names(
                    llm_suggestions.get("storage_paths", []),
                    matched_paths,
                ),
                "dates": llm_suggestions.get("dates", []),
            }

            set_llm_suggestions_cache(doc.pk, resp_data, backend=ai_config.llm_backend)
        else:
            document_suggestions = get_suggestion_cache(doc.pk)

            if document_suggestions is not None:
                refresh_suggestions_cache(doc.pk)
                return Response(document_suggestions.suggestions)

            classifier = load_classifier()

            dates = []
            if settings.NUMBER_OF_SUGGESTED_DATES > 0:
                gen = parse_date_generator(doc.filename, doc.content)
                dates = sorted(
                    {
                        i
                        for i in itertools.islice(
                            gen,
                            settings.NUMBER_OF_SUGGESTED_DATES,
                        )
                    },
                )

            resp_data = {
                "correspondents": [
                    c.id for c in match_correspondents(doc, classifier, request.user)
                ],
                "tags": [t.id for t in match_tags(doc, classifier, request.user)],
                "document_types": [
                    dt.id for dt in match_document_types(doc, classifier, request.user)
                ],
                "storage_paths": [
                    dt.id for dt in match_storage_paths(doc, classifier, request.user)
                ],
                "dates": [
                    date.strftime("%Y-%m-%d") for date in dates if date is not None
                ],
            }

            # Cache the suggestions and the classifier hash for later
            set_suggestions_cache(doc.pk, resp_data, classifier)

        return Response(resp_data)
@@ -1288,6 +1367,59 @@ class DocumentViewSet(
        )


class ChatStreamingSerializer(serializers.Serializer):
    q = serializers.CharField(required=True)
    document_id = serializers.IntegerField(required=False, allow_null=True)


@method_decorator(
    [
        ensure_csrf_cookie,
        cache_control(no_cache=True),
    ],
    name="dispatch",
)
class ChatStreamingView(GenericAPIView):
    permission_classes = (IsAuthenticated,)
    serializer_class = ChatStreamingSerializer

    def post(self, request, *args, **kwargs):
        request.compress_exempt = True
        ai_config = AIConfig()
        if not ai_config.ai_enabled:
            return HttpResponseBadRequest("AI is required for this feature")

        try:
            question = request.data["q"]
        except KeyError:
            return HttpResponseBadRequest("Invalid request")

        doc_id = request.data.get("document_id")

        if doc_id:
            try:
                document = Document.objects.get(id=doc_id)
            except Document.DoesNotExist:
                return HttpResponseBadRequest("Document not found")

            if not has_perms_owner_aware(request.user, "view_document", document):
                return HttpResponseForbidden("Insufficient permissions")

            documents = [document]
        else:
            documents = get_objects_for_user_owner_aware(
                request.user,
                "view_document",
                Document,
            )

        response = StreamingHttpResponse(
            stream_chat_with_documents(query_str=question, documents=documents),
            content_type="text/event-stream",
        )
        return response
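A hedged sketch of consuming the endpoint from Python (URL and payload shape taken from the tests earlier in this diff; session authentication and CSRF handling are omitted for brevity):

```python
import requests

# Illustrative client: stream the server-sent response as it arrives.
with requests.post(
    "http://localhost:8000/api/documents/chat/",
    json={"q": "What is this document about?", "document_id": 123},  # id hypothetical
    stream=True,
) as resp:
    for chunk in resp.iter_content(chunk_size=None):
        print(chunk.decode(), end="")
```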
@extend_schema_view(
    list=extend_schema(
        description="Document views including search",
@@ -2446,6 +2578,10 @@ class UiSettingsView(GenericAPIView):

        ui_settings["email_enabled"] = settings.EMAIL_ENABLED

        ai_config = AIConfig()

        ui_settings["ai_enabled"] = ai_config.ai_enabled

        user_resp = {
            "id": user.id,
            "username": user.username,
@@ -2587,6 +2723,10 @@ class TasksViewSet(ReadOnlyModelViewSet):
            sanity_check,
            {"scheduled": False, "raise_on_error": False},
        ),
        PaperlessTask.TaskName.LLMINDEX_UPDATE: (
            llmindex_index,
            {"scheduled": False, "rebuild": False},
        ),
    }

    def get_queryset(self):
@@ -3106,6 +3246,31 @@ class SystemStatusView(PassUserMixin):
            last_sanity_check.date_done if last_sanity_check else None
        )

        ai_config = AIConfig()
        if not ai_config.llm_index_enabled:
            llmindex_status = "DISABLED"
            llmindex_error = None
            llmindex_last_modified = None
        else:
            last_llmindex_update = (
                PaperlessTask.objects.filter(
                    task_name=PaperlessTask.TaskName.LLMINDEX_UPDATE,
                )
                .order_by("-date_done")
                .first()
            )
            llmindex_status = "OK"
            llmindex_error = None
            if last_llmindex_update is None:
                llmindex_status = "WARNING"
                llmindex_error = "No LLM index update tasks found"
            elif last_llmindex_update.status == states.FAILURE:
                llmindex_status = "ERROR"
                llmindex_error = last_llmindex_update.result
            llmindex_last_modified = (
                last_llmindex_update.date_done if last_llmindex_update else None
            )

        return Response(
            {
                "pngx_version": current_version,
@@ -3143,6 +3308,9 @@ class SystemStatusView(PassUserMixin):
                    "sanity_check_status": sanity_check_status,
                    "sanity_check_last_run": sanity_check_last_run,
                    "sanity_check_error": sanity_check_error,
                    "llmindex_status": llmindex_status,
                    "llmindex_last_modified": llmindex_last_modified,
                    "llmindex_error": llmindex_error,
                },
            },
        )
@@ -169,3 +169,37 @@ class GeneralConfig(BaseConfig):

        self.app_title = app_config.app_title or None
        self.app_logo = app_config.app_logo.url if app_config.app_logo else None


@dataclasses.dataclass
class AIConfig(BaseConfig):
    """
    AI related settings that require global scope
    """

    ai_enabled: bool = dataclasses.field(init=False)
    llm_embedding_backend: str = dataclasses.field(init=False)
    llm_embedding_model: str = dataclasses.field(init=False)
    llm_backend: str = dataclasses.field(init=False)
    llm_model: str = dataclasses.field(init=False)
    llm_api_key: str = dataclasses.field(init=False)
    llm_endpoint: str = dataclasses.field(init=False)

    def __post_init__(self) -> None:
        app_config = self._get_config_instance()

        self.ai_enabled = app_config.ai_enabled or settings.AI_ENABLED
        self.llm_embedding_backend = (
            app_config.llm_embedding_backend or settings.LLM_EMBEDDING_BACKEND
        )
        self.llm_embedding_model = (
            app_config.llm_embedding_model or settings.LLM_EMBEDDING_MODEL
        )
        self.llm_backend = app_config.llm_backend or settings.LLM_BACKEND
        self.llm_model = app_config.llm_model or settings.LLM_MODEL
        self.llm_api_key = app_config.llm_api_key or settings.LLM_API_KEY
        self.llm_endpoint = app_config.llm_endpoint or settings.LLM_ENDPOINT

    @property
    def llm_index_enabled(self) -> bool:
        return bool(self.ai_enabled and self.llm_embedding_backend)
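In short: each field prefers the database-backed `ApplicationConfiguration` value and falls back to the corresponding environment setting. A minimal usage sketch:

```python
from paperless.config import AIConfig

config = AIConfig()  # resolves DB value or settings fallback per field
if config.llm_index_enabled:  # True only with ai_enabled AND an embedding backend
    print(config.llm_backend, config.llm_embedding_backend)
```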
@@ -0,0 +1,84 @@
# Generated by Django 5.2.6 on 2025-09-30 17:43

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("paperless", "0004_applicationconfiguration_barcode_asn_prefix_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="applicationconfiguration",
            name="ai_enabled",
            field=models.BooleanField(
                default=False,
                null=True,
                verbose_name="Enables AI features",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_api_key",
            field=models.CharField(
                blank=True,
                max_length=1024,
                null=True,
                verbose_name="Sets the LLM API key",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_backend",
            field=models.CharField(
                blank=True,
                choices=[("openai", "OpenAI"), ("ollama", "Ollama")],
                max_length=128,
                null=True,
                verbose_name="Sets the LLM backend",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_embedding_backend",
            field=models.CharField(
                blank=True,
                choices=[("openai", "OpenAI"), ("huggingface", "Huggingface")],
                max_length=128,
                null=True,
                verbose_name="Sets the LLM embedding backend",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_embedding_model",
            field=models.CharField(
                blank=True,
                max_length=128,
                null=True,
                verbose_name="Sets the LLM embedding model",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_endpoint",
            field=models.CharField(
                blank=True,
                max_length=256,
                null=True,
                verbose_name="Sets the LLM endpoint, optional",
            ),
        ),
        migrations.AddField(
            model_name="applicationconfiguration",
            name="llm_model",
            field=models.CharField(
                blank=True,
                max_length=128,
                null=True,
                verbose_name="Sets the LLM model",
            ),
        ),
    ]
@@ -74,6 +74,20 @@ class ColorConvertChoices(models.TextChoices):
    CMYK = ("CMYK", _("CMYK"))


class LLMEmbeddingBackend(models.TextChoices):
    OPENAI = ("openai", _("OpenAI"))
    HUGGINGFACE = ("huggingface", _("Huggingface"))


class LLMBackend(models.TextChoices):
    """
    Matches to --llm-backend
    """

    OPENAI = ("openai", _("OpenAI"))
    OLLAMA = ("ollama", _("Ollama"))


class ApplicationConfiguration(AbstractSingletonModel):
    """
    Settings which are common across more than 1 parser
@@ -265,6 +279,60 @@ class ApplicationConfiguration(AbstractSingletonModel):
        null=True,
    )

    """
    AI related settings
    """

    ai_enabled = models.BooleanField(
        verbose_name=_("Enables AI features"),
        null=True,
        default=False,
    )

    llm_embedding_backend = models.CharField(
        verbose_name=_("Sets the LLM embedding backend"),
        blank=True,
        null=True,
        max_length=128,
        choices=LLMEmbeddingBackend.choices,
    )

    llm_embedding_model = models.CharField(
        verbose_name=_("Sets the LLM embedding model"),
        blank=True,
        null=True,
        max_length=128,
    )

    llm_backend = models.CharField(
        verbose_name=_("Sets the LLM backend"),
        blank=True,
        null=True,
        max_length=128,
        choices=LLMBackend.choices,
    )

    llm_model = models.CharField(
        verbose_name=_("Sets the LLM model"),
        blank=True,
        null=True,
        max_length=128,
    )

    llm_api_key = models.CharField(
        verbose_name=_("Sets the LLM API key"),
        blank=True,
        null=True,
        max_length=1024,
    )

    llm_endpoint = models.CharField(
        verbose_name=_("Sets the LLM endpoint, optional"),
        blank=True,
        null=True,
        max_length=256,
    )

    class Meta:
        verbose_name = _("paperless application settings")
@@ -206,6 +206,10 @@ class ProfileSerializer(PasswordValidationMixin, serializers.ModelSerializer):
class ApplicationConfigurationSerializer(serializers.ModelSerializer):
    user_args = serializers.JSONField(binary=True, allow_null=True)
    barcode_tag_mapping = serializers.JSONField(binary=True, allow_null=True)
    llm_api_key = ObfuscatedPasswordField(
        required=False,
        allow_null=True,
    )

    def run_validation(self, data):
        # Empty strings treated as None to avoid unexpected behavior
@@ -215,6 +219,11 @@ class ApplicationConfigurationSerializer(serializers.ModelSerializer):
            data["barcode_tag_mapping"] = None
        if "language" in data and data["language"] == "":
            data["language"] = None
        if "llm_api_key" in data and data["llm_api_key"] is not None:
            if data["llm_api_key"] == "":
                data["llm_api_key"] = None
            elif len(data["llm_api_key"].replace("*", "")) == 0:
                del data["llm_api_key"]
        return super().run_validation(data)

    def update(self, instance, validated_data):
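The API-key handling above yields three distinct PATCH behaviors, matching `test_update_llm_api_key`: an empty string clears the stored key, an all-asterisk value (the obfuscated placeholder the API echoes back) is dropped so the stored key survives a round-trip, and any other string replaces it. Illustrative payloads:

```python
# Hypothetical request bodies for a PATCH to the config endpoint
# (behavior per run_validation above):
clears_key = {"llm_api_key": ""}          # stored key becomes None
keeps_key = {"llm_api_key": "*" * 32}     # placeholder: field removed, key unchanged
replaces_key = {"llm_api_key": "sk-new"}  # hypothetical new key is stored
```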
@@ -12,6 +12,7 @@ from typing import Final
from urllib.parse import urlparse

from celery.schedules import crontab
from compression_middleware.middleware import CompressionMiddleware
from dateparser.languages.loader import LocaleDataLoader
from django.utils.translation import gettext_lazy as _
from dotenv import load_dotenv
@@ -229,6 +230,17 @@ def _parse_beat_schedule() -> dict:
                "expires": 59.0 * 60.0,
            },
        },
        {
            "name": "Rebuild LLM index",
            "env_key": "PAPERLESS_LLM_INDEX_TASK_CRON",
            # Default daily at 02:10
            "env_default": "10 2 * * *",
            "task": "documents.tasks.llmindex_index",
            "options": {
                # 1 hour before default schedule sends again
                "expires": 23.0 * 60.0 * 60.0,
            },
        },
    ]
    for task in tasks:
        # Either get the environment setting or use the default
@@ -287,6 +299,7 @@ MODEL_FILE = __get_path(
    "PAPERLESS_MODEL_FILE",
    DATA_DIR / "classification_model.pickle",
)
LLM_INDEX_DIR = DATA_DIR / "llm_index"

LOGGING_DIR = __get_path("PAPERLESS_LOGGING_DIR", DATA_DIR / "log")

@@ -380,6 +393,19 @@ MIDDLEWARE = [
if __get_boolean("PAPERLESS_ENABLE_COMPRESSION", "yes"):  # pragma: no cover
    MIDDLEWARE.insert(0, "compression_middleware.middleware.CompressionMiddleware")

# Workaround to not compress streaming responses (e.g. chat).
# See https://github.com/friedelwolff/django-compression-middleware/pull/7
original_process_response = CompressionMiddleware.process_response


def patched_process_response(self, request, response):
    if getattr(request, "compress_exempt", False):
        return response
    return original_process_response(self, request, response)


CompressionMiddleware.process_response = patched_process_response

ROOT_URLCONF = "paperless.urls"

@@ -585,6 +611,10 @@ X_FRAME_OPTIONS = "SAMEORIGIN"
# The next 3 settings can also be set using just PAPERLESS_URL
CSRF_TRUSTED_ORIGINS = __get_list("PAPERLESS_CSRF_TRUSTED_ORIGINS")

if DEBUG:
    # Allow access from the angular development server during debugging
    CSRF_TRUSTED_ORIGINS.append("http://localhost:4200")

# We allow CORS from localhost:8000
CORS_ALLOWED_ORIGINS = __get_list(
    "PAPERLESS_CORS_ALLOWED_HOSTS",
@@ -595,6 +625,8 @@ if DEBUG:
    # Allow access from the angular development server during debugging
    CORS_ALLOWED_ORIGINS.append("http://localhost:4200")

CORS_ALLOW_CREDENTIALS = True

CORS_EXPOSE_HEADERS = [
    "Content-Disposition",
]
@@ -868,6 +900,7 @@ LOGGING = {
    "loggers": {
        "paperless": {"handlers": ["file_paperless"], "level": "DEBUG"},
        "paperless_mail": {"handlers": ["file_mail"], "level": "DEBUG"},
        "paperless_ai": {"handlers": ["file_paperless"], "level": "DEBUG"},
        "ocrmypdf": {"handlers": ["file_paperless"], "level": "INFO"},
        "celery": {"handlers": ["file_celery"], "level": "DEBUG"},
        "kombu": {"handlers": ["file_celery"], "level": "DEBUG"},
@@ -1404,3 +1437,16 @@ WEBHOOKS_ALLOW_INTERNAL_REQUESTS = __get_boolean(
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")

################################################################################
# AI Settings                                                                  #
################################################################################
AI_ENABLED = __get_boolean("PAPERLESS_AI_ENABLED", "NO")
LLM_EMBEDDING_BACKEND = os.getenv(
    "PAPERLESS_AI_LLM_EMBEDDING_BACKEND",
)  # "huggingface" or "openai"
LLM_EMBEDDING_MODEL = os.getenv("PAPERLESS_AI_LLM_EMBEDDING_MODEL")
LLM_BACKEND = os.getenv("PAPERLESS_AI_LLM_BACKEND")  # "ollama" or "openai"
LLM_MODEL = os.getenv("PAPERLESS_AI_LLM_MODEL")
LLM_API_KEY = os.getenv("PAPERLESS_AI_LLM_API_KEY")
LLM_ENDPOINT = os.getenv("PAPERLESS_AI_LLM_ENDPOINT")
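For example, setting `PAPERLESS_AI_ENABLED=true`, `PAPERLESS_AI_LLM_BACKEND=ollama`, and `PAPERLESS_AI_LLM_MODEL=llama3` (model name illustrative) enables the chat and suggestion features, while additionally setting `PAPERLESS_AI_LLM_EMBEDDING_BACKEND=huggingface` turns on the vector index, since `AIConfig.llm_index_enabled` requires both `ai_enabled` and an embedding backend.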
@@ -160,6 +160,7 @@ class TestCeleryScheduleParsing(TestCase):
    SANITY_EXPIRE_TIME = ((7.0 * 24.0) - 1.0) * 60.0 * 60.0
    EMPTY_TRASH_EXPIRE_TIME = 23.0 * 60.0 * 60.0
    RUN_SCHEDULED_WORKFLOWS_EXPIRE_TIME = 59.0 * 60.0
    LLM_INDEX_EXPIRE_TIME = 23.0 * 60.0 * 60.0

    def test_schedule_configuration_default(self):
        """
@@ -204,6 +205,13 @@ class TestCeleryScheduleParsing(TestCase):
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.RUN_SCHEDULED_WORKFLOWS_EXPIRE_TIME},
                },
                "Rebuild LLM index": {
                    "task": "documents.tasks.llmindex_index",
                    "schedule": crontab(minute=10, hour=2),
                    "options": {
                        "expires": self.LLM_INDEX_EXPIRE_TIME,
                    },
                },
            },
            schedule,
        )
@@ -256,6 +264,13 @@ class TestCeleryScheduleParsing(TestCase):
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.RUN_SCHEDULED_WORKFLOWS_EXPIRE_TIME},
                },
                "Rebuild LLM index": {
                    "task": "documents.tasks.llmindex_index",
                    "schedule": crontab(minute=10, hour=2),
                    "options": {
                        "expires": self.LLM_INDEX_EXPIRE_TIME,
                    },
                },
            },
            schedule,
        )
@@ -300,6 +315,13 @@ class TestCeleryScheduleParsing(TestCase):
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.RUN_SCHEDULED_WORKFLOWS_EXPIRE_TIME},
                },
                "Rebuild LLM index": {
                    "task": "documents.tasks.llmindex_index",
                    "schedule": crontab(minute=10, hour=2),
                    "options": {
                        "expires": self.LLM_INDEX_EXPIRE_TIME,
                    },
                },
            },
            schedule,
        )
@@ -322,6 +344,7 @@ class TestCeleryScheduleParsing(TestCase):
                "PAPERLESS_INDEX_TASK_CRON": "disable",
                "PAPERLESS_EMPTY_TRASH_TASK_CRON": "disable",
                "PAPERLESS_WORKFLOW_SCHEDULED_TASK_CRON": "disable",
                "PAPERLESS_LLM_INDEX_TASK_CRON": "disable",
            },
        ):
            schedule = _parse_beat_schedule()
@@ -18,6 +18,7 @@ from rest_framework.routers import DefaultRouter
from documents.views import BulkDownloadView
from documents.views import BulkEditObjectsView
from documents.views import BulkEditView
from documents.views import ChatStreamingView
from documents.views import CorrespondentViewSet
from documents.views import CustomFieldViewSet
from documents.views import DocumentTypeViewSet
@@ -139,6 +140,11 @@ urlpatterns = [
                    SelectionDataView.as_view(),
                    name="selection_data",
                ),
                re_path(
                    "^chat/",
                    ChatStreamingView.as_view(),
                    name="chat_streaming_view",
                ),
            ],
        ),
    ),
@@ -35,6 +35,7 @@ from rest_framework.viewsets import ModelViewSet

from documents.index import DelayedQuery
from documents.permissions import PaperlessObjectPermissions
from documents.tasks import llmindex_index
from paperless.filters import GroupFilterSet
from paperless.filters import UserFilterSet
from paperless.models import ApplicationConfiguration
@@ -43,6 +44,7 @@ from paperless.serialisers import GroupSerializer
from paperless.serialisers import PaperlessAuthTokenSerializer
from paperless.serialisers import ProfileSerializer
from paperless.serialisers import UserSerializer
from paperless_ai.indexing import vector_store_file_exists


class PaperlessObtainAuthTokenView(ObtainAuthToken):
@@ -358,6 +360,30 @@ class ApplicationConfigurationViewSet(ModelViewSet):
    def create(self, request, *args, **kwargs):
        return Response(status=405)  # Not Allowed

    def perform_update(self, serializer):
        old_instance = ApplicationConfiguration.objects.all().first()
        old_ai_index_enabled = (
            old_instance.ai_enabled and old_instance.llm_embedding_backend
        )

        new_instance: ApplicationConfiguration = serializer.save()
        new_ai_index_enabled = (
            new_instance.ai_enabled and new_instance.llm_embedding_backend
        )

        if (
            not old_ai_index_enabled
            and new_ai_index_enabled
            and not vector_store_file_exists()
        ):
            # AI index was just enabled and vector store file does not exist
            llmindex_index.delay(
                progress_bar_disable=True,
                rebuild=True,
                scheduled=False,
                auto=True,
            )


@extend_schema_view(
    post=extend_schema(
0
src/paperless_ai/__init__.py
Normal file
102
src/paperless_ai/ai_classifier.py
Normal file
@@ -0,0 +1,102 @@
import logging

from django.contrib.auth.models import User

from documents.models import Document
from documents.permissions import get_objects_for_user_owner_aware
from paperless.config import AIConfig
from paperless_ai.client import AIClient
from paperless_ai.indexing import query_similar_documents
from paperless_ai.indexing import truncate_content

logger = logging.getLogger("paperless_ai.rag_classifier")


def build_prompt_without_rag(document: Document) -> str:
    filename = document.filename or ""
    # Guard against None content before slicing
    content = truncate_content((document.content or "")[:4000])

    return f"""
You are a document classification assistant.

Analyze the following document and extract the following information:
- A short descriptive title
- Tags that reflect the content
- Names of people or organizations mentioned
- The type or category of the document
- Suggested folder paths for storing the document
- Up to 3 relevant dates in YYYY-MM-DD format

Filename:
{filename}

Content:
{content}
""".strip()


def build_prompt_with_rag(document: Document, user: User | None = None) -> str:
    base_prompt = build_prompt_without_rag(document)
    context = truncate_content(get_context_for_document(document, user))

    return f"""{base_prompt}

Additional context from similar documents:
{context}
""".strip()


def get_context_for_document(
    doc: Document,
    user: User | None = None,
    max_docs: int = 5,
) -> str:
    visible_documents = (
        get_objects_for_user_owner_aware(
            user,
            "view_document",
            Document,
        )
        if user
        else None
    )
    similar_docs = query_similar_documents(
        document=doc,
        document_ids=[document.pk for document in visible_documents]
        if visible_documents
        else None,
    )[:max_docs]
    context_blocks = []
    for similar in similar_docs:
        text = (similar.content or "")[:1000]
        title = similar.title or similar.filename or "Untitled"
        context_blocks.append(f"TITLE: {title}\n{text}")
    return "\n\n".join(context_blocks)


def parse_ai_response(raw: dict) -> dict:
    return {
        "title": raw.get("title", ""),
        "tags": raw.get("tags", []),
        "correspondents": raw.get("correspondents", []),
        "document_types": raw.get("document_types", []),
        "storage_paths": raw.get("storage_paths", []),
        "dates": raw.get("dates", []),
    }


def get_ai_document_classification(
    document: Document,
    user: User | None = None,
) -> dict:
    ai_config = AIConfig()

    prompt = (
        build_prompt_with_rag(document, user)
        if ai_config.llm_embedding_backend
        else build_prompt_without_rag(document)
    )

    client = AIClient()
    result = client.run_llm_query(prompt)
    return parse_ai_response(result)
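A minimal usage sketch for the classifier above; `user` is assumed to be a request user, and mapping the raw string suggestions back to real objects is handled by paperless_ai.matching, which appears later in this diff:

# Sketch only, not part of the diff: classify one document, then
# fuzzy-match the suggested tag names against existing Tag objects.
from documents.models import Document
from paperless_ai.ai_classifier import get_ai_document_classification
from paperless_ai.matching import match_tags_by_name

doc = Document.objects.first()
suggestions = get_ai_document_classification(doc, user)  # `user` assumed
matched_tags = match_tags_by_name(suggestions["tags"], user)
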
10
src/paperless_ai/base_model.py
Normal file
@@ -0,0 +1,10 @@
from llama_index.core.bridge.pydantic import BaseModel


class DocumentClassifierSchema(BaseModel):
    title: str
    tags: list[str]
    correspondents: list[str]
    document_types: list[str]
    storage_paths: list[str]
    dates: list[str]
105
src/paperless_ai/chat.py
Normal file
@@ -0,0 +1,105 @@
import logging
import sys

from llama_index.core import VectorStoreIndex
from llama_index.core.prompts import PromptTemplate
from llama_index.core.query_engine import RetrieverQueryEngine

from documents.models import Document
from paperless_ai.client import AIClient
from paperless_ai.indexing import load_or_build_index

logger = logging.getLogger("paperless_ai.chat")

MAX_SINGLE_DOC_CONTEXT_CHARS = 15000
SINGLE_DOC_SNIPPET_CHARS = 800

CHAT_PROMPT_TMPL = PromptTemplate(
    template="""Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer:""",
)


def stream_chat_with_documents(query_str: str, documents: list[Document]):
    client = AIClient()
    index = load_or_build_index()

    doc_ids = [str(doc.pk) for doc in documents]

    # Filter only the node(s) that match the document IDs
    nodes = [
        node
        for node in index.docstore.docs.values()
        if node.metadata.get("document_id") in doc_ids
    ]

    if len(nodes) == 0:
        logger.warning("No nodes found for the given documents.")
        yield "Sorry, I couldn't find any content to answer your question."
        return

    local_index = VectorStoreIndex(nodes=nodes)
    retriever = local_index.as_retriever(
        similarity_top_k=3 if len(documents) == 1 else 5,
    )

    if len(documents) == 1:
        # Just one doc: provide full content
        doc = documents[0]
        # TODO: include document metadata in the context
        content = doc.content or ""
        context_body = content

        if len(content) > MAX_SINGLE_DOC_CONTEXT_CHARS:
            logger.info(
                "Truncating single-document context from %s to %s characters",
                len(content),
                MAX_SINGLE_DOC_CONTEXT_CHARS,
            )
            context_body = content[:MAX_SINGLE_DOC_CONTEXT_CHARS]

        top_nodes = retriever.retrieve(query_str)
        if len(top_nodes) > 0:
            snippets = "\n\n".join(
                f"TITLE: {node.metadata.get('title')}\n{node.text[:SINGLE_DOC_SNIPPET_CHARS]}"
                for node in top_nodes
            )
            context_body = f"{context_body}\n\nTOP MATCHES:\n{snippets}"

        context = f"TITLE: {doc.title or doc.filename}\n{context_body}"
    else:
        top_nodes = retriever.retrieve(query_str)

        if len(top_nodes) == 0:
            logger.warning("Retriever returned no nodes for the given documents.")
            yield "Sorry, I couldn't find any content to answer your question."
            return

        context = "\n\n".join(
            f"TITLE: {node.metadata.get('title')}\n{node.text[:SINGLE_DOC_SNIPPET_CHARS]}"
            for node in top_nodes
        )

    prompt = CHAT_PROMPT_TMPL.partial_format(
        context_str=context,
        query_str=query_str,
    ).format(llm=client.llm)

    query_engine = RetrieverQueryEngine.from_args(
        retriever=retriever,
        llm=client.llm,
        streaming=True,
    )

    logger.debug("Document chat prompt: %s", prompt)

    response_stream = query_engine.query(prompt)

    for chunk in response_stream.response_gen:
        yield chunk
        sys.stdout.flush()
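stream_chat_with_documents is a generator, so callers can forward chunks as they arrive; a minimal sketch of draining it directly (the feature itself streams it through ChatStreamingView):

# Sketch only: collect the streamed answer for two documents.
from documents.models import Document
from paperless_ai.chat import stream_chat_with_documents

docs = list(Document.objects.all()[:2])
answer = "".join(stream_chat_with_documents("What are these documents about?", docs))
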
69
src/paperless_ai/client.py
Normal file
@@ -0,0 +1,69 @@
import logging

from llama_index.core.llms import ChatMessage
from llama_index.core.program.function_program import get_function_tool
from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI

from paperless.config import AIConfig
from paperless_ai.base_model import DocumentClassifierSchema

logger = logging.getLogger("paperless_ai.client")


class AIClient:
    """
    A client for interacting with an LLM backend.
    """

    def __init__(self):
        self.settings = AIConfig()
        self.llm = self.get_llm()

    def get_llm(self) -> Ollama | OpenAI:
        if self.settings.llm_backend == "ollama":
            return Ollama(
                model=self.settings.llm_model or "llama3",
                base_url=self.settings.llm_endpoint or "http://localhost:11434",
                request_timeout=120,
            )
        elif self.settings.llm_backend == "openai":
            return OpenAI(
                model=self.settings.llm_model or "gpt-3.5-turbo",
                api_base=self.settings.llm_endpoint or None,
                api_key=self.settings.llm_api_key,
            )
        else:
            raise ValueError(f"Unsupported LLM backend: {self.settings.llm_backend}")

    def run_llm_query(self, prompt: str) -> dict:
        logger.debug(
            "Running LLM query against %s with model %s",
            self.settings.llm_backend,
            self.settings.llm_model,
        )

        user_msg = ChatMessage(role="user", content=prompt)
        tool = get_function_tool(DocumentClassifierSchema)
        result = self.llm.chat_with_tools(
            tools=[tool],
            user_msg=user_msg,
            chat_history=[],
        )
        tool_calls = self.llm.get_tool_calls_from_response(
            result,
            error_on_no_tool_calls=True,
        )
        logger.debug("LLM query result: %s", tool_calls)
        parsed = DocumentClassifierSchema(**tool_calls[0].tool_kwargs)
        return parsed.model_dump()

    def run_chat(self, messages: list[ChatMessage]) -> str:
        logger.debug(
            "Running chat query against %s with model %s",
            self.settings.llm_backend,
            self.settings.llm_model,
        )
        result = self.llm.chat(messages)
        logger.debug("Chat result: %s", result)
        return result
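The structured-output flow above works by converting DocumentClassifierSchema into a function tool and requiring a tool call, so the model's answer always arrives as schema-shaped arguments. A minimal calling sketch, assuming an Ollama backend has been configured through AIConfig:

# Sketch only: one classification query; the result is the schema as a dict.
from paperless_ai.client import AIClient

client = AIClient()
result = client.run_llm_query("Classify this document: ...")
print(result["title"], result["tags"])
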
92
src/paperless_ai/embedding.py
Normal file
@@ -0,0 +1,92 @@
import json
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from pathlib import Path

from django.conf import settings
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

from documents.models import Document
from documents.models import Note
from paperless.config import AIConfig
from paperless.models import LLMEmbeddingBackend


def get_embedding_model() -> BaseEmbedding:
    config = AIConfig()

    match config.llm_embedding_backend:
        case LLMEmbeddingBackend.OPENAI:
            return OpenAIEmbedding(
                model=config.llm_embedding_model or "text-embedding-3-small",
                api_key=config.llm_api_key,
            )
        case LLMEmbeddingBackend.HUGGINGFACE:
            return HuggingFaceEmbedding(
                model_name=config.llm_embedding_model
                or "sentence-transformers/all-MiniLM-L6-v2",
            )
        case _:
            raise ValueError(
                f"Unsupported embedding backend: {config.llm_embedding_backend}",
            )


def get_embedding_dim() -> int:
    """
    Loads embedding dimension from meta.json if available, otherwise infers it
    from a dummy embedding and stores it for future use.
    """
    config = AIConfig()
    model = config.llm_embedding_model or (
        "text-embedding-3-small"
        if config.llm_embedding_backend == "openai"
        else "sentence-transformers/all-MiniLM-L6-v2"
    )

    meta_path: Path = settings.LLM_INDEX_DIR / "meta.json"
    if meta_path.exists():
        with meta_path.open() as f:
            meta = json.load(f)
        if meta.get("embedding_model") != model:
            raise RuntimeError(
                f"Embedding model changed from {meta.get('embedding_model')} to {model}. "
                "You must rebuild the index.",
            )
        return meta["dim"]

    embedding_model = get_embedding_model()
    test_embed = embedding_model.get_text_embedding("test")
    dim = len(test_embed)

    with meta_path.open("w") as f:
        json.dump({"embedding_model": model, "dim": dim}, f)

    return dim


def build_llm_index_text(doc: Document) -> str:
    lines = [
        f"Title: {doc.title}",
        f"Filename: {doc.filename}",
        f"Created: {doc.created}",
        f"Added: {doc.added}",
        f"Modified: {doc.modified}",
        f"Tags: {', '.join(tag.name for tag in doc.tags.all())}",
        f"Document Type: {doc.document_type.name if doc.document_type else ''}",
        f"Correspondent: {doc.correspondent.name if doc.correspondent else ''}",
        f"Storage Path: {doc.storage_path.name if doc.storage_path else ''}",
        f"Archive Serial Number: {doc.archive_serial_number or ''}",
        f"Notes: {','.join([str(c.note) for c in Note.objects.filter(document=doc)])}",
    ]

    for instance in doc.custom_fields.all():
        lines.append(f"Custom Field - {instance.field.name}: {instance}")

    lines.append("\nContent:\n")
    lines.append(doc.content or "")

    return "\n".join(lines)
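For illustration, the meta.json sidecar written by get_embedding_dim holds only the model name and its dimension; with the HuggingFace default model (384 dimensions, matching the fake embedding used in the tests below) it would contain:

# Sketch of the persisted meta.json contents for the HuggingFace default.
meta = {"embedding_model": "sentence-transformers/all-MiniLM-L6-v2", "dim": 384}
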
283
src/paperless_ai/indexing.py
Normal file
@@ -0,0 +1,283 @@
import logging
import shutil
from pathlib import Path

import faiss
import llama_index.core.settings as llama_settings
import tqdm
from django.conf import settings
from llama_index.core import Document as LlamaDocument
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core import load_index_from_storage
from llama_index.core.indices.prompt_helper import PromptHelper
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.prompts import PromptTemplate
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.schema import BaseNode
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage.index_store import SimpleIndexStore
from llama_index.core.text_splitter import TokenTextSplitter
from llama_index.vector_stores.faiss import FaissVectorStore

from documents.models import Document
from paperless_ai.embedding import build_llm_index_text
from paperless_ai.embedding import get_embedding_dim
from paperless_ai.embedding import get_embedding_model

logger = logging.getLogger("paperless_ai.indexing")


def get_or_create_storage_context(*, rebuild=False):
    """
    Loads or creates the StorageContext (vector store, docstore, index store).
    If rebuild=True, deletes and recreates everything.
    """
    if rebuild:
        shutil.rmtree(settings.LLM_INDEX_DIR, ignore_errors=True)
        settings.LLM_INDEX_DIR.mkdir(parents=True, exist_ok=True)

    if rebuild or not settings.LLM_INDEX_DIR.exists():
        embedding_dim = get_embedding_dim()
        faiss_index = faiss.IndexFlatL2(embedding_dim)
        vector_store = FaissVectorStore(faiss_index=faiss_index)
        docstore = SimpleDocumentStore()
        index_store = SimpleIndexStore()
    else:
        vector_store = FaissVectorStore.from_persist_dir(settings.LLM_INDEX_DIR)
        docstore = SimpleDocumentStore.from_persist_dir(settings.LLM_INDEX_DIR)
        index_store = SimpleIndexStore.from_persist_dir(settings.LLM_INDEX_DIR)

    return StorageContext.from_defaults(
        docstore=docstore,
        index_store=index_store,
        vector_store=vector_store,
        persist_dir=settings.LLM_INDEX_DIR,
    )


def build_document_node(document: Document) -> list[BaseNode]:
    """
    Given a Document, returns parsed Nodes ready for indexing.
    """
    text = build_llm_index_text(document)
    metadata = {
        "document_id": str(document.id),
        "title": document.title,
        "tags": [t.name for t in document.tags.all()],
        "correspondent": document.correspondent.name
        if document.correspondent
        else None,
        "document_type": document.document_type.name
        if document.document_type
        else None,
        "created": document.created.isoformat() if document.created else None,
        "added": document.added.isoformat() if document.added else None,
        "modified": document.modified.isoformat(),
    }
    doc = LlamaDocument(text=text, metadata=metadata)
    parser = SimpleNodeParser()
    return parser.get_nodes_from_documents([doc])


def load_or_build_index(nodes=None):
    """
    Load an existing VectorStoreIndex if present,
    or build a new one using provided nodes if storage is empty.
    """
    embed_model = get_embedding_model()
    llama_settings.Settings.embed_model = embed_model
    storage_context = get_or_create_storage_context()
    try:
        return load_index_from_storage(storage_context=storage_context)
    except ValueError as e:
        logger.warning("Failed to load index from storage: %s", e)
        if not nodes:
            logger.info("No nodes provided for index creation.")
            raise
        return VectorStoreIndex(
            nodes=nodes,
            storage_context=storage_context,
            embed_model=embed_model,
        )


def remove_document_docstore_nodes(document: Document, index: VectorStoreIndex):
    """
    Removes the existing docstore nodes belonging to a given document from the index.
    This is necessary because FAISS IndexFlatL2 is append-only.
    """
    all_node_ids = list(index.docstore.docs.keys())
    existing_nodes = [
        node.node_id
        for node in index.docstore.get_nodes(all_node_ids)
        if node.metadata.get("document_id") == str(document.id)
    ]
    for node_id in existing_nodes:
        # Delete from docstore; FAISS IndexFlatL2 is append-only
        index.docstore.delete_document(node_id)


def vector_store_file_exists():
    """
    Check if the vector store file exists in the LLM index directory.
    """
    return Path(settings.LLM_INDEX_DIR / "default__vector_store.json").exists()


def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
    """
    Rebuild or update the LLM index.
    """
    nodes = []

    documents = Document.objects.all()
    if not documents.exists():
        msg = "No documents found to index."
        logger.warning(msg)
        return msg

    if rebuild or not vector_store_file_exists():
        # remove meta.json to force re-detection of embedding dim
        (settings.LLM_INDEX_DIR / "meta.json").unlink(missing_ok=True)
        # Rebuild index from scratch
        logger.info("Rebuilding LLM index.")
        embed_model = get_embedding_model()
        llama_settings.Settings.embed_model = embed_model
        storage_context = get_or_create_storage_context(rebuild=True)
        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
            document_nodes = build_document_node(document)
            nodes.extend(document_nodes)

        index = VectorStoreIndex(
            nodes=nodes,
            storage_context=storage_context,
            embed_model=embed_model,
            show_progress=not progress_bar_disable,
        )
        msg = "LLM index rebuilt successfully."
    else:
        # Update existing index
        index = load_or_build_index()
        all_node_ids = list(index.docstore.docs.keys())
        existing_nodes = {
            node.metadata.get("document_id"): node
            for node in index.docstore.get_nodes(all_node_ids)
        }

        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
            doc_id = str(document.id)
            document_modified = document.modified.isoformat()

            if doc_id in existing_nodes:
                node = existing_nodes[doc_id]
                node_modified = node.metadata.get("modified")

                if node_modified == document_modified:
                    continue

                # Again, delete from docstore; FAISS IndexFlatL2 is append-only
                index.docstore.delete_document(node.node_id)
                nodes.extend(build_document_node(document))
            else:
                # New document, add it
                nodes.extend(build_document_node(document))

        if nodes:
            msg = "LLM index updated successfully."
            logger.info(
                "Updating %d nodes in LLM index.",
                len(nodes),
            )
            index.insert_nodes(nodes)
        else:
            msg = "No changes detected in LLM index."
            logger.info(msg)

    index.storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
    return msg


def llm_index_add_or_update_document(document: Document):
    """
    Adds or updates a document in the LLM index.
    If the document already exists, it will be replaced.
    """
    new_nodes = build_document_node(document)

    index = load_or_build_index(nodes=new_nodes)

    remove_document_docstore_nodes(document, index)

    index.insert_nodes(new_nodes)

    index.storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)


def llm_index_remove_document(document: Document):
    """
    Removes a document from the LLM index.
    """
    index = load_or_build_index()

    remove_document_docstore_nodes(document, index)

    index.storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)


def truncate_content(content: str) -> str:
    prompt_helper = PromptHelper(
        context_window=8192,
        num_output=512,
        chunk_overlap_ratio=0.1,
        chunk_size_limit=None,
    )
    splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=50)
    content_chunks = splitter.split_text(content)
    truncated_chunks = prompt_helper.truncate(
        prompt=PromptTemplate(template="{content}"),
        text_chunks=content_chunks,
        padding=5,
    )
    return " ".join(truncated_chunks)


def query_similar_documents(
    document: Document,
    top_k: int = 5,
    document_ids: list[int] | None = None,
) -> list[Document]:
    """
    Runs a similarity query and returns top-k similar Document objects.
    """
    index = load_or_build_index()

    # constrain only the node(s) that match the document IDs, if given;
    # node metadata stores document IDs as strings, so compare as strings
    doc_node_ids = (
        [
            node.node_id
            for node in index.docstore.docs.values()
            if node.metadata.get("document_id") in {str(doc_id) for doc_id in document_ids}
        ]
        if document_ids
        else None
    )

    retriever = VectorIndexRetriever(
        index=index,
        similarity_top_k=top_k,
        doc_ids=doc_node_ids,
    )

    query_text = truncate_content(
        (document.title or "") + "\n" + (document.content or ""),
    )
    results = retriever.retrieve(query_text)

    document_ids = [
        int(node.metadata["document_id"])
        for node in results
        if "document_id" in node.metadata
    ]

    return list(Document.objects.filter(pk__in=document_ids))
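The two maintenance paths above differ in cost: a rebuild wipes LLM_INDEX_DIR and re-embeds every document, while an update re-embeds only documents whose modified timestamp no longer matches the node metadata. A minimal sketch:

# Sketch only: full rebuild vs. incremental update of the vector index.
from paperless_ai import indexing

indexing.update_llm_index(rebuild=True)   # recreate the FAISS store from scratch
indexing.update_llm_index(rebuild=False)  # touch only new or modified documents
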
102
src/paperless_ai/matching.py
Normal file
@@ -0,0 +1,102 @@
import difflib
import logging
import re

from django.contrib.auth.models import User

from documents.models import Correspondent
from documents.models import DocumentType
from documents.models import StoragePath
from documents.models import Tag
from documents.permissions import get_objects_for_user_owner_aware

MATCH_THRESHOLD = 0.8

logger = logging.getLogger("paperless_ai.matching")


def match_tags_by_name(names: list[str], user: User) -> list[Tag]:
    queryset = get_objects_for_user_owner_aware(
        user,
        ["view_tag"],
        Tag,
    )
    return _match_names_to_queryset(names, queryset, "name")


def match_correspondents_by_name(names: list[str], user: User) -> list[Correspondent]:
    queryset = get_objects_for_user_owner_aware(
        user,
        ["view_correspondent"],
        Correspondent,
    )
    return _match_names_to_queryset(names, queryset, "name")


def match_document_types_by_name(names: list[str], user: User) -> list[DocumentType]:
    queryset = get_objects_for_user_owner_aware(
        user,
        ["view_documenttype"],
        DocumentType,
    )
    return _match_names_to_queryset(names, queryset, "name")


def match_storage_paths_by_name(names: list[str], user: User) -> list[StoragePath]:
    queryset = get_objects_for_user_owner_aware(
        user,
        ["view_storagepath"],
        StoragePath,
    )
    return _match_names_to_queryset(names, queryset, "name")


def _normalize(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^\w\s]", "", s)  # remove punctuation
    s = s.strip()
    return s


def _match_names_to_queryset(names: list[str], queryset, attr: str):
    results = []
    objects = list(queryset)
    object_names = [_normalize(getattr(obj, attr)) for obj in objects]

    for name in names:
        if not name:
            continue
        target = _normalize(name)

        # First try exact match
        if target in object_names:
            index = object_names.index(target)
            matched = objects.pop(index)
            object_names.pop(index)  # keep object list aligned after removal
            results.append(matched)
            continue

        # Fuzzy match fallback
        matches = difflib.get_close_matches(
            target,
            object_names,
            n=1,
            cutoff=MATCH_THRESHOLD,
        )
        if matches:
            index = object_names.index(matches[0])
            matched = objects.pop(index)
            object_names.pop(index)
            results.append(matched)
    return results


def extract_unmatched_names(
    names: list[str],
    matched_objects: list,
    attr="name",
) -> list[str]:
    matched_names = {getattr(obj, attr).lower() for obj in matched_objects}
    return [name for name in names if name.lower() not in matched_names]
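A small illustration of the exact-then-fuzzy matching above, with hypothetical names: "Invoices" clears the 0.8 difflib cutoff against an existing "Invoice" tag, while an unrelated name falls through to extract_unmatched_names.

# Sketch only: `user` is assumed to be a request user with view_tag permission.
from paperless_ai.matching import extract_unmatched_names
from paperless_ai.matching import match_tags_by_name

names = ["Invoices", "Quarterly Budget"]
matched = match_tags_by_name(names, user)
unmatched = extract_unmatched_names(names, matched)
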
0
src/paperless_ai/tests/__init__.py
Normal file
186
src/paperless_ai/tests/test_ai_classifier.py
Normal file
@@ -0,0 +1,186 @@
import json
from unittest.mock import MagicMock
from unittest.mock import patch

import pytest
from django.test import override_settings

from documents.models import Document
from paperless_ai.ai_classifier import build_prompt_with_rag
from paperless_ai.ai_classifier import build_prompt_without_rag
from paperless_ai.ai_classifier import get_ai_document_classification
from paperless_ai.ai_classifier import get_context_for_document


@pytest.fixture
def mock_document():
    doc = MagicMock(spec=Document)
    doc.title = "Test Title"
    doc.filename = "test_file.pdf"
    doc.created = "2023-01-01"
    doc.added = "2023-01-02"
    doc.modified = "2023-01-03"

    tag1 = MagicMock()
    tag1.name = "Tag1"
    tag2 = MagicMock()
    tag2.name = "Tag2"
    doc.tags.all = MagicMock(return_value=[tag1, tag2])

    doc.document_type = MagicMock()
    doc.document_type.name = "Invoice"
    doc.correspondent = MagicMock()
    doc.correspondent.name = "Test Correspondent"
    doc.archive_serial_number = "12345"
    doc.content = "This is the document content."

    cf1 = MagicMock(__str__=lambda x: "Value1")
    cf1.field = MagicMock()
    cf1.field.name = "Field1"
    cf1.value = "Value1"
    cf2 = MagicMock(__str__=lambda x: "Value2")
    cf2.field = MagicMock()
    cf2.field.name = "Field2"
    cf2.value = "Value2"
    doc.custom_fields.all = MagicMock(return_value=[cf1, cf2])

    return doc


@pytest.fixture
def mock_similar_documents():
    doc1 = MagicMock()
    doc1.content = "Content of document 1"
    doc1.title = "Title 1"
    doc1.filename = "file1.txt"

    doc2 = MagicMock()
    doc2.content = "Content of document 2"
    doc2.title = None
    doc2.filename = "file2.txt"

    doc3 = MagicMock()
    doc3.content = None
    doc3.title = None
    doc3.filename = None

    return [doc1, doc2, doc3]


@pytest.mark.django_db
@patch("paperless_ai.client.AIClient.run_llm_query")
@override_settings(
    LLM_BACKEND="ollama",
    LLM_MODEL="some_model",
)
def test_get_ai_document_classification_success(mock_run_llm_query, mock_document):
    mock_run_llm_query.return_value = {
        "title": "Test Title",
        "tags": ["test", "document"],
        "correspondents": ["John Doe"],
        "document_types": ["report"],
        "storage_paths": ["Reports"],
        "dates": ["2023-01-01"],
    }

    result = get_ai_document_classification(mock_document)

    assert result["title"] == "Test Title"
    assert result["tags"] == ["test", "document"]
    assert result["correspondents"] == ["John Doe"]
    assert result["document_types"] == ["report"]
    assert result["storage_paths"] == ["Reports"]
    assert result["dates"] == ["2023-01-01"]


@pytest.mark.django_db
@patch("paperless_ai.client.AIClient.run_llm_query")
def test_get_ai_document_classification_failure(mock_run_llm_query, mock_document):
    mock_run_llm_query.side_effect = Exception("LLM query failed")

    # assert raises an exception
    with pytest.raises(Exception):
        get_ai_document_classification(mock_document)


@pytest.mark.django_db
@patch("paperless_ai.client.AIClient.run_llm_query")
@patch("paperless_ai.ai_classifier.build_prompt_with_rag")
@override_settings(
    LLM_EMBEDDING_BACKEND="huggingface",
    LLM_EMBEDDING_MODEL="some_model",
    LLM_BACKEND="ollama",
    LLM_MODEL="some_model",
)
def test_use_rag_if_configured(
    mock_build_prompt_with_rag,
    mock_run_llm_query,
    mock_document,
):
    mock_build_prompt_with_rag.return_value = "Prompt with RAG"
    mock_run_llm_query.return_value.text = json.dumps({})
    get_ai_document_classification(mock_document)
    mock_build_prompt_with_rag.assert_called_once()


@pytest.mark.django_db
@patch("paperless_ai.client.AIClient.run_llm_query")
@patch("paperless_ai.ai_classifier.build_prompt_without_rag")
@patch("paperless.config.AIConfig")
@override_settings(
    LLM_BACKEND="ollama",
    LLM_MODEL="some_model",
)
def test_use_without_rag_if_not_configured(
    mock_ai_config,
    mock_build_prompt_without_rag,
    mock_run_llm_query,
    mock_document,
):
    mock_ai_config.llm_embedding_backend = None
    mock_build_prompt_without_rag.return_value = "Prompt without RAG"
    mock_run_llm_query.return_value.text = json.dumps({})
    get_ai_document_classification(mock_document)
    mock_build_prompt_without_rag.assert_called_once()


@pytest.mark.django_db
@override_settings(
    LLM_EMBEDDING_BACKEND="huggingface",
    LLM_BACKEND="ollama",
    LLM_MODEL="some_model",
)
def test_prompt_with_without_rag(mock_document):
    with patch(
        "paperless_ai.ai_classifier.get_context_for_document",
        return_value="Context from similar documents",
    ):
        prompt = build_prompt_without_rag(mock_document)
        assert "Additional context from similar documents:" not in prompt

        prompt = build_prompt_with_rag(mock_document)
        assert "Additional context from similar documents:" in prompt


@patch("paperless_ai.ai_classifier.query_similar_documents")
def test_get_context_for_document(
    mock_query_similar_documents,
    mock_document,
    mock_similar_documents,
):
    mock_query_similar_documents.return_value = mock_similar_documents

    result = get_context_for_document(mock_document, max_docs=2)

    expected_result = (
        "TITLE: Title 1\nContent of document 1\n\n"
        "TITLE: file2.txt\nContent of document 2"
    )
    assert result == expected_result
    mock_query_similar_documents.assert_called_once()


def test_get_context_for_document_no_similar_docs(mock_document):
    with patch("paperless_ai.ai_classifier.query_similar_documents", return_value=[]):
        result = get_context_for_document(mock_document)
        assert result == ""
334
src/paperless_ai/tests/test_ai_indexing.py
Normal file
@@ -0,0 +1,334 @@
import json
from unittest.mock import MagicMock
from unittest.mock import patch

import pytest
from django.test import override_settings
from django.utils import timezone
from llama_index.core.base.embeddings.base import BaseEmbedding

from documents.models import Document
from paperless_ai import indexing


@pytest.fixture
def temp_llm_index_dir(tmp_path):
    original_dir = indexing.settings.LLM_INDEX_DIR
    indexing.settings.LLM_INDEX_DIR = tmp_path
    yield tmp_path
    indexing.settings.LLM_INDEX_DIR = original_dir


@pytest.fixture
def real_document(db):
    return Document.objects.create(
        title="Test Document",
        content="This is some test content.",
        added=timezone.now(),
    )


@pytest.fixture
def mock_embed_model():
    fake = FakeEmbedding()
    with (
        patch("paperless_ai.indexing.get_embedding_model") as mock_index,
        patch(
            "paperless_ai.embedding.get_embedding_model",
        ) as mock_embedding,
    ):
        mock_index.return_value = fake
        mock_embedding.return_value = fake
        yield mock_index


class FakeEmbedding(BaseEmbedding):
    # TODO: maybe a better way to do this?
    def _aget_query_embedding(self, query: str) -> list[float]:
        return [0.1] * self.get_query_embedding_dim()

    def _get_query_embedding(self, query: str) -> list[float]:
        return [0.1] * self.get_query_embedding_dim()

    def _get_text_embedding(self, text: str) -> list[float]:
        return [0.1] * self.get_query_embedding_dim()

    def get_query_embedding_dim(self) -> int:
        return 384  # Match your real FAISS config


@pytest.mark.django_db
def test_build_document_node(real_document):
    nodes = indexing.build_document_node(real_document)
    assert len(nodes) > 0
    assert nodes[0].metadata["document_id"] == str(real_document.id)


@pytest.mark.django_db
def test_update_llm_index(
    temp_llm_index_dir,
    real_document,
    mock_embed_model,
):
    with patch("documents.models.Document.objects.all") as mock_all:
        mock_queryset = MagicMock()
        mock_queryset.exists.return_value = True
        mock_queryset.__iter__.return_value = iter([real_document])
        mock_all.return_value = mock_queryset
        indexing.update_llm_index(rebuild=True)

    assert any(temp_llm_index_dir.glob("*.json"))


@pytest.mark.django_db
def test_update_llm_index_removes_meta(
    temp_llm_index_dir,
    real_document,
    mock_embed_model,
):
    # Pre-create a meta.json with incorrect data
    (temp_llm_index_dir / "meta.json").write_text(
        json.dumps({"embedding_model": "old", "dim": 1}),
    )

    with patch("documents.models.Document.objects.all") as mock_all:
        mock_queryset = MagicMock()
        mock_queryset.exists.return_value = True
        mock_queryset.__iter__.return_value = iter([real_document])
        mock_all.return_value = mock_queryset
        indexing.update_llm_index(rebuild=True)

    meta = json.loads((temp_llm_index_dir / "meta.json").read_text())
    from paperless.config import AIConfig

    config = AIConfig()
    expected_model = config.llm_embedding_model or (
        "text-embedding-3-small"
        if config.llm_embedding_backend == "openai"
        else "sentence-transformers/all-MiniLM-L6-v2"
    )
    assert meta == {"embedding_model": expected_model, "dim": 384}


@pytest.mark.django_db
def test_update_llm_index_partial_update(
    temp_llm_index_dir,
    real_document,
    mock_embed_model,
):
    doc2 = Document.objects.create(
        title="Test Document 2",
        content="This is some test content 2.",
        added=timezone.now(),
        checksum="1234567890abcdef",
    )
    # Initial index
    with patch("documents.models.Document.objects.all") as mock_all:
        mock_queryset = MagicMock()
        mock_queryset.exists.return_value = True
        mock_queryset.__iter__.return_value = iter([real_document, doc2])
        mock_all.return_value = mock_queryset

        indexing.update_llm_index(rebuild=True)

    # modify document
    updated_document = real_document
    updated_document.modified = timezone.now()  # simulate modification

    # new doc
    doc3 = Document.objects.create(
        title="Test Document 3",
        content="This is some test content 3.",
        added=timezone.now(),
        checksum="abcdef1234567890",
    )

    with patch("documents.models.Document.objects.all") as mock_all:
        mock_queryset = MagicMock()
        mock_queryset.exists.return_value = True
        mock_queryset.__iter__.return_value = iter([updated_document, doc2, doc3])
        mock_all.return_value = mock_queryset

        # assert it logs "Updating %d nodes in LLM index." for the two changed documents
        with patch("paperless_ai.indexing.logger") as mock_logger:
            indexing.update_llm_index(rebuild=False)
            mock_logger.info.assert_called_once_with(
                "Updating %d nodes in LLM index.",
                2,
            )
        indexing.update_llm_index(rebuild=False)

    assert any(temp_llm_index_dir.glob("*.json"))


def test_get_or_create_storage_context_raises_exception(
    temp_llm_index_dir,
    mock_embed_model,
):
    with pytest.raises(Exception):
        indexing.get_or_create_storage_context(rebuild=False)


@override_settings(
    LLM_EMBEDDING_BACKEND="huggingface",
)
def test_load_or_build_index_builds_when_nodes_given(
    temp_llm_index_dir,
    real_document,
    mock_embed_model,
):
    with (
        patch(
            "paperless_ai.indexing.load_index_from_storage",
            side_effect=ValueError("Index not found"),
        ),
        patch(
            "paperless_ai.indexing.VectorStoreIndex",
            return_value=MagicMock(),
        ) as mock_index_cls,
        patch(
            "paperless_ai.indexing.get_or_create_storage_context",
            return_value=MagicMock(),
        ) as mock_storage,
    ):
        mock_storage.return_value.persist_dir = temp_llm_index_dir
        indexing.load_or_build_index(
            nodes=[indexing.build_document_node(real_document)],
        )
        mock_index_cls.assert_called_once()


def test_load_or_build_index_raises_exception_when_no_nodes(
    temp_llm_index_dir,
    mock_embed_model,
):
    with (
        patch(
            "paperless_ai.indexing.load_index_from_storage",
            side_effect=ValueError("Index not found"),
        ),
        patch(
            "paperless_ai.indexing.get_or_create_storage_context",
            return_value=MagicMock(),
        ),
    ):
        with pytest.raises(Exception):
            indexing.load_or_build_index()


@pytest.mark.django_db
def test_load_or_build_index_succeeds_when_nodes_given(
    temp_llm_index_dir,
    mock_embed_model,
):
    with (
        patch(
            "paperless_ai.indexing.load_index_from_storage",
            side_effect=ValueError("Index not found"),
        ),
        patch(
            "paperless_ai.indexing.VectorStoreIndex",
            return_value=MagicMock(),
        ) as mock_index_cls,
        patch(
            "paperless_ai.indexing.get_or_create_storage_context",
            return_value=MagicMock(),
        ) as mock_storage,
    ):
        mock_storage.return_value.persist_dir = temp_llm_index_dir
        indexing.load_or_build_index(
            nodes=[MagicMock()],
        )
        mock_index_cls.assert_called_once()


@pytest.mark.django_db
def test_add_or_update_document_updates_existing_entry(
    temp_llm_index_dir,
    real_document,
    mock_embed_model,
):
    indexing.update_llm_index(rebuild=True)
    indexing.llm_index_add_or_update_document(real_document)

    assert any(temp_llm_index_dir.glob("*.json"))


@pytest.mark.django_db
def test_remove_document_deletes_node_from_docstore(
    temp_llm_index_dir,
    real_document,
    mock_embed_model,
):
    indexing.update_llm_index(rebuild=True)
    index = indexing.load_or_build_index()
    assert len(index.docstore.docs) == 1

    indexing.llm_index_remove_document(real_document)
    index = indexing.load_or_build_index()
    assert len(index.docstore.docs) == 0


@pytest.mark.django_db
def test_update_llm_index_no_documents(
    temp_llm_index_dir,
    mock_embed_model,
):
    with patch("documents.models.Document.objects.all") as mock_all:
        mock_queryset = MagicMock()
        mock_queryset.exists.return_value = False
        mock_queryset.__iter__.return_value = iter([])
        mock_all.return_value = mock_queryset

        # check log message
        with patch("paperless_ai.indexing.logger") as mock_logger:
            indexing.update_llm_index(rebuild=True)
            mock_logger.warning.assert_called_once_with(
                "No documents found to index.",
            )


@override_settings(
    LLM_EMBEDDING_BACKEND="huggingface",
    LLM_BACKEND="ollama",
)
def test_query_similar_documents(
    temp_llm_index_dir,
    real_document,
):
    with (
        patch("paperless_ai.indexing.get_or_create_storage_context") as mock_storage,
        patch("paperless_ai.indexing.load_or_build_index") as mock_load_or_build_index,
        patch("paperless_ai.indexing.VectorIndexRetriever") as mock_retriever_cls,
        patch("paperless_ai.indexing.Document.objects.filter") as mock_filter,
    ):
        mock_storage.return_value = MagicMock()
        mock_storage.return_value.persist_dir = temp_llm_index_dir

        mock_index = MagicMock()
        mock_load_or_build_index.return_value = mock_index

        mock_retriever = MagicMock()
        mock_retriever_cls.return_value = mock_retriever

        mock_node1 = MagicMock()
        mock_node1.metadata = {"document_id": 1}

        mock_node2 = MagicMock()
        mock_node2.metadata = {"document_id": 2}

        mock_retriever.retrieve.return_value = [mock_node1, mock_node2]

        mock_filtered_docs = [MagicMock(pk=1), MagicMock(pk=2)]
        mock_filter.return_value = mock_filtered_docs

        result = indexing.query_similar_documents(real_document, top_k=3)

        mock_load_or_build_index.assert_called_once()
        mock_retriever_cls.assert_called_once()
        mock_retriever.retrieve.assert_called_once_with(
            "Test Document\nThis is some test content.",
        )
        mock_filter.assert_called_once_with(pk__in=[1, 2])

        assert result == mock_filtered_docs
142
src/paperless_ai/tests/test_chat.py
Normal file
@@ -0,0 +1,142 @@
from unittest.mock import MagicMock
from unittest.mock import patch

import pytest
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode

from paperless_ai.chat import stream_chat_with_documents


@pytest.fixture(autouse=True)
def patch_embed_model():
    from llama_index.core import settings as llama_settings

    mock_embed_model = MagicMock()
    mock_embed_model._get_text_embedding_batch.return_value = [
        [0.1] * 1536,
    ]  # 1 vector per input
    llama_settings.Settings._embed_model = mock_embed_model
    yield
    llama_settings.Settings._embed_model = None


@pytest.fixture(autouse=True)
def patch_embed_nodes():
    with patch(
        "llama_index.core.indices.vector_store.base.embed_nodes",
    ) as mock_embed_nodes:
        mock_embed_nodes.side_effect = lambda nodes, *_args, **_kwargs: {
            node.node_id: [0.1] * 1536 for node in nodes
        }
        yield


@pytest.fixture
def mock_document():
    doc = MagicMock()
    doc.pk = 1
    doc.title = "Test Document"
    doc.filename = "test_file.pdf"
    doc.content = "This is the document content."
    return doc


def test_stream_chat_with_one_document_full_content(mock_document):
    with (
        patch("paperless_ai.chat.AIClient") as mock_client_cls,
        patch("paperless_ai.chat.load_or_build_index") as mock_load_index,
        patch(
            "paperless_ai.chat.RetrieverQueryEngine.from_args",
        ) as mock_query_engine_cls,
    ):
        mock_client = MagicMock()
        mock_client_cls.return_value = mock_client
        mock_client.llm = MagicMock()

        mock_node = TextNode(
            text="This is node content.",
            metadata={"document_id": str(mock_document.pk), "title": "Test Document"},
        )
        mock_index = MagicMock()
        mock_index.docstore.docs.values.return_value = [mock_node]
        mock_load_index.return_value = mock_index

        mock_response_stream = MagicMock()
        mock_response_stream.response_gen = iter(["chunk1", "chunk2"])
        mock_query_engine = MagicMock()
        mock_query_engine_cls.return_value = mock_query_engine
        mock_query_engine.query.return_value = mock_response_stream

        output = list(stream_chat_with_documents("What is this?", [mock_document]))

        assert output == ["chunk1", "chunk2"]


def test_stream_chat_with_multiple_documents_retrieval(patch_embed_nodes):
    with (
        patch("paperless_ai.chat.AIClient") as mock_client_cls,
        patch("paperless_ai.chat.load_or_build_index") as mock_load_index,
        patch(
            "paperless_ai.chat.RetrieverQueryEngine.from_args",
        ) as mock_query_engine_cls,
        patch.object(VectorStoreIndex, "as_retriever") as mock_as_retriever,
    ):
        # Mock AIClient and LLM
        mock_client = MagicMock()
        mock_client_cls.return_value = mock_client
        mock_client.llm = MagicMock()

        # Create two real TextNodes
        mock_node1 = TextNode(
            text="Content for doc 1.",
            metadata={"document_id": "1", "title": "Document 1"},
        )
        mock_node2 = TextNode(
            text="Content for doc 2.",
            metadata={"document_id": "2", "title": "Document 2"},
        )
        mock_index = MagicMock()
        mock_index.docstore.docs.values.return_value = [mock_node1, mock_node2]
        mock_load_index.return_value = mock_index

        # Patch as_retriever to return a retriever whose retrieve() returns both nodes
        mock_retriever = MagicMock()
        mock_retriever.retrieve.return_value = [mock_node1, mock_node2]
        mock_as_retriever.return_value = mock_retriever

        # Mock response stream
        mock_response_stream = MagicMock()
        mock_response_stream.response_gen = iter(["chunk1", "chunk2"])

        # Mock RetrieverQueryEngine
        mock_query_engine = MagicMock()
        mock_query_engine_cls.return_value = mock_query_engine
        mock_query_engine.query.return_value = mock_response_stream

        # Fake documents
        doc1 = MagicMock(pk=1)
        doc2 = MagicMock(pk=2)

        output = list(stream_chat_with_documents("What's up?", [doc1, doc2]))

        assert output == ["chunk1", "chunk2"]


def test_stream_chat_no_matching_nodes():
    with (
        patch("paperless_ai.chat.AIClient") as mock_client_cls,
        patch("paperless_ai.chat.load_or_build_index") as mock_load_index,
    ):
        mock_client = MagicMock()
        mock_client_cls.return_value = mock_client
        mock_client.llm = MagicMock()

        mock_index = MagicMock()
        # No matching nodes
        mock_index.docstore.docs.values.return_value = []
        mock_load_index.return_value = mock_index

        output = list(stream_chat_with_documents("Any info?", [MagicMock(pk=1)]))

        assert output == ["Sorry, I couldn't find any content to answer your question."]
111
src/paperless_ai/tests/test_client.py
Normal file
@@ -0,0 +1,111 @@
from unittest.mock import MagicMock
from unittest.mock import patch

import pytest
from llama_index.core.llms import ChatMessage
from llama_index.core.llms.llm import ToolSelection

from paperless_ai.client import AIClient


@pytest.fixture
def mock_ai_config():
    with patch("paperless_ai.client.AIConfig") as MockAIConfig:
        mock_config = MagicMock()
        MockAIConfig.return_value = mock_config
        yield mock_config


@pytest.fixture
def mock_ollama_llm():
    with patch("paperless_ai.client.Ollama") as MockOllama:
        yield MockOllama


@pytest.fixture
def mock_openai_llm():
    with patch("paperless_ai.client.OpenAI") as MockOpenAI:
        yield MockOpenAI


def test_get_llm_ollama(mock_ai_config, mock_ollama_llm):
    mock_ai_config.llm_backend = "ollama"
    mock_ai_config.llm_model = "test_model"
    mock_ai_config.llm_endpoint = "http://test-url"

    client = AIClient()

    mock_ollama_llm.assert_called_once_with(
        model="test_model",
        base_url="http://test-url",
        request_timeout=120,
    )
    assert client.llm == mock_ollama_llm.return_value


def test_get_llm_openai(mock_ai_config, mock_openai_llm):
    mock_ai_config.llm_backend = "openai"
    mock_ai_config.llm_model = "test_model"
    mock_ai_config.llm_api_key = "test_api_key"
    mock_ai_config.llm_endpoint = "http://test-url"

    client = AIClient()

    mock_openai_llm.assert_called_once_with(
        model="test_model",
        api_base="http://test-url",
        api_key="test_api_key",
    )
    assert client.llm == mock_openai_llm.return_value


def test_get_llm_unsupported_backend(mock_ai_config):
    mock_ai_config.llm_backend = "unsupported"

    with pytest.raises(ValueError, match="Unsupported LLM backend: unsupported"):
        AIClient()


def test_run_llm_query(mock_ai_config, mock_ollama_llm):
    mock_ai_config.llm_backend = "ollama"
    mock_ai_config.llm_model = "test_model"
    mock_ai_config.llm_endpoint = "http://test-url"

    mock_llm_instance = mock_ollama_llm.return_value

    tool_selection = ToolSelection(
        tool_id="call_test",
        tool_name="DocumentClassifierSchema",
        tool_kwargs={
            "title": "Test Title",
            "tags": ["test", "document"],
            "correspondents": ["John Doe"],
            "document_types": ["report"],
            "storage_paths": ["Reports"],
            "dates": ["2023-01-01"],
        },
    )

    mock_llm_instance.chat_with_tools.return_value = MagicMock()
    mock_llm_instance.get_tool_calls_from_response.return_value = [tool_selection]

    client = AIClient()
    result = client.run_llm_query("test_prompt")

    assert result["title"] == "Test Title"


def test_run_chat(mock_ai_config, mock_ollama_llm):
    mock_ai_config.llm_backend = "ollama"
    mock_ai_config.llm_model = "test_model"
    mock_ai_config.llm_endpoint = "http://test-url"

    mock_llm_instance = mock_ollama_llm.return_value
    mock_llm_instance.chat.return_value = "test_chat_result"

    client = AIClient()
    messages = [ChatMessage(role="user", content="Hello")]
    result = client.run_chat(messages)

    mock_llm_instance.chat.assert_called_once_with(messages)
    assert result == "test_chat_result"
169
src/paperless_ai/tests/test_embedding.py
Normal file
169
src/paperless_ai/tests/test_embedding.py
Normal file
@@ -0,0 +1,169 @@
|
||||
import json
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from django.conf import settings
|
||||
|
||||
from documents.models import Document
|
||||
from paperless.models import LLMEmbeddingBackend
|
||||
from paperless_ai.embedding import build_llm_index_text
|
||||
from paperless_ai.embedding import get_embedding_dim
|
||||
from paperless_ai.embedding import get_embedding_model
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_ai_config():
|
||||
with patch("paperless_ai.embedding.AIConfig") as MockAIConfig:
|
||||
yield MockAIConfig
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_llm_index_dir(tmp_path):
|
||||
original_dir = settings.LLM_INDEX_DIR
|
||||
settings.LLM_INDEX_DIR = tmp_path
|
||||
yield tmp_path
|
||||
settings.LLM_INDEX_DIR = original_dir
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_document():
|
||||
doc = MagicMock(spec=Document)
|
||||
doc.title = "Test Title"
|
||||
doc.filename = "test_file.pdf"
|
||||
doc.created = "2023-01-01"
|
||||
doc.added = "2023-01-02"
|
||||
doc.modified = "2023-01-03"
|
||||
|
||||
tag1 = MagicMock()
|
||||
tag1.name = "Tag1"
|
||||
tag2 = MagicMock()
|
||||
tag2.name = "Tag2"
|
||||
doc.tags.all = MagicMock(return_value=[tag1, tag2])
|
||||
|
||||
doc.document_type = MagicMock()
|
||||
doc.document_type.name = "Invoice"
|
||||
doc.correspondent = MagicMock()
|
||||
doc.correspondent.name = "Test Correspondent"
|
||||
doc.archive_serial_number = "12345"
|
||||
doc.content = "This is the document content."
|
||||
|
||||
cf1 = MagicMock(__str__=lambda x: "Value1")
|
||||
cf1.field = MagicMock()
|
||||
cf1.field.name = "Field1"
|
||||
cf1.value = "Value1"
|
||||
cf2 = MagicMock(__str__=lambda x: "Value2")
|
||||
cf2.field = MagicMock()
|
||||
cf2.field.name = "Field2"
|
||||
cf2.value = "Value2"
|
||||
doc.custom_fields.all = MagicMock(return_value=[cf1, cf2])
|
||||
|
||||
return doc
|
||||
|
||||
|
||||
def test_get_embedding_model_openai(mock_ai_config):
    mock_ai_config.return_value.llm_embedding_backend = LLMEmbeddingBackend.OPENAI
    mock_ai_config.return_value.llm_embedding_model = "text-embedding-3-small"
    mock_ai_config.return_value.llm_api_key = "test_api_key"

    with patch("paperless_ai.embedding.OpenAIEmbedding") as MockOpenAIEmbedding:
        model = get_embedding_model()
        MockOpenAIEmbedding.assert_called_once_with(
            model="text-embedding-3-small",
            api_key="test_api_key",
        )
        assert model == MockOpenAIEmbedding.return_value


def test_get_embedding_model_huggingface(mock_ai_config):
    mock_ai_config.return_value.llm_embedding_backend = LLMEmbeddingBackend.HUGGINGFACE
    mock_ai_config.return_value.llm_embedding_model = (
        "sentence-transformers/all-MiniLM-L6-v2"
    )

    with patch(
        "paperless_ai.embedding.HuggingFaceEmbedding",
    ) as MockHuggingFaceEmbedding:
        model = get_embedding_model()
        MockHuggingFaceEmbedding.assert_called_once_with(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
        )
        assert model == MockHuggingFaceEmbedding.return_value


def test_get_embedding_model_invalid_backend(mock_ai_config):
    mock_ai_config.return_value.llm_embedding_backend = "INVALID_BACKEND"

    with pytest.raises(
        ValueError,
        match="Unsupported embedding backend: INVALID_BACKEND",
    ):
        get_embedding_model()
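The three tests above fully specify the dispatch in get_embedding_model: the OpenAI and Hugging Face backends construct the corresponding llama_index embedding class, and anything else raises. A sketch consistent with those assertions; the AIConfig import path is an assumption:

# Hypothetical reconstruction, inferred from the assertions above.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

from paperless.config import AIConfig  # assumed location of AIConfig
from paperless.models import LLMEmbeddingBackend


def get_embedding_model():
    config = AIConfig()
    if config.llm_embedding_backend == LLMEmbeddingBackend.OPENAI:
        return OpenAIEmbedding(
            model=config.llm_embedding_model,
            api_key=config.llm_api_key,
        )
    if config.llm_embedding_backend == LLMEmbeddingBackend.HUGGINGFACE:
        return HuggingFaceEmbedding(
            model_name=config.llm_embedding_model,
        )
    raise ValueError(
        f"Unsupported embedding backend: {config.llm_embedding_backend}",
    )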
def test_get_embedding_dim_infers_and_saves(temp_llm_index_dir, mock_ai_config):
    mock_ai_config.return_value.llm_embedding_backend = "openai"
    mock_ai_config.return_value.llm_embedding_model = None

    class DummyEmbedding:
        def get_text_embedding(self, text):
            return [0.0] * 7

    with patch(
        "paperless_ai.embedding.get_embedding_model",
        return_value=DummyEmbedding(),
    ) as mock_get:
        dim = get_embedding_dim()
        mock_get.assert_called_once()

    assert dim == 7
    meta = json.loads((temp_llm_index_dir / "meta.json").read_text())
    assert meta == {"embedding_model": "text-embedding-3-small", "dim": 7}


def test_get_embedding_dim_reads_existing_meta(temp_llm_index_dir, mock_ai_config):
    mock_ai_config.return_value.llm_embedding_backend = "openai"
    mock_ai_config.return_value.llm_embedding_model = None

    (temp_llm_index_dir / "meta.json").write_text(
        json.dumps({"embedding_model": "text-embedding-3-small", "dim": 11}),
    )

    with patch("paperless_ai.embedding.get_embedding_model") as mock_get:
        assert get_embedding_dim() == 11
        mock_get.assert_not_called()


def test_get_embedding_dim_raises_on_model_change(temp_llm_index_dir, mock_ai_config):
    mock_ai_config.return_value.llm_embedding_backend = "openai"
    mock_ai_config.return_value.llm_embedding_model = None

    (temp_llm_index_dir / "meta.json").write_text(
        json.dumps({"embedding_model": "old", "dim": 11}),
    )

    with pytest.raises(
        RuntimeError,
        match="Embedding model changed from old to text-embedding-3-small",
    ):
        get_embedding_dim()
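These three tests describe a small caching contract: the embedding dimension is probed once, persisted next to the model name in meta.json, reused on later runs without touching the model, and treated as a hard error if the configured model no longer matches the persisted one. A sketch consistent with that contract; the default-model lookup table and file handling are assumptions:

# Hypothetical reconstruction of the meta.json contract the tests describe.
import json
from pathlib import Path

from django.conf import settings

# Assumed default when no explicit embedding model is configured.
_DEFAULT_MODELS = {"openai": "text-embedding-3-small"}


def get_embedding_dim() -> int:
    # AIConfig and get_embedding_model are siblings in paperless_ai.embedding
    # (sketched earlier); their exact definitions are assumptions.
    config = AIConfig()
    model = config.llm_embedding_model or _DEFAULT_MODELS[config.llm_embedding_backend]

    meta_path = Path(settings.LLM_INDEX_DIR) / "meta.json"
    if meta_path.exists():
        meta = json.loads(meta_path.read_text())
        if meta["embedding_model"] != model:
            raise RuntimeError(
                f"Embedding model changed from {meta['embedding_model']} to {model}",
            )
        return meta["dim"]

    # Probe the model once to learn the vector size, then persist it.
    dim = len(get_embedding_model().get_text_embedding("test"))
    meta_path.write_text(json.dumps({"embedding_model": model, "dim": dim}))
    return dim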
def test_build_llm_index_text(mock_document):
    with patch("documents.models.Note.objects.filter") as mock_notes_filter:
        mock_notes_filter.return_value = [
            MagicMock(note="Note1"),
            MagicMock(note="Note2"),
        ]

        result = build_llm_index_text(mock_document)

    assert "Title: Test Title" in result
    assert "Filename: test_file.pdf" in result
    assert "Created: 2023-01-01" in result
    assert "Tags: Tag1, Tag2" in result
    assert "Document Type: Invoice" in result
    assert "Correspondent: Test Correspondent" in result
    assert "Notes: Note1,Note2" in result
    assert "Content:\n\nThis is the document content." in result
    assert "Custom Field - Field1: Value1\nCustom Field - Field2: Value2" in result
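The assertions enumerate the exact flat-text layout that build_llm_index_text feeds to the embedding model. A sketch that satisfies every asserted substring; the Note queryset filter keyword and any field ordering beyond the assertions are assumptions:

# Hypothetical sketch matching the substrings asserted above.
from documents.models import Document
from documents.models import Note


def build_llm_index_text(doc: Document) -> str:
    notes = ",".join(n.note for n in Note.objects.filter(document=doc))
    tags = ", ".join(t.name for t in doc.tags.all())
    custom_fields = "\n".join(
        # str(field) yields the field's value, as the mock's __str__ shows.
        f"Custom Field - {field.field.name}: {field}"
        for field in doc.custom_fields.all()
    )
    return (
        f"Title: {doc.title}\n"
        f"Filename: {doc.filename}\n"
        f"Created: {doc.created}\n"
        f"Tags: {tags}\n"
        f"Document Type: {doc.document_type.name}\n"
        f"Correspondent: {doc.correspondent.name}\n"
        f"Notes: {notes}\n"
        f"Content:\n\n{doc.content}\n"
        f"{custom_fields}\n"
    )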
86
src/paperless_ai/tests/test_matching.py
Normal file
@@ -0,0 +1,86 @@
from unittest.mock import patch

from django.test import TestCase

from documents.models import Correspondent
from documents.models import DocumentType
from documents.models import StoragePath
from documents.models import Tag
from paperless_ai.matching import extract_unmatched_names
from paperless_ai.matching import match_correspondents_by_name
from paperless_ai.matching import match_document_types_by_name
from paperless_ai.matching import match_storage_paths_by_name
from paperless_ai.matching import match_tags_by_name


class TestAIMatching(TestCase):
    def setUp(self):
        # Create test data for Tag
        self.tag1 = Tag.objects.create(name="Test Tag 1")
        self.tag2 = Tag.objects.create(name="Test Tag 2")

        # Create test data for Correspondent
        self.correspondent1 = Correspondent.objects.create(name="Test Correspondent 1")
        self.correspondent2 = Correspondent.objects.create(name="Test Correspondent 2")

        # Create test data for DocumentType
        self.document_type1 = DocumentType.objects.create(name="Test Document Type 1")
        self.document_type2 = DocumentType.objects.create(name="Test Document Type 2")

        # Create test data for StoragePath
        self.storage_path1 = StoragePath.objects.create(name="Test Storage Path 1")
        self.storage_path2 = StoragePath.objects.create(name="Test Storage Path 2")
@patch("paperless_ai.matching.get_objects_for_user_owner_aware")
|
||||
def test_match_tags_by_name(self, mock_get_objects):
|
||||
mock_get_objects.return_value = Tag.objects.all()
|
||||
names = ["Test Tag 1", "Nonexistent Tag"]
|
||||
result = match_tags_by_name(names, user=None)
|
||||
self.assertEqual(len(result), 1)
|
||||
self.assertEqual(result[0].name, "Test Tag 1")
|
||||
|
||||
@patch("paperless_ai.matching.get_objects_for_user_owner_aware")
|
||||
def test_match_correspondents_by_name(self, mock_get_objects):
|
||||
mock_get_objects.return_value = Correspondent.objects.all()
|
||||
names = ["Test Correspondent 1", "Nonexistent Correspondent"]
|
||||
result = match_correspondents_by_name(names, user=None)
|
||||
self.assertEqual(len(result), 1)
|
||||
self.assertEqual(result[0].name, "Test Correspondent 1")
|
||||
|
||||
@patch("paperless_ai.matching.get_objects_for_user_owner_aware")
|
||||
def test_match_document_types_by_name(self, mock_get_objects):
|
||||
mock_get_objects.return_value = DocumentType.objects.all()
|
||||
names = ["Test Document Type 1", "Nonexistent Document Type"]
|
||||
result = match_document_types_by_name(names, user=None)
|
||||
self.assertEqual(len(result), 1)
|
||||
self.assertEqual(result[0].name, "Test Document Type 1")
|
||||
|
||||
@patch("paperless_ai.matching.get_objects_for_user_owner_aware")
|
||||
def test_match_storage_paths_by_name(self, mock_get_objects):
|
||||
mock_get_objects.return_value = StoragePath.objects.all()
|
||||
names = ["Test Storage Path 1", "Nonexistent Storage Path"]
|
||||
result = match_storage_paths_by_name(names, user=None)
|
||||
self.assertEqual(len(result), 1)
|
||||
self.assertEqual(result[0].name, "Test Storage Path 1")
|
||||
|
||||
def test_extract_unmatched_names(self):
|
||||
llm_names = ["Test Tag 1", "Nonexistent Tag"]
|
||||
matched_objects = [self.tag1]
|
||||
unmatched_names = extract_unmatched_names(llm_names, matched_objects)
|
||||
self.assertEqual(unmatched_names, ["Nonexistent Tag"])
|
||||
|
||||
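extract_unmatched_names closes the loop: given the raw names the LLM proposed and the objects the matcher resolved, it reports what fell through, so callers can surface those as new-object suggestions. A sketch consistent with the test; exact-name comparison is all the test requires:

# Hypothetical sketch; the test above only requires an exact-name difference.
def extract_unmatched_names(llm_names: list[str], matched_objects: list) -> list[str]:
    matched = {obj.name for obj in matched_objects}
    return [name for name in llm_names if name not in matched]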
@patch("paperless_ai.matching.get_objects_for_user_owner_aware")
|
||||
def test_match_tags_by_name_with_empty_names(self, mock_get_objects):
|
||||
mock_get_objects.return_value = Tag.objects.all()
|
||||
names = [None, "", " "]
|
||||
result = match_tags_by_name(names, user=None)
|
||||
self.assertEqual(result, [])
|
||||
|
||||
@patch("paperless_ai.matching.get_objects_for_user_owner_aware")
|
||||
def test_match_tags_with_fuzzy_matching(self, mock_get_objects):
|
||||
mock_get_objects.return_value = Tag.objects.all()
|
||||
names = ["Test Taag 1", "Teest Tag 2"]
|
||||
result = match_tags_by_name(names, user=None)
|
||||
self.assertEqual(len(result), 2)
|
||||
self.assertEqual(result[0].name, "Test Tag 1")
|
||||
self.assertEqual(result[1].name, "Test Tag 2")
|
||||
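Taken together, the matching tests imply three properties: lookups go through get_objects_for_user_owner_aware so permissions are respected, blank or None names are skipped, and small typos ("Test Taag 1") still resolve. A sketch of such a matcher; the use of rapidfuzz, the 80-point cutoff, and the permission string are all assumptions, not confirmed by this diff:

# Hypothetical sketch of tolerant name matching; library and threshold are assumed.
from rapidfuzz import fuzz

from documents.models import Tag
from documents.permissions import get_objects_for_user_owner_aware  # assumed path


def match_tags_by_name(names, user):
    objects = get_objects_for_user_owner_aware(user, "documents.view_tag", Tag)
    results = []
    for name in names:
        if not name or not name.strip():
            continue  # None, "" and whitespace-only names never match
        scored = [
            (fuzz.ratio(name.lower(), obj.name.lower()), obj) for obj in objects
        ]
        if not scored:
            continue  # no candidate objects visible to this user
        score, best = max(scored, key=lambda pair: pair[0])
        if score >= 80:  # assumed similarity cutoff
            results.append(best)
    return results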