Unify prompts, cover

This commit is contained in:
shamoon 2025-04-28 13:46:22 -07:00
parent 014eafe3d1
commit 62fd722019
No known key found for this signature in database
2 changed files with 17 additions and 37 deletions

View File

@@ -21,6 +21,7 @@ def build_prompt_without_rag(document: Document) -> str:
Never ask for further information, additional content or ask questions. Never include any other text.
Suggested tags and document types must be strictly based on the content of the document.
Do not change the field names or the JSON structure, only provide the values. Use double quotes and proper JSON syntax.
Each field must be a list of plain strings.
The JSON object must contain the following fields:
- title: A short, descriptive title
@@ -30,8 +31,6 @@ def build_prompt_without_rag(document: Document) -> str:
- storage_paths: Suggested folder paths (e.g. "Medical/Insurance")
- dates: List up to 3 relevant dates in YYYY-MM-DD format
Respond ONLY in JSON.
Each field must be a list of plain strings.
The format of the JSON object is as follows:
{{
"title": "xxxxx",
@@ -43,7 +42,6 @@ def build_prompt_without_rag(document: Document) -> str:
}}
---
FILENAME:
{filename}
@@ -56,41 +54,9 @@ def build_prompt_without_rag(document: Document) -> str:
def build_prompt_with_rag(document: Document) -> str:
context = get_context_for_document(document)
content = document.content or ""
filename = document.filename or ""
prompt = build_prompt_without_rag(document)
prompt = f"""
You are a helpful assistant that extracts structured information from documents.
You have access to similar documents as context to help improve suggestions.
Only output valid JSON in the format below. No additional explanations.
The JSON object must contain:
- title: A short, human-readable, descriptive title based on the content
- tags: A list of relevant topics
- correspondents: People or organizations involved
- document_types: Type or category of the document
- storage_paths: Suggested folder paths
- dates: Up to 3 relevant dates in YYYY-MM-DD
Respond ONLY in JSON.
Each field must be a list of plain strings.
The format of the JSON object is as follows:
{{
"title": "xxxxx",
"tags": ["xxxx", "xxxx"],
"correspondents": ["xxxx", "xxxx"],
"document_types": ["xxxx", "xxxx"],
"storage_paths": ["xxxx", "xxxx"],
"dates": ["YYYY-MM-DD", "YYYY-MM-DD", "YYYY-MM-DD"],
}}
Here is the document:
FILENAME:
{filename}
CONTENT:
{content[:4000]}
prompt += f"""
CONTEXT FROM SIMILAR DOCUMENTS:
{context[:4000]}

View File

@@ -6,6 +6,8 @@ import pytest
from django.test import override_settings
from documents.models import Document
from paperless.ai.ai_classifier import build_prompt_with_rag
from paperless.ai.ai_classifier import build_prompt_without_rag
from paperless.ai.ai_classifier import get_ai_document_classification
from paperless.ai.ai_classifier import parse_ai_response
@@ -101,3 +103,15 @@ def test_use_without_rag_if_not_configured(
mock_run_llm_query.return_value.text = json.dumps({})
get_ai_document_classification(mock_document)
mock_build_prompt_without_rag.assert_called_once()
@override_settings(LLM_BACKEND="ollama", LLM_MODEL="some_model")
def test_prompt_with_without_rag(mock_document):
    """Check that only the RAG prompt carries the similar-documents section.

    Builds the prompt for the same document with both builders and looks
    for the context header in each result.
    """
    # Header that introduces the context section in the RAG prompt.
    context_header = "CONTEXT FROM SIMILAR DOCUMENTS:"

    plain_prompt = build_prompt_without_rag(mock_document)
    assert context_header not in plain_prompt

    rag_prompt = build_prompt_with_rag(mock_document)
    assert context_header in rag_prompt