From 8c0a61dbc6014e9eccb87116fa211c4eaf070ede Mon Sep 17 00:00:00 2001
From: shamoon <4887959+shamoon@users.noreply.github.com>
Date: Sat, 19 Apr 2025 21:56:09 -0700
Subject: [PATCH] wow llama3 is bad

---
 src/documents/ai/llm_classifier.py | 39 ++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/documents/ai/llm_classifier.py b/src/documents/ai/llm_classifier.py
index b4c4db33f..a7e196b7b 100644
--- a/src/documents/ai/llm_classifier.py
+++ b/src/documents/ai/llm_classifier.py
@@ -15,24 +15,37 @@ def get_ai_document_classification(document: Document) -> dict:
     filename = document.filename or ""
     content = document.content or ""
 
-    # Limit the content to 10k characters
-    content = content[:10000]
-
     prompt = f"""
-    You are a document classification assistant. Based on the content below, return a JSON object suggesting the following classification fields:
-    - title: A descriptive title for the document
-    - tags: A list of tags that describe the document (e.g. ["medical", "insurance"])
-    - correspondent: Who sent or issued this document (e.g. "Kaiser Permanente")
-    - document_types: The type or category (e.g. "invoice", "medical record", "statement")
-    - storage_paths: Suggested storage folders (e.g. "Insurance/2024")
-    - dates: Up to 3 dates in ISO format (YYYY-MM-DD) found in the document, relevant to its content
+    You are an assistant that extracts structured information from documents.
+    Only respond with the JSON object as described below.
+    Never ask for further information, additional content or ask questions. Never include any other text.
+    Suggested tags and document types must be strictly based on the content of the document.
+    Do not change the field names or the JSON structure, only provide the values. Use double quotes and proper JSON syntax.
 
-    Return only a valid JSON object. Do not add commentary.
+    The JSON object must contain the following fields:
+    - title: A short, descriptive title
+    - tags: A list of simple tags like ["insurance", "medical", "receipts"]
+    - correspondents: A list of names or organizations mentioned in the document
+    - document_types: The type/category of the document (e.g. "invoice", "medical record")
+    - storage_paths: Suggested folder paths (e.g. "Medical/Insurance")
+    - dates: List up to 3 relevant dates in YYYY-MM-DD format
 
-    FILENAME: {filename}
+    The format of the JSON object is as follows:
+    {{
+        "title": "xxxxx",
+        "tags": ["xxxx", "xxxx"],
+        "correspondents": ["xxxx", "xxxx"],
+        "document_types": ["xxxx", "xxxx"],
+        "storage_paths": ["xxxx", "xxxx"],
+        "dates": ["YYYY-MM-DD", "YYYY-MM-DD", "YYYY-MM-DD"]
+    }}
+    ---
+
+    FILENAME:
+    {filename}
 
     CONTENT:
-    {content}
+    {content[:8000]}
     """
 
     try: