Enhancement: Limit excessively long content when computing suggestions (#10656)

This helps prevent excessive processing times on very large documents
by limiting the text analyzed during date parsing, tag prediction,
and correspondent matching.

If the document content exceeds 1.2M chars, it is cropped to 1M chars (the first 800K plus the last 200K).
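The crop rule in isolation, as a minimal standalone sketch (the constants and the space-joined head/tail mirror the diff below; the function name and the final check are illustrative, not part of the commit):

    # Sketch of the cropping rule; constants copied from the diff below.
    SUGGESTION_LIMIT = 1_200_000  # crop only above 1.2M chars
    HEAD_LEN = 800_000            # 80% of the 1M-char result
    TAIL_LEN = 200_000            # 20% of the 1M-char result

    def crop_for_suggestions(content: str) -> str:
        if not content or len(content) <= SUGGESTION_LIMIT:
            return content
        # Keep opening and closing context, joined by a single space.
        return " ".join((content[:HEAD_LEN], content[-TAIL_LEN:]))

    # A 1,200,001-char input becomes 800,000 + 1 + 200,000 = 1,000,001 chars.
    assert len(crop_for_suggestions("x" * 1_200_001)) == 1_000_001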
Antoine Mérino
2025-09-09 22:02:16 +02:00
committed by GitHub
parent 84d85d7a23
commit 8adc26e09d
3 changed files with 65 additions and 5 deletions

src/documents/matching.py

@@ -41,7 +41,11 @@ def log_reason(
 def match_correspondents(document: Document, classifier: DocumentClassifier, user=None):
-    pred_id = classifier.predict_correspondent(document.content) if classifier else None
+    pred_id = (
+        classifier.predict_correspondent(document.suggestion_content)
+        if classifier
+        else None
+    )
 
     if user is None and document.owner is not None:
         user = document.owner
@@ -65,8 +69,11 @@ def match_correspondents(document: Document, classifier: DocumentClassifier, use
 def match_document_types(document: Document, classifier: DocumentClassifier, user=None):
-    pred_id = classifier.predict_document_type(document.content) if classifier else None
+    pred_id = (
+        classifier.predict_document_type(document.suggestion_content)
+        if classifier
+        else None
+    )
 
     if user is None and document.owner is not None:
         user = document.owner
@@ -89,7 +96,9 @@ def match_document_types(document: Document, classifier: DocumentClassifier, use
 def match_tags(document: Document, classifier: DocumentClassifier, user=None):
-    predicted_tag_ids = classifier.predict_tags(document.content) if classifier else []
+    predicted_tag_ids = (
+        classifier.predict_tags(document.suggestion_content) if classifier else []
+    )
 
     if user is None and document.owner is not None:
         user = document.owner
@@ -112,7 +121,11 @@ def match_tags(document: Document, classifier: DocumentClassifier, user=None):
 def match_storage_paths(document: Document, classifier: DocumentClassifier, user=None):
-    pred_id = classifier.predict_storage_path(document.content) if classifier else None
+    pred_id = (
+        classifier.predict_storage_path(document.suggestion_content)
+        if classifier
+        else None
+    )
 
     if user is None and document.owner is not None:
         user = document.owner
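All four hunks above make the same substitution: each classifier call now receives document.suggestion_content instead of document.content, while the surrounding owner handling stays untouched. A condensed sketch of the shared shape (predict_or_default and predict_fn_name are illustrative stand-ins, not part of the commit):

    # Illustrative only: the pattern shared by the four matchers above.
    def predict_or_default(classifier, predict_fn_name, document, default):
        if classifier is None:
            return default
        predict_fn = getattr(classifier, predict_fn_name)
        # The only change in this file: the (possibly cropped) text goes in.
        return predict_fn(document.suggestion_content)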

src/documents/models.py

@@ -305,6 +305,28 @@ class Document(SoftDeleteModel, ModelWithOwner):
             res += f" {self.title}"
         return res
 
+    @property
+    def suggestion_content(self):
+        """
+        Returns the document text used to generate suggestions.
+
+        If the document content length exceeds a specified limit,
+        the text is cropped to include the start and end segments.
+        Otherwise, the full content is returned.
+
+        This improves processing speed for large documents while keeping
+        enough context for accurate suggestions.
+        """
+        if not self.content or len(self.content) <= 1200000:
+            return self.content
+        else:
+            # Use 80% from the start and 20% from the end
+            # to preserve both opening and closing context.
+            head_len = 800000
+            tail_len = 200000
+            return " ".join((self.content[:head_len], self.content[-tail_len:]))
+
     @property
     def source_path(self) -> Path:
         if self.filename:
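A quick way to exercise the new property (a sketch, not part of the commit; assumes a configured paperless-ngx environment where documents.models imports cleanly):

    from documents.models import Document

    # Under the limit: content passes through unchanged.
    doc = Document(content="short text")
    assert doc.suggestion_content == doc.content

    # Over the limit: 800K head + " " + 200K tail = 1,000,001 chars.
    doc.content = "a" * 1_300_000
    cropped = doc.suggestion_content
    assert len(cropped) == 1_000_001
    assert cropped[:800_000] == doc.content[:800_000]
    assert cropped[-200_000:] == doc.content[-200_000:]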

src/documents/tests/test_models.py

@@ -6,6 +6,7 @@ from unittest import mock
 
 from django.test import TestCase
 from django.test import override_settings
+from faker import Faker
 
 from documents.models import Correspondent
 from documents.models import Document
@@ -105,3 +106,27 @@ class TestDocument(TestCase):
             created=date(2020, 12, 25),
         )
         self.assertEqual(doc.get_public_filename(), "2020-12-25 test")
+
+    def test_suggestion_content(self):
+        """
+        Check that the document content used for suggestions is cropped,
+        but only if it exceeds the length limit.
+        """
+        fake_text = Faker().text(max_nb_chars=1201000)
+
+        # Do not crop content under 1.2M chars
+        content_under_limit = fake_text[:1200000]
+        doc = Document(
+            title="test",
+            created=date(2025, 6, 1),
+            content=content_under_limit,
+        )
+        assert doc.suggestion_content == content_under_limit
+
+        # If over the limit, crop to 1M chars (800K from the start, 200K from the end)
+        content_over_limit = fake_text[:1200001]
+        expected_cropped_content = (
+            content_over_limit[:800000] + " " + content_over_limit[-200000:]
+        )
+        doc.content = content_over_limit
+        assert doc.suggestion_content == expected_cropped_content
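In a paperless-ngx checkout the new test should be runnable on its own, e.g. with pytest src/documents/tests/test_models.py -k test_suggestion_content (the path assumes the upstream test layout; adjust if the tests live elsewhere).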