From 8adc26e09df9a3d1f68cfa46dc1e574ca092dc34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antoine=20M=C3=A9rino?=
Date: Tue, 9 Sep 2025 22:02:16 +0200
Subject: [PATCH] Enhancement: Limit excessively long content length when computing suggestions (#10656)

This helps prevent excessive processing times on very large documents
by limiting the text analyzed during date parsing, tag prediction, and
correspondent matching. If the document exceeds 1.2M chars, crop to 1M
char.
---
 src/documents/matching.py                  | 23 +++++++++++++++-----
 src/documents/models.py                    | 22 +++++++++++++++++++
 src/documents/tests/test_document_model.py | 25 ++++++++++++++++++++++
 3 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/src/documents/matching.py b/src/documents/matching.py
index 15a8ec443..346f9d55a 100644
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -41,7 +41,11 @@ def log_reason(
 
 
 def match_correspondents(document: Document, classifier: DocumentClassifier, user=None):
-    pred_id = classifier.predict_correspondent(document.content) if classifier else None
+    pred_id = (
+        classifier.predict_correspondent(document.suggestion_content)
+        if classifier
+        else None
+    )
     if user is None and document.owner is not None:
         user = document.owner
 
@@ -65,8 +69,11 @@ def match_correspondents(document: Document, classifier: DocumentClassifier, use
 
 
 def match_document_types(document: Document, classifier: DocumentClassifier, user=None):
-    pred_id = classifier.predict_document_type(document.content) if classifier else None
-
+    pred_id = (
+        classifier.predict_document_type(document.suggestion_content)
+        if classifier
+        else None
+    )
     if user is None and document.owner is not None:
         user = document.owner
 
@@ -89,7 +96,9 @@ def match_document_types(document: Document, classifier: DocumentClassifier, use
 
 
 def match_tags(document: Document, classifier: DocumentClassifier, user=None):
-    predicted_tag_ids = classifier.predict_tags(document.content) if classifier else []
+    predicted_tag_ids = (
+        classifier.predict_tags(document.suggestion_content) if classifier else []
+    )
     if user is None and document.owner is not None:
         user = document.owner
 
@@ -112,7 +121,11 @@ def match_tags(document: Document, classifier: DocumentClassifier, user=None):
 
 
 def match_storage_paths(document: Document, classifier: DocumentClassifier, user=None):
-    pred_id = classifier.predict_storage_path(document.content) if classifier else None
+    pred_id = (
+        classifier.predict_storage_path(document.suggestion_content)
+        if classifier
+        else None
+    )
     if user is None and document.owner is not None:
         user = document.owner
 
diff --git a/src/documents/models.py b/src/documents/models.py
index e93f14054..72e3996d5 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -305,6 +305,28 @@ class Document(SoftDeleteModel, ModelWithOwner):
             res += f" {self.title}"
         return res
 
+    @property
+    def suggestion_content(self):
+        """
+        Returns the document text used to generate suggestions.
+
+        If the document content length exceeds a specified limit,
+        the text is cropped to include the start and end segments.
+        Otherwise, the full content is returned.
+
+        This improves processing speed for large documents while keeping
+        enough context for accurate suggestions.
+        """
+        if not self.content or len(self.content) <= 1200000:
+            return self.content
+        else:
+            # Use 80% from the start and 20% from the end
+            # to preserve both opening and closing context.
+            head_len = 800000
+            tail_len = 200000
+
+            return " ".join((self.content[:head_len], self.content[-tail_len:]))
+
     @property
     def source_path(self) -> Path:
         if self.filename:
diff --git a/src/documents/tests/test_document_model.py b/src/documents/tests/test_document_model.py
index 87ebdb561..fc4f17e04 100644
--- a/src/documents/tests/test_document_model.py
+++ b/src/documents/tests/test_document_model.py
@@ -6,6 +6,7 @@ from unittest import mock
 
 from django.test import TestCase
 from django.test import override_settings
+from faker import Faker
 
 from documents.models import Correspondent
 from documents.models import Document
@@ -105,3 +106,27 @@ class TestDocument(TestCase):
             created=date(2020, 12, 25),
         )
         self.assertEqual(doc.get_public_filename(), "2020-12-25 test")
+
+
+def test_suggestion_content():
+    """
+    Check that the document for suggestion is cropped, only if it exceeds the length limit.
+    """
+    fake_text = Faker().text(max_nb_chars=1201000)
+
+    # Do not crop content under 1.2M chars
+    content_under_limit = fake_text[:1200000]
+    doc = Document(
+        title="test",
+        created=date(2025, 6, 1),
+        content=content_under_limit,
+    )
+    assert doc.suggestion_content == content_under_limit
+
+    # If over the limit, crop to 1M char (800K from the beginning, 200K from the end)
+    content_over_limit = fake_text[:1200001]
+    expected_cropped_content = (
+        content_over_limit[:800000] + " " + content_over_limit[-200000:]
+    )
+    doc.content = content_over_limit
+    assert doc.suggestion_content == expected_cropped_content