# Patch summary (unified diff touching three files under src/documents/).
#
# * matching.py -- match_correspondents, match_document_types, match_tags and
#   match_storage_paths now hand document.suggestion_content, instead of
#   document.content, to the classifier's predict_* methods. The fallback used
#   when no classifier is supplied is unchanged (None, or [] for tags).
# * models.py -- adds the Document.suggestion_content property. Falsy content
#   and content of at most 1,200,000 characters is returned as-is; longer
#   content is reduced to its first 800,000 characters and its last 200,000
#   characters, joined by a single space.
# * tests/test_document_model.py -- imports Faker and adds
#   test_suggestion_content, which builds a Document in memory (no save call)
#   and checks both sides of the 1,200,000-character threshold.
#
# NOTE(review): in this copy of the patch the line breaks are collapsed, so
#   several diff lines share one physical line; restore one diff line per text
#   line before trying to apply it.
# NOTE(review): the 1,200,000 / 800,000 / 200,000 limits are inline literals
#   in the property and are repeated in the test; consider named constants.
# NOTE(review): the over-limit case depends on
#   Faker().text(max_nb_chars=1201000) returning at least 1,200,001
#   characters -- confirm against the Faker documentation.
diff --git a/src/documents/matching.py b/src/documents/matching.py index 15a8ec443..346f9d55a 100644 --- a/src/documents/matching.py +++ b/src/documents/matching.py @@ -41,7 +41,11 @@ def log_reason( def match_correspondents(document: Document, classifier: DocumentClassifier, user=None): - pred_id = classifier.predict_correspondent(document.content) if classifier else None + pred_id = ( + classifier.predict_correspondent(document.suggestion_content) + if classifier + else None + ) if user is None and document.owner is not None: user = document.owner @@ -65,8 +69,11 @@ def match_correspondents(document: Document, classifier: DocumentClassifier, use def match_document_types(document: Document, classifier: DocumentClassifier, user=None): - pred_id = classifier.predict_document_type(document.content) if classifier else None - + pred_id = ( + classifier.predict_document_type(document.suggestion_content) + if classifier + else None + ) if user is None and document.owner is not None: user = document.owner @@ -89,7 +96,9 @@ def match_document_types(document: Document, classifier: DocumentClassifier, use def match_tags(document: Document, classifier: DocumentClassifier, user=None): - predicted_tag_ids = classifier.predict_tags(document.content) if classifier else [] + predicted_tag_ids = ( + classifier.predict_tags(document.suggestion_content) if classifier else [] + ) if user is None and document.owner is not None: user = document.owner @@ -112,7 +121,11 @@ def match_tags(document: Document, classifier: DocumentClassifier, user=None): def match_storage_paths(document: Document, classifier: DocumentClassifier, user=None): - pred_id = classifier.predict_storage_path(document.content) if classifier else None + pred_id = ( + classifier.predict_storage_path(document.suggestion_content) + if classifier + else None + ) if user is None and document.owner is not None: user = document.owner diff --git a/src/documents/models.py b/src/documents/models.py index e93f14054..72e3996d5 
100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -305,6 +305,28 @@ class Document(SoftDeleteModel, ModelWithOwner): res += f" {self.title}" return res + @property + def suggestion_content(self): + """ + Returns the document text used to generate suggestions. + + If the document content length exceeds a specified limit, + the text is cropped to include the start and end segments. + Otherwise, the full content is returned. + + This improves processing speed for large documents while keeping + enough context for accurate suggestions. + """ + if not self.content or len(self.content) <= 1200000: + return self.content + else: + # Use 80% from the start and 20% from the end + # to preserve both opening and closing context. + head_len = 800000 + tail_len = 200000 + + return " ".join((self.content[:head_len], self.content[-tail_len:])) + @property def source_path(self) -> Path: if self.filename: diff --git a/src/documents/tests/test_document_model.py b/src/documents/tests/test_document_model.py index 87ebdb561..fc4f17e04 100644 --- a/src/documents/tests/test_document_model.py +++ b/src/documents/tests/test_document_model.py @@ -6,6 +6,7 @@ from unittest import mock from django.test import TestCase from django.test import override_settings +from faker import Faker from documents.models import Correspondent from documents.models import Document @@ -105,3 +106,27 @@ class TestDocument(TestCase): created=date(2020, 12, 25), ) self.assertEqual(doc.get_public_filename(), "2020-12-25 test") + + +def test_suggestion_content(): + """ + Check that the document for suggestion is cropped, only if it exceeds the length limit. 
+ """ + fake_text = Faker().text(max_nb_chars=1201000) + + # Do not crop content under 1.2M chars + content_under_limit = fake_text[:1200000] + doc = Document( + title="test", + created=date(2025, 6, 1), + content=content_under_limit, + ) + assert doc.suggestion_content == content_under_limit + + # If over the limit, crop to 1M char (800K from the beginning, 200K from the end) + content_over_limit = fake_text[:1200001] + expected_cropped_content = ( + content_over_limit[:800000] + " " + content_over_limit[-200000:] + ) + doc.content = content_over_limit + assert doc.suggestion_content == expected_cropped_content