mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-09-26 01:12:43 -05:00
Enhancement: Limit excessively long content length when computing suggestions (#10656)
This helps prevent excessive processing times on very large documents by limiting the text analyzed during date parsing, tag prediction, and correspondent matching. If a document's content exceeds 1.2M characters, it is cropped to roughly 1M characters: the first 800K and the last 200K, joined by a single space.
This commit is contained in:
@@ -305,6 +305,28 @@ class Document(SoftDeleteModel, ModelWithOwner):
|
||||
res += f" {self.title}"
|
||||
return res
|
||||
|
||||
@property
def suggestion_content(self):
    """
    Text that suggestion generation should work on for this document.

    Content that is empty, or no longer than 1,200,000 characters, is
    handed back unchanged. Longer content is reduced to its first
    800,000 characters and its last 200,000 characters, joined by a
    single space, so the opening and the closing of the document both
    stay available while the amount of text to analyze is bounded.
    """
    text = self.content

    # Only non-empty content above the size limit gets cropped.
    if text and len(text) > 1200000:
        # 80% of the kept text comes from the start, 20% from the end.
        leading = text[:800000]
        trailing = text[-200000:]
        return " ".join((leading, trailing))

    return text
|
||||
|
||||
@property
|
||||
def source_path(self) -> Path:
|
||||
if self.filename:
|
||||
|
Reference in New Issue
Block a user