mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-12-16 01:31:09 -06:00
Merge commit from fork
* Add safe regex matching with timeouts and validation * Remove redundant length check * Remove timeouterror workaround
This commit is contained in:
@@ -63,6 +63,7 @@ dependencies = [
|
|||||||
"pyzbar~=0.1.9",
|
"pyzbar~=0.1.9",
|
||||||
"rapidfuzz~=3.14.0",
|
"rapidfuzz~=3.14.0",
|
||||||
"redis[hiredis]~=5.2.1",
|
"redis[hiredis]~=5.2.1",
|
||||||
|
"regex>=2025.9.18",
|
||||||
"scikit-learn~=1.7.0",
|
"scikit-learn~=1.7.0",
|
||||||
"setproctitle~=1.3.4",
|
"setproctitle~=1.3.4",
|
||||||
"tika-client~=0.10.0",
|
"tika-client~=0.10.0",
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ from documents.models import Tag
|
|||||||
from documents.models import Workflow
|
from documents.models import Workflow
|
||||||
from documents.models import WorkflowTrigger
|
from documents.models import WorkflowTrigger
|
||||||
from documents.permissions import get_objects_for_user_owner_aware
|
from documents.permissions import get_objects_for_user_owner_aware
|
||||||
|
from documents.regex import safe_regex_search
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from django.db.models import QuerySet
|
from django.db.models import QuerySet
|
||||||
@@ -152,7 +153,7 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user
|
|||||||
|
|
||||||
|
|
||||||
def matches(matching_model: MatchingModel, document: Document):
|
def matches(matching_model: MatchingModel, document: Document):
|
||||||
search_kwargs = {}
|
search_flags = 0
|
||||||
|
|
||||||
document_content = document.content
|
document_content = document.content
|
||||||
|
|
||||||
@@ -161,14 +162,18 @@ def matches(matching_model: MatchingModel, document: Document):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
if matching_model.is_insensitive:
|
if matching_model.is_insensitive:
|
||||||
search_kwargs = {"flags": re.IGNORECASE}
|
search_flags = re.IGNORECASE
|
||||||
|
|
||||||
if matching_model.matching_algorithm == MatchingModel.MATCH_NONE:
|
if matching_model.matching_algorithm == MatchingModel.MATCH_NONE:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
elif matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
|
elif matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
|
||||||
for word in _split_match(matching_model):
|
for word in _split_match(matching_model):
|
||||||
search_result = re.search(rf"\b{word}\b", document_content, **search_kwargs)
|
search_result = re.search(
|
||||||
|
rf"\b{word}\b",
|
||||||
|
document_content,
|
||||||
|
flags=search_flags,
|
||||||
|
)
|
||||||
if not search_result:
|
if not search_result:
|
||||||
return False
|
return False
|
||||||
log_reason(
|
log_reason(
|
||||||
@@ -180,7 +185,7 @@ def matches(matching_model: MatchingModel, document: Document):
|
|||||||
|
|
||||||
elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
|
elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
|
||||||
for word in _split_match(matching_model):
|
for word in _split_match(matching_model):
|
||||||
if re.search(rf"\b{word}\b", document_content, **search_kwargs):
|
if re.search(rf"\b{word}\b", document_content, flags=search_flags):
|
||||||
log_reason(matching_model, document, f"it contains this word: {word}")
|
log_reason(matching_model, document, f"it contains this word: {word}")
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
@@ -190,7 +195,7 @@ def matches(matching_model: MatchingModel, document: Document):
|
|||||||
re.search(
|
re.search(
|
||||||
rf"\b{re.escape(matching_model.match)}\b",
|
rf"\b{re.escape(matching_model.match)}\b",
|
||||||
document_content,
|
document_content,
|
||||||
**search_kwargs,
|
flags=search_flags,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
if result:
|
if result:
|
||||||
@@ -202,16 +207,11 @@ def matches(matching_model: MatchingModel, document: Document):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
|
elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
|
||||||
try:
|
match = safe_regex_search(
|
||||||
match = re.search(
|
matching_model.match,
|
||||||
re.compile(matching_model.match, **search_kwargs),
|
document_content,
|
||||||
document_content,
|
flags=search_flags,
|
||||||
)
|
)
|
||||||
except re.error:
|
|
||||||
logger.error(
|
|
||||||
f"Error while processing regular expression {matching_model.match}",
|
|
||||||
)
|
|
||||||
return False
|
|
||||||
if match:
|
if match:
|
||||||
log_reason(
|
log_reason(
|
||||||
matching_model,
|
matching_model,
|
||||||
|
|||||||
50
src/documents/regex.py
Normal file
50
src/documents/regex.py
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import textwrap
|
||||||
|
|
||||||
|
import regex
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.regex")
|
||||||
|
|
||||||
|
REGEX_TIMEOUT_SECONDS: float = getattr(settings, "MATCH_REGEX_TIMEOUT_SECONDS", 0.1)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_regex_pattern(pattern: str) -> None:
    """
    Check that a user-supplied regular expression compiles.

    The pattern is compiled with the ``regex`` module purely for its side
    effect of raising on malformed input; the compiled object is discarded.

    Raises ValueError, carrying the compile error's message, when the
    pattern cannot be compiled. Returns None otherwise.
    """

    try:
        regex.compile(pattern)
    except regex.error as compile_error:
        # Re-raise as ValueError so callers do not need to know which regex
        # engine is in use; the original error stays attached as the cause.
        reason = compile_error.msg
        raise ValueError(reason) from compile_error
|
||||||
|
|
||||||
|
|
||||||
|
def _loggable_pattern(pattern: str, width: int = 80) -> str:
    """
    Return a single-line, length-limited rendering of a pattern for logging.

    Whitespace runs are collapsed to single spaces. If the result is longer
    than ``width`` characters it is cut to a prefix followed by "…".
    """
    collapsed = " ".join(pattern.split())
    if len(collapsed) <= width:
        return collapsed
    # Hard truncation on purpose: textwrap.shorten() drops whole words, so a
    # long pattern without whitespace would be reduced to just the placeholder.
    return collapsed[: width - 1] + "…"


def safe_regex_search(pattern: str, text: str, *, flags: int = 0):
    """
    Run a regex search with a timeout. Returns a match object or None.
    Validation errors and timeouts are logged and treated as no match.

    The pattern is validated and compiled with the given flags, then searched
    against the text with a timeout of REGEX_TIMEOUT_SECONDS. Log messages
    contain a shortened (at most 80 characters) form of the pattern.
    """

    try:
        validate_regex_pattern(pattern)
        compiled = regex.compile(pattern, flags=flags)
    except (regex.error, ValueError) as exc:
        logger.error(
            "Error while processing regular expression %s: %s",
            _loggable_pattern(pattern),
            exc,
        )
        return None

    try:
        return compiled.search(text, timeout=REGEX_TIMEOUT_SECONDS)
    except TimeoutError:
        # A search that exceeds the timeout is reported and treated as
        # "no match" rather than propagated to the caller.
        logger.warning(
            "Regular expression matching timed out for pattern %s",
            _loggable_pattern(pattern),
        )
        return None
|
||||||
@@ -71,6 +71,7 @@ from documents.parsers import is_mime_type_supported
|
|||||||
from documents.permissions import get_document_count_filter_for_user
|
from documents.permissions import get_document_count_filter_for_user
|
||||||
from documents.permissions import get_groups_with_only_permission
|
from documents.permissions import get_groups_with_only_permission
|
||||||
from documents.permissions import set_permissions_for_object
|
from documents.permissions import set_permissions_for_object
|
||||||
|
from documents.regex import validate_regex_pattern
|
||||||
from documents.templating.filepath import validate_filepath_template_and_render
|
from documents.templating.filepath import validate_filepath_template_and_render
|
||||||
from documents.templating.utils import convert_format_str_to_template_format
|
from documents.templating.utils import convert_format_str_to_template_format
|
||||||
from documents.validators import uri_validator
|
from documents.validators import uri_validator
|
||||||
@@ -141,10 +142,10 @@ class MatchingModelSerializer(serializers.ModelSerializer):
|
|||||||
and self.initial_data["matching_algorithm"] == MatchingModel.MATCH_REGEX
|
and self.initial_data["matching_algorithm"] == MatchingModel.MATCH_REGEX
|
||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
re.compile(match)
|
validate_regex_pattern(match)
|
||||||
except re.error as e:
|
except ValueError as e:
|
||||||
raise serializers.ValidationError(
|
raise serializers.ValidationError(
|
||||||
_("Invalid regular expression: %(error)s") % {"error": str(e.msg)},
|
_("Invalid regular expression: %(error)s") % {"error": str(e)},
|
||||||
)
|
)
|
||||||
return match
|
return match
|
||||||
|
|
||||||
|
|||||||
@@ -206,6 +206,22 @@ class TestMatching(_TestMatchingBase):
|
|||||||
def test_tach_invalid_regex(self):
|
def test_tach_invalid_regex(self):
|
||||||
self._test_matching("[", "MATCH_REGEX", [], ["Don't match this"])
|
self._test_matching("[", "MATCH_REGEX", [], ["Don't match this"])
|
||||||
|
|
||||||
|
def test_match_regex_timeout_returns_false(self):
    """
    GIVEN:
        - A tag using regex matching with the nested-quantifier pattern (a+)+$
        - A document whose content is 5000 "a" characters followed by "X"
    WHEN:
        - The tag is matched against the document
    THEN:
        - No match is reported
        - A "timed out" message is logged on the paperless.regex logger
    """
    long_run_content = "a" * 5000 + "X"
    slow_tag = Tag.objects.create(
        matching_algorithm=Tag.MATCH_REGEX,
        match=r"(a+)+$",
        name="slow",
    )
    doc = Document(content=long_run_content)

    with self.assertLogs("paperless.regex", level="WARNING") as cm:
        matched = matching.matches(slow_tag, doc)
        self.assertFalse(matched)

    timeout_logged = any("timed out" in message for message in cm.output)
    self.assertTrue(
        timeout_logged,
        f"Expected timeout log, got {cm.output}",
    )
|
||||||
|
|
||||||
def test_match_fuzzy(self):
|
def test_match_fuzzy(self):
|
||||||
self._test_matching(
|
self._test_matching(
|
||||||
"Springfield, Miss.",
|
"Springfield, Miss.",
|
||||||
|
|||||||
2
uv.lock
generated
2
uv.lock
generated
@@ -2163,6 +2163,7 @@ dependencies = [
|
|||||||
{ name = "pyzbar", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "pyzbar", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "rapidfuzz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "rapidfuzz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "redis", extra = ["hiredis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "redis", extra = ["hiredis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
|
{ name = "regex", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "scikit-learn", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "scikit-learn", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "setproctitle", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "setproctitle", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "tika-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "tika-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
@@ -2306,6 +2307,7 @@ requires-dist = [
|
|||||||
{ name = "pyzbar", specifier = "~=0.1.9" },
|
{ name = "pyzbar", specifier = "~=0.1.9" },
|
||||||
{ name = "rapidfuzz", specifier = "~=3.14.0" },
|
{ name = "rapidfuzz", specifier = "~=3.14.0" },
|
||||||
{ name = "redis", extras = ["hiredis"], specifier = "~=5.2.1" },
|
{ name = "redis", extras = ["hiredis"], specifier = "~=5.2.1" },
|
||||||
|
{ name = "regex", specifier = ">=2025.9.18" },
|
||||||
{ name = "scikit-learn", specifier = "~=1.7.0" },
|
{ name = "scikit-learn", specifier = "~=1.7.0" },
|
||||||
{ name = "setproctitle", specifier = "~=1.3.4" },
|
{ name = "setproctitle", specifier = "~=1.3.4" },
|
||||||
{ name = "tika-client", specifier = "~=0.10.0" },
|
{ name = "tika-client", specifier = "~=0.10.0" },
|
||||||
|
|||||||
Reference in New Issue
Block a user