From 9bdbfd362f4a15f8de109ca959f04e3a7d8a39d0 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Fri, 12 Dec 2025 09:28:47 -0800 Subject: [PATCH] Merge commit from fork * Add safe regex matching with timeouts and validation * Remove redundant length check * Remove timeouterror workaround --- pyproject.toml | 1 + src/documents/matching.py | 30 ++++++++-------- src/documents/regex.py | 50 ++++++++++++++++++++++++++ src/documents/serialisers.py | 7 ++-- src/documents/tests/test_matchables.py | 16 +++++++++ uv.lock | 2 ++ 6 files changed, 88 insertions(+), 18 deletions(-) create mode 100644 src/documents/regex.py diff --git a/pyproject.toml b/pyproject.toml index 3108aacd0..60dab9f47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,7 @@ dependencies = [ "pyzbar~=0.1.9", "rapidfuzz~=3.14.0", "redis[hiredis]~=5.2.1", + "regex>=2025.9.18", "scikit-learn~=1.7.0", "setproctitle~=1.3.4", "tika-client~=0.10.0", diff --git a/src/documents/matching.py b/src/documents/matching.py index 2c8d2bf87..198ead64c 100644 --- a/src/documents/matching.py +++ b/src/documents/matching.py @@ -20,6 +20,7 @@ from documents.models import Tag from documents.models import Workflow from documents.models import WorkflowTrigger from documents.permissions import get_objects_for_user_owner_aware +from documents.regex import safe_regex_search if TYPE_CHECKING: from django.db.models import QuerySet @@ -152,7 +153,7 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user def matches(matching_model: MatchingModel, document: Document): - search_kwargs = {} + search_flags = 0 document_content = document.content @@ -161,14 +162,18 @@ def matches(matching_model: MatchingModel, document: Document): return False if matching_model.is_insensitive: - search_kwargs = {"flags": re.IGNORECASE} + search_flags = re.IGNORECASE if matching_model.matching_algorithm == MatchingModel.MATCH_NONE: return False elif matching_model.matching_algorithm == MatchingModel.MATCH_ALL: for word in _split_match(matching_model): - search_result = re.search(rf"\b{word}\b", document_content, **search_kwargs) + search_result = re.search( + rf"\b{word}\b", + document_content, + flags=search_flags, + ) if not search_result: return False log_reason( @@ -180,7 +185,7 @@ def matches(matching_model: MatchingModel, document: Document): elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY: for word in _split_match(matching_model): - if re.search(rf"\b{word}\b", document_content, **search_kwargs): + if re.search(rf"\b{word}\b", document_content, flags=search_flags): log_reason(matching_model, document, f"it contains this word: {word}") return True return False @@ -190,7 +195,7 @@ def matches(matching_model: MatchingModel, document: Document): re.search( rf"\b{re.escape(matching_model.match)}\b", document_content, - **search_kwargs, + flags=search_flags, ), ) if result: @@ -202,16 +207,11 @@ def matches(matching_model: MatchingModel, document: Document): return result elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX: - try: - match = re.search( - re.compile(matching_model.match, **search_kwargs), - document_content, - ) - except re.error: - logger.error( - f"Error while processing regular expression {matching_model.match}", - ) - return False + match = safe_regex_search( + matching_model.match, + document_content, + flags=search_flags, + ) if match: log_reason( matching_model, diff --git a/src/documents/regex.py b/src/documents/regex.py new file mode 100644 index 000000000..35acc5af0 --- /dev/null +++ b/src/documents/regex.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import logging +import textwrap + +import regex +from django.conf import settings + +logger = logging.getLogger("paperless.regex") + +REGEX_TIMEOUT_SECONDS: float = getattr(settings, "MATCH_REGEX_TIMEOUT_SECONDS", 0.1) + + +def validate_regex_pattern(pattern: str) -> None: + """ + Validate user provided regex for basic compile errors. + Raises ValueError on validation failure. + """ + + try: + regex.compile(pattern) + except regex.error as exc: + raise ValueError(exc.msg) from exc + + +def safe_regex_search(pattern: str, text: str, *, flags: int = 0): + """ + Run a regex search with a timeout. Returns a match object or None. + Validation errors and timeouts are logged and treated as no match. + """ + + try: + validate_regex_pattern(pattern) + compiled = regex.compile(pattern, flags=flags) + except (regex.error, ValueError) as exc: + logger.error( + "Error while processing regular expression %s: %s", + textwrap.shorten(pattern, width=80, placeholder="…"), + exc, + ) + return None + + try: + return compiled.search(text, timeout=REGEX_TIMEOUT_SECONDS) + except TimeoutError: + logger.warning( + "Regular expression matching timed out for pattern %s", + textwrap.shorten(pattern, width=80, placeholder="…"), + ) + return None diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index 6265d291c..f4518c04f 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -71,6 +71,7 @@ from documents.parsers import is_mime_type_supported from documents.permissions import get_document_count_filter_for_user from documents.permissions import get_groups_with_only_permission from documents.permissions import set_permissions_for_object +from documents.regex import validate_regex_pattern from documents.templating.filepath import validate_filepath_template_and_render from documents.templating.utils import convert_format_str_to_template_format from documents.validators import uri_validator @@ -141,10 +142,10 @@ class MatchingModelSerializer(serializers.ModelSerializer): and self.initial_data["matching_algorithm"] == MatchingModel.MATCH_REGEX ): try: - re.compile(match) - except re.error as e: + validate_regex_pattern(match) + except ValueError as e: raise serializers.ValidationError( - _("Invalid regular expression: %(error)s") % {"error": str(e.msg)}, + _("Invalid regular expression: %(error)s") % {"error": str(e)}, ) return match diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py index 180cf77ed..8b2a7a463 100644 --- a/src/documents/tests/test_matchables.py +++ b/src/documents/tests/test_matchables.py @@ -206,6 +206,22 @@ class TestMatching(_TestMatchingBase): def test_tach_invalid_regex(self): self._test_matching("[", "MATCH_REGEX", [], ["Don't match this"]) + def test_match_regex_timeout_returns_false(self): + tag = Tag.objects.create( + name="slow", + match=r"(a+)+$", + matching_algorithm=Tag.MATCH_REGEX, + ) + document = Document(content=("a" * 5000) + "X") + + with self.assertLogs("paperless.regex", level="WARNING") as cm: + self.assertFalse(matching.matches(tag, document)) + + self.assertTrue( + any("timed out" in message for message in cm.output), + f"Expected timeout log, got {cm.output}", + ) + def test_match_fuzzy(self): self._test_matching( "Springfield, Miss.", diff --git a/uv.lock b/uv.lock index ff0bb6b5b..69d1f50bb 100644 --- a/uv.lock +++ b/uv.lock @@ -2163,6 +2163,7 @@ dependencies = [ { name = "pyzbar", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "rapidfuzz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "redis", extra = ["hiredis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "regex", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "scikit-learn", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "setproctitle", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "tika-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, @@ -2306,6 +2307,7 @@ requires-dist = [ { name = "pyzbar", specifier = "~=0.1.9" }, { name = "rapidfuzz", specifier = "~=3.14.0" }, { name = "redis", extras = ["hiredis"], specifier = "~=5.2.1" }, + { name = "regex", specifier = ">=2025.9.18" }, { name = "scikit-learn", specifier = "~=1.7.0" }, { name = "setproctitle", specifier = "~=1.3.4" }, { name = "tika-client", specifier = "~=0.10.0" },