Normalize Unicode in workflow filename matching

2026-01-12 21:44:21 -06:00 · 2026-01-05 11:03:50 -08:00
parent b145878d50
commit 8a14548434
2 changed files with 115 additions and 9 deletions
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -2,10 +2,12 @@ from __future__ import annotations

 import logging
 import re
+import unicodedata
 from fnmatch import fnmatch
 from fnmatch import translate as fnmatch_translate
 from typing import TYPE_CHECKING

+from django.db.models import Q
 from rest_framework import serializers

 from documents.data_models import ConsumableDocument
@@ -30,6 +32,34 @@ if TYPE_CHECKING:
 logger = logging.getLogger("paperless.matching")


+def _normalize_glob_value(value: str) -> str:
+    """Normalize strings for glob-style matching (case-insensitive)."""
+
+    return unicodedata.normalize("NFC", str(value)).casefold()
+
+
+def _normalized_fnmatch(name: str, pattern: str) -> bool:
+    """Canonicalize Unicode and compare using fnmatch semantics."""
+
+    return fnmatch(_normalize_glob_value(name), _normalize_glob_value(pattern))
+
+
+def _glob_regex_variants(pattern: str) -> list[str]:
+    """
+    Build regex patterns that match both NFC and NFD forms of a glob pattern.
+    Using both forms lets DB prefilters remain Unicode-normalization agnostic.
+    """
+
+    regexes = set()
+    for normalized in {
+        unicodedata.normalize("NFC", pattern),
+        unicodedata.normalize("NFD", pattern),
+    }:
+        regex = fnmatch_translate(normalized).lstrip("^").rstrip("$")
+        regexes.add(regex)
+    return list(regexes)
+
+
 def log_reason(
    matching_model: MatchingModel | WorkflowTrigger,
    document: Document,
@@ -305,9 +335,9 @@ def consumable_document_matches_workflow(
    if (
        trigger.filter_filename is not None
        and len(trigger.filter_filename) > 0
-        and not fnmatch(
-            document.original_file.name.lower(),
-            trigger.filter_filename.lower(),
+        and not _normalized_fnmatch(
+            document.original_file.name,
+            trigger.filter_filename,
        )
    ):
        reason = (
@@ -328,7 +358,7 @@ def consumable_document_matches_workflow(
    if (
        trigger.filter_path is not None
        and len(trigger.filter_path) > 0
-        and not fnmatch(
+        and not _normalized_fnmatch(
            match_against,
            trigger.filter_path,
        )
@@ -492,9 +522,9 @@ def existing_document_matches_workflow(
        trigger.filter_filename is not None
        and len(trigger.filter_filename) > 0
        and document.original_filename is not None
-        and not fnmatch(
-            document.original_filename.lower(),
-            trigger.filter_filename.lower(),
+        and not _normalized_fnmatch(
+            document.original_filename,
+            trigger.filter_filename,
        )
    ):
        return (
@@ -573,8 +603,11 @@ def prefilter_documents_by_workflowtrigger(
        documents = documents.annotate(**annotations).filter(custom_field_q)

    if trigger.filter_filename:
-        regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
-        documents = documents.filter(original_filename__iregex=regex)
+        regexes = _glob_regex_variants(trigger.filter_filename)
+        filename_q = Q()
+        for regex in regexes:
+            filename_q |= Q(original_filename__iregex=regex)
+        documents = documents.filter(filename_q)

    return documents