From 8a14548434b697d0bb276d69415207c352583d98 Mon Sep 17 00:00:00 2001
From: shamoon <4887959+shamoon@users.noreply.github.com>
Date: Mon, 5 Jan 2026 11:03:50 -0800
Subject: [PATCH] Normalize Unicode in workflow filename matching

---
 src/documents/matching.py             | 51 +++++++++++++++----
 src/documents/tests/test_workflows.py | 73 +++++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 9 deletions(-)

diff --git a/src/documents/matching.py b/src/documents/matching.py
index 198ead64c..68087bf91 100644
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -2,10 +2,12 @@ from __future__ import annotations
 
 import logging
 import re
+import unicodedata
 from fnmatch import fnmatch
 from fnmatch import translate as fnmatch_translate
 from typing import TYPE_CHECKING
 
+from django.db.models import Q
 from rest_framework import serializers
 
 from documents.data_models import ConsumableDocument
@@ -30,6 +32,34 @@ if TYPE_CHECKING:
 logger = logging.getLogger("paperless.matching")
 
 
+def _normalize_glob_value(value: str) -> str:
+    """Return value as an NFC-normalized, case-folded string for glob matching."""
+    # NFC composes base + combining-mark sequences; casefold() removes case.
+    return unicodedata.normalize("NFC", str(value)).casefold()
+
+
+def _normalized_fnmatch(name: str, pattern: str) -> bool:
+    """Glob-match name against pattern, ignoring case and NFC/NFD differences."""
+    # Both sides get the same normalization so equivalent spellings compare equal.
+    return fnmatch(_normalize_glob_value(name), _normalize_glob_value(pattern))
+
+
+def _glob_regex_variants(pattern: str) -> list[str]:
+    """
+    Translate a glob into regex strings, one per distinct NFC/NFD form of it.
+    The result is unordered; it is meant to be OR-ed together in a DB prefilter.
+    """
+    # NOTE(review): translate() output looks like "(?s:...)\Z"; strips may be no-ops.
+    regexes = set()
+    for normalized in {
+        unicodedata.normalize("NFC", pattern),
+        unicodedata.normalize("NFD", pattern),
+    }:
+        regex = fnmatch_translate(normalized).lstrip("^").rstrip("$")
+        regexes.add(regex)
+    return list(regexes)
+
+
 def log_reason(
     matching_model: MatchingModel | WorkflowTrigger,
     document: Document,
@@ -305,9 +335,9 @@ def consumable_document_matches_workflow(
     if (
         trigger.filter_filename is not None
         and len(trigger.filter_filename) > 0
-        and not fnmatch(
-            document.original_file.name.lower(),
-            trigger.filter_filename.lower(),
+        and not _normalized_fnmatch(
+            document.original_file.name,
+            trigger.filter_filename,
         )
     ):
         reason = (
@@ -328,7 +358,7 @@ def consumable_document_matches_workflow(
     if (
         trigger.filter_path is not None
         and len(trigger.filter_path) > 0
-        and not fnmatch(
+        and not _normalized_fnmatch(
             match_against,
             trigger.filter_path,
         )
@@ -492,9 +522,9 @@ def existing_document_matches_workflow(
         trigger.filter_filename is not None
         and len(trigger.filter_filename) > 0
         and document.original_filename is not None
-        and not fnmatch(
-            document.original_filename.lower(),
-            trigger.filter_filename.lower(),
+        and not _normalized_fnmatch(
+            document.original_filename,
+            trigger.filter_filename,
         )
     ):
         return (
@@ -573,8 +603,11 @@ def prefilter_documents_by_workflowtrigger(
         documents = documents.annotate(**annotations).filter(custom_field_q)
 
     if trigger.filter_filename:
-        regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
-        documents = documents.filter(original_filename__iregex=regex)
+        regexes = _glob_regex_variants(trigger.filter_filename)
+        filename_q = Q()
+        for regex in regexes:
+            filename_q |= Q(original_filename__iregex=regex)
+        documents = documents.filter(filename_q)
 
     return documents
 
diff --git a/src/documents/tests/test_workflows.py b/src/documents/tests/test_workflows.py
index 249183b6e..659219b43 100644
--- a/src/documents/tests/test_workflows.py
+++ b/src/documents/tests/test_workflows.py
@@ -557,6 +557,50 @@ class TestWorkflows(
         expected_str = f"Document filename {test_file.name} does not match"
         self.assertIn(expected_str, cm.output[1])
 
+    def test_workflow_match_filename_diacritics_normalized(self):
+        """
+        GIVEN:
+            - Consumption workflow filtering on filename with diacritics
+        WHEN:
+            - File with decomposed Unicode filename is consumed
+        THEN:
+            - Workflow still matches and applies overrides
+        """
+        trigger = WorkflowTrigger.objects.create(
+            type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
+            sources=f"{DocumentSource.ApiUpload},{DocumentSource.ConsumeFolder},{DocumentSource.MailFetch}",
+            filter_filename="*račun*",
+        )
+        action = WorkflowAction.objects.create(
+            assign_title="Diacritics matched",
+        )
+        action.save()
+
+        w = Workflow.objects.create(
+            name="Workflow 1",
+            order=0,
+        )
+        w.triggers.add(trigger)
+        w.actions.add(action)
+        w.save()
+        # NFD spelling: "c" followed by U+030C (combining caron), not one code point.
+        decomposed_name = "rac\u030cun.pdf"
+        test_file = shutil.copy(
+            self.SAMPLE_DIR / "simple.pdf",
+            self.dirs.scratch_dir / decomposed_name,
+        )
+
+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
+            tasks.consume_file(
+                ConsumableDocument(
+                    source=DocumentSource.ConsumeFolder,
+                    original_file=test_file,
+                ),
+                None,
+            )
+            document = Document.objects.first()
+            self.assertEqual(document.title, "Diacritics matched")
+
     def test_workflow_no_match_path(self):
         """
         GIVEN:
@@ -946,6 +990,35 @@ class TestWorkflows(
         self.assertEqual(doc.correspondent, self.c2)
         self.assertEqual(doc.title, f"Doc created in {created.year}")
 
+    def test_document_added_filename_diacritics_normalized(self):
+        trigger = WorkflowTrigger.objects.create(
+            type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
+            filter_filename="*račun*",
+        )
+        action = WorkflowAction.objects.create(
+            assign_title="Matched diacritics",
+        )
+        w = Workflow.objects.create(
+            name="Workflow 1",
+            order=0,
+        )
+        w.triggers.add(trigger)
+        w.actions.add(action)
+        w.save()
+        # NFD spelling: "c" followed by U+030C (combining caron), not one code point.
+        doc = Document.objects.create(
+            title="sample test",
+            correspondent=self.c,
+            original_filename="rac\u030cun.pdf",
+        )
+        # Send the signal by hand rather than running a full consumption.
+        document_consumption_finished.send(
+            sender=self.__class__,
+            document=doc,
+        )
+        # NOTE(review): checked on the in-memory doc; no refresh_from_db() is done.
+        self.assertEqual(doc.title, "Matched diacritics")
+
     def test_document_added_no_match_filename(self):
         trigger = WorkflowTrigger.objects.create(
             type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,