Normalize Unicode in workflow filename matching

2026-02-22 00:49:35 -06:00 · 2026-01-05 11:03:50 -08:00
parent b145878d50
commit 8a14548434
2 changed files with 115 additions and 9 deletions
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -2,10 +2,12 @@ from __future__ import annotations

 import logging
 import re
+import unicodedata
 from fnmatch import fnmatch
 from fnmatch import translate as fnmatch_translate
 from typing import TYPE_CHECKING

+from django.db.models import Q
 from rest_framework import serializers

 from documents.data_models import ConsumableDocument
@@ -30,6 +32,34 @@ if TYPE_CHECKING:
 logger = logging.getLogger("paperless.matching")


+def _normalize_glob_value(value: str) -> str:
+    """Return ``value`` as NFC-composed, case-folded text for glob matching."""
+    composed = unicodedata.normalize("NFC", str(value))
+    return composed.casefold()
+
+
+def _normalized_fnmatch(name: str, pattern: str) -> bool:
+    """Glob-match ``name`` against ``pattern`` after normalizing both sides."""
+    folded_name, folded_pattern = map(_normalize_glob_value, (name, pattern))
+    return fnmatch(folded_name, folded_pattern)
+
+
+def _glob_regex_variants(pattern: str) -> list[str]:
+    """
+    Build regex patterns that match both NFC and NFD forms of a glob pattern,
+    so DB prefilters stay Unicode-normalization agnostic. Duplicates are
+    dropped and NFC always comes first, keeping the result deterministic.
+    """

+    forms = (
+        unicodedata.normalize("NFC", pattern),
+        unicodedata.normalize("NFD", pattern),
+    )
+    # dict.fromkeys de-duplicates while keeping insertion order (unlike a set).
+    translated = (fnmatch_translate(f).lstrip("^").rstrip("$") for f in forms)
+    return list(dict.fromkeys(translated))
+
+
 def log_reason(
    matching_model: MatchingModel | WorkflowTrigger,
    document: Document,
@@ -305,9 +335,9 @@ def consumable_document_matches_workflow(
    if (
        trigger.filter_filename is not None
        and len(trigger.filter_filename) > 0
-        and not fnmatch(
-            document.original_file.name.lower(),
-            trigger.filter_filename.lower(),
+        and not _normalized_fnmatch(
+            document.original_file.name,
+            trigger.filter_filename,
        )
    ):
        reason = (
@@ -328,7 +358,7 @@ def consumable_document_matches_workflow(
    if (
        trigger.filter_path is not None
        and len(trigger.filter_path) > 0
-        and not fnmatch(
+        and not _normalized_fnmatch(
            match_against,
            trigger.filter_path,
        )
@@ -492,9 +522,9 @@ def existing_document_matches_workflow(
        trigger.filter_filename is not None
        and len(trigger.filter_filename) > 0
        and document.original_filename is not None
-        and not fnmatch(
-            document.original_filename.lower(),
-            trigger.filter_filename.lower(),
+        and not _normalized_fnmatch(
+            document.original_filename,
+            trigger.filter_filename,
        )
    ):
        return (
@@ -573,8 +603,11 @@ def prefilter_documents_by_workflowtrigger(
        documents = documents.annotate(**annotations).filter(custom_field_q)

    if trigger.filter_filename:
-        regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
-        documents = documents.filter(original_filename__iregex=regex)
+        regexes = _glob_regex_variants(trigger.filter_filename)
+        filename_q = Q()
+        for regex in regexes:
+            filename_q |= Q(original_filename__iregex=regex)
+        documents = documents.filter(filename_q)

    return documents

--- a/src/documents/tests/test_workflows.py
+++ b/src/documents/tests/test_workflows.py
@@ -557,6 +557,50 @@ class TestWorkflows(
        expected_str = f"Document filename {test_file.name} does not match"
        self.assertIn(expected_str, cm.output[1])

+    def test_workflow_match_filename_diacritics_normalized(self):
+        """
+        GIVEN:
+            - Consumption workflow filtering on filename with diacritics
+        WHEN:
+            - File with decomposed Unicode filename is consumed
+        THEN:
+            - Workflow still matches and applies overrides
+        """
+        trigger = WorkflowTrigger.objects.create(
+            type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
+            sources=f"{DocumentSource.ApiUpload},{DocumentSource.ConsumeFolder},{DocumentSource.MailFetch}",
+            filter_filename="*račun*",
+        )
+        action = WorkflowAction.objects.create(
+            assign_title="Diacritics matched",
+        )

+        w = Workflow.objects.create(
+            name="Workflow 1",
+            order=0,
+        )
+        w.triggers.add(trigger)
+        w.actions.add(action)
+        w.save()

+        decomposed_name = "rac\u030cun.pdf"  # "c" followed by U+030C (combining caron)
+        test_file = shutil.copy(
+            self.SAMPLE_DIR / "simple.pdf",
+            self.dirs.scratch_dir / decomposed_name,
+        )

+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
+            tasks.consume_file(
+                ConsumableDocument(
+                    source=DocumentSource.ConsumeFolder,
+                    original_file=test_file,
+                ),
+                None,
+            )
+            document = Document.objects.first()
+            self.assertIsNotNone(document)  # fail cleanly if nothing was consumed
+            self.assertEqual(document.title, "Diacritics matched")
+
    def test_workflow_no_match_path(self):
        """
        GIVEN:
@@ -946,6 +990,35 @@ class TestWorkflows(
        self.assertEqual(doc.correspondent, self.c2)
        self.assertEqual(doc.title, f"Doc created in {created.year}")

+    def test_document_added_filename_diacritics_normalized(self):
+        """
+        GIVEN:
+            - Document added workflow filtering on filename with diacritics
+        WHEN:
+            - Consumption-finished signal is sent for a decomposed filename
+        THEN:
+            - Document title equals the title assigned by the action
+        """
+        filename_trigger = WorkflowTrigger.objects.create(
+            type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
+            filter_filename="*račun*",
+        )
+        title_action = WorkflowAction.objects.create(assign_title="Matched diacritics")
+        workflow = Workflow.objects.create(name="Workflow 1", order=0)
+        workflow.triggers.add(filename_trigger)
+        workflow.actions.add(title_action)
+        workflow.save()

+        doc = Document.objects.create(
+            title="sample test",
+            correspondent=self.c,
+            original_filename="rac\u030cun.pdf",
+        )

+        document_consumption_finished.send(sender=self.__class__, document=doc)

+        self.assertEqual(doc.title, "Matched diacritics")
+
    def test_document_added_no_match_filename(self):
        trigger = WorkflowTrigger.objects.create(
            type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,