Normalize Unicode in workflow filename matching

2026-01-12 21:44:21 -06:00 · 2026-01-05 11:03:50 -08:00
parent b145878d50
commit 8a14548434
2 changed files with 115 additions and 9 deletions
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -2,10 +2,12 @@ from __future__ import annotations
 import logging
 import re
 import unicodedata
 from fnmatch import fnmatch
 from fnmatch import translate as fnmatch_translate
 from typing import TYPE_CHECKING
 from django.db.models import Q
 from rest_framework import serializers
 from documents.data_models import ConsumableDocument
@@ -30,6 +32,34 @@ if TYPE_CHECKING:
 logger = logging.getLogger("paperless.matching")
 def _normalize_glob_value(value: str) -> str:
    """Normalize strings for glob-style matching (case-insensitive)."""
    return unicodedata.normalize("NFC", str(value)).casefold()
 def _normalized_fnmatch(name: str, pattern: str) -> bool:
    """
    Report whether ``name`` matches the glob ``pattern`` under fnmatch rules.

    Both arguments are first passed through ``_normalize_glob_value`` so the
    comparison is made on their normalized forms rather than the raw strings.
    """
    canonical_name = _normalize_glob_value(name)
    canonical_pattern = _normalize_glob_value(pattern)
    return fnmatch(canonical_name, canonical_pattern)
 def _glob_regex_variants(pattern: str) -> list[str]:
    """
    Build regex patterns that match both NFC and NFD forms of a glob pattern.
    Using both forms lets DB prefilters remain Unicode-normalization agnostic.
    """
    regexes = set()
    for normalized in {
        unicodedata.normalize("NFC", pattern),
        unicodedata.normalize("NFD", pattern),
    }:
        regex = fnmatch_translate(normalized).lstrip("^").rstrip("$")
        regexes.add(regex)
    return list(regexes)
 def log_reason(
    matching_model: MatchingModel | WorkflowTrigger,
    document: Document,
@@ -305,9 +335,9 @@ def consumable_document_matches_workflow(
    if (
        trigger.filter_filename is not None
        and len(trigger.filter_filename) > 0
-        and not fnmatch(
+        and not _normalized_fnmatch(
-            document.original_file.name.lower(),
+            document.original_file.name,
-            trigger.filter_filename.lower(),
+            trigger.filter_filename,
        )
    ):
        reason = (
@@ -328,7 +358,7 @@ def consumable_document_matches_workflow(
    if (
        trigger.filter_path is not None
        and len(trigger.filter_path) > 0
-        and not fnmatch(
+        and not _normalized_fnmatch(
            match_against,
            trigger.filter_path,
        )
@@ -492,9 +522,9 @@ def existing_document_matches_workflow(
        trigger.filter_filename is not None
        and len(trigger.filter_filename) > 0
        and document.original_filename is not None
-        and not fnmatch(
+        and not _normalized_fnmatch(
-            document.original_filename.lower(),
+            document.original_filename,
-            trigger.filter_filename.lower(),
+            trigger.filter_filename,
        )
    ):
        return (
@@ -573,8 +603,11 @@ def prefilter_documents_by_workflowtrigger(
        documents = documents.annotate(**annotations).filter(custom_field_q)
    if trigger.filter_filename:
-        regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
+        regexes = _glob_regex_variants(trigger.filter_filename)
-        documents = documents.filter(original_filename__iregex=regex)
+        filename_q = Q()
        for regex in regexes:
            filename_q |= Q(original_filename__iregex=regex)
        documents = documents.filter(filename_q)
    return documents
--- a/src/documents/tests/test_workflows.py
+++ b/src/documents/tests/test_workflows.py
@@ -557,6 +557,50 @@ class TestWorkflows(
        expected_str = f"Document filename {test_file.name} does not match"
        self.assertIn(expected_str, cm.output[1])
    def test_workflow_match_filename_diacritics_normalized(self):
        """
        GIVEN:
            - Consumption workflow filtering on filename with diacritics
        WHEN:
            - File with decomposed Unicode filename is consumed
        THEN:
            - Workflow still matches and applies overrides
        """
        # Consumption trigger restricted by a filename glob containing a diacritic
        trigger = WorkflowTrigger.objects.create(
            type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
            sources=f"{DocumentSource.ApiUpload},{DocumentSource.ConsumeFolder},{DocumentSource.MailFetch}",
            filter_filename="*račun*",
        )
        # The assigned title is the observable effect checked at the end
        action = WorkflowAction.objects.create(
            assign_title="Diacritics matched",
        )
        action.save()
        w = Workflow.objects.create(
            name="Workflow 1",
            order=0,
        )
        w.triggers.add(trigger)
        w.actions.add(action)
        w.save()
        # "c" followed by U+030C (COMBINING CARON): the decomposed spelling
        decomposed_name = "rac\u030cun.pdf"
        # Copy the sample PDF into the scratch dir under the decomposed name
        test_file = shutil.copy(
            self.SAMPLE_DIR / "simple.pdf",
            self.dirs.scratch_dir / decomposed_name,
        )
        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
            tasks.consume_file(
                ConsumableDocument(
                    source=DocumentSource.ConsumeFolder,
                    original_file=test_file,
                ),
                None,
            )
            # The title only equals the action's value if the trigger matched
            document = Document.objects.first()
            self.assertEqual(document.title, "Diacritics matched")
    def test_workflow_no_match_path(self):
        """
        GIVEN:
@@ -946,6 +990,35 @@ class TestWorkflows(
        self.assertEqual(doc.correspondent, self.c2)
        self.assertEqual(doc.title, f"Doc created in {created.year}")
    def test_document_added_filename_diacritics_normalized(self):
        """
        GIVEN:
            - Document added workflow filtering on filename with diacritics
        WHEN:
            - document_consumption_finished is sent for a document whose
              original filename uses the decomposed Unicode spelling
        THEN:
            - Workflow still matches and the title is assigned
        """
        trigger = WorkflowTrigger.objects.create(
            type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
            filter_filename="*račun*",
        )
        action = WorkflowAction.objects.create(
            assign_title="Matched diacritics",
        )
        w = Workflow.objects.create(
            name="Workflow 1",
            order=0,
        )
        w.triggers.add(trigger)
        w.actions.add(action)
        w.save()
        # original_filename spells the caron as "c" + U+030C (COMBINING CARON)
        doc = Document.objects.create(
            title="sample test",
            correspondent=self.c,
            original_filename="rac\u030cun.pdf",
        )
        # Send the signal directly rather than running a full consumption
        document_consumption_finished.send(
            sender=self.__class__,
            document=doc,
        )
        # Checked on the same instance that was passed to the signal
        self.assertEqual(doc.title, "Matched diacritics")
    def test_document_added_no_match_filename(self):
        trigger = WorkflowTrigger.objects.create(
            type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,