From 8a14548434b697d0bb276d69415207c352583d98 Mon Sep 17 00:00:00 2001
From: shamoon <4887959+shamoon@users.noreply.github.com>
Date: Mon, 5 Jan 2026 11:03:50 -0800
Subject: [PATCH] Normalize Unicode in workflow filename matching

---
Review notes (free text between "---" and the first "diff --git" line is
ignored by git am):

* _normalized_fnmatch() casefolds both operands and is now also used for
  trigger.filter_path, which previously went through fnmatch() without any
  lower-casing (case-sensitive on POSIX). Path filters therefore become
  case-insensitive too; confirm this is intended.
* Matching runs on NFC-normalized, casefold()ed text, so "?" and "[...]" in a
  glob operate on the folded code points (e.g. "ß" folds to "ss").
* _glob_regex_variants() keeps the pre-existing lstrip("^") / rstrip("$") on
  the fnmatch.translate() output. On Python 3 that output has the shape
  "(?s:...)\Z", so both strips look like no-ops; worth verifying against the
  database regex dialects in use.
* The line structure and indentation of this patch were reconstructed from a
  whitespace-collapsed copy. Hunk line counts were checked against the "@@"
  headers, but context indentation is a best guess; use
  "git apply --ignore-whitespace" if a hunk is rejected.

 src/documents/matching.py             | 51 +++++++++++++++----
 src/documents/tests/test_workflows.py | 73 +++++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 9 deletions(-)

diff --git a/src/documents/matching.py b/src/documents/matching.py
index 198ead64c..68087bf91 100644
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -2,10 +2,12 @@ from __future__ import annotations
 
 import logging
 import re
+import unicodedata
 from fnmatch import fnmatch
 from fnmatch import translate as fnmatch_translate
 from typing import TYPE_CHECKING
 
+from django.db.models import Q
 from rest_framework import serializers
 
 from documents.data_models import ConsumableDocument
@@ -30,6 +32,34 @@ if TYPE_CHECKING:
 logger = logging.getLogger("paperless.matching")
 
 
+def _normalize_glob_value(value: str) -> str:
+    """Normalize strings for glob-style matching (case-insensitive)."""
+
+    return unicodedata.normalize("NFC", str(value)).casefold()
+
+
+def _normalized_fnmatch(name: str, pattern: str) -> bool:
+    """Canonicalize Unicode and compare using fnmatch semantics."""
+
+    return fnmatch(_normalize_glob_value(name), _normalize_glob_value(pattern))
+
+
+def _glob_regex_variants(pattern: str) -> list[str]:
+    """
+    Build regex patterns that match both NFC and NFD forms of a glob pattern.
+    Using both forms lets DB prefilters remain Unicode-normalization agnostic.
+    """
+
+    regexes = set()
+    for normalized in {
+        unicodedata.normalize("NFC", pattern),
+        unicodedata.normalize("NFD", pattern),
+    }:
+        regex = fnmatch_translate(normalized).lstrip("^").rstrip("$")
+        regexes.add(regex)
+    return list(regexes)
+
+
 def log_reason(
     matching_model: MatchingModel | WorkflowTrigger,
     document: Document,
@@ -305,9 +335,9 @@ def consumable_document_matches_workflow(
     if (
         trigger.filter_filename is not None
         and len(trigger.filter_filename) > 0
-        and not fnmatch(
-            document.original_file.name.lower(),
-            trigger.filter_filename.lower(),
+        and not _normalized_fnmatch(
+            document.original_file.name,
+            trigger.filter_filename,
         )
     ):
         reason = (
@@ -328,7 +358,7 @@ def consumable_document_matches_workflow(
     if (
         trigger.filter_path is not None
         and len(trigger.filter_path) > 0
-        and not fnmatch(
+        and not _normalized_fnmatch(
             match_against,
             trigger.filter_path,
         )
@@ -492,9 +522,9 @@ def existing_document_matches_workflow(
         trigger.filter_filename is not None
         and len(trigger.filter_filename) > 0
         and document.original_filename is not None
-        and not fnmatch(
-            document.original_filename.lower(),
-            trigger.filter_filename.lower(),
+        and not _normalized_fnmatch(
+            document.original_filename,
+            trigger.filter_filename,
         )
     ):
         return (
@@ -573,8 +603,11 @@ def prefilter_documents_by_workflowtrigger(
         documents = documents.annotate(**annotations).filter(custom_field_q)
 
     if trigger.filter_filename:
-        regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
-        documents = documents.filter(original_filename__iregex=regex)
+        regexes = _glob_regex_variants(trigger.filter_filename)
+        filename_q = Q()
+        for regex in regexes:
+            filename_q |= Q(original_filename__iregex=regex)
+        documents = documents.filter(filename_q)
 
     return documents
 
diff --git a/src/documents/tests/test_workflows.py b/src/documents/tests/test_workflows.py
index 249183b6e..659219b43 100644
--- a/src/documents/tests/test_workflows.py
+++ b/src/documents/tests/test_workflows.py
@@ -557,6 +557,50 @@ class TestWorkflows(
                 expected_str = f"Document filename {test_file.name} does not match"
                 self.assertIn(expected_str, cm.output[1])
 
+    def test_workflow_match_filename_diacritics_normalized(self):
+        """
+        GIVEN:
+            - Consumption workflow filtering on filename with diacritics
+        WHEN:
+            - File with decomposed Unicode filename is consumed
+        THEN:
+            - Workflow still matches and applies overrides
+        """
+        trigger = WorkflowTrigger.objects.create(
+            type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
+            sources=f"{DocumentSource.ApiUpload},{DocumentSource.ConsumeFolder},{DocumentSource.MailFetch}",
+            filter_filename="*račun*",
+        )
+        action = WorkflowAction.objects.create(
+            assign_title="Diacritics matched",
+        )
+        action.save()
+
+        w = Workflow.objects.create(
+            name="Workflow 1",
+            order=0,
+        )
+        w.triggers.add(trigger)
+        w.actions.add(action)
+        w.save()
+
+        decomposed_name = "rac\u030cun.pdf"
+        test_file = shutil.copy(
+            self.SAMPLE_DIR / "simple.pdf",
+            self.dirs.scratch_dir / decomposed_name,
+        )
+
+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
+            tasks.consume_file(
+                ConsumableDocument(
+                    source=DocumentSource.ConsumeFolder,
+                    original_file=test_file,
+                ),
+                None,
+            )
+            document = Document.objects.first()
+            self.assertEqual(document.title, "Diacritics matched")
+
     def test_workflow_no_match_path(self):
         """
         GIVEN:
@@ -946,6 +990,35 @@ class TestWorkflows(
         self.assertEqual(doc.correspondent, self.c2)
         self.assertEqual(doc.title, f"Doc created in {created.year}")
 
+    def test_document_added_filename_diacritics_normalized(self):
+        trigger = WorkflowTrigger.objects.create(
+            type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
+            filter_filename="*račun*",
+        )
+        action = WorkflowAction.objects.create(
+            assign_title="Matched diacritics",
+        )
+        w = Workflow.objects.create(
+            name="Workflow 1",
+            order=0,
+        )
+        w.triggers.add(trigger)
+        w.actions.add(action)
+        w.save()
+
+        doc = Document.objects.create(
+            title="sample test",
+            correspondent=self.c,
+            original_filename="rac\u030cun.pdf",
+        )
+
+        document_consumption_finished.send(
+            sender=self.__class__,
+            document=doc,
+        )
+
+        self.assertEqual(doc.title, "Matched diacritics")
+
     def test_document_added_no_match_filename(self):
         trigger = WorkflowTrigger.objects.create(
             type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,