Normalize Unicode in workflow filename matching

This commit is contained in:
shamoon
2026-01-05 11:03:50 -08:00
parent b145878d50
commit 8a14548434
2 changed files with 115 additions and 9 deletions

View File

@@ -2,10 +2,12 @@ from __future__ import annotations
import logging
import re
import unicodedata
from fnmatch import fnmatch
from fnmatch import translate as fnmatch_translate
from typing import TYPE_CHECKING
from django.db.models import Q
from rest_framework import serializers
from documents.data_models import ConsumableDocument
@@ -30,6 +32,34 @@ if TYPE_CHECKING:
logger = logging.getLogger("paperless.matching")
def _normalize_glob_value(value: str) -> str:
    """
    Return a canonical, case-insensitive key for glob-style matching.

    The value is coerced with ``str()``, composed to NFC, case-folded and
    then composed to NFC once more.

    The second normalization matters because full case folding can expand a
    single code point into a base letter plus a combining mark (for example
    U+01F0 folds to "j" + U+030C). When further combining marks follow, the
    folded text may no longer be in canonical order, so two canonically
    equivalent inputs that differ only in case would otherwise produce
    different keys.

    Args:
        value: Text to normalize (a file name or a glob pattern).

    Returns:
        The NFC-normalized, case-folded form of ``value``.
    """
    composed = unicodedata.normalize("NFC", str(value))
    # Re-normalize: casefold() output is not guaranteed to stay normalized.
    return unicodedata.normalize("NFC", composed.casefold())
def _normalized_fnmatch(name: str, pattern: str) -> bool:
    """
    Glob-match ``name`` against ``pattern`` after normalizing both sides.

    Both arguments are passed through ``_normalize_glob_value`` before being
    handed to ``fnmatch``, so the comparison uses the same normalized form
    for the candidate and for the pattern.

    Args:
        name: The candidate string, e.g. a file name or path.
        pattern: The glob pattern to test it against.

    Returns:
        True when the normalized name matches the normalized pattern.
    """
    candidate = _normalize_glob_value(name)
    canonical_pattern = _normalize_glob_value(pattern)
    return fnmatch(candidate, canonical_pattern)
def _glob_regex_variants(pattern: str) -> list[str]:
"""
Build regex patterns that match both NFC and NFD forms of a glob pattern.
Using both forms lets DB prefilters remain Unicode-normalization agnostic.
"""
regexes = set()
for normalized in {
unicodedata.normalize("NFC", pattern),
unicodedata.normalize("NFD", pattern),
}:
regex = fnmatch_translate(normalized).lstrip("^").rstrip("$")
regexes.add(regex)
return list(regexes)
def log_reason(
matching_model: MatchingModel | WorkflowTrigger,
document: Document,
@@ -305,9 +335,9 @@ def consumable_document_matches_workflow(
if (
trigger.filter_filename is not None
and len(trigger.filter_filename) > 0
and not fnmatch(
document.original_file.name.lower(),
trigger.filter_filename.lower(),
and not _normalized_fnmatch(
document.original_file.name,
trigger.filter_filename,
)
):
reason = (
@@ -328,7 +358,7 @@ def consumable_document_matches_workflow(
if (
trigger.filter_path is not None
and len(trigger.filter_path) > 0
and not fnmatch(
and not _normalized_fnmatch(
match_against,
trigger.filter_path,
)
@@ -492,9 +522,9 @@ def existing_document_matches_workflow(
trigger.filter_filename is not None
and len(trigger.filter_filename) > 0
and document.original_filename is not None
and not fnmatch(
document.original_filename.lower(),
trigger.filter_filename.lower(),
and not _normalized_fnmatch(
document.original_filename,
trigger.filter_filename,
)
):
return (
@@ -573,8 +603,11 @@ def prefilter_documents_by_workflowtrigger(
documents = documents.annotate(**annotations).filter(custom_field_q)
if trigger.filter_filename:
regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
documents = documents.filter(original_filename__iregex=regex)
regexes = _glob_regex_variants(trigger.filter_filename)
filename_q = Q()
for regex in regexes:
filename_q |= Q(original_filename__iregex=regex)
documents = documents.filter(filename_q)
return documents

View File

@@ -557,6 +557,50 @@ class TestWorkflows(
expected_str = f"Document filename {test_file.name} does not match"
self.assertIn(expected_str, cm.output[1])
def test_workflow_match_filename_diacritics_normalized(self):
    """
    GIVEN:
        - A consumption workflow whose filename filter contains a diacritic
    WHEN:
        - A file is consumed whose name spells that diacritic as a base
          letter followed by a combining mark (U+030C)
    THEN:
        - The workflow matches and its title override is applied
    """
    filename_trigger = WorkflowTrigger.objects.create(
        type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
        sources=f"{DocumentSource.ApiUpload},{DocumentSource.ConsumeFolder},{DocumentSource.MailFetch}",
        filter_filename="*račun*",
    )
    title_action = WorkflowAction.objects.create(
        assign_title="Diacritics matched",
    )
    title_action.save()

    workflow = Workflow.objects.create(
        name="Workflow 1",
        order=0,
    )
    workflow.triggers.add(filename_trigger)
    workflow.actions.add(title_action)
    workflow.save()

    # Copy the sample PDF under a name using "c" + COMBINING CARON.
    sample_pdf = self.SAMPLE_DIR / "simple.pdf"
    decomposed_target = self.dirs.scratch_dir / "rac\u030cun.pdf"
    copied_file = shutil.copy(sample_pdf, decomposed_target)

    with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
        consumable = ConsumableDocument(
            source=DocumentSource.ConsumeFolder,
            original_file=copied_file,
        )
        tasks.consume_file(consumable, None)

    consumed = Document.objects.first()
    self.assertEqual(consumed.title, "Diacritics matched")
def test_workflow_no_match_path(self):
"""
GIVEN:
@@ -946,6 +990,35 @@ class TestWorkflows(
self.assertEqual(doc.correspondent, self.c2)
self.assertEqual(doc.title, f"Doc created in {created.year}")
def test_document_added_filename_diacritics_normalized(self):
    """
    GIVEN:
        - A document-added workflow whose filename filter contains a
          diacritic
    WHEN:
        - The consumption-finished signal is sent for a document whose
          original filename spells that diacritic with a combining mark
          (U+030C)
    THEN:
        - The document title is set to the workflow's assigned title
    """
    filename_trigger = WorkflowTrigger.objects.create(
        type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
        filter_filename="*račun*",
    )
    title_action = WorkflowAction.objects.create(
        assign_title="Matched diacritics",
    )

    workflow = Workflow.objects.create(
        name="Workflow 1",
        order=0,
    )
    workflow.triggers.add(filename_trigger)
    workflow.actions.add(title_action)
    workflow.save()

    added_document = Document.objects.create(
        title="sample test",
        correspondent=self.c,
        original_filename="rac\u030cun.pdf",
    )

    document_consumption_finished.send(
        sender=self.__class__,
        document=added_document,
    )

    self.assertEqual(added_document.title, "Matched diacritics")
def test_document_added_no_match_filename(self):
trigger = WorkflowTrigger.objects.create(
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,