mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-01-12 21:44:21 -06:00
Normalize Unicode in workflow filename matching
This commit is contained in:
@@ -2,10 +2,12 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import unicodedata
|
||||||
from fnmatch import fnmatch
|
from fnmatch import fnmatch
|
||||||
from fnmatch import translate as fnmatch_translate
|
from fnmatch import translate as fnmatch_translate
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from django.db.models import Q
|
||||||
from rest_framework import serializers
|
from rest_framework import serializers
|
||||||
|
|
||||||
from documents.data_models import ConsumableDocument
|
from documents.data_models import ConsumableDocument
|
||||||
@@ -30,6 +32,34 @@ if TYPE_CHECKING:
|
|||||||
logger = logging.getLogger("paperless.matching")
|
logger = logging.getLogger("paperless.matching")
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_glob_value(value: str) -> str:
    """
    Return a canonical, case-insensitive key for glob-style matching.

    The value is decomposed (NFD), case-folded, and then recomposed (NFC).
    Case folding alone can yield text that is no longer normalized, which
    would make canonically equivalent spellings compare unequal; folding the
    decomposed form and recomposing afterwards gives every canonically
    equivalent, case-insensitively equal input the same key. Recomposing
    also keeps an accented letter as a single code point where a composed
    form exists, so a glob ``?`` can match it.

    Args:
        value: Text to normalize. It is passed through ``str()`` first.

    Returns:
        The case-folded text in NFC form.
    """
    decomposed = unicodedata.normalize("NFD", str(value))
    # Re-normalize after folding: casefold() may expand or reorder characters.
    return unicodedata.normalize("NFC", decomposed.casefold())
|
||||||
|
|
||||||
|
|
||||||
|
def _normalized_fnmatch(name: str, pattern: str) -> bool:
    """
    Glob-match ``name`` against ``pattern`` after normalizing both.

    Each argument is passed through ``_normalize_glob_value`` (Unicode
    normalization plus case folding) and the results are compared with
    ``fnmatch``.
    """
    folded_name = _normalize_glob_value(name)
    folded_pattern = _normalize_glob_value(pattern)
    return fnmatch(folded_name, folded_pattern)
|
||||||
|
|
||||||
|
|
||||||
|
def _glob_regex_variants(pattern: str) -> list[str]:
|
||||||
|
"""
|
||||||
|
Build regex patterns that match both NFC and NFD forms of a glob pattern.
|
||||||
|
Using both forms lets DB prefilters remain Unicode-normalization agnostic.
|
||||||
|
"""
|
||||||
|
|
||||||
|
regexes = set()
|
||||||
|
for normalized in {
|
||||||
|
unicodedata.normalize("NFC", pattern),
|
||||||
|
unicodedata.normalize("NFD", pattern),
|
||||||
|
}:
|
||||||
|
regex = fnmatch_translate(normalized).lstrip("^").rstrip("$")
|
||||||
|
regexes.add(regex)
|
||||||
|
return list(regexes)
|
||||||
|
|
||||||
|
|
||||||
def log_reason(
|
def log_reason(
|
||||||
matching_model: MatchingModel | WorkflowTrigger,
|
matching_model: MatchingModel | WorkflowTrigger,
|
||||||
document: Document,
|
document: Document,
|
||||||
@@ -305,9 +335,9 @@ def consumable_document_matches_workflow(
|
|||||||
if (
|
if (
|
||||||
trigger.filter_filename is not None
|
trigger.filter_filename is not None
|
||||||
and len(trigger.filter_filename) > 0
|
and len(trigger.filter_filename) > 0
|
||||||
and not fnmatch(
|
and not _normalized_fnmatch(
|
||||||
document.original_file.name.lower(),
|
document.original_file.name,
|
||||||
trigger.filter_filename.lower(),
|
trigger.filter_filename,
|
||||||
)
|
)
|
||||||
):
|
):
|
||||||
reason = (
|
reason = (
|
||||||
@@ -328,7 +358,7 @@ def consumable_document_matches_workflow(
|
|||||||
if (
|
if (
|
||||||
trigger.filter_path is not None
|
trigger.filter_path is not None
|
||||||
and len(trigger.filter_path) > 0
|
and len(trigger.filter_path) > 0
|
||||||
and not fnmatch(
|
and not _normalized_fnmatch(
|
||||||
match_against,
|
match_against,
|
||||||
trigger.filter_path,
|
trigger.filter_path,
|
||||||
)
|
)
|
||||||
@@ -492,9 +522,9 @@ def existing_document_matches_workflow(
|
|||||||
trigger.filter_filename is not None
|
trigger.filter_filename is not None
|
||||||
and len(trigger.filter_filename) > 0
|
and len(trigger.filter_filename) > 0
|
||||||
and document.original_filename is not None
|
and document.original_filename is not None
|
||||||
and not fnmatch(
|
and not _normalized_fnmatch(
|
||||||
document.original_filename.lower(),
|
document.original_filename,
|
||||||
trigger.filter_filename.lower(),
|
trigger.filter_filename,
|
||||||
)
|
)
|
||||||
):
|
):
|
||||||
return (
|
return (
|
||||||
@@ -573,8 +603,11 @@ def prefilter_documents_by_workflowtrigger(
|
|||||||
documents = documents.annotate(**annotations).filter(custom_field_q)
|
documents = documents.annotate(**annotations).filter(custom_field_q)
|
||||||
|
|
||||||
if trigger.filter_filename:
|
if trigger.filter_filename:
|
||||||
regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
|
regexes = _glob_regex_variants(trigger.filter_filename)
|
||||||
documents = documents.filter(original_filename__iregex=regex)
|
filename_q = Q()
|
||||||
|
for regex in regexes:
|
||||||
|
filename_q |= Q(original_filename__iregex=regex)
|
||||||
|
documents = documents.filter(filename_q)
|
||||||
|
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
|
|||||||
@@ -557,6 +557,50 @@ class TestWorkflows(
|
|||||||
expected_str = f"Document filename {test_file.name} does not match"
|
expected_str = f"Document filename {test_file.name} does not match"
|
||||||
self.assertIn(expected_str, cm.output[1])
|
self.assertIn(expected_str, cm.output[1])
|
||||||
|
|
||||||
|
def test_workflow_match_filename_diacritics_normalized(self):
    """
    GIVEN:
        - Consumption workflow whose filename filter contains diacritics
    WHEN:
        - A file is consumed whose name spells the accented letter as a
          base letter followed by a combining mark
    THEN:
        - The workflow matches and its title override is applied
    """
    filename_trigger = WorkflowTrigger.objects.create(
        type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
        sources=f"{DocumentSource.ApiUpload},{DocumentSource.ConsumeFolder},{DocumentSource.MailFetch}",
        filter_filename="*račun*",
    )
    title_action = WorkflowAction.objects.create(assign_title="Diacritics matched")
    title_action.save()

    workflow = Workflow.objects.create(name="Workflow 1", order=0)
    workflow.triggers.add(filename_trigger)
    workflow.actions.add(title_action)
    workflow.save()

    # "c" followed by U+030C (combining caron) instead of one composed letter
    sample_pdf = self.SAMPLE_DIR / "simple.pdf"
    scratch_target = self.dirs.scratch_dir / "rac\u030cun.pdf"
    test_file = shutil.copy(sample_pdf, scratch_target)

    with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
        consumable = ConsumableDocument(
            source=DocumentSource.ConsumeFolder,
            original_file=test_file,
        )
        tasks.consume_file(consumable, None)
        document = Document.objects.first()
        self.assertEqual(document.title, "Diacritics matched")
|
||||||
|
|
||||||
def test_workflow_no_match_path(self):
|
def test_workflow_no_match_path(self):
|
||||||
"""
|
"""
|
||||||
GIVEN:
|
GIVEN:
|
||||||
@@ -946,6 +990,35 @@ class TestWorkflows(
|
|||||||
self.assertEqual(doc.correspondent, self.c2)
|
self.assertEqual(doc.correspondent, self.c2)
|
||||||
self.assertEqual(doc.title, f"Doc created in {created.year}")
|
self.assertEqual(doc.title, f"Doc created in {created.year}")
|
||||||
|
|
||||||
|
def test_document_added_filename_diacritics_normalized(self):
    """
    GIVEN:
        - Document added workflow whose filename filter contains diacritics
    WHEN:
        - The consumption-finished signal is sent for a document whose
          original filename spells the accented letter as a base letter
          followed by a combining mark
    THEN:
        - The workflow matches and the document title is overridden
    """
    filename_trigger = WorkflowTrigger.objects.create(
        type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
        filter_filename="*račun*",
    )
    title_action = WorkflowAction.objects.create(assign_title="Matched diacritics")

    workflow = Workflow.objects.create(name="Workflow 1", order=0)
    workflow.triggers.add(filename_trigger)
    workflow.actions.add(title_action)
    workflow.save()

    # "c" followed by U+030C (combining caron) instead of one composed letter
    doc = Document.objects.create(
        title="sample test",
        correspondent=self.c,
        original_filename="rac\u030cun.pdf",
    )

    document_consumption_finished.send(sender=self.__class__, document=doc)

    self.assertEqual(doc.title, "Matched diacritics")
|
||||||
|
|
||||||
def test_document_added_no_match_filename(self):
|
def test_document_added_no_match_filename(self):
|
||||||
trigger = WorkflowTrigger.objects.create(
|
trigger = WorkflowTrigger.objects.create(
|
||||||
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
|
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
|
||||||
|
|||||||
Reference in New Issue
Block a user