Compare commits

6 Commits

Author SHA1 Message Date
shamoon
e0eb6ea576 Normalize and casefold input in TitleContentFilter 2026-01-07 13:02:43 -08:00
shamoon
9d489200d9 Consolidate 2026-01-05 11:19:35 -08:00
shamoon
99294d93f9 Normalize filenames and titles to NFC 2026-01-05 11:17:16 -08:00
shamoon
d40f7b7e91 Normalize text to NFC for search and indexing 2026-01-05 11:10:21 -08:00
shamoon
8a14548434 Normalize Unicode in workflow filename matching 2026-01-05 11:03:50 -08:00
dependabot[bot]
b145878d50 Chore(deps): Bump the actions group with 4 updates (#11695)
Bumps the actions group with 4 updates: [actions/upload-artifact](https://github.com/actions/upload-artifact), [actions/cache](https://github.com/actions/cache), [actions/download-artifact](https://github.com/actions/download-artifact) and [dessant/lock-threads](https://github.com/dessant/lock-threads).


Updates `actions/upload-artifact` from 5 to 6
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](https://github.com/actions/upload-artifact/compare/v5...v6)

Updates `actions/cache` from 4 to 5
- [Release notes](https://github.com/actions/cache/releases)
- [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md)
- [Commits](https://github.com/actions/cache/compare/v4...v5)

Updates `actions/download-artifact` from 6 to 7
- [Release notes](https://github.com/actions/download-artifact/releases)
- [Commits](https://github.com/actions/download-artifact/compare/v6...v7)

Updates `dessant/lock-threads` from 5 to 6
- [Release notes](https://github.com/dessant/lock-threads/releases)
- [Changelog](https://github.com/dessant/lock-threads/blob/main/CHANGELOG.md)
- [Commits](https://github.com/dessant/lock-threads/compare/v5...v6)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-version: '6'
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: actions
- dependency-name: actions/cache
  dependency-version: '5'
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: actions
- dependency-name: actions/download-artifact
  dependency-version: '7'
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: actions
- dependency-name: dessant/lock-threads
  dependency-version: '6'
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: actions
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-01-05 08:14:15 -08:00
16 changed files with 279 additions and 371 deletions

View File

@@ -115,7 +115,7 @@ jobs:
--frozen \
mkdocs gh-deploy --force --no-history
- name: Upload artifact
uses: actions/upload-artifact@v5
uses: actions/upload-artifact@v6
with:
name: documentation
path: site/
@@ -215,7 +215,7 @@ jobs:
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
- name: Cache frontend dependencies
id: cache-frontend-deps
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: |
~/.pnpm-store
@@ -248,7 +248,7 @@ jobs:
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
- name: Cache frontend dependencies
id: cache-frontend-deps
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: |
~/.pnpm-store
@@ -301,7 +301,7 @@ jobs:
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
- name: Cache frontend dependencies
id: cache-frontend-deps
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: |
~/.pnpm-store
@@ -333,7 +333,7 @@ jobs:
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
- name: Cache frontend dependencies
id: cache-frontend-deps
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: |
~/.pnpm-store
@@ -476,7 +476,7 @@ jobs:
docker cp frontend-extract:/usr/src/paperless/src/documents/static/frontend src/documents/static/frontend/
- name: Upload frontend artifact
if: steps.build-vars.outputs.can-push == 'true'
uses: actions/upload-artifact@v5
uses: actions/upload-artifact@v6
with:
name: frontend-compiled
path: src/documents/static/frontend/
@@ -510,12 +510,12 @@ jobs:
sudo apt-get update -qq
sudo apt-get install -qq --no-install-recommends gettext liblept5
- name: Download frontend artifact
uses: actions/download-artifact@v6
uses: actions/download-artifact@v7
with:
name: frontend-compiled
path: src/documents/static/frontend/
- name: Download documentation artifact
uses: actions/download-artifact@v6
uses: actions/download-artifact@v7
with:
name: documentation
path: docs/_build/html/
@@ -578,7 +578,7 @@ jobs:
sudo chown -R 1000:1000 paperless-ngx/
tar -cJf paperless-ngx.tar.xz paperless-ngx/
- name: Upload release artifact
uses: actions/upload-artifact@v5
uses: actions/upload-artifact@v6
with:
name: release
path: dist/paperless-ngx.tar.xz
@@ -595,7 +595,7 @@ jobs:
if: github.ref_type == 'tag' && (startsWith(github.ref_name, 'v') || contains(github.ref_name, '-beta.rc'))
steps:
- name: Download release artifact
uses: actions/download-artifact@v6
uses: actions/download-artifact@v7
with:
name: release
path: ./

View File

@@ -37,7 +37,7 @@ jobs:
if: github.repository_owner == 'paperless-ngx'
runs-on: ubuntu-24.04
steps:
- uses: dessant/lock-threads@v5
- uses: dessant/lock-threads@v6
with:
issue-inactive-days: '30'
pr-inactive-days: '30'

View File

@@ -47,7 +47,7 @@ jobs:
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
- name: Cache frontend dependencies
id: cache-frontend-deps
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: |
~/.pnpm-store

View File

@@ -16,7 +16,6 @@ from pikepdf import Pdf
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.models import Document
from documents.models import Tag
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import StopConsumeTaskError
@@ -116,24 +115,6 @@ class BarcodePlugin(ConsumeTaskPlugin):
self._tiff_conversion_done = False
self.barcodes: list[Barcode] = []
def _apply_detected_asn(self, detected_asn: int) -> None:
"""
Apply a detected ASN to metadata if allowed.
"""
if (
self.metadata.skip_asn_if_exists
and Document.global_objects.filter(
archive_serial_number=detected_asn,
).exists()
):
logger.info(
f"Found ASN in barcode {detected_asn} but skipping because it already exists.",
)
return
logger.info(f"Found ASN in barcode: {detected_asn}")
self.metadata.asn = detected_asn
def run(self) -> None:
# Some operations may use PIL, override pixel setting if needed
maybe_override_pixel_limit()
@@ -205,8 +186,13 @@ class BarcodePlugin(ConsumeTaskPlugin):
# Update/overwrite an ASN if possible
# After splitting, as otherwise each split document gets the same ASN
if self.settings.barcode_enable_asn and (located_asn := self.asn) is not None:
self._apply_detected_asn(located_asn)
if (
self.settings.barcode_enable_asn
and not self.metadata.skip_asn
and (located_asn := self.asn) is not None
):
logger.info(f"Found ASN in barcode: {located_asn}")
self.metadata.asn = located_asn
def cleanup(self) -> None:
self.temp_dir.cleanup()

View File

@@ -7,6 +7,7 @@ from pathlib import Path
from typing import TYPE_CHECKING
from typing import Literal
from celery import chain
from celery import chord
from celery import group
from celery import shared_task
@@ -37,42 +38,6 @@ if TYPE_CHECKING:
logger: logging.Logger = logging.getLogger("paperless.bulk_edit")
@shared_task(bind=True)
def restore_archive_serial_numbers_task(
self,
backup: dict[int, int],
*args,
**kwargs,
) -> None:
restore_archive_serial_numbers(backup)
def release_archive_serial_numbers(doc_ids: list[int]) -> dict[int, int]:
"""
Clears ASNs on documents that are about to be replaced so new documents
can be assigned ASNs without uniqueness collisions. Returns a backup map
of doc_id -> previous ASN for potential restoration.
"""
qs = Document.objects.filter(
id__in=doc_ids,
archive_serial_number__isnull=False,
).only("pk", "archive_serial_number")
backup = dict(qs.values_list("pk", "archive_serial_number"))
qs.update(archive_serial_number=None)
logger.info(f"Released archive serial numbers for documents {list(backup.keys())}")
return backup
def restore_archive_serial_numbers(backup: dict[int, int]) -> None:
"""
Restores ASNs using the provided backup map, intended for
rollback when replacement consumption fails.
"""
for doc_id, asn in backup.items():
Document.objects.filter(pk=doc_id).update(archive_serial_number=asn)
logger.info(f"Restored archive serial numbers for documents {list(backup.keys())}")
def set_correspondent(
doc_ids: list[int],
correspondent: Correspondent,
@@ -421,7 +386,6 @@ def merge(
merged_pdf = pikepdf.new()
version: str = merged_pdf.pdf_version
handoff_asn: int | None = None
# use doc_ids to preserve order
for doc_id in doc_ids:
doc = qs.get(id=doc_id)
@@ -437,8 +401,6 @@ def merge(
version = max(version, pdf.pdf_version)
merged_pdf.pages.extend(pdf.pages)
affected_docs.append(doc.id)
if handoff_asn is None and doc.archive_serial_number is not None:
handoff_asn = doc.archive_serial_number
except Exception as e:
logger.exception(
f"Error merging document {doc.id}, it will not be included in the merge: {e}",
@@ -464,8 +426,6 @@ def merge(
DocumentMetadataOverrides.from_document(metadata_document)
)
overrides.title = metadata_document.title + " (merged)"
if metadata_document.archive_serial_number is not None:
handoff_asn = metadata_document.archive_serial_number
else:
overrides = DocumentMetadataOverrides()
else:
@@ -473,11 +433,8 @@ def merge(
if user is not None:
overrides.owner_id = user.id
if not delete_originals:
overrides.skip_asn_if_exists = True
if delete_originals and handoff_asn is not None:
overrides.asn = handoff_asn
# Avoid copying or detecting ASN from merged PDFs to prevent collision
overrides.skip_asn = True
logger.info("Adding merged document to the task queue.")
@@ -490,20 +447,12 @@ def merge(
)
if delete_originals:
backup = release_archive_serial_numbers(affected_docs)
logger.info(
"Queueing removal of original documents after consumption of merged document",
)
try:
consume_task.apply_async(
link=[delete.si(affected_docs)],
link_error=[restore_archive_serial_numbers_task.s(backup)],
)
except Exception:
restore_archive_serial_numbers(backup)
raise
else:
consume_task.delay()
chain(consume_task, delete.si(affected_docs)).delay()
else:
consume_task.delay()
return "OK"
@@ -545,8 +494,6 @@ def split(
overrides.title = f"{doc.title} (split {idx + 1})"
if user is not None:
overrides.owner_id = user.id
if not delete_originals:
overrides.skip_asn_if_exists = True
logger.info(
f"Adding split document with pages {split_doc} to the task queue.",
)
@@ -561,20 +508,10 @@ def split(
)
if delete_originals:
backup = release_archive_serial_numbers([doc.id])
logger.info(
"Queueing removal of original document after consumption of the split documents",
)
try:
chord(
header=consume_tasks,
body=delete.si([doc.id]),
).apply_async(
link_error=[restore_archive_serial_numbers_task.s(backup)],
)
except Exception:
restore_archive_serial_numbers(backup)
raise
chord(header=consume_tasks, body=delete.si([doc.id])).delay()
else:
group(consume_tasks).delay()
@@ -677,10 +614,7 @@ def edit_pdf(
)
if user is not None:
overrides.owner_id = user.id
if not delete_original:
overrides.skip_asn_if_exists = True
if delete_original and len(pdf_docs) == 1:
overrides.asn = doc.archive_serial_number
for idx, pdf in enumerate(pdf_docs, start=1):
filepath: Path = (
Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
@@ -699,17 +633,7 @@ def edit_pdf(
)
if delete_original:
backup = release_archive_serial_numbers([doc.id])
try:
chord(
header=consume_tasks,
body=delete.si([doc.id]),
).apply_async(
link_error=[restore_archive_serial_numbers_task.s(backup)],
)
except Exception:
restore_archive_serial_numbers(backup)
raise
chord(header=consume_tasks, body=delete.si([doc.id])).delay()
else:
group(consume_tasks).delay()
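
The hunks above replace the manual ASN backup/restore rollback with plain Celery canvas primitives: a `chain` consumes the merged PDF and then deletes the originals, while `split` and `edit_pdf` use a `chord` so deletion only runs once every consume task has finished. A minimal sketch of the two patterns, with hypothetical `consume_file` and `delete` tasks standing in for the project's real ones:

```python
from celery import Celery, chain, chord, group

# In-memory broker/backend so the sketch is self-contained.
app = Celery("sketch", broker="memory://", backend="cache+memory://")

@app.task
def consume_file(path):
    return f"consumed {path}"

@app.task
def delete(doc_ids):
    return f"deleted {doc_ids}"

# merge + delete_originals: consume the merged PDF, then delete the originals.
# delete.si(...) is an immutable signature, so it ignores the consume result.
chain(consume_file.s("/tmp/merged.pdf"), delete.si([1, 2, 3])).delay()

# split / edit_pdf + delete_original: the chord body (delete) runs only after
# every consume task in the header has completed.
consume_tasks = [consume_file.s(f"/tmp/page_{n}.pdf") for n in (1, 2)]
chord(header=consume_tasks, body=delete.si([10])).delay()

# Without deletion, the consume tasks are simply dispatched as a group.
group(consume_tasks).delay()
```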

View File

@@ -46,6 +46,7 @@ from documents.signals.handlers import run_workflows
from documents.templating.workflows import parse_w_workflow_placeholders
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import normalize_nfc
from documents.utils import run_subprocess
from paperless_mail.parsers import MailDocumentParser
@@ -111,7 +112,12 @@ class ConsumerPluginMixin:
self.renew_logging_group()
self.filename = self.metadata.filename or self.input_doc.original_file.name
self.metadata.filename = normalize_nfc(self.metadata.filename)
self.metadata.title = normalize_nfc(self.metadata.title)
self.filename = normalize_nfc(
self.metadata.filename or self.input_doc.original_file.name,
)
def _send_progress(
self,
@@ -652,6 +658,8 @@ class ConsumerPlugin(
f"Error occurred parsing title override '{self.metadata.title}', falling back to original. Exception: {e}",
)
title = normalize_nfc(title)
file_for_checksum = (
self.unmodified_original
if self.unmodified_original is not None
@@ -696,7 +704,7 @@ class ConsumerPlugin(
pk=self.metadata.storage_path_id,
)
if self.metadata.asn is not None:
if self.metadata.asn is not None and not self.metadata.skip_asn:
document.archive_serial_number = self.metadata.asn
if self.metadata.owner_id:
@@ -812,8 +820,8 @@ class ConsumerPreflightPlugin(
"""
Check that if override_asn is given, it is unique and within a valid range
"""
if self.metadata.asn is None:
# if ASN is None
if self.metadata.skip_asn or self.metadata.asn is None:
# if skip is set or ASN is None
return
# Validate the range is above zero and less than uint32_t max
# otherwise, Whoosh can't handle it in the index

View File

@@ -30,7 +30,7 @@ class DocumentMetadataOverrides:
change_users: list[int] | None = None
change_groups: list[int] | None = None
custom_fields: dict | None = None
skip_asn_if_exists: bool = False
skip_asn: bool = False
def update(self, other: "DocumentMetadataOverrides") -> "DocumentMetadataOverrides":
"""
@@ -50,8 +50,8 @@ class DocumentMetadataOverrides:
self.storage_path_id = other.storage_path_id
if other.owner_id is not None:
self.owner_id = other.owner_id
if other.skip_asn_if_exists:
self.skip_asn_if_exists = True
if other.skip_asn:
self.skip_asn = True
# merge
if self.tag_ids is None:
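
The renamed flag stays one-way when overrides are merged: once any incoming override sets it, later updates cannot clear it. A short usage sketch (assuming the dataclass import used elsewhere in this diff), mirroring the `testMetadataOverridesSkipAsnPropagation` test added further down:

```python
from documents.data_models import DocumentMetadataOverrides

base = DocumentMetadataOverrides()
incoming = DocumentMetadataOverrides(skip_asn=True)

# update() only ever turns the flag on...
base.update(incoming)
assert base.skip_asn is True

# ...and a later update without the flag leaves it set.
base.update(DocumentMetadataOverrides())
assert base.skip_asn is True
```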

View File

@@ -6,6 +6,7 @@ from django.conf import settings
from documents.models import Document
from documents.templating.filepath import validate_filepath_template_and_render
from documents.templating.utils import convert_format_str_to_template_format
from documents.utils import normalize_nfc
def create_source_path_directory(source_path: Path) -> None:
@@ -55,11 +56,11 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
"""
if archive_filename:
old_filename: Path | None = (
Path(doc.archive_filename) if doc.archive_filename else None
Path(normalize_nfc(doc.archive_filename)) if doc.archive_filename else None
)
root = settings.ARCHIVE_DIR
else:
old_filename = Path(doc.filename) if doc.filename else None
old_filename = Path(normalize_nfc(doc.filename)) if doc.filename else None
root = settings.ORIGINALS_DIR
# If generating archive filenames, try to make a name that is similar to
@@ -91,7 +92,7 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
)
if new_filename == old_filename:
# still the same as before.
return new_filename
return Path(normalize_nfc(str(new_filename)))
if (root / new_filename).exists():
counter += 1
@@ -119,7 +120,7 @@ def format_filename(document: Document, template_str: str) -> str | None:
"none",
) # backward compatibility
return rendered_filename
return normalize_nfc(rendered_filename)
def generate_filename(
@@ -174,4 +175,4 @@ def generate_filename(
if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
full_path = full_path.with_suffix(full_path.suffix + ".gpg")
return full_path
return Path(normalize_nfc(str(full_path)))

View File

@@ -41,6 +41,7 @@ from documents.models import PaperlessTask
from documents.models import ShareLink
from documents.models import StoragePath
from documents.models import Tag
from documents.utils import normalize_nfc
if TYPE_CHECKING:
from collections.abc import Callable
@@ -162,7 +163,11 @@ class TitleContentFilter(Filter):
def filter(self, qs, value):
value = value.strip() if isinstance(value, str) else value
if value:
return qs.filter(Q(title__icontains=value) | Q(content__icontains=value))
normalized = normalize_nfc(value) or ""
folded = normalized.casefold()
return qs.filter(
Q(title__icontains=folded) | Q(content__icontains=folded),
)
else:
return qs
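
The filter above now NFC-normalizes and `casefold()`s the value before the `icontains` lookups. A standalone sketch of what `casefold()` adds over plain `lower()`:

```python
# casefold() performs full Unicode case folding, lower() does not:
assert "Straße".lower() == "straße"      # ß is left alone
assert "Straße".casefold() == "strasse"  # ß folds to "ss"

# so comparisons via casefold() catch matches that lower() misses:
assert "STRASSE".lower() != "Straße".lower()
assert "STRASSE".casefold() == "Straße".casefold()
```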

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import logging
import math
import re
import unicodedata
from collections import Counter
from contextlib import contextmanager
from datetime import datetime
@@ -58,6 +59,14 @@ if TYPE_CHECKING:
logger = logging.getLogger("paperless.index")
def _normalize_for_index(value: str | None) -> str | None:
"""Normalize text to NFC for consistent search/index matching."""
if value is None:
return None
return unicodedata.normalize("NFC", value)
def get_schema() -> Schema:
return Schema(
id=NUMERIC(stored=True, unique=True),
@@ -163,37 +172,41 @@ def update_document(writer: AsyncWriter, doc: Document) -> None:
viewer_ids: str = ",".join([str(u.id) for u in users_with_perms])
writer.update_document(
id=doc.pk,
title=doc.title,
content=doc.content,
correspondent=doc.correspondent.name if doc.correspondent else None,
title=_normalize_for_index(doc.title),
content=_normalize_for_index(doc.content),
correspondent=_normalize_for_index(
doc.correspondent.name if doc.correspondent else None,
),
correspondent_id=doc.correspondent.id if doc.correspondent else None,
has_correspondent=doc.correspondent is not None,
tag=tags if tags else None,
tag=_normalize_for_index(tags) if tags else None,
tag_id=tags_ids if tags_ids else None,
has_tag=len(tags) > 0,
type=doc.document_type.name if doc.document_type else None,
type=_normalize_for_index(
doc.document_type.name if doc.document_type else None,
),
type_id=doc.document_type.id if doc.document_type else None,
has_type=doc.document_type is not None,
created=datetime.combine(doc.created, time.min),
added=doc.added,
asn=asn,
modified=doc.modified,
path=doc.storage_path.name if doc.storage_path else None,
path=_normalize_for_index(doc.storage_path.name if doc.storage_path else None),
path_id=doc.storage_path.id if doc.storage_path else None,
has_path=doc.storage_path is not None,
notes=notes,
notes=_normalize_for_index(notes),
num_notes=len(notes),
custom_fields=custom_fields,
custom_fields=_normalize_for_index(custom_fields),
custom_field_count=len(doc.custom_fields.all()),
has_custom_fields=len(custom_fields) > 0,
custom_fields_id=custom_fields_ids if custom_fields_ids else None,
owner=doc.owner.username if doc.owner else None,
owner=_normalize_for_index(doc.owner.username if doc.owner else None),
owner_id=doc.owner.id if doc.owner else None,
has_owner=doc.owner is not None,
viewer_id=viewer_ids if viewer_ids else None,
checksum=doc.checksum,
page_count=doc.page_count,
original_filename=doc.original_filename,
original_filename=_normalize_for_index(doc.original_filename),
is_shared=len(viewer_ids) > 0,
)
logger.debug(f"Index updated for document {doc.pk}.")
@@ -421,7 +434,7 @@ class LocalDateParser(English):
class DelayedFullTextQuery(DelayedQuery):
def _get_query(self) -> tuple:
q_str = self.query_params["query"]
q_str = _normalize_for_index(self.query_params["query"]) or ""
q_str = rewrite_natural_date_keywords(q_str)
qp = MultifieldParser(
[
@@ -460,7 +473,12 @@ class DelayedFullTextQuery(DelayedQuery):
class DelayedMoreLikeThisQuery(DelayedQuery):
def _get_query(self) -> tuple:
more_like_doc_id = int(self.query_params["more_like_id"])
content = Document.objects.get(id=more_like_doc_id).content
content = (
_normalize_for_index(
Document.objects.get(id=more_like_doc_id).content,
)
or ""
)
docnum = self.searcher.document_number(id=more_like_doc_id)
kts = self.searcher.key_terms_from_text(
@@ -488,6 +506,7 @@ def autocomplete(
Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions
and without scoring
"""
term = _normalize_for_index(term) or ""
terms = []
with ix.searcher(weighting=TF_IDF()) as s:
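
The new `_normalize_for_index` helper exists because the same visible text can be stored as composed (NFC) or decomposed (NFD) code points, and Whoosh compares terms exactly. A standalone sketch using the same string as the search test added further down:

```python
import unicodedata

decomposed = "certida\u0303o de nascimento"  # "a" followed by a combining tilde
composed = "certidão de nascimento"          # what a user types into the query box

# Identical on screen, but different code point sequences:
assert decomposed != composed

def nfc(value: str) -> str:
    return unicodedata.normalize("NFC", value)

# Normalizing both index-time and query-time text makes the terms agree:
assert nfc(decomposed) == nfc(composed) == "certidão de nascimento"
```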

View File

@@ -2,10 +2,12 @@ from __future__ import annotations
import logging
import re
import unicodedata
from fnmatch import fnmatch
from fnmatch import translate as fnmatch_translate
from typing import TYPE_CHECKING
from django.db.models import Q
from rest_framework import serializers
from documents.data_models import ConsumableDocument
@@ -21,6 +23,7 @@ from documents.models import Workflow
from documents.models import WorkflowTrigger
from documents.permissions import get_objects_for_user_owner_aware
from documents.regex import safe_regex_search
from documents.utils import normalize_nfc
if TYPE_CHECKING:
from django.db.models import QuerySet
@@ -30,6 +33,34 @@ if TYPE_CHECKING:
logger = logging.getLogger("paperless.matching")
def _normalize_glob_value(value: str) -> str:
"""Normalize strings for glob-style matching (case-insensitive)."""
return (normalize_nfc(value) or "").casefold()
def _normalized_fnmatch(name: str, pattern: str) -> bool:
"""Canonicalize Unicode and compare using fnmatch semantics."""
return fnmatch(_normalize_glob_value(name), _normalize_glob_value(pattern))
def _glob_regex_variants(pattern: str) -> list[str]:
"""
Build regex patterns that match both NFC and NFD forms of a glob pattern.
Using both forms lets DB prefilters remain Unicode-normalization agnostic.
"""
regexes = set()
for normalized in {
normalize_nfc(pattern) or "",
unicodedata.normalize("NFD", pattern),
}:
regex = fnmatch_translate(normalized).lstrip("^").rstrip("$")
regexes.add(regex)
return list(regexes)
def log_reason(
matching_model: MatchingModel | WorkflowTrigger,
document: Document,
@@ -305,9 +336,9 @@ def consumable_document_matches_workflow(
if (
trigger.filter_filename is not None
and len(trigger.filter_filename) > 0
and not fnmatch(
document.original_file.name.lower(),
trigger.filter_filename.lower(),
and not _normalized_fnmatch(
document.original_file.name,
trigger.filter_filename,
)
):
reason = (
@@ -328,7 +359,7 @@ def consumable_document_matches_workflow(
if (
trigger.filter_path is not None
and len(trigger.filter_path) > 0
and not fnmatch(
and not _normalized_fnmatch(
match_against,
trigger.filter_path,
)
@@ -492,9 +523,9 @@ def existing_document_matches_workflow(
trigger.filter_filename is not None
and len(trigger.filter_filename) > 0
and document.original_filename is not None
and not fnmatch(
document.original_filename.lower(),
trigger.filter_filename.lower(),
and not _normalized_fnmatch(
document.original_filename,
trigger.filter_filename,
)
):
return (
@@ -573,8 +604,11 @@ def prefilter_documents_by_workflowtrigger(
documents = documents.annotate(**annotations).filter(custom_field_q)
if trigger.filter_filename:
regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
documents = documents.filter(original_filename__iregex=regex)
regexes = _glob_regex_variants(trigger.filter_filename)
filename_q = Q()
for regex in regexes:
filename_q |= Q(original_filename__iregex=regex)
documents = documents.filter(filename_q)
return documents
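
The matching changes above apply the same normalization to glob-style filters, because a filesystem or upload client may hand over a filename in decomposed form while the trigger pattern is typed composed. A standalone sketch of the `fnmatch` behaviour being fixed; `_norm` below is an illustrative stand-in for `_normalize_glob_value`:

```python
import unicodedata
from fnmatch import fnmatch

pattern = "*račun*"            # composed form, as typed in the workflow trigger
filename = "rac\u030cun.pdf"   # decomposed form, as a filesystem may deliver it

# Plain lowercased fnmatch fails: "č" != "c" + U+030C at the code point level.
assert not fnmatch(filename.lower(), pattern.lower())

def _norm(value: str) -> str:
    # Illustrative stand-in for _normalize_glob_value: NFC, then casefold.
    return unicodedata.normalize("NFC", value).casefold()

# Normalizing both sides first makes the glob match again.
assert fnmatch(_norm(filename), _norm(pattern))
```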

View File

@@ -89,6 +89,23 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self.assertEqual(len(results), 0)
self.assertCountEqual(response.data["all"], [])
def test_search_handles_diacritics_normalization(self):
doc = Document.objects.create(
title="certida\u0303o de nascimento",
content="birth record without keyword",
checksum="D",
pk=10,
)
with AsyncWriter(index.open_index()) as writer:
index.update_document(writer, doc)
response = self.client.get("/api/documents/?query=certidão")
self.assertEqual(response.status_code, status.HTTP_200_OK)
results = response.data["results"]
self.assertEqual(response.data["count"], 1)
self.assertEqual(len(results), 1)
self.assertEqual(results[0]["id"], doc.id)
def test_search_custom_field_ordering(self):
custom_field = CustomField.objects.create(
name="Sortable field",

View File

@@ -602,21 +602,23 @@ class TestPDFActions(DirectoriesMixin, TestCase):
expected_filename,
)
self.assertEqual(consume_file_args[1].title, None)
# No metadata_document_id, delete_originals False, so ASN should be None
self.assertIsNone(consume_file_args[1].asn)
self.assertTrue(consume_file_args[1].skip_asn)
# With metadata_document_id overrides
result = bulk_edit.merge(doc_ids, metadata_document_id=metadata_document_id)
consume_file_args, _ = mock_consume_file.call_args
self.assertEqual(consume_file_args[1].title, "B (merged)")
self.assertEqual(consume_file_args[1].created, self.doc2.created)
self.assertTrue(consume_file_args[1].skip_asn)
self.assertEqual(result, "OK")
@mock.patch("documents.bulk_edit.delete.si")
@mock.patch("documents.tasks.consume_file.s")
@mock.patch("documents.bulk_edit.chain")
def test_merge_and_delete_originals(
self,
mock_chain,
mock_consume_file,
mock_delete_documents,
):
@@ -630,12 +632,6 @@ class TestPDFActions(DirectoriesMixin, TestCase):
- Document deletion task should be called
"""
doc_ids = [self.doc1.id, self.doc2.id, self.doc3.id]
self.doc1.archive_serial_number = 101
self.doc2.archive_serial_number = 102
self.doc3.archive_serial_number = 103
self.doc1.save()
self.doc2.save()
self.doc3.save()
result = bulk_edit.merge(doc_ids, delete_originals=True)
self.assertEqual(result, "OK")
@@ -646,8 +642,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
mock_consume_file.assert_called()
mock_delete_documents.assert_called()
consume_sig = mock_consume_file.return_value
consume_sig.apply_async.assert_called_once()
mock_chain.assert_called_once()
consume_file_args, _ = mock_consume_file.call_args
self.assertEqual(
@@ -655,7 +650,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
expected_filename,
)
self.assertEqual(consume_file_args[1].title, None)
self.assertEqual(consume_file_args[1].asn, 101)
self.assertTrue(consume_file_args[1].skip_asn)
delete_documents_args, _ = mock_delete_documents.call_args
self.assertEqual(
@@ -663,92 +658,6 @@ class TestPDFActions(DirectoriesMixin, TestCase):
doc_ids,
)
self.doc1.refresh_from_db()
self.doc2.refresh_from_db()
self.doc3.refresh_from_db()
self.assertIsNone(self.doc1.archive_serial_number)
self.assertIsNone(self.doc2.archive_serial_number)
self.assertIsNone(self.doc3.archive_serial_number)
@mock.patch("documents.bulk_edit.delete.si")
@mock.patch("documents.tasks.consume_file.s")
def test_merge_and_delete_originals_restore_on_failure(
self,
mock_consume_file,
mock_delete_documents,
):
"""
GIVEN:
- Existing documents
WHEN:
- Merge action with deleting documents is called with 1 document
- Error occurs when queuing consume file task
THEN:
- Archive serial numbers are restored
"""
doc_ids = [self.doc1.id]
self.doc1.archive_serial_number = 111
self.doc1.save()
sig = mock.Mock()
sig.apply_async.side_effect = Exception("boom")
mock_consume_file.return_value = sig
with self.assertRaises(Exception):
bulk_edit.merge(doc_ids, delete_originals=True)
self.doc1.refresh_from_db()
self.assertEqual(self.doc1.archive_serial_number, 111)
@mock.patch("documents.bulk_edit.delete.si")
@mock.patch("documents.tasks.consume_file.s")
def test_merge_and_delete_originals_metadata_handoff(
self,
mock_consume_file,
mock_delete_documents,
):
"""
GIVEN:
- Existing documents with ASNs
WHEN:
- Merge with delete_originals=True and metadata_document_id set
THEN:
- Handoff ASN uses metadata document ASN
"""
doc_ids = [self.doc1.id, self.doc2.id]
self.doc1.archive_serial_number = 101
self.doc2.archive_serial_number = 202
self.doc1.save()
self.doc2.save()
result = bulk_edit.merge(
doc_ids,
metadata_document_id=self.doc2.id,
delete_originals=True,
)
self.assertEqual(result, "OK")
consume_file_args, _ = mock_consume_file.call_args
self.assertEqual(consume_file_args[1].asn, 202)
def test_restore_archive_serial_numbers_task(self):
"""
GIVEN:
- Existing document with no archive serial number
WHEN:
- Restore archive serial number task is called with backup data
THEN:
- Document archive serial number is restored
"""
self.doc1.archive_serial_number = 444
self.doc1.save()
Document.objects.filter(pk=self.doc1.id).update(archive_serial_number=None)
backup = {self.doc1.id: 444}
bulk_edit.restore_archive_serial_numbers_task(backup)
self.doc1.refresh_from_db()
self.assertEqual(self.doc1.archive_serial_number, 444)
@mock.patch("documents.tasks.consume_file.s")
def test_merge_with_archive_fallback(self, mock_consume_file):
"""
@@ -817,7 +726,6 @@ class TestPDFActions(DirectoriesMixin, TestCase):
self.assertEqual(mock_consume_file.call_count, 2)
consume_file_args, _ = mock_consume_file.call_args
self.assertEqual(consume_file_args[1].title, "B (split 2)")
self.assertIsNone(consume_file_args[1].asn)
self.assertEqual(result, "OK")
@@ -842,8 +750,6 @@ class TestPDFActions(DirectoriesMixin, TestCase):
"""
doc_ids = [self.doc2.id]
pages = [[1, 2], [3]]
self.doc2.archive_serial_number = 200
self.doc2.save()
result = bulk_edit.split(doc_ids, pages, delete_originals=True)
self.assertEqual(result, "OK")
@@ -861,42 +767,6 @@ class TestPDFActions(DirectoriesMixin, TestCase):
doc_ids,
)
self.doc2.refresh_from_db()
self.assertIsNone(self.doc2.archive_serial_number)
@mock.patch("documents.bulk_edit.delete.si")
@mock.patch("documents.tasks.consume_file.s")
@mock.patch("documents.bulk_edit.chord")
def test_split_restore_on_failure(
self,
mock_chord,
mock_consume_file,
mock_delete_documents,
):
"""
GIVEN:
- Existing documents
WHEN:
- Split action with deleting documents is called with 1 document and 2 page groups
- Error occurs when queuing chord task
THEN:
- Archive serial numbers are restored
"""
doc_ids = [self.doc2.id]
pages = [[1, 2]]
self.doc2.archive_serial_number = 222
self.doc2.save()
sig = mock.Mock()
sig.apply_async.side_effect = Exception("boom")
mock_chord.return_value = sig
result = bulk_edit.split(doc_ids, pages, delete_originals=True)
self.assertEqual(result, "OK")
self.doc2.refresh_from_db()
self.assertEqual(self.doc2.archive_serial_number, 222)
@mock.patch("documents.tasks.consume_file.delay")
@mock.patch("pikepdf.Pdf.save")
def test_split_with_errors(self, mock_save_pdf, mock_consume_file):
@@ -1097,49 +967,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
mock_chord.return_value.delay.return_value = None
doc_ids = [self.doc2.id]
operations = [{"page": 1}, {"page": 2}]
self.doc2.archive_serial_number = 250
self.doc2.save()
result = bulk_edit.edit_pdf(doc_ids, operations, delete_original=True)
self.assertEqual(result, "OK")
mock_chord.assert_called_once()
consume_file_args, _ = mock_consume_file.call_args
self.assertEqual(consume_file_args[1].asn, 250)
self.doc2.refresh_from_db()
self.assertIsNone(self.doc2.archive_serial_number)
@mock.patch("documents.bulk_edit.delete.si")
@mock.patch("documents.tasks.consume_file.s")
@mock.patch("documents.bulk_edit.chord")
def test_edit_pdf_restore_on_failure(
self,
mock_chord,
mock_consume_file,
mock_delete_documents,
):
"""
GIVEN:
- Existing document
WHEN:
- edit_pdf is called with delete_original=True
- Error occurs when queuing chord task
THEN:
- Archive serial numbers are restored
"""
doc_ids = [self.doc2.id]
operations = [{"page": 1}]
self.doc2.archive_serial_number = 333
self.doc2.save()
sig = mock.Mock()
sig.apply_async.side_effect = Exception("boom")
mock_chord.return_value = sig
with self.assertRaises(Exception):
bulk_edit.edit_pdf(doc_ids, operations, delete_original=True)
self.doc2.refresh_from_db()
self.assertEqual(self.doc2.archive_serial_number, 333)
@mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
def test_edit_pdf_with_update_document(self, mock_update_document):

View File

@@ -14,7 +14,6 @@ from django.test import override_settings
from django.utils import timezone
from guardian.core import ObjectPermissionChecker
from documents.barcodes import BarcodePlugin
from documents.consumer import ConsumerError
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
@@ -291,6 +290,23 @@ class TestConsumer(
self._assert_first_last_send_progress()
def test_override_filename_normalized(self):
filename = self.get_test_file()
override_filename = "Inhaltsu\u0308bersicht.pdf"
with self.get_consumer(
filename,
DocumentMetadataOverrides(filename=override_filename),
) as consumer:
consumer.run()
document = Document.objects.first()
self.assertIsNotNone(document)
self.assertEqual(document.original_filename, "Inhaltsübersicht.pdf")
self.assertEqual(document.title, "Inhaltsübersicht")
self._assert_first_last_send_progress()
def testOverrideTitle(self):
with self.get_consumer(
self.get_test_file(),
@@ -305,6 +321,25 @@ class TestConsumer(
self.assertEqual(document.title, "Override Title")
self._assert_first_last_send_progress()
@override_settings(FILENAME_FORMAT="{{ title }}")
def test_filename_format_normalized(self):
filename = self.get_test_file()
title = "Inhaltsu\u0308bersicht Faszination"
with self.get_consumer(
filename,
DocumentMetadataOverrides(title=title),
) as consumer:
consumer.run()
document = Document.objects.first()
self.assertIsNotNone(document)
self.assertEqual(document.title, "Inhaltsübersicht Faszination")
self.assertEqual(document.filename, "Inhaltsübersicht Faszination.pdf")
self.assertIsFile(document.source_path)
self._assert_first_last_send_progress()
def testOverrideCorrespondent(self):
c = Correspondent.objects.create(name="test")
@@ -413,6 +448,14 @@ class TestConsumer(
self.assertEqual(document.archive_serial_number, 123)
self._assert_first_last_send_progress()
def testMetadataOverridesSkipAsnPropagation(self):
overrides = DocumentMetadataOverrides()
incoming = DocumentMetadataOverrides(skip_asn=True)
overrides.update(incoming)
self.assertTrue(overrides.skip_asn)
def testOverrideTitlePlaceholders(self):
c = Correspondent.objects.create(name="Correspondent Name")
dt = DocumentType.objects.create(name="DocType Name")
@@ -1233,46 +1276,3 @@ class PostConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
r"sample\.pdf: Error while executing post-consume script: Command '\[.*\]' returned non-zero exit status \d+\.",
):
consumer.run_post_consume_script(doc)
class TestMetadataOverrides(TestCase):
def test_update_skip_asn_if_exists(self):
base = DocumentMetadataOverrides()
incoming = DocumentMetadataOverrides(skip_asn_if_exists=True)
base.update(incoming)
self.assertTrue(base.skip_asn_if_exists)
class TestBarcodeApplyDetectedASN(TestCase):
"""
GIVEN:
- Existing Documents with ASN 123
WHEN:
- A BarcodePlugin which detected an ASN
THEN:
- If skip_asn_if_exists is set, and ASN exists, do not set ASN
- If skip_asn_if_exists is set, and ASN does not exist, set ASN
"""
def test_apply_detected_asn_skips_existing_when_flag_set(self):
doc = Document.objects.create(
checksum="X1",
title="D1",
archive_serial_number=123,
)
metadata = DocumentMetadataOverrides(skip_asn_if_exists=True)
plugin = BarcodePlugin(
input_doc=mock.Mock(),
metadata=metadata,
status_mgr=mock.Mock(),
base_tmp_dir=tempfile.gettempdir(),
task_id="test-task",
)
plugin._apply_detected_asn(123)
self.assertIsNone(plugin.metadata.asn)
doc.hard_delete()
plugin._apply_detected_asn(123)
self.assertEqual(plugin.metadata.asn, 123)

View File

@@ -557,6 +557,50 @@ class TestWorkflows(
expected_str = f"Document filename {test_file.name} does not match"
self.assertIn(expected_str, cm.output[1])
def test_workflow_match_filename_diacritics_normalized(self):
"""
GIVEN:
- Consumption workflow filtering on filename with diacritics
WHEN:
- File with decomposed Unicode filename is consumed
THEN:
- Workflow still matches and applies overrides
"""
trigger = WorkflowTrigger.objects.create(
type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
sources=f"{DocumentSource.ApiUpload},{DocumentSource.ConsumeFolder},{DocumentSource.MailFetch}",
filter_filename="*račun*",
)
action = WorkflowAction.objects.create(
assign_title="Diacritics matched",
)
action.save()
w = Workflow.objects.create(
name="Workflow 1",
order=0,
)
w.triggers.add(trigger)
w.actions.add(action)
w.save()
decomposed_name = "rac\u030cun.pdf"
test_file = shutil.copy(
self.SAMPLE_DIR / "simple.pdf",
self.dirs.scratch_dir / decomposed_name,
)
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=test_file,
),
None,
)
document = Document.objects.first()
self.assertEqual(document.title, "Diacritics matched")
def test_workflow_no_match_path(self):
"""
GIVEN:
@@ -946,6 +990,35 @@ class TestWorkflows(
self.assertEqual(doc.correspondent, self.c2)
self.assertEqual(doc.title, f"Doc created in {created.year}")
def test_document_added_filename_diacritics_normalized(self):
trigger = WorkflowTrigger.objects.create(
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
filter_filename="*račun*",
)
action = WorkflowAction.objects.create(
assign_title="Matched diacritics",
)
w = Workflow.objects.create(
name="Workflow 1",
order=0,
)
w.triggers.add(trigger)
w.actions.add(action)
w.save()
doc = Document.objects.create(
title="sample test",
correspondent=self.c,
original_filename="rac\u030cun.pdf",
)
document_consumption_finished.send(
sender=self.__class__,
document=doc,
)
self.assertEqual(doc.title, "Matched diacritics")
def test_document_added_no_match_filename(self):
trigger = WorkflowTrigger.objects.create(
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,

View File

@@ -1,5 +1,7 @@
import logging
import shutil
import unicodedata
from os import PathLike
from os import utime
from pathlib import Path
from subprocess import CompletedProcess
@@ -16,6 +18,14 @@ def _coerce_to_path(
return Path(source).resolve(), Path(dest).resolve()
def normalize_nfc(value: str | PathLike[str] | None) -> str | None:
"""Return NFC-normalized string for filesystem-safe comparisons."""
if value is None:
return None
return unicodedata.normalize("NFC", str(value))
def copy_basic_file_stats(source: Path | str, dest: Path | str) -> None:
"""
Copies only the m_time and a_time attributes from source to destination.
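
A short usage sketch of the new helper above (assuming it is imported from `documents.utils`, as the other files in this diff do): it accepts `str`, `PathLike`, or `None`, and returns either an NFC-normalized string or `None`.

```python
from pathlib import Path

from documents.utils import normalize_nfc

assert normalize_nfc(None) is None
assert normalize_nfc("Inhaltsu\u0308bersicht.pdf") == "Inhaltsübersicht.pdf"

# PathLike input is coerced with str() before normalizing, so Path works too:
assert normalize_nfc(Path("rac\u030cun.pdf")) == "račun.pdf"
```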