Compare commits


1 Commit

10 changed files with 29 additions and 232 deletions

View File

@@ -32,7 +32,7 @@ RUN set -eux \
 # Purpose: Installs s6-overlay and rootfs
 # Comments:
 #  - Don't leave anything extra in here either
-FROM ghcr.io/astral-sh/uv:0.9.15-python3.12-trixie-slim AS s6-overlay-base
+FROM ghcr.io/astral-sh/uv:0.9-python3.12-trixie-slim AS s6-overlay-base

 WORKDIR /usr/src/s6
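
Note (not part of the diff): read in display order, the removed/added pair moves the uv base image from the patch-pinned 0.9.15 tag to the floating 0.9 minor tag, so this stage will pick up future 0.9.x patch releases automatically.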

View File

@@ -46,7 +46,6 @@ from documents.signals.handlers import run_workflows
 from documents.templating.workflows import parse_w_workflow_placeholders
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
-from documents.utils import normalize_nfc
 from documents.utils import run_subprocess
 from paperless_mail.parsers import MailDocumentParser
@@ -112,12 +111,7 @@ class ConsumerPluginMixin:
         self.renew_logging_group()

-        self.metadata.filename = normalize_nfc(self.metadata.filename)
-        self.metadata.title = normalize_nfc(self.metadata.title)
-
-        self.filename = normalize_nfc(
-            self.metadata.filename or self.input_doc.original_file.name,
-        )
+        self.filename = self.metadata.filename or self.input_doc.original_file.name

     def _send_progress(
         self,
@@ -658,8 +652,6 @@ class ConsumerPlugin(
                 f"Error occurred parsing title override '{self.metadata.title}', falling back to original. Exception: {e}",
             )

-        title = normalize_nfc(title)
-
         file_for_checksum = (
             self.unmodified_original
             if self.unmodified_original is not None
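
For context: the deleted lines canonicalized the override filename and title to Unicode NFC before further processing, so a decomposed filename (as produced by some macOS tooling) and its composed spelling became identical strings. A minimal standalone sketch of the difference, not project code:

    import unicodedata

    # Decomposed form: "u" followed by U+0308 COMBINING DIAERESIS.
    decomposed = "Inhaltsu\u0308bersicht.pdf"
    composed = unicodedata.normalize("NFC", decomposed)  # "ü" as one code point

    print(decomposed == composed)          # False: look identical, differ in code points
    print(len(decomposed), len(composed))  # 21 20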

View File

@@ -6,7 +6,6 @@ from django.conf import settings
 from documents.models import Document
 from documents.templating.filepath import validate_filepath_template_and_render
 from documents.templating.utils import convert_format_str_to_template_format
-from documents.utils import normalize_nfc

 def create_source_path_directory(source_path: Path) -> None:
@@ -56,11 +55,11 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
     """
     if archive_filename:
         old_filename: Path | None = (
-            Path(normalize_nfc(doc.archive_filename)) if doc.archive_filename else None
+            Path(doc.archive_filename) if doc.archive_filename else None
         )
         root = settings.ARCHIVE_DIR
     else:
-        old_filename = Path(normalize_nfc(doc.filename)) if doc.filename else None
+        old_filename = Path(doc.filename) if doc.filename else None
         root = settings.ORIGINALS_DIR

     # If generating archive filenames, try to make a name that is similar to
@@ -92,7 +91,7 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
         )

         if new_filename == old_filename:
             # still the same as before.
-            return Path(normalize_nfc(str(new_filename)))
+            return new_filename

         if (root / new_filename).exists():
             counter += 1
@@ -120,7 +119,7 @@ def format_filename(document: Document, template_str: str) -> str | None:
             "none",
         )  # backward compatibility

-        return normalize_nfc(rendered_filename)
+        return rendered_filename


 def generate_filename(
@@ -175,4 +174,4 @@ def generate_filename(
     if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
         full_path = full_path.with_suffix(full_path.suffix + ".gpg")

-    return Path(normalize_nfc(str(full_path)))
+    return full_path
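
The normalize-before-compare pattern removed above guarded the new_filename == old_filename check against mixed normal forms: pathlib.Path equality is plain string comparison, so two spellings of the same visible name compare unequal. A standalone sketch, not project code:

    import unicodedata
    from pathlib import Path

    p_nfd = Path("archive/Inhaltsu\u0308bersicht.pdf")      # decomposed
    p_nfc = Path(unicodedata.normalize("NFC", str(p_nfd)))  # composed

    print(p_nfd == p_nfc)  # False: Path compares raw strings, not normalized forms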

View File

@@ -41,7 +41,6 @@ from documents.models import PaperlessTask
 from documents.models import ShareLink
 from documents.models import StoragePath
 from documents.models import Tag
-from documents.utils import normalize_nfc

 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -163,11 +162,7 @@ class TitleContentFilter(Filter):
     def filter(self, qs, value):
         value = value.strip() if isinstance(value, str) else value
         if value:
-            normalized = normalize_nfc(value) or ""
-            folded = normalized.casefold()
-            return qs.filter(
-                Q(title__icontains=folded) | Q(content__icontains=folded),
-            )
+            return qs.filter(Q(title__icontains=value) | Q(content__icontains=value))
         else:
             return qs
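
The deleted branch folded the query with str.casefold() before passing it to icontains; since icontains is already case-insensitive at the database level, the fold mattered mainly in combination with the NFC step. A standalone illustration of casefold() versus lower():

    print("Straße".casefold())  # 'strasse' (full Unicode case folding)
    print("Straße".lower())     # 'straße' (lower() leaves 'ß' unchanged)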

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
 import logging
 import math
 import re
-import unicodedata
 from collections import Counter
 from contextlib import contextmanager
 from datetime import datetime
@@ -59,14 +58,6 @@ if TYPE_CHECKING:
 logger = logging.getLogger("paperless.index")


-def _normalize_for_index(value: str | None) -> str | None:
-    """Normalize text to NFC for consistent search/index matching."""
-    if value is None:
-        return None
-    return unicodedata.normalize("NFC", value)
-
-
 def get_schema() -> Schema:
     return Schema(
         id=NUMERIC(stored=True, unique=True),
@@ -172,41 +163,37 @@ def update_document(writer: AsyncWriter, doc: Document) -> None:
     viewer_ids: str = ",".join([str(u.id) for u in users_with_perms])
     writer.update_document(
         id=doc.pk,
-        title=_normalize_for_index(doc.title),
-        content=_normalize_for_index(doc.content),
-        correspondent=_normalize_for_index(
-            doc.correspondent.name if doc.correspondent else None,
-        ),
+        title=doc.title,
+        content=doc.content,
+        correspondent=doc.correspondent.name if doc.correspondent else None,
         correspondent_id=doc.correspondent.id if doc.correspondent else None,
         has_correspondent=doc.correspondent is not None,
-        tag=_normalize_for_index(tags) if tags else None,
+        tag=tags if tags else None,
         tag_id=tags_ids if tags_ids else None,
         has_tag=len(tags) > 0,
-        type=_normalize_for_index(
-            doc.document_type.name if doc.document_type else None,
-        ),
+        type=doc.document_type.name if doc.document_type else None,
         type_id=doc.document_type.id if doc.document_type else None,
         has_type=doc.document_type is not None,
         created=datetime.combine(doc.created, time.min),
         added=doc.added,
         asn=asn,
         modified=doc.modified,
-        path=_normalize_for_index(doc.storage_path.name if doc.storage_path else None),
+        path=doc.storage_path.name if doc.storage_path else None,
         path_id=doc.storage_path.id if doc.storage_path else None,
         has_path=doc.storage_path is not None,
-        notes=_normalize_for_index(notes),
+        notes=notes,
         num_notes=len(notes),
-        custom_fields=_normalize_for_index(custom_fields),
+        custom_fields=custom_fields,
         custom_field_count=len(doc.custom_fields.all()),
         has_custom_fields=len(custom_fields) > 0,
         custom_fields_id=custom_fields_ids if custom_fields_ids else None,
-        owner=_normalize_for_index(doc.owner.username if doc.owner else None),
+        owner=doc.owner.username if doc.owner else None,
         owner_id=doc.owner.id if doc.owner else None,
         has_owner=doc.owner is not None,
         viewer_id=viewer_ids if viewer_ids else None,
         checksum=doc.checksum,
         page_count=doc.page_count,
-        original_filename=_normalize_for_index(doc.original_filename),
+        original_filename=doc.original_filename,
         is_shared=len(viewer_ids) > 0,
     )
     logger.debug(f"Index updated for document {doc.pk}.")
@@ -434,7 +421,7 @@ class LocalDateParser(English):
 class DelayedFullTextQuery(DelayedQuery):
     def _get_query(self) -> tuple:
-        q_str = _normalize_for_index(self.query_params["query"]) or ""
+        q_str = self.query_params["query"]
         q_str = rewrite_natural_date_keywords(q_str)
         qp = MultifieldParser(
             [
@@ -473,12 +460,7 @@
 class DelayedMoreLikeThisQuery(DelayedQuery):
     def _get_query(self) -> tuple:
         more_like_doc_id = int(self.query_params["more_like_id"])
-        content = (
-            _normalize_for_index(
-                Document.objects.get(id=more_like_doc_id).content,
-            )
-            or ""
-        )
+        content = Document.objects.get(id=more_like_doc_id).content

         docnum = self.searcher.document_number(id=more_like_doc_id)
         kts = self.searcher.key_terms_from_text(
@@ -506,7 +488,6 @@ def autocomplete(
     Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions
     and without scoring
     """
-    term = _normalize_for_index(term) or ""
     terms = []

     with ix.searcher(weighting=TF_IDF()) as s:
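
The removed _normalize_for_index applied NFC on both the indexing path (update_document) and the query paths (DelayedFullTextQuery, DelayedMoreLikeThisQuery, autocomplete), keeping both sides of a Whoosh term comparison in the same normal form. A standalone sketch of the failure mode this guarded against, using plain string matching rather than Whoosh:

    import unicodedata

    indexed = "certida\u0303o de nascimento"  # decomposed, e.g. text extracted from a PDF
    query = "certidão"                        # composed, as typed into a search box

    def nfc(s: str) -> str:
        return unicodedata.normalize("NFC", s)

    print(query in indexed)            # False: the code points differ
    print(nfc(query) in nfc(indexed))  # True once both sides are canonicalized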

View File

@@ -2,12 +2,10 @@ from __future__ import annotations
 import logging
 import re
-import unicodedata
 from fnmatch import fnmatch
 from fnmatch import translate as fnmatch_translate
 from typing import TYPE_CHECKING

 from django.db.models import Q
 from rest_framework import serializers

 from documents.data_models import ConsumableDocument
@@ -23,7 +21,6 @@ from documents.models import Workflow
 from documents.models import WorkflowTrigger
 from documents.permissions import get_objects_for_user_owner_aware
 from documents.regex import safe_regex_search
-from documents.utils import normalize_nfc

 if TYPE_CHECKING:
     from django.db.models import QuerySet
@@ -33,34 +30,6 @@ if TYPE_CHECKING:
 logger = logging.getLogger("paperless.matching")


-def _normalize_glob_value(value: str) -> str:
-    """Normalize strings for glob-style matching (case-insensitive)."""
-    return (normalize_nfc(value) or "").casefold()
-
-
-def _normalized_fnmatch(name: str, pattern: str) -> bool:
-    """Canonicalize Unicode and compare using fnmatch semantics."""
-    return fnmatch(_normalize_glob_value(name), _normalize_glob_value(pattern))
-
-
-def _glob_regex_variants(pattern: str) -> list[str]:
-    """
-    Build regex patterns that match both NFC and NFD forms of a glob pattern.
-
-    Using both forms lets DB prefilters remain Unicode-normalization agnostic.
-    """
-    regexes = set()
-    for normalized in {
-        normalize_nfc(pattern) or "",
-        unicodedata.normalize("NFD", pattern),
-    }:
-        regex = fnmatch_translate(normalized).lstrip("^").rstrip("$")
-        regexes.add(regex)
-    return list(regexes)
-
-
 def log_reason(
     matching_model: MatchingModel | WorkflowTrigger,
     document: Document,
@@ -336,9 +305,9 @@ def consumable_document_matches_workflow(
     if (
         trigger.filter_filename is not None
         and len(trigger.filter_filename) > 0
-        and not _normalized_fnmatch(
-            document.original_file.name,
-            trigger.filter_filename,
+        and not fnmatch(
+            document.original_file.name.lower(),
+            trigger.filter_filename.lower(),
         )
     ):
         reason = (
@@ -359,7 +328,7 @@
     if (
         trigger.filter_path is not None
         and len(trigger.filter_path) > 0
-        and not _normalized_fnmatch(
+        and not fnmatch(
             match_against,
             trigger.filter_path,
         )
@@ -523,9 +492,9 @@ def existing_document_matches_workflow(
         trigger.filter_filename is not None
         and len(trigger.filter_filename) > 0
         and document.original_filename is not None
-        and not _normalized_fnmatch(
-            document.original_filename,
-            trigger.filter_filename,
+        and not fnmatch(
+            document.original_filename.lower(),
+            trigger.filter_filename.lower(),
         )
     ):
         return (
@@ -604,11 +573,8 @@ def prefilter_documents_by_workflowtrigger(
         documents = documents.annotate(**annotations).filter(custom_field_q)

     if trigger.filter_filename:
-        regexes = _glob_regex_variants(trigger.filter_filename)
-        filename_q = Q()
-        for regex in regexes:
-            filename_q |= Q(original_filename__iregex=regex)
-        documents = documents.filter(filename_q)
+        regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
+        documents = documents.filter(original_filename__iregex=regex)

     return documents
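
With _normalized_fnmatch gone, filename triggers fall back to fnmatch over lower()-ed strings, which handles letter case but not Unicode normal forms. A standalone sketch of the difference, reusing the pattern and filename from the deleted tests:

    import unicodedata
    from fnmatch import fnmatch

    name = "rac\u030cun.pdf"  # decomposed: "c" followed by U+030C COMBINING CARON
    pattern = "*račun*"       # composed "č", as stored on the trigger

    print(fnmatch(name.lower(), pattern.lower()))  # False: code points differ

    def fold(s: str) -> str:
        return unicodedata.normalize("NFC", s).casefold()

    print(fnmatch(fold(name), fold(pattern)))  # True: the removed helper's approach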

View File

@@ -89,23 +89,6 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
         self.assertEqual(len(results), 0)
         self.assertCountEqual(response.data["all"], [])

-    def test_search_handles_diacritics_normalization(self):
-        doc = Document.objects.create(
-            title="certida\u0303o de nascimento",
-            content="birth record without keyword",
-            checksum="D",
-            pk=10,
-        )
-        with AsyncWriter(index.open_index()) as writer:
-            index.update_document(writer, doc)
-
-        response = self.client.get("/api/documents/?query=certidão")
-        self.assertEqual(response.status_code, status.HTTP_200_OK)
-        results = response.data["results"]
-        self.assertEqual(response.data["count"], 1)
-        self.assertEqual(len(results), 1)
-        self.assertEqual(results[0]["id"], doc.id)
-
     def test_search_custom_field_ordering(self):
         custom_field = CustomField.objects.create(
             name="Sortable field",

View File

@@ -290,23 +290,6 @@ class TestConsumer(
         self._assert_first_last_send_progress()

-    def test_override_filename_normalized(self):
-        filename = self.get_test_file()
-        override_filename = "Inhaltsu\u0308bersicht.pdf"
-
-        with self.get_consumer(
-            filename,
-            DocumentMetadataOverrides(filename=override_filename),
-        ) as consumer:
-            consumer.run()
-
-            document = Document.objects.first()
-            self.assertIsNotNone(document)
-            self.assertEqual(document.original_filename, "Inhaltsübersicht.pdf")
-            self.assertEqual(document.title, "Inhaltsübersicht")
-
-        self._assert_first_last_send_progress()
-
     def testOverrideTitle(self):
         with self.get_consumer(
             self.get_test_file(),
@@ -321,25 +304,6 @@ class TestConsumer(
         self.assertEqual(document.title, "Override Title")
         self._assert_first_last_send_progress()

-    @override_settings(FILENAME_FORMAT="{{ title }}")
-    def test_filename_format_normalized(self):
-        filename = self.get_test_file()
-        title = "Inhaltsu\u0308bersicht Faszination"
-
-        with self.get_consumer(
-            filename,
-            DocumentMetadataOverrides(title=title),
-        ) as consumer:
-            consumer.run()
-
-            document = Document.objects.first()
-            self.assertIsNotNone(document)
-            self.assertEqual(document.title, "Inhaltsübersicht Faszination")
-            self.assertEqual(document.filename, "Inhaltsübersicht Faszination.pdf")
-            self.assertIsFile(document.source_path)
-
-        self._assert_first_last_send_progress()
-
     def testOverrideCorrespondent(self):
         c = Correspondent.objects.create(name="test")

View File

@@ -557,50 +557,6 @@ class TestWorkflows(
         expected_str = f"Document filename {test_file.name} does not match"
         self.assertIn(expected_str, cm.output[1])

-    def test_workflow_match_filename_diacritics_normalized(self):
-        """
-        GIVEN:
-            - Consumption workflow filtering on filename with diacritics
-        WHEN:
-            - File with decomposed Unicode filename is consumed
-        THEN:
-            - Workflow still matches and applies overrides
-        """
-        trigger = WorkflowTrigger.objects.create(
-            type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
-            sources=f"{DocumentSource.ApiUpload},{DocumentSource.ConsumeFolder},{DocumentSource.MailFetch}",
-            filter_filename="*račun*",
-        )
-        action = WorkflowAction.objects.create(
-            assign_title="Diacritics matched",
-        )
-        action.save()
-        w = Workflow.objects.create(
-            name="Workflow 1",
-            order=0,
-        )
-        w.triggers.add(trigger)
-        w.actions.add(action)
-        w.save()
-
-        decomposed_name = "rac\u030cun.pdf"
-        test_file = shutil.copy(
-            self.SAMPLE_DIR / "simple.pdf",
-            self.dirs.scratch_dir / decomposed_name,
-        )
-
-        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
-            tasks.consume_file(
-                ConsumableDocument(
-                    source=DocumentSource.ConsumeFolder,
-                    original_file=test_file,
-                ),
-                None,
-            )
-
-        document = Document.objects.first()
-        self.assertEqual(document.title, "Diacritics matched")
-
     def test_workflow_no_match_path(self):
         """
         GIVEN:
@@ -990,35 +946,6 @@ class TestWorkflows(
         self.assertEqual(doc.correspondent, self.c2)
         self.assertEqual(doc.title, f"Doc created in {created.year}")

-    def test_document_added_filename_diacritics_normalized(self):
-        trigger = WorkflowTrigger.objects.create(
-            type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
-            filter_filename="*račun*",
-        )
-        action = WorkflowAction.objects.create(
-            assign_title="Matched diacritics",
-        )
-        w = Workflow.objects.create(
-            name="Workflow 1",
-            order=0,
-        )
-        w.triggers.add(trigger)
-        w.actions.add(action)
-        w.save()
-
-        doc = Document.objects.create(
-            title="sample test",
-            correspondent=self.c,
-            original_filename="rac\u030cun.pdf",
-        )
-
-        document_consumption_finished.send(
-            sender=self.__class__,
-            document=doc,
-        )
-
-        self.assertEqual(doc.title, "Matched diacritics")
-
     def test_document_added_no_match_filename(self):
         trigger = WorkflowTrigger.objects.create(
             type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,

View File

@@ -1,7 +1,5 @@
 import logging
 import shutil
-import unicodedata
-from os import PathLike
 from os import utime
 from pathlib import Path
 from subprocess import CompletedProcess
@@ -18,14 +16,6 @@ def _coerce_to_path(
     return Path(source).resolve(), Path(dest).resolve()


-def normalize_nfc(value: str | PathLike[str] | None) -> str | None:
-    """Return NFC-normalized string for filesystem-safe comparisons."""
-    if value is None:
-        return None
-    return unicodedata.normalize("NFC", str(value))
-
-
 def copy_basic_file_stats(source: Path | str, dest: Path | str) -> None:
     """
     Copies only the m_time and a_time attributes from source to destination.
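
For reference, the removed helper was a thin wrapper around unicodedata.normalize from the standard library. NFC composes base characters with their combining marks, which is why the two spellings compared throughout this commit differ in length:

    import unicodedata

    nfc = unicodedata.normalize("NFC", "račun")
    nfd = unicodedata.normalize("NFD", "račun")

    print(len(nfc), len(nfd))          # 5 6
    print([hex(ord(c)) for c in nfd])  # ['0x72', '0x61', '0x63', '0x30c', '0x75', '0x6e']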