Normalize text to NFC for search and indexing

This commit is contained in:
shamoon
2026-01-05 11:10:21 -08:00
parent 8a14548434
commit d40f7b7e91
2 changed files with 48 additions and 12 deletions

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import logging import logging
import math import math
import re import re
import unicodedata
from collections import Counter from collections import Counter
from contextlib import contextmanager from contextlib import contextmanager
from datetime import datetime from datetime import datetime
@@ -58,6 +59,14 @@ if TYPE_CHECKING:
logger = logging.getLogger("paperless.index") logger = logging.getLogger("paperless.index")
def _normalize_for_index(value: str | None) -> str | None:
"""Normalize text to NFC for consistent search/index matching."""
if value is None:
return None
return unicodedata.normalize("NFC", value)
def get_schema() -> Schema: def get_schema() -> Schema:
return Schema( return Schema(
id=NUMERIC(stored=True, unique=True), id=NUMERIC(stored=True, unique=True),
@@ -163,37 +172,41 @@ def update_document(writer: AsyncWriter, doc: Document) -> None:
viewer_ids: str = ",".join([str(u.id) for u in users_with_perms]) viewer_ids: str = ",".join([str(u.id) for u in users_with_perms])
writer.update_document( writer.update_document(
id=doc.pk, id=doc.pk,
title=doc.title, title=_normalize_for_index(doc.title),
content=doc.content, content=_normalize_for_index(doc.content),
correspondent=doc.correspondent.name if doc.correspondent else None, correspondent=_normalize_for_index(
doc.correspondent.name if doc.correspondent else None,
),
correspondent_id=doc.correspondent.id if doc.correspondent else None, correspondent_id=doc.correspondent.id if doc.correspondent else None,
has_correspondent=doc.correspondent is not None, has_correspondent=doc.correspondent is not None,
tag=tags if tags else None, tag=_normalize_for_index(tags) if tags else None,
tag_id=tags_ids if tags_ids else None, tag_id=tags_ids if tags_ids else None,
has_tag=len(tags) > 0, has_tag=len(tags) > 0,
type=doc.document_type.name if doc.document_type else None, type=_normalize_for_index(
doc.document_type.name if doc.document_type else None,
),
type_id=doc.document_type.id if doc.document_type else None, type_id=doc.document_type.id if doc.document_type else None,
has_type=doc.document_type is not None, has_type=doc.document_type is not None,
created=datetime.combine(doc.created, time.min), created=datetime.combine(doc.created, time.min),
added=doc.added, added=doc.added,
asn=asn, asn=asn,
modified=doc.modified, modified=doc.modified,
path=doc.storage_path.name if doc.storage_path else None, path=_normalize_for_index(doc.storage_path.name if doc.storage_path else None),
path_id=doc.storage_path.id if doc.storage_path else None, path_id=doc.storage_path.id if doc.storage_path else None,
has_path=doc.storage_path is not None, has_path=doc.storage_path is not None,
notes=notes, notes=_normalize_for_index(notes),
num_notes=len(notes), num_notes=len(notes),
custom_fields=custom_fields, custom_fields=_normalize_for_index(custom_fields),
custom_field_count=len(doc.custom_fields.all()), custom_field_count=len(doc.custom_fields.all()),
has_custom_fields=len(custom_fields) > 0, has_custom_fields=len(custom_fields) > 0,
custom_fields_id=custom_fields_ids if custom_fields_ids else None, custom_fields_id=custom_fields_ids if custom_fields_ids else None,
owner=doc.owner.username if doc.owner else None, owner=_normalize_for_index(doc.owner.username if doc.owner else None),
owner_id=doc.owner.id if doc.owner else None, owner_id=doc.owner.id if doc.owner else None,
has_owner=doc.owner is not None, has_owner=doc.owner is not None,
viewer_id=viewer_ids if viewer_ids else None, viewer_id=viewer_ids if viewer_ids else None,
checksum=doc.checksum, checksum=doc.checksum,
page_count=doc.page_count, page_count=doc.page_count,
original_filename=doc.original_filename, original_filename=_normalize_for_index(doc.original_filename),
is_shared=len(viewer_ids) > 0, is_shared=len(viewer_ids) > 0,
) )
logger.debug(f"Index updated for document {doc.pk}.") logger.debug(f"Index updated for document {doc.pk}.")
@@ -421,7 +434,7 @@ class LocalDateParser(English):
class DelayedFullTextQuery(DelayedQuery): class DelayedFullTextQuery(DelayedQuery):
def _get_query(self) -> tuple: def _get_query(self) -> tuple:
q_str = self.query_params["query"] q_str = _normalize_for_index(self.query_params["query"]) or ""
q_str = rewrite_natural_date_keywords(q_str) q_str = rewrite_natural_date_keywords(q_str)
qp = MultifieldParser( qp = MultifieldParser(
[ [
@@ -460,7 +473,12 @@ class DelayedFullTextQuery(DelayedQuery):
class DelayedMoreLikeThisQuery(DelayedQuery): class DelayedMoreLikeThisQuery(DelayedQuery):
def _get_query(self) -> tuple: def _get_query(self) -> tuple:
more_like_doc_id = int(self.query_params["more_like_id"]) more_like_doc_id = int(self.query_params["more_like_id"])
content = Document.objects.get(id=more_like_doc_id).content content = (
_normalize_for_index(
Document.objects.get(id=more_like_doc_id).content,
)
or ""
)
docnum = self.searcher.document_number(id=more_like_doc_id) docnum = self.searcher.document_number(id=more_like_doc_id)
kts = self.searcher.key_terms_from_text( kts = self.searcher.key_terms_from_text(
@@ -488,6 +506,7 @@ def autocomplete(
Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions
and without scoring and without scoring
""" """
term = _normalize_for_index(term) or ""
terms = [] terms = []
with ix.searcher(weighting=TF_IDF()) as s: with ix.searcher(weighting=TF_IDF()) as s:

View File

@@ -89,6 +89,23 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self.assertEqual(len(results), 0) self.assertEqual(len(results), 0)
self.assertCountEqual(response.data["all"], []) self.assertCountEqual(response.data["all"], [])
def test_search_handles_diacritics_normalization(self):
    """A title stored in decomposed (NFD) form must be found by a
    precomposed (NFC) query, proving the index normalizes both sides.
    """
    # Title uses U+0303 COMBINING TILDE (NFD); the query below uses the
    # precomposed character U+00E3 (NFC).
    document = Document.objects.create(
        title="certida\u0303o de nascimento",
        content="birth record without keyword",
        checksum="D",
        pk=10,
    )
    with AsyncWriter(index.open_index()) as writer:
        index.update_document(writer, document)

    response = self.client.get("/api/documents/?query=certidão")

    self.assertEqual(response.status_code, status.HTTP_200_OK)
    self.assertEqual(response.data["count"], 1)
    hits = response.data["results"]
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0]["id"], document.id)
def test_search_custom_field_ordering(self): def test_search_custom_field_ordering(self):
custom_field = CustomField.objects.create( custom_field = CustomField.objects.create(
name="Sortable field", name="Sortable field",