Normalize text to NFC for search and indexing

This commit is contained in:
shamoon
2026-01-05 11:10:21 -08:00
parent 8a14548434
commit d40f7b7e91
2 changed files with 48 additions and 12 deletions

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import logging import logging
import math import math
import re import re
import unicodedata
from collections import Counter from collections import Counter
from contextlib import contextmanager from contextlib import contextmanager
from datetime import datetime from datetime import datetime
@@ -58,6 +59,14 @@ if TYPE_CHECKING:
logger = logging.getLogger("paperless.index") logger = logging.getLogger("paperless.index")
def _normalize_for_index(value: str | None) -> str | None:
"""Normalize text to NFC for consistent search/index matching."""
if value is None:
return None
return unicodedata.normalize("NFC", value)
def get_schema() -> Schema: def get_schema() -> Schema:
return Schema( return Schema(
id=NUMERIC(stored=True, unique=True), id=NUMERIC(stored=True, unique=True),
@@ -163,37 +172,41 @@ def update_document(writer: AsyncWriter, doc: Document) -> None:
viewer_ids: str = ",".join([str(u.id) for u in users_with_perms]) viewer_ids: str = ",".join([str(u.id) for u in users_with_perms])
writer.update_document( writer.update_document(
id=doc.pk, id=doc.pk,
title=doc.title, title=_normalize_for_index(doc.title),
content=doc.content, content=_normalize_for_index(doc.content),
correspondent=doc.correspondent.name if doc.correspondent else None, correspondent=_normalize_for_index(
doc.correspondent.name if doc.correspondent else None,
),
correspondent_id=doc.correspondent.id if doc.correspondent else None, correspondent_id=doc.correspondent.id if doc.correspondent else None,
has_correspondent=doc.correspondent is not None, has_correspondent=doc.correspondent is not None,
tag=tags if tags else None, tag=_normalize_for_index(tags) if tags else None,
tag_id=tags_ids if tags_ids else None, tag_id=tags_ids if tags_ids else None,
has_tag=len(tags) > 0, has_tag=len(tags) > 0,
type=doc.document_type.name if doc.document_type else None, type=_normalize_for_index(
doc.document_type.name if doc.document_type else None,
),
type_id=doc.document_type.id if doc.document_type else None, type_id=doc.document_type.id if doc.document_type else None,
has_type=doc.document_type is not None, has_type=doc.document_type is not None,
created=datetime.combine(doc.created, time.min), created=datetime.combine(doc.created, time.min),
added=doc.added, added=doc.added,
asn=asn, asn=asn,
modified=doc.modified, modified=doc.modified,
path=doc.storage_path.name if doc.storage_path else None, path=_normalize_for_index(doc.storage_path.name if doc.storage_path else None),
path_id=doc.storage_path.id if doc.storage_path else None, path_id=doc.storage_path.id if doc.storage_path else None,
has_path=doc.storage_path is not None, has_path=doc.storage_path is not None,
notes=notes, notes=_normalize_for_index(notes),
num_notes=len(notes), num_notes=len(notes),
custom_fields=custom_fields, custom_fields=_normalize_for_index(custom_fields),
custom_field_count=len(doc.custom_fields.all()), custom_field_count=len(doc.custom_fields.all()),
has_custom_fields=len(custom_fields) > 0, has_custom_fields=len(custom_fields) > 0,
custom_fields_id=custom_fields_ids if custom_fields_ids else None, custom_fields_id=custom_fields_ids if custom_fields_ids else None,
owner=doc.owner.username if doc.owner else None, owner=_normalize_for_index(doc.owner.username if doc.owner else None),
owner_id=doc.owner.id if doc.owner else None, owner_id=doc.owner.id if doc.owner else None,
has_owner=doc.owner is not None, has_owner=doc.owner is not None,
viewer_id=viewer_ids if viewer_ids else None, viewer_id=viewer_ids if viewer_ids else None,
checksum=doc.checksum, checksum=doc.checksum,
page_count=doc.page_count, page_count=doc.page_count,
original_filename=doc.original_filename, original_filename=_normalize_for_index(doc.original_filename),
is_shared=len(viewer_ids) > 0, is_shared=len(viewer_ids) > 0,
) )
logger.debug(f"Index updated for document {doc.pk}.") logger.debug(f"Index updated for document {doc.pk}.")
@@ -421,7 +434,7 @@ class LocalDateParser(English):
class DelayedFullTextQuery(DelayedQuery): class DelayedFullTextQuery(DelayedQuery):
def _get_query(self) -> tuple: def _get_query(self) -> tuple:
q_str = self.query_params["query"] q_str = _normalize_for_index(self.query_params["query"]) or ""
q_str = rewrite_natural_date_keywords(q_str) q_str = rewrite_natural_date_keywords(q_str)
qp = MultifieldParser( qp = MultifieldParser(
[ [
@@ -460,7 +473,12 @@ class DelayedFullTextQuery(DelayedQuery):
class DelayedMoreLikeThisQuery(DelayedQuery): class DelayedMoreLikeThisQuery(DelayedQuery):
def _get_query(self) -> tuple: def _get_query(self) -> tuple:
more_like_doc_id = int(self.query_params["more_like_id"]) more_like_doc_id = int(self.query_params["more_like_id"])
content = Document.objects.get(id=more_like_doc_id).content content = (
_normalize_for_index(
Document.objects.get(id=more_like_doc_id).content,
)
or ""
)
docnum = self.searcher.document_number(id=more_like_doc_id) docnum = self.searcher.document_number(id=more_like_doc_id)
kts = self.searcher.key_terms_from_text( kts = self.searcher.key_terms_from_text(
@@ -488,6 +506,7 @@ def autocomplete(
Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions
and without scoring and without scoring
""" """
term = _normalize_for_index(term) or ""
terms = [] terms = []
with ix.searcher(weighting=TF_IDF()) as s: with ix.searcher(weighting=TF_IDF()) as s:

View File

@@ -89,6 +89,23 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self.assertEqual(len(results), 0) self.assertEqual(len(results), 0)
self.assertCountEqual(response.data["all"], []) self.assertCountEqual(response.data["all"], [])
def test_search_handles_diacritics_normalization(self):
    """A title stored in decomposed (NFD) form must be found by a
    precomposed (NFC) query, proving the index normalizes both sides.
    """
    # Title uses U+0303 COMBINING TILDE (NFD); the query below uses the
    # precomposed character U+00E3 (NFC).
    document = Document.objects.create(
        title="certida\u0303o de nascimento",
        content="birth record without keyword",
        checksum="D",
        pk=10,
    )
    with AsyncWriter(index.open_index()) as writer:
        index.update_document(writer, document)

    response = self.client.get("/api/documents/?query=certidão")

    self.assertEqual(response.status_code, status.HTTP_200_OK)
    self.assertEqual(response.data["count"], 1)
    hits = response.data["results"]
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0]["id"], document.id)
def test_search_custom_field_ordering(self): def test_search_custom_field_ordering(self):
custom_field = CustomField.objects.create( custom_field = CustomField.objects.create(
name="Sortable field", name="Sortable field",