Normalize text to NFC for search and indexing
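
Visually identical strings can differ at the codepoint level: "ã" may be stored precomposed (U+00E3, NFC) or decomposed as "a" plus a combining tilde (U+0061 U+0303, NFD). Whoosh's default analyzers do not normalize Unicode, so term matching is exact at the codepoint level and text indexed in one form is invisible to a query written in the other. A minimal standard-library sketch of the mismatch this commit addresses:

    import unicodedata

    nfd = "certida\u0303o"  # "a" + U+0303 COMBINING TILDE (decomposed form)
    nfc = "certidão"        # precomposed U+00E3 (NFC form)

    print(nfd == nfc)                                # False: 9 vs. 8 codepoints
    print(unicodedata.normalize("NFC", nfd) == nfc)  # True: same form, same text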
@@ -3,6 +3,7 @@ from __future__ import annotations
 import logging
 import math
 import re
+import unicodedata
 from collections import Counter
 from contextlib import contextmanager
 from datetime import datetime
@@ -58,6 +59,14 @@ if TYPE_CHECKING:
 logger = logging.getLogger("paperless.index")
 
 
+def _normalize_for_index(value: str | None) -> str | None:
+    """Normalize text to NFC for consistent search/index matching."""
+
+    if value is None:
+        return None
+    return unicodedata.normalize("NFC", value)
+
+
 def get_schema() -> Schema:
     return Schema(
         id=NUMERIC(stored=True, unique=True),
@@ -163,37 +172,41 @@ def update_document(writer: AsyncWriter, doc: Document) -> None:
     viewer_ids: str = ",".join([str(u.id) for u in users_with_perms])
     writer.update_document(
         id=doc.pk,
-        title=doc.title,
-        content=doc.content,
-        correspondent=doc.correspondent.name if doc.correspondent else None,
+        title=_normalize_for_index(doc.title),
+        content=_normalize_for_index(doc.content),
+        correspondent=_normalize_for_index(
+            doc.correspondent.name if doc.correspondent else None,
+        ),
         correspondent_id=doc.correspondent.id if doc.correspondent else None,
         has_correspondent=doc.correspondent is not None,
-        tag=tags if tags else None,
+        tag=_normalize_for_index(tags) if tags else None,
         tag_id=tags_ids if tags_ids else None,
         has_tag=len(tags) > 0,
-        type=doc.document_type.name if doc.document_type else None,
+        type=_normalize_for_index(
+            doc.document_type.name if doc.document_type else None,
+        ),
         type_id=doc.document_type.id if doc.document_type else None,
         has_type=doc.document_type is not None,
         created=datetime.combine(doc.created, time.min),
         added=doc.added,
         asn=asn,
         modified=doc.modified,
-        path=doc.storage_path.name if doc.storage_path else None,
+        path=_normalize_for_index(doc.storage_path.name if doc.storage_path else None),
         path_id=doc.storage_path.id if doc.storage_path else None,
         has_path=doc.storage_path is not None,
-        notes=notes,
+        notes=_normalize_for_index(notes),
         num_notes=len(notes),
-        custom_fields=custom_fields,
+        custom_fields=_normalize_for_index(custom_fields),
         custom_field_count=len(doc.custom_fields.all()),
         has_custom_fields=len(custom_fields) > 0,
         custom_fields_id=custom_fields_ids if custom_fields_ids else None,
-        owner=doc.owner.username if doc.owner else None,
+        owner=_normalize_for_index(doc.owner.username if doc.owner else None),
         owner_id=doc.owner.id if doc.owner else None,
         has_owner=doc.owner is not None,
         viewer_id=viewer_ids if viewer_ids else None,
         checksum=doc.checksum,
         page_count=doc.page_count,
-        original_filename=doc.original_filename,
+        original_filename=_normalize_for_index(doc.original_filename),
         is_shared=len(viewer_ids) > 0,
     )
     logger.debug(f"Index updated for document {doc.pk}.")
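
The helper deliberately passes None through, so optional metadata (correspondent, document type, storage path, owner) stays absent from the index instead of being coerced to a string. Illustrative behavior, assuming the _normalize_for_index definition above:

    assert _normalize_for_index(None) is None                    # optional fields stay unset
    assert _normalize_for_index("certida\u0303o") == "certidão"  # NFD input folded to NFC
    assert _normalize_for_index("certidão") == "certidão"        # NFC input is unchanged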
@@ -421,7 +434,7 @@ class LocalDateParser(English):
 
 class DelayedFullTextQuery(DelayedQuery):
     def _get_query(self) -> tuple:
-        q_str = self.query_params["query"]
+        q_str = _normalize_for_index(self.query_params["query"]) or ""
         q_str = rewrite_natural_date_keywords(q_str)
         qp = MultifieldParser(
             [
@@ -460,7 +473,12 @@ class DelayedFullTextQuery(DelayedQuery):
 class DelayedMoreLikeThisQuery(DelayedQuery):
     def _get_query(self) -> tuple:
         more_like_doc_id = int(self.query_params["more_like_id"])
-        content = Document.objects.get(id=more_like_doc_id).content
+        content = (
+            _normalize_for_index(
+                Document.objects.get(id=more_like_doc_id).content,
+            )
+            or ""
+        )
 
         docnum = self.searcher.document_number(id=more_like_doc_id)
         kts = self.searcher.key_terms_from_text(
@@ -488,6 +506,7 @@ def autocomplete(
     Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions
     and without scoring
     """
+    term = _normalize_for_index(term) or ""
     terms = []
 
     with ix.searcher(weighting=TF_IDF()) as s:
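
Index-time normalization alone is not enough: the full-text query string, the more-like-this source content, and autocomplete prefixes are folded to the same form above, since a decomposed query would otherwise still miss precomposed index terms. A sketch of that symmetry:

    import unicodedata

    indexed = unicodedata.normalize("NFC", "certida\u0303o")  # what update_document now stores
    raw = "certida\u0303o"                                    # decomposed input from a client
    assert raw != indexed                                     # an unnormalized lookup misses
    assert unicodedata.normalize("NFC", raw) == indexed       # a normalized one matches

The commit also adds a regression test to the search API tests: a title stored in decomposed form must be found by a precomposed query.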
@@ -89,6 +89,23 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
         self.assertEqual(len(results), 0)
         self.assertCountEqual(response.data["all"], [])
 
+    def test_search_handles_diacritics_normalization(self):
+        doc = Document.objects.create(
+            title="certida\u0303o de nascimento",
+            content="birth record without keyword",
+            checksum="D",
+            pk=10,
+        )
+        with AsyncWriter(index.open_index()) as writer:
+            index.update_document(writer, doc)
+
+        response = self.client.get("/api/documents/?query=certidão")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        results = response.data["results"]
+        self.assertEqual(response.data["count"], 1)
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]["id"], doc.id)
+
     def test_search_custom_field_ordering(self):
         custom_field = CustomField.objects.create(
             name="Sortable field",