From 9cf9f239edd5027d527bf86942ac1747d28f239e Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Sun, 14 Apr 2024 21:16:52 -0700 Subject: [PATCH] Document index accent folding --- src/documents/index.py | 10 ++++++++-- src/documents/tests/test_api_search.py | 6 +++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/documents/index.py b/src/documents/index.py index 388b994d8..161944dbf 100644 --- a/src/documents/index.py +++ b/src/documents/index.py @@ -15,6 +15,8 @@ from guardian.shortcuts import get_users_with_perms from whoosh import classify from whoosh import highlight from whoosh import query +from whoosh.analysis import CharsetFilter +from whoosh.analysis import StemmingAnalyzer from whoosh.fields import BOOLEAN from whoosh.fields import DATETIME from whoosh.fields import KEYWORD @@ -34,6 +36,7 @@ from whoosh.qparser.plugins import FieldsPlugin from whoosh.scoring import TF_IDF from whoosh.searching import ResultsPage from whoosh.searching import Searcher +from whoosh.support.charset import accent_map from whoosh.util.times import timespan from whoosh.writing import AsyncWriter @@ -46,10 +49,13 @@ logger = logging.getLogger("paperless.index") def get_schema(): + # add accent-folding filter to a stemming analyzer: + af_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) + return Schema( id=NUMERIC(stored=True, unique=True), - title=TEXT(sortable=True), - content=TEXT(), + title=TEXT(sortable=True, analyzer=af_analyzer), + content=TEXT(analyzer=af_analyzer), asn=NUMERIC(sortable=True, signed=False), correspondent=TEXT(sortable=True), correspondent_id=NUMERIC(), diff --git a/src/documents/tests/test_api_search.py b/src/documents/tests/test_api_search.py index 1b46f8e33..ecaa49d2e 100644 --- a/src/documents/tests/test_api_search.py +++ b/src/documents/tests/test_api_search.py @@ -552,7 +552,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): response = self.client.get("/api/search/autocomplete/?term=app") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data, [b"apples", b"applebaum", b"appletini"]) + self.assertEqual(response.data, [b"appl", b"applebaum", b"appletini"]) d3.owner = u2 @@ -561,7 +561,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): response = self.client.get("/api/search/autocomplete/?term=app") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data, [b"apples", b"applebaum"]) + self.assertEqual(response.data, [b"appl", b"applebaum"]) assign_perm("view_document", u1, d3) @@ -570,7 +570,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): response = self.client.get("/api/search/autocomplete/?term=app") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data, [b"apples", b"applebaum", b"appletini"]) + self.assertEqual(response.data, [b"appl", b"applebaum", b"appletini"]) def test_search_autocomplete_field_name_match(self): """