Document index accent folding

This commit is contained in:
shamoon 2024-04-14 21:16:52 -07:00
parent f009d9868e
commit 9cf9f239ed
2 changed files with 11 additions and 5 deletions
src/documents

@@ -15,6 +15,8 @@ from guardian.shortcuts import get_users_with_perms
from whoosh import classify from whoosh import classify
from whoosh import highlight from whoosh import highlight
from whoosh import query from whoosh import query
from whoosh.analysis import CharsetFilter
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import BOOLEAN from whoosh.fields import BOOLEAN
from whoosh.fields import DATETIME from whoosh.fields import DATETIME
from whoosh.fields import KEYWORD from whoosh.fields import KEYWORD
@@ -34,6 +36,7 @@ from whoosh.qparser.plugins import FieldsPlugin
from whoosh.scoring import TF_IDF from whoosh.scoring import TF_IDF
from whoosh.searching import ResultsPage from whoosh.searching import ResultsPage
from whoosh.searching import Searcher from whoosh.searching import Searcher
from whoosh.support.charset import accent_map
from whoosh.util.times import timespan from whoosh.util.times import timespan
from whoosh.writing import AsyncWriter from whoosh.writing import AsyncWriter
@@ -46,10 +49,13 @@ logger = logging.getLogger("paperless.index")
def get_schema(): def get_schema():
# add accent-folding filter to a stemming analyzer:
af_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
return Schema( return Schema(
id=NUMERIC(stored=True, unique=True), id=NUMERIC(stored=True, unique=True),
title=TEXT(sortable=True), title=TEXT(sortable=True, analyzer=af_analyzer),
content=TEXT(), content=TEXT(analyzer=af_analyzer),
asn=NUMERIC(sortable=True, signed=False), asn=NUMERIC(sortable=True, signed=False),
correspondent=TEXT(sortable=True), correspondent=TEXT(sortable=True),
correspondent_id=NUMERIC(), correspondent_id=NUMERIC(),

@@ -552,7 +552,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
response = self.client.get("/api/search/autocomplete/?term=app") response = self.client.get("/api/search/autocomplete/?term=app")
self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data, [b"apples", b"applebaum", b"appletini"]) self.assertEqual(response.data, [b"appl", b"applebaum", b"appletini"])
d3.owner = u2 d3.owner = u2
@@ -561,7 +561,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
response = self.client.get("/api/search/autocomplete/?term=app") response = self.client.get("/api/search/autocomplete/?term=app")
self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data, [b"apples", b"applebaum"]) self.assertEqual(response.data, [b"appl", b"applebaum"])
assign_perm("view_document", u1, d3) assign_perm("view_document", u1, d3)
@@ -570,7 +570,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
response = self.client.get("/api/search/autocomplete/?term=app") response = self.client.get("/api/search/autocomplete/?term=app")
self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data, [b"apples", b"applebaum", b"appletini"]) self.assertEqual(response.data, [b"appl", b"applebaum", b"appletini"])
def test_search_autocomplete_field_name_match(self): def test_search_autocomplete_field_name_match(self):
""" """