Document index accent folding

This commit is contained in:
shamoon 2024-04-14 21:16:52 -07:00
parent cbd9823ad6
commit dc58a5673b
No known key found for this signature in database
2 changed files with 11 additions and 5 deletions

View File

@ -17,6 +17,8 @@ from guardian.shortcuts import get_users_with_perms
from whoosh import classify from whoosh import classify
from whoosh import highlight from whoosh import highlight
from whoosh import query from whoosh import query
from whoosh.analysis import CharsetFilter
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import BOOLEAN from whoosh.fields import BOOLEAN
from whoosh.fields import DATETIME from whoosh.fields import DATETIME
from whoosh.fields import KEYWORD from whoosh.fields import KEYWORD
@ -36,6 +38,7 @@ from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.qparser.dateparse import English from whoosh.qparser.dateparse import English
from whoosh.qparser.plugins import FieldsPlugin from whoosh.qparser.plugins import FieldsPlugin
from whoosh.scoring import TF_IDF from whoosh.scoring import TF_IDF
from whoosh.support.charset import accent_map
from whoosh.util.times import timespan from whoosh.util.times import timespan
from whoosh.writing import AsyncWriter from whoosh.writing import AsyncWriter
@ -54,10 +57,13 @@ logger = logging.getLogger("paperless.index")
def get_schema() -> Schema: def get_schema() -> Schema:
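# Add an accent-folding filter to a stemming analyzer so that accented and
# unaccented spellings match. Note: stemming also shortens the indexed terms
# (e.g. "apples" is stored as "appl"), which is what autocomplete returns.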
af_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
return Schema( return Schema(
id=NUMERIC(stored=True, unique=True), id=NUMERIC(stored=True, unique=True),
title=TEXT(sortable=True), title=TEXT(sortable=True, analyzer=af_analyzer),
content=TEXT(), content=TEXT(analyzer=af_analyzer),
asn=NUMERIC(sortable=True, signed=False), asn=NUMERIC(sortable=True, signed=False),
correspondent=TEXT(sortable=True), correspondent=TEXT(sortable=True),
correspondent_id=NUMERIC(), correspondent_id=NUMERIC(),

View File

@ -557,7 +557,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
response = self.client.get("/api/search/autocomplete/?term=app") response = self.client.get("/api/search/autocomplete/?term=app")
self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data, [b"apples", b"applebaum", b"appletini"]) self.assertEqual(response.data, [b"appl", b"applebaum", b"appletini"])
d3.owner = u2 d3.owner = u2
@ -566,7 +566,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
response = self.client.get("/api/search/autocomplete/?term=app") response = self.client.get("/api/search/autocomplete/?term=app")
self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data, [b"apples", b"applebaum"]) self.assertEqual(response.data, [b"appl", b"applebaum"])
assign_perm("view_document", u1, d3) assign_perm("view_document", u1, d3)
@ -575,7 +575,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
response = self.client.get("/api/search/autocomplete/?term=app") response = self.client.get("/api/search/autocomplete/?term=app")
self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data, [b"apples", b"applebaum", b"appletini"]) self.assertEqual(response.data, [b"appl", b"applebaum", b"appletini"])
def test_search_autocomplete_field_name_match(self): def test_search_autocomplete_field_name_match(self):
""" """