From 2ff54875103c28a77e1e505790b7c317439651ce Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Mon, 2 Nov 2020 12:23:50 +0100 Subject: [PATCH] paginated search results --- src/documents/index.py | 23 ++--------------------- src/documents/views.py | 42 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/src/documents/index.py b/src/documents/index.py index 62d3b822a..e3b391569 100644 --- a/src/documents/index.py +++ b/src/documents/index.py @@ -1,12 +1,8 @@ -from collections import Iterable - from django.db import models from django.dispatch import receiver -from whoosh.fields import Schema, TEXT, NUMERIC, DATETIME, KEYWORD +from whoosh.fields import Schema, TEXT, NUMERIC from whoosh.highlight import Formatter, get_text from whoosh.index import create_in, exists_in, open_dir -from whoosh.qparser import QueryParser -from whoosh.query import terms from whoosh.writing import AsyncWriter from documents.models import Document @@ -57,7 +53,7 @@ def get_schema(): return Schema( id=NUMERIC(stored=True, unique=True, numtype=int), title=TEXT(stored=True), - content=TEXT(stored=True) + content=TEXT() ) @@ -90,21 +86,6 @@ def remove_document_from_index(sender, instance, **kwargs): writer.delete_by_term('id', instance.id) -def query_index(ix, querystr): - with ix.searcher() as searcher: - query = QueryParser("content", ix.schema, termclass=terms.FuzzyTerm).parse(querystr) - results = searcher.search(query) - results.formatter = JsonFormatter() - results.fragmenter.surround = 50 - - return [ - {'id': r['id'], - 'highlights': r.highlights("content"), - 'score': r.score, - 'title': r['title'] - } for r in results] - - def autocomplete(ix, term, limit=10): with ix.reader() as reader: terms = [] diff --git a/src/documents/views.py b/src/documents/views.py index 4eee79bef..f8050a459 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -6,6 +6,9 @@ from django_filters.rest_framework import DjangoFilterBackend from rest_framework.decorators import action from rest_framework.response import Response from rest_framework.views import APIView +from whoosh import highlight +from whoosh.qparser import QueryParser +from whoosh.query import terms from paperless.db import GnuPG from paperless.views import StandardPagination @@ -164,16 +167,45 @@ class SearchView(APIView): ix = index.open_index() + def add_infos_to_hit(self, r): + doc = Document.objects.get(id=r['id']) + return {'id': r['id'], + 'highlights': r.highlights("content", text=doc.content), + 'score': r.score, + 'rank': r.rank, + 'document': DocumentSerializer(doc).data, + 'title': r['title'] + } + def get(self, request, format=None): if 'query' in request.query_params: query = request.query_params['query'] - query_results = index.query_index(self.ix, query) - for r in query_results: - r['document'] = DocumentSerializer(Document.objects.get(id=r['id'])).data + try: + page = int(request.query_params.get('page', 1)) + except (ValueError, TypeError): + page = 1 + + with self.ix.searcher() as searcher: + query_parser = QueryParser("content", self.ix.schema, + termclass=terms.FuzzyTerm).parse(query) + result_page = searcher.search_page(query_parser, page) + result_page.results.fragmenter = highlight.ContextFragmenter( + surround=50) + result_page.results.fragmenter = highlight.PinpointFragmenter() + result_page.results.formatter = index.JsonFormatter() + + return Response( + {'count': len(result_page), + 'page': result_page.pagenum, + 'page_count': result_page.pagecount, + 'results': list(map(self.add_infos_to_hit, result_page))}) - return Response(query_results) else: - return Response([]) + return Response({ + 'count': 0, + 'page': 0, + 'page_count': 0, + 'results': []}) class SearchAutoCompleteView(APIView):