more like this searching

2026-02-05 23:32:46 -06:00 · 2020-12-17 21:36:21 +01:00
parent 9d88e3ee07
commit 29c094e407
10 changed files with 113 additions and 27 deletions
--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -3,7 +3,7 @@ import os
 from contextlib import contextmanager

 from django.conf import settings
-from whoosh import highlight
+from whoosh import highlight, classify, query
 from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
 from whoosh.highlight import Formatter, get_text
 from whoosh.index import create_in, exists_in, open_dir
@@ -120,22 +120,39 @@ def remove_document_from_index(document):


@contextmanager
-def query_page(ix, querystring, page):
+def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
    searcher = ix.searcher()
    try:
-        qp = MultifieldParser(
-            ["content", "title", "correspondent", "tag", "type"],
-            ix.schema)
-        qp.add_plugin(DateParserPlugin())
+        if querystring:
+            qp = MultifieldParser(
+                ["content", "title", "correspondent", "tag", "type"],
+                ix.schema)
+            qp.add_plugin(DateParserPlugin())
+            str_q = qp.parse(querystring)
+            corrected = searcher.correct_query(str_q, querystring)
+        else:
+            str_q = None
+            corrected = None
+
+        if more_like_doc_id:
+            docnum = searcher.document_number(id=more_like_doc_id)
+            kts = searcher.key_terms_from_text('content', more_like_doc_content, numterms=20,
+                                           model=classify.Bo1Model, normalize=False)
+            more_like_q = query.Or([query.Term('content', word, boost=weight)
+                          for word, weight in kts])
+            result_page = searcher.search_page(more_like_q, page, filter=str_q, mask={docnum})
+        elif str_q:
+            result_page = searcher.search_page(str_q, page)
+        else:
+            raise ValueError(
+                "Either querystring or more_like_doc_id is required."
+            )

-        q = qp.parse(querystring)
-        result_page = searcher.search_page(q, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()

-        corrected = searcher.correct_query(q, querystring)
-        if corrected.query != q:
+        if corrected and corrected.query != str_q:
            corrected_query = corrected.string
        else:
            corrected_query = None
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -335,14 +335,19 @@ class SearchView(APIView):
                }

    def get(self, request, format=None):
-        if 'query' not in request.query_params:
-            return Response({
-                'count': 0,
-                'page': 0,
-                'page_count': 0,
-                'results': []})

-        query = request.query_params['query']
+        if 'query' in request.query_params:
+            query = request.query_params['query']
+        else:
+            query = None
+
+        if 'more_like' in request.query_params:
+            more_like_id = request.query_params['more_like']
+            more_like_content = Document.objects.get(id=more_like_id).content
+        else:
+            more_like_id = None
+            more_like_content = None
+
        try:
            page = int(request.query_params.get('page', 1))
        except (ValueError, TypeError):
@@ -352,7 +357,7 @@ class SearchView(APIView):
            page = 1

        try:
-            with index.query_page(self.ix, query, page) as (result_page,
+            with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page,
                                                            corrected_query):
                return Response(
                    {'count': len(result_page),