searching for tags, spelling corrections fixes #74

2025-05-01 11:19:32 -05:00 · 2020-11-30 15:13:53 +01:00 · 2020-11-30 15:13:53 +01:00 · 1ef12d2cbc
commit 1ef12d2cbc
parent 0d8688515c
8 changed files with 70 additions and 17 deletions
--- a/docs/administration.rst
+++ b/docs/administration.rst
@ -274,6 +274,7 @@ management command:
 This command takes no arguments.
 .. _`administration-index`:
 Managing the document search index
 ==================================
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -8,6 +8,15 @@ Changelog
 paperless-ng 0.9.4
 ##################
 * Searching:
  * Paperless now supports searching by tags. In order to have this applied to your
    existing documents, you need to perform a ``document_index reindex`` management command
    (see :ref:`administration-index`)
    that adds tags to your search index. Paperless keeps your index updated after that whenever
    something changes.
  * Paperless now has spelling corrections ("Did you mean") for mistyped queries.
 * Front end:
  * Clickable tags, correspondents and types allow quick filtering for related documents.
--- a/src-ui/src/app/components/search/search.component.html
+++ b/src-ui/src/app/components/search/search.component.html
@ -1,7 +1,13 @@
 <app-page-header title="Search results">
 </app-page-header>
-<p>Search string: <i>{{query}}</i></p>
+<p>
    Search string: <i>{{query}}</i>
    <ng-container *ngIf="correctedQuery">
        - Did you mean "<a [routerLink]="" (click)="searchCorrectedQuery()">{{correctedQuery}}</a>"?
    </ng-container>
 </p>
 <div [class.result-content-searching]="searching" infiniteScroll (scrolled)="onScroll()">
    <p>{{resultCount}} result(s)</p>
--- a/src-ui/src/app/components/search/search.component.ts
+++ b/src-ui/src/app/components/search/search.component.ts
@ -1,5 +1,5 @@
 import { Component, OnInit } from '@angular/core';
-import { ActivatedRoute } from '@angular/router';
+import { ActivatedRoute, Router } from '@angular/router';
 import { SearchHit } from 'src/app/data/search-result';
 import { SearchService } from 'src/app/services/rest/search.service';
@ -22,7 +22,9 @@ export class SearchComponent implements OnInit {
  resultCount
-  constructor(private searchService: SearchService, private route: ActivatedRoute) { }
+  correctedQuery: string = null
  constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router) { }
  ngOnInit(): void {
    this.route.queryParamMap.subscribe(paramMap => {
@ -34,6 +36,11 @@ export class SearchComponent implements OnInit {
  }
  searchCorrectedQuery() {
    this.router.navigate(["search"], {queryParams: {query: this.correctedQuery}})
    this.correctedQuery = null
  }
  loadPage(append: boolean = false) {
    this.searchService.search(this.query, this.currentPage).subscribe(result => {
      if (append) {
@ -44,12 +51,11 @@ export class SearchComponent implements OnInit {
      this.pageCount = result.page_count
      this.searching = false
      this.resultCount = result.count
      this.correctedQuery = result.corrected_query
    })
  }
  onScroll() {
    console.log(this.currentPage)
    console.log(this.pageCount)
    if (this.currentPage < this.pageCount) {
      this.currentPage += 1
      this.loadPage(true)
--- a/src-ui/src/app/data/search-result.ts
+++ b/src-ui/src/app/data/search-result.ts
@ -21,6 +21,8 @@ export interface SearchResult {
  page?: number
  page_count?: number
  corrected_query?: string
  results?: SearchHit[]
--- a/src/documents/index.py
+++ b/src/documents/index.py
@ -4,7 +4,7 @@ from contextlib import contextmanager
 from django.conf import settings
 from whoosh import highlight
-from whoosh.fields import Schema, TEXT, NUMERIC
+from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD
 from whoosh.highlight import Formatter, get_text
 from whoosh.index import create_in, exists_in, open_dir
 from whoosh.qparser import MultifieldParser
@ -59,14 +59,15 @@ def get_schema():
        id=NUMERIC(stored=True, unique=True, numtype=int),
        title=TEXT(stored=True),
        content=TEXT(),
-        correspondent=TEXT(stored=True)
+        correspondent=TEXT(stored=True),
        tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True)
    )
 def open_index(recreate=False):
    try:
        if exists_in(settings.INDEX_DIR) and not recreate:
-            return open_dir(settings.INDEX_DIR)
+            return open_dir(settings.INDEX_DIR, schema=get_schema())
    except Exception as e:
        logger.error(f"Error while opening the index: {e}, recreating.")
@ -77,11 +78,13 @@ def open_index(recreate=False):
 def update_document(writer, doc):
    logger.debug("Indexing {}...".format(doc))
    tags = ",".join([t.name for t in doc.tags.all()])
    writer.update_document(
        id=doc.pk,
        title=doc.title,
        content=doc.content,
-        correspondent=doc.correspondent.name if doc.correspondent else None
+        correspondent=doc.correspondent.name if doc.correspondent else None,
        tag=tags if tags else None
    )
@ -106,13 +109,21 @@ def remove_document_from_index(document):
 def query_page(ix, query, page):
    searcher = ix.searcher()
    try:
-        query_parser = MultifieldParser(["content", "title", "correspondent"],
+        query_parser = MultifieldParser(
            ["content", "title", "correspondent", "tag"],
            ix.schema).parse(query)
        result_page = searcher.search_page(query_parser, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()
-        yield result_page
+
        corrected = searcher.correct_query(query_parser, query)
        if corrected.query != query_parser:
            corrected_query = corrected.string
        else:
            corrected_query = None
        yield result_page, corrected_query
    finally:
        searcher.close()
--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@ -289,6 +289,22 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
        self.assertEqual(response.status_code, 200)
        self.assertEqual(len(response.data), 10)
    def test_search_spelling_correction(self):
        with AsyncWriter(index.open_index()) as writer:
            for i in range(55):
                doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content=f"Things document {i+1}")
                index.update_document(writer, doc)
        response = self.client.get("/api/search/?query=thing")
        correction = response.data['corrected_query']
        self.assertEqual(correction, "things")
        response = self.client.get("/api/search/?query=things")
        correction = response.data['corrected_query']
        self.assertEqual(correction, None)
    def test_statistics(self):
        doc1 = Document.objects.create(title="none1", checksum="A")
--- a/src/documents/views.py
+++ b/src/documents/views.py
@ -227,11 +227,13 @@ class SearchView(APIView):
            if page < 1:
                page = 1
-            with index.query_page(self.ix, query, page) as result_page:
+            with index.query_page(self.ix, query, page) as (result_page,
                                                            corrected_query):
                return Response(
                    {'count': len(result_page),
                     'page': result_page.pagenum,
                     'page_count': result_page.pagecount,
                     'corrected_query': corrected_query,
                     'results': list(map(self.add_infos_to_hit, result_page))})
        else: