searching for tags, spelling corrections fixes #74

This commit is contained in:
jonaswinkler 2020-11-30 15:13:53 +01:00
parent 0d8688515c
commit 1ef12d2cbc
8 changed files with 70 additions and 17 deletions

View File

@ -274,6 +274,7 @@ management command:
This command takes no arguments. This command takes no arguments.
.. _`administration-index`:
Managing the document search index Managing the document search index
================================== ==================================

View File

@ -8,6 +8,15 @@ Changelog
paperless-ng 0.9.4 paperless-ng 0.9.4
################## ##################
* Searching:
* Paperless now supports searching by tags. In order to have this applied to your
existing documents, you need to run the ``document_index reindex`` management command
(see :ref:`administration-index`),
which adds tags to your search index. After that, Paperless keeps your index up to date whenever
something changes.
* Paperless now has spelling corrections ("Did you mean") for mistyped queries.
* Front end: * Front end:
* Clickable tags, correspondents and types allow quick filtering for related documents. * Clickable tags, correspondents and types allow quick filtering for related documents.

View File

@ -1,7 +1,13 @@
<app-page-header title="Search results"> <app-page-header title="Search results">
</app-page-header> </app-page-header>
<p>Search string: <i>{{query}}</i></p> <p>
Search string: <i>{{query}}</i>
<ng-container *ngIf="correctedQuery">
- Did you mean "<a [routerLink]="" (click)="searchCorrectedQuery()">{{correctedQuery}}</a>"?
</ng-container>
</p>
<div [class.result-content-searching]="searching" infiniteScroll (scrolled)="onScroll()"> <div [class.result-content-searching]="searching" infiniteScroll (scrolled)="onScroll()">
<p>{{resultCount}} result(s)</p> <p>{{resultCount}} result(s)</p>

View File

@ -1,5 +1,5 @@
import { Component, OnInit } from '@angular/core'; import { Component, OnInit } from '@angular/core';
import { ActivatedRoute } from '@angular/router'; import { ActivatedRoute, Router } from '@angular/router';
import { SearchHit } from 'src/app/data/search-result'; import { SearchHit } from 'src/app/data/search-result';
import { SearchService } from 'src/app/services/rest/search.service'; import { SearchService } from 'src/app/services/rest/search.service';
@ -22,7 +22,9 @@ export class SearchComponent implements OnInit {
resultCount resultCount
constructor(private searchService: SearchService, private route: ActivatedRoute) { } correctedQuery: string = null
constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router) { }
ngOnInit(): void { ngOnInit(): void {
this.route.queryParamMap.subscribe(paramMap => { this.route.queryParamMap.subscribe(paramMap => {
@ -34,6 +36,11 @@ export class SearchComponent implements OnInit {
} }
/**
 * Re-runs the search using the suggested spelling correction.
 *
 * Navigates to the "search" route with the current `correctedQuery` as the
 * `query` query parameter, then resets `correctedQuery` to null.
 */
searchCorrectedQuery() {
this.router.navigate(["search"], {queryParams: {query: this.correctedQuery}})
this.correctedQuery = null
}
loadPage(append: boolean = false) { loadPage(append: boolean = false) {
this.searchService.search(this.query, this.currentPage).subscribe(result => { this.searchService.search(this.query, this.currentPage).subscribe(result => {
if (append) { if (append) {
@ -44,12 +51,11 @@ export class SearchComponent implements OnInit {
this.pageCount = result.page_count this.pageCount = result.page_count
this.searching = false this.searching = false
this.resultCount = result.count this.resultCount = result.count
this.correctedQuery = result.corrected_query
}) })
} }
onScroll() { onScroll() {
console.log(this.currentPage)
console.log(this.pageCount)
if (this.currentPage < this.pageCount) { if (this.currentPage < this.pageCount) {
this.currentPage += 1 this.currentPage += 1
this.loadPage(true) this.loadPage(true)

View File

@ -21,6 +21,8 @@ export interface SearchResult {
page?: number page?: number
page_count?: number page_count?: number
corrected_query?: string
results?: SearchHit[] results?: SearchHit[]

View File

@ -4,7 +4,7 @@ from contextlib import contextmanager
from django.conf import settings from django.conf import settings
from whoosh import highlight from whoosh import highlight
from whoosh.fields import Schema, TEXT, NUMERIC from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD
from whoosh.highlight import Formatter, get_text from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser from whoosh.qparser import MultifieldParser
@ -59,14 +59,15 @@ def get_schema():
id=NUMERIC(stored=True, unique=True, numtype=int), id=NUMERIC(stored=True, unique=True, numtype=int),
title=TEXT(stored=True), title=TEXT(stored=True),
content=TEXT(), content=TEXT(),
correspondent=TEXT(stored=True) correspondent=TEXT(stored=True),
tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True)
) )
def open_index(recreate=False): def open_index(recreate=False):
try: try:
if exists_in(settings.INDEX_DIR) and not recreate: if exists_in(settings.INDEX_DIR) and not recreate:
return open_dir(settings.INDEX_DIR) return open_dir(settings.INDEX_DIR, schema=get_schema())
except Exception as e: except Exception as e:
logger.error(f"Error while opening the index: {e}, recreating.") logger.error(f"Error while opening the index: {e}, recreating.")
@ -77,11 +78,13 @@ def open_index(recreate=False):
def update_document(writer, doc): def update_document(writer, doc):
logger.debug("Indexing {}...".format(doc)) logger.debug("Indexing {}...".format(doc))
tags = ",".join([t.name for t in doc.tags.all()])
writer.update_document( writer.update_document(
id=doc.pk, id=doc.pk,
title=doc.title, title=doc.title,
content=doc.content, content=doc.content,
correspondent=doc.correspondent.name if doc.correspondent else None correspondent=doc.correspondent.name if doc.correspondent else None,
tag=tags if tags else None
) )
@ -106,13 +109,21 @@ def remove_document_from_index(document):
def query_page(ix, query, page): def query_page(ix, query, page):
searcher = ix.searcher() searcher = ix.searcher()
try: try:
query_parser = MultifieldParser(["content", "title", "correspondent"], query_parser = MultifieldParser(
["content", "title", "correspondent", "tag"],
ix.schema).parse(query) ix.schema).parse(query)
result_page = searcher.search_page(query_parser, page) result_page = searcher.search_page(query_parser, page)
result_page.results.fragmenter = highlight.ContextFragmenter( result_page.results.fragmenter = highlight.ContextFragmenter(
surround=50) surround=50)
result_page.results.formatter = JsonFormatter() result_page.results.formatter = JsonFormatter()
yield result_page
corrected = searcher.correct_query(query_parser, query)
if corrected.query != query_parser:
corrected_query = corrected.string
else:
corrected_query = None
yield result_page, corrected_query
finally: finally:
searcher.close() searcher.close()

View File

@ -289,6 +289,22 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 200) self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 10) self.assertEqual(len(response.data), 10)
def test_search_spelling_correction(self):
with AsyncWriter(index.open_index()) as writer:
for i in range(55):
doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content=f"Things document {i+1}")
index.update_document(writer, doc)
response = self.client.get("/api/search/?query=thing")
correction = response.data['corrected_query']
self.assertEqual(correction, "things")
response = self.client.get("/api/search/?query=things")
correction = response.data['corrected_query']
self.assertEqual(correction, None)
def test_statistics(self): def test_statistics(self):
doc1 = Document.objects.create(title="none1", checksum="A") doc1 = Document.objects.create(title="none1", checksum="A")

View File

@ -227,11 +227,13 @@ class SearchView(APIView):
if page < 1: if page < 1:
page = 1 page = 1
with index.query_page(self.ix, query, page) as result_page: with index.query_page(self.ix, query, page) as (result_page,
corrected_query):
return Response( return Response(
{'count': len(result_page), {'count': len(result_page),
'page': result_page.pagenum, 'page': result_page.pagenum,
'page_count': result_page.pagecount, 'page_count': result_page.pagecount,
'corrected_query': corrected_query,
'results': list(map(self.add_infos_to_hit, result_page))}) 'results': list(map(self.add_infos_to_hit, result_page))})
else: else: