Merge branch 'feature-more-like-this' into dev

This commit is contained in:
jonaswinkler 2020-12-19 01:15:51 +01:00
commit 3cdd38cb70
14 changed files with 176 additions and 47 deletions

View File

@ -221,21 +221,16 @@ Each fragment contains a list of strings, and some of them are marked as a highl
[ [
[ [
{"text": "This is a sample text with a "}, {"text": "This is a sample text with a ", "highlight": false},
{"text": "highlighted", "term": 0}, {"text": "highlighted", "highlight": true},
{"text": " word."} {"text": " word.", "highlight": false}
], ],
[ [
{"text": "Another", "term": 1}, {"text": "Another", "highlight": true},
{"text": " fragment with a highlight."} {"text": " fragment with a highlight.", "highlight": false}
] ]
] ]
When ``term`` is present within a string, the word within ``text`` should be highlighted.
The term index groups multiple matches together and words with the same index
should get identical highlighting.
A client may use this example to produce the following output: A client may use this example to produce the following output:
... This is a sample text with a **highlighted** word. ... **Another** fragment with a highlight. ... ... This is a sample text with a **highlighted** word. ... **Another** fragment with a highlight. ...

View File

@ -34,6 +34,12 @@
</div> </div>
<button type="button" class="btn btn-sm btn-outline-primary mr-2" (click)="moreLike()">
<svg class="buttonicon" fill="currentColor">
<use xlink:href="assets/bootstrap-icons.svg#three-dots" />
</svg>
<span class="d-none d-lg-inline"> More like this</span>
</button>
<button type="button" class="btn btn-sm btn-outline-primary" (click)="close()"> <button type="button" class="btn btn-sm btn-outline-primary" (click)="close()">
<svg class="buttonicon" fill="currentColor"> <svg class="buttonicon" fill="currentColor">

View File

@ -172,6 +172,10 @@ export class DocumentDetailComponent implements OnInit {
} }
/**
 * Opens the search view in "more like this" mode for the current document.
 *
 * Navigates to the `search` route and passes the document id in the
 * `more_like` query parameter.
 */
moreLike() {
  // Prefer the loaded document's id. If `document` has not been assigned yet,
  // fall back to `documentId` (the field hasNext() passes to the list view
  // service) instead of throwing on an undefined dereference.
  // NOTE(review): assumes documentId identifies the same document — confirm.
  const id = this.document?.id ?? this.documentId
  this.router.navigate(["search"], {queryParams: {more_like: id}})
}
hasNext() { hasNext() {
return this.documentListViewService.hasNext(this.documentId) return this.documentListViewService.hasNext(this.documentId)
} }

View File

@ -23,8 +23,14 @@
</p> </p>
<div class="d-flex justify-content-between align-items-center"> <div class="d-flex align-items-center">
<div class="btn-group"> <div class="btn-group">
<a routerLink="/search" [queryParams]="{'more_like': document.id}" class="btn btn-sm btn-outline-secondary" *ngIf="moreLikeThis">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-three-dots" viewBox="0 0 16 16">
<path fill-rule="evenodd" d="M3 9.5a1.5 1.5 0 1 1 0-3 1.5 1.5 0 0 1 0 3zm5 0a1.5 1.5 0 1 1 0-3 1.5 1.5 0 0 1 0 3zm5 0a1.5 1.5 0 1 1 0-3 1.5 1.5 0 0 1 0 3z"/>
</svg>
More like this
</a>
<a routerLink="/documents/{{document.id}}" class="btn btn-sm btn-outline-secondary"> <a routerLink="/documents/{{document.id}}" class="btn btn-sm btn-outline-secondary">
<svg width="1em" height="1em" viewBox="0 0 16 16" class="bi bi-pencil" fill="currentColor" xmlns="http://www.w3.org/2000/svg"> <svg width="1em" height="1em" viewBox="0 0 16 16" class="bi bi-pencil" fill="currentColor" xmlns="http://www.w3.org/2000/svg">
<path fill-rule="evenodd" d="M12.146.146a.5.5 0 0 1 .708 0l3 3a.5.5 0 0 1 0 .708l-10 10a.5.5 0 0 1-.168.11l-5 2a.5.5 0 0 1-.65-.65l2-5a.5.5 0 0 1 .11-.168l10-10zM11.207 2.5L13.5 4.793 14.793 3.5 12.5 1.207 11.207 2.5zm1.586 3L10.5 3.207 4 9.707V10h.5a.5.5 0 0 1 .5.5v.5h.5a.5.5 0 0 1 .5.5v.5h.293l6.5-6.5zm-9.761 5.175l-.106.106-1.528 3.821 3.821-1.528.106-.106A.5.5 0 0 1 5 12.5V12h-.5a.5.5 0 0 1-.5-.5V11h-.5a.5.5 0 0 1-.468-.325z"/> <path fill-rule="evenodd" d="M12.146.146a.5.5 0 0 1 .708 0l3 3a.5.5 0 0 1 0 .708l-10 10a.5.5 0 0 1-.168.11l-5 2a.5.5 0 0 1-.65-.65l2-5a.5.5 0 0 1 .11-.168l10-10zM11.207 2.5L13.5 4.793 14.793 3.5 12.5 1.207 11.207 2.5zm1.586 3L10.5 3.207 4 9.707V10h.5a.5.5 0 0 1 .5.5v.5h.5a.5.5 0 0 1 .5.5v.5h.293l6.5-6.5zm-9.761 5.175l-.106.106-1.528 3.821 3.821-1.528.106-.106A.5.5 0 0 1 5 12.5V12h-.5a.5.5 0 0 1-.5-.5V11h-.5a.5.5 0 0 1-.468-.325z"/>
@ -45,7 +51,13 @@
</svg> </svg>
Download Download
</a> </a>
</div> </div>
<small class="text-muted ml-auto">Score:</small>
<ngb-progressbar *ngIf="searchScore" [type]="searchScoreClass" [value]="searchScore" class="search-score-bar mx-2" [max]="1"></ngb-progressbar>
<small class="text-muted">Created: {{document.created | date}}</small> <small class="text-muted">Created: {{document.created | date}}</small>
</div> </div>

View File

@ -10,3 +10,9 @@
position: absolute; position: absolute;
} }
/* Sizing for the element carrying the search-score-bar class: a fixed
   100px-wide, 5px-tall bar, offset 2px from the top (presumably to line it
   up with the neighbouring small text — confirm visually). */
.search-score-bar {
width: 100px;
height: 5px;
margin-top: 2px;
}

View File

@ -12,6 +12,9 @@ export class DocumentCardLargeComponent implements OnInit {
constructor(private documentService: DocumentService, private sanitizer: DomSanitizer) { } constructor(private documentService: DocumentService, private sanitizer: DomSanitizer) { }
@Input()
moreLikeThis: boolean = false
@Input() @Input()
document: PaperlessDocument document: PaperlessDocument
@ -24,6 +27,19 @@ export class DocumentCardLargeComponent implements OnInit {
@Output() @Output()
clickCorrespondent = new EventEmitter<number>() clickCorrespondent = new EventEmitter<number>()
@Input()
searchScore: number
/**
 * Contextual style name derived from `searchScore`.
 *
 * Returns "success" for scores above 0.7, "warning" for scores above 0.3,
 * and "danger" for everything else. An unset or NaN score also yields
 * "danger", because both comparisons evaluate to false.
 */
get searchScoreClass() {
  const score = this.searchScore
  if (score > 0.7) {
    return "success"
  }
  return score > 0.3 ? "warning" : "danger"
}
ngOnInit(): void { ngOnInit(): void {
} }

View File

@ -1,3 +1,3 @@
... <span *ngFor="let fragment of highlights"> ... <span *ngFor="let fragment of highlights">
<span *ngFor="let token of fragment" [ngClass]="token.term != null ? 'match term'+ token.term : ''">{{token.text}}</span> ... <span *ngFor="let token of fragment" [class.match]="token.highlight">{{token.text}}</span> ...
</span> </span>

View File

@ -1,4 +1,4 @@
.match { .match {
color: black; color: black;
background-color: orange; background-color: rgb(255, 211, 66);
} }

View File

@ -3,7 +3,12 @@
<div *ngIf="errorMessage" class="alert alert-danger">Invalid search query: {{errorMessage}}</div> <div *ngIf="errorMessage" class="alert alert-danger">Invalid search query: {{errorMessage}}</div>
<p> <p *ngIf="more_like">
Showing documents similar to
<a routerLink="/documents/{{more_like}}">{{more_like_doc?.original_file_name}}</a>
</p>
<p *ngIf="query">
Search string: <i>{{query}}</i> Search string: <i>{{query}}</i>
<ng-container *ngIf="correctedQuery"> <ng-container *ngIf="correctedQuery">
- Did you mean "<a [routerLink]="" (click)="searchCorrectedQuery()">{{correctedQuery}}</a>"? - Did you mean "<a [routerLink]="" (click)="searchCorrectedQuery()">{{correctedQuery}}</a>"?
@ -15,7 +20,9 @@
<p>{{resultCount}} result(s)</p> <p>{{resultCount}} result(s)</p>
<app-document-card-large *ngFor="let result of results" <app-document-card-large *ngFor="let result of results"
[document]="result.document" [document]="result.document"
[details]="result.highlights"> [details]="result.highlights"
[searchScore]="result.score / maxScore"
[moreLikeThis]="true">
</app-document-card-large> </app-document-card-large>
</div> </div>

View File

@ -1,6 +1,9 @@
import { Component, OnInit } from '@angular/core'; import { Component, OnInit } from '@angular/core';
import { ActivatedRoute, Router } from '@angular/router'; import { ActivatedRoute, Router } from '@angular/router';
import { PaperlessDocument } from 'src/app/data/paperless-document';
import { PaperlessDocumentType } from 'src/app/data/paperless-document-type';
import { SearchHit } from 'src/app/data/search-result'; import { SearchHit } from 'src/app/data/search-result';
import { DocumentService } from 'src/app/services/rest/document.service';
import { SearchService } from 'src/app/services/rest/search.service'; import { SearchService } from 'src/app/services/rest/search.service';
@Component({ @Component({
@ -14,6 +17,10 @@ export class SearchComponent implements OnInit {
query: string = "" query: string = ""
more_like: number
more_like_doc: PaperlessDocument
searching = false searching = false
currentPage = 1 currentPage = 1
@ -26,11 +33,24 @@ export class SearchComponent implements OnInit {
errorMessage: string errorMessage: string
constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router) { } get maxScore() {
return this.results?.length > 0 ? this.results[0].score : 100
}
constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router, private documentService: DocumentService) { }
ngOnInit(): void { ngOnInit(): void {
this.route.queryParamMap.subscribe(paramMap => { this.route.queryParamMap.subscribe(paramMap => {
window.scrollTo(0, 0)
this.query = paramMap.get('query') this.query = paramMap.get('query')
this.more_like = paramMap.has('more_like') ? +paramMap.get('more_like') : null
if (this.more_like) {
this.documentService.get(this.more_like).subscribe(r => {
this.more_like_doc = r
})
} else {
this.more_like_doc = null
}
this.searching = true this.searching = true
this.currentPage = 1 this.currentPage = 1
this.loadPage() this.loadPage()
@ -39,13 +59,14 @@ export class SearchComponent implements OnInit {
} }
searchCorrectedQuery() { searchCorrectedQuery() {
this.router.navigate(["search"], {queryParams: {query: this.correctedQuery}}) this.router.navigate(["search"], {queryParams: {query: this.correctedQuery, more_like: this.more_like}})
} }
loadPage(append: boolean = false) { loadPage(append: boolean = false) {
this.errorMessage = null this.errorMessage = null
this.correctedQuery = null this.correctedQuery = null
this.searchService.search(this.query, this.currentPage).subscribe(result => {
this.searchService.search(this.query, this.currentPage, this.more_like).subscribe(result => {
if (append) { if (append) {
this.results.push(...result.results) this.results.push(...result.results)
} else { } else {

View File

@ -15,11 +15,17 @@ export class SearchService {
constructor(private http: HttpClient, private documentService: DocumentService) { } constructor(private http: HttpClient, private documentService: DocumentService) { }
search(query: string, page?: number): Observable<SearchResult> { search(query: string, page?: number, more_like?: number): Observable<SearchResult> {
let httpParams = new HttpParams().set('query', query) let httpParams = new HttpParams()
if (query) {
httpParams = httpParams.set('query', query)
}
if (page) { if (page) {
httpParams = httpParams.set('page', page.toString()) httpParams = httpParams.set('page', page.toString())
} }
if (more_like) {
httpParams = httpParams.set('more_like', more_like.toString())
}
return this.http.get<SearchResult>(`${environment.apiBaseUrl}search/`, {params: httpParams}).pipe( return this.http.get<SearchResult>(`${environment.apiBaseUrl}search/`, {params: httpParams}).pipe(
map(result => { map(result => {
result.results.forEach(hit => this.documentService.addObservablesToDocument(hit.document)) result.results.forEach(hit => this.documentService.addObservablesToDocument(hit.document))

View File

@ -3,7 +3,7 @@ import os
from contextlib import contextmanager from contextlib import contextmanager
from django.conf import settings from django.conf import settings
from whoosh import highlight from whoosh import highlight, classify, query
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
from whoosh.highlight import Formatter, get_text from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir from whoosh.index import create_in, exists_in, open_dir
@ -20,32 +20,37 @@ class JsonFormatter(Formatter):
self.seen = {} self.seen = {}
def format_token(self, text, token, replace=False): def format_token(self, text, token, replace=False):
seen = self.seen
ttext = self._text(get_text(text, token, replace)) ttext = self._text(get_text(text, token, replace))
if ttext in seen: return {'text': ttext, 'highlight': 'true'}
termnum = seen[ttext]
else:
termnum = len(seen)
seen[ttext] = termnum
return {'text': ttext, 'term': termnum}
def format_fragment(self, fragment, replace=False): def format_fragment(self, fragment, replace=False):
output = [] output = []
index = fragment.startchar index = fragment.startchar
text = fragment.text text = fragment.text
amend_token = None
for t in fragment.matches: for t in fragment.matches:
if t.startchar is None: if t.startchar is None:
continue continue
if t.startchar < index: if t.startchar < index:
continue continue
if t.startchar > index: if t.startchar > index:
output.append({'text': text[index:t.startchar]}) text_inbetween = text[index:t.startchar]
output.append(self.format_token(text, t, replace)) if amend_token and t.startchar - index < 10:
amend_token['text'] += text_inbetween
else:
output.append({'text': text_inbetween,
'highlight': False})
amend_token = None
token = self.format_token(text, t, replace)
if amend_token:
amend_token['text'] += token['text']
else:
output.append(token)
amend_token = token
index = t.endchar index = t.endchar
if index < fragment.endchar: if index < fragment.endchar:
output.append({'text': text[index:fragment.endchar]}) output.append({'text': text[index:fragment.endchar],
'highlight': False})
return output return output
def format(self, fragments, replace=False): def format(self, fragments, replace=False):
@ -120,22 +125,42 @@ def remove_document_from_index(document):
@contextmanager @contextmanager
def query_page(ix, querystring, page): def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
searcher = ix.searcher() searcher = ix.searcher()
try: try:
if querystring:
qp = MultifieldParser( qp = MultifieldParser(
["content", "title", "correspondent", "tag", "type"], ["content", "title", "correspondent", "tag", "type"],
ix.schema) ix.schema)
qp.add_plugin(DateParserPlugin()) qp.add_plugin(DateParserPlugin())
str_q = qp.parse(querystring)
corrected = searcher.correct_query(str_q, querystring)
else:
str_q = None
corrected = None
if more_like_doc_id:
docnum = searcher.document_number(id=more_like_doc_id)
kts = searcher.key_terms_from_text(
'content', more_like_doc_content, numterms=20,
model=classify.Bo1Model, normalize=False)
more_like_q = query.Or(
[query.Term('content', word, boost=weight)
for word, weight in kts])
result_page = searcher.search_page(
more_like_q, page, filter=str_q, mask={docnum})
elif str_q:
result_page = searcher.search_page(str_q, page)
else:
raise ValueError(
"Either querystring or more_like_doc_id is required."
)
q = qp.parse(querystring)
result_page = searcher.search_page(q, page)
result_page.results.fragmenter = highlight.ContextFragmenter( result_page.results.fragmenter = highlight.ContextFragmenter(
surround=50) surround=50)
result_page.results.formatter = JsonFormatter() result_page.results.formatter = JsonFormatter()
corrected = searcher.correct_query(q, querystring) if corrected and corrected.query != str_q:
if corrected.query != q:
corrected_query = corrected.string corrected_query = corrected.string
else: else:
corrected_query = None corrected_query = None

View File

@ -351,6 +351,25 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(correction, None) self.assertEqual(correction, None)
# Checks the "more like this" search: a request carrying only more_like=<id>
# must succeed and return the other indexed documents, most similar first,
# without returning the reference document itself.
def test_search_more_like(self):
# d2 is the reference document. d3's content differs from d2's only in the
# month word, while d1 shares just a couple of content words with d2.
d1=Document.objects.create(title="invoice", content="the thing i bought at a shop and paid with bank account", checksum="A", pk=1)
d2=Document.objects.create(title="bank statement 1", content="things i paid for in august", pk=2, checksum="B")
d3=Document.objects.create(title="bank statement 3", content="things i paid for in september", pk=3, checksum="C")
# Put all three documents into the search index before querying.
with AsyncWriter(index.open_index()) as writer:
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
# No 'query' parameter is sent: more_like alone must be accepted.
response = self.client.get(f"/api/search/?more_like={d2.id}")
self.assertEqual(response.status_code, 200)
results = response.data['results']
# Exactly two hits (d2 is not among them), ordered d3 then d1.
self.assertEqual(len(results), 2)
self.assertEqual(results[0]['id'], d3.id)
self.assertEqual(results[1]['id'], d1.id)
def test_statistics(self): def test_statistics(self):
doc1 = Document.objects.create(title="none1", checksum="A") doc1 = Document.objects.create(title="none1", checksum="A")

View File

@ -340,14 +340,27 @@ class SearchView(APIView):
} }
def get(self, request, format=None): def get(self, request, format=None):
if 'query' not in request.query_params:
if 'query' in request.query_params:
query = request.query_params['query']
else:
query = None
if 'more_like' in request.query_params:
more_like_id = request.query_params['more_like']
more_like_content = Document.objects.get(id=more_like_id).content
else:
more_like_id = None
more_like_content = None
if not query and not more_like_id:
return Response({ return Response({
'count': 0, 'count': 0,
'page': 0, 'page': 0,
'page_count': 0, 'page_count': 0,
'corrected_query': None,
'results': []}) 'results': []})
query = request.query_params['query']
try: try:
page = int(request.query_params.get('page', 1)) page = int(request.query_params.get('page', 1))
except (ValueError, TypeError): except (ValueError, TypeError):
@ -357,8 +370,7 @@ class SearchView(APIView):
page = 1 page = 1
try: try:
with index.query_page(self.ix, query, page) as (result_page, with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501
corrected_query):
return Response( return Response(
{'count': len(result_page), {'count': len(result_page),
'page': result_page.pagenum, 'page': result_page.pagenum,