mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge branch 'feature-more-like-this' into dev
This commit is contained in:
commit
3cdd38cb70
15
docs/api.rst
15
docs/api.rst
@ -221,21 +221,16 @@ Each fragment contains a list of strings, and some of them are marked as a highl
|
||||
|
||||
[
|
||||
[
|
||||
{"text": "This is a sample text with a "},
|
||||
{"text": "highlighted", "term": 0},
|
||||
{"text": " word."}
|
||||
{"text": "This is a sample text with a ", "highlight": false},
|
||||
{"text": "highlighted", "highlight": true},
|
||||
{"text": " word.", "highlight": false}
|
||||
],
|
||||
[
|
||||
{"text": "Another", "term": 1},
|
||||
{"text": " fragment with a highlight."}
|
||||
{"text": "Another", "highlight": true},
|
||||
{"text": " fragment with a highlight.", "highlight": false}
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
|
||||
When ``term`` is present within a string, the word within ``text`` should be highlighted.
|
||||
The term index groups multiple matches together and words with the same index
|
||||
should get identical highlighting.
|
||||
A client may use this example to produce the following output:
|
||||
|
||||
... This is a sample text with a **highlighted** word. ... **Another** fragment with a highlight. ...
|
||||
|
@ -34,6 +34,12 @@
|
||||
|
||||
</div>
|
||||
|
||||
<button type="button" class="btn btn-sm btn-outline-primary mr-2" (click)="moreLike()">
|
||||
<svg class="buttonicon" fill="currentColor">
|
||||
<use xlink:href="assets/bootstrap-icons.svg#three-dots" />
|
||||
</svg>
|
||||
<span class="d-none d-lg-inline"> More like this</span>
|
||||
</button>
|
||||
|
||||
<button type="button" class="btn btn-sm btn-outline-primary" (click)="close()">
|
||||
<svg class="buttonicon" fill="currentColor">
|
||||
|
@ -172,6 +172,10 @@ export class DocumentDetailComponent implements OnInit {
|
||||
|
||||
}
|
||||
|
||||
moreLike() {
|
||||
this.router.navigate(["search"], {queryParams: {more_like:this.document.id}})
|
||||
}
|
||||
|
||||
hasNext() {
|
||||
return this.documentListViewService.hasNext(this.documentId)
|
||||
}
|
||||
|
@ -23,8 +23,14 @@
|
||||
</p>
|
||||
|
||||
|
||||
<div class="d-flex justify-content-between align-items-center">
|
||||
<div class="d-flex align-items-center">
|
||||
<div class="btn-group">
|
||||
<a routerLink="/search" [queryParams]="{'more_like': document.id}" class="btn btn-sm btn-outline-secondary" *ngIf="moreLikeThis">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-three-dots" viewBox="0 0 16 16">
|
||||
<path fill-rule="evenodd" d="M3 9.5a1.5 1.5 0 1 1 0-3 1.5 1.5 0 0 1 0 3zm5 0a1.5 1.5 0 1 1 0-3 1.5 1.5 0 0 1 0 3zm5 0a1.5 1.5 0 1 1 0-3 1.5 1.5 0 0 1 0 3z"/>
|
||||
</svg>
|
||||
More like this
|
||||
</a>
|
||||
<a routerLink="/documents/{{document.id}}" class="btn btn-sm btn-outline-secondary">
|
||||
<svg width="1em" height="1em" viewBox="0 0 16 16" class="bi bi-pencil" fill="currentColor" xmlns="http://www.w3.org/2000/svg">
|
||||
<path fill-rule="evenodd" d="M12.146.146a.5.5 0 0 1 .708 0l3 3a.5.5 0 0 1 0 .708l-10 10a.5.5 0 0 1-.168.11l-5 2a.5.5 0 0 1-.65-.65l2-5a.5.5 0 0 1 .11-.168l10-10zM11.207 2.5L13.5 4.793 14.793 3.5 12.5 1.207 11.207 2.5zm1.586 3L10.5 3.207 4 9.707V10h.5a.5.5 0 0 1 .5.5v.5h.5a.5.5 0 0 1 .5.5v.5h.293l6.5-6.5zm-9.761 5.175l-.106.106-1.528 3.821 3.821-1.528.106-.106A.5.5 0 0 1 5 12.5V12h-.5a.5.5 0 0 1-.5-.5V11h-.5a.5.5 0 0 1-.468-.325z"/>
|
||||
@ -45,10 +51,16 @@
|
||||
</svg>
|
||||
Download
|
||||
</a>
|
||||
|
||||
</div>
|
||||
|
||||
<small class="text-muted ml-auto">Score:</small>
|
||||
|
||||
<ngb-progressbar *ngIf="searchScore" [type]="searchScoreClass" [value]="searchScore" class="search-score-bar mx-2" [max]="1"></ngb-progressbar>
|
||||
|
||||
<small class="text-muted">Created: {{document.created | date}}</small>
|
||||
</div>
|
||||
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
@ -9,4 +9,10 @@
|
||||
height: 100%;
|
||||
position: absolute;
|
||||
|
||||
}
|
||||
|
||||
.search-score-bar {
|
||||
width: 100px;
|
||||
height: 5px;
|
||||
margin-top: 2px;
|
||||
}
|
@ -12,6 +12,9 @@ export class DocumentCardLargeComponent implements OnInit {
|
||||
|
||||
constructor(private documentService: DocumentService, private sanitizer: DomSanitizer) { }
|
||||
|
||||
@Input()
|
||||
moreLikeThis: boolean = false
|
||||
|
||||
@Input()
|
||||
document: PaperlessDocument
|
||||
|
||||
@ -24,6 +27,19 @@ export class DocumentCardLargeComponent implements OnInit {
|
||||
@Output()
|
||||
clickCorrespondent = new EventEmitter<number>()
|
||||
|
||||
@Input()
|
||||
searchScore: number
|
||||
|
||||
get searchScoreClass() {
|
||||
if (this.searchScore > 0.7) {
|
||||
return "success"
|
||||
} else if (this.searchScore > 0.3) {
|
||||
return "warning"
|
||||
} else {
|
||||
return "danger"
|
||||
}
|
||||
}
|
||||
|
||||
ngOnInit(): void {
|
||||
}
|
||||
|
||||
|
@ -1,3 +1,3 @@
|
||||
... <span *ngFor="let fragment of highlights">
|
||||
<span *ngFor="let token of fragment" [ngClass]="token.term != null ? 'match term'+ token.term : ''">{{token.text}}</span> ...
|
||||
<span *ngFor="let token of fragment" [class.match]="token.highlight">{{token.text}}</span> ...
|
||||
</span>
|
@ -1,4 +1,4 @@
|
||||
.match {
|
||||
color: black;
|
||||
background-color: orange;
|
||||
background-color: rgb(255, 211, 66);
|
||||
}
|
@ -3,7 +3,12 @@
|
||||
|
||||
<div *ngIf="errorMessage" class="alert alert-danger">Invalid search query: {{errorMessage}}</div>
|
||||
|
||||
<p>
|
||||
<p *ngIf="more_like">
|
||||
Showing documents similar to
|
||||
<a routerLink="/documents/{{more_like}}">{{more_like_doc?.original_file_name}}</a>
|
||||
</p>
|
||||
|
||||
<p *ngIf="query">
|
||||
Search string: <i>{{query}}</i>
|
||||
<ng-container *ngIf="correctedQuery">
|
||||
- Did you mean "<a [routerLink]="" (click)="searchCorrectedQuery()">{{correctedQuery}}</a>"?
|
||||
@ -15,7 +20,9 @@
|
||||
<p>{{resultCount}} result(s)</p>
|
||||
<app-document-card-large *ngFor="let result of results"
|
||||
[document]="result.document"
|
||||
[details]="result.highlights">
|
||||
[details]="result.highlights"
|
||||
[searchScore]="result.score / maxScore"
|
||||
[moreLikeThis]="true">
|
||||
|
||||
</app-document-card-large>
|
||||
</div>
|
||||
|
@ -1,6 +1,9 @@
|
||||
import { Component, OnInit } from '@angular/core';
|
||||
import { ActivatedRoute, Router } from '@angular/router';
|
||||
import { PaperlessDocument } from 'src/app/data/paperless-document';
|
||||
import { PaperlessDocumentType } from 'src/app/data/paperless-document-type';
|
||||
import { SearchHit } from 'src/app/data/search-result';
|
||||
import { DocumentService } from 'src/app/services/rest/document.service';
|
||||
import { SearchService } from 'src/app/services/rest/search.service';
|
||||
|
||||
@Component({
|
||||
@ -14,6 +17,10 @@ export class SearchComponent implements OnInit {
|
||||
|
||||
query: string = ""
|
||||
|
||||
more_like: number
|
||||
|
||||
more_like_doc: PaperlessDocument
|
||||
|
||||
searching = false
|
||||
|
||||
currentPage = 1
|
||||
@ -26,11 +33,24 @@ export class SearchComponent implements OnInit {
|
||||
|
||||
errorMessage: string
|
||||
|
||||
constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router) { }
|
||||
get maxScore() {
|
||||
return this.results?.length > 0 ? this.results[0].score : 100
|
||||
}
|
||||
|
||||
constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router, private documentService: DocumentService) { }
|
||||
|
||||
ngOnInit(): void {
|
||||
this.route.queryParamMap.subscribe(paramMap => {
|
||||
window.scrollTo(0, 0)
|
||||
this.query = paramMap.get('query')
|
||||
this.more_like = paramMap.has('more_like') ? +paramMap.get('more_like') : null
|
||||
if (this.more_like) {
|
||||
this.documentService.get(this.more_like).subscribe(r => {
|
||||
this.more_like_doc = r
|
||||
})
|
||||
} else {
|
||||
this.more_like_doc = null
|
||||
}
|
||||
this.searching = true
|
||||
this.currentPage = 1
|
||||
this.loadPage()
|
||||
@ -39,13 +59,14 @@ export class SearchComponent implements OnInit {
|
||||
}
|
||||
|
||||
searchCorrectedQuery() {
|
||||
this.router.navigate(["search"], {queryParams: {query: this.correctedQuery}})
|
||||
this.router.navigate(["search"], {queryParams: {query: this.correctedQuery, more_like: this.more_like}})
|
||||
}
|
||||
|
||||
loadPage(append: boolean = false) {
|
||||
this.errorMessage = null
|
||||
this.correctedQuery = null
|
||||
this.searchService.search(this.query, this.currentPage).subscribe(result => {
|
||||
|
||||
this.searchService.search(this.query, this.currentPage, this.more_like).subscribe(result => {
|
||||
if (append) {
|
||||
this.results.push(...result.results)
|
||||
} else {
|
||||
|
@ -15,11 +15,17 @@ export class SearchService {
|
||||
|
||||
constructor(private http: HttpClient, private documentService: DocumentService) { }
|
||||
|
||||
search(query: string, page?: number): Observable<SearchResult> {
|
||||
let httpParams = new HttpParams().set('query', query)
|
||||
search(query: string, page?: number, more_like?: number): Observable<SearchResult> {
|
||||
let httpParams = new HttpParams()
|
||||
if (query) {
|
||||
httpParams = httpParams.set('query', query)
|
||||
}
|
||||
if (page) {
|
||||
httpParams = httpParams.set('page', page.toString())
|
||||
}
|
||||
if (more_like) {
|
||||
httpParams = httpParams.set('more_like', more_like.toString())
|
||||
}
|
||||
return this.http.get<SearchResult>(`${environment.apiBaseUrl}search/`, {params: httpParams}).pipe(
|
||||
map(result => {
|
||||
result.results.forEach(hit => this.documentService.addObservablesToDocument(hit.document))
|
||||
|
@ -3,7 +3,7 @@ import os
|
||||
from contextlib import contextmanager
|
||||
|
||||
from django.conf import settings
|
||||
from whoosh import highlight
|
||||
from whoosh import highlight, classify, query
|
||||
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
|
||||
from whoosh.highlight import Formatter, get_text
|
||||
from whoosh.index import create_in, exists_in, open_dir
|
||||
@ -20,32 +20,37 @@ class JsonFormatter(Formatter):
|
||||
self.seen = {}
|
||||
|
||||
def format_token(self, text, token, replace=False):
|
||||
seen = self.seen
|
||||
ttext = self._text(get_text(text, token, replace))
|
||||
if ttext in seen:
|
||||
termnum = seen[ttext]
|
||||
else:
|
||||
termnum = len(seen)
|
||||
seen[ttext] = termnum
|
||||
|
||||
return {'text': ttext, 'term': termnum}
|
||||
return {'text': ttext, 'highlight': 'true'}
|
||||
|
||||
def format_fragment(self, fragment, replace=False):
|
||||
output = []
|
||||
index = fragment.startchar
|
||||
text = fragment.text
|
||||
|
||||
amend_token = None
|
||||
for t in fragment.matches:
|
||||
if t.startchar is None:
|
||||
continue
|
||||
if t.startchar < index:
|
||||
continue
|
||||
if t.startchar > index:
|
||||
output.append({'text': text[index:t.startchar]})
|
||||
output.append(self.format_token(text, t, replace))
|
||||
text_inbetween = text[index:t.startchar]
|
||||
if amend_token and t.startchar - index < 10:
|
||||
amend_token['text'] += text_inbetween
|
||||
else:
|
||||
output.append({'text': text_inbetween,
|
||||
'highlight': False})
|
||||
amend_token = None
|
||||
token = self.format_token(text, t, replace)
|
||||
if amend_token:
|
||||
amend_token['text'] += token['text']
|
||||
else:
|
||||
output.append(token)
|
||||
amend_token = token
|
||||
index = t.endchar
|
||||
if index < fragment.endchar:
|
||||
output.append({'text': text[index:fragment.endchar]})
|
||||
output.append({'text': text[index:fragment.endchar],
|
||||
'highlight': False})
|
||||
return output
|
||||
|
||||
def format(self, fragments, replace=False):
|
||||
@ -120,22 +125,42 @@ def remove_document_from_index(document):
|
||||
|
||||
|
||||
@contextmanager
|
||||
def query_page(ix, querystring, page):
|
||||
def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
|
||||
searcher = ix.searcher()
|
||||
try:
|
||||
qp = MultifieldParser(
|
||||
["content", "title", "correspondent", "tag", "type"],
|
||||
ix.schema)
|
||||
qp.add_plugin(DateParserPlugin())
|
||||
if querystring:
|
||||
qp = MultifieldParser(
|
||||
["content", "title", "correspondent", "tag", "type"],
|
||||
ix.schema)
|
||||
qp.add_plugin(DateParserPlugin())
|
||||
str_q = qp.parse(querystring)
|
||||
corrected = searcher.correct_query(str_q, querystring)
|
||||
else:
|
||||
str_q = None
|
||||
corrected = None
|
||||
|
||||
if more_like_doc_id:
|
||||
docnum = searcher.document_number(id=more_like_doc_id)
|
||||
kts = searcher.key_terms_from_text(
|
||||
'content', more_like_doc_content, numterms=20,
|
||||
model=classify.Bo1Model, normalize=False)
|
||||
more_like_q = query.Or(
|
||||
[query.Term('content', word, boost=weight)
|
||||
for word, weight in kts])
|
||||
result_page = searcher.search_page(
|
||||
more_like_q, page, filter=str_q, mask={docnum})
|
||||
elif str_q:
|
||||
result_page = searcher.search_page(str_q, page)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Either querystring or more_like_doc_id is required."
|
||||
)
|
||||
|
||||
q = qp.parse(querystring)
|
||||
result_page = searcher.search_page(q, page)
|
||||
result_page.results.fragmenter = highlight.ContextFragmenter(
|
||||
surround=50)
|
||||
result_page.results.formatter = JsonFormatter()
|
||||
|
||||
corrected = searcher.correct_query(q, querystring)
|
||||
if corrected.query != q:
|
||||
if corrected and corrected.query != str_q:
|
||||
corrected_query = corrected.string
|
||||
else:
|
||||
corrected_query = None
|
||||
|
@ -351,6 +351,25 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
|
||||
|
||||
self.assertEqual(correction, None)
|
||||
|
||||
def test_search_more_like(self):
|
||||
d1=Document.objects.create(title="invoice", content="the thing i bought at a shop and paid with bank account", checksum="A", pk=1)
|
||||
d2=Document.objects.create(title="bank statement 1", content="things i paid for in august", pk=2, checksum="B")
|
||||
d3=Document.objects.create(title="bank statement 3", content="things i paid for in september", pk=3, checksum="C")
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
|
||||
response = self.client.get(f"/api/search/?more_like={d2.id}")
|
||||
|
||||
self.assertEqual(response.status_code, 200)
|
||||
|
||||
results = response.data['results']
|
||||
|
||||
self.assertEqual(len(results), 2)
|
||||
self.assertEqual(results[0]['id'], d3.id)
|
||||
self.assertEqual(results[1]['id'], d1.id)
|
||||
|
||||
def test_statistics(self):
|
||||
|
||||
doc1 = Document.objects.create(title="none1", checksum="A")
|
||||
|
@ -340,14 +340,27 @@ class SearchView(APIView):
|
||||
}
|
||||
|
||||
def get(self, request, format=None):
|
||||
if 'query' not in request.query_params:
|
||||
|
||||
if 'query' in request.query_params:
|
||||
query = request.query_params['query']
|
||||
else:
|
||||
query = None
|
||||
|
||||
if 'more_like' in request.query_params:
|
||||
more_like_id = request.query_params['more_like']
|
||||
more_like_content = Document.objects.get(id=more_like_id).content
|
||||
else:
|
||||
more_like_id = None
|
||||
more_like_content = None
|
||||
|
||||
if not query and not more_like_id:
|
||||
return Response({
|
||||
'count': 0,
|
||||
'page': 0,
|
||||
'page_count': 0,
|
||||
'corrected_query': None,
|
||||
'results': []})
|
||||
|
||||
query = request.query_params['query']
|
||||
try:
|
||||
page = int(request.query_params.get('page', 1))
|
||||
except (ValueError, TypeError):
|
||||
@ -357,8 +370,7 @@ class SearchView(APIView):
|
||||
page = 1
|
||||
|
||||
try:
|
||||
with index.query_page(self.ix, query, page) as (result_page,
|
||||
corrected_query):
|
||||
with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501
|
||||
return Response(
|
||||
{'count': len(result_page),
|
||||
'page': result_page.pagenum,
|
||||
|
Loading…
x
Reference in New Issue
Block a user