diff --git a/docs/api.rst b/docs/api.rst
index d352758fa..cff72a970 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -221,21 +221,16 @@ Each fragment contains a list of strings, and some of them are marked as a highl
[
[
- {"text": "This is a sample text with a "},
- {"text": "highlighted", "term": 0},
- {"text": " word."}
+ {"text": "This is a sample text with a ", "highlight": false},
+ {"text": "highlighted", "highlight": true},
+ {"text": " word.", "highlight": false}
],
[
- {"text": "Another", "term": 1},
- {"text": " fragment with a highlight."}
+ {"text": "Another", "highlight": true},
+ {"text": " fragment with a highlight.", "highlight": false}
]
]
-
-
-When ``term`` is present within a string, the word within ``text`` should be highlighted.
-The term index groups multiple matches together and words with the same index
-should get identical highlighting.
A client may use this example to produce the following output:
... This is a sample text with a **highlighted** word. ... **Another** fragment with a highlight. ...
diff --git a/src-ui/src/app/components/document-detail/document-detail.component.html b/src-ui/src/app/components/document-detail/document-detail.component.html
index d47c07de1..228264378 100644
--- a/src-ui/src/app/components/document-detail/document-detail.component.html
+++ b/src-ui/src/app/components/document-detail/document-detail.component.html
@@ -34,6 +34,12 @@
+
+
+
+
+ More like this
+
diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts
index 75a64f548..d705c3176 100644
--- a/src-ui/src/app/components/document-detail/document-detail.component.ts
+++ b/src-ui/src/app/components/document-detail/document-detail.component.ts
@@ -172,6 +172,10 @@ export class DocumentDetailComponent implements OnInit {
}
+ moreLike() { // Navigates to the search view, asking for documents similar to the one currently shown.
+ this.router.navigate(["search"], {queryParams: {more_like:this.document.id}}) // the id travels as the `more_like` query parameter
+ }
+
hasNext() {
return this.documentListViewService.hasNext(this.documentId)
}
diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
index c2645db5e..5bf0c9af2 100644
--- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
+++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
@@ -23,8 +23,14 @@
- ()
+ @Input()
+ searchScore: number
+
+ get searchScoreClass() { // Buckets searchScore into "success" (> 0.7), "warning" (> 0.3) or "danger" (everything else).
+ if (this.searchScore > 0.7) {
+ return "success"
+ } else if (this.searchScore > 0.3) {
+ return "warning"
+ } else { // also reached when searchScore is undefined or NaN, because both comparisons above are then false
+ return "danger"
+ }
+ }
+
ngOnInit(): void {
}
diff --git a/src-ui/src/app/components/search/result-highlight/result-highlight.component.html b/src-ui/src/app/components/search/result-highlight/result-highlight.component.html
index 1842f5cea..5dc5baa94 100644
--- a/src-ui/src/app/components/search/result-highlight/result-highlight.component.html
+++ b/src-ui/src/app/components/search/result-highlight/result-highlight.component.html
@@ -1,3 +1,3 @@
...
- {{token.text}} ...
+ {{token.text}} ...
\ No newline at end of file
diff --git a/src-ui/src/app/components/search/result-highlight/result-highlight.component.scss b/src-ui/src/app/components/search/result-highlight/result-highlight.component.scss
index 645fb0426..e04dd13b2 100644
--- a/src-ui/src/app/components/search/result-highlight/result-highlight.component.scss
+++ b/src-ui/src/app/components/search/result-highlight/result-highlight.component.scss
@@ -1,4 +1,4 @@
.match {
color: black;
- background-color: orange;
+ background-color: rgb(255, 211, 66);
}
\ No newline at end of file
diff --git a/src-ui/src/app/components/search/search.component.html b/src-ui/src/app/components/search/search.component.html
index 55fcee900..de6f0133f 100644
--- a/src-ui/src/app/components/search/search.component.html
+++ b/src-ui/src/app/components/search/search.component.html
@@ -3,7 +3,12 @@
Invalid search query: {{errorMessage}}
-
+
+ Showing documents similar to
+ {{more_like_doc?.original_file_name}}
+
+
+
Search string: {{query}}
- Did you mean "{{correctedQuery}} "?
@@ -15,7 +20,9 @@
{{resultCount}} result(s)
+ [details]="result.highlights"
+ [searchScore]="result.score / maxScore"
+ [moreLikeThis]="true">
diff --git a/src-ui/src/app/components/search/search.component.ts b/src-ui/src/app/components/search/search.component.ts
index de8b4652f..4570ac3fa 100644
--- a/src-ui/src/app/components/search/search.component.ts
+++ b/src-ui/src/app/components/search/search.component.ts
@@ -1,6 +1,9 @@
import { Component, OnInit } from '@angular/core';
import { ActivatedRoute, Router } from '@angular/router';
+import { PaperlessDocument } from 'src/app/data/paperless-document';
+import { PaperlessDocumentType } from 'src/app/data/paperless-document-type';
import { SearchHit } from 'src/app/data/search-result';
+import { DocumentService } from 'src/app/services/rest/document.service';
import { SearchService } from 'src/app/services/rest/search.service';
@Component({
@@ -14,6 +17,10 @@ export class SearchComponent implements OnInit {
query: string = ""
+ more_like: number
+
+ more_like_doc: PaperlessDocument
+
searching = false
currentPage = 1
@@ -26,11 +33,24 @@ export class SearchComponent implements OnInit {
errorMessage: string
- constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router) { }
+ get maxScore() { // Divisor the template uses to turn each result's score into a relative value.
+ return this.results?.length > 0 ? this.results[0].score : 100 // score of the first hit, or 100 when there are no results; NOTE(review): presumably results arrive sorted by descending score — confirm
+ }
+
+ constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router, private documentService: DocumentService) { }
ngOnInit(): void {
this.route.queryParamMap.subscribe(paramMap => {
+ window.scrollTo(0, 0)
this.query = paramMap.get('query')
+ this.more_like = paramMap.has('more_like') ? +paramMap.get('more_like') : null
+ if (this.more_like) {
+ this.documentService.get(this.more_like).subscribe(r => {
+ this.more_like_doc = r
+ })
+ } else {
+ this.more_like_doc = null
+ }
this.searching = true
this.currentPage = 1
this.loadPage()
@@ -39,13 +59,14 @@ export class SearchComponent implements OnInit {
}
searchCorrectedQuery() {
- this.router.navigate(["search"], {queryParams: {query: this.correctedQuery}})
+ this.router.navigate(["search"], {queryParams: {query: this.correctedQuery, more_like: this.more_like}}) // re-run the search with the corrected query while passing along the current more_like value
}
loadPage(append: boolean = false) {
this.errorMessage = null
this.correctedQuery = null
- this.searchService.search(this.query, this.currentPage).subscribe(result => {
+
+ this.searchService.search(this.query, this.currentPage, this.more_like).subscribe(result => {
if (append) {
this.results.push(...result.results)
} else {
diff --git a/src-ui/src/app/services/rest/search.service.ts b/src-ui/src/app/services/rest/search.service.ts
index b19a55769..3799f3dc7 100644
--- a/src-ui/src/app/services/rest/search.service.ts
+++ b/src-ui/src/app/services/rest/search.service.ts
@@ -15,11 +15,17 @@ export class SearchService {
constructor(private http: HttpClient, private documentService: DocumentService) { }
- search(query: string, page?: number): Observable {
- let httpParams = new HttpParams().set('query', query)
+ search(query: string, page?: number, more_like?: number): Observable {
+ let httpParams = new HttpParams()
+ if (query) {
+ httpParams = httpParams.set('query', query)
+ }
if (page) {
httpParams = httpParams.set('page', page.toString())
}
+ if (more_like) {
+ httpParams = httpParams.set('more_like', more_like.toString())
+ }
return this.http.get(`${environment.apiBaseUrl}search/`, {params: httpParams}).pipe(
map(result => {
result.results.forEach(hit => this.documentService.addObservablesToDocument(hit.document))
diff --git a/src/documents/index.py b/src/documents/index.py
index 53bf34542..308ee932e 100644
--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -3,7 +3,7 @@ import os
from contextlib import contextmanager
from django.conf import settings
-from whoosh import highlight
+from whoosh import highlight, classify, query
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir
@@ -20,32 +20,37 @@ class JsonFormatter(Formatter):
self.seen = {}
def format_token(self, text, token, replace=False):
- seen = self.seen
ttext = self._text(get_text(text, token, replace))
- if ttext in seen:
- termnum = seen[ttext]
- else:
- termnum = len(seen)
- seen[ttext] = termnum
-
- return {'text': ttext, 'term': termnum}
+ return {'text': ttext, 'highlight': 'true'}
def format_fragment(self, fragment, replace=False):
output = []
index = fragment.startchar
text = fragment.text
-
+ amend_token = None  # highlighted entry that following nearby matches get merged into, if any
for t in fragment.matches:
if t.startchar is None:
continue
if t.startchar < index:
continue
if t.startchar > index:
- output.append({'text': text[index:t.startchar]})
- output.append(self.format_token(text, t, replace))
+ text_inbetween = text[index:t.startchar]
+ if amend_token and t.startchar - index < 10:  # gap shorter than 10 characters: fold it into the previous highlight
+ amend_token['text'] += text_inbetween
+ else:
+ output.append({'text': text_inbetween,
+ 'highlight': False})
+ amend_token = None  # gap was emitted as plain text, so the next match starts a fresh highlight
+ token = self.format_token(text, t, replace)
+ if amend_token:
+ amend_token['text'] += token['text']  # extend the existing highlight instead of appending a new entry
+ else:
+ output.append(token)
+ amend_token = token
index = t.endchar
if index < fragment.endchar:
- output.append({'text': text[index:fragment.endchar]})
+ output.append({'text': text[index:fragment.endchar],
+ 'highlight': False})
return output
def format(self, fragments, replace=False):
@@ -120,22 +125,42 @@ def remove_document_from_index(document):
@contextmanager
-def query_page(ix, querystring, page):
+def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
searcher = ix.searcher()
try:
- qp = MultifieldParser(
- ["content", "title", "correspondent", "tag", "type"],
- ix.schema)
- qp.add_plugin(DateParserPlugin())
+ if querystring:
+ qp = MultifieldParser(
+ ["content", "title", "correspondent", "tag", "type"],
+ ix.schema)
+ qp.add_plugin(DateParserPlugin())
+ str_q = qp.parse(querystring)
+ corrected = searcher.correct_query(str_q, querystring)
+ else:
+ str_q = None
+ corrected = None
+
+ if more_like_doc_id:
+ docnum = searcher.document_number(id=more_like_doc_id)
+ kts = searcher.key_terms_from_text(
+ 'content', more_like_doc_content, numterms=20,
+ model=classify.Bo1Model, normalize=False)
+ more_like_q = query.Or(
+ [query.Term('content', word, boost=weight)
+ for word, weight in kts])
+ result_page = searcher.search_page(
+ more_like_q, page, filter=str_q, mask={docnum})
+ elif str_q:
+ result_page = searcher.search_page(str_q, page)
+ else:
+ raise ValueError(
+ "Either querystring or more_like_doc_id is required."
+ )
- q = qp.parse(querystring)
- result_page = searcher.search_page(q, page)
result_page.results.fragmenter = highlight.ContextFragmenter(
surround=50)
result_page.results.formatter = JsonFormatter()
- corrected = searcher.correct_query(q, querystring)
- if corrected.query != q:
+ if corrected and corrected.query != str_q:
corrected_query = corrected.string
else:
corrected_query = None
diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py
index 49dddee87..ba1ab45ca 100644
--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@@ -351,6 +351,25 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(correction, None)
+ def test_search_more_like(self):  # a more_like search must return the other indexed documents in the expected order
+ d1=Document.objects.create(title="invoice", content="the thing i bought at a shop and paid with bank account", checksum="A", pk=1)
+ d2=Document.objects.create(title="bank statement 1", content="things i paid for in august", pk=2, checksum="B")
+ d3=Document.objects.create(title="bank statement 3", content="things i paid for in september", pk=3, checksum="C")
+ with AsyncWriter(index.open_index()) as writer:
+ index.update_document(writer, d1)
+ index.update_document(writer, d2)
+ index.update_document(writer, d3)
+
+ response = self.client.get(f"/api/search/?more_like={d2.id}")  # no query string, only the reference document id
+
+ self.assertEqual(response.status_code, 200)
+
+ results = response.data['results']
+
+ self.assertEqual(len(results), 2)  # the reference document d2 itself is not among the hits
+ self.assertEqual(results[0]['id'], d3.id)  # d3 is expected to rank ahead of d1
+ self.assertEqual(results[1]['id'], d1.id)
+
+
def test_statistics(self):
doc1 = Document.objects.create(title="none1", checksum="A")
diff --git a/src/documents/views.py b/src/documents/views.py
index f90e9f7bc..54d0de3f6 100755
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -340,14 +340,27 @@ class SearchView(APIView):
}
def get(self, request, format=None):
- if 'query' not in request.query_params:
+
+ if 'query' in request.query_params:
+ query = request.query_params['query']
+ else:
+ query = None
+
+ if 'more_like' in request.query_params:
+ more_like_id = request.query_params['more_like']
+ more_like_content = Document.objects.get(id=more_like_id).content
+ else:
+ more_like_id = None
+ more_like_content = None
+
+ if not query and not more_like_id:
return Response({
'count': 0,
'page': 0,
'page_count': 0,
+ 'corrected_query': None,
'results': []})
- query = request.query_params['query']
try:
page = int(request.query_params.get('page', 1))
except (ValueError, TypeError):
@@ -357,8 +370,7 @@ class SearchView(APIView):
page = 1
try:
- with index.query_page(self.ix, query, page) as (result_page,
- corrected_query):
+ with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501
return Response(
{'count': len(result_page),
'page': result_page.pagenum,