Merge branch 'dev' into feature-ocrmypdf

This commit is contained in:
jonaswinkler 2020-11-30 16:48:09 +01:00
commit aaa6599283
25 changed files with 301 additions and 75 deletions

View File

@ -15,7 +15,7 @@ services:
POSTGRES_PASSWORD: paperless
webserver:
image: jonaswinkler/paperless-ng:0.9.3
image: jonaswinkler/paperless-ng:0.9.4
restart: always
depends_on:
- db

View File

@ -5,7 +5,7 @@ services:
restart: always
webserver:
image: jonaswinkler/paperless-ng:0.9.3
image: jonaswinkler/paperless-ng:0.9.4
restart: always
depends_on:
- broker

View File

@ -274,6 +274,7 @@ management command:
This command takes no arguments.
.. _`administration-index`:
Managing the document search index
==================================

View File

@ -8,12 +8,31 @@ Changelog
paperless-ng 0.9.4
##################
* Front end: Clickable tags, correspondents and types allow quick filtering for related documents.
* Front end: Saved views are now editable.
* Front end: Preview documents directly in the browser.
* Searching:
* Paperless now supports searching by tags, types and dates. In order to have this applied to your
existing documents, you need to perform a ``document_index reindex`` management command
(see :ref:`administration-index`)
that adds the new data to the search index. You only need to do this once, so that paperless can find
your documents by tags,types and dates. Paperless keeps the index updated after that whenever
something changes.
* Paperless now has spelling corrections ("Did you mean") for misstyped queries.
* The documentation contains :ref:`information about the query syntax <basic-searching>`.
* Front end:
* Clickable tags, correspondents and types allow quick filtering for related documents.
* Saved views are now editable.
* Preview documents directly in the browser.
* Navigation from the dashboard to saved views.
* Fixes:
* A severe error when trying to use post consume scripts.
* The documentation now contains information about bare metal installs.
* An error in the consumer that cause invalid messages of missing files to show up in the log.
* The documentation now contains information about bare metal installs and a section about
how to setup the development environment.
paperless-ng 0.9.3
##################

View File

@ -156,6 +156,62 @@ REST API
You can also submit a document using the REST API, see :ref:`api-file_uploads` for details.
.. _basic-searching:
Searching
#########
Paperless offers an extensive searching mechanism that is designed to allow you to quickly
find a document you're looking for (for example, that thing that just broke and you bought
a couple months ago, that contract you signed 8 years ago).
When you search paperless for a document, it tries to match this query against your documents.
Paperless will look for matching documents by inspecting their content, title, correspondent,
type and tags. Paperless returns a scored list of results, so that documents matching your query
better will appear further up in the search results.
By default, paperless returns only documents which contain all words typed in the search bar.
However, paperless also offers advanced search syntax if you want to drill down the results
further.
Matching documents with logical expressions:
.. code:: none
shopname AND (product1 OR product2)
Matching specific tags, correspondents or types:
.. code:: none
type:invoice tag:unpaid
correspondent:university certificate
Matching dates:
.. code:: none
created:[2005 to 2009]
added:yesterday
modified:today
Matching inexact words:
.. code:: none
produ*name
.. note::
Inexact terms are hard for search indexes. These queries might take a while to execute. That's why paperless offers
auto complete and query correction.
All of these constructs can be combined as you see fit.
If you want to learn more about the query language used by paperless, paperless uses Whoosh's default query language.
Head over to `Whoosh query language <https://whoosh.readthedocs.io/en/latest/querylang.html>`_.
For details on what date parsing utilities are available, see
`Date parsing <https://whoosh.readthedocs.io/en/latest/dates.html#parsing-date-queries>`_.
.. _usage-recommended_workflow:

View File

@ -1,6 +1,9 @@
<app-widget-frame [title]="savedView.title">
<table class="table table-sm table-hover table-borderless">
<a header-buttons [routerLink]="" (click)="showAll()">Show all</a>
<table content class="table table-sm table-hover table-borderless">
<thead>
<tr>
<th>Created</th>

View File

@ -1,6 +1,8 @@
import { Component, Input, OnInit } from '@angular/core';
import { Router } from '@angular/router';
import { PaperlessDocument } from 'src/app/data/paperless-document';
import { SavedViewConfig } from 'src/app/data/saved-view-config';
import { DocumentListViewService } from 'src/app/services/document-list-view.service';
import { DocumentService } from 'src/app/services/rest/document.service';
@Component({
@ -10,7 +12,10 @@ import { DocumentService } from 'src/app/services/rest/document.service';
})
export class SavedViewWidgetComponent implements OnInit {
constructor(private documentService: DocumentService) { }
constructor(
private documentService: DocumentService,
private router: Router,
private list: DocumentListViewService) { }
@Input()
savedView: SavedViewConfig
@ -23,4 +28,9 @@ export class SavedViewWidgetComponent implements OnInit {
})
}
showAll() {
this.list.load(this.savedView)
this.router.navigate(["documents"])
}
}

View File

@ -1,4 +1,6 @@
<app-widget-frame title="Statistics">
<p class="card-text">Documents in inbox: {{statistics.documents_inbox}}</p>
<p class="card-text">Total documents: {{statistics.documents_total}}</p>
<ng-container content>
<p class="card-text">Documents in inbox: {{statistics.documents_inbox}}</p>
<p class="card-text">Total documents: {{statistics.documents_total}}</p>
</ng-container>
</app-widget-frame>

View File

@ -1,6 +1,6 @@
<app-widget-frame title="Upload new documents">
<form>
<form content>
<ngx-file-drop
dropZoneLabel="Drop documents here or" (onFileDrop)="dropped($event)"
(onFileOver)="fileOver($event)" (onFileLeave)="fileLeave($event)"

View File

@ -1,8 +1,12 @@
<div class="card mb-3 shadow">
<div class="card-header">
<h5 class="card-title mb-0">{{title}}</h5>
<div class="d-flex justify-content-between align-items-center">
<h5 class="card-title mb-0">{{title}}</h5>
<ng-content select ="[header-buttons]"></ng-content>
</div>
</div>
<div class="card-body text-dark">
<ng-content></ng-content>
<ng-content select ="[content]"></ng-content>
</div>
</div>

View File

@ -9,9 +9,11 @@
<div class="d-flex justify-content-between align-items-center">
<h5 class="card-title">
<ng-container *ngIf="document.correspondent">
<a [routerLink]="" title="Filter by correspondent" (click)="clickCorrespondent.emit(document.correspondent)" class="font-weight-bold">{{document.correspondent.name}}</a>:
<a *ngIf="clickCorrespondent.observers.length ; else nolink" [routerLink]="" title="Filter by correspondent" (click)="clickCorrespondent.emit(document.correspondent)" class="font-weight-bold">{{document.correspondent.name}}</a>
<ng-template #nolink>{{document.correspondent.name}}</ng-template>:
</ng-container>
{{document.title}}<app-tag [tag]="t" linkTitle="Filter by tag" *ngFor="let t of document.tags" class="ml-1" (click)="clickTag.emit(t)" [clickable]="true"></app-tag>
{{document.title}}
<app-tag [tag]="t" linkTitle="Filter by tag" *ngFor="let t of document.tags" class="ml-1" (click)="clickTag.emit(t)" [clickable]="clickTag.observers.length"></app-tag>
</h5>
<h5 class="card-title" *ngIf="document.archive_serial_number">#{{document.archive_serial_number}}</h5>
</div>

View File

@ -1,13 +1,21 @@
<app-page-header title="Search results">
</app-page-header>
<p>Search string: <i>{{query}}</i></p>
<div *ngIf="errorMessage" class="alert alert-danger">Invalid search query: {{errorMessage}}</div>
<div [class.result-content-searching]="searching" infiniteScroll (scrolled)="onScroll()">
<p>
Search string: <i>{{query}}</i>
<ng-container *ngIf="correctedQuery">
- Did you mean "<a [routerLink]="" (click)="searchCorrectedQuery()">{{correctedQuery}}</a>"?
</ng-container>
</p>
<div *ngIf="!errorMessage" [class.result-content-searching]="searching" infiniteScroll (scrolled)="onScroll()">
<p>{{resultCount}} result(s)</p>
<app-document-card-large *ngFor="let result of results"
[document]="result.document"
[details]="result.highlights">
</app-document-card-large>
</div>
</div>

View File

@ -1,5 +1,5 @@
import { Component, OnInit } from '@angular/core';
import { ActivatedRoute } from '@angular/router';
import { ActivatedRoute, Router } from '@angular/router';
import { SearchHit } from 'src/app/data/search-result';
import { SearchService } from 'src/app/services/rest/search.service';
@ -9,7 +9,7 @@ import { SearchService } from 'src/app/services/rest/search.service';
styleUrls: ['./search.component.scss']
})
export class SearchComponent implements OnInit {
results: SearchHit[] = []
query: string = ""
@ -22,7 +22,11 @@ export class SearchComponent implements OnInit {
resultCount
constructor(private searchService: SearchService, private route: ActivatedRoute) { }
correctedQuery: string = null
errorMessage: string
constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router) { }
ngOnInit(): void {
this.route.queryParamMap.subscribe(paramMap => {
@ -31,10 +35,16 @@ export class SearchComponent implements OnInit {
this.currentPage = 1
this.loadPage()
})
}
searchCorrectedQuery() {
this.router.navigate(["search"], {queryParams: {query: this.correctedQuery}})
}
loadPage(append: boolean = false) {
this.errorMessage = null
this.correctedQuery = null
this.searchService.search(this.query, this.currentPage).subscribe(result => {
if (append) {
this.results.push(...result.results)
@ -44,12 +54,17 @@ export class SearchComponent implements OnInit {
this.pageCount = result.page_count
this.searching = false
this.resultCount = result.count
this.correctedQuery = result.corrected_query
}, error => {
this.searching = false
this.resultCount = 1
this.pageCount = 1
this.results = []
this.errorMessage = error.error
})
}
onScroll() {
console.log(this.currentPage)
console.log(this.pageCount)
if (this.currentPage < this.pageCount) {
this.currentPage += 1
this.loadPage(true)

View File

@ -21,7 +21,9 @@ export interface SearchResult {
page?: number
page_count?: number
corrected_query?: string
results?: SearchHit[]
}
}

View File

@ -10,10 +10,11 @@ from django.db.models import Q
from django.utils import timezone
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory
from .file_handling import create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class_for_mime_type, parse_date
from .parsers import ParseError, get_parser_class_for_mime_type, \
get_supported_file_extensions, parse_date
from .signals import (
document_consumption_finished,
document_consumption_started
@ -40,6 +41,21 @@ class Consumer(LoggingMixin):
raise ConsumerError("Cannot consume {}: It is not a file".format(
self.path))
def pre_check_file_extension(self):
extensions = get_supported_file_extensions()
_, ext = os.path.splitext(self.filename)
if not ext:
raise ConsumerError(
f"Not consuming {self.filename}: File type unknown."
)
if ext not in extensions:
raise ConsumerError(
f"Not consuming {self.filename}: File extension {ext} does "
f"not map to any known file type ({str(extensions)})"
)
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
@ -82,6 +98,7 @@ class Consumer(LoggingMixin):
# Make sure that preconditions for consuming the file are met.
self.pre_check_file_exists()
self.pre_check_file_extension()
self.pre_check_directories()
self.pre_check_duplicate()

View File

@ -4,10 +4,11 @@ from contextlib import contextmanager
from django.conf import settings
from whoosh import highlight
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.writing import AsyncWriter
@ -59,14 +60,19 @@ def get_schema():
id=NUMERIC(stored=True, unique=True, numtype=int),
title=TEXT(stored=True),
content=TEXT(),
correspondent=TEXT(stored=True)
correspondent=TEXT(stored=True),
tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True),
type=TEXT(stored=True),
created=DATETIME(stored=True, sortable=True),
modified=DATETIME(stored=True, sortable=True),
added=DATETIME(stored=True, sortable=True),
)
def open_index(recreate=False):
try:
if exists_in(settings.INDEX_DIR) and not recreate:
return open_dir(settings.INDEX_DIR)
return open_dir(settings.INDEX_DIR, schema=get_schema())
except Exception as e:
logger.error(f"Error while opening the index: {e}, recreating.")
@ -77,11 +83,17 @@ def open_index(recreate=False):
def update_document(writer, doc):
logger.debug("Indexing {}...".format(doc))
tags = ",".join([t.name for t in doc.tags.all()])
writer.update_document(
id=doc.pk,
title=doc.title,
content=doc.content,
correspondent=doc.correspondent.name if doc.correspondent else None
correspondent=doc.correspondent.name if doc.correspondent else None,
tag=tags if tags else None,
type=doc.document_type.name if doc.document_type else None,
created=doc.created,
added=doc.added,
modified=doc.modified,
)
@ -103,16 +115,27 @@ def remove_document_from_index(document):
@contextmanager
def query_page(ix, query, page):
def query_page(ix, querystring, page):
searcher = ix.searcher()
try:
query_parser = MultifieldParser(["content", "title", "correspondent"],
ix.schema).parse(query)
result_page = searcher.search_page(query_parser, page)
qp = MultifieldParser(
["content", "title", "correspondent", "tag", "type"],
ix.schema)
qp.add_plugin(DateParserPlugin())
q = qp.parse(querystring)
result_page = searcher.search_page(q, page)
result_page.results.fragmenter = highlight.ContextFragmenter(
surround=50)
result_page.results.formatter = JsonFormatter()
yield result_page
corrected = searcher.correct_query(q, querystring)
if corrected.query != q:
corrected_query = corrected.string
else:
corrected_query = None
yield result_page, corrected_query
finally:
searcher.close()

View File

@ -1,7 +1,6 @@
# coding=utf-8
import logging
import mimetypes
import os
import re
from collections import OrderedDict
@ -12,6 +11,8 @@ from django.db import models
from django.utils import timezone
from django.utils.text import slugify
from documents.parsers import get_default_file_extension
class MatchingModel(models.Model):
@ -204,7 +205,7 @@ class Document(models.Model):
ordering = ("correspondent", "title")
def __str__(self):
created = self.created.strftime("%Y%m%d%H%M%S")
created = self.created.strftime("%Y%m%d")
if self.correspondent and self.title:
return "{}: {} - {}".format(
created, self.correspondent, self.title)
@ -255,8 +256,7 @@ class Document(models.Model):
@property
def file_type(self):
# TODO: this is not stable across python versions
return mimetypes.guess_extension(str(self.mime_type))
return get_default_file_extension(self.mime_type)
@property
def thumbnail_path(self):

View File

@ -1,4 +1,5 @@
import logging
import mimetypes
import os
import re
import shutil
@ -42,6 +43,29 @@ def is_mime_type_supported(mime_type):
return get_parser_class_for_mime_type(mime_type) is not None
def get_default_file_extension(mime_type):
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
supported_mime_types = parser_declaration["mime_types"]
if mime_type in supported_mime_types:
return supported_mime_types[mime_type]
return None
def get_supported_file_extensions():
extensions = set()
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
supported_mime_types = parser_declaration["mime_types"]
for mime_type in supported_mime_types:
extensions.update(mimetypes.guess_all_extensions(mime_type))
return extensions
def get_parser_class_for_mime_type(mime_type):
options = []

View File

@ -325,6 +325,22 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 10)
def test_search_spelling_correction(self):
with AsyncWriter(index.open_index()) as writer:
for i in range(55):
doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content=f"Things document {i+1}")
index.update_document(writer, doc)
response = self.client.get("/api/search/?query=thing")
correction = response.data['corrected_query']
self.assertEqual(correction, "things")
response = self.client.get("/api/search/?query=things")
correction = response.data['corrected_query']
self.assertEqual(correction, None)
def test_statistics(self):
doc1 = Document.objects.create(title="none1", checksum="A")

View File

@ -425,7 +425,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
m = patcher.start()
m.return_value = [(None, {
"parser": self.make_dummy_parser,
"mime_types": ["application/pdf"],
"mime_types": {"application/pdf": ".pdf"},
"weight": 0
})]
@ -551,7 +551,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertTrue(str(e).startswith("No parsers abvailable"))
self.assertTrue("File extension .pdf does not map to any" in str(e))
return
self.fail("Should throw exception")
@ -560,7 +560,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
def testFaultyParser(self, m):
m.return_value = [(None, {
"parser": self.make_faulty_parser,
"mime_types": ["application/pdf"],
"mime_types": {"application/pdf": ".pdf"},
"weight": 0
})]

View File

@ -6,7 +6,10 @@ from unittest import mock
from django.test import TestCase, override_settings
from documents.parsers import get_parser_class, DocumentParser
from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \
get_parser_class_for_mime_type, DocumentParser
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_text.parsers import TextDocumentParser
def fake_magic_from_file(file, mime=False):
@ -29,7 +32,7 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
(None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}),
(None, {"weight": 0, "parser": DummyParser, "mime_types": {"application/pdf": ".pdf"}}),
)
self.assertEqual(
@ -47,8 +50,8 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
(None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}),
(None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}),
(None, {"weight": 0, "parser": DummyParser1, "mime_types": {"application/pdf": ".pdf"}}),
(None, {"weight": 1, "parser": DummyParser2, "mime_types": {"application/pdf": ".pdf"}}),
)
self.assertEqual(
@ -96,3 +99,20 @@ class TestBaseParser(TestCase):
path = parser.get_optimised_thumbnail("any", "not important")
self.assertEqual(path, fake_get_thumbnail(None, None, None))
class TestParserAvailability(TestCase):
def test_file_extensions(self):
for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
self.assertIn(ext, get_supported_file_extensions())
self.assertEqual(get_default_file_extension('application/pdf'), ".pdf")
self.assertEqual(get_default_file_extension('image/png'), ".png")
self.assertEqual(get_default_file_extension('image/jpeg'), ".jpg")
self.assertEqual(get_default_file_extension('text/plain'), ".txt")
self.assertEqual(get_default_file_extension('text/csv'), ".csv")
self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), None)
self.assertEqual(get_parser_class_for_mime_type('application/pdf'), RasterisedDocumentParser)
self.assertEqual(get_parser_class_for_mime_type('text/plain'), TextDocumentParser)
self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None)

View File

@ -236,30 +236,34 @@ class SearchView(APIView):
}
def get(self, request, format=None):
if 'query' in request.query_params:
query = request.query_params['query']
try:
page = int(request.query_params.get('page', 1))
except (ValueError, TypeError):
page = 1
if page < 1:
page = 1
with index.query_page(self.ix, query, page) as result_page:
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
'page_count': result_page.pagecount,
'results': list(map(self.add_infos_to_hit, result_page))})
else:
if 'query' not in request.query_params:
return Response({
'count': 0,
'page': 0,
'page_count': 0,
'results': []})
query = request.query_params['query']
try:
page = int(request.query_params.get('page', 1))
except (ValueError, TypeError):
page = 1
if page < 1:
page = 1
try:
with index.query_page(self.ix, query, page) as (result_page,
corrected_query):
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
'page_count': result_page.pagecount,
'corrected_query': corrected_query,
'results': list(map(self.add_infos_to_hit, result_page))})
except Exception as e:
return HttpResponseBadRequest(str(e))
class SearchAutoCompleteView(APIView):

View File

@ -1 +1 @@
__version__ = (0, 9, 3)
__version__ = (0, 9, 4)

View File

@ -5,9 +5,9 @@ def tesseract_consumer_declaration(sender, **kwargs):
return {
"parser": RasterisedDocumentParser,
"weight": 0,
"mime_types": [
"application/pdf",
"image/jpeg",
"image/png"
]
"mime_types": {
"application/pdf": ".pdf",
"image/jpeg": ".jpg",
"image/png": ".png"
}
}

View File

@ -5,8 +5,8 @@ def text_consumer_declaration(sender, **kwargs):
return {
"parser": TextDocumentParser,
"weight": 10,
"mime_types": [
"text/plain",
"text/comma-separated-values"
]
"mime_types": {
"text/plain": ".txt",
"text/csv": ".csv",
}
}