Merge branch 'dev' into feature-ocrmypdf

This commit is contained in:
jonaswinkler
2020-11-30 16:48:09 +01:00
25 changed files with 301 additions and 75 deletions

View File

@@ -10,10 +10,11 @@ from django.db.models import Q
from django.utils import timezone
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory
from .file_handling import create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class_for_mime_type, parse_date
from .parsers import ParseError, get_parser_class_for_mime_type, \
get_supported_file_extensions, parse_date
from .signals import (
document_consumption_finished,
document_consumption_started
@@ -40,6 +41,21 @@ class Consumer(LoggingMixin):
raise ConsumerError("Cannot consume {}: It is not a file".format(
self.path))
def pre_check_file_extension(self):
    """Refuse files whose extension no registered parser can handle.

    The extension of ``self.filename`` is compared case-insensitively
    against the set returned by ``get_supported_file_extensions()``, so
    that e.g. ``scan.PDF`` is accepted whenever ``.pdf`` is supported.

    Raises:
        ConsumerError: if the filename has no extension at all, or if
            its extension is not among the supported ones.
    """
    extensions = get_supported_file_extensions()
    _, ext = os.path.splitext(self.filename)

    if not ext:
        raise ConsumerError(
            f"Not consuming {self.filename}: File type unknown."
        )

    # Normalise both sides; scanners and Windows tools frequently produce
    # upper-case extensions, which a plain membership test would reject.
    known_extensions = {known.lower() for known in extensions}
    if ext.lower() not in known_extensions:
        # The message keeps the extension exactly as it appears in the
        # filename, and the original (non-normalised) set of extensions.
        raise ConsumerError(
            f"Not consuming {self.filename}: File extension {ext} does "
            f"not map to any known file type ({str(extensions)})"
        )
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
@@ -82,6 +98,7 @@ class Consumer(LoggingMixin):
# Make sure that preconditions for consuming the file are met.
self.pre_check_file_exists()
self.pre_check_file_extension()
self.pre_check_directories()
self.pre_check_duplicate()

View File

@@ -4,10 +4,11 @@ from contextlib import contextmanager
from django.conf import settings
from whoosh import highlight
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.writing import AsyncWriter
@@ -59,14 +60,19 @@ def get_schema():
id=NUMERIC(stored=True, unique=True, numtype=int),
title=TEXT(stored=True),
content=TEXT(),
correspondent=TEXT(stored=True)
correspondent=TEXT(stored=True),
tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True),
type=TEXT(stored=True),
created=DATETIME(stored=True, sortable=True),
modified=DATETIME(stored=True, sortable=True),
added=DATETIME(stored=True, sortable=True),
)
def open_index(recreate=False):
try:
if exists_in(settings.INDEX_DIR) and not recreate:
return open_dir(settings.INDEX_DIR)
return open_dir(settings.INDEX_DIR, schema=get_schema())
except Exception as e:
logger.error(f"Error while opening the index: {e}, recreating.")
@@ -77,11 +83,17 @@ def open_index(recreate=False):
def update_document(writer, doc):
    """Add or refresh the search index entry for a single document.

    Args:
        writer: index writer exposing ``update_document(**fields)``.
        doc: document providing ``pk``, ``title``, ``content``,
            ``correspondent``, ``document_type``, ``tags``, ``created``,
            ``added`` and ``modified``.
    """
    logger.debug("Indexing {}...".format(doc))

    # The ``tag`` schema field is a comma-separated KEYWORD field, so all
    # tag names are flattened into a single comma-joined string.
    # NOTE(review): a tag name that itself contains a comma would be split
    # into several keywords by the index - confirm whether that can occur.
    tags = ",".join([t.name for t in doc.tags.all()])

    writer.update_document(
        id=doc.pk,
        title=doc.title,
        content=doc.content,
        correspondent=doc.correspondent.name if doc.correspondent else None,
        # An empty string is passed as None, like the other optional fields.
        tag=tags if tags else None,
        type=doc.document_type.name if doc.document_type else None,
        created=doc.created,
        added=doc.added,
        modified=doc.modified,
    )
@@ -103,16 +115,27 @@ def remove_document_from_index(document):
@contextmanager
def query_page(ix, querystring, page):
    """Run a paged full-text search and yield the results.

    Used as a context manager so that the searcher stays open while the
    caller reads the result page, and is closed afterwards even if the
    caller raises.

    Args:
        ix: the opened index; provides ``searcher()`` and ``schema``.
        querystring: the raw query text entered by the user.
        page: page number handed to ``searcher.search_page``.

    Yields:
        A ``(result_page, corrected_query)`` tuple. ``corrected_query`` is
        the corrected query string when the searcher's correction differs
        from the parsed query, otherwise ``None``.
    """
    searcher = ix.searcher()
    try:
        qp = MultifieldParser(
            ["content", "title", "correspondent", "tag", "type"],
            ix.schema)
        # Adds date expression parsing to the query parser.
        qp.add_plugin(DateParserPlugin())
        q = qp.parse(querystring)

        result_page = searcher.search_page(q, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()

        # Only report a correction when it actually changes the query.
        corrected = searcher.correct_query(q, querystring)
        if corrected.query != q:
            corrected_query = corrected.string
        else:
            corrected_query = None

        yield result_page, corrected_query
    finally:
        searcher.close()

View File

@@ -1,7 +1,6 @@
# coding=utf-8
import logging
import mimetypes
import os
import re
from collections import OrderedDict
@@ -12,6 +11,8 @@ from django.db import models
from django.utils import timezone
from django.utils.text import slugify
from documents.parsers import get_default_file_extension
class MatchingModel(models.Model):
@@ -204,7 +205,7 @@ class Document(models.Model):
ordering = ("correspondent", "title")
def __str__(self):
created = self.created.strftime("%Y%m%d%H%M%S")
created = self.created.strftime("%Y%m%d")
if self.correspondent and self.title:
return "{}: {} - {}".format(
created, self.correspondent, self.title)
@@ -255,8 +256,7 @@ class Document(models.Model):
@property
def file_type(self):
    """Return the default file extension for this document's mime type.

    Delegates to ``get_default_file_extension`` rather than
    ``mimetypes.guess_extension``, whose result is not stable across
    Python versions.
    """
    return get_default_file_extension(self.mime_type)
@property
def thumbnail_path(self):

View File

@@ -1,4 +1,5 @@
import logging
import mimetypes
import os
import re
import shutil
@@ -42,6 +43,29 @@ def is_mime_type_supported(mime_type):
return get_parser_class_for_mime_type(mime_type) is not None
def get_default_file_extension(mime_type):
    """Look up the file extension a parser declares for ``mime_type``.

    Every receiver of ``document_consumer_declaration`` is asked for its
    declaration; the first one whose ``"mime_types"`` mapping contains
    ``mime_type`` determines the result.

    Returns:
        The declared extension, or ``None`` when no declaration lists the
        mime type.
    """
    declarations = (
        response[1]
        for response in document_consumer_declaration.send(None)
    )
    for declaration in declarations:
        extension_by_mime_type = declaration["mime_types"]
        if mime_type in extension_by_mime_type:
            return extension_by_mime_type[mime_type]
    return None
def get_supported_file_extensions():
    """Collect every file extension that some registered parser accepts.

    For each mime type declared by a receiver of
    ``document_consumer_declaration``, all extensions known to the
    ``mimetypes`` module are included. The extension that the parser
    itself declares for the mime type is included as well, because the
    ``mimetypes`` database differs between platforms and Python versions
    and may not list it.

    Returns:
        A set of extensions, each including the leading dot.
    """
    extensions = set()
    for response in document_consumer_declaration.send(None):
        parser_declaration = response[1]
        supported_mime_types = parser_declaration["mime_types"]

        for mime_type in supported_mime_types:
            extensions.update(mimetypes.guess_all_extensions(mime_type))

        # Declarations map mime type -> default extension. Guard with
        # isinstance so that a plain list of mime types keeps working.
        if isinstance(supported_mime_types, dict):
            extensions.update(
                extension
                for extension in supported_mime_types.values()
                if extension
            )
    return extensions
def get_parser_class_for_mime_type(mime_type):
options = []

View File

@@ -325,6 +325,22 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 10)
def test_search_spelling_correction(self):
    """The search API suggests a correction only for misspelled queries."""
    # Index 55 documents that all contain the word "things".
    with AsyncWriter(index.open_index()) as writer:
        for number in range(1, 56):
            doc = Document.objects.create(
                checksum=str(number - 1),
                pk=number,
                title=f"Document {number}",
                content=f"Things document {number}",
            )
            index.update_document(writer, doc)

    # "thing" is not an indexed term and should be corrected to "things".
    misspelled = self.client.get("/api/search/?query=thing")
    self.assertEqual(misspelled.data['corrected_query'], "things")

    # A query that is already an indexed term yields no correction.
    exact = self.client.get("/api/search/?query=things")
    self.assertEqual(exact.data['corrected_query'], None)
def test_statistics(self):
doc1 = Document.objects.create(title="none1", checksum="A")

View File

@@ -425,7 +425,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
m = patcher.start()
m.return_value = [(None, {
"parser": self.make_dummy_parser,
"mime_types": ["application/pdf"],
"mime_types": {"application/pdf": ".pdf"},
"weight": 0
})]
@@ -551,7 +551,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertTrue(str(e).startswith("No parsers abvailable"))
self.assertTrue("File extension .pdf does not map to any" in str(e))
return
self.fail("Should throw exception")
@@ -560,7 +560,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
def testFaultyParser(self, m):
m.return_value = [(None, {
"parser": self.make_faulty_parser,
"mime_types": ["application/pdf"],
"mime_types": {"application/pdf": ".pdf"},
"weight": 0
})]

View File

@@ -6,7 +6,10 @@ from unittest import mock
from django.test import TestCase, override_settings
from documents.parsers import get_parser_class, DocumentParser
from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \
get_parser_class_for_mime_type, DocumentParser
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_text.parsers import TextDocumentParser
def fake_magic_from_file(file, mime=False):
@@ -29,7 +32,7 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
(None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}),
(None, {"weight": 0, "parser": DummyParser, "mime_types": {"application/pdf": ".pdf"}}),
)
self.assertEqual(
@@ -47,8 +50,8 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
(None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}),
(None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}),
(None, {"weight": 0, "parser": DummyParser1, "mime_types": {"application/pdf": ".pdf"}}),
(None, {"weight": 1, "parser": DummyParser2, "mime_types": {"application/pdf": ".pdf"}}),
)
self.assertEqual(
@@ -96,3 +99,20 @@ class TestBaseParser(TestCase):
path = parser.get_optimised_thumbnail("any", "not important")
self.assertEqual(path, fake_get_thumbnail(None, None, None))
class TestParserAvailability(TestCase):
    """Checks the parser registry against the parsers that are installed."""

    def test_file_extensions(self):
        """Extensions, default extensions and parser classes resolve."""
        supported = get_supported_file_extensions()
        for extension in (".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"):
            self.assertIn(extension, supported)

        # mime type -> default extension declared by the parsers
        expected_defaults = (
            ('application/pdf', ".pdf"),
            ('image/png', ".png"),
            ('image/jpeg', ".jpg"),
            ('text/plain', ".txt"),
            ('text/csv', ".csv"),
            ('aasdasd/dgfgf', None),
        )
        for mime_type, extension in expected_defaults:
            self.assertEqual(
                get_default_file_extension(mime_type), extension)

        # mime type -> parser class chosen for it
        expected_parsers = (
            ('application/pdf', RasterisedDocumentParser),
            ('text/plain', TextDocumentParser),
            ('text/sdgsdf', None),
        )
        for mime_type, parser_class in expected_parsers:
            self.assertEqual(
                get_parser_class_for_mime_type(mime_type), parser_class)

View File

@@ -236,30 +236,34 @@ class SearchView(APIView):
}
def get(self, request, format=None):
    """Handle a search request.

    Query parameters:
        query: the search text. When absent, an empty result set is
            returned.
        page: 1-based page number; missing, non-numeric or smaller than 1
            values fall back to page 1.

    Returns:
        A ``Response`` with ``count``, ``page``, ``page_count``,
        ``corrected_query`` and ``results``, or an
        ``HttpResponseBadRequest`` carrying the error message when
        running the search raises.
    """
    if 'query' not in request.query_params:
        return Response({
            'count': 0,
            'page': 0,
            'page_count': 0,
            'results': []})

    query = request.query_params['query']

    try:
        page = int(request.query_params.get('page', 1))
    except (ValueError, TypeError):
        page = 1

    if page < 1:
        page = 1

    try:
        with index.query_page(self.ix, query, page) as (result_page,
                                                       corrected_query):
            return Response(
                {'count': len(result_page),
                 'page': result_page.pagenum,
                 'page_count': result_page.pagecount,
                 'corrected_query': corrected_query,
                 'results': list(map(self.add_infos_to_hit, result_page))})
    except Exception as e:
        # Deliberately broad: any failure while running the search is
        # reported to the client as a bad request instead of a server
        # error.
        # NOTE(review): this also hides errors raised by
        # add_infos_to_hit - consider narrowing to query parsing errors.
        return HttpResponseBadRequest(str(e))
class SearchAutoCompleteView(APIView):

View File

@@ -1 +1 @@
__version__ = (0, 9, 3)
__version__ = (0, 9, 4)

View File

@@ -5,9 +5,9 @@ def tesseract_consumer_declaration(sender, **kwargs):
return {
"parser": RasterisedDocumentParser,
"weight": 0,
"mime_types": [
"application/pdf",
"image/jpeg",
"image/png"
]
"mime_types": {
"application/pdf": ".pdf",
"image/jpeg": ".jpg",
"image/png": ".png"
}
}

View File

@@ -5,8 +5,8 @@ def text_consumer_declaration(sender, **kwargs):
return {
"parser": TextDocumentParser,
"weight": 10,
"mime_types": [
"text/plain",
"text/comma-separated-values"
]
"mime_types": {
"text/plain": ".txt",
"text/csv": ".csv",
}
}