Merge branch 'dev' into feature-localization

This commit is contained in:
jonaswinkler
2021-01-02 00:15:03 +01:00
49 changed files with 3218 additions and 466 deletions

View File

@@ -6,29 +6,21 @@ class DocumentsConfig(AppConfig):
name = "documents"
def ready(self):
from .signals import document_consumption_started
from .signals import document_consumption_finished
from .signals.handlers import (
add_inbox_tags,
run_pre_consume_script,
run_post_consume_script,
set_log_entry,
set_correspondent,
set_document_type,
set_tags,
add_to_index
)
document_consumption_started.connect(run_pre_consume_script)
document_consumption_finished.connect(add_inbox_tags)
document_consumption_finished.connect(set_correspondent)
document_consumption_finished.connect(set_document_type)
document_consumption_finished.connect(set_tags)
document_consumption_finished.connect(set_log_entry)
document_consumption_finished.connect(add_to_index)
document_consumption_finished.connect(run_post_consume_script)
AppConfig.ready(self)

View File

@@ -1,7 +1,7 @@
import datetime
import hashlib
import logging
import os
from subprocess import Popen
import magic
from django.conf import settings
@@ -9,6 +9,7 @@ from django.db import transaction
from django.db.models import Q
from django.utils import timezone
from filelock import FileLock
from rest_framework.reverse import reverse
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import create_source_path_directory, \
@@ -66,6 +67,39 @@ class Consumer(LoggingMixin):
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)
def run_pre_consume_script(self):
if not settings.PRE_CONSUME_SCRIPT:
return
try:
Popen((settings.PRE_CONSUME_SCRIPT, self.path)).wait()
except Exception as e:
raise ConsumerError(
f"Error while executing pre-consume script: {e}"
)
def run_post_consume_script(self, document):
if not settings.POST_CONSUME_SCRIPT:
return
try:
Popen((
settings.POST_CONSUME_SCRIPT,
str(document.pk),
document.get_public_filename(),
os.path.normpath(document.source_path),
os.path.normpath(document.thumbnail_path),
reverse("document-download", kwargs={"pk": document.pk}),
reverse("document-thumb", kwargs={"pk": document.pk}),
str(document.correspondent),
str(",".join(document.tags.all().values_list(
"name", flat=True)))
)).wait()
except Exception as e:
raise ConsumerError(
f"Error while executing pre-consume script: {e}"
)
def try_consume_file(self,
path,
override_filename=None,
@@ -119,6 +153,8 @@ class Consumer(LoggingMixin):
logging_group=self.logging_group
)
self.run_pre_consume_script()
# This doesn't parse the document yet, but gives us a parser.
document_parser = parser_class(self.logging_group)
@@ -130,7 +166,7 @@ class Consumer(LoggingMixin):
try:
self.log("debug", "Parsing {}...".format(self.filename))
document_parser.parse(self.path, mime_type)
document_parser.parse(self.path, mime_type, self.filename)
self.log("debug", f"Generating thumbnail for {self.filename}...")
thumbnail = document_parser.get_optimised_thumbnail(
@@ -215,6 +251,9 @@ class Consumer(LoggingMixin):
# Delete the file only if it was successfully consumed
self.log("debug", "Deleting file {}".format(self.path))
os.unlink(self.path)
self.run_post_consume_script(document)
except Exception as e:
self.log(
"error",

View File

@@ -0,0 +1,18 @@
# Generated by Django 3.1.4 on 2021-01-01 21:59
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1009_auto_20201216_2005'),
]
operations = [
migrations.AlterField(
model_name='savedviewfilterrule',
name='value',
field=models.CharField(blank=True, max_length=128, null=True),
),
]

View File

@@ -404,7 +404,9 @@ class SavedViewFilterRule(models.Model):
value = models.CharField(
_("value"),
max_length=128)
max_length=128,
blank=True,
null=True)
class Meta:
verbose_name = _("filter rule")

View File

@@ -144,6 +144,52 @@ def run_convert(input_file,
raise ParseError("Convert failed at {}".format(args))
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
out_path = os.path.join(temp_dir, "convert.png")
# Run convert to get a decent thumbnail
try:
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file="{}[0]".format(in_path),
output_file=out_path,
logging_group=logging_group)
except ParseError:
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
logger.warning(
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
extra={'group': logging_group}
)
gs_out_path = os.path.join(temp_dir, "gs_out.png")
cmd = [settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
in_path]
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=gs_out_path,
output_file=out_path,
logging_group=logging_group)
return out_path
def parse_date(filename, text):
"""
Returns the date of the document.
@@ -221,7 +267,7 @@ class DocumentParser(LoggingMixin):
def extract_metadata(self, document_path, mime_type):
return []
def parse(self, document_path, mime_type):
def parse(self, document_path, mime_type, file_name=None):
raise NotImplementedError()
def get_archive_path(self):

View File

@@ -11,7 +11,6 @@ from django.db.models import Q
from django.dispatch import receiver
from django.utils import timezone
from filelock import FileLock
from rest_framework.reverse import reverse
from .. import index, matching
from ..file_handling import delete_empty_directories, \
@@ -147,32 +146,6 @@ def set_tags(sender,
document.tags.add(*relevant_tags)
def run_pre_consume_script(sender, filename, **kwargs):
if not settings.PRE_CONSUME_SCRIPT:
return
Popen((settings.PRE_CONSUME_SCRIPT, filename)).wait()
def run_post_consume_script(sender, document, **kwargs):
if not settings.POST_CONSUME_SCRIPT:
return
Popen((
settings.POST_CONSUME_SCRIPT,
str(document.pk),
document.get_public_filename(),
os.path.normpath(document.source_path),
os.path.normpath(document.thumbnail_path),
reverse("document-download", kwargs={"pk": document.pk}),
reverse("document-thumb", kwargs={"pk": document.pk}),
str(document.correspondent),
str(",".join(document.tags.all().values_list("name", flat=True)))
)).wait()
@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):
with FileLock(settings.MEDIA_LOCK):

View File

@@ -177,7 +177,7 @@ class DummyParser(DocumentParser):
def get_optimised_thumbnail(self, document_path, mime_type):
return self.fake_thumb
def parse(self, document_path, mime_type):
def parse(self, document_path, mime_type, file_name=None):
self.text = "The Text"
@@ -194,7 +194,7 @@ class FaultyParser(DocumentParser):
def get_optimised_thumbnail(self, document_path, mime_type):
return self.fake_thumb
def parse(self, document_path, mime_type):
def parse(self, document_path, mime_type, file_name=None):
raise ParseError("Does not compute.")
@@ -466,3 +466,53 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertTrue(os.path.isfile(dst))
self.assertRaises(ConsumerError, self.consumer.try_consume_file, dst)
self.assertTrue(os.path.isfile(dst))
class PostConsumeTestCase(TestCase):
@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT=None)
def test_no_post_consume_script(self, m):
doc = Document.objects.create(title="Test", mime_type="application/pdf")
tag1 = Tag.objects.create(name="a")
tag2 = Tag.objects.create(name="b")
doc.tags.add(tag1)
doc.tags.add(tag2)
Consumer().run_post_consume_script(doc)
m.assert_not_called()
@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT="script")
def test_post_consume_script_simple(self, m):
doc = Document.objects.create(title="Test", mime_type="application/pdf")
Consumer().run_post_consume_script(doc)
m.assert_called_once()
@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT="script")
def test_post_consume_script_with_correspondent(self, m):
c = Correspondent.objects.create(name="my_bank")
doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c)
tag1 = Tag.objects.create(name="a")
tag2 = Tag.objects.create(name="b")
doc.tags.add(tag1)
doc.tags.add(tag2)
Consumer().run_post_consume_script(doc)
m.assert_called_once()
args, kwargs = m.call_args
command = args[0]
self.assertEqual(command[0], "script")
self.assertEqual(command[1], str(doc.pk))
self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/")
self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/")
self.assertEqual(command[7], "my_bank")
self.assertCountEqual(command[8].split(","), ["a", "b"])

View File

@@ -1,56 +0,0 @@
from unittest import mock
from django.test import TestCase, override_settings
from documents.models import Document, Tag, Correspondent
from documents.signals.handlers import run_post_consume_script
class PostConsumeTestCase(TestCase):
@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT=None)
def test_no_post_consume_script(self, m):
doc = Document.objects.create(title="Test", mime_type="application/pdf")
tag1 = Tag.objects.create(name="a")
tag2 = Tag.objects.create(name="b")
doc.tags.add(tag1)
doc.tags.add(tag2)
run_post_consume_script(None, doc)
m.assert_not_called()
@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT="script")
def test_post_consume_script_simple(self, m):
doc = Document.objects.create(title="Test", mime_type="application/pdf")
run_post_consume_script(None, doc)
m.assert_called_once()
@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT="script")
def test_post_consume_script_with_correspondent(self, m):
c = Correspondent.objects.create(name="my_bank")
doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c)
tag1 = Tag.objects.create(name="a")
tag2 = Tag.objects.create(name="b")
doc.tags.add(tag1)
doc.tags.add(tag2)
run_post_consume_script(None, doc)
m.assert_called_once()
args, kwargs = m.call_args
command = args[0]
self.assertEqual(command[0], "script")
self.assertEqual(command[1], str(doc.pk))
self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/")
self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/")
self.assertEqual(command[7], "my_bank")
self.assertCountEqual(command[8].split(","), ["a", "b"])

View File

@@ -89,6 +89,7 @@ INSTALLED_APPS = [
"documents.apps.DocumentsConfig",
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_tika.apps.PaperlessTikaConfig",
"paperless_mail.apps.PaperlessMailConfig",
"django.contrib.admin",
@@ -436,3 +437,10 @@ for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")
THUMBNAIL_FONT_NAME = os.getenv("PAPERLESS_THUMBNAIL_FONT_NAME", "/usr/share/fonts/liberation/LiberationSerif-Regular.ttf")
# Tika settings
PAPERLESS_TIKA_ENABLED = __get_boolean("PAPERLESS_TIKA_ENABLED", "NO")
PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost:9998")
PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
"PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000"
)

View File

@@ -1,7 +1,6 @@
import json
import os
import re
import subprocess
import ocrmypdf
import pdftotext
@@ -10,7 +9,8 @@ from PIL import Image
from django.conf import settings
from ocrmypdf import InputFileError, EncryptedPdfError
from documents.parsers import DocumentParser, ParseError, run_convert
from documents.parsers import DocumentParser, ParseError, \
make_thumbnail_from_pdf
class RasterisedDocumentParser(DocumentParser):
@@ -47,50 +47,8 @@ class RasterisedDocumentParser(DocumentParser):
return result
def get_thumbnail(self, document_path, mime_type):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
out_path = os.path.join(self.tempdir, "convert.png")
# Run convert to get a decent thumbnail
try:
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file="{}[0]".format(document_path),
output_file=out_path,
logging_group=self.logging_group)
except ParseError:
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
self.log(
'warning',
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
gs_out_path = os.path.join(self.tempdir, "gs_out.png")
cmd = [settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
document_path]
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=gs_out_path,
output_file=out_path,
logging_group=self.logging_group)
return out_path
return make_thumbnail_from_pdf(
document_path, self.tempdir, self.logging_group)
def is_image(self, mime_type):
return mime_type in [
@@ -130,7 +88,7 @@ class RasterisedDocumentParser(DocumentParser):
f"Error while calculating DPI for image {image}: {e}")
return None
def parse(self, document_path, mime_type):
def parse(self, document_path, mime_type, file_name=None):
mode = settings.OCR_MODE
text_original = get_text_from_pdf(document_path)

View File

@@ -32,6 +32,6 @@ class TextDocumentParser(DocumentParser):
return out_path
def parse(self, document_path, mime_type):
def parse(self, document_path, mime_type, file_name=None):
with open(document_path, 'r') as f:
self.text = f.read()

View File

@@ -0,0 +1,14 @@
from django.apps import AppConfig
from django.conf import settings
from paperless_tika.signals import tika_consumer_declaration
class PaperlessTikaConfig(AppConfig):
name = "paperless_tika"
def ready(self):
from documents.signals import document_consumer_declaration
if settings.PAPERLESS_TIKA_ENABLED:
document_consumer_declaration.connect(tika_consumer_declaration)
AppConfig.ready(self)

View File

@@ -0,0 +1,86 @@
import os
import requests
import dateutil.parser
from django.conf import settings
from documents.parsers import DocumentParser, ParseError, \
make_thumbnail_from_pdf
from tika import parser
class TikaDocumentParser(DocumentParser):
"""
This parser sends documents to a local tika server
"""
def get_thumbnail(self, document_path, mime_type):
if not self.archive_path:
self.archive_path = self.convert_to_pdf(document_path)
return make_thumbnail_from_pdf(
self.archive_path, self.tempdir, self.logging_group)
def extract_metadata(self, document_path, mime_type):
tika_server = settings.PAPERLESS_TIKA_ENDPOINT
try:
parsed = parser.from_file(document_path, tika_server)
except Exception as e:
self.log("warning", f"Error while fetching document metadata for "
f"{document_path}: {e}")
return []
return [
{
"namespace": "",
"prefix": "",
"key": key,
"value": parsed['metadata'][key]
} for key in parsed['metadata']
]
def parse(self, document_path, mime_type, file_name=None):
self.log("info", f"Sending {document_path} to Tika server")
tika_server = settings.PAPERLESS_TIKA_ENDPOINT
try:
parsed = parser.from_file(document_path, tika_server)
except Exception as err:
raise ParseError(
f"Could not parse {document_path} with tika server at "
f"{tika_server}: {err}"
)
self.text = parsed["content"].strip()
try:
self.date = dateutil.parser.isoparse(
parsed["metadata"]["Creation-Date"])
except Exception as e:
self.log("warning", f"Unable to extract date for document "
f"{document_path}: {e}")
self.archive_path = self.convert_to_pdf(document_path, file_name)
def convert_to_pdf(self, document_path, file_name):
pdf_path = os.path.join(self.tempdir, "convert.pdf")
gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
url = gotenberg_server + "/convert/office"
self.log("info", f"Converting {document_path} to PDF as {pdf_path}")
files = {"files": (file_name, open(document_path, "rb"))}
headers = {}
try:
response = requests.post(url, files=files, headers=headers)
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(
f"Error while converting document to PDF: {err}"
)
file = open(pdf_path, "wb")
file.write(response.content)
file.close()
return pdf_path

View File

@@ -0,0 +1,20 @@
from .parsers import TikaDocumentParser
def tika_consumer_declaration(sender, **kwargs):
return {
"parser": TikaDocumentParser,
"weight": 10,
"mime_types": {
"application/msword": ".doc",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
"application/vnd.ms-excel": ".xls",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
"application/vnd.ms-powerpoint": ".ppt",
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
"application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
"application/vnd.oasis.opendocument.presentation": ".odp",
"application/vnd.oasis.opendocument.spreadsheet": ".ods",
"application/vnd.oasis.opendocument.text": ".odt",
},
}