Merge branch 'dev' into feature-localization

This commit is contained in:
jonaswinkler
2021-01-02 00:15:03 +01:00
49 changed files with 3218 additions and 466 deletions

View File

@@ -6,29 +6,21 @@ class DocumentsConfig(AppConfig):
name = "documents"
def ready(self):
from .signals import document_consumption_started
from .signals import document_consumption_finished
from .signals.handlers import (
add_inbox_tags,
run_pre_consume_script,
run_post_consume_script,
set_log_entry,
set_correspondent,
set_document_type,
set_tags,
add_to_index
)
document_consumption_started.connect(run_pre_consume_script)
document_consumption_finished.connect(add_inbox_tags)
document_consumption_finished.connect(set_correspondent)
document_consumption_finished.connect(set_document_type)
document_consumption_finished.connect(set_tags)
document_consumption_finished.connect(set_log_entry)
document_consumption_finished.connect(add_to_index)
document_consumption_finished.connect(run_post_consume_script)
AppConfig.ready(self)

View File

@@ -1,7 +1,7 @@
import datetime
import hashlib
import logging
import os
from subprocess import Popen
import magic
from django.conf import settings
@@ -9,6 +9,7 @@ from django.db import transaction
from django.db.models import Q
from django.utils import timezone
from filelock import FileLock
from rest_framework.reverse import reverse
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import create_source_path_directory, \
@@ -66,6 +67,39 @@ class Consumer(LoggingMixin):
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)
def run_pre_consume_script(self):
if not settings.PRE_CONSUME_SCRIPT:
return
try:
Popen((settings.PRE_CONSUME_SCRIPT, self.path)).wait()
except Exception as e:
raise ConsumerError(
f"Error while executing pre-consume script: {e}"
)
def run_post_consume_script(self, document):
if not settings.POST_CONSUME_SCRIPT:
return
try:
Popen((
settings.POST_CONSUME_SCRIPT,
str(document.pk),
document.get_public_filename(),
os.path.normpath(document.source_path),
os.path.normpath(document.thumbnail_path),
reverse("document-download", kwargs={"pk": document.pk}),
reverse("document-thumb", kwargs={"pk": document.pk}),
str(document.correspondent),
str(",".join(document.tags.all().values_list(
"name", flat=True)))
)).wait()
except Exception as e:
raise ConsumerError(
f"Error while executing pre-consume script: {e}"
)
def try_consume_file(self,
path,
override_filename=None,
@@ -119,6 +153,8 @@ class Consumer(LoggingMixin):
logging_group=self.logging_group
)
self.run_pre_consume_script()
# This doesn't parse the document yet, but gives us a parser.
document_parser = parser_class(self.logging_group)
@@ -130,7 +166,7 @@ class Consumer(LoggingMixin):
try:
self.log("debug", "Parsing {}...".format(self.filename))
document_parser.parse(self.path, mime_type)
document_parser.parse(self.path, mime_type, self.filename)
self.log("debug", f"Generating thumbnail for {self.filename}...")
thumbnail = document_parser.get_optimised_thumbnail(
@@ -215,6 +251,9 @@ class Consumer(LoggingMixin):
# Delete the file only if it was successfully consumed
self.log("debug", "Deleting file {}".format(self.path))
os.unlink(self.path)
self.run_post_consume_script(document)
except Exception as e:
self.log(
"error",

View File

@@ -0,0 +1,18 @@
# Generated by Django 3.1.4 on 2021-01-01 21:59
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1009_auto_20201216_2005'),
]
operations = [
migrations.AlterField(
model_name='savedviewfilterrule',
name='value',
field=models.CharField(blank=True, max_length=128, null=True),
),
]

View File

@@ -404,7 +404,9 @@ class SavedViewFilterRule(models.Model):
value = models.CharField(
_("value"),
max_length=128)
max_length=128,
blank=True,
null=True)
class Meta:
verbose_name = _("filter rule")

View File

@@ -144,6 +144,52 @@ def run_convert(input_file,
raise ParseError("Convert failed at {}".format(args))
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
out_path = os.path.join(temp_dir, "convert.png")
# Run convert to get a decent thumbnail
try:
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file="{}[0]".format(in_path),
output_file=out_path,
logging_group=logging_group)
except ParseError:
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
logger.warning(
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
extra={'group': logging_group}
)
gs_out_path = os.path.join(temp_dir, "gs_out.png")
cmd = [settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
in_path]
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=gs_out_path,
output_file=out_path,
logging_group=logging_group)
return out_path
def parse_date(filename, text):
"""
Returns the date of the document.
@@ -221,7 +267,7 @@ class DocumentParser(LoggingMixin):
def extract_metadata(self, document_path, mime_type):
return []
def parse(self, document_path, mime_type):
def parse(self, document_path, mime_type, file_name=None):
raise NotImplementedError()
def get_archive_path(self):

View File

@@ -11,7 +11,6 @@ from django.db.models import Q
from django.dispatch import receiver
from django.utils import timezone
from filelock import FileLock
from rest_framework.reverse import reverse
from .. import index, matching
from ..file_handling import delete_empty_directories, \
@@ -147,32 +146,6 @@ def set_tags(sender,
document.tags.add(*relevant_tags)
def run_pre_consume_script(sender, filename, **kwargs):
if not settings.PRE_CONSUME_SCRIPT:
return
Popen((settings.PRE_CONSUME_SCRIPT, filename)).wait()
def run_post_consume_script(sender, document, **kwargs):
if not settings.POST_CONSUME_SCRIPT:
return
Popen((
settings.POST_CONSUME_SCRIPT,
str(document.pk),
document.get_public_filename(),
os.path.normpath(document.source_path),
os.path.normpath(document.thumbnail_path),
reverse("document-download", kwargs={"pk": document.pk}),
reverse("document-thumb", kwargs={"pk": document.pk}),
str(document.correspondent),
str(",".join(document.tags.all().values_list("name", flat=True)))
)).wait()
@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):
with FileLock(settings.MEDIA_LOCK):

View File

@@ -177,7 +177,7 @@ class DummyParser(DocumentParser):
def get_optimised_thumbnail(self, document_path, mime_type):
return self.fake_thumb
def parse(self, document_path, mime_type):
def parse(self, document_path, mime_type, file_name=None):
self.text = "The Text"
@@ -194,7 +194,7 @@ class FaultyParser(DocumentParser):
def get_optimised_thumbnail(self, document_path, mime_type):
return self.fake_thumb
def parse(self, document_path, mime_type):
def parse(self, document_path, mime_type, file_name=None):
raise ParseError("Does not compute.")
@@ -466,3 +466,53 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertTrue(os.path.isfile(dst))
self.assertRaises(ConsumerError, self.consumer.try_consume_file, dst)
self.assertTrue(os.path.isfile(dst))
class PostConsumeTestCase(TestCase):
@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT=None)
def test_no_post_consume_script(self, m):
doc = Document.objects.create(title="Test", mime_type="application/pdf")
tag1 = Tag.objects.create(name="a")
tag2 = Tag.objects.create(name="b")
doc.tags.add(tag1)
doc.tags.add(tag2)
Consumer().run_post_consume_script(doc)
m.assert_not_called()
@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT="script")
def test_post_consume_script_simple(self, m):
doc = Document.objects.create(title="Test", mime_type="application/pdf")
Consumer().run_post_consume_script(doc)
m.assert_called_once()
@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT="script")
def test_post_consume_script_with_correspondent(self, m):
c = Correspondent.objects.create(name="my_bank")
doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c)
tag1 = Tag.objects.create(name="a")
tag2 = Tag.objects.create(name="b")
doc.tags.add(tag1)
doc.tags.add(tag2)
Consumer().run_post_consume_script(doc)
m.assert_called_once()
args, kwargs = m.call_args
command = args[0]
self.assertEqual(command[0], "script")
self.assertEqual(command[1], str(doc.pk))
self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/")
self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/")
self.assertEqual(command[7], "my_bank")
self.assertCountEqual(command[8].split(","), ["a", "b"])

View File

@@ -1,56 +0,0 @@
from unittest import mock
from django.test import TestCase, override_settings
from documents.models import Document, Tag, Correspondent
from documents.signals.handlers import run_post_consume_script
class PostConsumeTestCase(TestCase):
@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT=None)
def test_no_post_consume_script(self, m):
doc = Document.objects.create(title="Test", mime_type="application/pdf")
tag1 = Tag.objects.create(name="a")
tag2 = Tag.objects.create(name="b")
doc.tags.add(tag1)
doc.tags.add(tag2)
run_post_consume_script(None, doc)
m.assert_not_called()
@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT="script")
def test_post_consume_script_simple(self, m):
doc = Document.objects.create(title="Test", mime_type="application/pdf")
run_post_consume_script(None, doc)
m.assert_called_once()
@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT="script")
def test_post_consume_script_with_correspondent(self, m):
c = Correspondent.objects.create(name="my_bank")
doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c)
tag1 = Tag.objects.create(name="a")
tag2 = Tag.objects.create(name="b")
doc.tags.add(tag1)
doc.tags.add(tag2)
run_post_consume_script(None, doc)
m.assert_called_once()
args, kwargs = m.call_args
command = args[0]
self.assertEqual(command[0], "script")
self.assertEqual(command[1], str(doc.pk))
self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/")
self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/")
self.assertEqual(command[7], "my_bank")
self.assertCountEqual(command[8].split(","), ["a", "b"])