Merge branch 'dev' into celery-tasks

This commit is contained in:
Jonas Winkler
2020-11-22 22:49:37 +01:00
146 changed files with 1762 additions and 1390 deletions

View File

@@ -2,8 +2,8 @@ import datetime
import hashlib
import logging
import os
import re
import magic
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from django.conf import settings
@@ -15,7 +15,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class
from .parsers import ParseError, get_parser_class_for_mime_type
from .signals import (
document_consumption_finished,
document_consumption_started
@@ -69,12 +69,6 @@ class Consumer(LoggingMixin):
"Consumption directory {} does not exist".format(
settings.CONSUMPTION_DIR))
def pre_check_regex(self):
if not re.match(FileInfo.REGEXES["title"], self.filename):
raise ConsumerError(
"Filename {} does not seem to be safe to "
"consume".format(self.filename))
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
@@ -118,18 +112,21 @@ class Consumer(LoggingMixin):
self.pre_check_file_exists()
self.pre_check_consumption_dir()
self.pre_check_directories()
self.pre_check_regex()
self.pre_check_duplicate()
self.log("info", "Consuming {}".format(self.filename))
# Determine the parser class.
parser_class = get_parser_class(self.filename)
mime_type = magic.from_file(self.path, mime=True)
parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class:
raise ConsumerError("No parsers abvailable for {}".format(self.filename))
raise ConsumerError(f"No parsers abvailable for {self.filename}")
else:
self.log("debug", "Parser: {}".format(parser_class.__name__))
self.log("debug",
f"Parser: {parser_class.__name__} "
f"based on mime type {mime_type}")
# Notify all listeners that we're going to do some work.
@@ -156,7 +153,7 @@ class Consumer(LoggingMixin):
# Parse the document. This may take some time.
try:
self.log("debug", "Generating thumbnail for {}...".format(self.filename))
self.log("debug", f"Generating thumbnail for {self.filename}...")
self._send_progress(self.filename, 10, 100, 'WORKING',
'Generating thumbnail...')
thumbnail = document_parser.get_optimised_thumbnail()
@@ -196,7 +193,8 @@ class Consumer(LoggingMixin):
# store the document.
document = self._store(
text=text,
date=date
date=date,
mime_type=mime_type
)
# If we get here, it was successful. Proceed with post-consume
@@ -239,11 +237,11 @@ class Consumer(LoggingMixin):
return document
def _store(self, text, date):
def _store(self, text, date, mime_type):
# If someone gave us the original filename, use it instead of doc.
file_info = FileInfo.from_path(self.filename)
file_info = FileInfo.from_filename(self.filename)
stats = os.stat(self.path)
@@ -262,7 +260,7 @@ class Consumer(LoggingMixin):
correspondent=file_info.correspondent,
title=file_info.title,
content=text,
file_type=file_info.extension,
mime_type=mime_type,
checksum=hashlib.md5(f.read()).hexdigest(),
created=created,
modified=created,
@@ -290,10 +288,12 @@ class Consumer(LoggingMixin):
document.title = self.override_title
if self.override_correspondent_id:
document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
document.correspondent = Correspondent.objects.get(
pk=self.override_correspondent_id)
if self.override_document_type_id:
document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
document.document_type = DocumentType.objects.get(
pk=self.override_document_type_id)
if self.override_tag_ids:
for tag_id in self.override_tag_ids: