Added file type checks to the parsers to prevent temporary files from being consumed. Also, parsers now announce the file types they wish to use as the default for each MIME type.

This commit is contained in:
jonaswinkler
2020-11-30 00:40:04 +01:00
parent 64ee8eab2f
commit f51207fc32
7 changed files with 83 additions and 21 deletions

View File

@@ -9,10 +9,11 @@ from django.db import transaction
from django.utils import timezone
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory
from .file_handling import create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class_for_mime_type
from .parsers import ParseError, get_parser_class_for_mime_type, \
get_supported_file_extensions
from .signals import (
document_consumption_finished,
document_consumption_started
@@ -39,6 +40,21 @@ class Consumer(LoggingMixin):
raise ConsumerError("Cannot consume {}: It is not a file".format(
self.path))
def pre_check_file_extension(self):
    """Refuse to consume files whose extension no parser supports.

    The extension of ``self.filename`` is checked against the collection
    returned by ``get_supported_file_extensions()``. The comparison
    ignores letter case, so an upper-case extension matches a supported
    extension that differs only in case.

    Raises:
        ConsumerError: If the filename has no extension, or if its
            extension is not among the supported extensions.
    """
    extensions = get_supported_file_extensions()
    _, ext = os.path.splitext(self.filename)

    if not ext:
        raise ConsumerError(
            f"Not consuming {self.filename}: File type unknown."
        )

    # Normalise both sides so the check does not depend on the case used
    # by the file on disk or by the announced extensions.
    known_extensions = {candidate.lower() for candidate in extensions}
    if ext.lower() not in known_extensions:
        raise ConsumerError(
            f"Not consuming {self.filename}: File extension {ext} does "
            f"not map to any known file type ({str(extensions)})"
        )
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
@@ -80,6 +96,7 @@ class Consumer(LoggingMixin):
# Make sure that preconditions for consuming the file are met.
self.pre_check_file_exists()
self.pre_check_file_extension()
self.pre_check_directories()
self.pre_check_duplicate()