mime type handling

This commit is contained in:
Jonas Winkler 2020-11-20 13:31:03 +01:00
parent bd45a804a7
commit 41650f20f4
19 changed files with 163 additions and 146 deletions

View File

@ -50,7 +50,7 @@ class DocumentTypeAdmin(admin.ModelAdmin):
class DocumentAdmin(admin.ModelAdmin): class DocumentAdmin(admin.ModelAdmin):
search_fields = ("correspondent__name", "title", "content", "tags__name") search_fields = ("correspondent__name", "title", "content", "tags__name")
readonly_fields = ("added", "file_type", "storage_type", "filename") readonly_fields = ("added", "mime_type", "storage_type", "filename")
list_display = ( list_display = (
"title", "title",
"created", "created",
@ -58,8 +58,7 @@ class DocumentAdmin(admin.ModelAdmin):
"correspondent", "correspondent",
"tags_", "tags_",
"archive_serial_number", "archive_serial_number",
"document_type", "document_type"
"filename"
) )
list_filter = ( list_filter = (
"document_type", "document_type",

View File

@ -2,8 +2,8 @@ import datetime
import hashlib import hashlib
import logging import logging
import os import os
import re
import magic
from django.conf import settings from django.conf import settings
from django.db import transaction from django.db import transaction
from django.utils import timezone from django.utils import timezone
@ -13,7 +13,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory from .file_handling import generate_filename, create_source_path_directory
from .loggers import LoggingMixin from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class from .parsers import ParseError, get_parser_class_for_mime_type
from .signals import ( from .signals import (
document_consumption_finished, document_consumption_finished,
document_consumption_started document_consumption_started
@ -51,12 +51,6 @@ class Consumer(LoggingMixin):
"Consumption directory {} does not exist".format( "Consumption directory {} does not exist".format(
settings.CONSUMPTION_DIR)) settings.CONSUMPTION_DIR))
def pre_check_regex(self):
if not re.match(FileInfo.REGEXES["title"], self.filename):
raise ConsumerError(
"Filename {} does not seem to be safe to "
"consume".format(self.filename))
def pre_check_duplicate(self): def pre_check_duplicate(self):
with open(self.path, "rb") as f: with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest() checksum = hashlib.md5(f.read()).hexdigest()
@ -100,18 +94,19 @@ class Consumer(LoggingMixin):
self.pre_check_file_exists() self.pre_check_file_exists()
self.pre_check_consumption_dir() self.pre_check_consumption_dir()
self.pre_check_directories() self.pre_check_directories()
self.pre_check_regex()
self.pre_check_duplicate() self.pre_check_duplicate()
self.log("info", "Consuming {}".format(self.filename)) self.log("info", "Consuming {}".format(self.filename))
# Determine the parser class. # Determine the parser class.
parser_class = get_parser_class(self.filename) mime_type = magic.from_file(self.path, mime=True)
parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class: if not parser_class:
raise ConsumerError("No parsers abvailable for {}".format(self.filename)) raise ConsumerError("No parsers abvailable for {}".format(self.filename))
else: else:
self.log("debug", "Parser: {}".format(parser_class.__name__)) self.log("debug", "Parser: {} based on mime type {}".format(parser_class.__name__, mime_type))
# Notify all listeners that we're going to do some work. # Notify all listeners that we're going to do some work.
@ -162,7 +157,8 @@ class Consumer(LoggingMixin):
# store the document. # store the document.
document = self._store( document = self._store(
text=text, text=text,
date=date date=date,
mime_type=mime_type
) )
# If we get here, it was successful. Proceed with post-consume # If we get here, it was successful. Proceed with post-consume
@ -197,7 +193,7 @@ class Consumer(LoggingMixin):
return document return document
def _store(self, text, date): def _store(self, text, date, mime_type):
# If someone gave us the original filename, use it instead of doc. # If someone gave us the original filename, use it instead of doc.
@ -220,7 +216,7 @@ class Consumer(LoggingMixin):
correspondent=file_info.correspondent, correspondent=file_info.correspondent,
title=file_info.title, title=file_info.title,
content=text, content=text,
file_type=file_info.extension, mime_type=mime_type,
checksum=hashlib.md5(f.read()).hexdigest(), checksum=hashlib.md5(f.read()).hexdigest(),
created=created, created=created,
modified=created, modified=created,

View File

@ -91,9 +91,9 @@ def generate_filename(document):
# Always append the primary key to guarantee uniqueness of filename # Always append the primary key to guarantee uniqueness of filename
if len(path) > 0: if len(path) > 0:
filename = "%s-%07i.%s" % (path, document.pk, document.file_type) filename = "%s-%07i%s" % (path, document.pk, document.file_type)
else: else:
filename = "%07i.%s" % (document.pk, document.file_type) filename = "%07i%s" % (document.pk, document.file_type)
# Append .gpg for encrypted files # Append .gpg for encrypted files
if document.storage_type == document.STORAGE_TYPE_GPG: if document.storage_type == document.STORAGE_TYPE_GPG:

View File

@ -127,8 +127,8 @@ class Command(Renderable, BaseCommand):
tags = ",".join([t.slug for t in doc.tags.all()]) tags = ",".join([t.slug for t in doc.tags.all()])
if tags: if tags:
return "{} - {} - {} - {}.{}".format( return "{} - {} - {} - {}{}".format(
created, doc.correspondent, doc.title, tags, doc.file_type) created, doc.correspondent, doc.title, tags, doc.file_type)
return "{} - {} - {}.{}".format( return "{} - {} - {}{}".format(
created, doc.correspondent, doc.title, doc.file_type) created, doc.correspondent, doc.title, doc.file_type)

View File

@ -0,0 +1,50 @@
# Generated by Django 3.1.3 on 2020-11-20 11:21
import os
import magic
from django.conf import settings
from django.db import migrations, models
# Historical models handed out by ``apps.get_model`` inside a migration only
# carry fields, not class-level constants such as
# ``Document.STORAGE_TYPE_GPG``, so the value is duplicated here.
STORAGE_TYPE_GPG = "gpg"


def source_path(self):
    """
    Return the path of a document's original file inside ORIGINALS_DIR.

    Stand-alone helper for this migration; ``self`` is a ``Document``
    instance and must still expose ``file_type``, i.e. this has to run
    before that column is removed.
    """
    if self.filename:
        fname = str(self.filename)
    else:
        # No stored filename: fall back to the primary-key based name.
        fname = "{:07}.{}".format(self.pk, self.file_type)
        # NOTE(review): nesting reconstructed - the suffix is assumed to be
        # appended only to the fallback name, since stored filenames appear
        # to include ".gpg" already. Confirm against the model property.
        if self.storage_type == STORAGE_TYPE_GPG:
            fname += ".gpg"

    return os.path.join(settings.ORIGINALS_DIR, fname)
def add_mime_types(apps, schema_editor):
    """
    Data migration step: fill ``mime_type`` for every existing document.

    The mime type is detected with ``magic.from_file`` on the file that
    ``source_path`` points to, then the document is saved.
    """
    # NOTE(review): for GPG-stored documents this inspects the file as it is
    # on disk, i.e. presumably the encrypted bytes - confirm that is intended.
    document_model = apps.get_model("documents", "Document")

    for doc in document_model.objects.all():
        doc.mime_type = magic.from_file(source_path(doc), mime=True)
        doc.save()
class Migration(migrations.Migration):
    """Replace the ``file_type`` column of ``Document`` with ``mime_type``."""

    dependencies = [("documents", "1002_auto_20201111_1105")]

    # The order of the operations matters: ``add_mime_types`` still reads
    # ``file_type`` (via ``source_path``), so that column may only be
    # dropped after the data step has run.
    operations = [
        # 1. Add the new column; "-" is only a placeholder for existing
        #    rows and is not kept as the field default.
        migrations.AddField(
            model_name="document",
            name="mime_type",
            field=models.CharField(
                default="-",
                editable=False,
                max_length=256,
            ),
            preserve_default=False,
        ),
        # 2. Detect and store the real mime type of every document.
        migrations.RunPython(add_mime_types),
        # 3. Drop the old extension-based column.
        migrations.RemoveField(model_name="document", name="file_type"),
    ]

View File

@ -1,6 +1,7 @@
# coding=utf-8 # coding=utf-8
import logging import logging
import mimetypes
import os import os
import re import re
from collections import OrderedDict from collections import OrderedDict
@ -113,18 +114,6 @@ class DocumentType(MatchingModel):
class Document(models.Model): class Document(models.Model):
# TODO: why do we need an explicit list
TYPE_PDF = "pdf"
TYPE_PNG = "png"
TYPE_JPG = "jpg"
TYPE_GIF = "gif"
TYPE_TIF = "tiff"
TYPE_TXT = "txt"
TYPE_CSV = "csv"
TYPE_MD = "md"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
TYPE_TXT, TYPE_CSV, TYPE_MD)
STORAGE_TYPE_UNENCRYPTED = "unencrypted" STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg" STORAGE_TYPE_GPG = "gpg"
STORAGE_TYPES = ( STORAGE_TYPES = (
@ -156,10 +145,9 @@ class Document(models.Model):
"primarily used for searching." "primarily used for searching."
) )
file_type = models.CharField( mime_type = models.CharField(
max_length=4, max_length=256,
editable=False, editable=False
choices=tuple([(t, t.upper()) for t in TYPES])
) )
tags = models.ManyToManyField( tags = models.ManyToManyField(
@ -223,7 +211,7 @@ class Document(models.Model):
if self.filename: if self.filename:
fname = str(self.filename) fname = str(self.filename)
else: else:
fname = "{:07}.{}".format(self.pk, self.file_type) fname = "{:07}{}".format(self.pk, self.file_type)
if self.storage_type == self.STORAGE_TYPE_GPG: if self.storage_type == self.STORAGE_TYPE_GPG:
fname += ".gpg" fname += ".gpg"
@ -238,7 +226,11 @@ class Document(models.Model):
@property @property
def file_name(self): def file_name(self):
return slugify(str(self)) + "." + self.file_type return slugify(str(self)) + self.file_type
@property
def file_type(self):
    """
    Return the file extension guessed from the stored mime type.

    The extension includes the leading dot (e.g. ``".pdf"``). An empty
    string is returned when no extension is known for the mime type, so
    callers can always concatenate the result into a file name.
    """
    # guess_extension() yields None for unknown types; normalise to "".
    return mimetypes.guess_extension(str(self.mime_type)) or ""
@property @property
def thumbnail_path(self): def thumbnail_path(self):

View File

@ -6,6 +6,7 @@ import subprocess
import tempfile import tempfile
import dateparser import dateparser
import magic
from django.conf import settings from django.conf import settings
from django.utils import timezone from django.utils import timezone
@ -37,10 +38,11 @@ DATE_REGEX = re.compile(
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def get_parser_class(doc): def is_mime_type_supported(mime_type):
""" return get_parser_class_for_mime_type(mime_type) is not None
Determine the appropriate parser class based on the file
"""
def get_parser_class_for_mime_type(mime_type):
options = [] options = []
@ -48,9 +50,9 @@ def get_parser_class(doc):
for response in document_consumer_declaration.send(None): for response in document_consumer_declaration.send(None):
parser_declaration = response[1] parser_declaration = response[1]
parser_test = parser_declaration["test"] supported_mime_types = parser_declaration["mime_types"]
if parser_test(doc): if mime_type in supported_mime_types:
options.append(parser_declaration) options.append(parser_declaration)
if not options: if not options:
@ -61,6 +63,16 @@ def get_parser_class(doc):
options, key=lambda _: _["weight"], reverse=True)[0]["parser"] options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
def get_parser_class(path):
    """
    Pick the parser class for the file at ``path``.

    The file's mime type is detected with ``magic.from_file`` and the
    lookup is delegated to ``get_parser_class_for_mime_type``, whose
    result is returned unchanged.
    """
    return get_parser_class_for_mime_type(magic.from_file(path, mime=True))
def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None): def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
environment = os.environ.copy() environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT: if settings.CONVERT_MEMORY_LIMIT:

View File

@ -91,7 +91,7 @@ class DocumentSerializer(serializers.ModelSerializer):
"document_type_id", "document_type_id",
"title", "title",
"content", "content",
"file_type", "mime_type",
"tags", "tags",
"tags_id", "tags_id",
"checksum", "checksum",

View File

@ -45,7 +45,7 @@ class DocumentApiTest(APITestCase):
dt = DocumentType.objects.create(name="dt", pk=63) dt = DocumentType.objects.create(name="dt", pk=63)
tag = Tag.objects.create(name="t", pk=85) tag = Tag.objects.create(name="t", pk=85)
doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123") doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123", mime_type="application/pdf")
doc.tags.add(tag) doc.tags.add(tag)
@ -95,7 +95,7 @@ class DocumentApiTest(APITestCase):
with open(filename, "wb") as f: with open(filename, "wb") as f:
f.write(content) f.write(content)
doc = Document.objects.create(title="none", filename=os.path.basename(filename), file_type="pdf") doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf")
with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f: with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
f.write(content_thumbnail) f.write(content_thumbnail)
@ -117,7 +117,7 @@ class DocumentApiTest(APITestCase):
def test_document_actions_not_existing_file(self): def test_document_actions_not_existing_file(self):
doc = Document.objects.create(title="none", filename=os.path.basename("asd"), file_type="pdf") doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf")
response = self.client.get('/api/documents/{}/download/'.format(doc.pk)) response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
self.assertEqual(response.status_code, 404) self.assertEqual(response.status_code, 404)
@ -130,9 +130,9 @@ class DocumentApiTest(APITestCase):
def test_document_filters(self): def test_document_filters(self):
doc1 = Document.objects.create(title="none1", checksum="A") doc1 = Document.objects.create(title="none1", checksum="A", mime_type="application/pdf")
doc2 = Document.objects.create(title="none2", checksum="B") doc2 = Document.objects.create(title="none2", checksum="B", mime_type="application/pdf")
doc3 = Document.objects.create(title="none3", checksum="C") doc3 = Document.objects.create(title="none3", checksum="C", mime_type="application/pdf")
tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True) tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
tag_2 = Tag.objects.create(name="t2") tag_2 = Tag.objects.create(name="t2")

View File

@ -437,6 +437,18 @@ class FaultyParser(DocumentParser):
raise ParseError("Does not compute.") raise ParseError("Does not compute.")
def fake_magic_from_file(file, mime=False):
    """
    Test stand-in for ``magic.from_file`` that never opens the file.

    With ``mime=True`` the answer depends only on the file extension:
    ``.pdf`` maps to ``application/pdf``, anything else to ``unknown``.
    Without ``mime`` a fixed description string is returned.
    """
    if not mime:
        return "A verbose string that describes the contents of the file"

    _, extension = os.path.splitext(file)
    return "application/pdf" if extension == ".pdf" else "unknown"
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(TestCase): class TestConsumer(TestCase):
def make_dummy_parser(self, path, logging_group): def make_dummy_parser(self, path, logging_group):
@ -462,7 +474,7 @@ class TestConsumer(TestCase):
m = patcher.start() m = patcher.start()
m.return_value = [(None, { m.return_value = [(None, {
"parser": self.make_dummy_parser, "parser": self.make_dummy_parser,
"test": lambda _: True, "mime_types": ["application/pdf"],
"weight": 0 "weight": 0
})] })]
@ -592,7 +604,7 @@ class TestConsumer(TestCase):
def testFaultyParser(self, m): def testFaultyParser(self, m):
m.return_value = [(None, { m.return_value = [(None, {
"parser": self.make_faulty_parser, "parser": self.make_faulty_parser,
"test": lambda _: True, "mime_types": ["application/pdf"],
"weight": 0 "weight": 0
})] })]

View File

@ -13,9 +13,12 @@ class TestDocument(TestCase):
title="Title", title="Title",
content="content", content="content",
checksum="checksum", checksum="checksum",
mime_type="application/pdf"
) )
file_path = document.source_path file_path = document.source_path
thumb_path = document.thumbnail_path thumb_path = document.thumbnail_path
with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink: with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
document.delete() document.delete()
mock_unlink.assert_any_call(file_path) mock_unlink.assert_any_call(file_path)

View File

@ -31,7 +31,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="") @override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_generate_source_filename(self): def test_generate_source_filename(self):
document = Document() document = Document()
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save() document.save()
@ -44,7 +44,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming(self): def test_file_renaming(self):
document = Document() document = Document()
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save() document.save()
@ -81,7 +81,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_missing_permissions(self): def test_file_renaming_missing_permissions(self):
document = Document() document = Document()
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save() document.save()
@ -111,10 +111,10 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_database_error(self): def test_file_renaming_database_error(self):
document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA") document1 = Document.objects.create(mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
document = Document() document = Document()
document.file_type = "pdf" document.mime_type = "application/pdf"
document.checksum = "BBBBB" document.checksum = "BBBBB"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save() document.save()
@ -149,7 +149,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_document_delete(self): def test_document_delete(self):
document = Document() document = Document()
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save() document.save()
@ -170,7 +170,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_document_delete_nofile(self): def test_document_delete_nofile(self):
document = Document() document = Document()
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save() document.save()
@ -179,7 +179,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_directory_not_empty(self): def test_directory_not_empty(self):
document = Document() document = Document()
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save() document.save()
@ -206,7 +206,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_underscore(self): def test_tags_with_underscore(self):
document = Document() document = Document()
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save() document.save()
@ -222,7 +222,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_dash(self): def test_tags_with_dash(self):
document = Document() document = Document()
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save() document.save()
@ -238,7 +238,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_malformed(self): def test_tags_malformed(self):
document = Document() document = Document()
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save() document.save()
@ -254,7 +254,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}") @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
def test_tags_all(self): def test_tags_all(self):
document = Document() document = Document()
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save() document.save()
@ -269,7 +269,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}") @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}")
def test_tags_out_of_bounds(self): def test_tags_out_of_bounds(self):
document = Document() document = Document()
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save() document.save()
@ -284,7 +284,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}") @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
def test_nested_directory_cleanup(self): def test_nested_directory_cleanup(self):
document = Document() document = Document()
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save() document.save()
@ -309,7 +309,7 @@ class TestDate(TestCase):
def test_format_none(self): def test_format_none(self):
document = Document() document = Document()
document.pk = 1 document.pk = 1
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), "0000001.pdf") self.assertEqual(generate_filename(document), "0000001.pdf")
@ -335,7 +335,7 @@ class TestDate(TestCase):
def test_invalid_format(self): def test_invalid_format(self):
document = Document() document = Document()
document.pk = 1 document.pk = 1
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), "0000001.pdf") self.assertEqual(generate_filename(document), "0000001.pdf")
@ -344,7 +344,7 @@ class TestDate(TestCase):
def test_invalid_format_key(self): def test_invalid_format_key(self):
document = Document() document = Document()
document.pk = 1 document.pk = 1
document.file_type = "pdf" document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), "0000001.pdf") self.assertEqual(generate_filename(document), "0000001.pdf")

View File

@ -213,7 +213,7 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
TestCase.setUp(self) TestCase.setUp(self)
User.objects.create_user(username='test_consumer', password='12345') User.objects.create_user(username='test_consumer', password='12345')
self.doc_contains = Document.objects.create( self.doc_contains = Document.objects.create(
content="I contain the keyword.", file_type="pdf") content="I contain the keyword.", mime_type="application/pdf")
def test_tag_applied_any(self): def test_tag_applied_any(self):
t1 = Tag.objects.create( t1 = Tag.objects.create(

View File

@ -1,3 +1,4 @@
import os
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from unittest import mock from unittest import mock
@ -5,7 +6,18 @@ from django.test import TestCase
from documents.parsers import get_parser_class from documents.parsers import get_parser_class
def fake_magic_from_file(file, mime=False):
    """
    Replacement for ``magic.from_file`` used by the parser discovery tests.

    Only the extension of ``file`` is looked at. In mime mode ``.pdf``
    yields ``application/pdf`` and every other name yields ``unknown``;
    otherwise a constant descriptive string is returned.
    """
    if not mime:
        return "A verbose string that describes the contents of the file"

    suffix = os.path.splitext(file)[1]
    if suffix == ".pdf":
        return "application/pdf"
    return "unknown"
@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
class TestParserDiscovery(TestCase): class TestParserDiscovery(TestCase):
@mock.patch("documents.parsers.document_consumer_declaration.send") @mock.patch("documents.parsers.document_consumer_declaration.send")
@ -14,7 +26,7 @@ class TestParserDiscovery(TestCase):
pass pass
m.return_value = ( m.return_value = (
(None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}), (None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}),
) )
self.assertEqual( self.assertEqual(
@ -32,8 +44,8 @@ class TestParserDiscovery(TestCase):
pass pass
m.return_value = ( m.return_value = (
(None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}), (None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}),
(None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}), (None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}),
) )
self.assertEqual( self.assertEqual(

View File

@ -104,18 +104,6 @@ class DocumentViewSet(RetrieveModelMixin,
return super(DocumentViewSet, self).destroy(request, *args, **kwargs) return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
def file_response(self, pk, disposition): def file_response(self, pk, disposition):
# TODO: this should not be necessary here.
content_types = {
Document.TYPE_PDF: "application/pdf",
Document.TYPE_PNG: "image/png",
Document.TYPE_JPG: "image/jpeg",
Document.TYPE_GIF: "image/gif",
Document.TYPE_TIF: "image/tiff",
Document.TYPE_CSV: "text/csv",
Document.TYPE_MD: "text/markdown",
Document.TYPE_TXT: "text/plain"
}
doc = Document.objects.get(id=pk) doc = Document.objects.get(id=pk)
if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
@ -123,7 +111,7 @@ class DocumentViewSet(RetrieveModelMixin,
else: else:
file_handle = GnuPG.decrypted(doc.source_file) file_handle = GnuPG.decrypted(doc.source_file)
response = HttpResponse(file_handle, content_type=content_types[doc.file_type]) response = HttpResponse(file_handle, content_type=doc.mime_type)
response["Content-Disposition"] = '{}; filename="{}"'.format( response["Content-Disposition"] = '{}; filename="{}"'.format(
disposition, doc.file_name) disposition, doc.file_name)
return response return response

View File

@ -10,6 +10,7 @@ from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
from documents.loggers import LoggingMixin from documents.loggers import LoggingMixin
from documents.models import Correspondent from documents.models import Correspondent
from documents.parsers import is_mime_type_supported
from paperless_mail.models import MailAccount, MailRule from paperless_mail.models import MailAccount, MailRule
@ -249,8 +250,7 @@ class MailAccountHandler(LoggingMixin):
title = get_title(message, att, rule) title = get_title(message, att, rule)
# TODO: check with parsers what files types are supported if is_mime_type_supported(att.content_type):
if att.content_type == 'application/pdf':
os.makedirs(settings.SCRATCH_DIR, exist_ok=True) os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
_, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR) _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)

View File

@ -1,5 +1,3 @@
import re
from .parsers import RasterisedDocumentParser from .parsers import RasterisedDocumentParser
@ -7,12 +5,9 @@ def tesseract_consumer_declaration(sender, **kwargs):
return { return {
"parser": RasterisedDocumentParser, "parser": RasterisedDocumentParser,
"weight": 0, "weight": 0,
"test": tesseract_consumer_test "mime_types": [
"application/pdf",
"image/jpeg",
"image/png"
]
} }
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
def tesseract_consumer_test(doc):
return MATCHING_FILES.match(doc.lower())

View File

@ -1,36 +0,0 @@
from django.test import TestCase
from paperless_tesseract.signals import tesseract_consumer_test
class SignalsTestCase(TestCase):
def test_test_handles_various_file_names_true(self):
prefixes = (
"doc", "My Document", "Μυ Γρεεκ Δοψθμεντ", "Doc -with - tags",
"A document with a . in it", "Doc with -- in it"
)
suffixes = (
"pdf", "jpg", "jpeg", "gif", "png", "tiff", "tif", "pnm", "bmp",
"PDF", "JPG", "JPEG", "GIF", "PNG", "TIFF", "TIF", "PNM", "BMP",
"pDf", "jPg", "jpEg", "gIf", "pNg", "tIff", "tIf", "pNm", "bMp",
)
for prefix in prefixes:
for suffix in suffixes:
name = "{}.{}".format(prefix, suffix)
self.assertTrue(tesseract_consumer_test(name))
def test_test_handles_various_file_names_false(self):
prefixes = ("doc",)
suffixes = ("txt", "markdown", "",)
for prefix in prefixes:
for suffix in suffixes:
name = "{}.{}".format(prefix, suffix)
self.assertFalse(tesseract_consumer_test(name))
self.assertFalse(tesseract_consumer_test(""))
self.assertFalse(tesseract_consumer_test("doc"))

View File

@ -1,5 +1,3 @@
import re
from .parsers import TextDocumentParser from .parsers import TextDocumentParser
@ -7,12 +5,8 @@ def text_consumer_declaration(sender, **kwargs):
return { return {
"parser": TextDocumentParser, "parser": TextDocumentParser,
"weight": 10, "weight": 10,
"test": text_consumer_test "mime_types": [
"text/plain",
"text/comma-separated-values"
]
} }
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
def text_consumer_test(doc):
return MATCHING_FILES.match(doc.lower())