mime type handling

This commit is contained in:
Jonas Winkler 2020-11-20 13:31:03 +01:00
parent bd45a804a7
commit 41650f20f4
19 changed files with 163 additions and 146 deletions

View File

@ -50,7 +50,7 @@ class DocumentTypeAdmin(admin.ModelAdmin):
class DocumentAdmin(admin.ModelAdmin):
search_fields = ("correspondent__name", "title", "content", "tags__name")
readonly_fields = ("added", "file_type", "storage_type", "filename")
readonly_fields = ("added", "mime_type", "storage_type", "filename")
list_display = (
"title",
"created",
@ -58,8 +58,7 @@ class DocumentAdmin(admin.ModelAdmin):
"correspondent",
"tags_",
"archive_serial_number",
"document_type",
"filename"
"document_type"
)
list_filter = (
"document_type",

View File

@ -2,8 +2,8 @@ import datetime
import hashlib
import logging
import os
import re
import magic
from django.conf import settings
from django.db import transaction
from django.utils import timezone
@ -13,7 +13,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class
from .parsers import ParseError, get_parser_class_for_mime_type
from .signals import (
document_consumption_finished,
document_consumption_started
@ -51,12 +51,6 @@ class Consumer(LoggingMixin):
"Consumption directory {} does not exist".format(
settings.CONSUMPTION_DIR))
def pre_check_regex(self):
if not re.match(FileInfo.REGEXES["title"], self.filename):
raise ConsumerError(
"Filename {} does not seem to be safe to "
"consume".format(self.filename))
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
@ -100,18 +94,19 @@ class Consumer(LoggingMixin):
self.pre_check_file_exists()
self.pre_check_consumption_dir()
self.pre_check_directories()
self.pre_check_regex()
self.pre_check_duplicate()
self.log("info", "Consuming {}".format(self.filename))
# Determine the parser class.
parser_class = get_parser_class(self.filename)
mime_type = magic.from_file(self.path, mime=True)
parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class:
raise ConsumerError("No parsers abvailable for {}".format(self.filename))
else:
self.log("debug", "Parser: {}".format(parser_class.__name__))
self.log("debug", "Parser: {} based on mime type {}".format(parser_class.__name__, mime_type))
# Notify all listeners that we're going to do some work.
@ -162,7 +157,8 @@ class Consumer(LoggingMixin):
# store the document.
document = self._store(
text=text,
date=date
date=date,
mime_type=mime_type
)
# If we get here, it was successful. Proceed with post-consume
@ -197,7 +193,7 @@ class Consumer(LoggingMixin):
return document
def _store(self, text, date):
def _store(self, text, date, mime_type):
# If someone gave us the original filename, use it instead of doc.
@ -220,7 +216,7 @@ class Consumer(LoggingMixin):
correspondent=file_info.correspondent,
title=file_info.title,
content=text,
file_type=file_info.extension,
mime_type=mime_type,
checksum=hashlib.md5(f.read()).hexdigest(),
created=created,
modified=created,

View File

@ -91,9 +91,9 @@ def generate_filename(document):
# Always append the primary key to guarantee uniqueness of filename
if len(path) > 0:
filename = "%s-%07i.%s" % (path, document.pk, document.file_type)
filename = "%s-%07i%s" % (path, document.pk, document.file_type)
else:
filename = "%07i.%s" % (document.pk, document.file_type)
filename = "%07i%s" % (document.pk, document.file_type)
# Append .gpg for encrypted files
if document.storage_type == document.STORAGE_TYPE_GPG:

View File

@ -127,8 +127,8 @@ class Command(Renderable, BaseCommand):
tags = ",".join([t.slug for t in doc.tags.all()])
if tags:
return "{} - {} - {} - {}.{}".format(
return "{} - {} - {} - {}{}".format(
created, doc.correspondent, doc.title, tags, doc.file_type)
return "{} - {} - {}.{}".format(
return "{} - {} - {}{}".format(
created, doc.correspondent, doc.title, doc.file_type)

View File

@ -0,0 +1,50 @@
# Generated by Django 3.1.3 on 2020-11-20 11:21
import os
import magic
from django.conf import settings
from django.db import migrations, models
def source_path(self):
    """Return the path of a document's original file inside ORIGINALS_DIR.

    Stand-alone helper for the data migration below; ``self`` is a document
    row exposing ``filename``, ``pk``, ``file_type`` and ``storage_type``.

    When the row has a stored ``filename`` it is used as-is. Otherwise the
    legacy name ``<zero-padded pk>.<file_type>`` is rebuilt, with ``.gpg``
    appended for GPG-encrypted documents.
    """
    if self.filename:
        fname = str(self.filename)
    else:
        fname = "{:07}.{}".format(self.pk, self.file_type)
        # Models obtained through apps.get_model() in a migration are
        # historical models and do not carry custom class attributes such as
        # STORAGE_TYPE_GPG, so fall back to the constant's literal value
        # instead of raising AttributeError.
        gpg_storage_type = getattr(self, "STORAGE_TYPE_GPG", "gpg")
        if self.storage_type == gpg_storage_type:
            fname += ".gpg"
    return os.path.join(
        settings.ORIGINALS_DIR,
        fname
    )
def add_mime_types(apps, schema_editor):
    """Fill in ``mime_type`` for every existing document.

    The mime type is detected with ``magic.from_file`` on the path returned by
    ``source_path`` for each row, and the row is saved afterwards.
    ``schema_editor`` is accepted because RunPython passes it; it is not used.
    """
    document_model = apps.get_model("documents", "Document")
    for doc in document_model.objects.all():
        detected = magic.from_file(source_path(doc), mime=True)
        doc.mime_type = detected
        doc.save()
class Migration(migrations.Migration):
    """Replace the ``file_type`` column of documents with ``mime_type``."""

    dependencies = [
        ("documents", "1002_auto_20201111_1105"),
    ]

    operations = [
        # Step 1: add the new column; "-" is only a placeholder for existing
        # rows and is not kept as the field default (preserve_default=False).
        migrations.AddField(
            model_name="document",
            name="mime_type",
            field=models.CharField(
                default="-",
                editable=False,
                max_length=256,
            ),
            preserve_default=False,
        ),
        # Step 2: detect the real mime type of every stored file. This has to
        # run before file_type is dropped, because source_path() still reads
        # file_type to rebuild legacy file names.
        migrations.RunPython(add_mime_types),
        # Step 3: drop the old extension-based column.
        migrations.RemoveField(
            model_name="document",
            name="file_type",
        ),
    ]

View File

@ -1,6 +1,7 @@
# coding=utf-8
import logging
import mimetypes
import os
import re
from collections import OrderedDict
@ -113,18 +114,6 @@ class DocumentType(MatchingModel):
class Document(models.Model):
# TODO: why do we need an explicit list
TYPE_PDF = "pdf"
TYPE_PNG = "png"
TYPE_JPG = "jpg"
TYPE_GIF = "gif"
TYPE_TIF = "tiff"
TYPE_TXT = "txt"
TYPE_CSV = "csv"
TYPE_MD = "md"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
TYPE_TXT, TYPE_CSV, TYPE_MD)
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"
STORAGE_TYPES = (
@ -156,10 +145,9 @@ class Document(models.Model):
"primarily used for searching."
)
file_type = models.CharField(
max_length=4,
editable=False,
choices=tuple([(t, t.upper()) for t in TYPES])
mime_type = models.CharField(
max_length=256,
editable=False
)
tags = models.ManyToManyField(
@ -223,7 +211,7 @@ class Document(models.Model):
if self.filename:
fname = str(self.filename)
else:
fname = "{:07}.{}".format(self.pk, self.file_type)
fname = "{:07}{}".format(self.pk, self.file_type)
if self.storage_type == self.STORAGE_TYPE_GPG:
fname += ".gpg"
@ -238,7 +226,11 @@ class Document(models.Model):
@property
def file_name(self):
return slugify(str(self)) + "." + self.file_type
return slugify(str(self)) + self.file_type
@property
def file_type(self):
    """Return the file extension for this document's mime type.

    The extension includes its leading dot (e.g. ``".pdf"``) and is guessed
    with ``mimetypes.guess_extension``. An empty string is returned when the
    mime type is unknown, so that callers which concatenate the value or
    format it into a file name never receive ``None``.
    """
    extension = mimetypes.guess_extension(str(self.mime_type))
    # guess_extension() yields None for types it cannot map.
    if extension is None:
        return ""
    return extension
@property
def thumbnail_path(self):

View File

@ -6,6 +6,7 @@ import subprocess
import tempfile
import dateparser
import magic
from django.conf import settings
from django.utils import timezone
@ -37,10 +38,11 @@ DATE_REGEX = re.compile(
logger = logging.getLogger(__name__)
def get_parser_class(doc):
"""
Determine the appropriate parser class based on the file
"""
def is_mime_type_supported(mime_type):
    """Return True when a parser class is found for ``mime_type``."""
    parser_class = get_parser_class_for_mime_type(mime_type)
    return parser_class is not None
def get_parser_class_for_mime_type(mime_type):
options = []
@ -48,9 +50,9 @@ def get_parser_class(doc):
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
parser_test = parser_declaration["test"]
supported_mime_types = parser_declaration["mime_types"]
if parser_test(doc):
if mime_type in supported_mime_types:
options.append(parser_declaration)
if not options:
@ -61,6 +63,16 @@ def get_parser_class(doc):
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
def get_parser_class(path):
    """
    Determine the appropriate parser class for the file at ``path``.

    The file's mime type is detected with ``magic.from_file`` and the lookup
    is delegated to ``get_parser_class_for_mime_type``.
    """
    detected_mime_type = magic.from_file(path, mime=True)
    parser_class = get_parser_class_for_mime_type(detected_mime_type)
    return parser_class
def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:

View File

@ -91,7 +91,7 @@ class DocumentSerializer(serializers.ModelSerializer):
"document_type_id",
"title",
"content",
"file_type",
"mime_type",
"tags",
"tags_id",
"checksum",

View File

@ -45,7 +45,7 @@ class DocumentApiTest(APITestCase):
dt = DocumentType.objects.create(name="dt", pk=63)
tag = Tag.objects.create(name="t", pk=85)
doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123")
doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123", mime_type="application/pdf")
doc.tags.add(tag)
@ -95,7 +95,7 @@ class DocumentApiTest(APITestCase):
with open(filename, "wb") as f:
f.write(content)
doc = Document.objects.create(title="none", filename=os.path.basename(filename), file_type="pdf")
doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf")
with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
f.write(content_thumbnail)
@ -117,7 +117,7 @@ class DocumentApiTest(APITestCase):
def test_document_actions_not_existing_file(self):
doc = Document.objects.create(title="none", filename=os.path.basename("asd"), file_type="pdf")
doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf")
response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
self.assertEqual(response.status_code, 404)
@ -130,9 +130,9 @@ class DocumentApiTest(APITestCase):
def test_document_filters(self):
doc1 = Document.objects.create(title="none1", checksum="A")
doc2 = Document.objects.create(title="none2", checksum="B")
doc3 = Document.objects.create(title="none3", checksum="C")
doc1 = Document.objects.create(title="none1", checksum="A", mime_type="application/pdf")
doc2 = Document.objects.create(title="none2", checksum="B", mime_type="application/pdf")
doc3 = Document.objects.create(title="none3", checksum="C", mime_type="application/pdf")
tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
tag_2 = Tag.objects.create(name="t2")

View File

@ -437,6 +437,18 @@ class FaultyParser(DocumentParser):
raise ParseError("Does not compute.")
def fake_magic_from_file(file, mime=False):
    """Stand-in for ``magic.from_file`` that looks only at the file name.

    With ``mime`` set, names ending in ``.pdf`` are reported as
    ``application/pdf`` and everything else as ``unknown``. Without it, a
    fixed descriptive string is returned.
    """
    if not mime:
        return "A verbose string that describes the contents of the file"

    _, extension = os.path.splitext(file)
    return "application/pdf" if extension == ".pdf" else "unknown"
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(TestCase):
def make_dummy_parser(self, path, logging_group):
@ -462,7 +474,7 @@ class TestConsumer(TestCase):
m = patcher.start()
m.return_value = [(None, {
"parser": self.make_dummy_parser,
"test": lambda _: True,
"mime_types": ["application/pdf"],
"weight": 0
})]
@ -592,7 +604,7 @@ class TestConsumer(TestCase):
def testFaultyParser(self, m):
m.return_value = [(None, {
"parser": self.make_faulty_parser,
"test": lambda _: True,
"mime_types": ["application/pdf"],
"weight": 0
})]

View File

@ -13,9 +13,12 @@ class TestDocument(TestCase):
title="Title",
content="content",
checksum="checksum",
mime_type="application/pdf"
)
file_path = document.source_path
thumb_path = document.thumbnail_path
with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
document.delete()
mock_unlink.assert_any_call(file_path)

View File

@ -31,7 +31,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_generate_source_filename(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@ -44,7 +44,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@ -81,7 +81,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_missing_permissions(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@ -111,10 +111,10 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_database_error(self):
document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
document1 = Document.objects.create(mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.checksum = "BBBBB"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@ -149,7 +149,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_document_delete(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@ -170,7 +170,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_document_delete_nofile(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@ -179,7 +179,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_directory_not_empty(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@ -206,7 +206,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_underscore(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@ -222,7 +222,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_dash(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@ -238,7 +238,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_malformed(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@ -254,7 +254,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
def test_tags_all(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@ -269,7 +269,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}")
def test_tags_out_of_bounds(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@ -284,7 +284,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
def test_nested_directory_cleanup(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@ -309,7 +309,7 @@ class TestDate(TestCase):
def test_format_none(self):
document = Document()
document.pk = 1
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), "0000001.pdf")
@ -335,7 +335,7 @@ class TestDate(TestCase):
def test_invalid_format(self):
document = Document()
document.pk = 1
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), "0000001.pdf")
@ -344,7 +344,7 @@ class TestDate(TestCase):
def test_invalid_format_key(self):
document = Document()
document.pk = 1
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), "0000001.pdf")

View File

@ -213,7 +213,7 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
TestCase.setUp(self)
User.objects.create_user(username='test_consumer', password='12345')
self.doc_contains = Document.objects.create(
content="I contain the keyword.", file_type="pdf")
content="I contain the keyword.", mime_type="application/pdf")
def test_tag_applied_any(self):
t1 = Tag.objects.create(

View File

@ -1,3 +1,4 @@
import os
from tempfile import TemporaryDirectory
from unittest import mock
@ -5,7 +6,18 @@ from django.test import TestCase
from documents.parsers import get_parser_class
def fake_magic_from_file(file, mime=False):
    """Replacement for ``magic.from_file`` used while testing parser discovery.

    Detection is based purely on the extension of ``file``: ``.pdf`` maps to
    ``application/pdf`` and any other extension to ``unknown``. When ``mime``
    is false a constant description string is returned instead.
    """
    if mime:
        mime_by_extension = {".pdf": "application/pdf"}
        suffix = os.path.splitext(file)[1]
        return mime_by_extension.get(suffix, "unknown")
    return "A verbose string that describes the contents of the file"
@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
class TestParserDiscovery(TestCase):
@mock.patch("documents.parsers.document_consumer_declaration.send")
@ -14,7 +26,7 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
(None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
(None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}),
)
self.assertEqual(
@ -32,8 +44,8 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
(None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
(None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
(None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}),
(None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}),
)
self.assertEqual(

View File

@ -104,18 +104,6 @@ class DocumentViewSet(RetrieveModelMixin,
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
def file_response(self, pk, disposition):
# TODO: this should not be necessary here.
content_types = {
Document.TYPE_PDF: "application/pdf",
Document.TYPE_PNG: "image/png",
Document.TYPE_JPG: "image/jpeg",
Document.TYPE_GIF: "image/gif",
Document.TYPE_TIF: "image/tiff",
Document.TYPE_CSV: "text/csv",
Document.TYPE_MD: "text/markdown",
Document.TYPE_TXT: "text/plain"
}
doc = Document.objects.get(id=pk)
if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
@ -123,7 +111,7 @@ class DocumentViewSet(RetrieveModelMixin,
else:
file_handle = GnuPG.decrypted(doc.source_file)
response = HttpResponse(file_handle, content_type=content_types[doc.file_type])
response = HttpResponse(file_handle, content_type=doc.mime_type)
response["Content-Disposition"] = '{}; filename="{}"'.format(
disposition, doc.file_name)
return response

View File

@ -10,6 +10,7 @@ from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
from documents.loggers import LoggingMixin
from documents.models import Correspondent
from documents.parsers import is_mime_type_supported
from paperless_mail.models import MailAccount, MailRule
@ -249,8 +250,7 @@ class MailAccountHandler(LoggingMixin):
title = get_title(message, att, rule)
# TODO: check with parsers what files types are supported
if att.content_type == 'application/pdf':
if is_mime_type_supported(att.content_type):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
_, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)

View File

@ -1,5 +1,3 @@
import re
from .parsers import RasterisedDocumentParser
@ -7,12 +5,9 @@ def tesseract_consumer_declaration(sender, **kwargs):
return {
"parser": RasterisedDocumentParser,
"weight": 0,
"test": tesseract_consumer_test
"mime_types": [
"application/pdf",
"image/jpeg",
"image/png"
]
}
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
def tesseract_consumer_test(doc):
return MATCHING_FILES.match(doc.lower())

View File

@ -1,36 +0,0 @@
from django.test import TestCase
from paperless_tesseract.signals import tesseract_consumer_test
class SignalsTestCase(TestCase):
    """Exercises ``tesseract_consumer_test`` with accepted and rejected names."""

    def test_test_handles_various_file_names_true(self):
        # Each stem is combined with each extension, in lower, upper and
        # mixed case; all of the resulting names must be accepted.
        stems = (
            "doc", "My Document", "Μυ Γρεεκ Δοψθμεντ", "Doc -with - tags",
            "A document with a . in it", "Doc with -- in it"
        )
        extensions = (
            "pdf", "jpg", "jpeg", "gif", "png", "tiff", "tif", "pnm", "bmp",
            "PDF", "JPG", "JPEG", "GIF", "PNG", "TIFF", "TIF", "PNM", "BMP",
            "pDf", "jPg", "jpEg", "gIf", "pNg", "tIff", "tIf", "pNm", "bMp",
        )

        for stem in stems:
            for extension in extensions:
                candidate = "{}.{}".format(stem, extension)
                self.assertTrue(tesseract_consumer_test(candidate))

    def test_test_handles_various_file_names_false(self):
        # Text-like extensions and an empty extension must be rejected.
        stems = ("doc",)
        extensions = ("txt", "markdown", "",)

        for stem in stems:
            for extension in extensions:
                candidate = "{}.{}".format(stem, extension)
                self.assertFalse(tesseract_consumer_test(candidate))

        # Names without any extension, including the empty string, are
        # rejected as well.
        self.assertFalse(tesseract_consumer_test(""))
        self.assertFalse(tesseract_consumer_test("doc"))

View File

@ -1,5 +1,3 @@
import re
from .parsers import TextDocumentParser
@ -7,12 +5,8 @@ def text_consumer_declaration(sender, **kwargs):
return {
"parser": TextDocumentParser,
"weight": 10,
"test": text_consumer_test
"mime_types": [
"text/plain",
"text/comma-separated-values"
]
}
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
def text_consumer_test(doc):
return MATCHING_FILES.match(doc.lower())