mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
mime type handling
This commit is contained in:
parent
bd45a804a7
commit
41650f20f4
@ -50,7 +50,7 @@ class DocumentTypeAdmin(admin.ModelAdmin):
|
||||
class DocumentAdmin(admin.ModelAdmin):
|
||||
|
||||
search_fields = ("correspondent__name", "title", "content", "tags__name")
|
||||
readonly_fields = ("added", "file_type", "storage_type", "filename")
|
||||
readonly_fields = ("added", "mime_type", "storage_type", "filename")
|
||||
list_display = (
|
||||
"title",
|
||||
"created",
|
||||
@ -58,8 +58,7 @@ class DocumentAdmin(admin.ModelAdmin):
|
||||
"correspondent",
|
||||
"tags_",
|
||||
"archive_serial_number",
|
||||
"document_type",
|
||||
"filename"
|
||||
"document_type"
|
||||
)
|
||||
list_filter = (
|
||||
"document_type",
|
||||
|
@ -2,8 +2,8 @@ import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
import magic
|
||||
from django.conf import settings
|
||||
from django.db import transaction
|
||||
from django.utils import timezone
|
||||
@ -13,7 +13,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
|
||||
from .file_handling import generate_filename, create_source_path_directory
|
||||
from .loggers import LoggingMixin
|
||||
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
|
||||
from .parsers import ParseError, get_parser_class
|
||||
from .parsers import ParseError, get_parser_class_for_mime_type
|
||||
from .signals import (
|
||||
document_consumption_finished,
|
||||
document_consumption_started
|
||||
@ -51,12 +51,6 @@ class Consumer(LoggingMixin):
|
||||
"Consumption directory {} does not exist".format(
|
||||
settings.CONSUMPTION_DIR))
|
||||
|
||||
def pre_check_regex(self):
|
||||
if not re.match(FileInfo.REGEXES["title"], self.filename):
|
||||
raise ConsumerError(
|
||||
"Filename {} does not seem to be safe to "
|
||||
"consume".format(self.filename))
|
||||
|
||||
def pre_check_duplicate(self):
|
||||
with open(self.path, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
@ -100,18 +94,19 @@ class Consumer(LoggingMixin):
|
||||
self.pre_check_file_exists()
|
||||
self.pre_check_consumption_dir()
|
||||
self.pre_check_directories()
|
||||
self.pre_check_regex()
|
||||
self.pre_check_duplicate()
|
||||
|
||||
self.log("info", "Consuming {}".format(self.filename))
|
||||
|
||||
# Determine the parser class.
|
||||
|
||||
parser_class = get_parser_class(self.filename)
|
||||
mime_type = magic.from_file(self.path, mime=True)
|
||||
|
||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
||||
if not parser_class:
|
||||
raise ConsumerError("No parsers abvailable for {}".format(self.filename))
|
||||
else:
|
||||
self.log("debug", "Parser: {}".format(parser_class.__name__))
|
||||
self.log("debug", "Parser: {} based on mime type {}".format(parser_class.__name__, mime_type))
|
||||
|
||||
# Notify all listeners that we're going to do some work.
|
||||
|
||||
@ -162,7 +157,8 @@ class Consumer(LoggingMixin):
|
||||
# store the document.
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date
|
||||
date=date,
|
||||
mime_type=mime_type
|
||||
)
|
||||
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
@ -197,7 +193,7 @@ class Consumer(LoggingMixin):
|
||||
|
||||
return document
|
||||
|
||||
def _store(self, text, date):
|
||||
def _store(self, text, date, mime_type):
|
||||
|
||||
# If someone gave us the original filename, use it instead of doc.
|
||||
|
||||
@ -220,7 +216,7 @@ class Consumer(LoggingMixin):
|
||||
correspondent=file_info.correspondent,
|
||||
title=file_info.title,
|
||||
content=text,
|
||||
file_type=file_info.extension,
|
||||
mime_type=mime_type,
|
||||
checksum=hashlib.md5(f.read()).hexdigest(),
|
||||
created=created,
|
||||
modified=created,
|
||||
|
@ -91,9 +91,9 @@ def generate_filename(document):
|
||||
|
||||
# Always append the primary key to guarantee uniqueness of filename
|
||||
if len(path) > 0:
|
||||
filename = "%s-%07i.%s" % (path, document.pk, document.file_type)
|
||||
filename = "%s-%07i%s" % (path, document.pk, document.file_type)
|
||||
else:
|
||||
filename = "%07i.%s" % (document.pk, document.file_type)
|
||||
filename = "%07i%s" % (document.pk, document.file_type)
|
||||
|
||||
# Append .gpg for encrypted files
|
||||
if document.storage_type == document.STORAGE_TYPE_GPG:
|
||||
|
@ -127,8 +127,8 @@ class Command(Renderable, BaseCommand):
|
||||
tags = ",".join([t.slug for t in doc.tags.all()])
|
||||
|
||||
if tags:
|
||||
return "{} - {} - {} - {}.{}".format(
|
||||
return "{} - {} - {} - {}{}".format(
|
||||
created, doc.correspondent, doc.title, tags, doc.file_type)
|
||||
|
||||
return "{} - {} - {}.{}".format(
|
||||
return "{} - {} - {}{}".format(
|
||||
created, doc.correspondent, doc.title, doc.file_type)
|
||||
|
50
src/documents/migrations/1003_mime_types.py
Normal file
50
src/documents/migrations/1003_mime_types.py
Normal file
@ -0,0 +1,50 @@
|
||||
# Generated by Django 3.1.3 on 2020-11-20 11:21
|
||||
import os
|
||||
|
||||
import magic
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
def source_path(self):
|
||||
if self.filename:
|
||||
fname = str(self.filename)
|
||||
else:
|
||||
fname = "{:07}.{}".format(self.pk, self.file_type)
|
||||
if self.storage_type == self.STORAGE_TYPE_GPG:
|
||||
fname += ".gpg"
|
||||
|
||||
return os.path.join(
|
||||
settings.ORIGINALS_DIR,
|
||||
fname
|
||||
)
|
||||
|
||||
|
||||
def add_mime_types(apps, schema_editor):
|
||||
Document = apps.get_model("documents", "Document")
|
||||
documents = Document.objects.all()
|
||||
|
||||
for d in documents:
|
||||
d.mime_type = magic.from_file(source_path(d), mime=True)
|
||||
d.save()
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '1002_auto_20201111_1105'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='document',
|
||||
name='mime_type',
|
||||
field=models.CharField(default="-", editable=False, max_length=256),
|
||||
preserve_default=False,
|
||||
),
|
||||
migrations.RunPython(add_mime_types),
|
||||
migrations.RemoveField(
|
||||
model_name='document',
|
||||
name='file_type',
|
||||
),
|
||||
]
|
@ -1,6 +1,7 @@
|
||||
# coding=utf-8
|
||||
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
@ -113,18 +114,6 @@ class DocumentType(MatchingModel):
|
||||
|
||||
class Document(models.Model):
|
||||
|
||||
# TODO: why do we need an explicit list
|
||||
TYPE_PDF = "pdf"
|
||||
TYPE_PNG = "png"
|
||||
TYPE_JPG = "jpg"
|
||||
TYPE_GIF = "gif"
|
||||
TYPE_TIF = "tiff"
|
||||
TYPE_TXT = "txt"
|
||||
TYPE_CSV = "csv"
|
||||
TYPE_MD = "md"
|
||||
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
|
||||
TYPE_TXT, TYPE_CSV, TYPE_MD)
|
||||
|
||||
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
|
||||
STORAGE_TYPE_GPG = "gpg"
|
||||
STORAGE_TYPES = (
|
||||
@ -156,10 +145,9 @@ class Document(models.Model):
|
||||
"primarily used for searching."
|
||||
)
|
||||
|
||||
file_type = models.CharField(
|
||||
max_length=4,
|
||||
editable=False,
|
||||
choices=tuple([(t, t.upper()) for t in TYPES])
|
||||
mime_type = models.CharField(
|
||||
max_length=256,
|
||||
editable=False
|
||||
)
|
||||
|
||||
tags = models.ManyToManyField(
|
||||
@ -223,7 +211,7 @@ class Document(models.Model):
|
||||
if self.filename:
|
||||
fname = str(self.filename)
|
||||
else:
|
||||
fname = "{:07}.{}".format(self.pk, self.file_type)
|
||||
fname = "{:07}{}".format(self.pk, self.file_type)
|
||||
if self.storage_type == self.STORAGE_TYPE_GPG:
|
||||
fname += ".gpg"
|
||||
|
||||
@ -238,7 +226,11 @@ class Document(models.Model):
|
||||
|
||||
@property
|
||||
def file_name(self):
|
||||
return slugify(str(self)) + "." + self.file_type
|
||||
return slugify(str(self)) + self.file_type
|
||||
|
||||
@property
|
||||
def file_type(self):
|
||||
return mimetypes.guess_extension(str(self.mime_type))
|
||||
|
||||
@property
|
||||
def thumbnail_path(self):
|
||||
|
@ -6,6 +6,7 @@ import subprocess
|
||||
import tempfile
|
||||
|
||||
import dateparser
|
||||
import magic
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
|
||||
@ -37,10 +38,11 @@ DATE_REGEX = re.compile(
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_parser_class(doc):
|
||||
"""
|
||||
Determine the appropriate parser class based on the file
|
||||
"""
|
||||
def is_mime_type_supported(mime_type):
|
||||
return get_parser_class_for_mime_type(mime_type) is not None
|
||||
|
||||
|
||||
def get_parser_class_for_mime_type(mime_type):
|
||||
|
||||
options = []
|
||||
|
||||
@ -48,9 +50,9 @@ def get_parser_class(doc):
|
||||
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
parser_test = parser_declaration["test"]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
if parser_test(doc):
|
||||
if mime_type in supported_mime_types:
|
||||
options.append(parser_declaration)
|
||||
|
||||
if not options:
|
||||
@ -61,6 +63,16 @@ def get_parser_class(doc):
|
||||
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
|
||||
|
||||
|
||||
def get_parser_class(path):
|
||||
"""
|
||||
Determine the appropriate parser class based on the file
|
||||
"""
|
||||
|
||||
mime_type = magic.from_file(path, mime=True)
|
||||
|
||||
return get_parser_class_for_mime_type(mime_type)
|
||||
|
||||
|
||||
def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
|
||||
environment = os.environ.copy()
|
||||
if settings.CONVERT_MEMORY_LIMIT:
|
||||
|
@ -91,7 +91,7 @@ class DocumentSerializer(serializers.ModelSerializer):
|
||||
"document_type_id",
|
||||
"title",
|
||||
"content",
|
||||
"file_type",
|
||||
"mime_type",
|
||||
"tags",
|
||||
"tags_id",
|
||||
"checksum",
|
||||
|
@ -45,7 +45,7 @@ class DocumentApiTest(APITestCase):
|
||||
dt = DocumentType.objects.create(name="dt", pk=63)
|
||||
tag = Tag.objects.create(name="t", pk=85)
|
||||
|
||||
doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123")
|
||||
doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123", mime_type="application/pdf")
|
||||
|
||||
doc.tags.add(tag)
|
||||
|
||||
@ -95,7 +95,7 @@ class DocumentApiTest(APITestCase):
|
||||
with open(filename, "wb") as f:
|
||||
f.write(content)
|
||||
|
||||
doc = Document.objects.create(title="none", filename=os.path.basename(filename), file_type="pdf")
|
||||
doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf")
|
||||
|
||||
with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
|
||||
f.write(content_thumbnail)
|
||||
@ -117,7 +117,7 @@ class DocumentApiTest(APITestCase):
|
||||
|
||||
def test_document_actions_not_existing_file(self):
|
||||
|
||||
doc = Document.objects.create(title="none", filename=os.path.basename("asd"), file_type="pdf")
|
||||
doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf")
|
||||
|
||||
response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
|
||||
self.assertEqual(response.status_code, 404)
|
||||
@ -130,9 +130,9 @@ class DocumentApiTest(APITestCase):
|
||||
|
||||
def test_document_filters(self):
|
||||
|
||||
doc1 = Document.objects.create(title="none1", checksum="A")
|
||||
doc2 = Document.objects.create(title="none2", checksum="B")
|
||||
doc3 = Document.objects.create(title="none3", checksum="C")
|
||||
doc1 = Document.objects.create(title="none1", checksum="A", mime_type="application/pdf")
|
||||
doc2 = Document.objects.create(title="none2", checksum="B", mime_type="application/pdf")
|
||||
doc3 = Document.objects.create(title="none3", checksum="C", mime_type="application/pdf")
|
||||
|
||||
tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
|
||||
tag_2 = Tag.objects.create(name="t2")
|
||||
|
@ -437,6 +437,18 @@ class FaultyParser(DocumentParser):
|
||||
raise ParseError("Does not compute.")
|
||||
|
||||
|
||||
def fake_magic_from_file(file, mime=False):
|
||||
|
||||
if mime:
|
||||
if os.path.splitext(file)[1] == ".pdf":
|
||||
return "application/pdf"
|
||||
else:
|
||||
return "unknown"
|
||||
else:
|
||||
return "A verbose string that describes the contents of the file"
|
||||
|
||||
|
||||
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
|
||||
class TestConsumer(TestCase):
|
||||
|
||||
def make_dummy_parser(self, path, logging_group):
|
||||
@ -462,7 +474,7 @@ class TestConsumer(TestCase):
|
||||
m = patcher.start()
|
||||
m.return_value = [(None, {
|
||||
"parser": self.make_dummy_parser,
|
||||
"test": lambda _: True,
|
||||
"mime_types": ["application/pdf"],
|
||||
"weight": 0
|
||||
})]
|
||||
|
||||
@ -592,7 +604,7 @@ class TestConsumer(TestCase):
|
||||
def testFaultyParser(self, m):
|
||||
m.return_value = [(None, {
|
||||
"parser": self.make_faulty_parser,
|
||||
"test": lambda _: True,
|
||||
"mime_types": ["application/pdf"],
|
||||
"weight": 0
|
||||
})]
|
||||
|
||||
|
@ -13,9 +13,12 @@ class TestDocument(TestCase):
|
||||
title="Title",
|
||||
content="content",
|
||||
checksum="checksum",
|
||||
mime_type="application/pdf"
|
||||
)
|
||||
|
||||
file_path = document.source_path
|
||||
thumb_path = document.thumbnail_path
|
||||
|
||||
with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
|
||||
document.delete()
|
||||
mock_unlink.assert_any_call(file_path)
|
||||
|
@ -31,7 +31,7 @@ class TestDate(TestCase):
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="")
|
||||
def test_generate_source_filename(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
@ -44,7 +44,7 @@ class TestDate(TestCase):
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||
def test_file_renaming(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
@ -81,7 +81,7 @@ class TestDate(TestCase):
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||
def test_file_renaming_missing_permissions(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
@ -111,10 +111,10 @@ class TestDate(TestCase):
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||
def test_file_renaming_database_error(self):
|
||||
|
||||
document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
|
||||
document1 = Document.objects.create(mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
|
||||
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.checksum = "BBBBB"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
@ -149,7 +149,7 @@ class TestDate(TestCase):
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||
def test_document_delete(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
@ -170,7 +170,7 @@ class TestDate(TestCase):
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||
def test_document_delete_nofile(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
@ -179,7 +179,7 @@ class TestDate(TestCase):
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||
def test_directory_not_empty(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
@ -206,7 +206,7 @@ class TestDate(TestCase):
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_with_underscore(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
@ -222,7 +222,7 @@ class TestDate(TestCase):
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_with_dash(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
@ -238,7 +238,7 @@ class TestDate(TestCase):
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_malformed(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
@ -254,7 +254,7 @@ class TestDate(TestCase):
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
|
||||
def test_tags_all(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
@ -269,7 +269,7 @@ class TestDate(TestCase):
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}")
|
||||
def test_tags_out_of_bounds(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
@ -284,7 +284,7 @@ class TestDate(TestCase):
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
|
||||
def test_nested_directory_cleanup(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
document.save()
|
||||
|
||||
@ -309,7 +309,7 @@ class TestDate(TestCase):
|
||||
def test_format_none(self):
|
||||
document = Document()
|
||||
document.pk = 1
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
|
||||
self.assertEqual(generate_filename(document), "0000001.pdf")
|
||||
@ -335,7 +335,7 @@ class TestDate(TestCase):
|
||||
def test_invalid_format(self):
|
||||
document = Document()
|
||||
document.pk = 1
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
|
||||
self.assertEqual(generate_filename(document), "0000001.pdf")
|
||||
@ -344,7 +344,7 @@ class TestDate(TestCase):
|
||||
def test_invalid_format_key(self):
|
||||
document = Document()
|
||||
document.pk = 1
|
||||
document.file_type = "pdf"
|
||||
document.mime_type = "application/pdf"
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
|
||||
self.assertEqual(generate_filename(document), "0000001.pdf")
|
||||
|
@ -213,7 +213,7 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
|
||||
TestCase.setUp(self)
|
||||
User.objects.create_user(username='test_consumer', password='12345')
|
||||
self.doc_contains = Document.objects.create(
|
||||
content="I contain the keyword.", file_type="pdf")
|
||||
content="I contain the keyword.", mime_type="application/pdf")
|
||||
|
||||
def test_tag_applied_any(self):
|
||||
t1 = Tag.objects.create(
|
||||
|
@ -1,3 +1,4 @@
|
||||
import os
|
||||
from tempfile import TemporaryDirectory
|
||||
from unittest import mock
|
||||
|
||||
@ -5,7 +6,18 @@ from django.test import TestCase
|
||||
|
||||
from documents.parsers import get_parser_class
|
||||
|
||||
def fake_magic_from_file(file, mime=False):
|
||||
|
||||
if mime:
|
||||
if os.path.splitext(file)[1] == ".pdf":
|
||||
return "application/pdf"
|
||||
else:
|
||||
return "unknown"
|
||||
else:
|
||||
return "A verbose string that describes the contents of the file"
|
||||
|
||||
|
||||
@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
|
||||
class TestParserDiscovery(TestCase):
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
@ -14,7 +26,7 @@ class TestParserDiscovery(TestCase):
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
|
||||
(None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}),
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
@ -32,8 +44,8 @@ class TestParserDiscovery(TestCase):
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
|
||||
(None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
|
||||
(None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}),
|
||||
(None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}),
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
|
@ -104,18 +104,6 @@ class DocumentViewSet(RetrieveModelMixin,
|
||||
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
|
||||
|
||||
def file_response(self, pk, disposition):
|
||||
# TODO: this should not be necessary here.
|
||||
content_types = {
|
||||
Document.TYPE_PDF: "application/pdf",
|
||||
Document.TYPE_PNG: "image/png",
|
||||
Document.TYPE_JPG: "image/jpeg",
|
||||
Document.TYPE_GIF: "image/gif",
|
||||
Document.TYPE_TIF: "image/tiff",
|
||||
Document.TYPE_CSV: "text/csv",
|
||||
Document.TYPE_MD: "text/markdown",
|
||||
Document.TYPE_TXT: "text/plain"
|
||||
}
|
||||
|
||||
doc = Document.objects.get(id=pk)
|
||||
|
||||
if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
|
||||
@ -123,7 +111,7 @@ class DocumentViewSet(RetrieveModelMixin,
|
||||
else:
|
||||
file_handle = GnuPG.decrypted(doc.source_file)
|
||||
|
||||
response = HttpResponse(file_handle, content_type=content_types[doc.file_type])
|
||||
response = HttpResponse(file_handle, content_type=doc.mime_type)
|
||||
response["Content-Disposition"] = '{}; filename="{}"'.format(
|
||||
disposition, doc.file_name)
|
||||
return response
|
||||
|
@ -10,6 +10,7 @@ from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
|
||||
|
||||
from documents.loggers import LoggingMixin
|
||||
from documents.models import Correspondent
|
||||
from documents.parsers import is_mime_type_supported
|
||||
from paperless_mail.models import MailAccount, MailRule
|
||||
|
||||
|
||||
@ -249,8 +250,7 @@ class MailAccountHandler(LoggingMixin):
|
||||
|
||||
title = get_title(message, att, rule)
|
||||
|
||||
# TODO: check with parsers what files types are supported
|
||||
if att.content_type == 'application/pdf':
|
||||
if is_mime_type_supported(att.content_type):
|
||||
|
||||
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
||||
_, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
|
||||
|
@ -1,5 +1,3 @@
|
||||
import re
|
||||
|
||||
from .parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
@ -7,12 +5,9 @@ def tesseract_consumer_declaration(sender, **kwargs):
|
||||
return {
|
||||
"parser": RasterisedDocumentParser,
|
||||
"weight": 0,
|
||||
"test": tesseract_consumer_test
|
||||
"mime_types": [
|
||||
"application/pdf",
|
||||
"image/jpeg",
|
||||
"image/png"
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
|
||||
|
||||
|
||||
def tesseract_consumer_test(doc):
|
||||
return MATCHING_FILES.match(doc.lower())
|
||||
|
@ -1,36 +0,0 @@
|
||||
from django.test import TestCase
|
||||
|
||||
from paperless_tesseract.signals import tesseract_consumer_test
|
||||
|
||||
|
||||
class SignalsTestCase(TestCase):
|
||||
|
||||
def test_test_handles_various_file_names_true(self):
|
||||
|
||||
prefixes = (
|
||||
"doc", "My Document", "Μυ Γρεεκ Δοψθμεντ", "Doc -with - tags",
|
||||
"A document with a . in it", "Doc with -- in it"
|
||||
)
|
||||
suffixes = (
|
||||
"pdf", "jpg", "jpeg", "gif", "png", "tiff", "tif", "pnm", "bmp",
|
||||
"PDF", "JPG", "JPEG", "GIF", "PNG", "TIFF", "TIF", "PNM", "BMP",
|
||||
"pDf", "jPg", "jpEg", "gIf", "pNg", "tIff", "tIf", "pNm", "bMp",
|
||||
)
|
||||
|
||||
for prefix in prefixes:
|
||||
for suffix in suffixes:
|
||||
name = "{}.{}".format(prefix, suffix)
|
||||
self.assertTrue(tesseract_consumer_test(name))
|
||||
|
||||
def test_test_handles_various_file_names_false(self):
|
||||
|
||||
prefixes = ("doc",)
|
||||
suffixes = ("txt", "markdown", "",)
|
||||
|
||||
for prefix in prefixes:
|
||||
for suffix in suffixes:
|
||||
name = "{}.{}".format(prefix, suffix)
|
||||
self.assertFalse(tesseract_consumer_test(name))
|
||||
|
||||
self.assertFalse(tesseract_consumer_test(""))
|
||||
self.assertFalse(tesseract_consumer_test("doc"))
|
@ -1,5 +1,3 @@
|
||||
import re
|
||||
|
||||
from .parsers import TextDocumentParser
|
||||
|
||||
|
||||
@ -7,12 +5,8 @@ def text_consumer_declaration(sender, **kwargs):
|
||||
return {
|
||||
"parser": TextDocumentParser,
|
||||
"weight": 10,
|
||||
"test": text_consumer_test
|
||||
"mime_types": [
|
||||
"text/plain",
|
||||
"text/comma-separated-values"
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
|
||||
|
||||
|
||||
def text_consumer_test(doc):
|
||||
return MATCHING_FILES.match(doc.lower())
|
||||
|
Loading…
x
Reference in New Issue
Block a user