mime type handling

2025-12-29 13:48:09 -06:00 · 2020-11-20 13:31:03 +01:00
parent bd45a804a7
commit 41650f20f4
19 changed files with 163 additions and 146 deletions
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -50,7 +50,7 @@ class DocumentTypeAdmin(admin.ModelAdmin):
 class DocumentAdmin(admin.ModelAdmin):
    search_fields = ("correspondent__name", "title", "content", "tags__name")
-    readonly_fields = ("added", "file_type", "storage_type", "filename")
+    readonly_fields = ("added", "mime_type", "storage_type", "filename")
    list_display = (
        "title",
        "created",
@@ -58,8 +58,7 @@ class DocumentAdmin(admin.ModelAdmin):
        "correspondent",
        "tags_",
        "archive_serial_number",
-        "document_type",
+        "document_type"
        "filename"
    )
    list_filter = (
        "document_type",
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -2,8 +2,8 @@ import datetime
 import hashlib
 import logging
 import os
 import re
 import magic
 from django.conf import settings
 from django.db import transaction
 from django.utils import timezone
@@ -13,7 +13,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
 from .file_handling import generate_filename, create_source_path_directory
 from .loggers import LoggingMixin
 from .models import Document, FileInfo, Correspondent, DocumentType, Tag
-from .parsers import ParseError, get_parser_class
+from .parsers import ParseError, get_parser_class_for_mime_type
 from .signals import (
    document_consumption_finished,
    document_consumption_started
@@ -51,12 +51,6 @@ class Consumer(LoggingMixin):
                "Consumption directory {} does not exist".format(
                    settings.CONSUMPTION_DIR))
    def pre_check_regex(self):
        """Refuse to consume files whose name fails the title pattern.

        Raises:
            ConsumerError: if ``self.filename`` is not matched by
                ``FileInfo.REGEXES["title"]``.
        """
        title_pattern = FileInfo.REGEXES["title"]
        if re.match(title_pattern, self.filename):
            return
        message = "Filename {} does not seem to be safe to consume".format(
            self.filename)
        raise ConsumerError(message)
    def pre_check_duplicate(self):
        with open(self.path, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
@@ -100,18 +94,19 @@ class Consumer(LoggingMixin):
        self.pre_check_file_exists()
        self.pre_check_consumption_dir()
        self.pre_check_directories()
        self.pre_check_regex()
        self.pre_check_duplicate()
        self.log("info", "Consuming {}".format(self.filename))
        # Determine the parser class.
-        parser_class = get_parser_class(self.filename)
+        mime_type = magic.from_file(self.path, mime=True)
        parser_class = get_parser_class_for_mime_type(mime_type)
        if not parser_class:
            raise ConsumerError("No parsers abvailable for {}".format(self.filename))
        else:
-            self.log("debug", "Parser: {}".format(parser_class.__name__))
+            self.log("debug", "Parser: {} based on mime type {}".format(parser_class.__name__, mime_type))
        # Notify all listeners that we're going to do some work.
@@ -162,7 +157,8 @@ class Consumer(LoggingMixin):
                # store the document.
                document = self._store(
                    text=text,
-                    date=date
+                    date=date,
                    mime_type=mime_type
                )
                # If we get here, it was successful. Proceed with post-consume
@@ -197,7 +193,7 @@ class Consumer(LoggingMixin):
        return document
-    def _store(self, text, date):
+    def _store(self, text, date, mime_type):
        # If someone gave us the original filename, use it instead of doc.
@@ -220,7 +216,7 @@ class Consumer(LoggingMixin):
                correspondent=file_info.correspondent,
                title=file_info.title,
                content=text,
-                file_type=file_info.extension,
+                mime_type=mime_type,
                checksum=hashlib.md5(f.read()).hexdigest(),
                created=created,
                modified=created,
--- a/src/documents/file_handling.py
+++ b/src/documents/file_handling.py
@@ -91,9 +91,9 @@ def generate_filename(document):
    # Always append the primary key to guarantee uniqueness of filename
    if len(path) > 0:
-        filename = "%s-%07i.%s" % (path, document.pk, document.file_type)
+        filename = "%s-%07i%s" % (path, document.pk, document.file_type)
    else:
-        filename = "%07i.%s" % (document.pk, document.file_type)
+        filename = "%07i%s" % (document.pk, document.file_type)
    # Append .gpg for encrypted files
    if document.storage_type == document.STORAGE_TYPE_GPG:
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -127,8 +127,8 @@ class Command(Renderable, BaseCommand):
        tags = ",".join([t.slug for t in doc.tags.all()])
        if tags:
-            return "{} - {} - {} - {}.{}".format(
+            return "{} - {} - {} - {}{}".format(
                created, doc.correspondent, doc.title, tags, doc.file_type)
-        return "{} - {} - {}.{}".format(
+        return "{} - {} - {}{}".format(
            created, doc.correspondent, doc.title, doc.file_type)
--- a/src/documents/migrations/1003_mime_types.py
+++ b/src/documents/migrations/1003_mime_types.py
@@ -0,0 +1,50 @@
 # Generated by Django 3.1.3 on 2020-11-20 11:21
 import os
 import magic
 from django.conf import settings
 from django.db import migrations, models
 def source_path(self):
    """Build the path of a document's original file inside ORIGINALS_DIR.

    Uses the stored ``filename`` when one is set; otherwise falls back to
    the ``<zero-padded pk>.<file_type>`` name, with ``.gpg`` appended for
    GPG-encrypted documents.

    Args:
        self: a document row; the migration passes model instances
            obtained through ``apps.get_model``.

    Returns:
        ``settings.ORIGINALS_DIR`` joined with the resolved file name.
    """
    if self.filename:
        fname = str(self.filename)
    else:
        fname = "{:07}.{}".format(self.pk, self.file_type)
        # Models obtained from the migration state do not carry the class
        # constants of the real model, so fall back to the literal value
        # instead of failing with AttributeError.
        gpg_storage = getattr(self, "STORAGE_TYPE_GPG", "gpg")
        if self.storage_type == gpg_storage:
            fname += ".gpg"
    return os.path.join(settings.ORIGINALS_DIR, fname)
 def add_mime_types(apps, schema_editor):
    """Fill ``mime_type`` on every existing document.

    For each document the mime type is detected with ``magic.from_file``
    on the path returned by ``source_path`` and the row is saved.

    Args:
        apps: migration app registry used to look up the ``Document`` model.
        schema_editor: unused here; part of the ``RunPython`` signature.
    """
    document_model = apps.get_model("documents", "Document")
    for doc in document_model.objects.all():
        # NOTE(review): the stored file is inspected as-is; for GPG-encrypted
        # originals this presumably detects the encrypted container rather
        # than the document itself -- confirm.
        detected = magic.from_file(source_path(doc), mime=True)
        doc.mime_type = detected
        doc.save()
 class Migration(migrations.Migration):
    """Replace the document ``file_type`` column with ``mime_type``.

    The three operations run in order: add the new column, populate it
    from the files on disk, then drop the old column.
    """
    dependencies = [
        ('documents', '1002_auto_20201111_1105'),
    ]
    operations = [
        # "-" is only a placeholder for existing rows; preserve_default=False
        # keeps it out of the final field definition.
        migrations.AddField(
            model_name='document',
            name='mime_type',
            field=models.CharField(default="-", editable=False, max_length=256),
            preserve_default=False,
        ),
        # Must run while file_type still exists: source_path() reads it.
        # NOTE(review): no reverse_code is passed, so this step presumably
        # cannot be unapplied -- confirm that is intended.
        migrations.RunPython(add_mime_types),
        migrations.RemoveField(
            model_name='document',
            name='file_type',
        ),
    ]
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -1,6 +1,7 @@
 # coding=utf-8
 import logging
 import mimetypes
 import os
 import re
 from collections import OrderedDict
@@ -113,18 +114,6 @@ class DocumentType(MatchingModel):
 class Document(models.Model):
    # TODO: why do we need an explicit list
    TYPE_PDF = "pdf"
    TYPE_PNG = "png"
    TYPE_JPG = "jpg"
    TYPE_GIF = "gif"
    TYPE_TIF = "tiff"
    TYPE_TXT = "txt"
    TYPE_CSV = "csv"
    TYPE_MD = "md"
    TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
             TYPE_TXT, TYPE_CSV, TYPE_MD)
    STORAGE_TYPE_UNENCRYPTED = "unencrypted"
    STORAGE_TYPE_GPG = "gpg"
    STORAGE_TYPES = (
@@ -156,10 +145,9 @@ class Document(models.Model):
                  "primarily used for searching."
    )
-    file_type = models.CharField(
+    mime_type = models.CharField(
-        max_length=4,
+        max_length=256,
-        editable=False,
+        editable=False
        choices=tuple([(t, t.upper()) for t in TYPES])
    )
    tags = models.ManyToManyField(
@@ -223,7 +211,7 @@ class Document(models.Model):
        if self.filename:
            fname = str(self.filename)
        else:
-            fname = "{:07}.{}".format(self.pk, self.file_type)
+            fname = "{:07}{}".format(self.pk, self.file_type)
            if self.storage_type == self.STORAGE_TYPE_GPG:
                fname += ".gpg"
@@ -238,7 +226,11 @@ class Document(models.Model):
    @property
    def file_name(self):
-        return slugify(str(self)) + "." + self.file_type
+        return slugify(str(self)) + self.file_type
    @property
    def file_type(self):
        """Return the file extension for this document's mime type.

        Returns:
            The extension including its leading dot (for example ``.pdf``),
            or an empty string when ``mimetypes`` knows no extension for
            ``self.mime_type``.
        """
        # guess_extension() yields None for unknown types; callers append the
        # result to file names, so hand back "" rather than None.
        return mimetypes.guess_extension(str(self.mime_type)) or ""
    @property
    def thumbnail_path(self):
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -6,6 +6,7 @@ import subprocess
 import tempfile
 import dateparser
 import magic
 from django.conf import settings
 from django.utils import timezone
@@ -37,10 +38,11 @@ DATE_REGEX = re.compile(
 logger = logging.getLogger(__name__)
-def get_parser_class(doc):
+def is_mime_type_supported(mime_type):
-    """
+    return get_parser_class_for_mime_type(mime_type) is not None
-    Determine the appropriate parser class based on the file
+
-    """
+
 def get_parser_class_for_mime_type(mime_type):
    options = []
@@ -48,9 +50,9 @@ def get_parser_class(doc):
    for response in document_consumer_declaration.send(None):
        parser_declaration = response[1]
-        parser_test = parser_declaration["test"]
+        supported_mime_types = parser_declaration["mime_types"]
-        if parser_test(doc):
+        if mime_type in supported_mime_types:
            options.append(parser_declaration)
    if not options:
@@ -61,6 +63,16 @@ def get_parser_class(doc):
        options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
 def get_parser_class(path):
    """Pick the parser class for the file at ``path``.

    The mime type reported by ``magic.from_file`` is handed to
    ``get_parser_class_for_mime_type`` and its result is returned unchanged.
    """
    return get_parser_class_for_mime_type(magic.from_file(path, mime=True))
 def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -91,7 +91,7 @@ class DocumentSerializer(serializers.ModelSerializer):
            "document_type_id",
            "title",
            "content",
-            "file_type",
+            "mime_type",
            "tags",
            "tags_id",
            "checksum",
--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@@ -45,7 +45,7 @@ class DocumentApiTest(APITestCase):
        dt = DocumentType.objects.create(name="dt", pk=63)
        tag = Tag.objects.create(name="t", pk=85)
-        doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123")
+        doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123", mime_type="application/pdf")
        doc.tags.add(tag)
@@ -95,7 +95,7 @@ class DocumentApiTest(APITestCase):
        with open(filename, "wb") as f:
            f.write(content)
-        doc = Document.objects.create(title="none", filename=os.path.basename(filename), file_type="pdf")
+        doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf")
        with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
            f.write(content_thumbnail)
@@ -117,7 +117,7 @@ class DocumentApiTest(APITestCase):
    def test_document_actions_not_existing_file(self):
-        doc = Document.objects.create(title="none", filename=os.path.basename("asd"), file_type="pdf")
+        doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf")
        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
        self.assertEqual(response.status_code, 404)
@@ -130,9 +130,9 @@ class DocumentApiTest(APITestCase):
    def test_document_filters(self):
-        doc1 = Document.objects.create(title="none1", checksum="A")
+        doc1 = Document.objects.create(title="none1", checksum="A", mime_type="application/pdf")
-        doc2 = Document.objects.create(title="none2", checksum="B")
+        doc2 = Document.objects.create(title="none2", checksum="B", mime_type="application/pdf")
-        doc3 = Document.objects.create(title="none3", checksum="C")
+        doc3 = Document.objects.create(title="none3", checksum="C", mime_type="application/pdf")
        tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
        tag_2 = Tag.objects.create(name="t2")
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -437,6 +437,18 @@ class FaultyParser(DocumentParser):
        raise ParseError("Does not compute.")
 def fake_magic_from_file(file, mime=False):
    """Stand-in for ``magic.from_file`` that only looks at the file name.

    With ``mime`` set, names ending in ``.pdf`` report ``application/pdf``
    and everything else reports ``unknown``; without it a fixed
    description string is returned.
    """
    if not mime:
        return "A verbose string that describes the contents of the file"
    _, extension = os.path.splitext(file)
    return "application/pdf" if extension == ".pdf" else "unknown"
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
 class TestConsumer(TestCase):
    def make_dummy_parser(self, path, logging_group):
@@ -462,7 +474,7 @@ class TestConsumer(TestCase):
        m = patcher.start()
        m.return_value = [(None, {
            "parser": self.make_dummy_parser,
-            "test": lambda _: True,
+            "mime_types": ["application/pdf"],
            "weight": 0
        })]
@@ -592,7 +604,7 @@ class TestConsumer(TestCase):
    def testFaultyParser(self, m):
        m.return_value = [(None, {
            "parser": self.make_faulty_parser,
-            "test": lambda _: True,
+            "mime_types": ["application/pdf"],
            "weight": 0
        })]
--- a/src/documents/tests/test_document_model.py
+++ b/src/documents/tests/test_document_model.py
@@ -13,9 +13,12 @@ class TestDocument(TestCase):
            title="Title",
            content="content",
            checksum="checksum",
            mime_type="application/pdf"
        )
        file_path = document.source_path
        thumb_path = document.thumbnail_path
        with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
            document.delete()
            mock_unlink.assert_any_call(file_path)
--- a/src/documents/tests/test_file_handling.py
+++ b/src/documents/tests/test_file_handling.py
@@ -31,7 +31,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="")
    def test_generate_source_filename(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -44,7 +44,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_file_renaming(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -81,7 +81,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_file_renaming_missing_permissions(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -111,10 +111,10 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_file_renaming_database_error(self):
-        document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
+        document1 = Document.objects.create(mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.checksum = "BBBBB"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -149,7 +149,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_document_delete(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -170,7 +170,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_document_delete_nofile(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -179,7 +179,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_directory_not_empty(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -206,7 +206,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
    def test_tags_with_underscore(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -222,7 +222,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
    def test_tags_with_dash(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -238,7 +238,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
    def test_tags_malformed(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -254,7 +254,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
    def test_tags_all(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -269,7 +269,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}")
    def test_tags_out_of_bounds(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -284,7 +284,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
    def test_nested_directory_cleanup(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -309,7 +309,7 @@ class TestDate(TestCase):
    def test_format_none(self):
        document = Document()
        document.pk = 1
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        self.assertEqual(generate_filename(document), "0000001.pdf")
@@ -335,7 +335,7 @@ class TestDate(TestCase):
    def test_invalid_format(self):
        document = Document()
        document.pk = 1
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        self.assertEqual(generate_filename(document), "0000001.pdf")
@@ -344,7 +344,7 @@ class TestDate(TestCase):
    def test_invalid_format_key(self):
        document = Document()
        document.pk = 1
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        self.assertEqual(generate_filename(document), "0000001.pdf")
--- a/src/documents/tests/test_matchables.py
+++ b/src/documents/tests/test_matchables.py
@@ -213,7 +213,7 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
        TestCase.setUp(self)
        User.objects.create_user(username='test_consumer', password='12345')
        self.doc_contains = Document.objects.create(
-            content="I contain the keyword.", file_type="pdf")
+            content="I contain the keyword.", mime_type="application/pdf")
    def test_tag_applied_any(self):
        t1 = Tag.objects.create(
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -1,3 +1,4 @@
 import os
 from tempfile import TemporaryDirectory
 from unittest import mock
@@ -5,7 +6,18 @@ from django.test import TestCase
 from documents.parsers import get_parser_class
 def fake_magic_from_file(file, mime=False):
    """Replacement for ``magic.from_file`` driven purely by the extension.

    Returns ``application/pdf`` for ``.pdf`` names and ``unknown`` otherwise
    when ``mime`` is true; returns a fixed description string when it is not.
    """
    if mime:
        is_pdf = os.path.splitext(file)[1] == ".pdf"
        return "application/pdf" if is_pdf else "unknown"
    return "A verbose string that describes the contents of the file"
@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
 class TestParserDiscovery(TestCase):
    @mock.patch("documents.parsers.document_consumer_declaration.send")
@@ -14,7 +26,7 @@ class TestParserDiscovery(TestCase):
            pass
        m.return_value = (
-            (None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
+            (None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}),
        )
        self.assertEqual(
@@ -32,8 +44,8 @@ class TestParserDiscovery(TestCase):
            pass
        m.return_value = (
-            (None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
+            (None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}),
-            (None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
+            (None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}),
        )
        self.assertEqual(
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -104,18 +104,6 @@ class DocumentViewSet(RetrieveModelMixin,
        return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
    def file_response(self, pk, disposition):
        # TODO: this should not be necessary here.
        content_types = {
            Document.TYPE_PDF: "application/pdf",
            Document.TYPE_PNG: "image/png",
            Document.TYPE_JPG: "image/jpeg",
            Document.TYPE_GIF: "image/gif",
            Document.TYPE_TIF: "image/tiff",
            Document.TYPE_CSV: "text/csv",
            Document.TYPE_MD: "text/markdown",
            Document.TYPE_TXT: "text/plain"
        }
        doc = Document.objects.get(id=pk)
        if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
@@ -123,7 +111,7 @@ class DocumentViewSet(RetrieveModelMixin,
        else:
            file_handle = GnuPG.decrypted(doc.source_file)
-        response = HttpResponse(file_handle, content_type=content_types[doc.file_type])
+        response = HttpResponse(file_handle, content_type=doc.mime_type)
        response["Content-Disposition"] = '{}; filename="{}"'.format(
            disposition, doc.file_name)
        return response
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@@ -10,6 +10,7 @@ from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
 from documents.loggers import LoggingMixin
 from documents.models import Correspondent
 from documents.parsers import is_mime_type_supported
 from paperless_mail.models import MailAccount, MailRule
@@ -249,8 +250,7 @@ class MailAccountHandler(LoggingMixin):
            title = get_title(message, att, rule)
-            # TODO: check with parsers what files types are supported
+            if is_mime_type_supported(att.content_type):
            if att.content_type == 'application/pdf':
                os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
                _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
--- a/src/paperless_tesseract/signals.py
+++ b/src/paperless_tesseract/signals.py
@@ -1,5 +1,3 @@
 import re
 from .parsers import RasterisedDocumentParser
@@ -7,12 +5,9 @@ def tesseract_consumer_declaration(sender, **kwargs):
    return {
        "parser": RasterisedDocumentParser,
        "weight": 0,
-        "test": tesseract_consumer_test
+        "mime_types": [
            "application/pdf",
            "image/jpeg",
            "image/png"
        ]
    }
 # File names ending in one of the listed extensions; the name is lower-cased
 # before matching, so the check is case-insensitive.
 MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
 def tesseract_consumer_test(doc):
    """Return the regex match for a supported file name, else ``None``.

    Args:
        doc: file name to test.
    """
    lowered = doc.lower()
    return MATCHING_FILES.match(lowered)
--- a/src/paperless_tesseract/tests/test_signals.py
+++ b/src/paperless_tesseract/tests/test_signals.py
@@ -1,36 +0,0 @@
 from django.test import TestCase
 from paperless_tesseract.signals import tesseract_consumer_test
 class SignalsTestCase(TestCase):
    """Checks which file names ``tesseract_consumer_test`` accepts."""
    def test_test_handles_various_file_names_true(self):
        """Supported extensions match for every prefix and letter case."""
        suffixes = (
            "pdf", "jpg", "jpeg", "gif", "png", "tiff", "tif", "pnm", "bmp",
            "PDF", "JPG", "JPEG", "GIF", "PNG", "TIFF", "TIF", "PNM", "BMP",
            "pDf", "jPg", "jpEg", "gIf", "pNg", "tIff", "tIf", "pNm", "bMp",
        )
        prefixes = (
            "doc", "My Document", "Μυ Γρεεκ Δοψθμεντ", "Doc -with - tags",
            "A document with a . in it", "Doc with -- in it"
        )
        candidates = [
            "{}.{}".format(stem, extension)
            for stem in prefixes
            for extension in suffixes
        ]
        for candidate in candidates:
            self.assertTrue(tesseract_consumer_test(candidate))
    def test_test_handles_various_file_names_false(self):
        """Unsupported, empty and missing extensions are rejected."""
        rejected = [
            "{}.{}".format(stem, extension)
            for stem in ("doc",)
            for extension in ("txt", "markdown", "",)
        ]
        # Names without any dot at all must be rejected as well.
        rejected += ["", "doc"]
        for candidate in rejected:
            self.assertFalse(tesseract_consumer_test(candidate))
--- a/src/paperless_text/signals.py
+++ b/src/paperless_text/signals.py
@@ -1,5 +1,3 @@
 import re
 from .parsers import TextDocumentParser
@@ -7,12 +5,8 @@ def text_consumer_declaration(sender, **kwargs):
    return {
        "parser": TextDocumentParser,
        "weight": 10,
-        "test": text_consumer_test
+        "mime_types": [
            "text/plain",
            "text/comma-separated-values"
        ]
    }
 # File names ending in .txt/.text, .md or .csv; the name is lower-cased
 # before matching, so the check is case-insensitive.
 MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
 def text_consumer_test(doc):
    """Return the regex match for a supported text file name, else ``None``.

    Args:
        doc: file name to test.
    """
    lowered = doc.lower()
    return MATCHING_FILES.match(lowered)