Merge branch 'dev' into celery-tasks

2025-11-11 03:56:07 -06:00 · 2020-11-22 22:49:37 +01:00
parent 4a6b8ef138 63ea3684fa
commit 3893a23852
146 changed files with 1762 additions and 1390 deletions
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -50,17 +50,17 @@ class DocumentTypeAdmin(admin.ModelAdmin):
 class DocumentAdmin(admin.ModelAdmin):

    search_fields = ("correspondent__name", "title", "content", "tags__name")
-    readonly_fields = ("added", "file_type", "storage_type", "filename")
+    readonly_fields = ("added", "mime_type", "storage_type", "filename")
+
+    list_display_links = ("title",)
+
    list_display = (
-        "title",
-        "created",
-        "added",
        "correspondent",
+        "title",
        "tags_",
-        "archive_serial_number",
-        "document_type",
-        "filename"
+        "created",
    )
+
    list_filter = (
        "document_type",
        "tags",
@@ -118,9 +118,19 @@ class DocumentAdmin(admin.ModelAdmin):

 class LogAdmin(admin.ModelAdmin):

+    def has_add_permission(self, request):
+        return False
+
+    def has_change_permission(self, request, obj=None):
+        return False
+
    list_display = ("created", "message", "level",)
    list_filter = ("level", "created",)

+    ordering = ('-created',)
+
+    list_display_links = ("created", "message")
+

 admin.site.register(Correspondent, CorrespondentAdmin)
 admin.site.register(Tag, TagAdmin)
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -30,10 +30,12 @@ class DocumentClassifier(object):
    FORMAT_VERSION = 5

    def __init__(self):
-        # mtime of the model file on disk. used to prevent reloading when nothing has changed.
+        # mtime of the model file on disk. used to prevent reloading when
+        # nothing has changed.
        self.classifier_version = 0

-        # hash of the training data. used to prevent re-training when the training data has not changed.
+        # hash of the training data. used to prevent re-training when the
+        # training data has not changed.
        self.data_hash = None

        self.data_vectorizer = None
@@ -48,10 +50,12 @@ class DocumentClassifier(object):
                schema_version = pickle.load(f)

                if schema_version != self.FORMAT_VERSION:
-                    raise IncompatibleClassifierVersionError("Cannor load classifier, incompatible versions.")
+                    raise IncompatibleClassifierVersionError(
+                        "Cannor load classifier, incompatible versions.")
                else:
                    if self.classifier_version > 0:
-                        logger.info("Classifier updated on disk, reloading classifier models")
+                        logger.info("Classifier updated on disk, "
+                                    "reloading classifier models")
                    self.data_hash = pickle.load(f)
                    self.data_vectorizer = pickle.load(f)
                    self.tags_binarizer = pickle.load(f)
@@ -82,20 +86,22 @@ class DocumentClassifier(object):
        # Step 1: Extract and preprocess training data from the database.
        logging.getLogger(__name__).debug("Gathering data from database...")
        m = hashlib.sha1()
-        for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True):
+        for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True):  # NOQA: E501
            preprocessed_content = preprocess_content(doc.content)
            m.update(preprocessed_content.encode('utf-8'))
            data.append(preprocessed_content)

            y = -1
-            if doc.document_type and doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
-                y = doc.document_type.pk
+            dt = doc.document_type
+            if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO:
+                y = dt.pk
            m.update(y.to_bytes(4, 'little', signed=True))
            labels_document_type.append(y)

            y = -1
-            if doc.correspondent and doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
-                y = doc.correspondent.pk
+            cor = doc.correspondent
+            if cor and cor.matching_algorithm == MatchingModel.MATCH_AUTO:
+                y = cor.pk
            m.update(y.to_bytes(4, 'little', signed=True))
            labels_correspondent.append(y)

@@ -145,7 +151,7 @@ class DocumentClassifier(object):
        # Step 3: train the classifiers
        if num_tags > 0:
            logging.getLogger(__name__).debug("Training tags classifier...")
-            self.tags_classifier = MLPClassifier(verbose=True, tol=0.01)
+            self.tags_classifier = MLPClassifier(tol=0.01)
            self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
        else:
            self.tags_classifier = None
@@ -157,7 +163,7 @@ class DocumentClassifier(object):
            logging.getLogger(__name__).debug(
                "Training correspondent classifier..."
            )
-            self.correspondent_classifier = MLPClassifier(verbose=True, tol=0.01)
+            self.correspondent_classifier = MLPClassifier(tol=0.01)
            self.correspondent_classifier.fit(
                data_vectorized,
                labels_correspondent
@@ -173,7 +179,7 @@ class DocumentClassifier(object):
            logging.getLogger(__name__).debug(
                "Training document type classifier..."
            )
-            self.document_type_classifier = MLPClassifier(verbose=True, tol=0.01)
+            self.document_type_classifier = MLPClassifier(tol=0.01)
            self.document_type_classifier.fit(
                data_vectorized,
                labels_document_type
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -2,8 +2,8 @@ import datetime
 import hashlib
 import logging
 import os
-import re

+import magic
 from asgiref.sync import async_to_sync
 from channels.layers import get_channel_layer
 from django.conf import settings
@@ -15,7 +15,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
 from .file_handling import generate_filename, create_source_path_directory
 from .loggers import LoggingMixin
 from .models import Document, FileInfo, Correspondent, DocumentType, Tag
-from .parsers import ParseError, get_parser_class
+from .parsers import ParseError, get_parser_class_for_mime_type
 from .signals import (
    document_consumption_finished,
    document_consumption_started
@@ -69,12 +69,6 @@ class Consumer(LoggingMixin):
                "Consumption directory {} does not exist".format(
                    settings.CONSUMPTION_DIR))

-    def pre_check_regex(self):
-        if not re.match(FileInfo.REGEXES["title"], self.filename):
-            raise ConsumerError(
-                "Filename {} does not seem to be safe to "
-                "consume".format(self.filename))
-
    def pre_check_duplicate(self):
        with open(self.path, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
@@ -118,18 +112,21 @@ class Consumer(LoggingMixin):
        self.pre_check_file_exists()
        self.pre_check_consumption_dir()
        self.pre_check_directories()
-        self.pre_check_regex()
        self.pre_check_duplicate()

        self.log("info", "Consuming {}".format(self.filename))

        # Determine the parser class.

-        parser_class = get_parser_class(self.filename)
+        mime_type = magic.from_file(self.path, mime=True)
+
+        parser_class = get_parser_class_for_mime_type(mime_type)
        if not parser_class:
-            raise ConsumerError("No parsers abvailable for {}".format(self.filename))
+            raise ConsumerError(f"No parsers abvailable for {self.filename}")
        else:
-            self.log("debug", "Parser: {}".format(parser_class.__name__))
+            self.log("debug",
+                     f"Parser: {parser_class.__name__} "
+                     f"based on mime type {mime_type}")

        # Notify all listeners that we're going to do some work.

@@ -156,7 +153,7 @@ class Consumer(LoggingMixin):
        # Parse the document. This may take some time.

        try:
-            self.log("debug", "Generating thumbnail for {}...".format(self.filename))
+            self.log("debug", f"Generating thumbnail for {self.filename}...")
            self._send_progress(self.filename, 10, 100, 'WORKING',
                                'Generating thumbnail...')
            thumbnail = document_parser.get_optimised_thumbnail()
@@ -196,7 +193,8 @@ class Consumer(LoggingMixin):
                # store the document.
                document = self._store(
                    text=text,
-                    date=date
+                    date=date,
+                    mime_type=mime_type
                )

                # If we get here, it was successful. Proceed with post-consume
@@ -239,11 +237,11 @@ class Consumer(LoggingMixin):

        return document

-    def _store(self, text, date):
+    def _store(self, text, date, mime_type):

        # If someone gave us the original filename, use it instead of doc.

-        file_info = FileInfo.from_path(self.filename)
+        file_info = FileInfo.from_filename(self.filename)

        stats = os.stat(self.path)

@@ -262,7 +260,7 @@ class Consumer(LoggingMixin):
                correspondent=file_info.correspondent,
                title=file_info.title,
                content=text,
-                file_type=file_info.extension,
+                mime_type=mime_type,
                checksum=hashlib.md5(f.read()).hexdigest(),
                created=created,
                modified=created,
@@ -290,10 +288,12 @@ class Consumer(LoggingMixin):
            document.title = self.override_title

        if self.override_correspondent_id:
-            document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
+            document.correspondent = Correspondent.objects.get(
+                pk=self.override_correspondent_id)

        if self.override_document_type_id:
-            document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
+            document.document_type = DocumentType.objects.get(
+                pk=self.override_document_type_id)

        if self.override_tag_ids:
            for tag_id in self.override_tag_ids:
--- a/src/documents/file_handling.py
+++ b/src/documents/file_handling.py
@@ -65,38 +65,39 @@ def many_to_dictionary(field):
    return mydictionary


-def generate_filename(document):
-    # Create filename based on configured format
+def generate_filename(doc):
    path = ""

    try:
        if settings.PAPERLESS_FILENAME_FORMAT is not None:
            tags = defaultdict(lambda: slugify(None),
-                               many_to_dictionary(document.tags))
+                               many_to_dictionary(doc.tags))
            path = settings.PAPERLESS_FILENAME_FORMAT.format(
-                correspondent=slugify(document.correspondent),
-                title=slugify(document.title),
-                created=slugify(document.created),
-                created_year=document.created.year if document.created else "none",
-                created_month=document.created.month if document.created else "none",
-                created_day=document.created.day if document.created else "none",
-                added=slugify(document.added),
-                added_year=document.added.year if document.added else "none",
-                added_month=document.added.month if document.added else "none",
-                added_day=document.added.day if document.added else "none",
+                correspondent=slugify(doc.correspondent),
+                title=slugify(doc.title),
+                created=slugify(doc.created),
+                created_year=doc.created.year if doc.created else "none",
+                created_month=doc.created.month if doc.created else "none",
+                created_day=doc.created.day if doc.created else "none",
+                added=slugify(doc.added),
+                added_year=doc.added.year if doc.added else "none",
+                added_month=doc.added.month if doc.added else "none",
+                added_day=doc.added.day if doc.added else "none",
                tags=tags,
            )
    except (ValueError, KeyError, IndexError):
-        logging.getLogger(__name__).warning("Invalid PAPERLESS_FILENAME_FORMAT: {}, falling back to default,".format(settings.PAPERLESS_FILENAME_FORMAT))
+        logging.getLogger(__name__).warning(
+            f"Invalid PAPERLESS_FILENAME_FORMAT: "
+            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")

    # Always append the primary key to guarantee uniqueness of filename
    if len(path) > 0:
-        filename = "%s-%07i.%s" % (path, document.pk, document.file_type)
+        filename = "%s-%07i%s" % (path, doc.pk, doc.file_type)
    else:
-        filename = "%07i.%s" % (document.pk, document.file_type)
+        filename = "%07i%s" % (doc.pk, doc.file_type)

    # Append .gpg for encrypted files
-    if document.storage_type == document.STORAGE_TYPE_GPG:
+    if doc.storage_type == doc.STORAGE_TYPE_GPG:
        filename += ".gpg"

    return filename
--- a/src/documents/forms.py
+++ b/src/documents/forms.py
@@ -3,22 +3,35 @@ import tempfile
 from datetime import datetime
 from time import mktime

+import magic
 from django import forms
 from django.conf import settings
 from django_q.tasks import async_task
 from pathvalidate import validate_filename, ValidationError

+from documents.parsers import is_mime_type_supported
+

 class UploadForm(forms.Form):

    document = forms.FileField()

    def clean_document(self):
+        document_name = self.cleaned_data.get("document").name
+
        try:
-            validate_filename(self.cleaned_data.get("document").name)
+            validate_filename(document_name)
        except ValidationError:
            raise forms.ValidationError("That filename is suspicious.")
-        return self.cleaned_data.get("document")
+
+        document_data = self.cleaned_data.get("document").read()
+
+        mime_type = magic.from_buffer(document_data, mime=True)
+
+        if not is_mime_type_supported(mime_type):
+            raise forms.ValidationError("This mime type is not supported.")
+
+        return document_name, document_data

    def save(self):
        """
@@ -27,17 +40,20 @@ class UploadForm(forms.Form):
        form do that as well.  Think of it as a poor-man's queue server.
        """

-        document = self.cleaned_data.get("document").read()
-        original_filename = self.cleaned_data.get("document").name
+        original_filename, data = self.cleaned_data.get("document")

        t = int(mktime(datetime.now().timetuple()))

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

-        # TODO: dont just append pdf. This is here for taht weird regex check at the start of the consumer.
-        with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f:
+        with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
+                                         dir=settings.SCRATCH_DIR,
+                                         delete=False) as f:

-            f.write(document)
+            f.write(data)
            os.utime(f.name, times=(t, t))

-            async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))
+            async_task("documents.tasks.consume_file",
+                       f.name,
+                       override_filename=original_filename,
+                       task_name=os.path.basename(original_filename)[:100])
--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -1,6 +1,8 @@
 import logging
+import os
 from contextlib import contextmanager

+from django.conf import settings
 from whoosh import highlight
 from whoosh.fields import Schema, TEXT, NUMERIC
 from whoosh.highlight import Formatter, get_text
@@ -8,7 +10,6 @@ from whoosh.index import create_in, exists_in, open_dir
 from whoosh.qparser import MultifieldParser
 from whoosh.writing import AsyncWriter

-from paperless import settings

 logger = logging.getLogger(__name__)

@@ -69,6 +70,8 @@ def open_index(recreate=False):
        # TODO: this is not thread safe. If 2 instances try to create the index
        #  at the same time, this fails. This currently prevents parallel
        #  tests.
+        if not os.path.isdir(settings.INDEX_DIR):
+            os.makedirs(settings.INDEX_DIR, exist_ok=True)
        return create_in(settings.INDEX_DIR, get_schema())


@@ -117,6 +120,7 @@ def query_page(ix, query, page):
 def autocomplete(ix, term, limit=10):
    with ix.reader() as reader:
        terms = []
-        for (score, t) in reader.most_distinctive_terms("content", limit, term.lower()):
+        for (score, t) in reader.most_distinctive_terms(
+                "content", number=limit, prefix=term.lower()):
            terms.append(t)
        return terms
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@@ -19,10 +19,13 @@ class Handler(FileSystemEventHandler):
    def _consume(self, file):
        if os.path.isfile(file):
            try:
-                async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
+                async_task("documents.tasks.consume_file",
+                           file,
+                           task_name=os.path.basename(file)[:100])
            except Exception as e:
                # Catch all so that the consumer won't crash.
-                logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
+                logging.getLogger(__name__).error(
+                    "Error while consuming document: {}".format(e))

    def on_created(self, event):
        self._consume(event.src_path)
@@ -66,12 +69,14 @@ class Command(BaseCommand):
        # Consume all files as this is not done initially by the watchdog
        for entry in os.scandir(directory):
            if entry.is_file():
-                async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
+                async_task("documents.tasks.consume_file",
+                           entry.path,
+                           task_name=os.path.basename(entry.path)[:100])

        # Start the watchdog. Woof!
        if settings.CONSUMER_POLLING > 0:
-            logging.getLogger(__name__).info('Using polling instead of file'
-                                             'system notifications.')
+            logging.getLogger(__name__).info(
+                "Using polling instead of file system notifications.")
            observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
        else:
            observer = Observer()
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -63,7 +63,7 @@ class Command(Renderable, BaseCommand):

            document = document_map[document_dict["pk"]]

-            unique_filename = "{:07}_{}".format(document.pk, document.file_name)
+            unique_filename = f"{document.pk:07}_{document.file_name}"

            file_target = os.path.join(self.target, unique_filename)

@@ -73,7 +73,7 @@ class Command(Renderable, BaseCommand):
            document_dict[EXPORTER_FILE_NAME] = unique_filename
            document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name

-            print("Exporting: {}".format(file_target))
+            print(f"Exporting: {file_target}")

            t = int(time.mktime(document.created.timetuple()))
            if document.storage_type == Document.STORAGE_TYPE_GPG:
@@ -127,8 +127,8 @@ class Command(Renderable, BaseCommand):
        tags = ",".join([t.slug for t in doc.tags.all()])

        if tags:
-            return "{} - {} - {} - {}.{}".format(
+            return "{} - {} - {} - {}{}".format(
                created, doc.correspondent, doc.title, tags, doc.file_type)

-        return "{} - {} - {}.{}".format(
+        return "{} - {} - {}{}".format(
            created, doc.correspondent, doc.title, doc.file_type)
--- a/src/documents/management/commands/document_importer.py
+++ b/src/documents/management/commands/document_importer.py
@@ -120,7 +120,7 @@ class Command(Renderable, BaseCommand):
                        encrypted.write(GnuPG.encrypted(unencrypted))

            else:
-                print("Moving {} to {}".format(document_path, document.source_path))
+                print(f"Moving {document_path} to {document.source_path}")
                shutil.copy(document_path, document.source_path)
                shutil.copy(thumbnail_path, document.thumbnail_path)

--- a/src/documents/management/commands/document_retagger.py
+++ b/src/documents/management/commands/document_retagger.py
@@ -74,13 +74,13 @@ class Command(Renderable, BaseCommand):
        try:
            classifier.reload()
        except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
-            logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
+            logging.getLogger(__name__).warning(
+                f"Cannot classify documents: {e}.")
            classifier = None

        for document in documents:
            logging.getLogger(__name__).info(
-                "Processing document {}".format(document.title)
-            )
+                f"Processing document {document.title}")

            if options['correspondent']:
                set_correspondent(
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -6,25 +6,42 @@ from documents.models import MatchingModel, Correspondent, DocumentType, Tag


 def match_correspondents(document_content, classifier):
-    correspondents = Correspondent.objects.all()
-    predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None
+    if classifier:
+        pred_id = classifier.predict_correspondent(document_content)
+    else:
+        pred_id = None

-    return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
+    correspondents = Correspondent.objects.all()
+
+    return list(filter(
+        lambda o: matches(o, document_content) or o.pk == pred_id,
+        correspondents))


 def match_document_types(document_content, classifier):
-    document_types = DocumentType.objects.all()
-    predicted_document_type_id = classifier.predict_document_type(document_content) if classifier else None
+    if classifier:
+        pred_id = classifier.predict_document_type(document_content)
+    else:
+        pred_id = None

-    return [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
+    document_types = DocumentType.objects.all()
+
+    return list(filter(
+        lambda o: matches(o, document_content) or o.pk == pred_id,
+        document_types))


 def match_tags(document_content, classifier):
-    objects = Tag.objects.all()
-    predicted_tag_ids = classifier.predict_tags(document_content) if classifier else []
+    if classifier:
+        predicted_tag_ids = classifier.predict_tags(document_content)
+    else:
+        predicted_tag_ids = []

-    matched_tags = [o for o in objects if matches(o, document_content) or o.pk in predicted_tag_ids]
-    return matched_tags
+    tags = Tag.objects.all()
+
+    return list(filter(
+        lambda o: matches(o, document_content) or o.pk in predicted_tag_ids,
+        tags))


 def matches(matching_model, document_content):
@@ -42,39 +59,45 @@ def matches(matching_model, document_content):
    if matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
        for word in _split_match(matching_model):
            search_result = re.search(
-                r"\b{}\b".format(word), document_content, **search_kwargs)
+                rf"\b{word}\b", document_content, **search_kwargs)
            if not search_result:
                return False
        return True

-    if matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
        for word in _split_match(matching_model):
-            if re.search(r"\b{}\b".format(word), document_content, **search_kwargs):
+            if re.search(rf"\b{word}\b", document_content, **search_kwargs):
                return True
        return False

-    if matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
        return bool(re.search(
-            r"\b{}\b".format(matching_model.match), document_content, **search_kwargs))
+            rf"\b{matching_model.match}\b",
+            document_content,
+            **search_kwargs
+        ))

-    if matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
        return bool(re.search(
-            re.compile(matching_model.match, **search_kwargs), document_content))
+            re.compile(matching_model.match, **search_kwargs),
+            document_content
+        ))

-    if matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
        match = re.sub(r'[^\w\s]', '', matching_model.match)
        text = re.sub(r'[^\w\s]', '', document_content)
        if matching_model.is_insensitive:
            match = match.lower()
            text = text.lower()

-        return True if fuzz.partial_ratio(match, text) >= 90 else False
+        return fuzz.partial_ratio(match, text) >= 90

-    if matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
        # this is done elsewhere.
        return False

-    raise NotImplementedError("Unsupported matching algorithm")
+    else:
+        raise NotImplementedError("Unsupported matching algorithm")


 def _split_match(matching_model):
--- a/src/documents/migrations/1000_update_paperless_all.py
+++ b/src/documents/migrations/1000_update_paperless_all.py
@@ -1,4 +1,6 @@
 # Generated by Django 3.1.3 on 2020-11-07 12:35
+import uuid
+
 from django.db import migrations, models
 import django.db.models.deletion

@@ -20,6 +22,14 @@ def make_index(apps, schema_editor):
        print("  --> Cannot create document index.")


+def logs_set_default_group(apps, schema_editor):
+    """Give every Log row that has no group a freshly generated UUID.
+
+    Wired up as the ``reverse_code`` of a ``RunPython`` operation in this
+    migration, so it only runs when the migration is unapplied.
+    NOTE(review): presumably needed because the reverted ``group`` column
+    does not accept NULL - confirm against the previous field definition.
+    """
+    # Use the historical model from the migration state, not the live one.
+    Log = apps.get_model('documents', 'Log')
+    for log in Log.objects.all():
+        if log.group is None:
+            # Each ungrouped row gets its own random (version 4) UUID.
+            log.group = uuid.uuid4()
+            log.save()
+
+
 class Migration(migrations.Migration):

    dependencies = [
@@ -85,6 +95,10 @@ class Migration(migrations.Migration):
            name='group',
            field=models.UUIDField(blank=True, null=True),
        ),
+        migrations.RunPython(
+            code=django.db.migrations.operations.special.RunPython.noop,
+            reverse_code=logs_set_default_group
+        ),
        migrations.RunPython(
            code=make_index,
            reverse_code=django.db.migrations.operations.special.RunPython.noop,
--- a/src/documents/migrations/1003_mime_types.py
+++ b/src/documents/migrations/1003_mime_types.py
@@ -0,0 +1,77 @@
+# Generated by Django 3.1.3 on 2020-11-20 11:21
+import mimetypes
+import os
+
+import magic
+from django.conf import settings
+from django.db import migrations, models
+
+
+def source_path(self):
+    """Return the path of the original file for the document ``self``.
+
+    ``self`` is a historical ``Document`` instance obtained through
+    ``apps.get_model``. This is a module-level helper because historical
+    models carry only fields, not the real model's methods, properties or
+    class-level constants.
+    """
+    if self.filename:
+        fname = str(self.filename)
+    else:
+        fname = "{:07}.{}".format(self.pk, self.file_type)
+        # Compare against the literal storage type value: the historical
+        # model has no STORAGE_TYPE_GPG attribute, so looking it up on
+        # ``self`` would raise AttributeError during the migration.
+        if self.storage_type == "gpg":
+            fname += ".gpg"
+
+    return os.path.join(
+        settings.ORIGINALS_DIR,
+        fname
+    )
+
+
+def add_mime_types(apps, schema_editor):
+    """Fill ``mime_type`` on every existing document by inspecting its file.
+
+    Forward step of this migration: the file located by ``source_path`` is
+    passed to ``magic.from_file`` and the detected type is saved on the row.
+    """
+    Document = apps.get_model("documents", "Document")
+    documents = Document.objects.all()
+
+    for d in documents:
+        # NOTE(review): for GPG-stored documents this inspects the stored
+        # (presumably encrypted) file rather than the decrypted content -
+        # confirm the detected type is what later code expects.
+        # NOTE(review): a missing file on disk aborts the whole migration.
+        d.mime_type = magic.from_file(source_path(d), mime=True)
+        d.save()
+
+
+def add_file_extensions(apps, schema_editor):
+    """Rebuild ``file_type`` from the extension of each document's file name.
+
+    Reverse step of this migration; it runs after the removal of the
+    ``file_type`` column has been undone.
+    """
+    Document = apps.get_model("documents", "Document")
+    documents = Document.objects.all()
+
+    for d in documents:
+        # splitext keeps the leading dot; strip it to store the bare
+        # extension.
+        # NOTE(review): assumes ``d.filename`` is set (splitext fails on
+        # None), and a name ending in ".gpg" would store "gpg" - confirm.
+        d.file_type = os.path.splitext(d.filename)[1].strip('.')
+        d.save()
+
+
+class Migration(migrations.Migration):
+    """Replace the ``file_type`` column of ``document`` with ``mime_type``."""
+
+    dependencies = [
+        ('documents', '1002_auto_20201111_1105'),
+    ]
+
+    operations = [
+        # Add the new column with a placeholder default ("-") that is not
+        # kept on the field; real values are written by the next operation.
+        migrations.AddField(
+            model_name='document',
+            name='mime_type',
+            field=models.CharField(default="-", editable=False, max_length=256),
+            preserve_default=False,
+        ),
+        # Forward: detect and store mime types. Reverse: nothing to undo.
+        migrations.RunPython(add_mime_types, migrations.RunPython.noop),
+
+        # This operation is here so that we can revert the entire migration:
+        # By allowing this field to be blank and null, we can revert the
+        # remove operation further down and the database won't complain about
+        # NOT NULL violations.
+        migrations.AlterField(
+            model_name='document',
+            name='file_type',
+            field=models.CharField(
+                choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF'), ('txt', 'TXT'), ('csv', 'CSV'), ('md', 'MD')],
+                editable=False,
+                max_length=4,
+                null=True,
+                blank=True
+            ),
+        ),
+        # Forward: nothing to do. Reverse: repopulate file_type from the
+        # file names once the column has been restored.
+        migrations.RunPython(migrations.RunPython.noop, add_file_extensions),
+        migrations.RemoveField(
+            model_name='document',
+            name='file_type',
+        ),
+    ]
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -1,6 +1,7 @@
 # coding=utf-8

 import logging
+import mimetypes
 import os
 import re
 from collections import OrderedDict
@@ -113,18 +114,6 @@ class DocumentType(MatchingModel):

 class Document(models.Model):

-    # TODO: why do we need an explicit list
-    TYPE_PDF = "pdf"
-    TYPE_PNG = "png"
-    TYPE_JPG = "jpg"
-    TYPE_GIF = "gif"
-    TYPE_TIF = "tiff"
-    TYPE_TXT = "txt"
-    TYPE_CSV = "csv"
-    TYPE_MD = "md"
-    TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
-             TYPE_TXT, TYPE_CSV, TYPE_MD)
-
    STORAGE_TYPE_UNENCRYPTED = "unencrypted"
    STORAGE_TYPE_GPG = "gpg"
    STORAGE_TYPES = (
@@ -156,10 +145,9 @@ class Document(models.Model):
                  "primarily used for searching."
    )

-    file_type = models.CharField(
-        max_length=4,
-        editable=False,
-        choices=tuple([(t, t.upper()) for t in TYPES])
+    mime_type = models.CharField(
+        max_length=256,
+        editable=False
    )

    tags = models.ManyToManyField(
@@ -223,7 +211,7 @@ class Document(models.Model):
        if self.filename:
            fname = str(self.filename)
        else:
-            fname = "{:07}.{}".format(self.pk, self.file_type)
+            fname = "{:07}{}".format(self.pk, self.file_type)
            if self.storage_type == self.STORAGE_TYPE_GPG:
                fname += ".gpg"

@@ -238,7 +226,11 @@ class Document(models.Model):

    @property
    def file_name(self):
-        return slugify(str(self)) + "." + self.file_type
+        return slugify(str(self)) + self.file_type
+
+    @property
+    def file_type(self):
+        """Return the file extension (leading dot included) for ``mime_type``.
+
+        ``mimetypes.guess_extension`` returns ``None`` for types it does not
+        know. Fall back to an empty string so that callers which concatenate
+        or format the result neither raise ``TypeError`` nor embed the text
+        "None" in a file name.
+        """
+        return mimetypes.guess_extension(str(self.mime_type)) or ""

    @property
    def thumbnail_path(self):
@@ -278,6 +270,7 @@ class Log(models.Model):
        return self.message


+# TODO: why is this in the models file?
 class FileInfo:

    # This epic regex *almost* worked for our needs, so I'm keeping it here for
@@ -292,53 +285,44 @@ class FileInfo:
            non_separated_word=r"([\w,. ]|([^\s]-))"
        )
    )
-    # TODO: what is this used for
-    formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
    REGEXES = OrderedDict([
        ("created-correspondent-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
-            r"(?P<tags>[a-z0-9\-,]*)"
-            r"\.(?P<extension>{})$".format(formats),
+            r"(?P<tags>[a-z0-9\-,]*)$",
            flags=re.IGNORECASE
        )),
        ("created-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*) - "
-            r"(?P<tags>[a-z0-9\-,]*)"
-            r"\.(?P<extension>{})$".format(formats),
+            r"(?P<tags>[a-z0-9\-,]*)$",
            flags=re.IGNORECASE
        )),
        ("created-correspondent-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
-            r"(?P<title>.*)"
-            r"\.(?P<extension>{})$".format(formats),
+            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        )),
        ("created-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
-            r"(?P<title>.*)"
-            r"\.(?P<extension>{})$".format(formats),
+            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title-tags", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
-            r"(?P<tags>[a-z0-9\-,]*)"
-            r"\.(?P<extension>{})$".format(formats),
+            r"(?P<tags>[a-z0-9\-,]*)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title", re.compile(
            r"(?P<correspondent>.*) - "
-            r"(?P<title>.*)?"
-            r"\.(?P<extension>{})$".format(formats),
+            r"(?P<title>.*)?$",
            flags=re.IGNORECASE
        )),
        ("title", re.compile(
-            r"(?P<title>.*)"
-            r"\.(?P<extension>{})$".format(formats),
+            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        ))
    ])
@@ -381,15 +365,6 @@ class FileInfo:
            )[0])
        return tuple(r)

-    @classmethod
-    def _get_extension(cls, extension):
-        r = extension.lower()
-        if r == "jpeg":
-            return "jpg"
-        if r == "tif":
-            return "tiff"
-        return r
-
    @classmethod
    def _mangle_property(cls, properties, name):
        if name in properties:
@@ -398,18 +373,16 @@ class FileInfo:
            )

    @classmethod
-    def from_path(cls, path):
+    def from_filename(cls, filename):
        """
        We use a crude naming convention to make handling the correspondent,
        title, and tags easier:
-          "<date> - <correspondent> - <title> - <tags>.<suffix>"
-          "<correspondent> - <title> - <tags>.<suffix>"
-          "<correspondent> - <title>.<suffix>"
-          "<title>.<suffix>"
+          "<date> - <correspondent> - <title> - <tags>"
+          "<correspondent> - <title> - <tags>"
+          "<correspondent> - <title>"
+          "<title>"
        """

-        filename = os.path.basename(path)
-
        # Mutate filename in-place before parsing its components
        # by applying at most one of the configured transformations.
        for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
@@ -417,6 +390,23 @@ class FileInfo:
            if count:
                break

+        # do this after the transforms so that the transforms can do whatever
+        # with the file extension.
+        filename_no_ext = os.path.splitext(filename)[0]
+
+        if filename_no_ext == filename and filename.startswith("."):
+            # This is a very special case where there is no text before the
+            # file type.
+            # TODO: this should be handled better. The ext is not removed
+            #  because usually, files like '.pdf' are just hidden files
+            #  with the name pdf, but in our case, it's more likely that
+            #  there's just no name to begin with.
+            filename = ""
+            # This isn't too bad either, since we'll just not match anything
+            # and return an empty title. TODO: actually, this is kinda bad.
+        else:
+            filename = filename_no_ext
+
        # Parse filename components.
        for regex in cls.REGEXES.values():
            m = regex.match(filename)
@@ -426,5 +416,4 @@ class FileInfo:
                cls._mangle_property(properties, "correspondent")
                cls._mangle_property(properties, "title")
                cls._mangle_property(properties, "tags")
-                cls._mangle_property(properties, "extension")
                return cls(**properties)
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -6,6 +6,7 @@ import subprocess
 import tempfile

 import dateparser
+import magic
 from django.conf import settings
 from django.utils import timezone

@@ -37,10 +38,11 @@ DATE_REGEX = re.compile(
 logger = logging.getLogger(__name__)


-def get_parser_class(doc):
-    """
-    Determine the appropriate parser class based on the file
-    """
+def is_mime_type_supported(mime_type):
+    return get_parser_class_for_mime_type(mime_type) is not None
+
+
+def get_parser_class_for_mime_type(mime_type):

    options = []

@@ -48,9 +50,9 @@ def get_parser_class(doc):

    for response in document_consumer_declaration.send(None):
        parser_declaration = response[1]
-        parser_test = parser_declaration["test"]
+        supported_mime_types = parser_declaration["mime_types"]

-        if parser_test(doc):
+        if mime_type in supported_mime_types:
            options.append(parser_declaration)

    if not options:
@@ -61,7 +63,28 @@ def get_parser_class(doc):
        options, key=lambda _: _["weight"], reverse=True)[0]["parser"]


-def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
+def get_parser_class(path):
+    """
+    Determine the appropriate parser class for the file at ``path``, based
+    on the MIME type that ``magic.from_file`` reports for it.
+    """
+    mime_type = magic.from_file(path, mime=True)
+
+    return get_parser_class_for_mime_type(mime_type)
+
+
+def run_convert(input_file,
+                output_file,
+                density=None,
+                scale=None,
+                alpha=None,
+                strip=False,
+                trim=False,
+                type=None,
+                depth=None,
+                extra=None,
+                logging_group=None):
+
    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@@ -90,10 +113,13 @@ def run_unpaper(pnm, logging_group=None):
    command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
                    pnm_out)

-    logger.debug("Execute: " + " ".join(command_args), extra={'group': logging_group})
+    logger.debug(f"Execute: {' '.join(command_args)}",
+                 extra={'group': logging_group})

-    if not subprocess.Popen(command_args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).wait() == 0:
-        raise ParseError("Unpaper failed at {}".format(command_args))
+    if not subprocess.Popen(command_args,
+                            stdout=subprocess.DEVNULL,
+                            stderr=subprocess.DEVNULL).wait() == 0:
+        raise ParseError(f"Unpaper failed at {command_args}")

    return pnm_out

@@ -112,7 +138,8 @@ class DocumentParser(LoggingMixin):
        super().__init__()
        self.logging_group = logging_group
        self.document_path = path
-        self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
+        self.tempdir = tempfile.mkdtemp(
+            prefix="paperless-", dir=settings.SCRATCH_DIR)
        self.progress_callback = progress_callback

    def get_thumbnail(self):
@@ -126,9 +153,10 @@ class DocumentParser(LoggingMixin):
        if settings.OPTIMIZE_THUMBNAILS:
            out_path = os.path.join(self.tempdir, "optipng.png")

-            args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
+            args = (settings.OPTIPNG_BINARY,
+                    "-silent", "-o5", in_path, "-out", out_path)

-            self.log('debug', 'Execute: ' + " ".join(args))
+            self.log('debug', f"Execute: {' '.join(args)}")

            if not subprocess.Popen(args).wait() == 0:
                raise ParseError("Optipng failed at {}".format(args))
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -76,9 +76,11 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField):

 class DocumentSerializer(serializers.ModelSerializer):

-    correspondent_id = CorrespondentField(allow_null=True, source='correspondent')
+    correspondent_id = CorrespondentField(
+        allow_null=True, source='correspondent')
    tags_id = TagsField(many=True, source='tags')
-    document_type_id = DocumentTypeField(allow_null=True, source='document_type')
+    document_type_id = DocumentTypeField(
+        allow_null=True, source='document_type')

    class Meta:
        model = Document
@@ -91,7 +93,7 @@ class DocumentSerializer(serializers.ModelSerializer):
            "document_type_id",
            "title",
            "content",
-            "file_type",
+            "mime_type",
            "tags",
            "tags_id",
            "checksum",
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@@ -25,11 +25,18 @@ def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
    document.tags.add(*inbox_tags)


-def set_correspondent(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
+def set_correspondent(sender,
+                      document=None,
+                      logging_group=None,
+                      classifier=None,
+                      replace=False,
+                      use_first=True,
+                      **kwargs):
    if document.correspondent and not replace:
        return

-    potential_correspondents = matching.match_correspondents(document.content, classifier)
+    potential_correspondents = matching.match_correspondents(document.content,
+                                                             classifier)

    potential_count = len(potential_correspondents)
    if potential_correspondents:
@@ -38,22 +45,22 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
        selected = None
    if potential_count > 1:
        if use_first:
-            message = "Detected {} potential correspondents, so we've opted for {}"
            logger(
-                message.format(potential_count, selected),
+                f"Detected {potential_count} potential correspondents, "
+                f"so we've opted for {selected}",
                logging_group
            )
        else:
-            message = "Detected {} potential correspondents, not assigning any correspondent"
            logger(
-                message.format(potential_count),
+                f"Detected {potential_count} potential correspondents, "
+                f"not assigning any correspondent",
                logging_group
            )
            return

    if selected or replace:
        logger(
-            'Assigning correspondent "{}" to "{}" '.format(selected, document),
+            f"Assigning correspondent {selected} to {document}",
            logging_group
        )

@@ -61,11 +68,18 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
        document.save(update_fields=("correspondent",))


-def set_document_type(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
+def set_document_type(sender,
+                      document=None,
+                      logging_group=None,
+                      classifier=None,
+                      replace=False,
+                      use_first=True,
+                      **kwargs):
    if document.document_type and not replace:
        return

-    potential_document_type = matching.match_document_types(document.content, classifier)
+    potential_document_type = matching.match_document_types(document.content,
+                                                            classifier)

    potential_count = len(potential_document_type)
    if potential_document_type:
@@ -75,22 +89,22 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None

    if potential_count > 1:
        if use_first:
-            message = "Detected {} potential document types, so we've opted for {}"
            logger(
-                message.format(potential_count, selected),
+                f"Detected {potential_count} potential document types, "
+                f"so we've opted for {selected}",
                logging_group
            )
        else:
-            message = "Detected {} potential document types, not assigning any document type"
            logger(
-                message.format(potential_count),
+                f"Detected {potential_count} potential document types, "
+                f"not assigning any document type",
                logging_group
            )
            return

    if selected or replace:
        logger(
-            'Assigning document type "{}" to "{}" '.format(selected, document),
+            f"Assigning document type {selected} to {document}",
            logging_group
        )

@@ -98,14 +112,21 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None
        document.save(update_fields=("document_type",))


-def set_tags(sender, document=None, logging_group=None, classifier=None, replace=False, **kwargs):
+def set_tags(sender,
+             document=None,
+             logging_group=None,
+             classifier=None,
+             replace=False,
+             **kwargs):
    if replace:
        document.tags.clear()
        current_tags = set([])
    else:
        current_tags = set(document.tags.all())

-    relevant_tags = set(matching.match_tags(document.content, classifier)) - current_tags
+    matched_tags = matching.match_tags(document.content, classifier)
+
+    relevant_tags = set(matched_tags) - current_tags

    if not relevant_tags:
        return
@@ -180,12 +201,15 @@ def update_filename_and_move_files(sender, instance, **kwargs):

    if not os.path.isfile(old_path):
        # Can't do anything if the old file does not exist anymore.
-        logging.getLogger(__name__).fatal('Document {}: File {} has gone.'.format(str(instance), old_path))
+        logging.getLogger(__name__).fatal(
+            f"Document {str(instance)}: File {old_path} has gone.")
        return

    if os.path.isfile(new_path):
        # Can't do anything if the new file already exists. Skip updating file.
-        logging.getLogger(__name__).warning('Document {}: Cannot rename file since target path {} already exists.'.format(str(instance), new_path))
+        logging.getLogger(__name__).warning(
+            f"Document {str(instance)}: Cannot rename file "
+            f"since target path {new_path} already exists.")
        return

    create_source_path_directory(new_path)
--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@@ -45,7 +45,7 @@ class DocumentApiTest(APITestCase):
        dt = DocumentType.objects.create(name="dt", pk=63)
        tag = Tag.objects.create(name="t", pk=85)

-        doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123")
+        doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123", mime_type="application/pdf")

        doc.tags.add(tag)

@@ -95,7 +95,7 @@ class DocumentApiTest(APITestCase):
        with open(filename, "wb") as f:
            f.write(content)

-        doc = Document.objects.create(title="none", filename=os.path.basename(filename), file_type="pdf")
+        doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf")

        with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
            f.write(content_thumbnail)
@@ -117,7 +117,7 @@ class DocumentApiTest(APITestCase):

    def test_document_actions_not_existing_file(self):

-        doc = Document.objects.create(title="none", filename=os.path.basename("asd"), file_type="pdf")
+        doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf")

        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
        self.assertEqual(response.status_code, 404)
@@ -130,9 +130,9 @@ class DocumentApiTest(APITestCase):

    def test_document_filters(self):

-        doc1 = Document.objects.create(title="none1", checksum="A")
-        doc2 = Document.objects.create(title="none2", checksum="B")
-        doc3 = Document.objects.create(title="none3", checksum="C")
+        doc1 = Document.objects.create(title="none1", checksum="A", mime_type="application/pdf")
+        doc2 = Document.objects.create(title="none2", checksum="B", mime_type="application/pdf")
+        doc3 = Document.objects.create(title="none3", checksum="C", mime_type="application/pdf")

        tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
        tag_2 = Tag.objects.create(name="t2")
--- a/src/documents/tests/test_checks.py
+++ b/src/documents/tests/test_checks.py
@@ -15,11 +15,3 @@ class ChecksTestCase(TestCase):
    def test_changed_password_check_no_encryption(self):
        DocumentFactory.create(storage_type=Document.STORAGE_TYPE_UNENCRYPTED)
        self.assertEqual(changed_password_check(None), [])
-
-    @unittest.skip("I don't know how to test this")
-    def test_changed_password_check_gpg_encryption_with_good_password(self):
-        pass
-
-    @unittest.skip("I don't know how to test this")
-    def test_changed_password_check_fail(self):
-        pass
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -15,57 +15,42 @@ from ..parsers import DocumentParser, ParseError
 class TestAttributes(TestCase):

    TAGS = ("tag1", "tag2", "tag3")
-    EXTENSIONS = (
-        "pdf", "png", "jpg", "jpeg", "gif", "tiff", "tif",
-        "PDF", "PNG", "JPG", "JPEG", "GIF", "TIFF", "TIF",
-        "PdF", "PnG", "JpG", "JPeG", "GiF", "TiFf", "TiF",
-    )

-    def _test_guess_attributes_from_name(self, path, sender, title, tags):
+    def _test_guess_attributes_from_name(self, filename, sender, title, tags):
+        file_info = FileInfo.from_filename(filename)

-        for extension in self.EXTENSIONS:
+        if sender:
+            self.assertEqual(file_info.correspondent.name, sender, filename)
+        else:
+            self.assertIsNone(file_info.correspondent, filename)

-            f = path.format(extension)
-            file_info = FileInfo.from_path(f)
+        self.assertEqual(file_info.title, title, filename)

-            if sender:
-                self.assertEqual(file_info.correspondent.name, sender, f)
-            else:
-                self.assertIsNone(file_info.correspondent, f)
-
-            self.assertEqual(file_info.title, title, f)
-
-            self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f)
-            if extension.lower() == "jpeg":
-                self.assertEqual(file_info.extension, "jpg", f)
-            elif extension.lower() == "tif":
-                self.assertEqual(file_info.extension, "tiff", f)
-            else:
-                self.assertEqual(file_info.extension, extension.lower(), f)
+        self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, filename)

    def test_guess_attributes_from_name0(self):
        self._test_guess_attributes_from_name(
-            "/path/to/Sender - Title.{}", "Sender", "Title", ())
+            "Sender - Title.pdf", "Sender", "Title", ())

    def test_guess_attributes_from_name1(self):
        self._test_guess_attributes_from_name(
-            "/path/to/Spaced Sender - Title.{}", "Spaced Sender", "Title", ())
+            "Spaced Sender - Title.pdf", "Spaced Sender", "Title", ())

    def test_guess_attributes_from_name2(self):
        self._test_guess_attributes_from_name(
-            "/path/to/Sender - Spaced Title.{}", "Sender", "Spaced Title", ())
+            "Sender - Spaced Title.pdf", "Sender", "Spaced Title", ())

    def test_guess_attributes_from_name3(self):
        self._test_guess_attributes_from_name(
-            "/path/to/Dashed-Sender - Title.{}", "Dashed-Sender", "Title", ())
+            "Dashed-Sender - Title.pdf", "Dashed-Sender", "Title", ())

    def test_guess_attributes_from_name4(self):
        self._test_guess_attributes_from_name(
-            "/path/to/Sender - Dashed-Title.{}", "Sender", "Dashed-Title", ())
+            "Sender - Dashed-Title.pdf", "Sender", "Dashed-Title", ())

    def test_guess_attributes_from_name5(self):
        self._test_guess_attributes_from_name(
-            "/path/to/Sender - Title - tag1,tag2,tag3.{}",
+            "Sender - Title - tag1,tag2,tag3.pdf",
            "Sender",
            "Title",
            self.TAGS
@@ -73,7 +58,7 @@ class TestAttributes(TestCase):

    def test_guess_attributes_from_name6(self):
        self._test_guess_attributes_from_name(
-            "/path/to/Spaced Sender - Title - tag1,tag2,tag3.{}",
+            "Spaced Sender - Title - tag1,tag2,tag3.pdf",
            "Spaced Sender",
            "Title",
            self.TAGS
@@ -81,7 +66,7 @@ class TestAttributes(TestCase):

    def test_guess_attributes_from_name7(self):
        self._test_guess_attributes_from_name(
-            "/path/to/Sender - Spaced Title - tag1,tag2,tag3.{}",
+            "Sender - Spaced Title - tag1,tag2,tag3.pdf",
            "Sender",
            "Spaced Title",
            self.TAGS
@@ -89,7 +74,7 @@ class TestAttributes(TestCase):

    def test_guess_attributes_from_name8(self):
        self._test_guess_attributes_from_name(
-            "/path/to/Dashed-Sender - Title - tag1,tag2,tag3.{}",
+            "Dashed-Sender - Title - tag1,tag2,tag3.pdf",
            "Dashed-Sender",
            "Title",
            self.TAGS
@@ -97,7 +82,7 @@ class TestAttributes(TestCase):

    def test_guess_attributes_from_name9(self):
        self._test_guess_attributes_from_name(
-            "/path/to/Sender - Dashed-Title - tag1,tag2,tag3.{}",
+            "Sender - Dashed-Title - tag1,tag2,tag3.pdf",
            "Sender",
            "Dashed-Title",
            self.TAGS
@@ -105,7 +90,7 @@ class TestAttributes(TestCase):

    def test_guess_attributes_from_name10(self):
        self._test_guess_attributes_from_name(
-            "/path/to/Σενδερ - Τιτλε - tag1,tag2,tag3.{}",
+            "Σενδερ - Τιτλε - tag1,tag2,tag3.pdf",
            "Σενδερ",
            "Τιτλε",
            self.TAGS
@@ -113,7 +98,7 @@ class TestAttributes(TestCase):

    def test_guess_attributes_from_name_when_correspondent_empty(self):
        self._test_guess_attributes_from_name(
-            '/path/to/ - weird empty correspondent but should not break.{}',
+            ' - weird empty correspondent but should not break.pdf',
            None,
            'weird empty correspondent but should not break',
            ()
@@ -121,7 +106,7 @@ class TestAttributes(TestCase):

    def test_guess_attributes_from_name_when_title_starts_with_dash(self):
        self._test_guess_attributes_from_name(
-            '/path/to/- weird but should not break.{}',
+            '- weird but should not break.pdf',
            None,
            '- weird but should not break',
            ()
@@ -129,7 +114,7 @@ class TestAttributes(TestCase):

    def test_guess_attributes_from_name_when_title_ends_with_dash(self):
        self._test_guess_attributes_from_name(
-            '/path/to/weird but should not break -.{}',
+            'weird but should not break -.pdf',
            None,
            'weird but should not break -',
            ()
@@ -137,7 +122,7 @@ class TestAttributes(TestCase):

    def test_guess_attributes_from_name_when_title_is_empty(self):
        self._test_guess_attributes_from_name(
-            '/path/to/weird correspondent but should not break - .{}',
+            'weird correspondent but should not break - .pdf',
            'weird correspondent but should not break',
            '',
            ()
@@ -149,11 +134,11 @@ class TestAttributes(TestCase):
        :return:
        """

-        path = "Title - Correspondent - tAg1,TAG2.pdf"
-        self.assertEqual(len(FileInfo.from_path(path).tags), 2)
+        filename = "Title - Correspondent - tAg1,TAG2.pdf"
+        self.assertEqual(len(FileInfo.from_filename(filename).tags), 2)

        path = "Title - Correspondent - tag1,tag2.pdf"
-        self.assertEqual(len(FileInfo.from_path(path).tags), 2)
+        self.assertEqual(len(FileInfo.from_filename(path).tags), 2)

        self.assertEqual(Tag.objects.all().count(), 2)

@@ -173,13 +158,12 @@ class TestFieldPermutations(TestCase):
    ]
    valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""]
    valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]
-    valid_extensions = ["pdf", "png", "jpg", "jpeg", "gif"]

    def _test_guessed_attributes(self, filename, created=None,
                                 correspondent=None, title=None,
-                                 extension=None, tags=None):
+                                 tags=None):

-        info = FileInfo.from_path(filename)
+        info = FileInfo.from_filename(filename)

        # Created
        if created is None:
@@ -207,68 +191,56 @@ class TestFieldPermutations(TestCase):
                filename
            )

-        # Extension
-        if extension == 'jpeg':
-            extension = 'jpg'
-        self.assertEqual(info.extension, extension, filename)
-
    def test_just_title(self):
-        template = '/path/to/{title}.{extension}'
+        template = '{title}.pdf'
        for title in self.valid_titles:
-            for extension in self.valid_extensions:
-                spec = dict(title=title, extension=extension)
+            spec = dict(title=title)
+            filename = template.format(**spec)
+            self._test_guessed_attributes(filename, **spec)
+
+    def test_title_and_correspondent(self):
+        template = '{correspondent} - {title}.pdf'
+        for correspondent in self.valid_correspondents:
+            for title in self.valid_titles:
+                spec = dict(correspondent=correspondent, title=title)
                filename = template.format(**spec)
                self._test_guessed_attributes(filename, **spec)

-    def test_title_and_correspondent(self):
-        template = '/path/to/{correspondent} - {title}.{extension}'
-        for correspondent in self.valid_correspondents:
-            for title in self.valid_titles:
-                for extension in self.valid_extensions:
-                    spec = dict(correspondent=correspondent, title=title,
-                                extension=extension)
-                    filename = template.format(**spec)
-                    self._test_guessed_attributes(filename, **spec)
-
    def test_title_and_correspondent_and_tags(self):
-        template = '/path/to/{correspondent} - {title} - {tags}.{extension}'
+        template = '{correspondent} - {title} - {tags}.pdf'
        for correspondent in self.valid_correspondents:
            for title in self.valid_titles:
                for tags in self.valid_tags:
-                    for extension in self.valid_extensions:
-                        spec = dict(correspondent=correspondent, title=title,
-                                    tags=tags, extension=extension)
-                        filename = template.format(**spec)
-                        self._test_guessed_attributes(filename, **spec)
+                    spec = dict(correspondent=correspondent, title=title,
+                                tags=tags)
+                    filename = template.format(**spec)
+                    self._test_guessed_attributes(filename, **spec)

    def test_created_and_correspondent_and_title_and_tags(self):

        template = (
-            "/path/to/{created} - "
+            "{created} - "
            "{correspondent} - "
            "{title} - "
-            "{tags}"
-            ".{extension}"
+            "{tags}.pdf"
        )

        for created in self.valid_dates:
            for correspondent in self.valid_correspondents:
                for title in self.valid_titles:
                    for tags in self.valid_tags:
-                        for extension in self.valid_extensions:
-                            spec = {
-                                "created": created,
-                                "correspondent": correspondent,
-                                "title": title,
-                                "tags": tags,
-                                "extension": extension
-                            }
-                            self._test_guessed_attributes(
-                                template.format(**spec), **spec)
+                        spec = {
+                            "created": created,
+                            "correspondent": correspondent,
+                            "title": title,
+                            "tags": tags,
+                        }
+                        self._test_guessed_attributes(
+                            template.format(**spec), **spec)

    def test_created_and_correspondent_and_title(self):

-        template = "/path/to/{created} - {correspondent} - {title}.{extension}"
+        template = "{created} - {correspondent} - {title}.pdf"

        for created in self.valid_dates:
            for correspondent in self.valid_correspondents:
@@ -279,56 +251,50 @@ class TestFieldPermutations(TestCase):
                    if title.lower() == title:
                        continue

-                    for extension in self.valid_extensions:
-                        spec = {
-                            "created": created,
-                            "correspondent": correspondent,
-                            "title": title,
-                            "extension": extension
-                        }
-                        self._test_guessed_attributes(
-                            template.format(**spec), **spec)
-
-    def test_created_and_title(self):
-
-        template = "/path/to/{created} - {title}.{extension}"
-
-        for created in self.valid_dates:
-            for title in self.valid_titles:
-                for extension in self.valid_extensions:
                    spec = {
                        "created": created,
-                        "title": title,
-                        "extension": extension
+                        "correspondent": correspondent,
+                        "title": title
                    }
                    self._test_guessed_attributes(
                        template.format(**spec), **spec)

+    def test_created_and_title(self):
+
+        template = "{created} - {title}.pdf"
+
+        for created in self.valid_dates:
+            for title in self.valid_titles:
+                spec = {
+                    "created": created,
+                    "title": title
+                }
+                self._test_guessed_attributes(
+                    template.format(**spec), **spec)
+
    def test_created_and_title_and_tags(self):

-        template = "/path/to/{created} - {title} - {tags}.{extension}"
+        template = "{created} - {title} - {tags}.pdf"

        for created in self.valid_dates:
            for title in self.valid_titles:
                for tags in self.valid_tags:
-                    for extension in self.valid_extensions:
-                        spec = {
-                            "created": created,
-                            "title": title,
-                            "tags": tags,
-                            "extension": extension
-                        }
-                        self._test_guessed_attributes(
-                            template.format(**spec), **spec)
+                    spec = {
+                        "created": created,
+                        "title": title,
+                        "tags": tags
+                    }
+                    self._test_guessed_attributes(
+                        template.format(**spec), **spec)

    def test_invalid_date_format(self):
-        info = FileInfo.from_path("/path/to/06112017Z - title.pdf")
+        info = FileInfo.from_filename("06112017Z - title.pdf")
        self.assertEqual(info.title, "title")
        self.assertIsNone(info.created)

    def test_filename_parse_transforms(self):

-        path = "/some/path/to/tag1,tag2_20190908_180610_0001.pdf"
+        filename = "tag1,tag2_20190908_180610_0001.pdf"
        all_patt = re.compile("^.*$")
        none_patt = re.compile("$a")
        exact_patt = re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")
@@ -336,50 +302,44 @@ class TestFieldPermutations(TestCase):
        repl2 = "\\2Z - " + repl1  # creation date + repl1

        # No transformations configured (= default)
-        info = FileInfo.from_path(path)
+        info = FileInfo.from_filename(filename)
        self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
-        self.assertEqual(info.extension, "pdf")
        self.assertEqual(info.tags, ())
        self.assertIsNone(info.created)

        # Pattern doesn't match (filename unaltered)
        with self.settings(
                FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
-            info = FileInfo.from_path(path)
+            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
-            self.assertEqual(info.extension, "pdf")

        # Simple transformation (match all)
        with self.settings(
                FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
-            info = FileInfo.from_path(path)
+            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "all")
-            self.assertEqual(info.extension, "gif")

        # Multiple transformations configured (first pattern matches)
        with self.settings(
                FILENAME_PARSE_TRANSFORMS=[
                    (all_patt, "all.gif"),
                    (all_patt, "anotherall.gif")]):
-            info = FileInfo.from_path(path)
+            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "all")
-            self.assertEqual(info.extension, "gif")

        # Multiple transformations configured (second pattern matches)
        with self.settings(
                FILENAME_PARSE_TRANSFORMS=[
                    (none_patt, "none.gif"),
                    (all_patt, "anotherall.gif")]):
-            info = FileInfo.from_path(path)
+            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "anotherall")
-            self.assertEqual(info.extension, "gif")

        # Complex transformation without date in replacement string
        with self.settings(
                FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]):
-            info = FileInfo.from_path(path)
+            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "0001")
-            self.assertEqual(info.extension, "pdf")
            self.assertEqual(len(info.tags), 2)
            self.assertEqual(info.tags[0].slug, "tag1")
            self.assertEqual(info.tags[1].slug, "tag2")
@@ -392,9 +352,8 @@ class TestFieldPermutations(TestCase):
                (exact_patt, repl2),    # <-- matches
                (exact_patt, repl1),
                (all_patt, "all.gif")]):
-            info = FileInfo.from_path(path)
+            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "0001")
-            self.assertEqual(info.extension, "pdf")
            self.assertEqual(len(info.tags), 2)
            self.assertEqual(info.tags[0].slug, "tag1")
            self.assertEqual(info.tags[1].slug, "tag2")
@@ -437,6 +396,18 @@ class FaultyParser(DocumentParser):
        raise ParseError("Does not compute.")


+def fake_magic_from_file(file, mime=False):
+    """Extension-based stand-in for ``magic.from_file`` used by the tests."""
+    if not mime:
+        return "A verbose string that describes the contents of the file"
+    # Only a ".pdf" suffix maps to a real mime type.
+    _, extension = os.path.splitext(file)
+    if extension == ".pdf":
+        return "application/pdf"
+    return "unknown"
+
+
+@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
 class TestConsumer(TestCase):

    def make_dummy_parser(self, path, logging_group):
@@ -462,7 +433,7 @@ class TestConsumer(TestCase):
        m = patcher.start()
        m.return_value = [(None, {
            "parser": self.make_dummy_parser,
-            "test": lambda _: True,
+            "mime_types": ["application/pdf"],
            "weight": 0
        })]

@@ -592,7 +563,7 @@ class TestConsumer(TestCase):
    def testFaultyParser(self, m):
        m.return_value = [(None, {
            "parser": self.make_faulty_parser,
-            "test": lambda _: True,
+            "mime_types": ["application/pdf"],
            "weight": 0
        })]

--- a/src/documents/tests/test_document_model.py
+++ b/src/documents/tests/test_document_model.py
@@ -13,9 +13,12 @@ class TestDocument(TestCase):
            title="Title",
            content="content",
            checksum="checksum",
+            mime_type="application/pdf"
        )
+
        file_path = document.source_path
        thumb_path = document.thumbnail_path
+
        with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
            document.delete()
            mock_unlink.assert_any_call(file_path)
--- a/src/documents/tests/test_file_handling.py
+++ b/src/documents/tests/test_file_handling.py
@@ -31,7 +31,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="")
    def test_generate_source_filename(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

@@ -44,7 +44,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_file_renaming(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

@@ -81,7 +81,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_file_renaming_missing_permissions(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

@@ -111,10 +111,10 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_file_renaming_database_error(self):

-        document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
+        document1 = Document.objects.create(mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")

        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.checksum = "BBBBB"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()
@@ -149,7 +149,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_document_delete(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

@@ -170,7 +170,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_document_delete_nofile(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

@@ -179,7 +179,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_directory_not_empty(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

@@ -206,7 +206,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
    def test_tags_with_underscore(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

@@ -222,7 +222,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
    def test_tags_with_dash(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

@@ -238,7 +238,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
    def test_tags_malformed(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

@@ -254,7 +254,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
    def test_tags_all(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

@@ -269,7 +269,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}")
    def test_tags_out_of_bounds(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

@@ -284,7 +284,7 @@ class TestDate(TestCase):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
    def test_nested_directory_cleanup(self):
        document = Document()
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

@@ -309,7 +309,7 @@ class TestDate(TestCase):
    def test_format_none(self):
        document = Document()
        document.pk = 1
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

        self.assertEqual(generate_filename(document), "0000001.pdf")
@@ -335,7 +335,7 @@ class TestDate(TestCase):
    def test_invalid_format(self):
        document = Document()
        document.pk = 1
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

        self.assertEqual(generate_filename(document), "0000001.pdf")
@@ -344,7 +344,7 @@ class TestDate(TestCase):
    def test_invalid_format_key(self):
        document = Document()
        document.pk = 1
-        document.file_type = "pdf"
+        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

        self.assertEqual(generate_filename(document), "0000001.pdf")
--- a/src/documents/tests/test_index.py
+++ b/src/documents/tests/test_index.py
@@ -0,0 +1,14 @@
+from django.test import TestCase
+
+from documents.index import JsonFormatter
+
+
+class JsonFormatterTest(TestCase):
+    """Tests for ``documents.index.JsonFormatter``."""
+
+    def setUp(self) -> None:
+        self.formatter = JsonFormatter()
+
+    def test_empty_fragments(self):
+        # An empty fragment list must format to an empty list.
+        self.assertListEqual(self.formatter.format([]), [])
--- a/src/documents/tests/test_matchables.py
+++ b/src/documents/tests/test_matchables.py
@@ -213,7 +213,7 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
        TestCase.setUp(self)
        User.objects.create_user(username='test_consumer', password='12345')
        self.doc_contains = Document.objects.create(
-            content="I contain the keyword.", file_type="pdf")
+            content="I contain the keyword.", mime_type="application/pdf")

    def test_tag_applied_any(self):
        t1 = Tag.objects.create(
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -1,3 +1,4 @@
+import os
 from tempfile import TemporaryDirectory
 from unittest import mock

@@ -6,6 +7,18 @@ from django.test import TestCase
 from documents.parsers import get_parser_class


+def fake_magic_from_file(file, mime=False):
+    """Fake ``magic.from_file`` that only looks at the file extension."""
+    if mime:
+        is_pdf = os.path.splitext(file)[1] == ".pdf"
+        return "application/pdf" if is_pdf else "unknown"
+
+    # Without mime=True a fixed description is returned, whatever the
+    # file is.
+    return "A verbose string that describes the contents of the file"
+
+
+@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
 class TestParserDiscovery(TestCase):

    @mock.patch("documents.parsers.document_consumer_declaration.send")
@@ -14,7 +27,7 @@ class TestParserDiscovery(TestCase):
            pass

        m.return_value = (
-            (None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
+            (None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}),
        )

        self.assertEqual(
@@ -32,8 +45,8 @@ class TestParserDiscovery(TestCase):
            pass

        m.return_value = (
-            (None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
-            (None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
+            (None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}),
+            (None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}),
        )

        self.assertEqual(
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -47,18 +47,30 @@ class IndexView(TemplateView):

 class CorrespondentViewSet(ModelViewSet):
    model = Correspondent
-    queryset = Correspondent.objects.annotate(document_count=Count('documents'), last_correspondence=Max('documents__created')).order_by('name')
+
+    queryset = Correspondent.objects.annotate(
+        document_count=Count('documents'),
+        last_correspondence=Max('documents__created')).order_by('name')
+
    serializer_class = CorrespondentSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
    filter_backends = (DjangoFilterBackend, OrderingFilter)
    filterset_class = CorrespondentFilterSet
-    ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")
+    ordering_fields = (
+        "name",
+        "matching_algorithm",
+        "match",
+        "document_count",
+        "last_correspondence")


 class TagViewSet(ModelViewSet):
    model = Tag
-    queryset = Tag.objects.annotate(document_count=Count('documents')).order_by('name')
+
+    queryset = Tag.objects.annotate(
+        document_count=Count('documents')).order_by('name')
+
    serializer_class = TagSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
@@ -69,7 +81,10 @@ class TagViewSet(ModelViewSet):

 class DocumentTypeViewSet(ModelViewSet):
    model = DocumentType
-    queryset = DocumentType.objects.annotate(document_count=Count('documents')).order_by('name')
+
+    queryset = DocumentType.objects.annotate(
+        document_count=Count('documents')).order_by('name')
+
    serializer_class = DocumentTypeSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
@@ -92,10 +107,18 @@ class DocumentViewSet(RetrieveModelMixin,
    filterset_class = DocumentFilterSet
    search_fields = ("title", "correspondent__name", "content")
    ordering_fields = (
-        "id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
+        "id",
+        "title",
+        "correspondent__name",
+        "document_type__name",
+        "created",
+        "modified",
+        "added",
+        "archive_serial_number")

    def update(self, request, *args, **kwargs):
-        response = super(DocumentViewSet, self).update(request, *args, **kwargs)
+        response = super(DocumentViewSet, self).update(
+            request, *args, **kwargs)
        index.add_or_update_document(self.get_object())
        return response

@@ -104,18 +127,6 @@ class DocumentViewSet(RetrieveModelMixin,
        return super(DocumentViewSet, self).destroy(request, *args, **kwargs)

    def file_response(self, pk, disposition):
-        # TODO: this should not be necessary here.
-        content_types = {
-            Document.TYPE_PDF: "application/pdf",
-            Document.TYPE_PNG: "image/png",
-            Document.TYPE_JPG: "image/jpeg",
-            Document.TYPE_GIF: "image/gif",
-            Document.TYPE_TIF: "image/tiff",
-            Document.TYPE_CSV: "text/csv",
-            Document.TYPE_MD: "text/markdown",
-            Document.TYPE_TXT: "text/plain"
-        }
-
        doc = Document.objects.get(id=pk)

        if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
@@ -123,7 +134,7 @@ class DocumentViewSet(RetrieveModelMixin,
        else:
            file_handle = GnuPG.decrypted(doc.source_file)

-        response = HttpResponse(file_handle, content_type=content_types[doc.file_type])
+        response = HttpResponse(file_handle, content_type=doc.mime_type)
        response["Content-Disposition"] = '{}; filename="{}"'.format(
            disposition, doc.file_name)
        return response
@@ -150,7 +161,8 @@ class DocumentViewSet(RetrieveModelMixin,
    @cache_control(public=False, max_age=315360000)
    def thumb(self, request, pk=None):
        try:
-            return HttpResponse(Document.objects.get(id=pk).thumbnail_file, content_type='image/png')
+            return HttpResponse(Document.objects.get(id=pk).thumbnail_file,
+                                content_type='image/png')
        except FileNotFoundError:
            raise Http404("Document thumbnail does not exist")

@@ -242,5 +254,6 @@ class StatisticsView(APIView):
    def get(self, request, format=None):
        return Response({
            'documents_total': Document.objects.all().count(),
-            'documents_inbox': Document.objects.filter(tags__is_inbox_tag=True).distinct().count()
+            'documents_inbox': Document.objects.filter(
+                tags__is_inbox_tag=True).distinct().count()
        })
--- a/src/paperless/auth.py
+++ b/src/paperless/auth.py
@@ -9,7 +9,7 @@ class AngularApiAuthenticationOverride(authentication.BaseAuthentication):
    """

    def authenticate(self, request):
-        if settings.DEBUG and 'Referer' in request.headers and request.headers['Referer'].startswith('http://localhost:4200/'):
+        if settings.DEBUG and 'Referer' in request.headers and request.headers['Referer'].startswith('http://localhost:4200/'):  # NOQA: E501
            user = User.objects.filter(is_staff=True).first()
            print("Auto-Login with user {}".format(user))
            return (user, None)
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -334,6 +334,8 @@ CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES

 OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")

+OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0))
+
 # The default language that tesseract will attempt to use when parsing
 # documents.  It should be a 3-letter language code consistent with ISO 639.
 OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -29,43 +29,56 @@ api_router.register(r"tags", TagViewSet)


 urlpatterns = [
+    re_path(r"^api/", include([
+        re_path(r"^auth/",
+                include(('rest_framework.urls', 'rest_framework'),
+                        namespace="rest_framework")),

-    # API
-    re_path(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
-    re_path(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
-    re_path(r"^api/search/", SearchView.as_view(), name="search"),
-    re_path(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
-    re_path(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
+        re_path(r"^search/autocomplete/",
+                SearchAutoCompleteView.as_view(),
+                name="autocomplete"),
+
+        re_path(r"^search/",
+                SearchView.as_view(),
+                name="search"),
+
+        re_path(r"^statistics/",
+                StatisticsView.as_view(),
+                name="statistics"),
+
+    ] + api_router.urls)),

-    # Favicon
    re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),

-    # The Django admin
    re_path(r"admin/", admin.site.urls),

-    # These redirects are here to support clients that use the old FetchView.
-    re_path(
-        r"^fetch/doc/(?P<pk>\d+)$",
-        RedirectView.as_view(url='/api/documents/%(pk)s/download/'),
-    ),
-    re_path(
-        r"^fetch/thumb/(?P<pk>\d+)$",
-        RedirectView.as_view(url='/api/documents/%(pk)s/thumb/'),
-    ),
-    re_path(
-        r"^fetch/preview/(?P<pk>\d+)$",
-        RedirectView.as_view(url='/api/documents/%(pk)s/preview/'),
-    ),
-    re_path(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
+    re_path(r"^fetch/", include([
+        re_path(
+            r"^doc/(?P<pk>\d+)$",
+            RedirectView.as_view(url='/api/documents/%(pk)s/download/'),
+        ),
+        re_path(
+            r"^thumb/(?P<pk>\d+)$",
+            RedirectView.as_view(url='/api/documents/%(pk)s/thumb/'),
+        ),
+        re_path(
+            r"^preview/(?P<pk>\d+)$",
+            RedirectView.as_view(url='/api/documents/%(pk)s/preview/'),
+        ),
+    ])),

-    # Frontend assets TODO: this is pretty bad.
-    path('assets/<path:path>', RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
+    re_path(r"^push$", csrf_exempt(
+        RedirectView.as_view(url='/api/documents/post_document/'))),

+    # Frontend assets TODO: this is pretty bad, but it works.
+    path('assets/<path:path>',
+         RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
+
+    # login, logout
    path('accounts/', include('django.contrib.auth.urls')),

-    # Root of the Frontent
+    # Root of the Frontend
    re_path(r".*", login_required(IndexView.as_view())),
-
 ]


--- a/src/paperless_mail/admin.py
+++ b/src/paperless_mail/admin.py
@@ -9,9 +9,61 @@ class MailAccountAdmin(admin.ModelAdmin):

 class MailRuleAdmin(admin.ModelAdmin):

+    radio_fields = {
+        "action": admin.VERTICAL,
+        "assign_title_from": admin.VERTICAL,
+        "assign_correspondent_from": admin.VERTICAL
+    }
+
+    fieldsets = (
+        (None, {
+            'fields': ('name', 'order', 'account', 'folder')
+        }),
+        ("Filter", {
+            'description':
+                "Paperless will only process mails that match ALL of the "
+                "filters given below.",
+            'fields':
+                ('filter_from',
+                 'filter_subject',
+                 'filter_body',
+                 'maximum_age')
+        }),
+        ("Actions", {
+            'description':
+                "The action applied to the mail. This action is only "
+                "performed when documents were consumed from the mail. Mails "
+                "without attachments will remain entirely untouched.",
+            'fields': (
+                'action',
+                'action_parameter')
+        }),
+        ("Metadata", {
+            'description':
+                "Assign metadata to documents consumed from this rule "
+                "automatically. If you do not assign tags, types or "
+                "correspondents here, paperless will still process all "
+                "matching rules that you have defined.",
+            "fields": (
+                'assign_title_from',
+                'assign_tag',
+                'assign_document_type',
+                'assign_correspondent_from',
+                'assign_correspondent')
+        })
+    )
+
    list_filter = ("account",)

-    list_display = ("name", "account", "folder", "action")
+    list_display = ("order", "name", "account", "folder", "action")
+
+    list_editable = ("order", )
+
+    list_display_links = ("name", )
+
+    sortable_by = []
+
+    ordering = ["order"]


 admin.site.register(MailAccount, MailAccountAdmin)
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@@ -2,6 +2,7 @@ import os
 import tempfile
 from datetime import timedelta, date

+import magic
 from django.conf import settings
 from django.utils.text import slugify
 from django_q.tasks import async_task
@@ -10,6 +11,7 @@ from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \

 from documents.loggers import LoggingMixin
 from documents.models import Correspondent
+from documents.parsers import is_mime_type_supported
 from paperless_mail.models import MailAccount, MailRule


@@ -159,7 +161,7 @@ class MailAccountHandler(LoggingMixin):
            self.log('debug', f"Account {account}: Processing "
                              f"{account.rules.count()} rule(s)")

-            for rule in account.rules.all():
+            for rule in account.rules.order_by('order'):
                self.log(
                    'debug',
                    f"Account {account}: Processing rule {rule.name}")
@@ -172,8 +174,8 @@ class MailAccountHandler(LoggingMixin):
                    M.folder.set(rule.folder)
                except MailboxFolderSelectError:
                    raise MailError(
-                        f"Rule {rule.name}: Folder {rule.folder} does not exist "
-                        f"in account {account.name}")
+                        f"Rule {rule.name}: Folder {rule.folder} "
+                        f"does not exist in account {account.name}")

                criterias = make_criterias(rule)

@@ -183,7 +185,8 @@ class MailAccountHandler(LoggingMixin):
                    f"{str(AND(**criterias))}")

                try:
-                    messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
+                    messages = M.fetch(criteria=AND(**criterias),
+                                       mark_seen=False)
                except Exception:
                    raise MailError(
                        f"Rule {rule.name}: Error while fetching folder "
@@ -224,8 +227,8 @@ class MailAccountHandler(LoggingMixin):

                except Exception:
                    raise MailError(
-                        f"Rule {rule.name}: Error while processing post-consume "
-                        f"actions for account {account.name}")
+                        f"Rule {rule.name}: Error while processing "
+                        f"post-consume actions for account {account.name}")

        return total_processed_files

@@ -247,13 +250,25 @@ class MailAccountHandler(LoggingMixin):

        for att in message.attachments:

+            # Anything not explicitly marked as an attachment is skipped.
+            if att.content_disposition != "attachment":
+                self.log('debug',
+                         f"Rule {rule.account}.{rule}: Skipping attachment "
+                         f"{att.filename} with content disposition "
+                         f"{att.content_disposition}")
+                continue
+
            title = get_title(message, att, rule)

-            # TODO: check with parsers what files types are supported
-            if att.content_type == 'application/pdf':
+            # don't trust the content type of the attachment. Could be
+            # generic application/octet-stream.
+            mime_type = magic.from_buffer(att.payload, mime=True)
+
+            if is_mime_type_supported(mime_type):

                os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
-                _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
-                with open(temp_filename, 'wb') as f:
+                fd, temp_filename = tempfile.mkstemp(prefix="paperless-mail-",
+                                                     dir=settings.SCRATCH_DIR)
+                with os.fdopen(fd, 'wb') as f:  # closes mkstemp's descriptor
                    f.write(att.payload)

@@ -268,12 +283,19 @@ class MailAccountHandler(LoggingMixin):
                    path=temp_filename,
                    override_filename=att.filename,
                    override_title=title,
-                    override_correspondent_id=correspondent.id if correspondent else None,
-                    override_document_type_id=doc_type.id if doc_type else None,
+                    override_correspondent_id=correspondent.id if correspondent else None,  # NOQA: E501
+                    override_document_type_id=doc_type.id if doc_type else None,  # NOQA: E501
                    override_tag_ids=[tag.id] if tag else None,
-                    task_name=f"Mail: {att.filename}"
+                    task_name=att.filename[:100]
                )

                processed_attachments += 1
+            else:
+                self.log(
+                    'debug',
+                    f"Rule {rule.account}.{rule}: "
+                    f"Skipping attachment {att.filename} "
+                    f"since guessed mime type {mime_type} is not supported "
+                    f"by paperless")

        return processed_attachments
--- a/src/paperless_mail/migrations/0004_mailrule_order.py
+++ b/src/paperless_mail/migrations/0004_mailrule_order.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2020-11-21 21:51
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('paperless_mail', '0003_auto_20201118_1940'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='mailrule',
+            name='order',
+            field=models.IntegerField(default=0),
+        ),
+    ]
--- a/src/paperless_mail/migrations/0005_help_texts.py
+++ b/src/paperless_mail/migrations/0005_help_texts.py
@@ -0,0 +1,23 @@
+# Generated by Django 3.1.3 on 2020-11-22 10:36
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('paperless_mail', '0004_mailrule_order'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='mailrule',
+            name='action',
+            field=models.PositiveIntegerField(choices=[(3, "Mark as read, don't process read mails"), (4, "Flag the mail, don't process flagged mails"), (2, 'Move to specified folder'), (1, 'Delete')], default=3),
+        ),
+        migrations.AlterField(
+            model_name='mailrule',
+            name='maximum_age',
+            field=models.PositiveIntegerField(default=30, help_text='Specified in days.'),
+        ),
+    ]
--- a/src/paperless_mail/models.py
+++ b/src/paperless_mail/models.py
@@ -46,10 +46,10 @@ class MailRule(models.Model):
    ACTION_FLAG = 4

    ACTIONS = (
-        (ACTION_DELETE, "Delete"),
-        (ACTION_MOVE, "Move to specified folder"),
        (ACTION_MARK_READ, "Mark as read, don't process read mails"),
-        (ACTION_FLAG, "Flag the mail, don't process flagged mails")
+        (ACTION_FLAG, "Flag the mail, don't process flagged mails"),
+        (ACTION_MOVE, "Move to specified folder"),
+        (ACTION_DELETE, "Delete"),
    )

    TITLE_FROM_SUBJECT = 1
@@ -66,14 +66,20 @@ class MailRule(models.Model):
    CORRESPONDENT_FROM_CUSTOM = 4

    CORRESPONDENT_SELECTOR = (
-        (CORRESPONDENT_FROM_NOTHING, "Do not assign a correspondent"),
-        (CORRESPONDENT_FROM_EMAIL, "Use mail address"),
-        (CORRESPONDENT_FROM_NAME, "Use name (or mail address if not available)"),
-        (CORRESPONDENT_FROM_CUSTOM, "Use correspondent selected below")
+        (CORRESPONDENT_FROM_NOTHING,
+         "Do not assign a correspondent"),
+        (CORRESPONDENT_FROM_EMAIL,
+         "Use mail address"),
+        (CORRESPONDENT_FROM_NAME,
+         "Use name (or mail address if not available)"),
+        (CORRESPONDENT_FROM_CUSTOM,
+         "Use correspondent selected below")
    )

    name = models.CharField(max_length=256, unique=True)

+    order = models.IntegerField(default=0)
+
    account = models.ForeignKey(
        MailAccount,
        related_name="rules",
@@ -86,15 +92,13 @@ class MailRule(models.Model):
    filter_subject = models.CharField(max_length=256, null=True, blank=True)
    filter_body = models.CharField(max_length=256, null=True, blank=True)

-    maximum_age = models.PositiveIntegerField(default=30)
+    maximum_age = models.PositiveIntegerField(
+        default=30,
+        help_text="Specified in days.")

    action = models.PositiveIntegerField(
        choices=ACTIONS,
        default=ACTION_MARK_READ,
-        help_text="The action applied to the mail. This action is only "
-                  "performed when documents were consumed from the mail. "
-                  "Mails without attachments will remain entirely "
-                  "untouched."
    )

    action_parameter = models.CharField(
--- a/src/paperless_mail/tasks.py
+++ b/src/paperless_mail/tasks.py
@@ -7,7 +7,8 @@ from paperless_mail.models import MailAccount
 def process_mail_accounts():
    total_new_documents = 0
    for account in MailAccount.objects.all():
-        total_new_documents += MailAccountHandler().handle_mail_account(account)
+        total_new_documents += MailAccountHandler().handle_mail_account(
+            account)

    if total_new_documents > 0:
        return f"Added {total_new_documents} document(s)."
--- a/src/paperless_mail/tests/test_mail.py
+++ b/src/paperless_mail/tests/test_mail.py
@@ -99,11 +99,7 @@ def create_message(num_attachments=1, body="", subject="the suject", from_="noon
    message.from_ = from_
    message.body = body
    for i in range(num_attachments):
-        attachment = namedtuple('Attachment', [])
-        attachment.filename = 'some_file.pdf'
-        attachment.content_type = 'application/pdf'
-        attachment.payload = b'content of the attachment'
-        message.attachments.append(attachment)
+        message.attachments.append(create_attachment(filename=f"file_{i}.pdf"))

    message.seen = seen
    message.flagged = flagged
@@ -111,6 +107,26 @@ def create_message(num_attachments=1, body="", subject="the suject", from_="noon
    return message


+def create_attachment(filename="the_file.pdf", content_disposition="attachment", payload=b"a PDF document"):
+    # Fake mail attachment: a throwaway namedtuple class used purely as an attribute bag.
+    fake = namedtuple('Attachment', [])
+    for attr, value in (("filename", filename), ("content_disposition", content_disposition), ("payload", payload)):
+        setattr(fake, attr, value)
+    return fake
+
+
+def fake_magic_from_buffer(buffer, mime=False):
+    # Test double patched in for magic.from_buffer; only "PDF"-bearing payloads are known.
+    if not mime:
+        return 'Some verbose file description'
+
+    # str() lets the substring check work on bytes payloads as well as on str.
+    if 'PDF' in str(buffer):
+        return 'application/pdf'
+    return 'unknown/type'
+
+
+@mock.patch('paperless_mail.mail.magic.from_buffer', fake_magic_from_buffer)
 class TestMail(TestCase):

    def setUp(self):
@@ -182,26 +198,7 @@ class TestMail(TestCase):
        self.assertEqual(get_title(message, att, rule), "the message title")

    def test_handle_message(self):
-        message = namedtuple('MailMessage', [])
-        message.subject = "the message title"
-        message.from_ = "Myself"
-
-        att = namedtuple('Attachment', [])
-        att.filename = "test1.pdf"
-        att.content_type = 'application/pdf'
-        att.payload = b"attachment contents"
-
-        att2 = namedtuple('Attachment', [])
-        att2.filename = "test2.pdf"
-        att2.content_type = 'application/pdf'
-        att2.payload = b"attachment contents"
-
-        att3 = namedtuple('Attachment', [])
-        att3.filename = "test3.pdf"
-        att3.content_type = 'application/invalid'
-        att3.payload = b"attachment contents"
-
-        message.attachments = [att, att2, att3]
+        message = create_message(subject="the message title", from_="Myself", num_attachments=2)

        account = MailAccount()
        rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME, account=account)
@@ -215,14 +212,13 @@ class TestMail(TestCase):
        args1, kwargs1 = self.async_task.call_args_list[0]
        args2, kwargs2 = self.async_task.call_args_list[1]

-        self.assertEqual(kwargs1['override_title'], "test1")
-        self.assertEqual(kwargs1['override_filename'], "test1.pdf")
+        self.assertEqual(kwargs1['override_title'], "file_0")
+        self.assertEqual(kwargs1['override_filename'], "file_0.pdf")

-        self.assertEqual(kwargs2['override_title'], "test2")
-        self.assertEqual(kwargs2['override_filename'], "test2.pdf")
+        self.assertEqual(kwargs2['override_title'], "file_1")
+        self.assertEqual(kwargs2['override_filename'], "file_1.pdf")

-    @mock.patch("paperless_mail.mail.async_task")
-    def test_handle_empty_message(self, m):
+    def test_handle_empty_message(self):
        message = namedtuple('MailMessage', [])

        message.attachments = []
@@ -230,9 +226,45 @@ class TestMail(TestCase):

        result = self.mail_account_handler.handle_message(message, rule)

-        self.assertFalse(m.called)
+        self.assertFalse(self.async_task.called)
        self.assertEqual(result, 0)

+    def test_handle_unknown_mime_type(self):
+        # Two attachments, but only the one whose payload is detected as a PDF may be consumed.
+        pdf_part = create_attachment(filename="f1.pdf")
+        json_part = create_attachment(filename="f2.json", payload=b"{'much': 'payload.', 'so': 'json', 'wow': true}")
+        mail = create_message()
+        mail.attachments = [pdf_part, json_part]
+
+        rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME, account=MailAccount())
+
+        consumed = self.mail_account_handler.handle_message(mail, rule)
+
+        self.assertEqual(consumed, 1)
+        self.assertEqual(self.async_task.call_count, 1)
+
+        # The single queued task must carry the PDF attachment's filename.
+        _, task_kwargs = self.async_task.call_args
+        self.assertEqual(task_kwargs['override_filename'], "f1.pdf")
+
+    def test_handle_disposition(self):
+        # Of an 'inline' part and an 'attachment' part, only the latter may be consumed.
+        inline_part = create_attachment(filename="f1.pdf", content_disposition='inline')
+        attached_part = create_attachment(filename="f2.pdf", content_disposition='attachment')
+        mail = create_message()
+        mail.attachments = [inline_part, attached_part]
+
+        rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME, account=MailAccount())
+
+        consumed = self.mail_account_handler.handle_message(mail, rule)
+
+        self.assertEqual(consumed, 1)
+        self.assertEqual(self.async_task.call_count, 1)
+
+        # The single queued task must carry the non-inline attachment's filename.
+        _, task_kwargs = self.async_task.call_args
+        self.assertEqual(task_kwargs['override_filename'], "f2.pdf")
+
    def test_handle_mail_account_mark_read(self):

        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -50,7 +50,10 @@ class RasterisedDocumentParser(DocumentParser):
        except ParseError:
            # if convert fails, fall back to extracting
            # the first PDF page as a PNG using Ghostscript
-            self.log('warning', 'Thumbnail generation with ImageMagick failed, falling back to ghostscript. Check your /etc/ImageMagick-x/policy.xml!')
+            self.log(
+                'warning',
+                "Thumbnail generation with ImageMagick failed, falling back "
+                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
            cmd = [settings.GS_BINARY,
                   "-q",
@@ -99,27 +102,41 @@ class RasterisedDocumentParser(DocumentParser):
        try:

            sample_page_index = int(len(images) / 2)
-            self.log("debug", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
+            self.log(
+                "debug",
+                f"Attempting language detection on page "
+                f"{sample_page_index + 1} of {len(images)}...")
            self.progress_callback(0.4, 1, "Language Detection.")
-            sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
+            sample_page_text = self._ocr([images[sample_page_index]],
+                                         settings.OCR_LANGUAGE)[0]
            guessed_language = self._guess_language(sample_page_text)
            self.progress_callback(0.6, 1, "OCR all the pages.")

            if not guessed_language or guessed_language not in ISO639:
                self.log("warning", "Language detection failed.")
-                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+                ocr_pages = self._complete_ocr_default_language(
+                    images, sample_page_index, sample_page_text)

            elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
-                self.log("debug", "Detected language: {} (default language)".format(guessed_language))
-                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+                self.log(
+                    "debug",
+                    f"Detected language: {guessed_language} "
+                    f"(default language)")
+                ocr_pages = self._complete_ocr_default_language(
+                    images, sample_page_index, sample_page_text)

-            elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
-                self.log("warning", "Detected language {} is not available on this system.".format(guessed_language))
-                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
+            elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():  # NOQA: E501
+                self.log(
+                    "warning",
+                    f"Detected language {guessed_language} is not available "
+                    f"on this system.")
+                ocr_pages = self._complete_ocr_default_language(
+                    images, sample_page_index, sample_page_text)

            else:
-                self.log("debug", "Detected language: {}".format(guessed_language))
-                ocr_pages = self._ocr(images, ISO639[guessed_language], report_progress=True)
+                self.log("debug", f"Detected language: {guessed_language}")
+                ocr_pages = self._ocr(
+                    images, ISO639[guessed_language], report_progress=True)

            self.log("debug", "OCR completed.")
            self._text = strip_excess_whitespace(" ".join(ocr_pages))
@@ -133,16 +150,25 @@ class RasterisedDocumentParser(DocumentParser):
        Greyscale images are easier for Tesseract to OCR
        """

-        self.log("debug", "Converting document {} into greyscale images...".format(self.document_path))
-
        # Convert PDF to multiple PNMs
-        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
+        input_file = self.document_path
+
+        if settings.OCR_PAGES == 1:
+            input_file += "[0]"
+        elif settings.OCR_PAGES > 1:
+            input_file += f"[0-{settings.OCR_PAGES - 1}]"
+
+        self.log(
+            "debug",
+            f"Converting document {input_file} into greyscale images")
+
+        output_files = os.path.join(self.tempdir, "convert-%04d.pnm")

        run_convert(density=settings.CONVERT_DENSITY,
                    depth="8",
                    type="grayscale",
-                    input_file=self.document_path,
-                    output_file=pnm,
+                    input_file=input_file,
+                    output_file=output_files,
                    logging_group=self.logging_group)

        # Get a list of converted images
@@ -151,7 +177,7 @@ class RasterisedDocumentParser(DocumentParser):
            if f.endswith(".pnm"):
                pnms.append(os.path.join(self.tempdir, f))

-        self.log("debug", "Running unpaper on {} pages...".format(len(pnms)))
+        self.log("debug", f"Running unpaper on {len(pnms)} pages...")

        self.progress_callback(0.2,1, "Running unpaper on {} pages...".format(len(pnms)))

@@ -166,11 +192,13 @@ class RasterisedDocumentParser(DocumentParser):
            guess = langdetect.detect(text)
            return guess
        except Exception as e:
-            self.log('warning', "Language detection failed with: {}".format(e))
+            self.log('warning', f"Language detection failed with: {e}")
            return None

    def _ocr(self, imgs, lang, report_progress=False):
-        self.log("debug", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
+        self.log(
+            "debug",
+            f"Performing OCR on {len(imgs)} page(s) with language {lang}")
        r = []
        with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
            # r = pool.map(image_to_string, itertools.product(imgs, [lang]))
@@ -180,18 +208,16 @@ class RasterisedDocumentParser(DocumentParser):
                r += [page]
        return r

-    def _complete_ocr_default_language(self, images, sample_page_index, sample_page):
-        """
-        Given a `middle` value and the text that middle page represents, we OCR
-        the remainder of the document and return the whole thing.
-        """
-        # text = self._ocr(imgs[:middle], settings.OCR_LANGUAGE) + text
-        # text += self._ocr(imgs[middle + 1:], settings.OCR_LANGUAGE)
+    def _complete_ocr_default_language(self,
+                                       images,
+                                       sample_page_index,
+                                       sample_page):
        images_copy = list(images)
        del images_copy[sample_page_index]
        if images_copy:
-            self.log('debug', 'Continuing ocr with default language.')
-            ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE, report_progress=True)
+            self.log('debug', "Continuing ocr with default language.")
+            ocr_pages = self._ocr(
+                images_copy, settings.OCR_LANGUAGE, report_progress=True)
            ocr_pages.insert(sample_page_index, sample_page)
            return ocr_pages
        else:
--- a/src/paperless_tesseract/signals.py
+++ b/src/paperless_tesseract/signals.py
@@ -1,5 +1,3 @@
-import re
-
 from .parsers import RasterisedDocumentParser


@@ -7,12 +5,9 @@ def tesseract_consumer_declaration(sender, **kwargs):
    return {
        "parser": RasterisedDocumentParser,
        "weight": 0,
-        "test": tesseract_consumer_test
+        "mime_types": [
+            "application/pdf",
+            "image/jpeg",
+            "image/png"
+        ]
    }
-
-
-MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
-
-
-def tesseract_consumer_test(doc):
-    return MATCHING_FILES.match(doc.lower())
--- a/src/paperless_tesseract/tests/test_signals.py
+++ b/src/paperless_tesseract/tests/test_signals.py
@@ -1,36 +0,0 @@
-from django.test import TestCase
-
-from paperless_tesseract.signals import tesseract_consumer_test
-
-
-class SignalsTestCase(TestCase):
-
-    def test_test_handles_various_file_names_true(self):
-
-        prefixes = (
-            "doc", "My Document", "Μυ Γρεεκ Δοψθμεντ", "Doc -with - tags",
-            "A document with a . in it", "Doc with -- in it"
-        )
-        suffixes = (
-            "pdf", "jpg", "jpeg", "gif", "png", "tiff", "tif", "pnm", "bmp",
-            "PDF", "JPG", "JPEG", "GIF", "PNG", "TIFF", "TIF", "PNM", "BMP",
-            "pDf", "jPg", "jpEg", "gIf", "pNg", "tIff", "tIf", "pNm", "bMp",
-        )
-
-        for prefix in prefixes:
-            for suffix in suffixes:
-                name = "{}.{}".format(prefix, suffix)
-                self.assertTrue(tesseract_consumer_test(name))
-
-    def test_test_handles_various_file_names_false(self):
-
-        prefixes = ("doc",)
-        suffixes = ("txt", "markdown", "",)
-
-        for prefix in prefixes:
-            for suffix in suffixes:
-                name = "{}.{}".format(prefix, suffix)
-                self.assertFalse(tesseract_consumer_test(name))
-
-        self.assertFalse(tesseract_consumer_test(""))
-        self.assertFalse(tesseract_consumer_test("doc"))
--- a/src/paperless_text/signals.py
+++ b/src/paperless_text/signals.py
@@ -1,5 +1,3 @@
-import re
-
 from .parsers import TextDocumentParser


@@ -7,12 +5,8 @@ def text_consumer_declaration(sender, **kwargs):
    return {
        "parser": TextDocumentParser,
        "weight": 10,
-        "test": text_consumer_test
+        "mime_types": [
+            "text/plain",
+            "text/comma-separated-values"
+        ]
    }
-
-
-MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
-
-
-def text_consumer_test(doc):
-    return MATCHING_FILES.match(doc.lower())
--- a/src/setup.cfg
+++ b/src/setup.cfg
@@ -1,6 +1,5 @@
 [pycodestyle]
-exclude = migrations, paperless/settings.py, .tox
-ignore = E501
+exclude = migrations, paperless/settings.py, .tox, */tests/*

 [tool:pytest]
 DJANGO_SETTINGS_MODULE=paperless.settings