Merge branch 'dev' into celery-tasks

2026-02-20 00:39:32 -06:00 · 2020-11-10 00:16:59 +01:00
parent 0b20736102 d3e7c8ff4e
commit 7bfe28f451
77 changed files with 1605 additions and 959 deletions
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -2,7 +2,9 @@ from django.contrib import admin
 from django.contrib.auth.models import Group, User
 from django.utils.html import format_html, format_html_join
 from django.utils.safestring import mark_safe
+from whoosh.writing import AsyncWriter

+from . import index
 from .models import Correspondent, Document, DocumentType, Log, Tag


@@ -71,6 +73,21 @@ class DocumentAdmin(admin.ModelAdmin):
        return obj.created.date().strftime("%Y-%m-%d")
    created_.short_description = "Created"

+    def delete_queryset(self, request, queryset):
+        """Remove the selected documents from the index, then delete them."""
+        # A single writer is shared by all removals of this batch.
+        with AsyncWriter(index.open_index()) as index_writer:
+            for document in queryset:
+                index.remove_document(index_writer, document)
+        super().delete_queryset(request, queryset)
+
+    def delete_model(self, request, obj):
+        """Remove ``obj`` from the search index, then delete it."""
+        index.remove_document_from_index(document=obj)
+        super().delete_model(request, obj)
+
+    def save_model(self, request, obj, form, change):
+        """Save ``obj`` and then mirror it into the search index.
+
+        The row is saved first so that a newly added document already has
+        its primary key when it is indexed, and so that the index is only
+        touched once the database write has gone through.
+        """
+        super().save_model(request, obj, form, change)
+        index.add_or_update_document(obj)
+
    @mark_safe
    def tags_(self, obj):
        r = ""
--- a/src/documents/apps.py
+++ b/src/documents/apps.py
@@ -18,7 +18,8 @@ class DocumentsConfig(AppConfig):
            set_log_entry,
            set_correspondent,
            set_document_type,
-            set_tags
+            set_tags,
+            add_to_index

        )

@@ -29,6 +30,7 @@ class DocumentsConfig(AppConfig):
        document_consumption_finished.connect(set_document_type)
        document_consumption_finished.connect(set_tags)
        document_consumption_finished.connect(set_log_entry)
+        document_consumption_finished.connect(add_to_index)
        document_consumption_finished.connect(run_post_consume_script)

        post_delete.connect(cleanup_document_deletion)
--- a/src/documents/forms.py
+++ b/src/documents/forms.py
@@ -1,4 +1,3 @@
-import magic
 import os

 from datetime import datetime
@@ -6,77 +5,25 @@ from time import mktime

 from django import forms
 from django.conf import settings
-
-from .models import Document, Correspondent
+from pathvalidate import validate_filename, ValidationError


 class UploadForm(forms.Form):

-    TYPE_LOOKUP = {
-        "application/pdf": Document.TYPE_PDF,
-        "image/png": Document.TYPE_PNG,
-        "image/jpeg": Document.TYPE_JPG,
-        "image/gif": Document.TYPE_GIF,
-        "image/tiff": Document.TYPE_TIF,
-    }
-
-    correspondent = forms.CharField(
-        max_length=Correspondent._meta.get_field("name").max_length,
-        required=False
-    )
-    title = forms.CharField(
-        max_length=Document._meta.get_field("title").max_length,
-        required=False
-    )
    document = forms.FileField()

-    def __init__(self, *args, **kwargs):
-        forms.Form.__init__(self, *args, **kwargs)
-        self._file_type = None
-
-    def clean_correspondent(self):
-        """
-        I suppose it might look cleaner to use .get_or_create() here, but that
-        would also allow someone to fill up the db with bogus correspondents
-        before all validation was met.
-        """
-
-        corresp = self.cleaned_data.get("correspondent")
-
-        if not corresp:
-            return None
-
-        if not Correspondent.SAFE_REGEX.match(corresp) or " - " in corresp:
-            raise forms.ValidationError(
-                "That correspondent name is suspicious.")
-
-        return corresp
-
-    def clean_title(self):
-
-        title = self.cleaned_data.get("title")
-
-        if not title:
-            return None
-
-        if not Correspondent.SAFE_REGEX.match(title) or " - " in title:
-            raise forms.ValidationError("That title is suspicious.")
-
-        return title
-
    def clean_document(self):
+        """Check the uploaded file's name and return the upload unchanged."""
+        upload = self.cleaned_data.get("document")
+        try:
+            validate_filename(upload.name)
+        except ValidationError:
+            raise forms.ValidationError("That filename is suspicious.")
+        return upload

-        document = self.cleaned_data.get("document").read()
-
-        with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
-            file_type = m.id_buffer(document)
-
-        if file_type not in self.TYPE_LOOKUP:
-            raise forms.ValidationError("The file type is invalid.")
-
-        self._file_type = self.TYPE_LOOKUP[file_type]
-
-        return document
+    def get_filename(self, i=None):
+        """Build the consumption-directory path for the uploaded document.
+
+        A truthy ``i`` is prepended to the name as ``"<i>_<name>"``; with
+        ``None`` (or ``0``) the upload's own name is used as is.
+        """
+        name = self.cleaned_data.get("document").name
+        if i:
+            name = "{}_{}".format(str(i), name)
+        return os.path.join(settings.CONSUMPTION_DIR, name)

    def save(self):
        """
@@ -85,15 +32,15 @@ class UploadForm(forms.Form):
        form do that as well.  Think of it as a poor-man's queue server.
        """

-        correspondent = self.cleaned_data.get("correspondent")
-        title = self.cleaned_data.get("title")
-        document = self.cleaned_data.get("document")
+        document = self.cleaned_data.get("document").read()

        t = int(mktime(datetime.now().timetuple()))
-        file_name = os.path.join(
-            settings.CONSUMPTION_DIR,
-            "{} - {}.{}".format(correspondent, title, self._file_type)
-        )
+
+        file_name = self.get_filename()
+        i = 0
+        while os.path.exists(file_name):
+            i += 1
+            file_name = self.get_filename(i)

        with open(file_name, "wb") as f:
            f.write(document)
--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -2,15 +2,20 @@ import logging

 from django.db import models
 from django.dispatch import receiver
+from whoosh import highlight
 from whoosh.fields import Schema, TEXT, NUMERIC
 from whoosh.highlight import Formatter, get_text
 from whoosh.index import create_in, exists_in, open_dir
+from whoosh.qparser import MultifieldParser
 from whoosh.writing import AsyncWriter

 from documents.models import Document
 from paperless import settings


+logger = logging.getLogger(__name__)
+
+
 class JsonFormatter(Formatter):
    def __init__(self):
        self.seen = {}
@@ -68,7 +73,7 @@ def open_index(recreate=False):


 def update_document(writer, doc):
-    logging.getLogger(__name__).debug("Updating index with document{}".format(str(doc)))
+    logger.debug("Indexing {}...".format(doc))
    writer.update_document(
        id=doc.pk,
        title=doc.title,
@@ -77,19 +82,32 @@ def update_document(writer, doc):
    )


-@receiver(models.signals.post_save, sender=Document)
-def add_document_to_index(sender, instance, **kwargs):
-    ix = open_index()
-    with AsyncWriter(ix) as writer:
-        update_document(writer, instance)
+def remove_document(writer, doc):
+    """Delete the index entry whose ``id`` term equals ``doc.pk``.
+
+    The caller owns ``writer``; nothing is committed here.
+    """
+    message = "Removing {} from index...".format(doc)
+    logger.debug(message)
+    writer.delete_by_term("id", doc.pk)


-@receiver(models.signals.post_delete, sender=Document)
-def remove_document_from_index(sender, instance, **kwargs):
-    logging.getLogger(__name__).debug("Removing document {} from index".format(str(instance)))
+def add_or_update_document(document):
+    """Open the index and write ``document`` to it with its own writer."""
     ix = open_index()
     with AsyncWriter(ix) as writer:
-        writer.delete_by_term('id', instance.pk)
+        update_document(writer, document)
+
+
+def remove_document_from_index(document):
+    """Open the index and remove ``document`` from it with its own writer."""
+    with AsyncWriter(open_index()) as index_writer:
+        remove_document(index_writer, document)
+
+
+def query_page(ix, query, page):
+    """Search content, title and correspondent and return one result page.
+
+    The results of the returned page are configured with a
+    ``ContextFragmenter(surround=50)`` and a ``JsonFormatter``.
+    """
+    # NOTE(review): the page is returned after the searcher's ``with``
+    # block has exited -- confirm callers can still read hits/highlights.
+    searched_fields = ["content", "title", "correspondent"]
+    with ix.searcher() as searcher:
+        parsed_query = MultifieldParser(
+            searched_fields, ix.schema).parse(query)
+        result_page = searcher.search_page(parsed_query, page)
+        results = result_page.results
+        results.fragmenter = highlight.ContextFragmenter(surround=50)
+        results.formatter = JsonFormatter()
+        return result_page


 def autocomplete(ix, term, limit=10):
--- a/src/documents/management/commands/document_create_classifier.py
+++ b/src/documents/management/commands/document_create_classifier.py
@@ -1,10 +1,6 @@
-import logging
-
 from django.core.management.base import BaseCommand
-from documents.classifier import DocumentClassifier, \
-    IncompatibleClassifierVersionError
-from paperless import settings
 from ...mixins import Renderable
+from ...tasks import train_classifier


 class Command(Renderable, BaseCommand):
@@ -18,27 +14,4 @@ class Command(Renderable, BaseCommand):
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
-        classifier = DocumentClassifier()
-
-        try:
-            # load the classifier, since we might not have to train it again.
-            classifier.reload()
-        except (FileNotFoundError, IncompatibleClassifierVersionError):
-            # This is what we're going to fix here.
-            pass
-
-        try:
-            if classifier.train():
-                logging.getLogger(__name__).info(
-                    "Saving updated classifier model to {}...".format(settings.MODEL_FILE)
-                )
-                classifier.save_classifier()
-            else:
-                logging.getLogger(__name__).debug(
-                    "Training data unchanged."
-                )
-
-        except Exception as e:
-            logging.getLogger(__name__).error(
-                "Classifier error: " + str(e)
-            )
+        train_classifier()
--- a/src/documents/management/commands/document_index.py
+++ b/src/documents/management/commands/document_index.py
@@ -1,9 +1,7 @@
 from django.core.management import BaseCommand
-from whoosh.writing import AsyncWriter

-import documents.index as index
 from documents.mixins import Renderable
-from documents.models import Document
+from documents.tasks import index_reindex, index_optimize


 class Command(Renderable, BaseCommand):
@@ -22,13 +20,6 @@ class Command(Renderable, BaseCommand):
        self.verbosity = options["verbosity"]

        if options['command'] == 'reindex':
-            documents = Document.objects.all()
-
-            ix = index.open_index(recreate=True)
-
-            with AsyncWriter(ix) as writer:
-                for document in documents:
-                    index.update_document(writer, document)
-
+            index_reindex()
        elif options['command'] == 'optimize':
-            index.open_index().optimize()
+            index_optimize()
--- a/src/documents/management/commands/document_renamer.py
+++ b/src/documents/management/commands/document_renamer.py
@@ -0,0 +1,24 @@
+from django.core.management.base import BaseCommand
+
+from documents.models import Document, Tag
+
+from ...mixins import Renderable
+
+
+class Command(Renderable, BaseCommand):
+    """Management command that saves every document again."""
+
+    help = """
+        This will rename all documents to match the latest filename format.
+    """.replace("    ", "")
+
+    def __init__(self, *args, **kwargs):
+        self.verbosity = 0
+        BaseCommand.__init__(self, *args, **kwargs)
+
+    def handle(self, *args, **options):
+        """Record the requested verbosity and re-save each document."""
+        self.verbosity = options["verbosity"]
+
+        # Saving the document again will generate a new filename and rename
+        all_documents = Document.objects.all()
+        for document in all_documents:
+            document.save()
--- a/src/documents/management/commands/document_rerun_ocr.py
+++ b/src/documents/management/commands/document_rerun_ocr.py
@@ -1,60 +0,0 @@
-import argparse
-import threading
-from multiprocessing import Pool
-from multiprocessing.pool import ThreadPool
-
-from django.core.management.base import BaseCommand
-
-from documents.consumer import Consumer
-from documents.models import Log, Document
-from documents.parsers import get_parser_class
-
-
-def process_document(doc):
-    parser_class = get_parser_class(doc.file_name)
-    if not parser_class:
-        print("no parser available")
-    else:
-        print("Parser: {}".format(parser_class.__name__))
-        parser = parser_class(doc.source_path, None)
-        try:
-            text = parser.get_text()
-            doc.content = text
-            doc.save()
-        finally:
-            parser.cleanup()
-
-
-def document_index(value):
-    ivalue = int(value)
-    if not (1 <= ivalue <= Document.objects.count()):
-        raise argparse.ArgumentTypeError(
-            "{} is not a valid document index (out of range)".format(value))
-
-    return ivalue
-
-
-class Command(BaseCommand):
-
-    help = "Performs OCR on all documents again!"
-
-
-    def add_arguments(self, parser):
-        parser.add_argument(
-            "-s", "--start_index",
-            default=None,
-            type=document_index
-        )
-
-    def handle(self, *args, **options):
-
-        docs = Document.objects.all().order_by("added")
-
-        indices = range(options['start_index']-1, len(docs)) if options['start_index'] else range(len(docs))
-
-        for i in indices:
-            doc = docs[i]
-            print("==================================")
-            print("{} out of {}: {}".format(i+1, len(docs), doc.file_name))
-            print("==================================")
-            process_document(doc)
--- a/src/documents/migrations/1000_update_paperless.py
+++ b/src/documents/migrations/1000_update_paperless.py
@@ -1,73 +0,0 @@
-# Generated by Django 3.1.2 on 2020-10-29 14:29
-import os
-
-from django.db import migrations
-
-from django.conf import settings
-
-
-def make_index(apps, schema_editor):
-    Document = apps.get_model("documents", "Document")
-    documents = Document.objects.all()
-    print()
-    try:
-        print("  --> Creating document index...")
-        from whoosh.writing import AsyncWriter
-        from documents import index
-        ix = index.open_index(recreate=True)
-        with AsyncWriter(ix) as writer:
-            for document in documents:
-                index.update_document(writer, document)
-    except ImportError:
-        # index may not be relevant anymore
-        print("  --> Cannot create document index.")
-
-
-def restore_filenames(apps, schema_editor):
-    Document = apps.get_model("documents", "Document")
-    for doc in Document.objects.all():
-        file_name = "{:07}.{}".format(doc.pk, doc.file_type)
-        if doc.storage_type == "gpg":
-            file_name += ".gpg"
-
-        if not doc.filename == file_name:
-            try:
-                print("file was renamed, restoring {} to {}".format(doc.filename, file_name))
-                os.rename(os.path.join(settings.ORIGINALS_DIR, doc.filename),
-                          os.path.join(settings.ORIGINALS_DIR, file_name))
-            except PermissionError:
-                pass
-            except FileNotFoundError:
-                pass
-
-
-def initialize_document_classifier(apps, schema_editor):
-    try:
-        print("Initalizing document classifier...")
-        from documents.classifier import DocumentClassifier
-        classifier = DocumentClassifier()
-        try:
-            classifier.train()
-            classifier.save_classifier()
-        except Exception as e:
-            print("Classifier error: {}".format(e))
-    except ImportError:
-        print("Document classifier not found, skipping")
-
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('documents', '0023_document_current_filename'),
-    ]
-
-    operations = [
-        migrations.RunPython(make_index, migrations.RunPython.noop),
-        migrations.RunPython(restore_filenames),
-        migrations.RunPython(initialize_document_classifier, migrations.RunPython.noop),
-        migrations.RemoveField(
-            model_name='document',
-            name='filename',
-        ),
-    ]
--- a/src/documents/migrations/1000_update_paperless_all.py
+++ b/src/documents/migrations/1000_update_paperless_all.py
@@ -0,0 +1,95 @@
+# Generated by Django 3.1.3 on 2020-11-07 12:35
+import os
+
+from django.conf import settings
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+def make_index(apps, schema_editor):
+    """Recreate the search index from all documents of the migration state.
+
+    An ImportError raised while doing so is reported on stdout and
+    otherwise ignored.
+    """
+    document_model = apps.get_model("documents", "Document")
+    all_documents = document_model.objects.all()
+    print()
+    try:
+        print("  --> Creating document index...")
+        from whoosh.writing import AsyncWriter
+        from documents import index
+        fresh_index = index.open_index(recreate=True)
+        with AsyncWriter(fresh_index) as index_writer:
+            for doc in all_documents:
+                index.update_document(index_writer, doc)
+    except ImportError:
+        # index may not be relevant anymore
+        print("  --> Cannot create document index.")
+
+
+class Migration(migrations.Migration):
+    """Schema changes for documents, tags, document types and logs.
+
+    Runs after ``0023_document_current_filename`` and finishes by building
+    the search index through ``make_index``.
+    """
+
+    dependencies = [
+        ('documents', '0023_document_current_filename'),
+    ]
+
+    operations = [
+        # New optional, unique serial number on documents.
+        migrations.AddField(
+            model_name='document',
+            name='archive_serial_number',
+            field=models.IntegerField(blank=True, db_index=True, help_text='The position of this document in your physical document archive.', null=True, unique=True),
+        ),
+        # Inbox flag on tags, off by default.
+        migrations.AddField(
+            model_name='tag',
+            name='is_inbox_tag',
+            field=models.BooleanField(default=False, help_text='Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.'),
+        ),
+        # DocumentType model and the nullable foreign key pointing at it.
+        migrations.CreateModel(
+            name='DocumentType',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('name', models.CharField(max_length=128, unique=True)),
+                ('slug', models.SlugField(blank=True, editable=False)),
+                ('match', models.CharField(blank=True, max_length=256)),
+                ('matching_algorithm', models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF.  Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided.  A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF.  (If you don\'t know what a regex is, you probably don\'t want this option.)  Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.')),
+                ('is_insensitive', models.BooleanField(default=True)),
+            ],
+            options={
+                'abstract': False,
+                'ordering': ('name',),
+            },
+        ),
+        migrations.AddField(
+            model_name='document',
+            name='document_type',
+            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.documenttype'),
+        ),
+        # Matching-algorithm choices including 'Automatic Classification'
+        # for correspondents and tags.
+        migrations.AlterField(
+            model_name='correspondent',
+            name='matching_algorithm',
+            field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF.  Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided.  A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF.  (If you don\'t know what a regex is, you probably don\'t want this option.)  Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
+        ),
+        migrations.AlterField(
+            model_name='tag',
+            name='matching_algorithm',
+            field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF.  Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided.  A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF.  (If you don\'t know what a regex is, you probably don\'t want this option.)  Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
+        ),
+        # Document content may be blank.
+        migrations.AlterField(
+            model_name='document',
+            name='content',
+            field=models.TextField(blank=True, help_text='The raw, text-only data of the document. This field is primarily used for searching.'),
+        ),
+        # Log changes: newest-first ordering, no 'modified' field, and an
+        # optional UUID 'group'.
+        migrations.AlterModelOptions(
+            name='log',
+            options={'ordering': ('-created',)},
+        ),
+        migrations.RemoveField(
+            model_name='log',
+            name='modified',
+        ),
+        migrations.AlterField(
+            model_name='log',
+            name='group',
+            field=models.UUIDField(blank=True, null=True),
+        ),
+        # Build the search index last; reversing this step does nothing.
+        migrations.RunPython(
+            code=make_index,
+            reverse_code=django.db.migrations.operations.special.RunPython.noop,
+        ),
+    ]
--- a/src/documents/migrations/1001_auto_20201109_1636.py
+++ b/src/documents/migrations/1001_auto_20201109_1636.py
@@ -0,0 +1,28 @@
+# Generated by Django 3.1.3 on 2020-11-09 16:36
+
+from django.db import migrations
+from django.db.migrations import RunPython
+from django_q.models import Schedule
+from django_q.tasks import schedule
+
+
+def add_schedules(apps, schema_editor):
+    """Register the classifier, index and mail tasks as django-q schedules."""
+    periodic_tasks = (
+        ('documents.tasks.train_classifier',
+         {"name": "Train the classifier", "schedule_type": Schedule.HOURLY}),
+        ('documents.tasks.index_optimize',
+         {"name": "Optimize the index", "schedule_type": Schedule.DAILY}),
+        ('documents.tasks.consume_mail',
+         {"name": "Check E-Mail", "schedule_type": Schedule.MINUTES,
+          "minutes": 10}),
+    )
+    for func, options in periodic_tasks:
+        schedule(func, **options)
+
+
+def remove_schedules(apps, schema_editor):
+    """Reverse step: delete only the schedules for this app's three tasks.
+
+    Schedules pointing at any other function are left alone, so rolling
+    this migration back does not wipe unrelated entries.
+    """
+    Schedule.objects.filter(func__in=[
+        'documents.tasks.train_classifier',
+        'documents.tasks.index_optimize',
+        'documents.tasks.consume_mail',
+    ]).delete()
+
+
+class Migration(migrations.Migration):
+    """Data migration that installs the periodic task schedules."""
+
+    dependencies = [
+        ('documents', '1000_update_paperless_all'),
+        ('django_q', '0013_task_attempt_count'),
+    ]
+
+    operations = [
+        RunPython(code=add_schedules, reverse_code=remove_schedules),
+    ]
--- a/src/documents/migrations/1001_workflow_improvements.py
+++ b/src/documents/migrations/1001_workflow_improvements.py
@@ -1,23 +0,0 @@
-# Generated by Django 2.0.7 on 2018-07-12 09:52
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('documents', '1000_update_paperless'),
-    ]
-
-    operations = [
-        migrations.AddField(
-            model_name='document',
-            name='archive_serial_number',
-            field=models.IntegerField(blank=True, db_index=True, help_text='The position of this document in your physical document archive.', null=True, unique=True),
-        ),
-        migrations.AddField(
-            model_name='tag',
-            name='is_inbox_tag',
-            field=models.BooleanField(default=False, help_text='Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.'),
-        ),
-    ]
--- a/src/documents/migrations/1002_auto_20180823_1155.py
+++ b/src/documents/migrations/1002_auto_20180823_1155.py
@@ -1,33 +0,0 @@
-# Generated by Django 2.0.7 on 2018-08-23 11:55
-
-from django.db import migrations, models
-import django.db.models.deletion
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('documents', '1001_workflow_improvements'),
-    ]
-
-    operations = [
-        migrations.CreateModel(
-            name='DocumentType',
-            fields=[
-                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('name', models.CharField(max_length=128, unique=True)),
-                ('slug', models.SlugField(blank=True, editable=False)),
-                ('match', models.CharField(blank=True, max_length=256)),
-                ('matching_algorithm', models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF.  Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided.  A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF.  (If you don\'t know what a regex is, you probably don\'t want this option.)  Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.')),
-                ('is_insensitive', models.BooleanField(default=True)),
-            ],
-            options={
-                'abstract': False,
-            },
-        ),
-        migrations.AddField(
-            model_name='document',
-            name='document_type',
-            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.DocumentType'),
-        ),
-    ]
--- a/src/documents/migrations/1003_auto_20201028_1751.py
+++ b/src/documents/migrations/1003_auto_20201028_1751.py
@@ -1,32 +0,0 @@
-# Generated by Django 3.1.2 on 2020-10-28 17:51
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('documents', '1002_auto_20180823_1155'),
-    ]
-
-    operations = [
-        migrations.AlterModelOptions(
-            name='documenttype',
-            options={'ordering': ('name',)},
-        ),
-        migrations.AlterField(
-            model_name='correspondent',
-            name='matching_algorithm',
-            field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF.  Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided.  A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF.  (If you don\'t know what a regex is, you probably don\'t want this option.)  Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
-        ),
-        migrations.AlterField(
-            model_name='documenttype',
-            name='matching_algorithm',
-            field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF.  Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided.  A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF.  (If you don\'t know what a regex is, you probably don\'t want this option.)  Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
-        ),
-        migrations.AlterField(
-            model_name='tag',
-            name='matching_algorithm',
-            field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match'), (6, 'Automatic Classification')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF.  Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided.  A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF.  (If you don\'t know what a regex is, you probably don\'t want this option.)  Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.'),
-        ),
-    ]
--- a/src/documents/migrations/1004_auto_20201029_1331.py
+++ b/src/documents/migrations/1004_auto_20201029_1331.py
@@ -1,18 +0,0 @@
-# Generated by Django 3.1.2 on 2020-10-29 13:31
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('documents', '1003_auto_20201028_1751'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='document',
-            name='content',
-            field=models.TextField(blank=True, help_text='The raw, text-only data of the document. This field is primarily used for searching.'),
-        ),
-    ]
--- a/src/documents/migrations/1005_auto_20201102_0007.py
+++ b/src/documents/migrations/1005_auto_20201102_0007.py
@@ -1,26 +0,0 @@
-# Generated by Django 3.1.2 on 2020-11-02 00:07
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('documents', '1004_auto_20201029_1331'),
-    ]
-
-    operations = [
-        migrations.AlterModelOptions(
-            name='log',
-            options={'ordering': ('-created',)},
-        ),
-        migrations.RemoveField(
-            model_name='log',
-            name='modified',
-        ),
-        migrations.AlterField(
-            model_name='log',
-            name='group',
-            field=models.UUIDField(blank=True, null=True),
-        ),
-    ]
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -3,11 +3,12 @@
 import logging
 import os
 import re
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict

 import dateutil.parser
 from django.conf import settings
 from django.db import models
+from django.dispatch import receiver
 from django.template.defaultfilters import slugify
 from django.utils import timezone
 from django.utils.text import slugify
@@ -190,6 +191,14 @@ class Document(models.Model):
    added = models.DateTimeField(
        default=timezone.now, editable=False, db_index=True)

+    filename = models.FilePathField(
+        max_length=256,
+        editable=False,
+        default=None,
+        null=True,
+        help_text="Current filename in storage"
+    )
+
    archive_serial_number = models.IntegerField(
        blank=True,
        null=True,
@@ -211,15 +220,123 @@ class Document(models.Model):
            return "{}: {}".format(created, self.correspondent or self.title)
        return str(created)

+    def find_renamed_document(self, subdirectory=""):
+        """Recursively search `subdirectory` (relative to ORIGINALS_DIR) for
+        this document's file; return its relative path, or None."""
+        suffix = "%07i.%s" % (self.pk, self.file_type)
+        # Append .gpg for encrypted files
+        if self.storage_type == self.STORAGE_TYPE_GPG:
+            suffix += ".gpg"
+
+        root = os.path.normpath(Document.filename_to_path(subdirectory))
+        for filename in os.listdir(root):
+            fullname = os.path.join(subdirectory, filename)
+            if filename.endswith(suffix):
+                return fullname
+            if os.path.isdir(Document.filename_to_path(fullname)):
+                # Keep scanning siblings when this subtree has no match
+                found = self.find_renamed_document(fullname)
+                if found is not None:
+                    return found
+        return None
+
+    @property
+    def source_filename(self):
+        # Initial filename generation (for new documents)
+        if self.filename is None:
+            self.filename = self.generate_source_filename()
+
+        # Check if document is still available under filename
+        elif not os.path.isfile(Document.filename_to_path(self.filename)):
+            recovered_filename = self.find_renamed_document()
+
+            # If we have found the file, update the filename
+            if recovered_filename is not None:
+                logger = logging.getLogger(__name__)
+                logger.warning("Filename of document " + str(self.id) +
+                               " has changed and was successfully updated")
+                self.filename = recovered_filename
+
+                # Remove all empty subdirectories below ORIGINALS_DIR
+                Document.delete_all_empty_subdirectories(
+                        Document.filename_to_path(""))
+            else:
+                logger = logging.getLogger(__name__)
+                logger.error("File of document " + str(self.id) + " has " +
+                             "gone and could not be recovered")
+
+        return self.filename
+
+    @staticmethod
+    def many_to_dictionary(field):
+        # Converts ManyToManyField to dictionary by assuming, that field
+        # entries contain an _ or - which will be used as a delimiter
+        mydictionary = dict()
+
+        for index, t in enumerate(field.all()):
+            # Populate tag names by index
+            mydictionary[index] = slugify(t.name)
+
+            # Find delimiter
+            delimiter = t.name.find('_')
+
+            if delimiter == -1:
+                delimiter = t.name.find('-')
+
+            if delimiter == -1:
+                continue
+
+            key = t.name[:delimiter]
+            value = t.name[delimiter+1:]
+
+            mydictionary[slugify(key)] = slugify(value)
+
+        return mydictionary
+
+    def generate_source_filename(self):
+        # Create filename based on configured format
+        if settings.PAPERLESS_FILENAME_FORMAT is not None:
+            tags = defaultdict(lambda: slugify(None),
+                               self.many_to_dictionary(self.tags))
+            path = settings.PAPERLESS_FILENAME_FORMAT.format(
+                   correspondent=slugify(self.correspondent),
+                   title=slugify(self.title),
+                   created=slugify(self.created),
+                   added=slugify(self.added),
+                   tags=tags)
+        else:
+            path = ""
+
+        # Always append the primary key to guarantee uniqueness of filename
+        if len(path) > 0:
+            filename = "%s-%07i.%s" % (path, self.pk, self.file_type)
+        else:
+            filename = "%07i.%s" % (self.pk, self.file_type)
+
+        # Append .gpg for encrypted files
+        if self.storage_type == self.STORAGE_TYPE_GPG:
+            filename += ".gpg"
+
+        return filename
+
+    def create_source_directory(self):
+        new_filename = self.generate_source_filename()
+
+        # Determine the full "target" path
+        dir_new = Document.filename_to_path(os.path.dirname(new_filename))
+
+        # Create new path
+        os.makedirs(dir_new, exist_ok=True)
+
    @property
    def source_path(self):
-        file_name = "{:07}.{}".format(self.pk, self.file_type)
-        if self.storage_type == self.STORAGE_TYPE_GPG:
-            file_name += ".gpg"
+        return Document.filename_to_path(self.source_filename)

+    @staticmethod
+    def filename_to_path(filename):
        return os.path.join(
            settings.ORIGINALS_DIR,
-            file_name
+            filename
        )

    @property
@@ -245,6 +362,125 @@ class Document(models.Model):
    def thumbnail_file(self):
        return open(self.thumbnail_path, "rb")

+    def set_filename(self, filename):
+        if os.path.isfile(Document.filename_to_path(filename)):
+            self.filename = filename
+
+    @staticmethod
+    def try_delete_empty_directories(directory):
+        # Remove empty directories from `directory` up to (excluding) the root
+        directory = os.path.normpath(directory)
+        root = os.path.normpath(Document.filename_to_path(""))
+
+        while directory != root:
+            # Try to delete the current directory
+            try:
+                os.rmdir(directory)
+            except os.error:
+                # Not empty (or otherwise not removable): stop climbing
+                return
+
+            # Cut off the current directory and go one level up
+            directory, _ = os.path.split(directory)
+            directory = os.path.normpath(directory)
+
+    @staticmethod
+    def delete_all_empty_subdirectories(directory):
+        # Walk `directory` depth-first and remove every empty subdirectory
+        root = os.path.normpath(Document.filename_to_path(directory))
+
+        for filename in os.listdir(root):
+            fullname = os.path.join(directory, filename)
+
+            if not os.path.isdir(Document.filename_to_path(fullname)):
+                continue
+
+            # Go into the subdirectory first to see if there is more to delete
+            Document.delete_all_empty_subdirectories(
+                    os.path.join(directory, filename))
+
+            # Try to delete the directory
+            try:
+                os.rmdir(Document.filename_to_path(fullname))
+                continue
+            except os.error:
+                # Directory not empty (or not removable), leave it in place
+                continue
+
+
+@receiver(models.signals.m2m_changed, sender=Document.tags.through)
+@receiver(models.signals.post_save, sender=Document)
+def update_filename(sender, instance, **kwargs):
+    """Move the document's file to the name generate_source_filename()
+    yields after a save or tag change, then store that name on the model."""
+    # A reverse-side m2m_changed passes the related (tag) object as instance
+    if not isinstance(instance, Document):
+        return
+
+    # Skip if the document has no filename assigned yet
+    if instance.filename is None:
+        return
+
+    # Check if the file exists and try to recover the filename otherwise
+    if not os.path.isfile(Document.filename_to_path(instance.filename)):
+        instance.filename = instance.source_filename
+
+    # Build the new filename; if it is unchanged, nothing needs to be done
+    new_filename = instance.generate_source_filename()
+    if instance.filename == new_filename:
+        return
+
+    # Determine the full "target" path and create its directory
+    path_new = instance.filename_to_path(new_filename)
+    instance.create_source_directory()
+
+    # Determine the full "current" path
+    path_current = instance.filename_to_path(instance.source_filename)
+
+    # Move file
+    try:
+        os.rename(path_current, path_new)
+    except PermissionError:
+        # Do not update filename in object
+        return
+    except FileNotFoundError:
+        logger = logging.getLogger(__name__)
+        logger.error("Renaming of document " + str(instance.id) + " failed " +
+                     "as file " + instance.filename + " was no longer present")
+        return
+
+    # Delete empty directory
+    old_dir = os.path.dirname(instance.filename)
+    old_path = instance.filename_to_path(old_dir)
+    Document.try_delete_empty_directories(old_path)
+
+    instance.filename = new_filename
+
+    # Save instance; the nested post_save returns early because the stored
+    # filename now equals the generated one, so nothing is renamed again
+    instance.save()
+
+
+@receiver(models.signals.post_delete, sender=Document)
+def delete_files(sender, instance, **kwargs):
+    if instance.filename is None:
+        return
+
+    # Remove the document
+    old_file = instance.filename_to_path(instance.filename)
+
+    try:
+        os.remove(old_file)
+    except FileNotFoundError:
+        logger = logging.getLogger(__name__)
+        logger.warning("Deleted document " + str(instance.id) + " but file " +
+                       old_file + " was no longer present")
+
+    # And remove the directory (if applicable)
+    old_dir = os.path.dirname(instance.filename)
+    old_path = instance.filename_to_path(old_dir)
+    Document.try_delete_empty_directories(old_path)
+

 class Log(models.Model):

--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@@ -166,3 +166,7 @@ def set_log_entry(sender, document=None, logging_group=None, **kwargs):
        user=user,
        object_repr=document.__str__(),
    )
+
+
+def add_to_index(sender, document, **kwargs):
+    index.add_or_update_document(document)
--- a/src/documents/static/bootstrap.min.css
+++ b/src/documents/static/bootstrap.min.css
--- a/src/documents/static/signin.css
+++ b/src/documents/static/signin.css
@@ -0,0 +1,44 @@
+html,
+body {
+  height: 100%;
+}
+
+body {
+  display: -ms-flexbox;
+  display: flex;
+  -ms-flex-align: center;
+  align-items: center;
+  padding-top: 40px;
+  padding-bottom: 40px;
+  background-color: #f5f5f5;
+}
+
+.form-signin {
+  width: 100%;
+  max-width: 330px;
+  padding: 15px;
+  margin: auto;
+}
+.form-signin .checkbox {
+  font-weight: 400;
+}
+.form-signin .form-control {
+  position: relative;
+  box-sizing: border-box;
+  height: auto;
+  padding: 10px;
+  font-size: 16px;
+}
+.form-signin .form-control:focus {
+  z-index: 2;
+}
+.form-signin input[type="text"] {
+  margin-bottom: -1px;
+  border-bottom-right-radius: 0;
+  border-bottom-left-radius: 0;
+}
+.form-signin input[type="password"] {
+  margin-bottom: 10px;
+  border-top-left-radius: 0;
+  border-top-right-radius: 0;
+}
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -0,0 +1,57 @@
+import logging
+
+from django.conf import settings
+from django_q.tasks import async_task, result
+from whoosh.writing import AsyncWriter
+
+from documents import index
+from documents.classifier import DocumentClassifier, \
+    IncompatibleClassifierVersionError
+from documents.mail import MailFetcher
+from documents.models import Document
+
+
+def consume_mail():
+    MailFetcher().pull()
+
+
+def index_optimize():
+    index.open_index().optimize()
+
+
+def index_reindex():
+    documents = Document.objects.all()
+
+    ix = index.open_index(recreate=True)
+
+    with AsyncWriter(ix) as writer:
+        for document in documents:
+            index.update_document(writer, document)
+
+
+def train_classifier():
+    classifier = DocumentClassifier()
+
+    try:
+        # load the classifier, since we might not have to train it again.
+        classifier.reload()
+    except (FileNotFoundError, IncompatibleClassifierVersionError):
+        # No usable model could be loaded; the training below remedies that.
+        pass
+
+    try:
+        if classifier.train():
+            logging.getLogger(__name__).info(
+                "Saving updated classifier model to {}...".format(
+                    settings.MODEL_FILE)
+            )
+            classifier.save_classifier()
+        else:
+            logging.getLogger(__name__).debug(
+                "Training data unchanged."
+            )
+
+    except Exception as e:
+        logging.getLogger(__name__).error(
+            "Classifier error: " + str(e)
+        )
--- a/src/documents/templates/index.html
+++ b/src/documents/templates/index.html
@@ -9,11 +9,11 @@
  <base href="/">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="icon" type="image/x-icon" href="favicon.ico">
-<link rel="stylesheet" href="{% static 'styles.css' %}"></head>
+<link rel="stylesheet" href="{% static 'frontend/styles.css' %}"></head>
 <body>
  <app-root>Loading...</app-root>
-	<script src="{% static 'runtime.js' %}" defer></script>
-	<script src="{% static 'polyfills.js' %}" defer></script>
-	<script src="{% static 'main.js' %}" defer></script>
+	<script src="{% static 'frontend/runtime.js' %}" defer></script>
+	<script src="{% static 'frontend/polyfills.js' %}" defer></script>
+	<script src="{% static 'frontend/main.js' %}" defer></script>
 </body>
 </html>
--- a/src/documents/templates/registration/logged_out.html
+++ b/src/documents/templates/registration/logged_out.html
@@ -0,0 +1,44 @@
+<!doctype html>
+
+{% load static %}
+
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+    <meta name="description" content="">
+    <meta name="author" content="Mark Otto, Jacob Thornton, and Bootstrap contributors">
+    <meta name="generator" content="Jekyll v4.1.1">
+    <title>Paperless Sign In</title>
+
+    <!-- Bootstrap core CSS -->
+		<link href="{% static 'bootstrap.min.css' %}" rel="stylesheet">
+
+    <style>
+      .bd-placeholder-img {
+        font-size: 1.125rem;
+        text-anchor: middle;
+        -webkit-user-select: none;
+        -moz-user-select: none;
+        -ms-user-select: none;
+        user-select: none;
+      }
+
+      @media (min-width: 768px) {
+        .bd-placeholder-img-lg {
+          font-size: 3.5rem;
+        }
+      }
+    </style>
+    <!-- Custom styles for this template -->
+    <link href="{% static 'signin.css' %}" rel="stylesheet">
+  </head>
+
+  <body class="text-center">
+    <div class="form-signin">
+			<img class="mb-4" src="{% static 'frontend/assets/logo.svg' %}" alt="" width="300">
+			<p>You have been successfully logged out. Bye!</p>
+			<a href="/">Sign in again</a>
+		</div>
+	</body>
+</html>
--- a/src/documents/templates/registration/login.html
+++ b/src/documents/templates/registration/login.html
@@ -0,0 +1,54 @@
+<!doctype html>
+
+{% load static %}
+
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+    <meta name="description" content="">
+    <meta name="author" content="Mark Otto, Jacob Thornton, and Bootstrap contributors">
+    <meta name="generator" content="Jekyll v4.1.1">
+    <title>Paperless Sign In</title>
+
+    <!-- Bootstrap core CSS -->
+		<link href="{% static 'bootstrap.min.css' %}" rel="stylesheet">
+
+    <style>
+      .bd-placeholder-img {
+        font-size: 1.125rem;
+        text-anchor: middle;
+        -webkit-user-select: none;
+        -moz-user-select: none;
+        -ms-user-select: none;
+        user-select: none;
+      }
+
+      @media (min-width: 768px) {
+        .bd-placeholder-img-lg {
+          font-size: 3.5rem;
+        }
+      }
+    </style>
+    <!-- Custom styles for this template -->
+    <link href="{% static 'signin.css' %}" rel="stylesheet">
+  </head>
+
+  <body class="text-center">
+    <form class="form-signin" method="post">
+			{% csrf_token %}
+			<img class="mb-4" src="{% static 'frontend/assets/logo.svg' %}" alt="" width="300">
+			<p>Please sign in.</p>
+			{% if form.errors %}
+				<div class="alert alert-danger" role="alert">
+					Your username and password didn't match. Please try again.
+				</div>
+			{% endif %}
+			<label for="inputUsername" class="sr-only">Username</label>
+			<input type="text" name="username" id="inputUsername" class="form-control" placeholder="Username" required autofocus>
+			<label for="inputPassword" class="sr-only">Password</label>
+			<input type="password" name="password" id="inputPassword" class="form-control" placeholder="Password" required>
+			<button class="btn btn-lg btn-primary btn-block" type="submit">Sign in</button>
+		</form>
+	</body>
+</html>
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -1,66 +1,10 @@
 import re

 from django.test import TestCase
-from unittest import mock
-from tempfile import TemporaryDirectory

-from ..consumer import Consumer
 from ..models import FileInfo, Tag


-class TestConsumer(TestCase):
-
-    class DummyParser(object):
-        pass
-
-    def test__get_parser_class_1_parser(self):
-        self.assertEqual(
-            self._get_consumer()._get_parser_class("doc.pdf"),
-            self.DummyParser
-        )
-
-    @mock.patch("documents.consumer.os.makedirs")
-    @mock.patch("documents.consumer.os.path.exists", return_value=True)
-    @mock.patch("documents.consumer.document_consumer_declaration.send")
-    def test__get_parser_class_n_parsers(self, m, *args):
-
-        class DummyParser1(object):
-            pass
-
-        class DummyParser2(object):
-            pass
-
-        m.return_value = (
-            (None, lambda _: {"weight": 0, "parser": DummyParser1}),
-            (None, lambda _: {"weight": 1, "parser": DummyParser2}),
-        )
-        with TemporaryDirectory() as tmpdir:
-            self.assertEqual(
-                Consumer(consume=tmpdir)._get_parser_class("doc.pdf"),
-                DummyParser2
-            )
-
-    @mock.patch("documents.consumer.os.makedirs")
-    @mock.patch("documents.consumer.os.path.exists", return_value=True)
-    @mock.patch("documents.consumer.document_consumer_declaration.send")
-    def test__get_parser_class_0_parsers(self, m, *args):
-        m.return_value = ((None, lambda _: None),)
-        with TemporaryDirectory() as tmpdir:
-            self.assertIsNone(
-                Consumer(consume=tmpdir)._get_parser_class("doc.pdf")
-            )
-
-    @mock.patch("documents.consumer.os.makedirs")
-    @mock.patch("documents.consumer.os.path.exists", return_value=True)
-    @mock.patch("documents.consumer.document_consumer_declaration.send")
-    def _get_consumer(self, m, *args):
-        m.return_value = (
-            (None, lambda _: {"weight": 0, "parser": self.DummyParser}),
-        )
-        with TemporaryDirectory() as tmpdir:
-            return Consumer(consume=tmpdir)
-
-
 class TestAttributes(TestCase):

    TAGS = ("tag1", "tag2", "tag3")
--- a/src/documents/tests/test_file_handling.py
+++ b/src/documents/tests/test_file_handling.py
@@ -0,0 +1,559 @@
+import datetime
+import os
+import shutil
+from unittest import mock
+from uuid import uuid4
+from pathlib import Path
+from shutil import rmtree
+
+from dateutil import tz
+from django.test import TestCase, override_settings
+
+from django.utils.text import slugify
+from ..models import Tag, Document, Correspondent
+from django.conf import settings
+
+
+class TestDate(TestCase):
+    deletion_list = []
+
+    def add_to_deletion_list(self, dirname):
+        self.deletion_list.append(dirname)
+
+    def setUp(self):
+        folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
+        os.makedirs(folder + "/documents/originals")
+        override_settings(MEDIA_ROOT=folder).enable()
+        override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable()
+        self.add_to_deletion_list(folder)
+
+    def tearDown(self):
+        for dirname in self.deletion_list:
+            shutil.rmtree(dirname, ignore_errors=True)
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="")
+    def test_source_filename(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        self.assertEqual(document.source_filename, "0000001.pdf")
+
+        document.filename = "test.pdf"
+        self.assertEqual(document.source_filename, "test.pdf")
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="")
+    def test_generate_source_filename(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        self.assertEqual(document.generate_source_filename(), "0000001.pdf")
+
+        document.storage_type = Document.STORAGE_TYPE_GPG
+        self.assertEqual(document.generate_source_filename(),
+                         "0000001.pdf.gpg")
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
+                       "{correspondent}")
+    def test_file_renaming(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none/none-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        # Test source_path
+        self.assertEqual(document.source_path, settings.MEDIA_ROOT +
+                         "/documents/originals/none/none-0000001.pdf")
+
+        # Enable encryption and check again
+        document.storage_type = Document.STORAGE_TYPE_GPG
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none/none-0000001.pdf.gpg")
+        document.save()
+
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/none"), True)
+
+        # Set a correspondent and save the document
+        document.correspondent = Correspondent.objects.get_or_create(
+                name="test")[0]
+        document.save()
+
+        # Check proper handling of files
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/test"), True)
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/none"), False)
+        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
+                         "originals/test/test-0000001.pdf.gpg"), True)
+        self.assertEqual(document.generate_source_filename(),
+                         "test/test-0000001.pdf.gpg")
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
+                       "{correspondent}")
+    def test_file_renaming_missing_permissions(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none/none-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        # Test source_path
+        self.assertEqual(document.source_path, settings.MEDIA_ROOT +
+                         "/documents/originals/none/none-0000001.pdf")
+
+        # Make the folder read- and execute-only (no writing and no renaming)
+        os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o555)
+
+        # Set a correspondent and save the document
+        document.correspondent = Correspondent.objects.get_or_create(
+                name="test")[0]
+        document.save()
+
+        # Check proper handling of files
+        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
+                         "originals/none/none-0000001.pdf"), True)
+        self.assertEqual(document.source_filename,
+                         "none/none-0000001.pdf")
+
+        os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o777)
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
+                       "{correspondent}")
+    def test_document_delete(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none/none-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        # Ensure file deletion after delete
+        document.delete()
+        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT +
+                         "/documents/originals/none/none-0000001.pdf"), False)
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/none"), False)
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
+                       "{correspondent}")
+    def test_document_delete_nofile(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        document.delete()
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
+                       "{correspondent}")
+    def test_directory_not_empty(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none/none-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+        Path(document.source_path + "test").touch()
+
+        # Set a correspondent and save the document
+        document.correspondent = Correspondent.objects.get_or_create(
+                name="test")[0]
+        document.save()
+
+        # Check proper handling of files
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/test"), True)
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/none"), True)
+
+        # Cleanup
+        os.remove(settings.MEDIA_ROOT +
+                  "/documents/originals/none/none-0000001.pdftest")
+        os.rmdir(settings.MEDIA_ROOT + "/documents/originals/none")
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
+    def test_tags_with_underscore(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Add tag to document
+        document.tags.create(name="type_demo")
+        document.tags.create(name="foo_bar")
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "demo-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        document.delete()
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
+    def test_tags_with_dash(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Add tag to document
+        document.tags.create(name="type-demo")
+        document.tags.create(name="foo-bar")
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "demo-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        document.delete()
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
+    def test_tags_malformed(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Add tag to document
+        document.tags.create(name="type:demo")
+        document.tags.create(name="foo:bar")
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        document.delete()
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
+    def test_tags_all(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Add tag to document
+        document.tags.create(name="demo")
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "demo-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        document.delete()
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
+    def test_tags_out_of_bounds_0(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        document.delete()
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[10000000]}")
+    def test_tags_out_of_bounds_10000000(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        document.delete()
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[99]}")
+    def test_tags_out_of_bounds_99(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        document.delete()
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
+                       "{correspondent}/{correspondent}")
+    def test_nested_directory_cleanup(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none/none/none-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        # Check proper handling of files
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/none/none"), True)
+
+        document.delete()
+
+        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT +
+                         "/documents/originals/none/none/none-0000001.pdf"),
+                         False)
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/none/none"), False)
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/none"), False)
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals"), True)
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT=None)
+    def test_format_none(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        self.assertEqual(document.generate_source_filename(), "0000001.pdf")
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
+                       "{correspondent}")
+    def test_document_renamed(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none/none-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        # Test source_path
+        self.assertEqual(document.source_path, settings.MEDIA_ROOT +
+                         "/documents/originals/none/none-0000001.pdf")
+
+        # Rename the document "illegally"
+        os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test")
+        os.rename(settings.MEDIA_ROOT + "/documents/originals/" +
+                                        "none/none-0000001.pdf",
+                  settings.MEDIA_ROOT + "/documents/originals/" +
+                                        "test/test-0000001.pdf")
+        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
+                         "originals/test/test-0000001.pdf"), True)
+        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
+                         "originals/none/none-0000001.pdf"), False)
+
+        # Set new correspondent and expect document to be saved properly
+        document.correspondent = Correspondent.objects.get_or_create(
+                name="foo")[0]
+        document.save()
+        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
+                         "originals/foo/foo-0000001.pdf"), True)
+
+        # Check proper handling of files
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/foo"), True)
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/none"), False)
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/test"), False)
+        self.assertEqual(document.generate_source_filename(),
+                         "foo/foo-0000001.pdf")
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
+                       "{correspondent}")
+    def test_document_renamed_encrypted(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_GPG
+        document.save()
+
+        # Ensure that filename is properly generated (GPG adds ".gpg")
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none/none-0000001.pdf.gpg")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        # source_path must point below MEDIA_ROOT/documents/originals
+        self.assertEqual(document.source_path, settings.MEDIA_ROOT +
+                         "/documents/originals/none/none-0000001.pdf.gpg")
+
+        # Rename the file "illegally", i.e. directly on disk via os.rename
+        os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test")
+        os.rename(settings.MEDIA_ROOT + "/documents/originals/" +
+                                        "none/none-0000001.pdf.gpg",
+                  settings.MEDIA_ROOT + "/documents/originals/" +
+                                        "test/test-0000001.pdf.gpg")
+        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
+                         "originals/test/test-0000001.pdf.gpg"), True)
+        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
+                         "originals/none/none-0000001.pdf.gpg"), False)
+
+        # Set new correspondent and expect document to be saved properly
+        document.correspondent = Correspondent.objects.get_or_create(
+                name="foo")[0]
+        document.save()
+        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
+                         "originals/foo/foo-0000001.pdf.gpg"), True)
+
+        # Check proper handling of files
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/foo"), True)
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/none"), False)
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/test"), False)
+        self.assertEqual(document.generate_source_filename(),
+                         "foo/foo-0000001.pdf.gpg")
+
+    def test_delete_all_empty_subdirectories(self):
+        # Create our working directory
+        tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
+        os.makedirs(tmp)
+        self.add_to_deletion_list(tmp)
+
+        os.makedirs(os.path.join(tmp, "empty"))
+        os.makedirs(os.path.join(tmp, "empty", "subdirectory"))
+
+        os.makedirs(os.path.join(tmp, "notempty"))
+        Path(os.path.join(tmp, "notempty", "file")).touch()
+
+        Document.delete_all_empty_subdirectories(tmp)
+
+        self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
+        self.assertEqual(os.path.isdir(os.path.join(tmp, "empty")), False)
+        self.assertEqual(os.path.isfile(
+            os.path.join(tmp, "notempty", "file")), True)
+
+    def test_try_delete_empty_directories(self):
+        # Create our working directory
+        tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
+        os.makedirs(tmp)
+        self.add_to_deletion_list(tmp)
+
+        os.makedirs(os.path.join(tmp, "notempty"))
+        Path(os.path.join(tmp, "notempty", "file")).touch()
+        os.makedirs(os.path.join(tmp, "notempty", "empty"))
+
+        Document.try_delete_empty_directories(
+                os.path.join(tmp, "notempty", "empty"))
+        self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
+        self.assertEqual(os.path.isfile(
+            os.path.join(tmp, "notempty", "file")), True)
+        self.assertEqual(os.path.isdir(
+            os.path.join(tmp, "notempty", "empty")), False)
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
+                       "{correspondent}")
+    def test_document_accidentally_deleted(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none/none-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        # Test source_path
+        self.assertEqual(document.source_path, settings.MEDIA_ROOT +
+                         "/documents/originals/none/none-0000001.pdf")
+
+        # Delete the document "illegally"
+        os.remove(settings.MEDIA_ROOT + "/documents/originals/" +
+                                        "none/none-0000001.pdf")
+
+        # Set new correspondent and expect document to be saved properly
+        document.correspondent = Correspondent.objects.get_or_create(
+                name="foo")[0]
+        document.save()
+
+        # Check proper handling of files
+        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
+                         "/documents/originals/none"), True)
+        self.assertEqual(document.source_filename,
+                         "none/none-0000001.pdf")
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
+                       "{correspondent}")
+    def test_set_filename(self):
+        document = Document()
+        document.file_type = "pdf"
+        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        document.save()
+
+        # Ensure that filename is properly generated
+        tmp = document.source_filename
+        self.assertEqual(document.generate_source_filename(),
+                         "none/none-0000001.pdf")
+        document.create_source_directory()
+        Path(document.source_path).touch()
+
+        # Set existing filename
+        document.set_filename(tmp)
+        self.assertEqual(document.source_filename, "none/none-0000001.pdf")
+
+        # Set non-existing filename
+        document.set_filename("doesnotexist")
+        self.assertEqual(document.source_filename, "none/none-0000001.pdf")
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -0,0 +1,50 @@
+from tempfile import TemporaryDirectory
+from unittest import mock
+
+from django.test import TestCase
+
+from documents.parsers import get_parser_class
+
+
+class TestParserDiscovery(TestCase):
+
+    @mock.patch("documents.parsers.document_consumer_declaration.send")
+    def test__get_parser_class_1_parser(self, m, *args):
+        class DummyParser(object):
+            pass
+
+        m.return_value = (
+            (None, lambda _: {"weight": 0, "parser": DummyParser}),
+        )
+
+        self.assertEqual(
+            get_parser_class("doc.pdf"),
+            DummyParser
+        )
+
+    @mock.patch("documents.parsers.document_consumer_declaration.send")
+    def test__get_parser_class_n_parsers(self, m, *args):
+
+        class DummyParser1(object):
+            pass
+
+        class DummyParser2(object):
+            pass
+
+        m.return_value = (
+            (None, lambda _: {"weight": 0, "parser": DummyParser1}),
+            (None, lambda _: {"weight": 1, "parser": DummyParser2}),
+        )
+
+        self.assertEqual(
+            get_parser_class("doc.pdf"),
+            DummyParser2
+        )
+
+    @mock.patch("documents.parsers.document_consumer_declaration.send")
+    def test__get_parser_class_0_parsers(self, m, *args):
+        m.return_value = ((None, lambda _: None),)
+        with TemporaryDirectory() as tmpdir:
+            self.assertIsNone(
+                get_parser_class("doc.pdf")
+            )
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -6,9 +6,6 @@ from django_filters.rest_framework import DjangoFilterBackend
 from rest_framework.decorators import action
 from rest_framework.response import Response
 from rest_framework.views import APIView
-from whoosh import highlight
-from whoosh.qparser import QueryParser
-from whoosh.query import terms

 from paperless.db import GnuPG
 from paperless.views import StandardPagination
@@ -97,7 +94,16 @@ class DocumentViewSet(RetrieveModelMixin,
    filter_class = DocumentFilterSet
    search_fields = ("title", "correspondent__name", "content")
    ordering_fields = (
-        "id", "title", "correspondent__name", "created", "modified", "added", "archive_serial_number")
+        "id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
+
+    def update(self, request, *args, **kwargs):
+        response = super(DocumentViewSet, self).update(request, *args, **kwargs)
+        index.add_or_update_document(self.get_object())
+        return response
+
+    def destroy(self, request, *args, **kwargs):
+        index.remove_document_from_index(self.get_object())
+        return super(DocumentViewSet, self).destroy(request, *args, **kwargs)

    def file_response(self, pk, disposition):
        #TODO: this should not be necessary here.
@@ -185,18 +191,13 @@ class SearchView(APIView):
            except (ValueError, TypeError):
                page = 1

-            with self.ix.searcher() as searcher:
-                query_parser = QueryParser("content", self.ix.schema).parse(query)
-                result_page = searcher.search_page(query_parser, page)
-                result_page.results.fragmenter = highlight.ContextFragmenter(
-                    surround=50)
-                result_page.results.formatter = index.JsonFormatter()
+            result_page = index.query_page(self.ix, query, page)

-                return Response(
-                    {'count': len(result_page),
-                     'page': result_page.pagenum,
-                     'page_count': result_page.pagecount,
-                     'results': list(map(self.add_infos_to_hit, result_page))})
+            return Response(
+                {'count': len(result_page),
+                 'page': result_page.pagenum,
+                 'page_count': result_page.pagecount,
+                 'results': list(map(self.add_infos_to_hit, result_page))})

        else:
            return Response({