Merge branch 'dev' into feature/any-all-filtering

Committed by Michael Shamoon on 2022-02-14 22:23:31 -08:00
323 changed files with 85068 additions and 12189 deletions

src/documents/admin.py (78 lines changed) Executable file → Normal file

@@ -1,10 +1,6 @@
from django.contrib import admin
from django.utils.html import format_html, format_html_join
from django.utils.safestring import mark_safe
from whoosh.writing import AsyncWriter
from . import index
from .models import Correspondent, Document, DocumentType, Log, Tag, \
from .models import Correspondent, Document, DocumentType, Tag, \
SavedView, SavedViewFilterRule
@@ -23,12 +19,12 @@ class TagAdmin(admin.ModelAdmin):
list_display = (
"name",
"colour",
"color",
"match",
"matching_algorithm"
)
list_filter = ("colour", "matching_algorithm")
list_editable = ("colour", "match", "matching_algorithm")
list_filter = ("color", "matching_algorithm")
list_editable = ("color", "match", "matching_algorithm")
class DocumentTypeAdmin(admin.ModelAdmin):
@@ -50,26 +46,31 @@ class DocumentAdmin(admin.ModelAdmin):
"modified",
"mime_type",
"storage_type",
"filename")
"filename",
"checksum",
"archive_filename",
"archive_checksum"
)
list_display_links = ("title",)
list_display = (
"correspondent",
"id",
"title",
"tags_",
"created",
"mime_type",
"filename",
"archive_filename"
)
list_filter = (
"document_type",
"tags",
"correspondent"
("mime_type"),
("archive_serial_number", admin.EmptyFieldListFilter),
("archive_filename", admin.EmptyFieldListFilter),
)
filter_horizontal = ("tags",)
ordering = ["-created"]
ordering = ["-id"]
date_hierarchy = "created"
@@ -81,56 +82,24 @@ class DocumentAdmin(admin.ModelAdmin):
created_.short_description = "Created"
def delete_queryset(self, request, queryset):
ix = index.open_index()
with AsyncWriter(ix) as writer:
from documents import index
with index.open_index_writer() as writer:
for o in queryset:
index.remove_document(writer, o)
super(DocumentAdmin, self).delete_queryset(request, queryset)
def delete_model(self, request, obj):
from documents import index
index.remove_document_from_index(obj)
super(DocumentAdmin, self).delete_model(request, obj)
def save_model(self, request, obj, form, change):
from documents import index
index.add_or_update_document(obj)
super(DocumentAdmin, self).save_model(request, obj, form, change)
@mark_safe
def tags_(self, obj):
r = ""
for tag in obj.tags.all():
r += self._html_tag(
"span",
tag.name + ", "
)
return r
@staticmethod
def _html_tag(kind, inside=None, **kwargs):
attributes = format_html_join(' ', '{}="{}"', kwargs.items())
if inside is not None:
return format_html("<{kind} {attributes}>{inside}</{kind}>",
kind=kind, attributes=attributes, inside=inside)
return format_html("<{} {}/>", kind, attributes)
class LogAdmin(admin.ModelAdmin):
def has_add_permission(self, request):
return False
def has_change_permission(self, request, obj=None):
return False
list_display = ("created", "message", "level",)
list_filter = ("level", "created",)
ordering = ('-created',)
list_display_links = ("created", "message")
class RuleInline(admin.TabularInline):
model = SavedViewFilterRule
@@ -149,5 +118,4 @@ admin.site.register(Correspondent, CorrespondentAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(DocumentType, DocumentTypeAdmin)
admin.site.register(Document, DocumentAdmin)
admin.site.register(Log, LogAdmin)
admin.site.register(SavedView, SavedViewAdmin)
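The reworked DocumentAdmin above filters on empty archive fields via Django's built-in admin.EmptyFieldListFilter (available since Django 3.1). A minimal sketch of that pattern, with a hypothetical Book model standing in for Document:

from django.contrib import admin
from myapp.models import Book  # hypothetical model, for illustration only

@admin.register(Book)
class BookAdmin(admin.ModelAdmin):
    list_filter = (
        "author",                                 # plain field filter
        ("summary", admin.EmptyFieldListFilter),  # adds "Empty"/"Not empty" choices
    )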


@@ -0,0 +1,60 @@
from zipfile import ZipFile
from documents.models import Document
class BulkArchiveStrategy:
def __init__(self, zipf: ZipFile):
self.zipf = zipf
def make_unique_filename(self,
doc: Document,
archive: bool = False,
folder: str = ""):
counter = 0
while True:
filename = folder + doc.get_public_filename(archive, counter)
if filename in self.zipf.namelist():
counter += 1
else:
return filename
def add_document(self, doc: Document):
raise NotImplementedError() # pragma: no cover
class OriginalsOnlyStrategy(BulkArchiveStrategy):
def add_document(self, doc: Document):
self.zipf.write(doc.source_path, self.make_unique_filename(doc))
class ArchiveOnlyStrategy(BulkArchiveStrategy):
def __init__(self, zipf):
super(ArchiveOnlyStrategy, self).__init__(zipf)
def add_document(self, doc: Document):
if doc.has_archive_version:
self.zipf.write(doc.archive_path,
self.make_unique_filename(doc, archive=True))
else:
self.zipf.write(doc.source_path,
self.make_unique_filename(doc))
class OriginalAndArchiveStrategy(BulkArchiveStrategy):
def add_document(self, doc: Document):
if doc.has_archive_version:
self.zipf.write(
doc.archive_path, self.make_unique_filename(
doc, archive=True, folder="archive/"
)
)
self.zipf.write(
doc.source_path,
self.make_unique_filename(doc, folder="originals/")
)
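The strategy classes above decide how each document ends up in a bulk-download zip; make_unique_filename appends _01, _02, ... on name collisions. A hedged usage sketch follows; the module path of the new file is not shown in this hunk, so the import is an assumption:

from zipfile import ZipFile
from documents.models import Document
from documents.bulk_download import OriginalAndArchiveStrategy  # module path assumed

docs = Document.objects.filter(id__in=[1, 2, 3])  # example ids
with ZipFile("/tmp/download.zip", "w") as zipf:
    strategy = OriginalAndArchiveStrategy(zipf)
    for doc in docs:
        # originals land in "originals/", archived PDFs in "archive/"
        strategy.add_document(doc)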


@@ -2,9 +2,7 @@ import itertools
from django.db.models import Q
from django_q.tasks import async_task
from whoosh.writing import AsyncWriter
from documents import index
from documents.models import Document, Correspondent, DocumentType
@@ -99,8 +97,9 @@ def modify_tags(doc_ids, add_tags, remove_tags):
def delete(doc_ids):
Document.objects.filter(id__in=doc_ids).delete()
ix = index.open_index()
with AsyncWriter(ix) as writer:
from documents import index
with index.open_index_writer() as writer:
for id in doc_ids:
index.remove_document_by_id(writer, id)

src/documents/classifier.py (117 lines changed) Executable file → Normal file

@@ -3,12 +3,9 @@ import logging
import os
import pickle
import re
import shutil
from django.conf import settings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.utils.multiclass import type_of_target
from documents.models import Document, MatchingModel
@@ -17,7 +14,11 @@ class IncompatibleClassifierVersionError(Exception):
pass
logger = logging.getLogger(__name__)
class ClassifierModelCorruptError(Exception):
pass
logger = logging.getLogger("paperless.classifier")
def preprocess_content(content):
@@ -26,15 +27,46 @@ def preprocess_content(content):
return content
def load_classifier():
if not os.path.isfile(settings.MODEL_FILE):
logger.debug(
f"Document classification model does not exist (yet), not "
f"performing automatic matching."
)
return None
classifier = DocumentClassifier()
try:
classifier.load()
except (ClassifierModelCorruptError,
IncompatibleClassifierVersionError):
# there's something wrong with the model file.
logger.exception(
f"Unrecoverable error while loading document "
f"classification model, deleting model file."
)
os.unlink(settings.MODEL_FILE)
classifier = None
except OSError:
logger.exception(
f"IO error while loading document classification model"
)
classifier = None
except Exception:
logger.exception(
f"Unknown error while loading document classification model"
)
classifier = None
return classifier
class DocumentClassifier(object):
FORMAT_VERSION = 6
def __init__(self):
# mtime of the model file on disk. used to prevent reloading when
# nothing has changed.
self.classifier_version = 0
# hash of the training data. used to prevent re-training when the
# training data has not changed.
self.data_hash = None
@@ -45,20 +77,15 @@ class DocumentClassifier(object):
self.correspondent_classifier = None
self.document_type_classifier = None
def reload(self):
if os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
with open(settings.MODEL_FILE, "rb") as f:
schema_version = pickle.load(f)
def load(self):
with open(settings.MODEL_FILE, "rb") as f:
schema_version = pickle.load(f)
if schema_version != self.FORMAT_VERSION:
raise IncompatibleClassifierVersionError(
"Cannot load classifier, incompatible versions.")
else:
if self.classifier_version > 0:
# Don't be confused by this check. It's simply here
# so that we wont log anything on initial reload.
logger.info("Classifier updated on disk, "
"reloading classifier models")
if schema_version != self.FORMAT_VERSION:
raise IncompatibleClassifierVersionError(
"Cannot load classifier, incompatible versions.")
else:
try:
self.data_hash = pickle.load(f)
self.data_vectorizer = pickle.load(f)
self.tags_binarizer = pickle.load(f)
@@ -66,10 +93,14 @@ class DocumentClassifier(object):
self.tags_classifier = pickle.load(f)
self.correspondent_classifier = pickle.load(f)
self.document_type_classifier = pickle.load(f)
self.classifier_version = os.path.getmtime(settings.MODEL_FILE)
except Exception:
raise ClassifierModelCorruptError()
def save_classifier(self):
with open(settings.MODEL_FILE, "wb") as f:
def save(self):
target_file = settings.MODEL_FILE
target_file_temp = settings.MODEL_FILE + ".part"
with open(target_file_temp, "wb") as f:
pickle.dump(self.FORMAT_VERSION, f)
pickle.dump(self.data_hash, f)
pickle.dump(self.data_vectorizer, f)
@@ -80,14 +111,19 @@ class DocumentClassifier(object):
pickle.dump(self.correspondent_classifier, f)
pickle.dump(self.document_type_classifier, f)
if os.path.isfile(target_file):
os.unlink(target_file)
shutil.move(target_file_temp, target_file)
def train(self):
data = list()
labels_tags = list()
labels_correspondent = list()
labels_document_type = list()
# Step 1: Extract and preprocess training data from the database.
logging.getLogger(__name__).debug("Gathering data from database...")
logger.debug("Gathering data from database...")
m = hashlib.sha1()
for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True): # NOQA: E501
preprocessed_content = preprocess_content(doc.content)
@@ -108,10 +144,11 @@ class DocumentClassifier(object):
m.update(y.to_bytes(4, 'little', signed=True))
labels_correspondent.append(y)
tags = [tag.pk for tag in doc.tags.filter(
tags = sorted([tag.pk for tag in doc.tags.filter(
matching_algorithm=MatchingModel.MATCH_AUTO
)]
m.update(bytearray(tags))
)])
for tag in tags:
m.update(tag.to_bytes(4, 'little', signed=True))
labels_tags.append(tags)
if not data:
@@ -134,7 +171,7 @@ class DocumentClassifier(object):
num_correspondents = len(set(labels_correspondent) | {-1}) - 1
num_document_types = len(set(labels_document_type) | {-1}) - 1
logging.getLogger(__name__).debug(
logger.debug(
"{} documents, {} tag(s), {} correspondent(s), "
"{} document type(s).".format(
len(data),
@@ -144,8 +181,12 @@ class DocumentClassifier(object):
)
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
# Step 2: vectorize data
logging.getLogger(__name__).debug("Vectorizing data...")
logger.debug("Vectorizing data...")
self.data_vectorizer = CountVectorizer(
analyzer="word",
ngram_range=(1, 2),
@@ -155,7 +196,7 @@ class DocumentClassifier(object):
# Step 3: train the classifiers
if num_tags > 0:
logging.getLogger(__name__).debug("Training tags classifier...")
logger.debug("Training tags classifier...")
if num_tags == 1:
# Special case where only one tag has auto:
@@ -174,12 +215,12 @@ class DocumentClassifier(object):
self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
else:
self.tags_classifier = None
logging.getLogger(__name__).debug(
logger.debug(
"There are no tags. Not training tags classifier."
)
if num_correspondents > 0:
logging.getLogger(__name__).debug(
logger.debug(
"Training correspondent classifier..."
)
self.correspondent_classifier = MLPClassifier(tol=0.01)
@@ -189,13 +230,13 @@ class DocumentClassifier(object):
)
else:
self.correspondent_classifier = None
logging.getLogger(__name__).debug(
logger.debug(
"There are no correspondents. Not training correspondent "
"classifier."
)
if num_document_types > 0:
logging.getLogger(__name__).debug(
logger.debug(
"Training document type classifier..."
)
self.document_type_classifier = MLPClassifier(tol=0.01)
@@ -205,7 +246,7 @@ class DocumentClassifier(object):
)
else:
self.document_type_classifier = None
logging.getLogger(__name__).debug(
logger.debug(
"There are no document types. Not training document type "
"classifier."
)
@@ -237,6 +278,8 @@ class DocumentClassifier(object):
return None
def predict_tags(self, content):
from sklearn.utils.multiclass import type_of_target
if self.tags_classifier:
X = self.data_vectorizer.transform([preprocess_content(content)])
y = self.tags_classifier.predict(X)
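load_classifier() now wraps model loading: it returns None when no model file exists, deletes a corrupt or version-incompatible model file, and logs instead of raising. A hedged usage sketch, assuming doc is an existing Document:

from documents.classifier import load_classifier

classifier = load_classifier()   # None if the model is missing or unusable
if classifier is not None:
    tag_ids = classifier.predict_tags(doc.content)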

src/documents/consumer.py (172 lines changed) Executable file → Normal file

@@ -1,9 +1,12 @@
import datetime
import hashlib
import os
import uuid
from subprocess import Popen
import magic
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from django.conf import settings
from django.db import transaction
from django.db.models import Q
@@ -11,7 +14,7 @@ from django.utils import timezone
from filelock import FileLock
from rest_framework.reverse import reverse
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .classifier import load_classifier
from .file_handling import create_source_path_directory, \
generate_unique_filename
from .loggers import LoggingMixin
@@ -27,8 +30,45 @@ class ConsumerError(Exception):
pass
MESSAGE_DOCUMENT_ALREADY_EXISTS = "document_already_exists"
MESSAGE_FILE_NOT_FOUND = "file_not_found"
MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND = "pre_consume_script_not_found"
MESSAGE_PRE_CONSUME_SCRIPT_ERROR = "pre_consume_script_error"
MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND = "post_consume_script_not_found"
MESSAGE_POST_CONSUME_SCRIPT_ERROR = "post_consume_script_error"
MESSAGE_NEW_FILE = "new_file"
MESSAGE_UNSUPPORTED_TYPE = "unsupported_type"
MESSAGE_PARSING_DOCUMENT = "parsing_document"
MESSAGE_GENERATING_THUMBNAIL = "generating_thumbnail"
MESSAGE_PARSE_DATE = "parse_date"
MESSAGE_SAVE_DOCUMENT = "save_document"
MESSAGE_FINISHED = "finished"
class Consumer(LoggingMixin):
logging_name = "paperless.consumer"
def _send_progress(self, current_progress, max_progress, status,
message=None, document_id=None):
payload = {
'filename': os.path.basename(self.filename) if self.filename else None, # NOQA: E501
'task_id': self.task_id,
'current_progress': current_progress,
'max_progress': max_progress,
'status': status,
'message': message,
'document_id': document_id
}
async_to_sync(self.channel_layer.group_send)("status_updates",
{'type': 'status_update',
'data': payload})
def _fail(self, message, log_message=None, exc_info=None):
self._send_progress(100, 100, 'FAILED', message)
self.log("error", log_message or message, exc_info=exc_info)
raise ConsumerError(f"{self.filename}: {log_message or message}")
def __init__(self):
super().__init__()
self.path = None
@@ -37,15 +77,16 @@ class Consumer(LoggingMixin):
self.override_correspondent_id = None
self.override_tag_ids = None
self.override_document_type_id = None
self.task_id = None
self.channel_layer = get_channel_layer()
def pre_check_file_exists(self):
if not os.path.isfile(self.path):
self.log(
"error",
"Cannot consume {}: It is not a file.".format(self.path)
self._fail(
MESSAGE_FILE_NOT_FOUND,
f"Cannot consume {self.path}: File not found."
)
raise ConsumerError("Cannot consume {}: It is not a file".format(
self.path))
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
@@ -53,12 +94,9 @@ class Consumer(LoggingMixin):
if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists(): # NOQA: E501
if settings.CONSUMER_DELETE_DUPLICATES:
os.unlink(self.path)
self.log(
"error",
"Not consuming {}: It is a duplicate.".format(self.filename)
)
raise ConsumerError(
"Not consuming {}: It is a duplicate.".format(self.filename)
self._fail(
MESSAGE_DOCUMENT_ALREADY_EXISTS,
f"Not consuming {self.filename}: It is a duplicate."
)
def pre_check_directories(self):
@@ -72,15 +110,21 @@ class Consumer(LoggingMixin):
return
if not os.path.isfile(settings.PRE_CONSUME_SCRIPT):
raise ConsumerError(
self._fail(
MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND,
f"Configured pre-consume script "
f"{settings.PRE_CONSUME_SCRIPT} does not exist.")
self.log("info",
f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
try:
Popen((settings.PRE_CONSUME_SCRIPT, self.path)).wait()
except Exception as e:
raise ConsumerError(
f"Error while executing pre-consume script: {e}"
self._fail(
MESSAGE_PRE_CONSUME_SCRIPT_ERROR,
f"Error while executing pre-consume script: {e}",
exc_info=True
)
def run_post_consume_script(self, document):
@@ -88,9 +132,16 @@ class Consumer(LoggingMixin):
return
if not os.path.isfile(settings.POST_CONSUME_SCRIPT):
raise ConsumerError(
self._fail(
MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND,
f"Configured post-consume script "
f"{settings.POST_CONSUME_SCRIPT} does not exist.")
f"{settings.POST_CONSUME_SCRIPT} does not exist."
)
self.log(
"info",
f"Executing post-consume script {settings.POST_CONSUME_SCRIPT}"
)
try:
Popen((
@@ -106,8 +157,10 @@ class Consumer(LoggingMixin):
"name", flat=True)))
)).wait()
except Exception as e:
raise ConsumerError(
f"Error while executing pre-consume script: {e}"
self._fail(
MESSAGE_POST_CONSUME_SCRIPT_ERROR,
f"Error while executing post-consume script: {e}",
exc_info=True
)
def try_consume_file(self,
@@ -116,7 +169,8 @@ class Consumer(LoggingMixin):
override_title=None,
override_correspondent_id=None,
override_document_type_id=None,
override_tag_ids=None):
override_tag_ids=None,
task_id=None):
"""
Return the document object if it was successfully created.
"""
@@ -127,6 +181,9 @@ class Consumer(LoggingMixin):
self.override_correspondent_id = override_correspondent_id
self.override_document_type_id = override_document_type_id
self.override_tag_ids = override_tag_ids
self.task_id = task_id or str(uuid.uuid4())
self._send_progress(0, 100, 'STARTING', MESSAGE_NEW_FILE)
# this is for grouping logging entries for this particular file
# together.
@@ -149,11 +206,10 @@ class Consumer(LoggingMixin):
parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class:
raise ConsumerError(
f"Unsupported mime type {mime_type} of file {self.filename}")
else:
self.log("debug",
f"Parser: {parser_class.__name__}")
self._fail(
MESSAGE_UNSUPPORTED_TYPE,
f"Unsupported mime type {mime_type}"
)
# Notify all listeners that we're going to do some work.
@@ -165,35 +221,53 @@ class Consumer(LoggingMixin):
self.run_pre_consume_script()
def progress_callback(current_progress, max_progress):
# recalculate progress to be within 20 and 80
p = int((current_progress / max_progress) * 50 + 20)
self._send_progress(p, 100, "WORKING")
# This doesn't parse the document yet, but gives us a parser.
document_parser = parser_class(self.logging_group)
document_parser = parser_class(self.logging_group, progress_callback)
self.log("debug", f"Parser: {type(document_parser).__name__}")
# However, this already created working directories which we have to
# clean up.
# Parse the document. This may take some time.
text = None
date = None
thumbnail = None
archive_path = None
try:
self._send_progress(20, 100, 'WORKING', MESSAGE_PARSING_DOCUMENT)
self.log("debug", "Parsing {}...".format(self.filename))
document_parser.parse(self.path, mime_type, self.filename)
self.log("debug", f"Generating thumbnail for {self.filename}...")
self._send_progress(70, 100, 'WORKING',
MESSAGE_GENERATING_THUMBNAIL)
thumbnail = document_parser.get_optimised_thumbnail(
self.path, mime_type)
self.path, mime_type, self.filename)
text = document_parser.get_text()
date = document_parser.get_date()
if not date:
self._send_progress(90, 100, 'WORKING',
MESSAGE_PARSE_DATE)
date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path()
except ParseError as e:
document_parser.cleanup()
self.log(
"error",
f"Error while consuming document {self.filename}: {e}")
raise ConsumerError(e)
self._fail(
str(e),
f"Error while consuming document {self.filename}: {e}",
exc_info=True
)
# Prepare the document classifier.
@@ -201,15 +275,9 @@ class Consumer(LoggingMixin):
# reloading the classifier multiple times, since there are multiple
# post-consume hooks that all require the classifier.
try:
classifier = DocumentClassifier()
classifier.reload()
except (OSError, EOFError, IncompatibleClassifierVersionError) as e:
self.log(
"warning",
f"Cannot classify documents: {e}.")
classifier = None
classifier = load_classifier()
self._send_progress(95, 100, 'WORKING', MESSAGE_SAVE_DOCUMENT)
# now that everything is done, we can start to store the document
# in the system. This will be a transaction and reasonably fast.
try:
@@ -235,8 +303,7 @@ class Consumer(LoggingMixin):
# After everything is in the database, copy the files into
# place. If this fails, we'll also rollback the transaction.
with FileLock(settings.MEDIA_LOCK):
document.filename = generate_unique_filename(
document, settings.ORIGINALS_DIR)
document.filename = generate_unique_filename(document)
create_source_path_directory(document.source_path)
self._write(document.storage_type,
@@ -246,6 +313,10 @@ class Consumer(LoggingMixin):
thumbnail, document.thumbnail_path)
if archive_path and os.path.isfile(archive_path):
document.archive_filename = generate_unique_filename(
document,
archive_filename=True
)
create_source_path_directory(document.archive_path)
self._write(document.storage_type,
archive_path, document.archive_path)
@@ -262,13 +333,22 @@ class Consumer(LoggingMixin):
self.log("debug", "Deleting file {}".format(self.path))
os.unlink(self.path)
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = os.path.join(
os.path.dirname(self.path),
"._" + os.path.basename(self.path))
if os.path.isfile(shadow_file):
self.log("debug", "Deleting file {}".format(shadow_file))
os.unlink(shadow_file)
except Exception as e:
self.log(
"error",
self._fail(
str(e),
f"The following error occurred while consuming "
f"{self.filename}: {e}"
f"{self.filename}: {e}",
exc_info=True
)
raise ConsumerError(e)
finally:
document_parser.cleanup()
@@ -279,6 +359,8 @@ class Consumer(LoggingMixin):
"Document {} consumption finished".format(document)
)
self._send_progress(100, 100, 'SUCCESS', MESSAGE_FINISHED, document.id)
return document
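The consumer now reports progress over the channel layer and routes failures through _fail(), which pushes a FAILED status before raising ConsumerError. A hedged sketch of driving it; the leading path argument to try_consume_file is assumed (it is cut off in the hunk above), and the path, title and task id are examples:

from documents.consumer import Consumer, ConsumerError

consumer = Consumer()
try:
    document = consumer.try_consume_file(
        "/usr/src/paperless/consume/invoice.pdf",
        override_title="Invoice",
        task_id="some-task-uuid",   # groups the websocket status updates
    )
except ConsumerError:
    # _fail() has already sent a FAILED status update and logged the error
    raise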
def _store(self, text, date, mime_type):


@@ -8,6 +8,9 @@ from django.conf import settings
from django.template.defaultfilters import slugify
logger = logging.getLogger("paperless.filehandling")
class defaultdictNoStr(defaultdict):
def __str__(self):
@@ -76,12 +79,40 @@ def many_to_dictionary(field):
return mydictionary
def generate_unique_filename(doc, root):
def generate_unique_filename(doc,
archive_filename=False):
"""
Generates a unique filename for doc in settings.ORIGINALS_DIR.
The returned filename is guaranteed to be either the current filename
of the document if unchanged, or a new filename that does not correspond
to any existing files. The function will append _01, _02, etc to the
filename before the extension to avoid conflicts.
If archive_filename is True, return a unique archive filename instead.
"""
if archive_filename:
old_filename = doc.archive_filename
root = settings.ARCHIVE_DIR
else:
old_filename = doc.filename
root = settings.ORIGINALS_DIR
# If generating archive filenames, try to make a name that is similar to
# the original filename first.
if archive_filename and doc.filename:
new_filename = os.path.splitext(doc.filename)[0] + ".pdf"
if new_filename == old_filename or not os.path.exists(os.path.join(root, new_filename)): # NOQA: E501
return new_filename
counter = 0
while True:
new_filename = generate_filename(doc, counter)
if new_filename == doc.filename:
new_filename = generate_filename(
doc, counter, archive_filename=archive_filename)
if new_filename == old_filename:
# still the same as before.
return new_filename
@@ -91,7 +122,7 @@ def generate_unique_filename(doc, root):
return new_filename
def generate_filename(doc, counter=0, append_gpg=True):
def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
path = ""
try:
@@ -120,6 +151,11 @@ def generate_filename(doc, counter=0, append_gpg=True):
else:
document_type = "none"
if doc.archive_serial_number:
asn = str(doc.archive_serial_number)
else:
asn = "none"
path = settings.PAPERLESS_FILENAME_FORMAT.format(
title=pathvalidate.sanitize_filename(
doc.title, replacement_text="-"),
@@ -133,6 +169,7 @@ def generate_filename(doc, counter=0, append_gpg=True):
added_year=doc.added.year if doc.added else "none",
added_month=f"{doc.added.month:02}" if doc.added else "none",
added_day=f"{doc.added.day:02}" if doc.added else "none",
asn=asn,
tags=tags,
tag_list=tag_list
).strip()
@@ -140,23 +177,21 @@ def generate_filename(doc, counter=0, append_gpg=True):
path = path.strip(os.sep)
except (ValueError, KeyError, IndexError):
logging.getLogger(__name__).warning(
logger.warning(
f"Invalid PAPERLESS_FILENAME_FORMAT: "
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
counter_str = f"_{counter:02}" if counter else ""
filetype_str = ".pdf" if archive_filename else doc.file_type
if len(path) > 0:
filename = f"{path}{counter_str}{doc.file_type}"
filename = f"{path}{counter_str}{filetype_str}"
else:
filename = f"{doc.pk:07}{counter_str}{doc.file_type}"
filename = f"{doc.pk:07}{counter_str}{filetype_str}"
# Append .gpg for encrypted files
if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
def archive_name_from_filename(filename):
return os.path.splitext(filename)[0] + ".pdf"
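generate_unique_filename() now also produces archive filenames, preferring a name with the same stem as the original before falling back to the counter scheme. A hedged illustration, assuming doc is an existing Document whose original file is "2021/invoice-0001.png":

from documents.file_handling import generate_unique_filename

name = generate_unique_filename(doc, archive_filename=True)
# Tries "2021/invoice-0001.pdf" first (same stem as the original); if that name
# is already taken in ARCHIVE_DIR, it falls back to the PAPERLESS_FILENAME_FORMAT
# path with _01, _02, ... counters appended before the extension.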

src/documents/filters.py (13 lines changed) Executable file → Normal file

@@ -1,3 +1,4 @@
from django.db.models import Q
from django_filters.rest_framework import BooleanFilter, FilterSet, Filter
from .models import Correspondent, Document, Tag, DocumentType, Log
@@ -74,6 +75,16 @@ class InboxFilter(Filter):
return qs
class TitleContentFilter(Filter):
def filter(self, qs, value):
if value:
return qs.filter(Q(title__icontains=value) |
Q(content__icontains=value))
else:
return qs
class DocumentFilterSet(FilterSet):
is_tagged = BooleanFilter(
@@ -91,6 +102,8 @@ class DocumentFilterSet(FilterSet):
is_in_inbox = InboxFilter()
title_content = TitleContentFilter()
class Meta:
model = Document
fields = {
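The new title_content filter OR-combines title and content matches, e.g. for requests like /api/documents/?title_content=invoice. A hedged sketch of the equivalent filterset call:

from documents.filters import DocumentFilterSet
from documents.models import Document

fs = DocumentFilterSet(
    {"title_content": "invoice"},
    queryset=Document.objects.all(),
)
matching = fs.qs   # title__icontains="invoice" OR content__icontains="invoice"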


@@ -2,75 +2,70 @@ import logging
import os
from contextlib import contextmanager
import math
from dateutil.parser import isoparse
from django.conf import settings
from whoosh import highlight, classify, query
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
from whoosh.highlight import Formatter, get_text
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME, BOOLEAN
from whoosh.highlight import HtmlFormatter
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.searching import ResultsPage, Searcher
from whoosh.writing import AsyncWriter
from documents.models import Document
logger = logging.getLogger(__name__)
class JsonFormatter(Formatter):
def __init__(self):
self.seen = {}
def format_token(self, text, token, replace=False):
ttext = self._text(get_text(text, token, replace))
return {'text': ttext, 'highlight': 'true'}
def format_fragment(self, fragment, replace=False):
output = []
index = fragment.startchar
text = fragment.text
amend_token = None
for t in fragment.matches:
if t.startchar is None:
continue
if t.startchar < index:
continue
if t.startchar > index:
text_inbetween = text[index:t.startchar]
if amend_token and t.startchar - index < 10:
amend_token['text'] += text_inbetween
else:
output.append({'text': text_inbetween,
'highlight': False})
amend_token = None
token = self.format_token(text, t, replace)
if amend_token:
amend_token['text'] += token['text']
else:
output.append(token)
amend_token = token
index = t.endchar
if index < fragment.endchar:
output.append({'text': text[index:fragment.endchar],
'highlight': False})
return output
def format(self, fragments, replace=False):
output = []
for fragment in fragments:
output.append(self.format_fragment(fragment, replace=replace))
return output
logger = logging.getLogger("paperless.index")
def get_schema():
return Schema(
id=NUMERIC(stored=True, unique=True, numtype=int),
title=TEXT(stored=True),
id=NUMERIC(
stored=True,
unique=True
),
title=TEXT(
sortable=True
),
content=TEXT(),
correspondent=TEXT(stored=True),
tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True),
type=TEXT(stored=True),
created=DATETIME(stored=True, sortable=True),
modified=DATETIME(stored=True, sortable=True),
added=DATETIME(stored=True, sortable=True),
asn=NUMERIC(
sortable=True
),
correspondent=TEXT(
sortable=True
),
correspondent_id=NUMERIC(),
has_correspondent=BOOLEAN(),
tag=KEYWORD(
commas=True,
scorable=True,
lowercase=True
),
tag_id=KEYWORD(
commas=True,
scorable=True
),
has_tag=BOOLEAN(),
type=TEXT(
sortable=True
),
type_id=NUMERIC(),
has_type=BOOLEAN(),
created=DATETIME(
sortable=True
),
modified=DATETIME(
sortable=True
),
added=DATETIME(
sortable=True
),
)
@@ -78,25 +73,56 @@ def open_index(recreate=False):
try:
if exists_in(settings.INDEX_DIR) and not recreate:
return open_dir(settings.INDEX_DIR, schema=get_schema())
except Exception as e:
logger.error(f"Error while opening the index: {e}, recreating.")
except Exception:
logger.exception(f"Error while opening the index, recreating.")
if not os.path.isdir(settings.INDEX_DIR):
os.makedirs(settings.INDEX_DIR, exist_ok=True)
return create_in(settings.INDEX_DIR, get_schema())
@contextmanager
def open_index_writer(optimize=False):
writer = AsyncWriter(open_index())
try:
yield writer
except Exception as e:
logger.exception(str(e))
writer.cancel()
finally:
writer.commit(optimize=optimize)
@contextmanager
def open_index_searcher():
searcher = open_index().searcher()
try:
yield searcher
finally:
searcher.close()
def update_document(writer, doc):
tags = ",".join([t.name for t in doc.tags.all()])
tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
writer.update_document(
id=doc.pk,
title=doc.title,
content=doc.content,
correspondent=doc.correspondent.name if doc.correspondent else None,
correspondent_id=doc.correspondent.id if doc.correspondent else None,
has_correspondent=doc.correspondent is not None,
tag=tags if tags else None,
tag_id=tags_ids if tags_ids else None,
has_tag=len(tags) > 0,
type=doc.document_type.name if doc.document_type else None,
type_id=doc.document_type.id if doc.document_type else None,
has_type=doc.document_type is not None,
created=doc.created,
added=doc.added,
asn=doc.archive_serial_number,
modified=doc.modified,
)
@@ -110,61 +136,162 @@ def remove_document_by_id(writer, doc_id):
def add_or_update_document(document):
ix = open_index()
with AsyncWriter(ix) as writer:
with open_index_writer() as writer:
update_document(writer, document)
def remove_document_from_index(document):
ix = open_index()
with AsyncWriter(ix) as writer:
with open_index_writer() as writer:
remove_document(writer, document)
@contextmanager
def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
searcher = ix.searcher()
try:
if querystring:
qp = MultifieldParser(
["content", "title", "correspondent", "tag", "type"],
ix.schema)
qp.add_plugin(DateParserPlugin())
str_q = qp.parse(querystring)
corrected = searcher.correct_query(str_q, querystring)
else:
str_q = None
corrected = None
class DelayedQuery:
if more_like_doc_id:
docnum = searcher.document_number(id=more_like_doc_id)
kts = searcher.key_terms_from_text(
'content', more_like_doc_content, numterms=20,
model=classify.Bo1Model, normalize=False)
more_like_q = query.Or(
[query.Term('content', word, boost=weight)
for word, weight in kts])
result_page = searcher.search_page(
more_like_q, page, filter=str_q, mask={docnum})
elif str_q:
result_page = searcher.search_page(str_q, page)
else:
raise ValueError(
"Either querystring or more_like_doc_id is required."
)
def _get_query(self):
raise NotImplementedError()
result_page.results.fragmenter = highlight.ContextFragmenter(
def _get_query_filter(self):
criterias = []
for k, v in self.query_params.items():
if k == 'correspondent__id':
criterias.append(query.Term('correspondent_id', v))
elif k == 'tags__id__all':
for tag_id in v.split(","):
criterias.append(query.Term('tag_id', tag_id))
elif k == 'document_type__id':
criterias.append(query.Term('type_id', v))
elif k == 'correspondent__isnull':
criterias.append(query.Term("has_correspondent", v == "false"))
elif k == 'is_tagged':
criterias.append(query.Term("has_tag", v == "true"))
elif k == 'document_type__isnull':
criterias.append(query.Term("has_type", v == "false"))
elif k == 'created__date__lt':
criterias.append(
query.DateRange("created", start=None, end=isoparse(v)))
elif k == 'created__date__gt':
criterias.append(
query.DateRange("created", start=isoparse(v), end=None))
elif k == 'added__date__gt':
criterias.append(
query.DateRange("added", start=isoparse(v), end=None))
elif k == 'added__date__lt':
criterias.append(
query.DateRange("added", start=None, end=isoparse(v)))
if len(criterias) > 0:
return query.And(criterias)
else:
return None
def _get_query_sortedby(self):
if 'ordering' not in self.query_params:
return None, False
field: str = self.query_params['ordering']
sort_fields_map = {
"created": "created",
"modified": "modified",
"added": "added",
"title": "title",
"correspondent__name": "correspondent",
"document_type__name": "type",
"archive_serial_number": "asn"
}
if field.startswith('-'):
field = field[1:]
reverse = True
else:
reverse = False
if field not in sort_fields_map:
return None, False
else:
return sort_fields_map[field], reverse
def __init__(self, searcher: Searcher, query_params, page_size):
self.searcher = searcher
self.query_params = query_params
self.page_size = page_size
self.saved_results = dict()
self.first_score = None
def __len__(self):
page = self[0:1]
return len(page)
def __getitem__(self, item):
if item.start in self.saved_results:
return self.saved_results[item.start]
q, mask = self._get_query()
sortedby, reverse = self._get_query_sortedby()
page: ResultsPage = self.searcher.search_page(
q,
mask=mask,
filter=self._get_query_filter(),
pagenum=math.floor(item.start / self.page_size) + 1,
pagelen=self.page_size,
sortedby=sortedby,
reverse=reverse
)
page.results.fragmenter = highlight.ContextFragmenter(
surround=50)
result_page.results.formatter = JsonFormatter()
page.results.formatter = HtmlFormatter(tagname="span", between=" ... ")
if corrected and corrected.query != str_q:
if (not self.first_score and
len(page.results) > 0 and
sortedby is None):
self.first_score = page.results[0].score
page.results.top_n = list(map(
lambda hit: (
(hit[0] / self.first_score) if self.first_score else None,
hit[1]
),
page.results.top_n
))
self.saved_results[item.start] = page
return page
class DelayedFullTextQuery(DelayedQuery):
def _get_query(self):
q_str = self.query_params['query']
qp = MultifieldParser(
["content", "title", "correspondent", "tag", "type"],
self.searcher.ixreader.schema)
qp.add_plugin(DateParserPlugin())
q = qp.parse(q_str)
corrected = self.searcher.correct_query(q, q_str)
if corrected.query != q:
corrected_query = corrected.string
else:
corrected_query = None
yield result_page, corrected_query
finally:
searcher.close()
return q, None
class DelayedMoreLikeThisQuery(DelayedQuery):
def _get_query(self):
more_like_doc_id = int(self.query_params['more_like_id'])
content = Document.objects.get(id=more_like_doc_id).content
docnum = self.searcher.document_number(id=more_like_doc_id)
kts = self.searcher.key_terms_from_text(
'content', content, numterms=20,
model=classify.Bo1Model, normalize=False)
q = query.Or(
[query.Term('content', word, boost=weight)
for word, weight in kts])
mask = {docnum}
return q, mask
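DelayedQuery translates REST query parameters into Whoosh filters and sort orders, and only hits the index when a page is actually sliced. A hedged sketch of how the view layer presumably consumes it; the parameters and page size are examples:

from documents import index

with index.open_index_searcher() as searcher:
    q = index.DelayedFullTextQuery(
        searcher,
        {"query": "invoice 2021", "ordering": "-created"},
        page_size=25,
    )
    page = q[0:25]   # runs searcher.search_page(...) lazily and caches the result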
def autocomplete(ix, term, limit=10):


@@ -4,33 +4,24 @@ import uuid
from django.conf import settings
class PaperlessHandler(logging.Handler):
def emit(self, record):
if settings.DISABLE_DBHANDLER:
return
# We have to do the import here or Django will barf when it tries to
# load this because the apps aren't loaded at that point
from .models import Log
kwargs = {"message": record.msg, "level": record.levelno}
if hasattr(record, "group"):
kwargs["group"] = record.group
Log.objects.create(**kwargs)
class LoggingMixin:
logging_group = None
logging_name = None
def renew_logging_group(self):
self.logging_group = uuid.uuid4()
def log(self, level, message, **kwargs):
target = ".".join([self.__class__.__module__, self.__class__.__name__])
logger = logging.getLogger(target)
if self.logging_name:
logger = logging.getLogger(self.logging_name)
else:
name = ".".join([
self.__class__.__module__,
self.__class__.__name__
])
logger = logging.getLogger(name)
getattr(logger, level)(message, extra={
"group": self.logging_group

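With the database Log handler gone, components now name their logger explicitly through logging_name. A hedged sketch; the class and logger name are illustrative:

from documents.loggers import LoggingMixin

class ExampleWorker(LoggingMixin):
    logging_name = "paperless.example"

    def run(self):
        self.renew_logging_group()
        self.log("info", "starting work")   # logs via logging.getLogger("paperless.example")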

@@ -16,12 +16,12 @@ from whoosh.writing import AsyncWriter
from documents.models import Document
from ... import index
from ...file_handling import create_source_path_directory
from ...mixins import Renderable
from ...file_handling import create_source_path_directory, \
generate_unique_filename
from ...parsers import get_parser_class_for_mime_type
logger = logging.getLogger(__name__)
logger = logging.getLogger("paperless.management.archiver")
def handle_document(document_id):
@@ -31,38 +31,57 @@ def handle_document(document_id):
parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class:
logger.error(f"No parser found for mime type {mime_type}, cannot "
f"archive document {document} (ID: {document_id})")
return
parser = parser_class(logging_group=uuid.uuid4())
try:
parser.parse(document.source_path, mime_type)
parser.parse(
document.source_path,
mime_type,
document.get_public_filename())
thumbnail = parser.get_optimised_thumbnail(
document.source_path,
mime_type,
document.get_public_filename()
)
if parser.get_archive_path():
with transaction.atomic():
with open(parser.get_archive_path(), 'rb') as f:
checksum = hashlib.md5(f.read()).hexdigest()
# i'm going to save first so that in case the file move
# I'm going to save first so that in case the file move
# fails, the database is rolled back.
# we also don't use save() since that triggers the filehandling
# We also don't use save() since that triggers the filehandling
# logic, and we don't want that yet (file not yet in place)
document.archive_filename = generate_unique_filename(
document, archive_filename=True)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text()
content=parser.get_text(),
archive_filename=document.archive_filename
)
with FileLock(settings.MEDIA_LOCK):
create_source_path_directory(document.archive_path)
shutil.move(parser.get_archive_path(),
document.archive_path)
shutil.move(thumbnail, document.thumbnail_path)
with AsyncWriter(index.open_index()) as writer:
index.update_document(writer, document)
with index.open_index_writer() as writer:
index.update_document(writer, document)
except Exception as e:
logger.error(f"Error while parsing document {document}: {str(e)}")
logger.exception(f"Error while parsing document {document} "
f"(ID: {document_id})")
finally:
parser.cleanup()
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = """
Using the current classification model, assigns correspondents, tags
@@ -71,10 +90,6 @@ class Command(Renderable, BaseCommand):
modified) after their initial import.
""".replace(" ", "")
def __init__(self, *args, **kwargs):
self.verbosity = 0
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
parser.add_argument(
"-f", "--overwrite",
@@ -91,6 +106,12 @@ class Command(Renderable, BaseCommand):
help="Specify the ID of a document, and this command will only "
"run on this specific document."
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def handle(self, *args, **options):
@@ -106,7 +127,7 @@ class Command(Renderable, BaseCommand):
document_ids = list(map(
lambda doc: doc.id,
filter(
lambda d: overwrite or not d.archive_checksum,
lambda d: overwrite or not d.has_archive_version,
documents
)
))
@@ -125,7 +146,8 @@ class Command(Renderable, BaseCommand):
handle_document,
document_ids
),
total=len(document_ids)
total=len(document_ids),
disable=options['no_progress_bar']
))
except KeyboardInterrupt:
print("Aborting...")


@@ -1,6 +1,7 @@
import logging
import os
from pathlib import Path
from pathlib import Path, PurePath
from threading import Thread
from time import sleep
from django.conf import settings
@@ -17,11 +18,11 @@ try:
except ImportError:
INotify = flags = None
logger = logging.getLogger(__name__)
logger = logging.getLogger("paperless.management.consumer")
def _tags_from_path(filepath):
"""Walk up the directory tree from filepath to CONSUMPTION_DIr
"""Walk up the directory tree from filepath to CONSUMPTION_DIR
and get or create Tag IDs for every directory.
"""
tag_ids = set()
@@ -35,8 +36,15 @@ def _tags_from_path(filepath):
return tag_ids
def _is_ignored(filepath: str) -> bool:
filepath_relative = PurePath(filepath).relative_to(
settings.CONSUMPTION_DIR)
return any(
filepath_relative.match(p) for p in settings.CONSUMER_IGNORE_PATTERNS)
def _consume(filepath):
if os.path.isdir(filepath):
if os.path.isdir(filepath) or _is_ignored(filepath):
return
if not os.path.isfile(filepath):
@@ -54,10 +62,10 @@ def _consume(filepath):
if settings.CONSUMER_SUBDIRS_AS_TAGS:
tag_ids = _tags_from_path(filepath)
except Exception as e:
logger.error(
"Error creating tags from path: {}".format(e))
logger.exception("Error creating tags from path")
try:
logger.info(f"Adding {filepath} to the task queue.")
async_task("documents.tasks.consume_file",
filepath,
override_tag_ids=tag_ids if tag_ids else None,
@@ -66,14 +74,17 @@ def _consume(filepath):
# Catch all so that the consumer won't crash.
# This is also what the test case is listening for to check for
# errors.
logger.error(
"Error while consuming document: {}".format(e))
logger.exception("Error while consuming document")
def _consume_wait_unmodified(file, num_tries=20, wait_time=1):
def _consume_wait_unmodified(file):
if _is_ignored(file):
return
logger.debug(f"Waiting for file {file} to remain unmodified")
mtime = -1
current_try = 0
while current_try < num_tries:
while current_try < settings.CONSUMER_POLLING_RETRY_COUNT:
try:
new_mtime = os.stat(file).st_mtime
except FileNotFoundError:
@@ -84,7 +95,7 @@ def _consume_wait_unmodified(file, num_tries=20, wait_time=1):
_consume(file)
return
mtime = new_mtime
sleep(wait_time)
sleep(settings.CONSUMER_POLLING_DELAY)
current_try += 1
logger.error(f"Timeout while waiting on file {file} to remain unmodified.")
@@ -93,10 +104,14 @@ def _consume_wait_unmodified(file, num_tries=20, wait_time=1):
class Handler(FileSystemEventHandler):
def on_created(self, event):
_consume_wait_unmodified(event.src_path)
Thread(
target=_consume_wait_unmodified, args=(event.src_path,)
).start()
def on_moved(self, event):
_consume_wait_unmodified(event.dest_path)
Thread(
target=_consume_wait_unmodified, args=(event.dest_path,)
).start()
class Command(BaseCommand):
@@ -108,12 +123,7 @@ class Command(BaseCommand):
# This is here primarily for the tests and is irrelevant in production.
stop_flag = False
def __init__(self, *args, **kwargs):
self.logger = logging.getLogger(__name__)
BaseCommand.__init__(self, *args, **kwargs)
self.observer = None
observer = None
def add_arguments(self, parser):
parser.add_argument(
@@ -153,7 +163,7 @@ class Command(BaseCommand):
if options["oneshot"]:
return
if settings.CONSUMER_POLLING == 0:
if settings.CONSUMER_POLLING == 0 and INotify:
self.handle_inotify(directory, recursive)
else:
self.handle_polling(directory, recursive)
@@ -161,7 +171,7 @@ class Command(BaseCommand):
logger.debug("Consumer exiting.")
def handle_polling(self, directory, recursive):
logging.getLogger(__name__).info(
logger.info(
f"Polling directory for changes: {directory}")
self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
self.observer.schedule(Handler(), directory, recursive=recursive)
@@ -176,7 +186,7 @@ class Command(BaseCommand):
self.observer.join()
def handle_inotify(self, directory, recursive):
logging.getLogger(__name__).info(
logger.info(
f"Using inotify to watch directory for changes: {directory}")
inotify = INotify()
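Both the polling and inotify paths now skip files matching CONSUMER_IGNORE_PATTERNS, evaluated relative to the consumption directory. A hedged illustration; the directory and patterns are examples:

# With CONSUMPTION_DIR = "/consume" and CONSUMER_IGNORE_PATTERNS = ["._*", ".DS_STORE/*"]:
#   _is_ignored("/consume/scans/._invoice.pdf")  -> True, the macOS shadow file is skipped
#   _is_ignored("/consume/scans/invoice.pdf")    -> False, the file is queued as usual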


@@ -1,10 +1,9 @@
from django.core.management.base import BaseCommand
from ...mixins import Renderable
from ...tasks import train_classifier
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = """
Trains the classifier on your data and saves the resulting models to a


@@ -6,20 +6,22 @@ import time
import tqdm
from django.conf import settings
from django.contrib.auth.models import User
from django.core import serializers
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from filelock import FileLock
from documents.models import Document, Correspondent, Tag, DocumentType
from documents.models import Document, Correspondent, Tag, DocumentType, \
SavedView, SavedViewFilterRule
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
EXPORTER_ARCHIVE_NAME
from paperless.db import GnuPG
from paperless_mail.models import MailAccount, MailRule
from ...file_handling import generate_filename, delete_empty_directories
from ...mixins import Renderable
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = """
Decrypt and rename all files in our collection into a given target
@@ -55,6 +57,12 @@ class Command(Renderable, BaseCommand):
"do not belong to the current export, such as files from "
"deleted documents."
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def __init__(self, *args, **kwargs):
BaseCommand.__init__(self, *args, **kwargs)
@@ -79,9 +87,9 @@ class Command(Renderable, BaseCommand):
raise CommandError("That path doesn't appear to be writable")
with FileLock(settings.MEDIA_LOCK):
self.dump()
self.dump(options['no_progress_bar'])
def dump(self):
def dump(self, progress_bar_disable=False):
# 1. Take a snapshot of what files exist in the current export folder
for root, dirs, files in os.walk(self.target):
self.files_in_export_dir.extend(
@@ -106,9 +114,27 @@ class Command(Renderable, BaseCommand):
serializers.serialize("json", documents))
manifest += document_manifest
manifest += json.loads(serializers.serialize(
"json", MailAccount.objects.all()))
manifest += json.loads(serializers.serialize(
"json", MailRule.objects.all()))
manifest += json.loads(serializers.serialize(
"json", SavedView.objects.all()))
manifest += json.loads(serializers.serialize(
"json", SavedViewFilterRule.objects.all()))
manifest += json.loads(serializers.serialize(
"json", User.objects.all()))
# 3. Export files from each document
for index, document_dict in tqdm.tqdm(enumerate(document_manifest),
total=len(document_manifest)):
for index, document_dict in tqdm.tqdm(
enumerate(document_manifest),
total=len(document_manifest),
disable=progress_bar_disable
):
# 3.1. store files unencrypted
document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED # NOQA: E501
@@ -140,7 +166,7 @@ class Command(Renderable, BaseCommand):
thumbnail_target = os.path.join(self.target, thumbnail_name)
document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
if os.path.exists(document.archive_path):
if document.has_archive_version:
archive_name = base_name + "-archive.pdf"
archive_target = os.path.join(self.target, archive_name)
document_dict[EXPORTER_ARCHIVE_NAME] = archive_name


@@ -15,7 +15,6 @@ from documents.models import Document
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
EXPORTER_ARCHIVE_NAME
from ...file_handling import create_source_path_directory
from ...mixins import Renderable
from ...signals.handlers import update_filename_and_move_files
@@ -28,7 +27,7 @@ def disable_signal(sig, receiver, sender):
sig.connect(receiver=receiver, sender=sender)
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = """
Using a manifest.json file, load the data from there, and import the
@@ -37,6 +36,12 @@ class Command(Renderable, BaseCommand):
def add_arguments(self, parser):
parser.add_argument("source")
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def __init__(self, *args, **kwargs):
BaseCommand.__init__(self, *args, **kwargs)
@@ -71,7 +76,7 @@ class Command(Renderable, BaseCommand):
# Fill up the database with whatever is in the manifest
call_command("loaddata", manifest_path)
self._import_files_from_manifest()
self._import_files_from_manifest(options['no_progress_bar'])
print("Updating search index...")
call_command('document_index', 'reindex')
@@ -112,7 +117,7 @@ class Command(Renderable, BaseCommand):
f"does not appear to be in the source directory."
)
def _import_files_from_manifest(self):
def _import_files_from_manifest(self, progress_bar_disable):
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
@@ -124,7 +129,10 @@ class Command(Renderable, BaseCommand):
lambda r: r["model"] == "documents.document",
self.manifest))
for record in tqdm.tqdm(manifest_documents):
for record in tqdm.tqdm(
manifest_documents,
disable=progress_bar_disable
):
document = Document.objects.get(pk=record["pk"])
@@ -152,6 +160,9 @@ class Command(Renderable, BaseCommand):
shutil.copy2(thumbnail_path, document.thumbnail_path)
if archive_path:
create_source_path_directory(document.archive_path)
# TODO: this assumes that the export is valid and
# archive_filename is present on all documents with
# archived files
shutil.copy2(archive_path, document.archive_path)
document.save()


@@ -1,26 +1,25 @@
from django.core.management import BaseCommand
from django.db import transaction
from documents.mixins import Renderable
from documents.tasks import index_reindex, index_optimize
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = "Manages the document index."
def __init__(self, *args, **kwargs):
self.verbosity = 0
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
parser.add_argument("command", choices=['reindex', 'optimize'])
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def handle(self, *args, **options):
self.verbosity = options["verbosity"]
with transaction.atomic():
if options['command'] == 'reindex':
index_reindex()
index_reindex(progress_bar_disable=options['no_progress_bar'])
elif options['command'] == 'optimize':
index_optimize()
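A hedged sketch of invoking the index command programmatically; the command name "document_index" is taken from the importer's call_command above:

from django.core.management import call_command

call_command("document_index", "reindex", "--no-progress-bar")
call_command("document_index", "optimize")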


@@ -1,12 +0,0 @@
from django.core.management.base import BaseCommand
from documents.models import Log
class Command(BaseCommand):
help = "A quick & dirty way to see what's in the logs"
def handle(self, *args, **options):
for log in Log.objects.order_by("pk"):
print(log)


@@ -5,24 +5,28 @@ from django.core.management.base import BaseCommand
from django.db.models.signals import post_save
from documents.models import Document
from ...mixins import Renderable
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = """
This will rename all documents to match the latest filename format.
""".replace(" ", "")
def __init__(self, *args, **kwargs):
self.verbosity = 0
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def handle(self, *args, **options):
self.verbosity = options["verbosity"]
logging.getLogger().handlers[0].level = logging.ERROR
for document in tqdm.tqdm(Document.objects.all()):
for document in tqdm.tqdm(
Document.objects.all(),
disable=options['no_progress_bar']
):
post_save.send(Document, instance=document)

src/documents/management/commands/document_retagger.py (64 lines changed) Executable file → Normal file

@@ -1,15 +1,17 @@
import logging
import tqdm
from django.core.management.base import BaseCommand
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from documents.classifier import load_classifier
from documents.models import Document
from ...mixins import Renderable
from ...signals.handlers import set_correspondent, set_document_type, set_tags
class Command(Renderable, BaseCommand):
logger = logging.getLogger("paperless.management.retagger")
class Command(BaseCommand):
help = """
Using the current classification model, assigns correspondents, tags
@@ -18,10 +20,6 @@ class Command(Renderable, BaseCommand):
modified) after their initial import.
""".replace(" ", "")
def __init__(self, *args, **kwargs):
self.verbosity = 0
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
parser.add_argument(
"-c", "--correspondent",
@@ -59,10 +57,26 @@ class Command(Renderable, BaseCommand):
"set correspondent, document and remove correspondents, types"
"and tags that do not match anymore due to changed rules."
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
parser.add_argument(
"--suggest",
default=False,
action="store_true",
help="Return the suggestion, don't change anything."
)
parser.add_argument(
"--base-url",
help="The base URL to use to build the link to the documents."
)
def handle(self, *args, **options):
self.verbosity = options["verbosity"]
# Detect if we support color
color = self.style.ERROR("test") != "test"
if options["inbox_only"]:
queryset = Document.objects.filter(tags__is_inbox_tag=True)
@@ -70,17 +84,12 @@ class Command(Renderable, BaseCommand):
queryset = Document.objects.all()
documents = queryset.distinct()
classifier = DocumentClassifier()
try:
classifier.reload()
except (OSError, EOFError, IncompatibleClassifierVersionError) as e:
logging.getLogger(__name__).warning(
f"Cannot classify documents: {e}.")
classifier = None
classifier = load_classifier()
for document in documents:
logging.getLogger(__name__).info(
f"Processing document {document.title}")
for document in tqdm.tqdm(
documents,
disable=options['no_progress_bar']
):
if options['correspondent']:
set_correspondent(
@@ -88,18 +97,27 @@ class Command(Renderable, BaseCommand):
document=document,
classifier=classifier,
replace=options['overwrite'],
use_first=options['use_first'])
use_first=options['use_first'],
suggest=options['suggest'],
base_url=options['base_url'],
color=color)
if options['document_type']:
set_document_type(sender=None,
document=document,
classifier=classifier,
replace=options['overwrite'],
use_first=options['use_first'])
use_first=options['use_first'],
suggest=options['suggest'],
base_url=options['base_url'],
color=color)
if options['tags']:
set_tags(
sender=None,
document=document,
classifier=classifier,
replace=options['overwrite'])
replace=options['overwrite'],
suggest=options['suggest'],
base_url=options['base_url'],
color=color)
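The retagger gains a dry-run mode: with --suggest it only prints proposed assignments, optionally as document links built from --base-url, instead of changing anything. A hedged invocation sketch; the URL is an example:

from django.core.management import call_command

call_command(
    "document_retagger",
    "--correspondent",
    "--suggest",
    "--base-url", "https://paperless.example.com",
    "--no-progress-bar",
)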


@@ -0,0 +1,23 @@
from django.core.management.base import BaseCommand
from documents.sanity_checker import check_sanity
class Command(BaseCommand):
help = """
This command checks your document archive for issues.
""".replace(" ", "")
def add_arguments(self, parser):
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def handle(self, *args, **options):
messages = check_sanity(progress=not options['no_progress_bar'])
messages.log_messages()


@@ -7,7 +7,6 @@ from django import db
from django.core.management.base import BaseCommand
from documents.models import Document
from ...mixins import Renderable
from ...parsers import get_parser_class_for_mime_type
@@ -23,23 +22,22 @@ def _process_document(doc_in):
try:
thumb = parser.get_optimised_thumbnail(
document.source_path, document.mime_type)
document.source_path,
document.mime_type,
document.get_public_filename()
)
shutil.move(thumb, document.thumbnail_path)
finally:
parser.cleanup()
class Command(Renderable, BaseCommand):
class Command(BaseCommand):
help = """
This will regenerate the thumbnails for all documents.
""".replace(" ", "")
def __init__(self, *args, **kwargs):
self.verbosity = 0
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
parser.add_argument(
"-d", "--document",
@@ -49,11 +47,14 @@ class Command(Renderable, BaseCommand):
help="Specify the ID of a document, and this command will only "
"run on this specific document."
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown"
)
def handle(self, *args, **options):
self.verbosity = options["verbosity"]
logging.getLogger().handlers[0].level = logging.ERROR
if options['document']:
@@ -70,5 +71,7 @@ class Command(Renderable, BaseCommand):
with multiprocessing.Pool() as pool:
list(tqdm.tqdm(
pool.imap_unordered(_process_document, ids), total=len(ids)
pool.imap_unordered(_process_document, ids),
total=len(ids),
disable=options['no_progress_bar']
))


@@ -0,0 +1,42 @@
import logging
import os
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand, CommandError
logger = logging.getLogger("paperless.management.superuser")
class Command(BaseCommand):
help = """
Creates a Django superuser based on env variables.
""".replace(" ", "")
def handle(self, *args, **options):
username = os.getenv('PAPERLESS_ADMIN_USER')
if not username:
return
mail = os.getenv('PAPERLESS_ADMIN_MAIL', 'root@localhost')
password = os.getenv('PAPERLESS_ADMIN_PASSWORD')
# Check if user exists already, leave as is if it does
if User.objects.filter(username=username).exists():
user: User = User.objects.get_by_natural_key(username)
user.set_password(password)
user.save()
self.stdout.write(f"Changed password of user {username}.")
elif password:
# Create superuser based on env variables
User.objects.create_superuser(username, mail, password)
self.stdout.write(
f'Created superuser "{username}" with provided password.')
else:
self.stdout.write(
f'Did not create superuser "{username}".')
self.stdout.write(
'Make sure you specified "PAPERLESS_ADMIN_PASSWORD" in your '
'"docker-compose.env" file.')

View File

@@ -1,18 +1,17 @@
import logging
import re
from fuzzywuzzy import fuzz
from documents.models import MatchingModel, Correspondent, DocumentType, Tag
logger = logging.getLogger(__name__)
logger = logging.getLogger("paperless.matching")
def log_reason(matching_model, document, reason):
class_name = type(matching_model).__name__
logger.debug(
f"Assigning {class_name} {matching_model.name} to document "
f"{class_name} {matching_model.name} matched on document "
f"{document} because {reason}")
@@ -91,7 +90,7 @@ def matches(matching_model, document):
elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
result = bool(re.search(
rf"\b{matching_model.match}\b",
rf"\b{re.escape(matching_model.match)}\b",
document_content,
**search_kwargs
))
@@ -123,6 +122,8 @@ def matches(matching_model, document):
return bool(match)
elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
from fuzzywuzzy import fuzz
match = re.sub(r'[^\w\s]', '', matching_model.match)
text = re.sub(r'[^\w\s]', '', document_content)
if matching_model.is_insensitive:
@@ -160,6 +161,9 @@ def _split_match(matching_model):
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub
return [
normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
# normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
re.escape(
normspace(" ", (t[0] or t[1]).strip())
).replace(r"\ ", r"\s+")
for t in findterms(matching_model.match)
]

View File

@@ -0,0 +1,330 @@
# Generated by Django 3.1.6 on 2021-02-07 22:26
import datetime
import hashlib
import logging
import os
import shutil
from time import sleep
import pathvalidate
from django.conf import settings
from django.db import migrations, models
from django.template.defaultfilters import slugify
from documents.file_handling import defaultdictNoStr, many_to_dictionary
logger = logging.getLogger("paperless.migrations")
###############################################################################
# This is code copied straight from paperless before the change.
###############################################################################
def archive_name_from_filename(filename):
return os.path.splitext(filename)[0] + ".pdf"
def archive_path_old(doc):
if doc.filename:
fname = archive_name_from_filename(doc.filename)
else:
fname = "{:07}.pdf".format(doc.pk)
return os.path.join(
settings.ARCHIVE_DIR,
fname
)
STORAGE_TYPE_GPG = "gpg"
def archive_path_new(doc):
if doc.archive_filename is not None:
return os.path.join(
settings.ARCHIVE_DIR,
str(doc.archive_filename)
)
else:
return None
def source_path(doc):
if doc.filename:
fname = str(doc.filename)
else:
fname = "{:07}{}".format(doc.pk, doc.file_type)
if doc.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg" # pragma: no cover
return os.path.join(
settings.ORIGINALS_DIR,
fname
)
def generate_unique_filename(doc, archive_filename=False):
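# Try candidate filenames with an increasing counter suffix until one is free
# on disk, or until the candidate equals the name the document already has.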
if archive_filename:
old_filename = doc.archive_filename
root = settings.ARCHIVE_DIR
else:
old_filename = doc.filename
root = settings.ORIGINALS_DIR
counter = 0
while True:
new_filename = generate_filename(
doc, counter, archive_filename=archive_filename)
if new_filename == old_filename:
# still the same as before.
return new_filename
if os.path.exists(os.path.join(root, new_filename)):
counter += 1
else:
return new_filename
def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
path = ""
try:
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdictNoStr(lambda: slugify(None),
many_to_dictionary(doc.tags))
tag_list = pathvalidate.sanitize_filename(
",".join(sorted(
[tag.name for tag in doc.tags.all()]
)),
replacement_text="-"
)
if doc.correspondent:
correspondent = pathvalidate.sanitize_filename(
doc.correspondent.name, replacement_text="-"
)
else:
correspondent = "none"
if doc.document_type:
document_type = pathvalidate.sanitize_filename(
doc.document_type.name, replacement_text="-"
)
else:
document_type = "none"
path = settings.PAPERLESS_FILENAME_FORMAT.format(
title=pathvalidate.sanitize_filename(
doc.title, replacement_text="-"),
correspondent=correspondent,
document_type=document_type,
created=datetime.date.isoformat(doc.created),
created_year=doc.created.year if doc.created else "none",
created_month=f"{doc.created.month:02}" if doc.created else "none", # NOQA: E501
created_day=f"{doc.created.day:02}" if doc.created else "none",
added=datetime.date.isoformat(doc.added),
added_year=doc.added.year if doc.added else "none",
added_month=f"{doc.added.month:02}" if doc.added else "none",
added_day=f"{doc.added.day:02}" if doc.added else "none",
tags=tags,
tag_list=tag_list
).strip()
path = path.strip(os.sep)
except (ValueError, KeyError, IndexError):
logger.warning(
f"Invalid PAPERLESS_FILENAME_FORMAT: "
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
counter_str = f"_{counter:02}" if counter else ""
filetype_str = ".pdf" if archive_filename else doc.file_type
if len(path) > 0:
filename = f"{path}{counter_str}{filetype_str}"
else:
filename = f"{doc.pk:07}{counter_str}{filetype_str}"
# Append .gpg for encrypted files
if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
###############################################################################
# This code performs bidirectional archive file transformation.
###############################################################################
def parse_wrapper(parser, path, mime_type, file_name):
# this is here so that I can mock this out for testing.
parser.parse(path, mime_type, file_name)
def create_archive_version(doc, retry_count=3):
from documents.parsers import get_parser_class_for_mime_type, \
DocumentParser, \
ParseError
logger.info(
f"Regenerating archive document for document ID:{doc.id}"
)
parser_class = get_parser_class_for_mime_type(doc.mime_type)
for try_num in range(retry_count):
parser: DocumentParser = parser_class(None, None)
try:
parse_wrapper(parser, source_path(doc), doc.mime_type,
os.path.basename(doc.filename))
doc.content = parser.get_text()
if parser.get_archive_path() and os.path.isfile(
parser.get_archive_path()):
doc.archive_filename = generate_unique_filename(
doc, archive_filename=True)
with open(parser.get_archive_path(), "rb") as f:
doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
os.makedirs(os.path.dirname(archive_path_new(doc)),
exist_ok=True)
shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
else:
doc.archive_checksum = None
logger.error(
f"Parser did not return an archive document for document "
f"ID:{doc.id}. Removing archive document."
)
doc.save()
return
except ParseError:
if try_num + 1 == retry_count:
logger.exception(
f"Unable to regenerate archive document for ID:{doc.id}. You "
f"need to invoke the document_archiver management command "
f"manually for that document."
)
doc.archive_checksum = None
doc.save()
return
else:
# This is mostly here for the tika parser in docker
# environments. The servers for parsing need to come up first,
# and the docker setup doesn't ensure that tika is running
# before attempting migrations.
logger.error("Parse error, will try again in 5 seconds...")
sleep(5)
finally:
parser.cleanup()
def move_old_to_new_locations(apps, schema_editor):
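# Forward migration: archive files whose old "<filename>.pdf" naming collides
# across documents are deleted and regenerated under a unique archive_filename;
# unaffected documents simply record their existing archive name.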
Document = apps.get_model("documents", "Document")
affected_document_ids = set()
old_archive_path_to_id = {}
# check for documents that have incorrect archive versions
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
if old_path in old_archive_path_to_id:
affected_document_ids.add(doc.id)
affected_document_ids.add(old_archive_path_to_id[old_path])
else:
old_archive_path_to_id[old_path] = doc.id
# check that archive files of all unaffected documents are in place
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
if doc.id not in affected_document_ids and not os.path.isfile(old_path):
raise ValueError(
f"Archived document ID:{doc.id} does not exist at: "
f"{old_path}")
# check that we can regenerate affected archive versions
for doc_id in affected_document_ids:
from documents.parsers import get_parser_class_for_mime_type
doc = Document.objects.get(id=doc_id)
parser_class = get_parser_class_for_mime_type(doc.mime_type)
if not parser_class:
raise ValueError(
f"Document ID:{doc.id} has an invalid archived document, "
f"but no parsers are available. Cannot migrate.")
for doc in Document.objects.filter(archive_checksum__isnull=False):
if doc.id in affected_document_ids:
old_path = archive_path_old(doc)
# remove affected archive versions
if os.path.isfile(old_path):
logger.debug(
f"Removing {old_path}"
)
os.unlink(old_path)
else:
# Set archive path for unaffected files
doc.archive_filename = archive_name_from_filename(doc.filename)
Document.objects.filter(id=doc.id).update(
archive_filename=doc.archive_filename
)
# regenerate archive documents
for doc_id in affected_document_ids:
doc = Document.objects.get(id=doc_id)
create_archive_version(doc)
def move_new_to_old_locations(apps, schema_editor):
Document = apps.get_model("documents", "Document")
old_archive_paths = set()
for doc in Document.objects.filter(archive_checksum__isnull=False):
new_archive_path = archive_path_new(doc)
old_archive_path = archive_path_old(doc)
if old_archive_path in old_archive_paths:
raise ValueError(
f"Cannot migrate: Archive file name {old_archive_path} of "
f"document {doc.filename} would clash with another archive "
f"filename.")
old_archive_paths.add(old_archive_path)
if new_archive_path != old_archive_path and os.path.isfile(old_archive_path):
raise ValueError(
f"Cannot migrate: Cannot move {new_archive_path} to "
f"{old_archive_path}: file already exists."
)
for doc in Document.objects.filter(archive_checksum__isnull=False):
new_archive_path = archive_path_new(doc)
old_archive_path = archive_path_old(doc)
if new_archive_path != old_archive_path:
logger.debug(f"Moving {new_archive_path} to {old_archive_path}")
shutil.move(new_archive_path, old_archive_path)
class Migration(migrations.Migration):
dependencies = [
('documents', '1011_auto_20210101_2340'),
]
operations = [
migrations.AddField(
model_name='document',
name='archive_filename',
field=models.FilePathField(default=None, editable=False, help_text='Current archive filename in storage', max_length=1024, null=True, unique=True, verbose_name='archive filename'),
),
migrations.AlterField(
model_name='document',
name='filename',
field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, unique=True, verbose_name='filename'),
),
migrations.RunPython(
move_old_to_new_locations,
move_new_to_old_locations
),
]

View File

@@ -0,0 +1,70 @@
# Generated by Django 3.1.4 on 2020-12-02 21:43
from django.db import migrations, models
COLOURS_OLD = {
1: "#a6cee3",
2: "#1f78b4",
3: "#b2df8a",
4: "#33a02c",
5: "#fb9a99",
6: "#e31a1c",
7: "#fdbf6f",
8: "#ff7f00",
9: "#cab2d6",
10: "#6a3d9a",
11: "#b15928",
12: "#000000",
13: "#cccccc",
}
def forward(apps, schema_editor):
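# Copy each tag's legacy integer colour choice into the new hex color field.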
Tag = apps.get_model('documents', 'Tag')
for tag in Tag.objects.all():
colour_old_id = tag.colour_old
rgb = COLOURS_OLD[colour_old_id]
tag.color = rgb
tag.save()
def reverse(apps, schema_editor):
Tag = apps.get_model('documents', 'Tag')
def _get_colour_id(rdb):
for idx, rdbx in COLOURS_OLD.items():
if rdbx == rdb:
return idx
# Return colour 1 if we can't match anything
return 1
for tag in Tag.objects.all():
colour_id = _get_colour_id(tag.color)
tag.colour_old = colour_id
tag.save()
class Migration(migrations.Migration):
dependencies = [
('documents', '1012_fix_archive_files'),
]
operations = [
migrations.RenameField(
model_name='tag',
old_name='colour',
new_name='colour_old',
),
migrations.AddField(
model_name='tag',
name='color',
field=models.CharField(default='#a6cee3', max_length=7, verbose_name='color'),
),
migrations.RunPython(forward, reverse),
migrations.RemoveField(
model_name='tag',
name='colour_old',
)
]

View File

@@ -0,0 +1,18 @@
# Generated by Django 3.1.7 on 2021-02-28 15:14
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1013_migrate_tag_colour'),
]
operations = [
migrations.AlterField(
model_name='savedviewfilterrule',
name='rule_type',
field=models.PositiveIntegerField(choices=[(0, 'title contains'), (1, 'content contains'), (2, 'ASN is'), (3, 'correspondent is'), (4, 'document type is'), (5, 'is in inbox'), (6, 'has tag'), (7, 'has any tag'), (8, 'created before'), (9, 'created after'), (10, 'created year is'), (11, 'created month is'), (12, 'created day is'), (13, 'added before'), (14, 'added after'), (15, 'modified before'), (16, 'modified after'), (17, 'does not have tag'), (18, 'does not have ASN'), (19, 'title or content contains')], verbose_name='rule type'),
),
]

View File

@@ -0,0 +1,29 @@
# Generated by Django 3.1.7 on 2021-04-04 18:28
import logging
from django.db import migrations
logger = logging.getLogger("paperless.migrations")
def remove_null_characters(apps, schema_editor):
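# Replace any NUL characters in stored document content with spaces.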
Document = apps.get_model('documents', 'Document')
for doc in Document.objects.all():
content: str = doc.content
if '\0' in content:
logger.info(f"Removing null characters from document {doc}...")
doc.content = content.replace('\0', ' ')
doc.save()
class Migration(migrations.Migration):
dependencies = [
('documents', '1014_auto_20210228_1614'),
]
operations = [
migrations.RunPython(remove_null_characters, migrations.RunPython.noop)
]

View File

@@ -0,0 +1,23 @@
# Generated by Django 3.1.7 on 2021-03-17 12:51
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1015_remove_null_characters'),
]
operations = [
migrations.AlterField(
model_name='savedview',
name='sort_field',
field=models.CharField(blank=True, max_length=128, null=True, verbose_name='sort field'),
),
migrations.AlterField(
model_name='savedviewfilterrule',
name='rule_type',
field=models.PositiveIntegerField(choices=[(0, 'title contains'), (1, 'content contains'), (2, 'ASN is'), (3, 'correspondent is'), (4, 'document type is'), (5, 'is in inbox'), (6, 'has tag'), (7, 'has any tag'), (8, 'created before'), (9, 'created after'), (10, 'created year is'), (11, 'created month is'), (12, 'created day is'), (13, 'added before'), (14, 'added after'), (15, 'modified before'), (16, 'modified after'), (17, 'does not have tag'), (18, 'does not have ASN'), (19, 'title or content contains'), (20, 'fulltext query'), (21, 'more like this')], verbose_name='rule type'),
),
]

View File

@@ -1,9 +0,0 @@
class Renderable:
"""
A handy mixin to make it easier/cleaner to print output based on a
verbosity value.
"""
def _render(self, text, verbosity):
if self.verbosity >= verbosity:
print(text)

69
src/documents/models.py Executable file → Normal file
View File

@@ -16,7 +16,6 @@ from django.utils.timezone import is_aware
from django.utils.translation import gettext_lazy as _
from documents.file_handling import archive_name_from_filename
from documents.parsers import get_default_file_extension
@@ -66,10 +65,6 @@ class MatchingModel(models.Model):
class Correspondent(MatchingModel):
# This regex is probably more restrictive than it needs to be, but it's
# better safe than sorry.
SAFE_REGEX = re.compile(r"^[\w\- ,.']+$")
class Meta:
ordering = ("name",)
verbose_name = _("correspondent")
@@ -78,25 +73,11 @@ class Correspondent(MatchingModel):
class Tag(MatchingModel):
COLOURS = (
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc")
)
colour = models.PositiveIntegerField(
color = models.CharField(
_("color"),
choices=COLOURS, default=1)
max_length=7,
default="#a6cee3"
)
is_inbox_tag = models.BooleanField(
_("is inbox tag"),
@@ -208,10 +189,21 @@ class Document(models.Model):
max_length=1024,
editable=False,
default=None,
unique=True,
null=True,
help_text=_("Current filename in storage")
)
archive_filename = models.FilePathField(
_("archive filename"),
max_length=1024,
editable=False,
default=None,
unique=True,
null=True,
help_text=_("Current archive filename in storage")
)
archive_serial_number = models.IntegerField(
_("archive serial number"),
blank=True,
@@ -256,16 +248,18 @@ class Document(models.Model):
return open(self.source_path, "rb")
@property
def archive_path(self):
if self.filename:
fname = archive_name_from_filename(self.filename)
else:
fname = "{:07}.pdf".format(self.pk)
def has_archive_version(self):
return self.archive_filename is not None
return os.path.join(
settings.ARCHIVE_DIR,
fname
)
@property
def archive_path(self):
if self.has_archive_version:
return os.path.join(
settings.ARCHIVE_DIR,
str(self.archive_filename)
)
else:
return None
@property
def archive_file(self):
@@ -361,7 +355,10 @@ class SavedView(models.Model):
sort_field = models.CharField(
_("sort field"),
max_length=128)
max_length=128,
null=True,
blank=True
)
sort_reverse = models.BooleanField(
_("sort reverse"),
default=False)
@@ -387,7 +384,11 @@ class SavedViewFilterRule(models.Model):
(15, _("modified before")),
(16, _("modified after")),
(17, _("does not have tag")),
(19, _("has tags in")),
(18, _("does not have ASN")),
(19, _("title or content contains")),
(20, _("fulltext query")),
(21, _("more like this")),
(22, _("has tags in"))
]
saved_view = models.ForeignKey(

View File

@@ -6,7 +6,6 @@ import shutil
import subprocess
import tempfile
import dateparser
import magic
from django.conf import settings
from django.utils import timezone
@@ -36,7 +35,7 @@ DATE_REGEX = re.compile(
)
logger = logging.getLogger(__name__)
logger = logging.getLogger("paperless.parsing")
def is_mime_type_supported(mime_type):
@@ -144,6 +143,46 @@ def run_convert(input_file,
raise ParseError("Convert failed at {}".format(args))
def get_default_thumbnail():
return os.path.join(os.path.dirname(__file__), "resources", "document.png")
def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None):
out_path = os.path.join(temp_dir, "convert_gs.png")
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
logger.warning(
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
extra={'group': logging_group}
)
gs_out_path = os.path.join(temp_dir, "gs_out.png")
cmd = [settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
in_path]
try:
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=gs_out_path,
output_file=out_path,
logging_group=logging_group)
return out_path
except ParseError:
return get_default_thumbnail()
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
@@ -162,31 +201,8 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
output_file=out_path,
logging_group=logging_group)
except ParseError:
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
logger.warning(
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
extra={'group': logging_group}
)
gs_out_path = os.path.join(temp_dir, "gs_out.png")
cmd = [settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
in_path]
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=gs_out_path,
output_file=out_path,
logging_group=logging_group)
out_path = make_thumbnail_from_pdf_gs_fallback(
in_path, temp_dir, logging_group)
return out_path
@@ -200,6 +216,8 @@ def parse_date(filename, text):
"""
Call dateparser.parse with a particular date ordering
"""
import dateparser
return dateparser.parse(
ds,
settings={
@@ -261,7 +279,9 @@ class DocumentParser(LoggingMixin):
`paperless_tesseract.parsers` for inspiration.
"""
def __init__(self, logging_group):
logging_name = "paperless.parsing"
def __init__(self, logging_group, progress_callback=None):
super().__init__()
self.logging_group = logging_group
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
@@ -271,6 +291,11 @@ class DocumentParser(LoggingMixin):
self.archive_path = None
self.text = None
self.date = None
self.progress_callback = progress_callback
def progress(self, current_progress, max_progress):
if self.progress_callback:
self.progress_callback(current_progress, max_progress)
def extract_metadata(self, document_path, mime_type):
return []
@@ -281,14 +306,17 @@ class DocumentParser(LoggingMixin):
def get_archive_path(self):
return self.archive_path
def get_thumbnail(self, document_path, mime_type):
def get_thumbnail(self, document_path, mime_type, file_name=None):
"""
Returns the path to a file we can use as a thumbnail for this document.
"""
raise NotImplementedError()
def get_optimised_thumbnail(self, document_path, mime_type):
thumbnail = self.get_thumbnail(document_path, mime_type)
def get_optimised_thumbnail(self,
document_path,
mime_type,
file_name=None):
thumbnail = self.get_thumbnail(document_path, mime_type, file_name)
if settings.OPTIMIZE_THUMBNAILS:
out_path = os.path.join(self.tempdir, "thumb_optipng.png")
@@ -311,5 +339,5 @@ class DocumentParser(LoggingMixin):
return self.date
def cleanup(self):
self.log("debug", "Deleting directory {}".format(self.tempdir))
self.log("debug", f"Deleting directory {self.tempdir}")
shutil.rmtree(self.tempdir)

Binary file not shown (new image, 11 KiB).

View File

@@ -1,45 +1,55 @@
import hashlib
import logging
import os
from django.conf import settings
from tqdm import tqdm
from documents.models import Document
class SanityMessage:
message = None
class SanityCheckMessages:
def __init__(self):
self._messages = []
def error(self, message):
self._messages.append({"level": logging.ERROR, "message": message})
def warning(self, message):
self._messages.append({"level": logging.WARNING, "message": message})
def info(self, message):
self._messages.append({"level": logging.INFO, "message": message})
def log_messages(self):
logger = logging.getLogger("paperless.sanity_checker")
if len(self._messages) == 0:
logger.info("Sanity checker detected no issues.")
else:
for msg in self._messages:
logger.log(msg['level'], msg['message'])
def __len__(self):
return len(self._messages)
def __getitem__(self, item):
return self._messages[item]
def has_error(self):
return any([msg['level'] == logging.ERROR for msg in self._messages])
def has_warning(self):
return any([msg['level'] == logging.WARNING for msg in self._messages])
class SanityWarning(SanityMessage):
def __init__(self, message):
self.message = message
def __str__(self):
return f"Warning: {self.message}"
class SanityCheckFailedException(Exception):
pass
class SanityError(SanityMessage):
def __init__(self, message):
self.message = message
def __str__(self):
return f"ERROR: {self.message}"
class SanityFailedError(Exception):
def __init__(self, messages):
self.messages = messages
def __str__(self):
message_string = "\n".join([str(m) for m in self.messages])
return (
f"The following issuse were found by the sanity checker:\n"
f"{message_string}\n\n===============\n\n")
def check_sanity():
messages = []
def check_sanity(progress=False):
messages = SanityCheckMessages()
present_files = []
for root, subdirs, files in os.walk(settings.MEDIA_ROOT):
@@ -50,72 +60,81 @@ def check_sanity():
if lockfile in present_files:
present_files.remove(lockfile)
for doc in Document.objects.all():
for doc in tqdm(Document.objects.all(), disable=not progress):
# Check sanity of the thumbnail
if not os.path.isfile(doc.thumbnail_path):
messages.append(SanityError(
f"Thumbnail of document {doc.pk} does not exist."))
messages.error(f"Thumbnail of document {doc.pk} does not exist.")
else:
present_files.remove(os.path.normpath(doc.thumbnail_path))
if os.path.normpath(doc.thumbnail_path) in present_files:
present_files.remove(os.path.normpath(doc.thumbnail_path))
try:
with doc.thumbnail_file as f:
f.read()
except OSError as e:
messages.append(SanityError(
messages.error(
f"Cannot read thumbnail file of document {doc.pk}: {e}"
))
)
# Check sanity of the original file
# TODO: extract method
if not os.path.isfile(doc.source_path):
messages.append(SanityError(
f"Original of document {doc.pk} does not exist."))
messages.error(f"Original of document {doc.pk} does not exist.")
else:
present_files.remove(os.path.normpath(doc.source_path))
if os.path.normpath(doc.source_path) in present_files:
present_files.remove(os.path.normpath(doc.source_path))
try:
with doc.source_file as f:
checksum = hashlib.md5(f.read()).hexdigest()
except OSError as e:
messages.append(SanityError(
f"Cannot read original file of document {doc.pk}: {e}"))
messages.error(
f"Cannot read original file of document {doc.pk}: {e}")
else:
if not checksum == doc.checksum:
messages.append(SanityError(
messages.error(
f"Checksum mismatch of document {doc.pk}. "
f"Stored: {doc.checksum}, actual: {checksum}."
))
)
# Check sanity of the archive file.
if doc.archive_checksum:
if doc.archive_checksum and not doc.archive_filename:
messages.error(
f"Document {doc.pk} has an archive file checksum, but no "
f"archive filename."
)
elif not doc.archive_checksum and doc.archive_filename:
messages.error(
f"Document {doc.pk} has an archive file, but its checksum is "
f"missing."
)
elif doc.has_archive_version:
if not os.path.isfile(doc.archive_path):
messages.append(SanityError(
messages.error(
f"Archived version of document {doc.pk} does not exist."
))
)
else:
present_files.remove(os.path.normpath(doc.archive_path))
if os.path.normpath(doc.archive_path) in present_files:
present_files.remove(os.path.normpath(doc.archive_path))
try:
with doc.archive_file as f:
checksum = hashlib.md5(f.read()).hexdigest()
except OSError as e:
messages.append(SanityError(
messages.error(
f"Cannot read archive file of document {doc.pk}: {e}"
))
)
else:
if not checksum == doc.archive_checksum:
messages.append(SanityError(
f"Checksum mismatch of archive {doc.pk}. "
f"Stored: {doc.checksum}, actual: {checksum}."
))
messages.error(
f"Checksum mismatch of archived document "
f"{doc.pk}. "
f"Stored: {doc.archive_checksum}, "
f"actual: {checksum}."
)
# other document checks
if not doc.content:
messages.append(SanityWarning(
f"Document {doc.pk} has no content."
))
messages.info(f"Document {doc.pk} has no content.")
for extra_file in present_files:
messages.append(SanityWarning(
f"Orphaned file in media dir: {extra_file}"
))
messages.warning(f"Orphaned file in media dir: {extra_file}")
return messages

View File

@@ -1,13 +1,18 @@
import re
import magic
import math
from django.utils.text import slugify
from rest_framework import serializers
from rest_framework.fields import SerializerMethodField
from . import bulk_edit
from .models import Correspondent, Tag, Document, Log, DocumentType, \
SavedView, SavedViewFilterRule
from .models import Correspondent, Tag, Document, DocumentType, \
SavedView, SavedViewFilterRule, MatchingModel
from .parsers import is_mime_type_supported
from django.utils.translation import gettext as _
# https://www.django-rest-framework.org/api-guide/serializers/#example
class DynamicFieldsModelSerializer(serializers.ModelSerializer):
@@ -31,16 +36,30 @@ class DynamicFieldsModelSerializer(serializers.ModelSerializer):
self.fields.pop(field_name)
class CorrespondentSerializer(serializers.ModelSerializer):
class MatchingModelSerializer(serializers.ModelSerializer):
document_count = serializers.IntegerField(read_only=True)
last_correspondence = serializers.DateTimeField(read_only=True)
def get_slug(self, obj):
return slugify(obj.name)
slug = SerializerMethodField()
def validate_match(self, match):
if 'matching_algorithm' in self.initial_data and self.initial_data['matching_algorithm'] == MatchingModel.MATCH_REGEX: # NOQA: E501
try:
re.compile(match)
except Exception as e:
raise serializers.ValidationError(
_("Invalid regular expression: %(error)s") %
{'error': str(e)}
)
return match
class CorrespondentSerializer(MatchingModelSerializer):
last_correspondence = serializers.DateTimeField(read_only=True)
class Meta:
model = Correspondent
fields = (
@@ -55,13 +74,7 @@ class CorrespondentSerializer(serializers.ModelSerializer):
)
class DocumentTypeSerializer(serializers.ModelSerializer):
document_count = serializers.IntegerField(read_only=True)
def get_slug(self, obj):
return slugify(obj.name)
slug = SerializerMethodField()
class DocumentTypeSerializer(MatchingModelSerializer):
class Meta:
model = DocumentType
@@ -76,13 +89,40 @@ class DocumentTypeSerializer(serializers.ModelSerializer):
)
class TagSerializer(serializers.ModelSerializer):
class ColorField(serializers.Field):
document_count = serializers.IntegerField(read_only=True)
COLOURS = (
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc")
)
def get_slug(self, obj):
return slugify(obj.name)
slug = SerializerMethodField()
def to_internal_value(self, data):
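# Legacy colour field for API v1: accepts an integer colour ID and stores the
# matching hex value; to_representation maps hex back to an ID, falling back
# to 1 when there is no legacy equivalent.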
for id, color in self.COLOURS:
if id == data:
return color
raise serializers.ValidationError()
def to_representation(self, value):
for id, color in self.COLOURS:
if color == value:
return id
return 1
class TagSerializerVersion1(MatchingModelSerializer):
colour = ColorField(source='color', default="#a6cee3")
class Meta:
model = Tag
@@ -99,6 +139,45 @@ class TagSerializer(serializers.ModelSerializer):
)
class TagSerializer(MatchingModelSerializer):
def get_text_color(self, obj):
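# Estimate the perceived luminance of the tag colour (Rec. 601 weights) and
# return white text for dark backgrounds, black for light ones.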
try:
h = obj.color.lstrip('#')
rgb = tuple(int(h[i:i + 2], 16)/256 for i in (0, 2, 4))
luminance = math.sqrt(
0.299 * math.pow(rgb[0], 2) +
0.587 * math.pow(rgb[1], 2) +
0.114 * math.pow(rgb[2], 2)
)
return "#ffffff" if luminance < 0.53 else "#000000"
except ValueError:
return "#000000"
text_color = serializers.SerializerMethodField()
class Meta:
model = Tag
fields = (
"id",
"slug",
"name",
"color",
"text_color",
"match",
"matching_algorithm",
"is_insensitive",
"is_inbox_tag",
"document_count"
)
def validate_color(self, color):
regex = r"#[0-9a-fA-F]{6}"
if not re.match(regex, color):
raise serializers.ValidationError(_("Invalid color."))
return color
class CorrespondentField(serializers.PrimaryKeyRelatedField):
def get_queryset(self):
return Correspondent.objects.all()
@@ -127,7 +206,7 @@ class DocumentSerializer(DynamicFieldsModelSerializer):
return obj.get_public_filename()
def get_archived_file_name(self, obj):
if obj.archive_checksum:
if obj.has_archive_version:
return obj.get_public_filename(archive=True)
else:
return None
@@ -151,19 +230,6 @@ class DocumentSerializer(DynamicFieldsModelSerializer):
)
class LogSerializer(serializers.ModelSerializer):
class Meta:
model = Log
fields = (
"id",
"created",
"message",
"group",
"level"
)
class SavedViewFilterRuleSerializer(serializers.ModelSerializer):
class Meta:
@@ -203,14 +269,34 @@ class SavedViewSerializer(serializers.ModelSerializer):
return saved_view
class BulkEditSerializer(serializers.Serializer):
class DocumentListSerializer(serializers.Serializer):
documents = serializers.ListField(
child=serializers.IntegerField(),
required=True,
label="Documents",
write_only=True
write_only=True,
child=serializers.IntegerField()
)
def _validate_document_id_list(self, documents, name="documents"):
if not type(documents) == list:
raise serializers.ValidationError(f"{name} must be a list")
if not all([type(i) == int for i in documents]):
raise serializers.ValidationError(
f"{name} must be a list of integers")
count = Document.objects.filter(id__in=documents).count()
if not count == len(documents):
raise serializers.ValidationError(
f"Some documents in {name} don't exist or were "
f"specified twice.")
def validate_documents(self, documents):
self._validate_document_id_list(documents)
return documents
class BulkEditSerializer(DocumentListSerializer):
method = serializers.ChoiceField(
choices=[
"set_correspondent",
@@ -226,18 +312,6 @@ class BulkEditSerializer(serializers.Serializer):
parameters = serializers.DictField(allow_empty=True)
def _validate_document_id_list(self, documents, name="documents"):
if not type(documents) == list:
raise serializers.ValidationError(f"{name} must be a list")
if not all([type(i) == int for i in documents]):
raise serializers.ValidationError(
f"{name} must be a list of integers")
count = Document.objects.filter(id__in=documents).count()
if not count == len(documents):
raise serializers.ValidationError(
f"Some documents in {name} don't exist or were "
f"specified twice.")
def _validate_tag_id_list(self, tags, name="tags"):
if not type(tags) == list:
raise serializers.ValidationError(f"{name} must be a list")
@@ -249,10 +323,6 @@ class BulkEditSerializer(serializers.Serializer):
raise serializers.ValidationError(
f"Some tags in {name} don't exist or were specified twice.")
def validate_documents(self, documents):
self._validate_document_id_list(documents)
return documents
def validate_method(self, method):
if method == "set_correspondent":
return bulk_edit.set_correspondent
@@ -378,7 +448,9 @@ class PostDocumentSerializer(serializers.Serializer):
if not is_mime_type_supported(mime_type):
raise serializers.ValidationError(
"This file type is not supported.")
_("File type %(type)s not supported") %
{'type': mime_type}
)
return document.name, document_data
@@ -401,9 +473,24 @@ class PostDocumentSerializer(serializers.Serializer):
return None
class SelectionDataSerializer(serializers.Serializer):
class BulkDownloadSerializer(DocumentListSerializer):
documents = serializers.ListField(
required=True,
child=serializers.IntegerField()
content = serializers.ChoiceField(
choices=["archive", "originals", "both"],
default="archive"
)
compression = serializers.ChoiceField(
choices=["none", "deflated", "bzip2", "lzma"],
default="none"
)
def validate_compression(self, compression):
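# Map the API choice onto the corresponding zipfile compression constant.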
import zipfile
return {
"none": zipfile.ZIP_STORED,
"deflated": zipfile.ZIP_DEFLATED,
"bzip2": zipfile.ZIP_BZIP2,
"lzma": zipfile.ZIP_LZMA
}[compression]

284
src/documents/signals/handlers.py Executable file → Normal file
View File

@@ -1,7 +1,7 @@
import logging
import os
from subprocess import Popen
from django.utils import termcolors
from django.conf import settings
from django.contrib.admin.models import ADDITION, LogEntry
from django.contrib.auth.models import User
@@ -9,18 +9,17 @@ from django.contrib.contenttypes.models import ContentType
from django.db import models, DatabaseError
from django.db.models import Q
from django.dispatch import receiver
from django.utils import timezone
from django.utils import termcolors, timezone
from filelock import FileLock
from .. import index, matching
from .. import matching
from ..file_handling import delete_empty_directories, \
create_source_path_directory, archive_name_from_filename, \
create_source_path_directory, \
generate_unique_filename
from ..models import Document, Tag
from ..models import Document, Tag, MatchingModel
def logger(message, group):
logging.getLogger(__name__).debug(message, extra={"group": group})
logger = logging.getLogger("paperless.handlers")
def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
@@ -34,6 +33,9 @@ def set_correspondent(sender,
classifier=None,
replace=False,
use_first=True,
suggest=False,
base_url=None,
color=False,
**kwargs):
if document.correspondent and not replace:
return
@@ -48,27 +50,45 @@ def set_correspondent(sender,
selected = None
if potential_count > 1:
if use_first:
logger(
logger.debug(
f"Detected {potential_count} potential correspondents, "
f"so we've opted for {selected}",
logging_group
extra={'group': logging_group}
)
else:
logger(
logger.debug(
f"Detected {potential_count} potential correspondents, "
f"not assigning any correspondent",
logging_group
extra={'group': logging_group}
)
return
if selected or replace:
logger(
f"Assigning correspondent {selected} to {document}",
logging_group
)
if suggest:
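# Suggest mode: print the proposed correspondent (and a link to the document
# when base_url is set) instead of assigning and saving it.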
if base_url:
print(
termcolors.colorize(str(document), fg='green')
if color
else str(document)
)
print(f"{base_url}/documents/{document.pk}")
else:
print(
(
termcolors.colorize(str(document), fg='green')
if color
else str(document)
) + f" [{document.pk}]"
)
print(f"Suggest correspondent {selected}")
else:
logger.info(
f"Assigning correspondent {selected} to {document}",
extra={'group': logging_group}
)
document.correspondent = selected
document.save(update_fields=("correspondent",))
document.correspondent = selected
document.save(update_fields=("correspondent",))
def set_document_type(sender,
@@ -77,6 +97,9 @@ def set_document_type(sender,
classifier=None,
replace=False,
use_first=True,
suggest=False,
base_url=None,
color=False,
**kwargs):
if document.document_type and not replace:
return
@@ -92,27 +115,45 @@ def set_document_type(sender,
if potential_count > 1:
if use_first:
logger(
logger.info(
f"Detected {potential_count} potential document types, "
f"so we've opted for {selected}",
logging_group
extra={'group': logging_group}
)
else:
logger(
logger.info(
f"Detected {potential_count} potential document types, "
f"not assigning any document type",
logging_group
extra={'group': logging_group}
)
return
if selected or replace:
logger(
f"Assigning document type {selected} to {document}",
logging_group
)
if suggest:
if base_url:
print(
termcolors.colorize(str(document), fg='green')
if color
else str(document)
)
print(f"{base_url}/documents/{document.pk}")
else:
print(
(
termcolors.colorize(str(document), fg='green')
if color
else str(document)
) + f" [{document.pk}]"
)
print(f"Sugest document type {selected}")
else:
logger.info(
f"Assigning document type {selected} to {document}",
extra={'group': logging_group}
)
document.document_type = selected
document.save(update_fields=("document_type",))
document.document_type = selected
document.save(update_fields=("document_type",))
def set_tags(sender,
@@ -120,6 +161,9 @@ def set_tags(sender,
logging_group=None,
classifier=None,
replace=False,
suggest=False,
base_url=None,
color=False,
**kwargs):
if replace:
@@ -134,33 +178,65 @@ def set_tags(sender,
relevant_tags = set(matched_tags) - current_tags
if not relevant_tags:
return
if suggest:
extra_tags = current_tags - set(matched_tags)
extra_tags = [
t for t in extra_tags
if t.matching_algorithm == MatchingModel.MATCH_AUTO
]
if not relevant_tags and not extra_tags:
return
if base_url:
print(
termcolors.colorize(str(document), fg='green')
if color
else str(document)
)
print(f"{base_url}/documents/{document.pk}")
else:
print(
(
termcolors.colorize(str(document), fg='green')
if color
else str(document)
) + f" [{document.pk}]"
)
if relevant_tags:
print(
"Suggest tags: " + ", ".join([t.name for t in relevant_tags])
)
if extra_tags:
print("Extra tags: " + ", ".join([t.name for t in extra_tags]))
else:
if not relevant_tags:
return
message = 'Tagging "{}" with "{}"'
logger(
message.format(document, ", ".join([t.name for t in relevant_tags])),
logging_group
)
message = 'Tagging "{}" with "{}"'
logger.info(
message.format(
document, ", ".join([t.name for t in relevant_tags])
),
extra={'group': logging_group}
)
document.tags.add(*relevant_tags)
document.tags.add(*relevant_tags)
@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):
with FileLock(settings.MEDIA_LOCK):
for f in (instance.source_path,
instance.archive_path,
instance.thumbnail_path):
if os.path.isfile(f):
for filename in (instance.source_path,
instance.archive_path,
instance.thumbnail_path):
if filename and os.path.isfile(filename):
try:
os.unlink(f)
logging.getLogger(__name__).debug(
f"Deleted file {f}.")
os.unlink(filename)
logger.debug(
f"Deleted file {filename}.")
except OSError as e:
logging.getLogger(__name__).warning(
logger.warning(
f"While deleting document {str(instance)}, the file "
f"{f} could not be deleted: {e}"
f"{filename} could not be deleted: {e}"
)
delete_empty_directories(
@@ -168,27 +244,30 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
root=settings.ORIGINALS_DIR
)
delete_empty_directories(
os.path.dirname(instance.archive_path),
root=settings.ARCHIVE_DIR
)
if instance.has_archive_version:
delete_empty_directories(
os.path.dirname(instance.archive_path),
root=settings.ARCHIVE_DIR
)
class CannotMoveFilesException(Exception):
pass
def validate_move(instance, old_path, new_path):
if not os.path.isfile(old_path):
# Can't do anything if the old file does not exist anymore.
logging.getLogger(__name__).fatal(
logger.fatal(
f"Document {str(instance)}: File {old_path} has gone.")
return False
raise CannotMoveFilesException()
if os.path.isfile(new_path):
# Can't do anything if the new file already exists. Skip updating file.
logging.getLogger(__name__).warning(
logger.warning(
f"Document {str(instance)}: Cannot rename file "
f"since target path {new_path} already exists.")
return False
return True
raise CannotMoveFilesException()
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@@ -207,56 +286,61 @@ def update_filename_and_move_files(sender, instance, **kwargs):
return
with FileLock(settings.MEDIA_LOCK):
old_filename = instance.filename
new_filename = generate_unique_filename(
instance, settings.ORIGINALS_DIR)
try:
old_filename = instance.filename
old_source_path = instance.source_path
if new_filename == instance.filename:
# Don't do anything if it's the same.
return
instance.filename = generate_unique_filename(instance)
move_original = old_filename != instance.filename
old_source_path = instance.source_path
new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
if not validate_move(instance, old_source_path, new_source_path):
return
# archive files are optional, archive checksum tells us if we have one,
# since this is None for documents without archived files.
if instance.archive_checksum:
new_archive_filename = archive_name_from_filename(new_filename)
old_archive_filename = instance.archive_filename
old_archive_path = instance.archive_path
new_archive_path = os.path.join(settings.ARCHIVE_DIR,
new_archive_filename)
if not validate_move(instance, old_archive_path, new_archive_path):
if instance.has_archive_version:
instance.archive_filename = generate_unique_filename(
instance, archive_filename=True
)
move_archive = old_archive_filename != instance.archive_filename # NOQA: E501
else:
move_archive = False
if not move_original and not move_archive:
# Don't do anything if filenames did not change.
return
create_source_path_directory(new_archive_path)
else:
old_archive_path = None
new_archive_path = None
if move_original:
validate_move(instance, old_source_path, instance.source_path)
create_source_path_directory(instance.source_path)
os.rename(old_source_path, instance.source_path)
create_source_path_directory(new_source_path)
try:
os.rename(old_source_path, new_source_path)
if instance.archive_checksum:
os.rename(old_archive_path, new_archive_path)
instance.filename = new_filename
if move_archive:
validate_move(
instance, old_archive_path, instance.archive_path)
create_source_path_directory(instance.archive_path)
os.rename(old_archive_path, instance.archive_path)
# Don't save() here to prevent infinite recursion.
Document.objects.filter(pk=instance.pk).update(
filename=new_filename)
filename=instance.filename,
archive_filename=instance.archive_filename,
)
except OSError as e:
instance.filename = old_filename
# this happens when we can't move a file. If that's the case for
# the archive file, we try our best to revert the changes.
# no need to save the instance, the update() has not happened yet.
except (OSError, DatabaseError, CannotMoveFilesException):
# This happens when either:
# - moving the files failed due to file system errors
# - saving to the database failed due to database errors
# In both cases, we need to revert to the original state.
# Try to move files to their original location.
try:
os.rename(new_source_path, old_source_path)
os.rename(new_archive_path, old_archive_path)
if move_original and os.path.isfile(instance.source_path):
os.rename(instance.source_path, old_source_path)
if move_archive and os.path.isfile(instance.archive_path):
os.rename(instance.archive_path, old_archive_path)
except Exception as e:
# This is fine, since:
# A: if we managed to move source from A to B, we will also
@@ -267,16 +351,10 @@ def update_filename_and_move_files(sender, instance, **kwargs):
# B: if moving the original file failed, nothing has changed
# anyway.
pass
except DatabaseError as e:
# this happens after moving files, so move them back into place.
# since moving them once succeeded, it's very likely going to
# succeed again.
os.rename(new_source_path, old_source_path)
if instance.archive_checksum:
os.rename(new_archive_path, old_archive_path)
# restore old values on the instance
instance.filename = old_filename
# again, no need to save the instance, since the actual update()
# operation failed.
instance.archive_filename = old_archive_filename
# finally, remove any empty sub folders. This will do nothing if
# something has failed above.
@@ -284,7 +362,7 @@ def update_filename_and_move_files(sender, instance, **kwargs):
delete_empty_directories(os.path.dirname(old_source_path),
root=settings.ORIGINALS_DIR)
if old_archive_path and not os.path.isfile(old_archive_path):
if instance.has_archive_version and not os.path.isfile(old_archive_path): # NOQA: E501
delete_empty_directories(os.path.dirname(old_archive_path),
root=settings.ARCHIVE_DIR)
@@ -305,4 +383,6 @@ def set_log_entry(sender, document=None, logging_group=None, **kwargs):
def add_to_index(sender, document, **kwargs):
from documents import index
index.add_or_update_document(document)

File diff suppressed because one or more lines are too long

View File

@@ -42,3 +42,58 @@ body {
border-top-left-radius: 0;
border-top-right-radius: 0;
}
@media (prefers-color-scheme: dark) {
/*
From theme_dark.scss
$primary-dark-mode: #45973a;
$danger-dark-mode: #b71631;
$bg-dark-mode: #161618;
$bg-dark-mode-accent: #21262d;
$bg-light-dark-mode: #1c1c1f;
$text-color-dark-mode: #abb2bf;
$border-color-dark-mode: #47494f;
*/
body {
background-color: #161618 !important;
color: #abb2bf;
}
svg.logo .text {
fill: #abb2bf!important;
}
.form-control:not(.is-invalid):not(.btn) {
border-color: #47494f;
}
.form-control:not(.btn) {
background-color: #161618;
color: #abb2bf;
}
.form-control:not(.btn)::placeholder {
color: #abb2bf;
}
.form-control:not(.btn):focus {
background-color: #1c1c1f !important;
color: #8e97a9 !important;
}
.btn-primary {
color: #fff;
background-color: #17541f;
border-color: #17541f;
}
.btn-primary:hover, .btn-primary:focus {
background-color: #0f3614;
border-color: #0c2c10;
}
.btn-primary:not(:disabled):not(.disabled):active {
background-color: #0c2c10;
border-color: #09220d;
}
}

View File

@@ -6,11 +6,12 @@ from django.db.models.signals import post_save
from whoosh.writing import AsyncWriter
from documents import index, sanity_checker
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from documents.classifier import DocumentClassifier, load_classifier
from documents.consumer import Consumer, ConsumerError
from documents.models import Document
from documents.sanity_checker import SanityFailedError
from documents.models import Document, Tag, DocumentType, Correspondent
from documents.sanity_checker import SanityCheckFailedException
logger = logging.getLogger("paperless.tasks")
def index_optimize():
@@ -19,40 +20,45 @@ def index_optimize():
writer.commit(optimize=True)
def index_reindex():
def index_reindex(progress_bar_disable=False):
documents = Document.objects.all()
ix = index.open_index(recreate=True)
with AsyncWriter(ix) as writer:
for document in tqdm.tqdm(documents):
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
index.update_document(writer, document)
def train_classifier():
classifier = DocumentClassifier()
if (not Tag.objects.filter(
matching_algorithm=Tag.MATCH_AUTO).exists() and
not DocumentType.objects.filter(
matching_algorithm=Tag.MATCH_AUTO).exists() and
not Correspondent.objects.filter(
matching_algorithm=Tag.MATCH_AUTO).exists()):
try:
# load the classifier, since we might not have to train it again.
classifier.reload()
except (OSError, EOFError, IncompatibleClassifierVersionError):
# This is what we're going to fix here.
return
classifier = load_classifier()
if not classifier:
classifier = DocumentClassifier()
try:
if classifier.train():
logging.getLogger(__name__).info(
logger.info(
"Saving updated classifier model to {}...".format(
settings.MODEL_FILE)
)
classifier.save_classifier()
classifier.save()
else:
logging.getLogger(__name__).debug(
logger.debug(
"Training data unchanged."
)
except Exception as e:
logging.getLogger(__name__).error(
logger.warning(
"Classifier error: " + str(e)
)
@@ -62,7 +68,8 @@ def consume_file(path,
override_title=None,
override_correspondent_id=None,
override_document_type_id=None,
override_tag_ids=None):
override_tag_ids=None,
task_id=None):
document = Consumer().try_consume_file(
path,
@@ -70,7 +77,9 @@ def consume_file(path,
override_title=override_title,
override_correspondent_id=override_correspondent_id,
override_document_type_id=override_document_type_id,
override_tag_ids=override_tag_ids)
override_tag_ids=override_tag_ids,
task_id=task_id
)
if document:
return "Success. New document id {} created".format(
@@ -84,8 +93,15 @@ def consume_file(path,
def sanity_check():
messages = sanity_checker.check_sanity()
if len(messages) > 0:
raise SanityFailedError(messages)
messages.log_messages()
if messages.has_error():
raise SanityCheckFailedException(
"Sanity check failed with errors. See log.")
elif messages.has_warning():
return "Sanity check exited with warnings. See log."
elif len(messages) > 0:
return "Sanity check exited with infos. See log."
else:
return "No issues detected."

View File

@@ -7,14 +7,16 @@
<head>
<meta charset="utf-8">
<title>Paperless-ng</title>
<base href="/">
<base href="{% url 'base' %}">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="username" content="{{username}}">
<meta name="full_name" content="{{full_name}}">
<meta name="cookie_prefix" content="{{cookie_prefix}}">
<meta name="robots" content="noindex,nofollow">
<link rel="icon" type="image/x-icon" href="favicon.ico">
<link rel="manifest" href="{% static webmanifest %}">
<link rel="stylesheet" href="{% static styles_css %}">
<link rel="apple-touch-icon" href="{% static apple_touch_icon %}">
</head>
<body>
<app-root>{% translate "Paperless-ng is loading..." %}</app-root>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown (new image, 6.3 KiB).

Binary file not shown (new image, 17 KiB).

Binary file not shown (new image, 7.7 KiB).

View File

@@ -0,0 +1 @@
This is a test file.

View File

@@ -4,6 +4,7 @@ from django.contrib.admin.sites import AdminSite
from django.test import TestCase
from django.utils import timezone
from documents import index
from documents.admin import DocumentAdmin
from documents.models import Document
from documents.tests.utils import DirectoriesMixin
@@ -11,49 +12,52 @@ from documents.tests.utils import DirectoriesMixin
class TestDocumentAdmin(DirectoriesMixin, TestCase):
def get_document_from_index(self, doc):
ix = index.open_index()
with ix.searcher() as searcher:
return searcher.document(id=doc.id)
def setUp(self) -> None:
super(TestDocumentAdmin, self).setUp()
self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite())
@mock.patch("documents.admin.index.add_or_update_document")
def test_save_model(self, m):
def test_save_model(self):
doc = Document.objects.create(title="test")
doc.title = "new title"
self.doc_admin.save_model(None, doc, None, None)
self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
m.assert_called_once()
self.assertEqual(self.get_document_from_index(doc)['id'], doc.id)
def test_tags(self):
def test_delete_model(self):
doc = Document.objects.create(title="test")
doc.tags.create(name="t1")
doc.tags.create(name="t2")
index.add_or_update_document(doc)
self.assertIsNotNone(self.get_document_from_index(doc))
self.assertEqual(self.doc_admin.tags_(doc), "<span >t1, </span><span >t2, </span>")
def test_tags_empty(self):
doc = Document.objects.create(title="test")
self.assertEqual(self.doc_admin.tags_(doc), "")
@mock.patch("documents.admin.index.remove_document")
def test_delete_model(self, m):
doc = Document.objects.create(title="test")
self.doc_admin.delete_model(None, doc)
self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
m.assert_called_once()
@mock.patch("documents.admin.index.remove_document")
def test_delete_queryset(self, m):
self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
self.assertIsNone(self.get_document_from_index(doc))
def test_delete_queryset(self):
docs = []
for i in range(42):
Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
doc = Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
docs.append(doc)
index.add_or_update_document(doc)
self.assertEqual(Document.objects.count(), 42)
for doc in docs:
self.assertIsNotNone(self.get_document_from_index(doc))
self.doc_admin.delete_queryset(None, Document.objects.all())
self.assertEqual(m.call_count, 42)
self.assertEqual(Document.objects.count(), 0)
for doc in docs:
self.assertIsNone(self.get_document_from_index(doc))
def test_created(self):
doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12))
self.assertEqual(self.doc_admin.created_(doc), "2020-04-12")

View File

@@ -1,15 +1,21 @@
import datetime
import io
import json
import os
import shutil
import tempfile
import zipfile
from unittest import mock
import pytest
from django.conf import settings
from django.contrib.auth.models import User
from django.test import override_settings
from rest_framework.test import APITestCase
from whoosh.writing import AsyncWriter
from documents import index, bulk_edit
from documents.models import Document, Correspondent, DocumentType, Tag, SavedView
from documents.models import Document, Correspondent, DocumentType, Tag, SavedView, MatchingModel
from documents.tests.utils import DirectoriesMixin
@@ -144,21 +150,19 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content, content_thumbnail)
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_download_with_archive(self):
_, filename = tempfile.mkstemp(dir=self.dirs.originals_dir)
content = b"This is a test"
content_archive = b"This is the same test but archived"
with open(filename, "wb") as f:
f.write(content)
filename = os.path.basename(filename)
doc = Document.objects.create(title="none", filename=filename,
doc = Document.objects.create(title="none", filename="my_document.pdf",
archive_filename="archived.pdf",
mime_type="application/pdf")
with open(doc.source_path, "wb") as f:
f.write(content)
with open(doc.archive_path, "wb") as f:
f.write(content_archive)
@@ -228,6 +232,12 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(len(results), 2)
self.assertCountEqual([results[0]['id'], results[1]['id']], [doc1.id, doc3.id])
response = self.client.get("/api/documents/?tags__id__in={},{}".format(tag_2.id, tag_3.id))
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 2)
self.assertCountEqual([results[0]['id'], results[1]['id']], [doc2.id, doc3.id])
response = self.client.get("/api/documents/?tags__id__all={},{}".format(tag_2.id, tag_3.id))
self.assertEqual(response.status_code, 200)
results = response.data['results']
@@ -261,10 +271,28 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
results = response.data['results']
self.assertEqual(len(results), 0)
def test_search_no_query(self):
response = self.client.get("/api/search/")
results = response.data['results']
def test_documents_title_content_filter(self):
doc1 = Document.objects.create(title="title A", content="content A", checksum="A", mime_type="application/pdf")
doc2 = Document.objects.create(title="title B", content="content A", checksum="B", mime_type="application/pdf")
doc3 = Document.objects.create(title="title A", content="content B", checksum="C", mime_type="application/pdf")
doc4 = Document.objects.create(title="title B", content="content B", checksum="D", mime_type="application/pdf")
response = self.client.get("/api/documents/?title_content=A")
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 3)
self.assertCountEqual([results[0]['id'], results[1]['id'], results[2]['id']], [doc1.id, doc2.id, doc3.id])
response = self.client.get("/api/documents/?title_content=B")
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 3)
self.assertCountEqual([results[0]['id'], results[1]['id'], results[2]['id']], [doc2.id, doc3.id, doc4.id])
response = self.client.get("/api/documents/?title_content=X")
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 0)
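The three filters exercised above (tags__id__in, tags__id__all and title_content) are plain query-string parameters on the documents list endpoint. A small, hypothetical helper for building them, shown only for illustration and not part of the paperless codebase:

    def build_document_filter(tag_ids=None, match_all_tags=False, title_content=None):
        """Build query parameters for GET /api/documents/ (illustration only)."""
        params = {}
        if tag_ids:
            key = "tags__id__all" if match_all_tags else "tags__id__in"
            params[key] = ",".join(str(pk) for pk in tag_ids)
        if title_content:
            params["title_content"] = title_content
        return params

    # build_document_filter(tag_ids=[2, 3], match_all_tags=True)
    # -> {"tags__id__all": "2,3"}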
def test_search(self):
@@ -278,32 +306,24 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get("/api/search/?query=bank")
response = self.client.get("/api/documents/?query=bank")
results = response.data['results']
self.assertEqual(response.data['count'], 3)
self.assertEqual(response.data['page'], 1)
self.assertEqual(response.data['page_count'], 1)
self.assertEqual(len(results), 3)
response = self.client.get("/api/search/?query=september")
response = self.client.get("/api/documents/?query=september")
results = response.data['results']
self.assertEqual(response.data['count'], 1)
self.assertEqual(response.data['page'], 1)
self.assertEqual(response.data['page_count'], 1)
self.assertEqual(len(results), 1)
response = self.client.get("/api/search/?query=statement")
response = self.client.get("/api/documents/?query=statement")
results = response.data['results']
self.assertEqual(response.data['count'], 2)
self.assertEqual(response.data['page'], 1)
self.assertEqual(response.data['page_count'], 1)
self.assertEqual(len(results), 2)
response = self.client.get("/api/search/?query=sfegdfg")
response = self.client.get("/api/documents/?query=sfegdfg")
results = response.data['results']
self.assertEqual(response.data['count'], 0)
self.assertEqual(response.data['page'], 0)
self.assertEqual(response.data['page_count'], 0)
self.assertEqual(len(results), 0)
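Outside the Django test client, the same full-text query now goes through the documents list endpoint instead of /api/search/. A hedged example using requests, where the host and token are placeholders and the response keys follow the assertions above:

    import requests

    response = requests.get(
        "http://localhost:8000/api/documents/",
        params={"query": "bank", "page": 1, "page_size": 10},
        headers={"Authorization": "Token <api-token>"},  # placeholder credentials
    )
    data = response.json()
    print(data["count"], [hit["id"] for hit in data["results"]])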
def test_search_multi_page(self):
@@ -316,53 +336,34 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
seen_ids = []
for i in range(1, 6):
response = self.client.get(f"/api/search/?query=content&page={i}")
response = self.client.get(f"/api/documents/?query=content&page={i}&page_size=10")
results = response.data['results']
self.assertEqual(response.data['count'], 55)
self.assertEqual(response.data['page'], i)
self.assertEqual(response.data['page_count'], 6)
self.assertEqual(len(results), 10)
for result in results:
self.assertNotIn(result['id'], seen_ids)
seen_ids.append(result['id'])
response = self.client.get(f"/api/search/?query=content&page=6")
response = self.client.get(f"/api/documents/?query=content&page=6&page_size=10")
results = response.data['results']
self.assertEqual(response.data['count'], 55)
self.assertEqual(response.data['page'], 6)
self.assertEqual(response.data['page_count'], 6)
self.assertEqual(len(results), 5)
for result in results:
self.assertNotIn(result['id'], seen_ids)
seen_ids.append(result['id'])
response = self.client.get(f"/api/search/?query=content&page=7")
results = response.data['results']
self.assertEqual(response.data['count'], 55)
self.assertEqual(response.data['page'], 6)
self.assertEqual(response.data['page_count'], 6)
self.assertEqual(len(results), 5)
def test_search_invalid_page(self):
with AsyncWriter(index.open_index()) as writer:
for i in range(15):
doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content="content")
index.update_document(writer, doc)
first_page = self.client.get(f"/api/search/?query=content&page=1").data
second_page = self.client.get(f"/api/search/?query=content&page=2").data
should_be_first_page_1 = self.client.get(f"/api/search/?query=content&page=0").data
should_be_first_page_2 = self.client.get(f"/api/search/?query=content&page=dgfd").data
should_be_first_page_3 = self.client.get(f"/api/search/?query=content&page=").data
should_be_first_page_4 = self.client.get(f"/api/search/?query=content&page=-7868").data
self.assertDictEqual(first_page, should_be_first_page_1)
self.assertDictEqual(first_page, should_be_first_page_2)
self.assertDictEqual(first_page, should_be_first_page_3)
self.assertDictEqual(first_page, should_be_first_page_4)
self.assertNotEqual(len(first_page['results']), len(second_page['results']))
response = self.client.get(f"/api/documents/?query=content&page=0&page_size=10")
self.assertEqual(response.status_code, 404)
response = self.client.get(f"/api/documents/?query=content&page=3&page_size=10")
self.assertEqual(response.status_code, 404)
@mock.patch("documents.index.autocomplete")
def test_search_autocomplete(self, m):
@@ -386,6 +387,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 10)
@pytest.mark.skip(reason="Not implemented yet")
def test_search_spelling_correction(self):
with AsyncWriter(index.open_index()) as writer:
for i in range(55):
@@ -411,7 +413,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get(f"/api/search/?more_like={d2.id}")
response = self.client.get(f"/api/documents/?more_like_id={d2.id}")
self.assertEqual(response.status_code, 200)
@@ -421,6 +423,79 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(results[0]['id'], d3.id)
self.assertEqual(results[1]['id'], d1.id)
def test_search_filtering(self):
t = Tag.objects.create(name="tag")
t2 = Tag.objects.create(name="tag2")
c = Correspondent.objects.create(name="correspondent")
dt = DocumentType.objects.create(name="type")
d1 = Document.objects.create(checksum="1", correspondent=c, content="test")
d2 = Document.objects.create(checksum="2", document_type=dt, content="test")
d3 = Document.objects.create(checksum="3", content="test")
d3.tags.add(t)
d3.tags.add(t2)
d4 = Document.objects.create(checksum="4", created=datetime.datetime(2020, 7, 13), content="test")
d4.tags.add(t2)
d5 = Document.objects.create(checksum="5", added=datetime.datetime(2020, 7, 13), content="test")
d6 = Document.objects.create(checksum="6", content="test2")
with AsyncWriter(index.open_index()) as writer:
for doc in Document.objects.all():
index.update_document(writer, doc)
def search_query(q):
r = self.client.get("/api/documents/?query=test" + q)
self.assertEqual(r.status_code, 200)
return [hit['id'] for hit in r.data['results']]
self.assertCountEqual(search_query(""), [d1.id, d2.id, d3.id, d4.id, d5.id])
self.assertCountEqual(search_query("&is_tagged=true"), [d3.id, d4.id])
self.assertCountEqual(search_query("&is_tagged=false"), [d1.id, d2.id, d5.id])
self.assertCountEqual(search_query("&correspondent__id=" + str(c.id)), [d1.id])
self.assertCountEqual(search_query("&document_type__id=" + str(dt.id)), [d2.id])
self.assertCountEqual(search_query("&correspondent__isnull"), [d2.id, d3.id, d4.id, d5.id])
self.assertCountEqual(search_query("&document_type__isnull"), [d1.id, d3.id, d4.id, d5.id])
self.assertCountEqual(search_query("&tags__id__all=" + str(t.id) + "," + str(t2.id)), [d3.id])
self.assertCountEqual(search_query("&tags__id__all=" + str(t.id)), [d3.id])
self.assertCountEqual(search_query("&tags__id__all=" + str(t2.id)), [d3.id, d4.id])
self.assertIn(d4.id, search_query("&created__date__lt=" + datetime.datetime(2020, 9, 2).strftime("%Y-%m-%d")))
self.assertNotIn(d4.id, search_query("&created__date__gt=" + datetime.datetime(2020, 9, 2).strftime("%Y-%m-%d")))
self.assertNotIn(d4.id, search_query("&created__date__lt=" + datetime.datetime(2020, 1, 2).strftime("%Y-%m-%d")))
self.assertIn(d4.id, search_query("&created__date__gt=" + datetime.datetime(2020, 1, 2).strftime("%Y-%m-%d")))
self.assertIn(d5.id, search_query("&added__date__lt=" + datetime.datetime(2020, 9, 2).strftime("%Y-%m-%d")))
self.assertNotIn(d5.id, search_query("&added__date__gt=" + datetime.datetime(2020, 9, 2).strftime("%Y-%m-%d")))
self.assertNotIn(d5.id, search_query("&added__date__lt=" + datetime.datetime(2020, 1, 2).strftime("%Y-%m-%d")))
self.assertIn(d5.id, search_query("&added__date__gt=" + datetime.datetime(2020, 1, 2).strftime("%Y-%m-%d")))
def test_search_sorting(self):
c1 = Correspondent.objects.create(name="corres Ax")
c2 = Correspondent.objects.create(name="corres Cx")
c3 = Correspondent.objects.create(name="corres Bx")
d1 = Document.objects.create(checksum="1", correspondent=c1, content="test", archive_serial_number=2, title="3")
d2 = Document.objects.create(checksum="2", correspondent=c2, content="test", archive_serial_number=3, title="2")
d3 = Document.objects.create(checksum="3", correspondent=c3, content="test", archive_serial_number=1, title="1")
with AsyncWriter(index.open_index()) as writer:
for doc in Document.objects.all():
index.update_document(writer, doc)
def search_query(q):
r = self.client.get("/api/documents/?query=test" + q)
self.assertEqual(r.status_code, 200)
return [hit['id'] for hit in r.data['results']]
self.assertListEqual(search_query("&ordering=archive_serial_number"), [d3.id, d1.id, d2.id])
self.assertListEqual(search_query("&ordering=-archive_serial_number"), [d2.id, d1.id, d3.id])
self.assertListEqual(search_query("&ordering=title"), [d3.id, d2.id, d1.id])
self.assertListEqual(search_query("&ordering=-title"), [d1.id, d2.id, d3.id])
self.assertListEqual(search_query("&ordering=correspondent__name"), [d1.id, d3.id, d2.id])
self.assertListEqual(search_query("&ordering=-correspondent__name"), [d2.id, d3.id, d1.id])
def test_statistics(self):
doc1 = Document.objects.create(title="none1", checksum="A")
@@ -436,6 +511,13 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.data['documents_total'], 3)
self.assertEqual(response.data['documents_inbox'], 1)
def test_statistics_no_inbox_tag(self):
Document.objects.create(title="none1", checksum="A")
response = self.client.get("/api/statistics/")
self.assertEqual(response.status_code, 200)
self.assertEqual(response.data['documents_inbox'], None)
@mock.patch("documents.views.async_task")
def test_upload(self, m):
@@ -569,10 +651,13 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
async_task.assert_not_called()
def test_get_metadata(self):
doc = Document.objects.create(title="test", filename="file.pdf", mime_type="image/png", archive_checksum="A")
doc = Document.objects.create(title="test", filename="file.pdf", mime_type="image/png", archive_checksum="A", archive_filename="archive.pdf")
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), doc.source_path)
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), doc.archive_path)
source_file = os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png")
archive_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
shutil.copy(source_file, doc.source_path)
shutil.copy(archive_file, doc.archive_path)
response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
self.assertEqual(response.status_code, 200)
@@ -583,6 +668,14 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertTrue(meta['has_archive_version'])
self.assertEqual(len(meta['original_metadata']), 0)
self.assertGreater(len(meta['archive_metadata']), 0)
self.assertEqual(meta['media_filename'], "file.pdf")
self.assertEqual(meta['archive_media_filename'], "archive.pdf")
self.assertEqual(meta['original_size'], os.stat(source_file).st_size)
self.assertEqual(meta['archive_size'], os.stat(archive_file).st_size)
def test_get_metadata_invalid_doc(self):
response = self.client.get(f"/api/documents/34576/metadata/")
self.assertEqual(response.status_code, 404)
def test_get_metadata_no_archive(self):
doc = Document.objects.create(title="test", filename="file.pdf", mime_type="application/pdf")
@@ -598,6 +691,46 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertFalse(meta['has_archive_version'])
self.assertGreater(len(meta['original_metadata']), 0)
self.assertIsNone(meta['archive_metadata'])
self.assertIsNone(meta['archive_media_filename'])
def test_get_metadata_missing_files(self):
doc = Document.objects.create(title="test", filename="file.pdf", mime_type="application/pdf", archive_filename="file.pdf", archive_checksum="B", checksum="A")
response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
self.assertEqual(response.status_code, 200)
meta = response.data
self.assertTrue(meta['has_archive_version'])
self.assertIsNone(meta['original_metadata'])
self.assertIsNone(meta['original_size'])
self.assertIsNone(meta['archive_metadata'])
self.assertIsNone(meta['archive_size'])
def test_get_empty_suggestions(self):
doc = Document.objects.create(title="test", mime_type="application/pdf")
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
self.assertEqual(response.status_code, 200)
self.assertEqual(response.data, {'correspondents': [], 'tags': [], 'document_types': []})
def test_get_suggestions_invalid_doc(self):
response = self.client.get(f"/api/documents/34676/suggestions/")
self.assertEqual(response.status_code, 404)
@mock.patch("documents.views.match_correspondents")
@mock.patch("documents.views.match_tags")
@mock.patch("documents.views.match_document_types")
def test_get_suggestions(self, match_document_types, match_tags, match_correspondents):
doc = Document.objects.create(title="test", mime_type="application/pdf", content="this is an invoice!")
match_tags.return_value = [Tag(id=56), Tag(id=123)]
match_document_types.return_value = [DocumentType(id=23)]
match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
self.assertEqual(response.data, {'correspondents': [88,2], 'tags': [56,123], 'document_types': [23]})
def test_saved_views(self):
u1 = User.objects.create_user("user1")
@@ -683,6 +816,126 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
v1 = SavedView.objects.get(id=v1.id)
self.assertEqual(v1.filter_rules.count(), 0)
def test_get_logs(self):
response = self.client.get("/api/logs/")
self.assertEqual(response.status_code, 200)
self.assertCountEqual(response.data, ["mail", "paperless"])
def test_get_invalid_log(self):
response = self.client.get("/api/logs/bogus_log/")
self.assertEqual(response.status_code, 404)
@override_settings(LOGGING_DIR="bogus_dir")
def test_get_nonexistent_log(self):
response = self.client.get("/api/logs/paperless/")
self.assertEqual(response.status_code, 404)
def test_get_log(self):
log_data = "test\ntest2\n"
with open(os.path.join(settings.LOGGING_DIR, "paperless.log"), "w") as f:
f.write(log_data)
response = self.client.get("/api/logs/paperless/")
self.assertEqual(response.status_code, 200)
self.assertListEqual(response.data, ["test", "test2"])
def test_invalid_regex_other_algorithm(self):
for endpoint in ['correspondents', 'tags', 'document_types']:
response = self.client.post(f"/api/{endpoint}/", {
"name": "test",
"matching_algorithm": MatchingModel.MATCH_ANY,
"match": "["
}, format='json')
self.assertEqual(response.status_code, 201, endpoint)
def test_invalid_regex(self):
for endpoint in ['correspondents', 'tags', 'document_types']:
response = self.client.post(f"/api/{endpoint}/", {
"name": "test",
"matching_algorithm": MatchingModel.MATCH_REGEX,
"match": "["
}, format='json')
self.assertEqual(response.status_code, 400, endpoint)
def test_valid_regex(self):
for endpoint in ['correspondents', 'tags', 'document_types']:
response = self.client.post(f"/api/{endpoint}/", {
"name": "test",
"matching_algorithm": MatchingModel.MATCH_REGEX,
"match": "[0-9]"
}, format='json')
self.assertEqual(response.status_code, 201, endpoint)
def test_regex_no_algorithm(self):
for endpoint in ['correspondents', 'tags', 'document_types']:
response = self.client.post(f"/api/{endpoint}/", {
"name": "test",
"match": "[0-9]"
}, format='json')
self.assertEqual(response.status_code, 201, endpoint)
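Taken together, the four tests above pin down the validation rule: a match pattern is only compiled, and possibly rejected, when the matching algorithm is MATCH_REGEX. A sketch of that rule, not the actual serializer code:

    import re

    from documents.models import MatchingModel

    def validate_match(matching_algorithm, match):
        # only regex patterns are compiled; other algorithms accept any string
        if matching_algorithm == MatchingModel.MATCH_REGEX:
            try:
                re.compile(match)
            except re.error:
                raise ValueError(f"Invalid regular expression: {match!r}")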
def test_tag_color_default(self):
response = self.client.post("/api/tags/", {
"name": "tag"
}, format="json")
self.assertEqual(response.status_code, 201)
self.assertEqual(Tag.objects.get(id=response.data['id']).color, "#a6cee3")
self.assertEqual(self.client.get(f"/api/tags/{response.data['id']}/", format="json").data['colour'], 1)
def test_tag_color(self):
response = self.client.post("/api/tags/", {
"name": "tag",
"colour": 3
}, format="json")
self.assertEqual(response.status_code, 201)
self.assertEqual(Tag.objects.get(id=response.data['id']).color, "#b2df8a")
self.assertEqual(self.client.get(f"/api/tags/{response.data['id']}/", format="json").data['colour'], 3)
def test_tag_color_invalid(self):
response = self.client.post("/api/tags/", {
"name": "tag",
"colour": 34
}, format="json")
self.assertEqual(response.status_code, 400)
def test_tag_color_custom(self):
tag = Tag.objects.create(name="test", color="#abcdef")
self.assertEqual(self.client.get(f"/api/tags/{tag.id}/", format="json").data['colour'], 1)
class TestDocumentApiV2(DirectoriesMixin, APITestCase):
def setUp(self):
super(TestDocumentApiV2, self).setUp()
self.user = User.objects.create_superuser(username="temp_admin")
self.client.force_login(user=self.user)
self.client.defaults['HTTP_ACCEPT'] = 'application/json; version=2'
def test_tag_validate_color(self):
self.assertEqual(self.client.post("/api/tags/", {"name": "test", "color": "#12fFaA"}, format="json").status_code, 201)
self.assertEqual(self.client.post("/api/tags/", {"name": "test1", "color": "abcdef"}, format="json").status_code, 400)
self.assertEqual(self.client.post("/api/tags/", {"name": "test2", "color": "#abcdfg"}, format="json").status_code, 400)
self.assertEqual(self.client.post("/api/tags/", {"name": "test3", "color": "#asd"}, format="json").status_code, 400)
self.assertEqual(self.client.post("/api/tags/", {"name": "test4", "color": "#12121212"}, format="json").status_code, 400)
def test_tag_text_color(self):
t = Tag.objects.create(name="tag1", color="#000000")
self.assertEqual(self.client.get(f"/api/tags/{t.id}/", format="json").data['text_color'], "#ffffff")
t.color = "#ffffff"
t.save()
self.assertEqual(self.client.get(f"/api/tags/{t.id}/", format="json").data['text_color'], "#000000")
t.color = "asdf"
t.save()
self.assertEqual(self.client.get(f"/api/tags/{t.id}/", format="json").data['text_color'], "#000000")
t.color = "123"
t.save()
self.assertEqual(self.client.get(f"/api/tags/{t.id}/", format="json").data['text_color'], "#000000")
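The expectations above (white text on a black tag, black text on a white tag, and black as the fallback for unparseable values) are consistent with a simple luminance check. One plausible implementation, which may differ from the real model property:

    import re

    def text_color(color):
        if not re.fullmatch(r"#[0-9a-fA-F]{6}", color or ""):
            return "#000000"  # fallback for values like "asdf" or "123"
        r, g, b = (int(color[i:i + 2], 16) for i in (1, 3, 5))
        luminance = 0.299 * r + 0.587 * g + 0.114 * b
        return "#ffffff" if luminance < 128 else "#000000"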
class TestBulkEdit(DirectoriesMixin, APITestCase):
@@ -1037,6 +1290,113 @@ class TestBulkEdit(DirectoriesMixin, APITestCase):
self.assertCountEqual(response.data['selected_document_types'], [{"id": self.c1.id, "document_count": 1}, {"id": self.c2.id, "document_count": 0}])
class TestBulkDownload(DirectoriesMixin, APITestCase):
def setUp(self):
super(TestBulkDownload, self).setUp()
user = User.objects.create_superuser(username="temp_admin")
self.client.force_login(user=user)
self.doc1 = Document.objects.create(title="unrelated", checksum="A")
self.doc2 = Document.objects.create(title="document A", filename="docA.pdf", mime_type="application/pdf", checksum="B", created=datetime.datetime(2021, 1, 1))
self.doc2b = Document.objects.create(title="document A", filename="docA2.pdf", mime_type="application/pdf", checksum="D", created=datetime.datetime(2021, 1, 1))
self.doc3 = Document.objects.create(title="document B", filename="docB.jpg", mime_type="image/jpeg", checksum="C", created=datetime.datetime(2020, 3, 21), archive_filename="docB.pdf", archive_checksum="D")
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), self.doc2.source_path)
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.png"), self.doc2b.source_path)
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.jpg"), self.doc3.source_path)
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "test_with_bom.pdf"), self.doc3.archive_path)
def test_download_originals(self):
response = self.client.post("/api/documents/bulk_download/", json.dumps({
"documents": [self.doc2.id, self.doc3.id],
"content": "originals"
}), content_type='application/json')
self.assertEqual(response.status_code, 200)
self.assertEqual(response['Content-Type'], 'application/zip')
with zipfile.ZipFile(io.BytesIO(response.content)) as zipf:
self.assertEqual(len(zipf.filelist), 2)
self.assertIn("2021-01-01 document A.pdf", zipf.namelist())
self.assertIn("2020-03-21 document B.jpg", zipf.namelist())
with self.doc2.source_file as f:
self.assertEqual(f.read(), zipf.read("2021-01-01 document A.pdf"))
with self.doc3.source_file as f:
self.assertEqual(f.read(), zipf.read("2020-03-21 document B.jpg"))
def test_download_default(self):
response = self.client.post("/api/documents/bulk_download/", json.dumps({
"documents": [self.doc2.id, self.doc3.id]
}), content_type='application/json')
self.assertEqual(response.status_code, 200)
self.assertEqual(response['Content-Type'], 'application/zip')
with zipfile.ZipFile(io.BytesIO(response.content)) as zipf:
self.assertEqual(len(zipf.filelist), 2)
self.assertIn("2021-01-01 document A.pdf", zipf.namelist())
self.assertIn("2020-03-21 document B.pdf", zipf.namelist())
with self.doc2.source_file as f:
self.assertEqual(f.read(), zipf.read("2021-01-01 document A.pdf"))
with self.doc3.archive_file as f:
self.assertEqual(f.read(), zipf.read("2020-03-21 document B.pdf"))
def test_download_both(self):
response = self.client.post("/api/documents/bulk_download/", json.dumps({
"documents": [self.doc2.id, self.doc3.id],
"content": "both"
}), content_type='application/json')
self.assertEqual(response.status_code, 200)
self.assertEqual(response['Content-Type'], 'application/zip')
with zipfile.ZipFile(io.BytesIO(response.content)) as zipf:
self.assertEqual(len(zipf.filelist), 3)
self.assertIn("originals/2021-01-01 document A.pdf", zipf.namelist())
self.assertIn("archive/2020-03-21 document B.pdf", zipf.namelist())
self.assertIn("originals/2020-03-21 document B.jpg", zipf.namelist())
with self.doc2.source_file as f:
self.assertEqual(f.read(), zipf.read("originals/2021-01-01 document A.pdf"))
with self.doc3.archive_file as f:
self.assertEqual(f.read(), zipf.read("archive/2020-03-21 document B.pdf"))
with self.doc3.source_file as f:
self.assertEqual(f.read(), zipf.read("originals/2020-03-21 document B.jpg"))
def test_filename_clashes(self):
response = self.client.post("/api/documents/bulk_download/", json.dumps({
"documents": [self.doc2.id, self.doc2b.id]
}), content_type='application/json')
self.assertEqual(response.status_code, 200)
self.assertEqual(response['Content-Type'], 'application/zip')
with zipfile.ZipFile(io.BytesIO(response.content)) as zipf:
self.assertEqual(len(zipf.filelist), 2)
self.assertIn("2021-01-01 document A.pdf", zipf.namelist())
self.assertIn("2021-01-01 document A_01.pdf", zipf.namelist())
with self.doc2.source_file as f:
self.assertEqual(f.read(), zipf.read("2021-01-01 document A.pdf"))
with self.doc2b.source_file as f:
self.assertEqual(f.read(), zipf.read("2021-01-01 document A_01.pdf"))
def test_compression(self):
response = self.client.post("/api/documents/bulk_download/", json.dumps({
"documents": [self.doc2.id, self.doc2b.id],
"compression": "lzma"
}), content_type='application/json')
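For illustration, the bulk_download endpoint used by these tests can be driven from an ordinary HTTP client as well; the host, token and output path below are placeholders, and the parameter values are the ones exercised above:

    import requests

    response = requests.post(
        "http://localhost:8000/api/documents/bulk_download/",
        json={"documents": [2, 3], "content": "both", "compression": "lzma"},
        headers={"Authorization": "Token <api-token>"},  # placeholder credentials
    )
    with open("documents.zip", "wb") as f:
        f.write(response.content)  # a zip archive, per the Content-Type assertions above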
class TestApiAuth(APITestCase):
def test_auth_required(self):
@@ -1057,7 +1417,20 @@ class TestApiAuth(APITestCase):
self.assertEqual(self.client.get("/api/logs/").status_code, 401)
self.assertEqual(self.client.get("/api/saved_views/").status_code, 401)
self.assertEqual(self.client.get("/api/search/").status_code, 401)
self.assertEqual(self.client.get("/api/search/auto_complete/").status_code, 401)
self.assertEqual(self.client.get("/api/search/autocomplete/").status_code, 401)
self.assertEqual(self.client.get("/api/documents/bulk_edit/").status_code, 401)
self.assertEqual(self.client.get("/api/documents/bulk_download/").status_code, 401)
self.assertEqual(self.client.get("/api/documents/selection_data/").status_code, 401)
def test_api_version_no_auth(self):
response = self.client.get("/api/")
self.assertNotIn("X-Api-Version", response)
self.assertNotIn("X-Version", response)
def test_api_version_with_auth(self):
user = User.objects.create_superuser(username="test")
self.client.force_login(user)
response = self.client.get("/api/")
self.assertIn("X-Api-Version", response)
self.assertIn("X-Version", response)
View File
@@ -1,10 +1,13 @@
import os
import tempfile
from time import sleep
from pathlib import Path
from unittest import mock
import pytest
from django.conf import settings
from django.test import TestCase, override_settings
from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError
from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError, load_classifier
from documents.models import Correspondent, Document, Tag, DocumentType
from documents.tests.utils import DirectoriesMixin
@@ -82,37 +85,19 @@ class TestClassifier(DirectoriesMixin, TestCase):
self.assertTrue(self.classifier.train())
self.assertFalse(self.classifier.train())
self.classifier.save_classifier()
self.classifier.save()
classifier2 = DocumentClassifier()
current_ver = DocumentClassifier.FORMAT_VERSION
with mock.patch("documents.classifier.DocumentClassifier.FORMAT_VERSION", current_ver+1):
# ensure that we won't load old classifiers.
self.assertRaises(IncompatibleClassifierVersionError, classifier2.reload)
self.assertRaises(IncompatibleClassifierVersionError, classifier2.load)
self.classifier.save_classifier()
self.classifier.save()
# ensure that we can load the classifier after saving it.
classifier2.reload()
def testReload(self):
self.generate_test_data()
self.assertTrue(self.classifier.train())
self.classifier.save_classifier()
classifier2 = DocumentClassifier()
classifier2.reload()
v1 = classifier2.classifier_version
# change the classifier after some time.
sleep(1)
self.classifier.save_classifier()
classifier2.reload()
v2 = classifier2.classifier_version
self.assertNotEqual(v1, v2)
classifier2.load()
@override_settings(DATA_DIR=tempfile.mkdtemp())
def testSaveClassifier(self):
@@ -121,12 +106,21 @@ class TestClassifier(DirectoriesMixin, TestCase):
self.classifier.train()
self.classifier.save_classifier()
self.classifier.save()
new_classifier = DocumentClassifier()
new_classifier.reload()
new_classifier.load()
self.assertFalse(new_classifier.train())
@override_settings(MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle"))
def test_load_and_classify(self):
self.generate_test_data()
new_classifier = DocumentClassifier()
new_classifier.load()
self.assertCountEqual(new_classifier.predict_tags(self.doc2.content), [45, 12])
def test_one_correspondent_predict(self):
c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A")
@@ -235,3 +229,42 @@ class TestClassifier(DirectoriesMixin, TestCase):
self.classifier.train()
self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
self.assertListEqual(self.classifier.predict_tags(doc2.content), [])
def test_load_classifier_not_exists(self):
self.assertFalse(os.path.exists(settings.MODEL_FILE))
self.assertIsNone(load_classifier())
@mock.patch("documents.classifier.DocumentClassifier.load")
def test_load_classifier(self, load):
Path(settings.MODEL_FILE).touch()
self.assertIsNotNone(load_classifier())
load.assert_called_once()
@override_settings(CACHES={'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'}})
@override_settings(MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle"))
@pytest.mark.skip(reason="Disabled caching due to high memory usage - need to investigate.")
def test_load_classifier_cached(self):
classifier = load_classifier()
self.assertIsNotNone(classifier)
with mock.patch("documents.classifier.DocumentClassifier.load") as load:
classifier2 = load_classifier()
load.assert_not_called()
@mock.patch("documents.classifier.DocumentClassifier.load")
def test_load_classifier_incompatible_version(self, load):
Path(settings.MODEL_FILE).touch()
self.assertTrue(os.path.exists(settings.MODEL_FILE))
load.side_effect = IncompatibleClassifierVersionError()
self.assertIsNone(load_classifier())
self.assertFalse(os.path.exists(settings.MODEL_FILE))
@mock.patch("documents.classifier.DocumentClassifier.load")
def test_load_classifier_os_error(self, load):
Path(settings.MODEL_FILE).touch()
self.assertTrue(os.path.exists(settings.MODEL_FILE))
load.side_effect = OSError()
self.assertIsNone(load_classifier())
self.assertTrue(os.path.exists(settings.MODEL_FILE))
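Read together, the new tests describe load_classifier() as: return None when no model file exists, delete the model file and return None when the stored format version is incompatible, and keep the file but still return None on an OSError. A sketch of that behaviour, not the actual documents.classifier code:

    import os

    from django.conf import settings
    from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError

    def load_classifier():
        if not os.path.isfile(settings.MODEL_FILE):
            return None
        classifier = DocumentClassifier()
        try:
            classifier.load()
        except IncompatibleClassifierVersionError:
            # the stored model uses an old format: discard it so it gets retrained
            os.unlink(settings.MODEL_FILE)
            return None
        except OSError:
            # read error: keep the file and simply skip classification for now
            return None
        return classifier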
View File
@@ -5,12 +5,14 @@ import tempfile
from unittest import mock
from unittest.mock import MagicMock
from django.conf import settings
from django.test import TestCase, override_settings
from .utils import DirectoriesMixin
from ..consumer import Consumer, ConsumerError
from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
from ..parsers import DocumentParser, ParseError
from ..tasks import sanity_check
class TestAttributes(TestCase):
@@ -165,25 +167,43 @@ class TestFieldPermutations(TestCase):
class DummyParser(DocumentParser):
def get_thumbnail(self, document_path, mime_type):
def get_thumbnail(self, document_path, mime_type, file_name=None):
# not important during tests
raise NotImplementedError()
def __init__(self, logging_group, scratch_dir, archive_path):
super(DummyParser, self).__init__(logging_group)
super(DummyParser, self).__init__(logging_group, None)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
self.archive_path = archive_path
def get_optimised_thumbnail(self, document_path, mime_type):
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
def parse(self, document_path, mime_type, file_name=None):
self.text = "The Text"
class CopyParser(DocumentParser):
def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
def __init__(self, logging_group, progress_callback=None):
super(CopyParser, self).__init__(logging_group, progress_callback)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=self.tempdir)
def parse(self, document_path, mime_type, file_name=None):
self.text = "The text"
self.archive_path = os.path.join(self.tempdir, "archive.pdf")
shutil.copy(document_path, self.archive_path)
class FaultyParser(DocumentParser):
def get_thumbnail(self, document_path, mime_type):
def get_thumbnail(self, document_path, mime_type, file_name=None):
# not important during tests
raise NotImplementedError()
@@ -191,7 +211,7 @@ class FaultyParser(DocumentParser):
super(FaultyParser, self).__init__(logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
def get_optimised_thumbnail(self, document_path, mime_type):
def get_optimised_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
def parse(self, document_path, mime_type, file_name=None):
@@ -203,6 +223,8 @@ def fake_magic_from_file(file, mime=False):
if mime:
if os.path.splitext(file)[1] == ".pdf":
return "application/pdf"
elif os.path.splitext(file)[1] == ".png":
return "image/png"
else:
return "unknown"
else:
@@ -212,10 +234,24 @@ def fake_magic_from_file(file, mime=False):
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(DirectoriesMixin, TestCase):
def make_dummy_parser(self, logging_group):
def _assert_first_last_send_progress(self, first_status="STARTING", last_status="SUCCESS", first_progress=0, first_progress_max=100, last_progress=100, last_progress_max=100):
self._send_progress.assert_called()
args, kwargs = self._send_progress.call_args_list[0]
self.assertEqual(args[0], first_progress)
self.assertEqual(args[1], first_progress_max)
self.assertEqual(args[2], first_status)
args, kwargs = self._send_progress.call_args_list[len(self._send_progress.call_args_list) - 1]
self.assertEqual(args[0], last_progress)
self.assertEqual(args[1], last_progress_max)
self.assertEqual(args[2], last_status)
def make_dummy_parser(self, logging_group, progress_callback=None):
return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file())
def make_faulty_parser(self, logging_group):
def make_faulty_parser(self, logging_group, progress_callback=None):
return FaultyParser(logging_group, self.dirs.scratch_dir)
def setUp(self):
@@ -228,7 +264,11 @@ class TestConsumer(DirectoriesMixin, TestCase):
"mime_types": {"application/pdf": ".pdf"},
"weight": 0
})]
self.addCleanup(patcher.stop)
# this prevents websocket message reports during testing.
patcher = mock.patch("documents.consumer.Consumer._send_progress")
self._send_progress = patcher.start()
self.addCleanup(patcher.stop)
self.consumer = Consumer()
@@ -256,6 +296,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertIsNone(document.correspondent)
self.assertIsNone(document.document_type)
self.assertEqual(document.filename, "0000001.pdf")
self.assertEqual(document.archive_filename, "0000001.pdf")
self.assertTrue(os.path.isfile(
document.source_path
@@ -274,6 +315,29 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertFalse(os.path.isfile(filename))
self._assert_first_last_send_progress()
@override_settings(PAPERLESS_FILENAME_FORMAT=None)
def testDeleteMacFiles(self):
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
filename = self.get_test_file()
shadow_file = os.path.join(self.dirs.scratch_dir, "._sample.pdf")
shutil.copy(filename, shadow_file)
self.assertTrue(os.path.isfile(shadow_file))
document = self.consumer.try_consume_file(filename)
self.assertTrue(os.path.isfile(
document.source_path
))
self.assertFalse(os.path.isfile(shadow_file))
self.assertFalse(os.path.isfile(filename))
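The test above expects macOS AppleDouble companions (here ._sample.pdf) to be removed along with the consumed file. A minimal sketch of such a cleanup step, assuming the consumer simply deletes the shadow file when one exists next to the original:

    import os

    def delete_mac_shadow_file(path):
        shadow = os.path.join(os.path.dirname(path), "._" + os.path.basename(path))
        if os.path.isfile(shadow):
            os.remove(shadow)  # e.g. "._sample.pdf" next to "sample.pdf"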
def testOverrideFilename(self):
filename = self.get_test_file()
override_filename = "Statement for November.pdf"
@@ -282,21 +346,26 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertEqual(document.title, "Statement for November")
self._assert_first_last_send_progress()
def testOverrideTitle(self):
document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title")
self.assertEqual(document.title, "Override Title")
self._assert_first_last_send_progress()
def testOverrideCorrespondent(self):
c = Correspondent.objects.create(name="test")
document = self.consumer.try_consume_file(self.get_test_file(), override_correspondent_id=c.pk)
self.assertEqual(document.correspondent.id, c.id)
self._assert_first_last_send_progress()
def testOverrideDocumentType(self):
dt = DocumentType.objects.create(name="test")
document = self.consumer.try_consume_file(self.get_test_file(), override_document_type_id=dt.pk)
self.assertEqual(document.document_type.id, dt.id)
self._assert_first_last_send_progress()
def testOverrideTags(self):
t1 = Tag.objects.create(name="t1")
@@ -307,37 +376,42 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertIn(t1, document.tags.all())
self.assertNotIn(t2, document.tags.all())
self.assertIn(t3, document.tags.all())
self._assert_first_last_send_progress()
def testNotAFile(self):
try:
self.consumer.try_consume_file("non-existing-file")
except ConsumerError as e:
self.assertTrue(str(e).endswith('It is not a file'))
return
self.fail("Should throw exception")
self.assertRaisesMessage(
ConsumerError,
"File not found",
self.consumer.try_consume_file,
"non-existing-file"
)
self._assert_first_last_send_progress(last_status="FAILED")
def testDuplicates1(self):
self.consumer.try_consume_file(self.get_test_file())
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertTrue(str(e).endswith("It is a duplicate."))
return
self.assertRaisesMessage(
ConsumerError,
"It is a duplicate",
self.consumer.try_consume_file,
self.get_test_file()
)
self.fail("Should throw exception")
self._assert_first_last_send_progress(last_status="FAILED")
def testDuplicates2(self):
self.consumer.try_consume_file(self.get_test_file())
try:
self.consumer.try_consume_file(self.get_test_archive_file())
except ConsumerError as e:
self.assertTrue(str(e).endswith("It is a duplicate."))
return
self.assertRaisesMessage(
ConsumerError,
"It is a duplicate",
self.consumer.try_consume_file,
self.get_test_archive_file()
)
self.fail("Should throw exception")
self._assert_first_last_send_progress(last_status="FAILED")
def testDuplicates3(self):
self.consumer.try_consume_file(self.get_test_archive_file())
@@ -347,13 +421,15 @@ class TestConsumer(DirectoriesMixin, TestCase):
def testNoParsers(self, m):
m.return_value = []
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertEqual("Unsupported mime type application/pdf of file sample.pdf", str(e))
return
self.assertRaisesMessage(
ConsumerError,
"sample.pdf: Unsupported mime type application/pdf",
self.consumer.try_consume_file,
self.get_test_file()
)
self._assert_first_last_send_progress(last_status="FAILED")
self.fail("Should throw exception")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def testFaultyParser(self, m):
@@ -363,24 +439,28 @@ class TestConsumer(DirectoriesMixin, TestCase):
"weight": 0
})]
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertEqual(str(e), "Does not compute.")
return
self.assertRaisesMessage(
ConsumerError,
"sample.pdf: Error while consuming document sample.pdf: Does not compute.",
self.consumer.try_consume_file,
self.get_test_file()
)
self.fail("Should throw exception.")
self._assert_first_last_send_progress(last_status="FAILED")
@mock.patch("documents.consumer.Consumer._write")
def testPostSaveError(self, m):
filename = self.get_test_file()
m.side_effect = OSError("NO.")
try:
self.consumer.try_consume_file(filename)
except ConsumerError as e:
self.assertEqual(str(e), "NO.")
else:
self.fail("Should raise exception")
self.assertRaisesMessage(
ConsumerError,
"sample.pdf: The following error occured while consuming sample.pdf: NO.",
self.consumer.try_consume_file,
filename
)
self._assert_first_last_send_progress(last_status="FAILED")
# file not deleted
self.assertTrue(os.path.isfile(filename))
@@ -396,6 +476,9 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertEqual(document.title, "new docs")
self.assertEqual(document.filename, "none/new docs.pdf")
self.assertEqual(document.archive_filename, "none/new docs.pdf")
self._assert_first_last_send_progress()
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
@mock.patch("documents.signals.handlers.generate_unique_filename")
@@ -408,7 +491,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
filenames.insert(0, f)
return f
m.side_effect = lambda f, root: get_filename()
m.side_effect = lambda f, archive_filename = False: get_filename()
filename = self.get_test_file()
@@ -419,8 +502,11 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertEqual(document.title, "new docs")
self.assertIsNotNone(os.path.isfile(document.title))
self.assertTrue(os.path.isfile(document.source_path))
self.assertTrue(os.path.isfile(document.archive_path))
@mock.patch("documents.consumer.DocumentClassifier")
self._assert_first_last_send_progress()
@mock.patch("documents.consumer.load_classifier")
def testClassifyDocument(self, m):
correspondent = Correspondent.objects.create(name="test")
dtype = DocumentType.objects.create(name="test")
@@ -439,19 +525,26 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertIn(t1, document.tags.all())
self.assertNotIn(t2, document.tags.all())
self._assert_first_last_send_progress()
@override_settings(CONSUMER_DELETE_DUPLICATES=True)
def test_delete_duplicate(self):
dst = self.get_test_file()
self.assertTrue(os.path.isfile(dst))
doc = self.consumer.try_consume_file(dst)
self._assert_first_last_send_progress()
self.assertFalse(os.path.isfile(dst))
self.assertIsNotNone(doc)
self._send_progress.reset_mock()
dst = self.get_test_file()
self.assertTrue(os.path.isfile(dst))
self.assertRaises(ConsumerError, self.consumer.try_consume_file, dst)
self.assertFalse(os.path.isfile(dst))
self._assert_first_last_send_progress(last_status="FAILED")
@override_settings(CONSUMER_DELETE_DUPLICATES=False)
def test_no_delete_duplicate(self):
@@ -467,6 +560,32 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertRaises(ConsumerError, self.consumer.try_consume_file, dst)
self.assertTrue(os.path.isfile(dst))
self._assert_first_last_send_progress(last_status="FAILED")
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_similar_filenames(self, m):
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), os.path.join(settings.CONSUMPTION_DIR, "simple.pdf"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.png"), os.path.join(settings.CONSUMPTION_DIR, "simple.png"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple-noalpha.png"), os.path.join(settings.CONSUMPTION_DIR, "simple.png.pdf"))
m.return_value = [(None, {
"parser": CopyParser,
"mime_types": {"application/pdf": ".pdf", "image/png": ".png"},
"weight": 0
})]
doc1 = self.consumer.try_consume_file(os.path.join(settings.CONSUMPTION_DIR, "simple.png"))
doc2 = self.consumer.try_consume_file(os.path.join(settings.CONSUMPTION_DIR, "simple.pdf"))
doc3 = self.consumer.try_consume_file(os.path.join(settings.CONSUMPTION_DIR, "simple.png.pdf"))
self.assertEqual(doc1.filename, "simple.png")
self.assertEqual(doc1.archive_filename, "simple.pdf")
self.assertEqual(doc2.filename, "simple.pdf")
self.assertEqual(doc2.archive_filename, "simple_01.pdf")
self.assertEqual(doc3.filename, "simple.png.pdf")
self.assertEqual(doc3.archive_filename, "simple.png.pdf")
sanity_check()
class PreConsumeTestCase(TestCase):
@@ -479,9 +598,11 @@ class PreConsumeTestCase(TestCase):
m.assert_not_called()
@mock.patch("documents.consumer.Popen")
@mock.patch("documents.consumer.Consumer._send_progress")
@override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
def test_pre_consume_script_not_found(self, m):
def test_pre_consume_script_not_found(self, m, m2):
c = Consumer()
c.filename = "somefile.pdf"
c.path = "path-to-file"
self.assertRaises(ConsumerError, c.run_pre_consume_script)
@@ -503,7 +624,6 @@ class PreConsumeTestCase(TestCase):
self.assertEqual(command[1], "path-to-file")
class PostConsumeTestCase(TestCase):
@mock.patch("documents.consumer.Popen")
@@ -519,12 +639,13 @@ class PostConsumeTestCase(TestCase):
m.assert_not_called()
@override_settings(POST_CONSUME_SCRIPT="does-not-exist")
def test_post_consume_script_not_found(self):
@mock.patch("documents.consumer.Consumer._send_progress")
def test_post_consume_script_not_found(self, m):
doc = Document.objects.create(title="Test", mime_type="application/pdf")
self.assertRaises(ConsumerError, Consumer().run_post_consume_script, doc)
c = Consumer()
c.filename = "somefile.pdf"
self.assertRaises(ConsumerError, c.run_post_consume_script, doc)
@mock.patch("documents.consumer.Popen")
def test_post_consume_script_simple(self, m):
View File
@@ -1,7 +1,6 @@
import datetime
import os
import shutil
from unittest import mock
from uuid import uuid4
from dateutil import tz
@@ -9,7 +8,6 @@ from django.conf import settings
from django.test import TestCase, override_settings
from documents.parsers import parse_date
from paperless_tesseract.parsers import RasterisedDocumentParser
class TestDate(TestCase):
@@ -152,4 +150,4 @@ class TestDate(TestCase):
2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
View File

@@ -201,6 +201,13 @@ class TestFileHandling(DirectoriesMixin, TestCase):
self.assertEqual(generate_filename(d), "my_doc_type - the_doc.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{asn} - {title}")
def test_asn(self):
d1 = Document.objects.create(title="the_doc", mime_type="application/pdf", archive_serial_number=652, checksum="A")
d2 = Document.objects.create(title="the_doc", mime_type="application/pdf", archive_serial_number=None, checksum="B")
self.assertEqual(generate_filename(d1), "652 - the_doc.pdf")
self.assertEqual(generate_filename(d2), "none - the_doc.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_underscore(self):
document = Document()
@@ -439,6 +446,18 @@ class TestFileHandling(DirectoriesMixin, TestCase):
self.assertEqual(document2.filename, "qwe.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
@mock.patch("documents.signals.handlers.Document.objects.filter")
def test_no_update_without_change(self, m):
doc = Document.objects.create(title="document", filename="document.pdf", archive_filename="document.pdf", checksum="A", archive_checksum="B", mime_type="application/pdf")
Path(doc.source_path).touch()
Path(doc.archive_path).touch()
doc.save()
m.assert_not_called()
class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
@@ -448,7 +467,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_filename="0000001.pdf", archive_checksum="B")
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
@@ -461,7 +480,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
self.assertFalse(os.path.isfile(original))
self.assertFalse(os.path.isfile(archive))
@@ -475,7 +494,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
self.assertTrue(os.path.isfile(original))
self.assertFalse(os.path.isfile(archive))
@@ -486,14 +505,49 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
def test_move_archive_exists(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
existing_archive_file = os.path.join(settings.ARCHIVE_DIR, "none", "my_doc.pdf")
Path(original).touch()
Path(archive).touch()
os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none"))
Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc.pdf")).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
Path(existing_archive_file).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
self.assertFalse(os.path.isfile(original))
self.assertFalse(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
self.assertTrue(os.path.isfile(existing_archive_file))
self.assertEqual(doc.archive_filename, "none/my_doc_01.pdf")
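The new assertion above (archive_filename == "none/my_doc_01.pdf") shows the collision handling: when the generated name is already taken, a two-digit counter is appended. A rough sketch of that uniquification, not the real generate_unique_filename() implementation:

    import os

    def unique_name(directory, name):
        root, ext = os.path.splitext(name)
        candidate, counter = name, 0
        while os.path.exists(os.path.join(directory, candidate)):
            counter += 1
            candidate = f"{root}_{counter:02}{ext}"
        return candidate

    # unique_name(ARCHIVE_DIR, "none/my_doc.pdf") -> "none/my_doc_01.pdf"
    # when the original name already exists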
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
def test_move_original_only(self):
original = os.path.join(settings.ORIGINALS_DIR, "document_01.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "document.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="document", filename="document_01.pdf", checksum="A",
archive_checksum="B", archive_filename="document.pdf")
self.assertEqual(doc.filename, "document.pdf")
self.assertEqual(doc.archive_filename, "document.pdf")
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
def test_move_archive_only(self):
original = os.path.join(settings.ORIGINALS_DIR, "document.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "document_01.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="document", filename="document.pdf", checksum="A",
archive_checksum="B", archive_filename="document_01.pdf")
self.assertEqual(doc.filename, "document.pdf")
self.assertEqual(doc.archive_filename, "document.pdf")
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
@@ -514,8 +568,9 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
m.assert_called()
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
@@ -527,7 +582,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
#Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", archive_filename="0000001.pdf", checksum="A", archive_checksum="B")
self.assertFalse(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
@@ -551,19 +606,21 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", archive_filename="0000001.pdf", checksum="A", archive_checksum="B")
m.assert_called()
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_archive_deleted(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
@@ -577,6 +634,28 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
self.assertFalse(os.path.isfile(doc.source_path))
self.assertFalse(os.path.isfile(doc.archive_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
def test_archive_deleted2(self):
original = os.path.join(settings.ORIGINALS_DIR, "document.png")
original2 = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(original2).touch()
Path(archive).touch()
doc1 = Document.objects.create(mime_type="image/png", title="document", filename="document.png", checksum="A", archive_checksum="B", archive_filename="0000001.pdf")
doc2 = Document.objects.create(mime_type="application/pdf", title="0000001", filename="0000001.pdf", checksum="C")
self.assertTrue(os.path.isfile(doc1.source_path))
self.assertTrue(os.path.isfile(doc1.archive_path))
self.assertTrue(os.path.isfile(doc2.source_path))
doc2.delete()
self.assertTrue(os.path.isfile(doc1.source_path))
self.assertTrue(os.path.isfile(doc1.archive_path))
self.assertFalse(os.path.isfile(doc2.source_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
def test_database_error(self):
@@ -584,7 +663,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_filename="0000001.pdf", archive_checksum="B")
with mock.patch("documents.signals.handlers.Document.objects.filter") as m:
m.side_effect = DatabaseError()
doc.save()
@@ -594,6 +673,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
class TestFilenameGeneration(TestCase):
@override_settings(
@@ -617,7 +697,7 @@ class TestFilenameGeneration(TestCase):
def run():
doc = Document.objects.create(checksum=str(uuid.uuid4()), title=str(uuid.uuid4()), content="wow")
doc.filename = generate_unique_filename(doc, settings.ORIGINALS_DIR)
doc.filename = generate_unique_filename(doc)
Path(doc.thumbnail_path).touch()
with open(doc.source_path, "w") as f:
f.write(str(uuid.uuid4()))
View File
@@ -1,20 +1,10 @@
from django.test import TestCase
from documents import index
from documents.index import JsonFormatter
from documents.models import Document
from documents.tests.utils import DirectoriesMixin
class JsonFormatterTest(TestCase):
def setUp(self) -> None:
self.formatter = JsonFormatter()
def test_empty_fragments(self):
self.assertListEqual(self.formatter.format([]), [])
class TestAutoComplete(DirectoriesMixin, TestCase):
def test_auto_complete(self):
View File
@@ -1,66 +0,0 @@
import logging
import uuid
from unittest import mock
from django.test import TestCase, override_settings
from ..models import Log
class TestPaperlessLog(TestCase):
def __init__(self, *args, **kwargs):
TestCase.__init__(self, *args, **kwargs)
self.logger = logging.getLogger(
"documents.management.commands.document_consumer")
@override_settings(DISABLE_DBHANDLER=False)
def test_that_it_saves_at_all(self):
kw = {"group": uuid.uuid4()}
self.assertEqual(Log.objects.all().count(), 0)
with mock.patch("logging.StreamHandler.emit") as __:
# Debug messages are ignored by default
self.logger.debug("This is a debugging message", extra=kw)
self.assertEqual(Log.objects.all().count(), 1)
self.logger.info("This is an informational message", extra=kw)
self.assertEqual(Log.objects.all().count(), 2)
self.logger.warning("This is an warning message", extra=kw)
self.assertEqual(Log.objects.all().count(), 3)
self.logger.error("This is an error message", extra=kw)
self.assertEqual(Log.objects.all().count(), 4)
self.logger.critical("This is a critical message", extra=kw)
self.assertEqual(Log.objects.all().count(), 5)
@override_settings(DISABLE_DBHANDLER=False)
def test_groups(self):
kw1 = {"group": uuid.uuid4()}
kw2 = {"group": uuid.uuid4()}
self.assertEqual(Log.objects.all().count(), 0)
with mock.patch("logging.StreamHandler.emit") as __:
self.logger.info("This is an informational message", extra=kw2)
self.assertEqual(Log.objects.all().count(), 1)
self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 1)
self.logger.warning("This is an warning message", extra=kw1)
self.assertEqual(Log.objects.all().count(), 2)
self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 1)
self.logger.error("This is an error message", extra=kw2)
self.assertEqual(Log.objects.all().count(), 3)
self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 2)
self.logger.critical("This is a critical message", extra=kw1)
self.assertEqual(Log.objects.all().count(), 4)
self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 2)


@@ -20,6 +20,7 @@ from documents.tests.utils import DirectoriesMixin
sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
class TestArchiver(DirectoriesMixin, TestCase):
def make_models(self):
@@ -42,9 +43,42 @@ class TestArchiver(DirectoriesMixin, TestCase):
doc = Document.objects.get(id=doc.id)
self.assertIsNotNone(doc.checksum)
self.assertIsNotNone(doc.archive_checksum)
self.assertTrue(os.path.isfile(doc.archive_path))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(filecmp.cmp(sample_file, doc.source_path))
self.assertEqual(doc.archive_filename, "none/A.pdf")
def test_unknown_mime_type(self):
doc = self.make_models()
doc.mime_type = "sdgfh"
doc.save()
shutil.copy(sample_file, doc.source_path)
handle_document(doc.pk)
doc = Document.objects.get(id=doc.id)
self.assertIsNotNone(doc.checksum)
self.assertIsNone(doc.archive_checksum)
self.assertIsNone(doc.archive_filename)
self.assertTrue(os.path.isfile(doc.source_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
def test_naming_priorities(self):
doc1 = Document.objects.create(checksum="A", title="document", content="first document", mime_type="application/pdf", filename="document.pdf")
doc2 = Document.objects.create(checksum="B", title="document", content="second document", mime_type="application/pdf", filename="document_01.pdf")
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"document.pdf"))
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"document_01.pdf"))
handle_document(doc2.pk)
handle_document(doc1.pk)
doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)
self.assertEqual(doc1.archive_filename, "document.pdf")
self.assertEqual(doc2.archive_filename, "document_01.pdf")
class TestDecryptDocuments(TestCase):
@@ -106,24 +140,27 @@ class TestMakeIndex(TestCase):
class TestRenamer(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_rename(self):
doc = Document.objects.create(title="test", mime_type="application/pdf")
doc = Document.objects.create(title="test", mime_type="image/jpeg")
doc.filename = generate_filename(doc)
doc.archive_filename = generate_filename(doc, archive_filename=True)
doc.save()
Path(doc.source_path).touch()
Path(doc.archive_path).touch()
old_source_path = doc.source_path
with override_settings(PAPERLESS_FILENAME_FORMAT="{title}"):
with override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}"):
call_command("document_renamer")
doc2 = Document.objects.get(id=doc.id)
self.assertEqual(doc2.filename, "test.pdf")
self.assertFalse(os.path.isfile(old_source_path))
self.assertEqual(doc2.filename, "none/test.jpg")
self.assertEqual(doc2.archive_filename, "none/test.pdf")
self.assertFalse(os.path.isfile(doc.source_path))
self.assertFalse(os.path.isfile(doc.archive_path))
self.assertTrue(os.path.isfile(doc2.source_path))
self.assertTrue(os.path.isfile(doc2.archive_path))
class TestCreateClassifier(TestCase):
@@ -133,3 +170,24 @@ class TestCreateClassifier(TestCase):
call_command("document_create_classifier")
m.assert_called_once()
class TestSanityChecker(DirectoriesMixin, TestCase):
def test_no_issues(self):
with self.assertLogs() as capture:
call_command("document_sanity_checker")
self.assertEqual(len(capture.output), 1)
self.assertIn("Sanity checker detected no issues.", capture.output[0])
def test_errors(self):
doc = Document.objects.create(title="test", content="test", filename="test.pdf", checksum="abc")
Path(doc.source_path).touch()
Path(doc.thumbnail_path).touch()
with self.assertLogs() as capture:
call_command("document_sanity_checker")
self.assertEqual(len(capture.output), 1)
self.assertIn("Checksum mismatch of document", capture.output[0])


@@ -60,10 +60,10 @@ class ConsumerMixin:
super(ConsumerMixin, self).tearDown()
def wait_for_task_mock_call(self):
def wait_for_task_mock_call(self, expected_call_count=1):
n = 0
while n < 100:
if self.task_mock.call_count > 0:
if self.task_mock.call_count >= expected_call_count:
# give task_mock some time to finish and raise errors
sleep(1)
return
@@ -202,8 +202,44 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')
def test_mac_write(self):
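# macOS drops metadata files (.DS_STORE and AppleDouble "._*" files) next to real
# documents; the consumer must skip them and only pick up the actual PDFs.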
self.task_mock.side_effect = self.bogus_task
@override_settings(CONSUMER_POLLING=1)
self.t_start()
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, ".DS_STORE"))
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, "my_file.pdf"))
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, "._my_file.pdf"))
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, "my_second_file.pdf"))
shutil.copy(self.sample_file, os.path.join(self.dirs.consumption_dir, "._my_second_file.pdf"))
sleep(5)
self.wait_for_task_mock_call(expected_call_count=2)
self.assertEqual(2, self.task_mock.call_count)
fnames = [os.path.basename(args[1]) for args, _ in self.task_mock.call_args_list]
self.assertCountEqual(fnames, ["my_file.pdf", "my_second_file.pdf"])
def test_is_ignored(self):
test_paths = [
(os.path.join(self.dirs.consumption_dir, "foo.pdf"), False),
(os.path.join(self.dirs.consumption_dir, "foo","bar.pdf"), False),
(os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"), True),
(os.path.join(self.dirs.consumption_dir, "foo", ".DS_STORE", "bar.pdf"), True),
(os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"), True),
(os.path.join(self.dirs.consumption_dir, "._foo.pdf"), True),
(os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"), False),
]
for file_path, expected_ignored in test_paths:
self.assertEqual(
expected_ignored,
document_consumer._is_ignored(file_path),
f'_is_ignored("{file_path}") != {expected_ignored}')
@override_settings(CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=1, CONSUMER_POLLING_RETRY_COUNT=20)
class TestConsumerPolling(TestConsumer):
# just do all the tests with polling
pass
@@ -215,8 +251,7 @@ class TestConsumerRecursive(TestConsumer):
pass
@override_settings(CONSUMER_RECURSIVE=True)
@override_settings(CONSUMER_POLLING=1)
@override_settings(CONSUMER_RECURSIVE=True, CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=1, CONSUMER_POLLING_RETRY_COUNT=20)
class TestConsumerRecursivePolling(TestConsumer):
# just do all the tests with polling and recursive
pass
@@ -257,6 +292,6 @@ class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
# their order.
self.assertCountEqual(kwargs["override_tag_ids"], tag_ids)
@override_settings(CONSUMER_POLLING=1)
@override_settings(CONSUMER_POLLING=1, CONSUMER_POLLING_DELAY=1, CONSUMER_POLLING_RETRY_COUNT=20)
def test_consume_file_with_path_tags_polling(self):
self.test_consume_file_with_path_tags()


@@ -22,7 +22,7 @@ class TestExportImport(DirectoriesMixin, TestCase):
self.target = tempfile.mkdtemp()
self.addCleanup(shutil.rmtree, self.target)
self.d1 = Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow1", filename="0000001.pdf", mime_type="application/pdf")
self.d1 = Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow1", filename="0000001.pdf", mime_type="application/pdf", archive_filename="0000001.pdf")
self.d2 = Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow2", filename="0000002.pdf", mime_type="application/pdf")
self.d3 = Document.objects.create(content="Content", checksum="d38d7ed02e988e072caf924e0f3fcb76", title="wow2", filename="0000003.pdf", mime_type="application/pdf")
self.d4 = Document.objects.create(content="Content", checksum="82186aaa94f0b98697d704b90fd1c072", title="wow_dec", filename="0000004.pdf.gpg", mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
@@ -69,7 +69,7 @@ class TestExportImport(DirectoriesMixin, TestCase):
manifest = self._do_export(use_filename_format=use_filename_format)
self.assertEqual(len(manifest), 7)
self.assertEqual(len(manifest), 8)
self.assertEqual(len(list(filter(lambda e: e['model'] == 'documents.document', manifest))), 4)
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))


@@ -11,14 +11,17 @@ class TestRetagger(DirectoriesMixin, TestCase):
self.d1 = Document.objects.create(checksum="A", title="A", content="first document")
self.d2 = Document.objects.create(checksum="B", title="B", content="second document")
self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document")
self.d4 = Document.objects.create(checksum="D", title="D", content="auto document")
self.tag_first = Tag.objects.create(name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY)
self.tag_second = Tag.objects.create(name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY)
self.tag_inbox = Tag.objects.create(name="test", is_inbox_tag=True)
self.tag_no_match = Tag.objects.create(name="test2")
self.tag_auto = Tag.objects.create(name="tagauto", matching_algorithm=Tag.MATCH_AUTO)
self.d3.tags.add(self.tag_inbox)
self.d3.tags.add(self.tag_no_match)
self.d4.tags.add(self.tag_auto)
self.correspondent_first = Correspondent.objects.create(
@@ -32,7 +35,8 @@ class TestRetagger(DirectoriesMixin, TestCase):
name="dt2", match="second", matching_algorithm=DocumentType.MATCH_ANY)
def get_updated_docs(self):
return Document.objects.get(title="A"), Document.objects.get(title="B"), Document.objects.get(title="C")
return Document.objects.get(title="A"), Document.objects.get(title="B"), \
Document.objects.get(title="C"), Document.objects.get(title="D")
def setUp(self) -> None:
super(TestRetagger, self).setUp()
@@ -40,25 +44,26 @@ class TestRetagger(DirectoriesMixin, TestCase):
def test_add_tags(self):
call_command('document_retagger', '--tags')
d_first, d_second, d_unrelated = self.get_updated_docs()
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.tags.count(), 1)
self.assertEqual(d_second.tags.count(), 1)
self.assertEqual(d_unrelated.tags.count(), 2)
self.assertEqual(d_auto.tags.count(), 1)
self.assertEqual(d_first.tags.first(), self.tag_first)
self.assertEqual(d_second.tags.first(), self.tag_second)
def test_add_type(self):
call_command('document_retagger', '--document_type')
d_first, d_second, d_unrelated = self.get_updated_docs()
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.document_type, self.doctype_first)
self.assertEqual(d_second.document_type, self.doctype_second)
def test_add_correspondent(self):
call_command('document_retagger', '--correspondent')
d_first, d_second, d_unrelated = self.get_updated_docs()
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.correspondent, self.correspondent_first)
self.assertEqual(d_second.correspondent, self.correspondent_second)
@@ -68,11 +73,55 @@ class TestRetagger(DirectoriesMixin, TestCase):
call_command('document_retagger', '--tags', '--overwrite')
d_first, d_second, d_unrelated = self.get_updated_docs()
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertIsNotNone(Tag.objects.get(id=self.tag_second.id))
self.assertCountEqual([tag.id for tag in d_first.tags.all()], [self.tag_first.id])
self.assertCountEqual([tag.id for tag in d_second.tags.all()], [self.tag_second.id])
self.assertCountEqual([tag.id for tag in d_unrelated.tags.all()], [self.tag_inbox.id, self.tag_no_match.id])
self.assertEqual(d_auto.tags.count(), 0)
def test_add_tags_suggest(self):
call_command('document_retagger', '--tags', '--suggest')
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.tags.count(), 0)
self.assertEqual(d_second.tags.count(), 0)
self.assertEqual(d_auto.tags.count(), 1)
def test_add_type_suggest(self):
call_command('document_retagger', '--document_type', '--suggest')
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.document_type, None)
self.assertEqual(d_second.document_type, None)
def test_add_correspondent_suggest(self):
call_command('document_retagger', '--correspondent', '--suggest')
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.correspondent, None)
self.assertEqual(d_second.correspondent, None)
def test_add_tags_suggest_url(self):
call_command('document_retagger', '--tags', '--suggest', '--base-url=http://localhost')
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.tags.count(), 0)
self.assertEqual(d_second.tags.count(), 0)
self.assertEqual(d_auto.tags.count(), 1)
def test_add_type_suggest_url(self):
call_command('document_retagger', '--document_type', '--suggest', '--base-url=http://localhost')
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.document_type, None)
self.assertEqual(d_second.document_type, None)
def test_add_correspondent_suggest_url(self):
call_command('document_retagger', '--correspondent', '--suggest', '--base-url=http://localhost')
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.correspondent, None)
self.assertEqual(d_second.correspondent, None)


@@ -0,0 +1,66 @@
import os
import shutil
from unittest import mock
from django.contrib.auth.models import User
from django.core.management import call_command
from django.test import TestCase
from documents.management.commands.document_thumbnails import _process_document
from documents.models import Document, Tag, Correspondent, DocumentType
from documents.tests.utils import DirectoriesMixin
class TestManageSuperUser(DirectoriesMixin, TestCase):
def reset_environment(self):
if "PAPERLESS_ADMIN_USER" in os.environ:
del os.environ["PAPERLESS_ADMIN_USER"]
if "PAPERLESS_ADMIN_PASSWORD" in os.environ:
del os.environ["PAPERLESS_ADMIN_PASSWORD"]
def setUp(self) -> None:
super().setUp()
self.reset_environment()
def tearDown(self) -> None:
super().tearDown()
self.reset_environment()
def test_no_user(self):
call_command("manage_superuser")
# just the consumer user.
self.assertEqual(User.objects.count(), 1)
self.assertTrue(User.objects.filter(username="consumer").exists())
def test_create(self):
os.environ["PAPERLESS_ADMIN_USER"] = "new_user"
os.environ["PAPERLESS_ADMIN_PASSWORD"] = "123456"
call_command("manage_superuser")
user: User = User.objects.get_by_natural_key("new_user")
self.assertTrue(user.check_password("123456"))
def test_update(self):
os.environ["PAPERLESS_ADMIN_USER"] = "new_user"
os.environ["PAPERLESS_ADMIN_PASSWORD"] = "123456"
call_command("manage_superuser")
os.environ["PAPERLESS_ADMIN_USER"] = "new_user"
os.environ["PAPERLESS_ADMIN_PASSWORD"] = "more_secure_pwd_7645"
call_command("manage_superuser")
user: User = User.objects.get_by_natural_key("new_user")
self.assertTrue(user.check_password("more_secure_pwd_7645"))
def test_no_password(self):
os.environ["PAPERLESS_ADMIN_USER"] = "new_user"
call_command("manage_superuser")
with self.assertRaises(User.DoesNotExist):
User.objects.get_by_natural_key("new_user")


@@ -0,0 +1,325 @@
import hashlib
import os
import shutil
from pathlib import Path
from unittest import mock
from django.conf import settings
from django.test import override_settings
from documents.parsers import ParseError
from documents.tests.utils import DirectoriesMixin, TestMigrations
STORAGE_TYPE_GPG = "gpg"
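# Note: the helpers below re-implement the pre- and post-migration path logic,
# because the historical model classes used in migration tests do not carry the
# Document model's custom path properties.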
def archive_name_from_filename(filename):
return os.path.splitext(filename)[0] + ".pdf"
def archive_path_old(self):
if self.filename:
fname = archive_name_from_filename(self.filename)
else:
fname = "{:07}.pdf".format(self.pk)
return os.path.join(
settings.ARCHIVE_DIR,
fname
)
def archive_path_new(doc):
if doc.archive_filename is not None:
return os.path.join(
settings.ARCHIVE_DIR,
str(doc.archive_filename)
)
else:
return None
def source_path(doc):
if doc.filename:
fname = str(doc.filename)
else:
fname = "{:07}{}".format(doc.pk, doc.file_type)
if doc.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg" # pragma: no cover
return os.path.join(
settings.ORIGINALS_DIR,
fname
)
def thumbnail_path(doc):
file_name = "{:07}.png".format(doc.pk)
if doc.storage_type == STORAGE_TYPE_GPG:
file_name += ".gpg"
return os.path.join(
settings.THUMBNAIL_DIR,
file_name
)
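# Test fixture helper: creates a document through the historical model class,
# copies the given sample files into the originals/archive directories and
# records their MD5 checksums, roughly mimicking what the consumer would do.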
def make_test_document(document_class, title: str, mime_type: str, original: str, original_filename: str, archive: str = None, archive_filename: str = None):
doc = document_class()
doc.filename = original_filename
doc.title = title
doc.mime_type = mime_type
doc.content = "the content, does not matter for this test"
doc.save()
shutil.copy2(original, source_path(doc))
with open(original, "rb") as f:
doc.checksum = hashlib.md5(f.read()).hexdigest()
if archive:
if archive_filename:
doc.archive_filename = archive_filename
shutil.copy2(archive, archive_path_new(doc))
else:
shutil.copy2(archive, archive_path_old(doc))
with open(archive, "rb") as f:
doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
doc.save()
Path(thumbnail_path(doc)).touch()
return doc
simple_jpg = os.path.join(os.path.dirname(__file__), "samples", "simple.jpg")
simple_pdf = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
simple_pdf2 = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf")
simple_pdf3 = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000003.pdf")
simple_txt = os.path.join(os.path.dirname(__file__), "samples", "simple.txt")
simple_png = os.path.join(os.path.dirname(__file__), "samples", "simple-noalpha.png")
simple_png2 = os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFiles(DirectoriesMixin, TestMigrations):
migrate_from = '1011_auto_20210101_2340'
migrate_to = '1012_fix_archive_files'
def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")
self.unrelated = make_test_document(Document, "unrelated", "application/pdf", simple_pdf3, "unrelated.pdf", simple_pdf)
self.no_text = make_test_document(Document, "no-text", "image/png", simple_png2, "no-text.png", simple_pdf)
self.doc_no_archive = make_test_document(Document, "no_archive", "text/plain", simple_txt, "no_archive.txt")
self.clash1 = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf)
self.clash2 = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf)
self.clash3 = make_test_document(Document, "clash", "image/png", simple_png, "clash.png", simple_pdf)
self.clash4 = make_test_document(Document, "clash.png", "application/pdf", simple_pdf2, "clash.png.pdf", simple_pdf2)
self.assertEqual(archive_path_old(self.clash1), archive_path_old(self.clash2))
self.assertEqual(archive_path_old(self.clash1), archive_path_old(self.clash3))
self.assertNotEqual(archive_path_old(self.clash1), archive_path_old(self.clash4))
def testArchiveFilesMigrated(self):
Document = self.apps.get_model('documents', 'Document')
for doc in Document.objects.all():
if doc.archive_checksum:
self.assertIsNotNone(doc.archive_filename)
self.assertTrue(os.path.isfile(archive_path_new(doc)))
else:
self.assertIsNone(doc.archive_filename)
with open(source_path(doc), "rb") as f:
original_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(original_checksum, doc.checksum)
if doc.archive_checksum:
self.assertTrue(os.path.isfile(archive_path_new(doc)))
with open(archive_path_new(doc), "rb") as f:
archive_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(archive_checksum, doc.archive_checksum)
self.assertEqual(Document.objects.filter(archive_checksum__isnull=False).count(), 6)
def test_filenames(self):
Document = self.apps.get_model('documents', 'Document')
self.assertEqual(Document.objects.get(id=self.unrelated.id).archive_filename, "unrelated.pdf")
self.assertEqual(Document.objects.get(id=self.no_text.id).archive_filename, "no-text.pdf")
self.assertEqual(Document.objects.get(id=self.doc_no_archive.id).archive_filename, None)
self.assertEqual(Document.objects.get(id=self.clash1.id).archive_filename, f"{self.clash1.id:07}.pdf")
self.assertEqual(Document.objects.get(id=self.clash2.id).archive_filename, f"{self.clash2.id:07}.pdf")
self.assertEqual(Document.objects.get(id=self.clash3.id).archive_filename, f"{self.clash3.id:07}.pdf")
self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf")
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
class TestMigrateArchiveFilesWithFilenameFormat(TestMigrateArchiveFiles):
def test_filenames(self):
Document = self.apps.get_model('documents', 'Document')
self.assertEqual(Document.objects.get(id=self.unrelated.id).archive_filename, "unrelated.pdf")
self.assertEqual(Document.objects.get(id=self.no_text.id).archive_filename, "no-text.pdf")
self.assertEqual(Document.objects.get(id=self.doc_no_archive.id).archive_filename, None)
self.assertEqual(Document.objects.get(id=self.clash1.id).archive_filename, "none/clash.pdf")
self.assertEqual(Document.objects.get(id=self.clash2.id).archive_filename, "none/clash_01.pdf")
self.assertEqual(Document.objects.get(id=self.clash3.id).archive_filename, "none/clash_02.pdf")
self.assertEqual(Document.objects.get(id=self.clash4.id).archive_filename, "clash.png.pdf")
def fake_parse_wrapper(parser, path, mime_type, file_name):
parser.archive_path = None
parser.text = "the text"
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesErrors(DirectoriesMixin, TestMigrations):
migrate_from = '1011_auto_20210101_2340'
migrate_to = '1012_fix_archive_files'
auto_migrate = False
def test_archive_missing(self):
Document = self.apps.get_model("documents", "Document")
doc = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf)
os.unlink(archive_path_old(doc))
self.assertRaisesMessage(ValueError, "does not exist at: ", self.performMigration)
def test_parser_missing(self):
Document = self.apps.get_model("documents", "Document")
doc1 = make_test_document(Document, "document", "invalid/typesss768", simple_png, "document.png", simple_pdf)
doc2 = make_test_document(Document, "document", "invalid/typesss768", simple_jpg, "document.jpg", simple_pdf)
self.assertRaisesMessage(ValueError, "no parsers are available", self.performMigration)
@mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
def test_parser_error(self, m):
m.side_effect = ParseError()
Document = self.apps.get_model("documents", "Document")
doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)
self.assertIsNotNone(doc1.archive_checksum)
self.assertIsNotNone(doc2.archive_checksum)
with self.assertLogs() as capture:
self.performMigration()
self.assertEqual(m.call_count, 6)
self.assertEqual(
len(list(filter(lambda log: "Parse error, will try again in 5 seconds" in log, capture.output))),
4)
self.assertEqual(
len(list(filter(lambda log: "Unable to regenerate archive document for ID:" in log, capture.output))),
2)
Document = self.apps.get_model("documents", "Document")
doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)
self.assertIsNone(doc1.archive_checksum)
self.assertIsNone(doc2.archive_checksum)
self.assertIsNone(doc1.archive_filename)
self.assertIsNone(doc2.archive_filename)
@mock.patch("documents.migrations.1012_fix_archive_files.parse_wrapper")
def test_parser_no_archive(self, m):
m.side_effect = fake_parse_wrapper
Document = self.apps.get_model("documents", "Document")
doc1 = make_test_document(Document, "document", "image/png", simple_png, "document.png", simple_pdf)
doc2 = make_test_document(Document, "document", "application/pdf", simple_jpg, "document.jpg", simple_pdf)
with self.assertLogs() as capture:
self.performMigration()
self.assertEqual(
len(list(filter(lambda log: "Parser did not return an archive document for document" in log, capture.output))),
2)
Document = self.apps.get_model("documents", "Document")
doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)
self.assertIsNone(doc1.archive_checksum)
self.assertIsNone(doc2.archive_checksum)
self.assertIsNone(doc1.archive_filename)
self.assertIsNone(doc2.archive_filename)
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesBackwards(DirectoriesMixin, TestMigrations):
migrate_from = '1012_fix_archive_files'
migrate_to = '1011_auto_20210101_2340'
def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")
doc_unrelated = make_test_document(Document, "unrelated", "application/pdf", simple_pdf2, "unrelated.txt", simple_pdf2, "unrelated.pdf")
doc_no_archive = make_test_document(Document, "no_archive", "text/plain", simple_txt, "no_archive.txt")
clashB = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf, "clash_02.pdf")
def testArchiveFilesReverted(self):
Document = self.apps.get_model('documents', 'Document')
for doc in Document.objects.all():
if doc.archive_checksum:
self.assertTrue(os.path.isfile(archive_path_old(doc)))
with open(source_path(doc), "rb") as f:
original_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(original_checksum, doc.checksum)
if doc.archive_checksum:
self.assertTrue(os.path.isfile(archive_path_old(doc)))
with open(archive_path_old(doc), "rb") as f:
archive_checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(archive_checksum, doc.archive_checksum)
self.assertEqual(Document.objects.filter(archive_checksum__isnull=False).count(), 2)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
class TestMigrateArchiveFilesBackwardsWithFilenameFormat(TestMigrateArchiveFilesBackwards):
pass
@override_settings(PAPERLESS_FILENAME_FORMAT="")
class TestMigrateArchiveFilesBackwardsErrors(DirectoriesMixin, TestMigrations):
migrate_from = '1012_fix_archive_files'
migrate_to = '1011_auto_20210101_2340'
auto_migrate = False
def test_filename_clash(self):
Document = self.apps.get_model("documents", "Document")
self.clashA = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf, "clash_02.pdf")
self.clashB = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf, "clash_01.pdf")
self.assertRaisesMessage(ValueError, "would clash with another archive filename", self.performMigration)
def test_filename_exists(self):
Document = self.apps.get_model("documents", "Document")
self.clashA = make_test_document(Document, "clash", "application/pdf", simple_pdf, "clash.pdf", simple_pdf, "clash.pdf")
self.clashB = make_test_document(Document, "clash", "image/jpeg", simple_jpg, "clash.jpg", simple_pdf, "clash_01.pdf")
self.assertRaisesMessage(ValueError, "file already exists.", self.performMigration)


@@ -1,52 +1,11 @@
import os
import shutil
from pathlib import Path
from django.apps import apps
from django.conf import settings
from django.db import connection
from django.db.migrations.executor import MigrationExecutor
from django.test import TestCase, TransactionTestCase, override_settings
from django.test import override_settings
from documents.models import Document
from documents.parsers import get_default_file_extension
from documents.tests.utils import DirectoriesMixin
class TestMigrations(TransactionTestCase):
@property
def app(self):
return apps.get_containing_app_config(type(self).__module__).name
migrate_from = None
migrate_to = None
def setUp(self):
super(TestMigrations, self).setUp()
assert self.migrate_from and self.migrate_to, \
"TestCase '{}' must define migrate_from and migrate_to properties".format(type(self).__name__)
self.migrate_from = [(self.app, self.migrate_from)]
self.migrate_to = [(self.app, self.migrate_to)]
executor = MigrationExecutor(connection)
old_apps = executor.loader.project_state(self.migrate_from).apps
# Reverse to the original migration
executor.migrate(self.migrate_from)
self.setUpBeforeMigration(old_apps)
# Run the migration to test
executor = MigrationExecutor(connection)
executor.loader.build_graph() # reload.
executor.migrate(self.migrate_to)
self.apps = executor.loader.project_state(self.migrate_to).apps
def setUpBeforeMigration(self, apps):
pass
from documents.tests.utils import DirectoriesMixin, TestMigrations
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"


@@ -0,0 +1,15 @@
from documents.tests.utils import DirectoriesMixin, TestMigrations
class TestMigrateNullCharacters(DirectoriesMixin, TestMigrations):
migrate_from = '1014_auto_20210228_1614'
migrate_to = '1015_remove_null_characters'
def setUpBeforeMigration(self, apps):
Document = apps.get_model("documents", "Document")
self.doc = Document.objects.create(content="aaa\0bbb")
def testMimeTypesMigrated(self):
Document = self.apps.get_model('documents', 'Document')
self.assertNotIn("\0", Document.objects.get(id=self.doc.id).content)


@@ -0,0 +1,37 @@
from documents.tests.utils import DirectoriesMixin, TestMigrations
class TestMigrateTagColor(DirectoriesMixin, TestMigrations):
migrate_from = '1012_fix_archive_files'
migrate_to = '1013_migrate_tag_colour'
def setUpBeforeMigration(self, apps):
Tag = apps.get_model("documents", "Tag")
self.t1_id = Tag.objects.create(name="tag1").id
self.t2_id = Tag.objects.create(name="tag2", colour=1).id
self.t3_id = Tag.objects.create(name="tag3", colour=5).id
def testMimeTypesMigrated(self):
Tag = self.apps.get_model('documents', 'Tag')
self.assertEqual(Tag.objects.get(id=self.t1_id).color, "#a6cee3")
self.assertEqual(Tag.objects.get(id=self.t2_id).color, "#a6cee3")
self.assertEqual(Tag.objects.get(id=self.t3_id).color, "#fb9a99")
class TestMigrateTagColorBackwards(DirectoriesMixin, TestMigrations):
migrate_from = '1013_migrate_tag_colour'
migrate_to = '1012_fix_archive_files'
def setUpBeforeMigration(self, apps):
Tag = apps.get_model("documents", "Tag")
self.t1_id = Tag.objects.create(name="tag1").id
self.t2_id = Tag.objects.create(name="tag2", color="#cab2d6").id
self.t3_id = Tag.objects.create(name="tag3", color="#123456").id
def testMimeTypesReverted(self):
Tag = self.apps.get_model('documents', 'Tag')
self.assertEqual(Tag.objects.get(id=self.t1_id).colour, 1)
self.assertEqual(Tag.objects.get(id=self.t2_id).colour, 9)
self.assertEqual(Tag.objects.get(id=self.t3_id).colour, 1)


@@ -68,7 +68,7 @@ class TestParserDiscovery(TestCase):
)
def fake_get_thumbnail(self, path, mimetype):
def fake_get_thumbnail(self, path, mimetype, file_name):
return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
@@ -89,15 +89,15 @@ class TestBaseParser(TestCase):
def test_get_optimised_thumbnail(self):
parser = DocumentParser(None)
parser.get_optimised_thumbnail("any", "not important")
parser.get_optimised_thumbnail("any", "not important", "document.pdf")
@mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
@override_settings(OPTIMIZE_THUMBNAILS=False)
def test_get_optimised_thumb_disabled(self):
parser = DocumentParser(None)
path = parser.get_optimised_thumbnail("any", "not important")
self.assertEqual(path, fake_get_thumbnail(None, None, None))
path = parser.get_optimised_thumbnail("any", "not important", "document.pdf")
self.assertEqual(path, fake_get_thumbnail(None, None, None, None))
class TestParserAvailability(TestCase):
@@ -114,8 +114,8 @@ class TestParserAvailability(TestCase):
self.assertEqual(get_default_file_extension('application/zip'), ".zip")
self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), "")
self.assertEqual(get_parser_class_for_mime_type('application/pdf'), RasterisedDocumentParser)
self.assertEqual(get_parser_class_for_mime_type('text/plain'), TextDocumentParser)
self.assertIsInstance(get_parser_class_for_mime_type('application/pdf')(logging_group=None), RasterisedDocumentParser)
self.assertIsInstance(get_parser_class_for_mime_type('text/plain')(logging_group=None), TextDocumentParser)
self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None)
self.assertTrue(is_file_ext_supported('.pdf'))


@@ -1,3 +1,4 @@
import logging
import os
import shutil
from pathlib import Path
@@ -7,10 +8,59 @@ from django.conf import settings
from django.test import TestCase
from documents.models import Document
from documents.sanity_checker import check_sanity, SanityFailedError
from documents.sanity_checker import check_sanity, SanityCheckMessages
from documents.tests.utils import DirectoriesMixin
class TestSanityCheckMessages(TestCase):
def test_no_messages(self):
messages = SanityCheckMessages()
self.assertEqual(len(messages), 0)
self.assertFalse(messages.has_error())
self.assertFalse(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.INFO)
self.assertEqual(capture.records[0].message, "Sanity checker detected no issues.")
def test_info(self):
messages = SanityCheckMessages()
messages.info("Something might be wrong")
self.assertEqual(len(messages), 1)
self.assertFalse(messages.has_error())
self.assertFalse(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.INFO)
self.assertEqual(capture.records[0].message, "Something might be wrong")
def test_warning(self):
messages = SanityCheckMessages()
messages.warning("Something is wrong")
self.assertEqual(len(messages), 1)
self.assertFalse(messages.has_error())
self.assertTrue(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.WARNING)
self.assertEqual(capture.records[0].message, "Something is wrong")
def test_error(self):
messages = SanityCheckMessages()
messages.error("Something is seriously wrong")
self.assertEqual(len(messages), 1)
self.assertTrue(messages.has_error())
self.assertFalse(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.ERROR)
self.assertEqual(capture.records[0].message, "Something is seriously wrong")
class TestSanityCheck(DirectoriesMixin, TestCase):
def make_test_data(self):
@@ -21,7 +71,12 @@ class TestSanityCheck(DirectoriesMixin, TestCase):
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf"), os.path.join(self.dirs.archive_dir, "0000001.pdf"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), os.path.join(self.dirs.thumbnail_dir, "0000001.png"))
return Document.objects.create(title="test", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", content="test", pk=1, filename="0000001.pdf", mime_type="application/pdf")
return Document.objects.create(title="test", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", content="test", pk=1, filename="0000001.pdf", mime_type="application/pdf", archive_filename="0000001.pdf")
def assertSanityError(self, messageRegex):
messages = check_sanity()
self.assertTrue(messages.has_error())
self.assertRegex(messages[0]['message'], messageRegex)
def test_no_docs(self):
self.assertEqual(len(check_sanity()), 0)
@@ -33,59 +88,75 @@ class TestSanityCheck(DirectoriesMixin, TestCase):
def test_no_thumbnail(self):
doc = self.make_test_data()
os.remove(doc.thumbnail_path)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Thumbnail of document .* does not exist")
def test_thumbnail_no_access(self):
doc = self.make_test_data()
os.chmod(doc.thumbnail_path, 0o000)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Cannot read thumbnail file of document")
os.chmod(doc.thumbnail_path, 0o777)
def test_no_original(self):
doc = self.make_test_data()
os.remove(doc.source_path)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Original of document .* does not exist.")
def test_original_no_access(self):
doc = self.make_test_data()
os.chmod(doc.source_path, 0o000)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Cannot read original file of document")
os.chmod(doc.source_path, 0o777)
def test_original_checksum_mismatch(self):
doc = self.make_test_data()
doc.checksum = "WOW"
doc.save()
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Checksum mismatch of document")
def test_no_archive(self):
doc = self.make_test_data()
os.remove(doc.archive_path)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Archived version of document .* does not exist.")
def test_archive_no_access(self):
doc = self.make_test_data()
os.chmod(doc.archive_path, 0o000)
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Cannot read archive file of document")
os.chmod(doc.archive_path, 0o777)
def test_archive_checksum_mismatch(self):
doc = self.make_test_data()
doc.archive_checksum = "WOW"
doc.save()
self.assertEqual(len(check_sanity()), 1)
self.assertSanityError("Checksum mismatch of archived document")
def test_empty_content(self):
doc = self.make_test_data()
doc.content = ""
doc.save()
self.assertEqual(len(check_sanity()), 1)
messages = check_sanity()
self.assertFalse(messages.has_error())
self.assertFalse(messages.has_warning())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Document .* has no content.")
def test_orphaned_file(self):
doc = self.make_test_data()
Path(self.dirs.originals_dir, "orphaned").touch()
self.assertEqual(len(check_sanity()), 1)
messages = check_sanity()
self.assertFalse(messages.has_error())
self.assertTrue(messages.has_warning())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Orphaned file in media dir")
def test_all(self):
Document.objects.create(title="test", checksum="dgfhj", archive_checksum="dfhg", content="", pk=1, filename="0000001.pdf")
string = str(SanityFailedError(check_sanity()))
def test_archive_filename_no_checksum(self):
doc = self.make_test_data()
doc.archive_checksum = None
doc.save()
self.assertSanityError("has an archive file, but its checksum is missing.")
def test_archive_checksum_no_filename(self):
doc = self.make_test_data()
doc.archive_filename = None
doc.save()
self.assertSanityError("has an archive file checksum, but no archive filename.")


@@ -20,7 +20,7 @@ class TestSettings(TestCase):
self.assertEqual(default_threads, 1)
def test_workers_threads(self):
for i in range(2, 64):
for i in range(1, 64):
with mock.patch("paperless.settings.multiprocessing.cpu_count") as cpu_count:
cpu_count.return_value = i
@@ -31,4 +31,4 @@ class TestSettings(TestCase):
self.assertTrue(default_workers >= 1)
self.assertTrue(default_threads >= 1)
self.assertTrue(default_workers * default_threads < i, f"{i}")
self.assertTrue(default_workers * default_threads <= i, f"{i}")


@@ -1,12 +1,13 @@
from datetime import datetime
import os
from unittest import mock
from django.conf import settings
from django.test import TestCase
from django.utils import timezone
from documents import tasks
from documents.models import Document
from documents.sanity_checker import SanityError, SanityFailedError
from documents.models import Document, Tag, Correspondent, DocumentType
from documents.sanity_checker import SanityCheckMessages, SanityCheckFailedException
from documents.tests.utils import DirectoriesMixin
@@ -22,20 +23,87 @@ class TestTasks(DirectoriesMixin, TestCase):
tasks.index_optimize()
def test_train_classifier(self):
@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_no_auto_matching(self, load_classifier):
tasks.train_classifier()
load_classifier.assert_not_called()
@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_with_auto_tag(self, load_classifier):
load_classifier.return_value = None
Tag.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
tasks.train_classifier()
load_classifier.assert_called_once()
self.assertFalse(os.path.isfile(settings.MODEL_FILE))
@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_with_auto_type(self, load_classifier):
load_classifier.return_value = None
DocumentType.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
tasks.train_classifier()
load_classifier.assert_called_once()
self.assertFalse(os.path.isfile(settings.MODEL_FILE))
@mock.patch("documents.tasks.load_classifier")
def test_train_classifier_with_auto_correspondent(self, load_classifier):
load_classifier.return_value = None
Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
tasks.train_classifier()
load_classifier.assert_called_once()
self.assertFalse(os.path.isfile(settings.MODEL_FILE))
def test_train_classifier(self):
c = Correspondent.objects.create(matching_algorithm=Tag.MATCH_AUTO, name="test")
doc = Document.objects.create(correspondent=c, content="test", title="test")
self.assertFalse(os.path.isfile(settings.MODEL_FILE))
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime = os.stat(settings.MODEL_FILE).st_mtime
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime2 = os.stat(settings.MODEL_FILE).st_mtime
self.assertEqual(mtime, mtime2)
doc.content = "test2"
doc.save()
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime3 = os.stat(settings.MODEL_FILE).st_mtime
self.assertNotEqual(mtime2, mtime3)
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check(self, m):
m.return_value = []
tasks.sanity_check()
m.assert_called_once()
m.reset_mock()
m.return_value = [SanityError("")]
self.assertRaises(SanityFailedError, tasks.sanity_check)
def test_sanity_check_success(self, m):
m.return_value = SanityCheckMessages()
self.assertEqual(tasks.sanity_check(), "No issues detected.")
m.assert_called_once()
def test_culk_update_documents(self):
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_error(self, m):
messages = SanityCheckMessages()
messages.error("Some error")
m.return_value = messages
self.assertRaises(SanityCheckFailedException, tasks.sanity_check)
m.assert_called_once()
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_warning(self, m):
messages = SanityCheckMessages()
messages.warning("Some warning")
m.return_value = messages
self.assertEqual(tasks.sanity_check(), "Sanity check exited with warnings. See log.")
m.assert_called_once()
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_info(self, m):
messages = SanityCheckMessages()
messages.info("Some info")
m.return_value = messages
self.assertEqual(tasks.sanity_check(), "Sanity check exited with infos. See log.")
m.assert_called_once()
def test_bulk_update_documents(self):
doc1 = Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(),
created=timezone.now(), modified=timezone.now())


@@ -15,7 +15,7 @@ class TestViews(TestCase):
def test_index(self):
self.client.force_login(self.user)
for (language_given, language_actual) in [("", "en-US"), ("en-US", "en-US"), ("de", "de"), ("en", "en-US"), ("en-us", "en-US"), ("fr", "fr"), ("jp", "en-US")]:
for (language_given, language_actual) in [("", "en-US"), ("en-US", "en-US"), ("de", "de-DE"), ("en", "en-US"), ("en-us", "en-US"), ("fr", "fr-FR"), ("jp", "en-US")]:
if language_given:
self.client.cookies.load({settings.LANGUAGE_COOKIE_NAME: language_given})
elif settings.LANGUAGE_COOKIE_NAME in self.client.cookies.keys():


@@ -4,7 +4,10 @@ import tempfile
from collections import namedtuple
from contextlib import contextmanager
from django.test import override_settings
from django.apps import apps
from django.db import connection
from django.db.migrations.executor import MigrationExecutor
from django.test import override_settings, TransactionTestCase
def setup_directories():
@@ -19,12 +22,15 @@ def setup_directories():
dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals")
dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails")
dirs.archive_dir = os.path.join(dirs.media_dir, "documents", "archive")
dirs.logging_dir = os.path.join(dirs.data_dir, "log")
os.makedirs(dirs.index_dir, exist_ok=True)
os.makedirs(dirs.originals_dir, exist_ok=True)
os.makedirs(dirs.thumbnail_dir, exist_ok=True)
os.makedirs(dirs.archive_dir, exist_ok=True)
os.makedirs(dirs.logging_dir, exist_ok=True)
dirs.settings_override = override_settings(
DATA_DIR=dirs.data_dir,
SCRATCH_DIR=dirs.scratch_dir,
@@ -33,6 +39,7 @@ def setup_directories():
THUMBNAIL_DIR=dirs.thumbnail_dir,
ARCHIVE_DIR=dirs.archive_dir,
CONSUMPTION_DIR=dirs.consumption_dir,
LOGGING_DIR=dirs.logging_dir,
INDEX_DIR=dirs.index_dir,
MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle"),
MEDIA_LOCK=os.path.join(dirs.media_dir, "media.lock")
@@ -75,3 +82,45 @@ class DirectoriesMixin:
def tearDown(self) -> None:
super(DirectoriesMixin, self).tearDown()
remove_dirs(self.dirs)
class TestMigrations(TransactionTestCase):
@property
def app(self):
return apps.get_containing_app_config(type(self).__module__).name
migrate_from = None
migrate_to = None
auto_migrate = True
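# setUp() migrates the schema back to migrate_from, passes the historical app
# registry to setUpBeforeMigration() for creating fixture data, and then applies
# migrate_to unless auto_migrate is False (in which case tests call
# performMigration() themselves, e.g. to assert that it raises).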
def setUp(self):
super(TestMigrations, self).setUp()
assert self.migrate_from and self.migrate_to, \
"TestCase '{}' must define migrate_from and migrate_to properties".format(type(self).__name__)
self.migrate_from = [(self.app, self.migrate_from)]
self.migrate_to = [(self.app, self.migrate_to)]
executor = MigrationExecutor(connection)
old_apps = executor.loader.project_state(self.migrate_from).apps
# Reverse to the original migration
executor.migrate(self.migrate_from)
self.setUpBeforeMigration(old_apps)
self.apps = old_apps
if self.auto_migrate:
self.performMigration()
def performMigration(self):
# Run the migration to test
executor = MigrationExecutor(connection)
executor.loader.build_graph() # reload.
executor.migrate(self.migrate_to)
self.apps = executor.loader.project_state(self.migrate_to).apps
def setUpBeforeMigration(self, apps):
pass

403
src/documents/views.py Executable file → Normal file

@@ -1,6 +1,8 @@
import logging
import os
import tempfile
import uuid
import zipfile
from datetime import datetime
from time import mktime
@@ -15,7 +17,9 @@ from django_filters.rest_framework import DjangoFilterBackend
from django_q.tasks import async_task
from rest_framework import parsers
from rest_framework.decorators import action
from rest_framework.exceptions import NotFound
from rest_framework.filters import OrderingFilter, SearchFilter
from rest_framework.generics import GenericAPIView
from rest_framework.mixins import (
DestroyModelMixin,
ListModelMixin,
@@ -28,33 +32,40 @@ from rest_framework.views import APIView
from rest_framework.viewsets import (
GenericViewSet,
ModelViewSet,
ReadOnlyModelViewSet
ViewSet
)
import documents.index as index
from paperless.db import GnuPG
from paperless.views import StandardPagination
from .bulk_download import OriginalAndArchiveStrategy, OriginalsOnlyStrategy, \
ArchiveOnlyStrategy
from .classifier import load_classifier
from .filters import (
CorrespondentFilterSet,
DocumentFilterSet,
TagFilterSet,
DocumentTypeFilterSet,
LogFilterSet
DocumentTypeFilterSet
)
from .models import Correspondent, Document, Log, Tag, DocumentType, SavedView
from .matching import match_correspondents, match_tags, match_document_types
from .models import Correspondent, Document, Tag, DocumentType, SavedView
from .parsers import get_parser_class_for_mime_type
from .serialisers import (
CorrespondentSerializer,
DocumentSerializer,
LogSerializer,
TagSerializerVersion1,
TagSerializer,
DocumentTypeSerializer,
PostDocumentSerializer,
SavedViewSerializer,
BulkEditSerializer, SelectionDataSerializer
BulkEditSerializer,
DocumentListSerializer,
BulkDownloadSerializer
)
logger = logging.getLogger("paperless.api")
class IndexView(TemplateView):
template_name = "index.html"
@@ -81,6 +92,7 @@ class IndexView(TemplateView):
context['polyfills_js'] = f"frontend/{self.get_language()}/polyfills.js" # NOQA: E501
context['main_js'] = f"frontend/{self.get_language()}/main.js"
context['webmanifest'] = f"frontend/{self.get_language()}/manifest.webmanifest" # NOQA: E501
context['apple_touch_icon'] = f"frontend/{self.get_language()}/apple-touch-icon.png" # NOQA: E501
return context
@@ -110,7 +122,12 @@ class TagViewSet(ModelViewSet):
queryset = Tag.objects.annotate(
document_count=Count('documents')).order_by(Lower('name'))
serializer_class = TagSerializer
def get_serializer_class(self):
if int(self.request.version) == 1:
return TagSerializerVersion1
else:
return TagSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
@@ -132,10 +149,6 @@ class DocumentTypeViewSet(ModelViewSet):
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
class BulkEditForm(object):
pass
class DocumentViewSet(RetrieveModelMixin,
UpdateModelMixin,
DestroyModelMixin,
@@ -159,6 +172,9 @@ class DocumentViewSet(RetrieveModelMixin,
"added",
"archive_serial_number")
def get_queryset(self):
return Document.objects.distinct()
def get_serializer(self, *args, **kwargs):
fields_param = self.request.query_params.get('fields', None)
if fields_param:
@@ -173,10 +189,12 @@ class DocumentViewSet(RetrieveModelMixin,
def update(self, request, *args, **kwargs):
response = super(DocumentViewSet, self).update(
request, *args, **kwargs)
from documents import index
index.add_or_update_document(self.get_object())
return response
def destroy(self, request, *args, **kwargs):
from documents import index
index.remove_document_from_index(self.get_object())
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
@@ -189,7 +207,7 @@ class DocumentViewSet(RetrieveModelMixin,
def file_response(self, pk, request, disposition):
doc = Document.objects.get(id=pk)
if not self.original_requested(request) and os.path.isfile(doc.archive_path): # NOQA: E501
if not self.original_requested(request) and doc.has_archive_version: # NOQA: E501
file_handle = doc.archive_file
filename = doc.get_public_filename(archive=True)
mime_type = 'application/pdf'
@@ -212,7 +230,7 @@ class DocumentViewSet(RetrieveModelMixin,
parser_class = get_parser_class_for_mime_type(mime_type)
if parser_class:
parser = parser_class(logging_group=None)
parser = parser_class(progress_callback=None, logging_group=None)
try:
return parser.extract_metadata(file, mime_type)
@@ -222,35 +240,60 @@ class DocumentViewSet(RetrieveModelMixin,
else:
return []
def get_filesize(self, filename):
if os.path.isfile(filename):
return os.stat(filename).st_size
else:
return None
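# metadata() reports archive details from the stored archive_filename and uses
# get_filesize() so that a missing file yields None instead of an exception.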
@action(methods=['get'], detail=True)
def metadata(self, request, pk=None):
try:
doc = Document.objects.get(pk=pk)
meta = {
"original_checksum": doc.checksum,
"original_size": os.stat(doc.source_path).st_size,
"original_mime_type": doc.mime_type,
"media_filename": doc.filename,
"has_archive_version": os.path.isfile(doc.archive_path),
"original_metadata": self.get_metadata(
doc.source_path, doc.mime_type)
}
if doc.archive_checksum and os.path.isfile(doc.archive_path):
meta['archive_checksum'] = doc.archive_checksum
meta['archive_size'] = os.stat(doc.archive_path).st_size,
meta['archive_metadata'] = self.get_metadata(
doc.archive_path, "application/pdf")
else:
meta['archive_checksum'] = None
meta['archive_size'] = None
meta['archive_metadata'] = None
return Response(meta)
except Document.DoesNotExist:
raise Http404()
meta = {
"original_checksum": doc.checksum,
"original_size": self.get_filesize(doc.source_path),
"original_mime_type": doc.mime_type,
"media_filename": doc.filename,
"has_archive_version": doc.has_archive_version,
"original_metadata": self.get_metadata(
doc.source_path, doc.mime_type),
"archive_checksum": doc.archive_checksum,
"archive_media_filename": doc.archive_filename
}
if doc.has_archive_version:
meta['archive_size'] = self.get_filesize(doc.archive_path)
meta['archive_metadata'] = self.get_metadata(
doc.archive_path, "application/pdf")
else:
meta['archive_size'] = None
meta['archive_metadata'] = None
return Response(meta)
@action(methods=['get'], detail=True)
def suggestions(self, request, pk=None):
try:
doc = Document.objects.get(pk=pk)
except Document.DoesNotExist:
raise Http404()
classifier = load_classifier()
return Response({
"correspondents": [
c.id for c in match_correspondents(doc, classifier)
],
"tags": [t.id for t in match_tags(doc, classifier)],
"document_types": [
dt.id for dt in match_document_types(doc, classifier)
]
})
@action(methods=['get'], detail=True)
def preview(self, request, pk=None):
try:
@@ -269,6 +312,8 @@ class DocumentViewSet(RetrieveModelMixin,
handle = GnuPG.decrypted(doc.thumbnail_file)
else:
handle = doc.thumbnail_file
# TODO: Send ETag information and use that to send new thumbnails
# if available
return HttpResponse(handle,
content_type='image/png')
except (FileNotFoundError, Document.DoesNotExist):
@@ -283,16 +328,92 @@ class DocumentViewSet(RetrieveModelMixin,
raise Http404()
class LogViewSet(ReadOnlyModelViewSet):
model = Log
class SearchResultSerializer(DocumentSerializer):
def to_representation(self, instance):
doc = Document.objects.get(id=instance['id'])
r = super(SearchResultSerializer, self).to_representation(doc)
r['__search_hit__'] = {
"score": instance.score,
"highlights": instance.highlights("content",
text=doc.content) if doc else None, # NOQA: E501
"rank": instance.rank
}
return r
class UnifiedSearchViewSet(DocumentViewSet):
def __init__(self, *args, **kwargs):
super(UnifiedSearchViewSet, self).__init__(*args, **kwargs)
self.searcher = None
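# The index searcher is opened in list() and kept on the instance so that the
# delayed query objects returned by filter_queryset() can page through results
# while it is still open.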
def get_serializer_class(self):
if self._is_search_request():
return SearchResultSerializer
else:
return DocumentSerializer
def _is_search_request(self):
return ("query" in self.request.query_params or
"more_like_id" in self.request.query_params)
def filter_queryset(self, queryset):
if self._is_search_request():
from documents import index
if "query" in self.request.query_params:
query_class = index.DelayedFullTextQuery
elif "more_like_id" in self.request.query_params:
query_class = index.DelayedMoreLikeThisQuery
else:
raise ValueError()
return query_class(
self.searcher,
self.request.query_params,
self.paginator.get_page_size(self.request))
else:
return super(UnifiedSearchViewSet, self).filter_queryset(queryset)
def list(self, request, *args, **kwargs):
if self._is_search_request():
from documents import index
try:
with index.open_index_searcher() as s:
self.searcher = s
return super(UnifiedSearchViewSet, self).list(request)
except NotFound:
raise
except Exception as e:
return HttpResponseBadRequest(str(e))
else:
return super(UnifiedSearchViewSet, self).list(request)
class LogViewSet(ViewSet):
queryset = Log.objects.all()
serializer_class = LogSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filterset_class = LogFilterSet
ordering_fields = ("created",)
log_files = ["paperless", "mail"]
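# Serves the on-disk log files from LOGGING_DIR instead of database-backed Log
# entries: list() returns the available file names, retrieve() returns the
# lines of "<pk>.log".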
def retrieve(self, request, pk=None, *args, **kwargs):
if pk not in self.log_files:
raise Http404()
filename = os.path.join(settings.LOGGING_DIR, f"{pk}.log")
if not os.path.isfile(filename):
raise Http404()
with open(filename, "r") as f:
lines = [line.rstrip() for line in f.readlines()]
return Response(lines)
def list(self, request, *args, **kwargs):
return Response(self.log_files)
class SavedViewViewSet(ModelViewSet):
@@ -311,23 +432,12 @@ class SavedViewViewSet(ModelViewSet):
serializer.save(user=self.request.user)
class BulkEditView(APIView):
class BulkEditView(GenericAPIView):
permission_classes = (IsAuthenticated,)
serializer_class = BulkEditSerializer
parser_classes = (parsers.JSONParser,)
def get_serializer_context(self):
return {
'request': self.request,
'format': self.format_kwarg,
'view': self
}
def get_serializer(self, *args, **kwargs):
kwargs['context'] = self.get_serializer_context()
return self.serializer_class(*args, **kwargs)
def post(self, request, *args, **kwargs):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
@@ -344,23 +454,12 @@ class BulkEditView(APIView):
return HttpResponseBadRequest(str(e))
class PostDocumentView(APIView):
class PostDocumentView(GenericAPIView):
permission_classes = (IsAuthenticated,)
serializer_class = PostDocumentSerializer
parser_classes = (parsers.MultiPartParser,)
def get_serializer_context(self):
return {
'request': self.request,
'format': self.format_kwarg,
'view': self
}
def get_serializer(self, *args, **kwargs):
kwargs['context'] = self.get_serializer_context()
return self.serializer_class(*args, **kwargs)
def post(self, request, *args, **kwargs):
serializer = self.get_serializer(data=request.data)
@@ -381,35 +480,29 @@ class PostDocumentView(APIView):
delete=False) as f:
f.write(doc_data)
os.utime(f.name, times=(t, t))
temp_filename = f.name
task_id = str(uuid.uuid4())
async_task("documents.tasks.consume_file",
temp_filename,
override_filename=doc_name,
override_title=title,
override_correspondent_id=correspondent_id,
override_document_type_id=document_type_id,
override_tag_ids=tag_ids,
task_id=task_id,
task_name=os.path.basename(doc_name)[:100])
async_task("documents.tasks.consume_file",
f.name,
override_filename=doc_name,
override_title=title,
override_correspondent_id=correspondent_id,
override_document_type_id=document_type_id,
override_tag_ids=tag_ids,
task_name=os.path.basename(doc_name)[:100])
return Response("OK")
class SelectionDataView(APIView):
class SelectionDataView(GenericAPIView):
permission_classes = (IsAuthenticated,)
serializer_class = SelectionDataSerializer
serializer_class = DocumentListSerializer
parser_classes = (parsers.MultiPartParser, parsers.JSONParser)
def get_serializer_context(self):
return {
'request': self.request,
'format': self.format_kwarg,
'view': self
}
def get_serializer(self, *args, **kwargs):
kwargs['context'] = self.get_serializer_context()
return self.serializer_class(*args, **kwargs)
def post(self, request, format=None):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
@@ -450,83 +543,10 @@ class SelectionDataView(APIView):
return r
class SearchView(APIView):
permission_classes = (IsAuthenticated,)
def __init__(self, *args, **kwargs):
super(SearchView, self).__init__(*args, **kwargs)
self.ix = index.open_index()
def add_infos_to_hit(self, r):
try:
doc = Document.objects.get(id=r['id'])
except Document.DoesNotExist:
logging.getLogger(__name__).warning(
f"Search index returned a non-existing document: "
f"id: {r['id']}, title: {r['title']}. "
f"Search index needs reindex."
)
doc = None
return {'id': r['id'],
'highlights': r.highlights("content", text=doc.content) if doc else None, # NOQA: E501
'score': r.score,
'rank': r.rank,
'document': DocumentSerializer(doc).data if doc else None,
'title': r['title']
}
def get(self, request, format=None):
if 'query' in request.query_params:
query = request.query_params['query']
else:
query = None
if 'more_like' in request.query_params:
more_like_id = request.query_params['more_like']
more_like_content = Document.objects.get(id=more_like_id).content
else:
more_like_id = None
more_like_content = None
if not query and not more_like_id:
return Response({
'count': 0,
'page': 0,
'page_count': 0,
'corrected_query': None,
'results': []})
try:
page = int(request.query_params.get('page', 1))
except (ValueError, TypeError):
page = 1
if page < 1:
page = 1
try:
with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
'page_count': result_page.pagecount,
'corrected_query': corrected_query,
'results': list(map(self.add_infos_to_hit, result_page))})
except Exception as e:
return HttpResponseBadRequest(str(e))
class SearchAutoCompleteView(APIView):
permission_classes = (IsAuthenticated,)
def __init__(self, *args, **kwargs):
super(SearchAutoCompleteView, self).__init__(*args, **kwargs)
self.ix = index.open_index()
def get(self, request, format=None):
if 'term' in request.query_params:
term = request.query_params['term']
@@ -540,7 +560,11 @@ class SearchAutoCompleteView(APIView):
else:
limit = 10
return Response(index.autocomplete(self.ix, term, limit))
from documents import index
ix = index.open_index()
return Response(index.autocomplete(ix, term, limit))
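# Reports the total document count and, when an inbox tag is configured, the
# number of documents carrying it; without an inbox tag the inbox count is
# returned as null.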
class StatisticsView(APIView):
@@ -548,8 +572,55 @@ class StatisticsView(APIView):
permission_classes = (IsAuthenticated,)
def get(self, request, format=None):
return Response({
'documents_total': Document.objects.all().count(),
'documents_inbox': Document.objects.filter(
documents_total = Document.objects.all().count()
if Tag.objects.filter(is_inbox_tag=True).exists():
documents_inbox = Document.objects.filter(
tags__is_inbox_tag=True).distinct().count()
else:
documents_inbox = None
return Response({
'documents_total': documents_total,
'documents_inbox': documents_inbox,
})
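# Collects the requested documents into a zip file built under SCRATCH_DIR and
# returns it as an attachment; the "content" field selects originals, archive
# versions, or both via the corresponding strategy class.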
class BulkDownloadView(GenericAPIView):
permission_classes = (IsAuthenticated,)
serializer_class = BulkDownloadSerializer
parser_classes = (parsers.JSONParser,)
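    # Illustrative request body (document IDs are hypothetical; the accepted
    # "compression" values are defined by BulkDownloadSerializer):
    #   {"documents": [1, 2, 3], "content": "originals", "compression": "none"}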
def post(self, request, format=None):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
ids = serializer.validated_data.get('documents')
compression = serializer.validated_data.get('compression')
content = serializer.validated_data.get('content')
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
temp = tempfile.NamedTemporaryFile(
dir=settings.SCRATCH_DIR,
suffix="-compressed-archive",
delete=False)
if content == 'both':
strategy_class = OriginalAndArchiveStrategy
elif content == 'originals':
strategy_class = OriginalsOnlyStrategy
else:
strategy_class = ArchiveOnlyStrategy
with zipfile.ZipFile(temp.name, "w", compression) as zipf:
strategy = strategy_class(zipf)
            for document_id in ids:
                doc = Document.objects.get(id=document_id)
strategy.add_document(doc)
with open(temp.name, "rb") as f:
response = HttpResponse(f, content_type="application/zip")
response["Content-Disposition"] = '{}; filename="{}"'.format(
"attachment", "documents.zip")
return response