Merge branch 'dev' into feature-websockets-status
| @@ -1 +1,2 @@ | ||||
| from .checks import changed_password_check | ||||
| # This is here so that Django finds the checks. | ||||
| from .checks import * | ||||
|   | ||||
| @@ -4,12 +4,13 @@ import os | ||||
| import pickle | ||||
| import re | ||||
|  | ||||
| from django.conf import settings | ||||
| from sklearn.feature_extraction.text import CountVectorizer | ||||
| from sklearn.neural_network import MLPClassifier | ||||
| from sklearn.preprocessing import MultiLabelBinarizer | ||||
| from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer | ||||
| from sklearn.utils.multiclass import type_of_target | ||||
|  | ||||
| from documents.models import Document, MatchingModel | ||||
| from paperless import settings | ||||
|  | ||||
|  | ||||
| class IncompatibleClassifierVersionError(Exception): | ||||
| @@ -27,7 +28,7 @@ def preprocess_content(content): | ||||
|  | ||||
| class DocumentClassifier(object): | ||||
|  | ||||
|     FORMAT_VERSION = 5 | ||||
|     FORMAT_VERSION = 6 | ||||
|  | ||||
|     def __init__(self): | ||||
|         # mtime of the model file on disk. used to prevent reloading when | ||||
| @@ -54,6 +55,8 @@ class DocumentClassifier(object): | ||||
|                         "Cannor load classifier, incompatible versions.") | ||||
|                 else: | ||||
|                     if self.classifier_version > 0: | ||||
|                         # Don't be confused by this check. It's simply here | ||||
|                         # so that we won't log anything on initial reload. | ||||
|                         logger.info("Classifier updated on disk, " | ||||
|                                     "reloading classifier models") | ||||
|                     self.data_hash = pickle.load(f) | ||||
| @@ -122,9 +125,14 @@ class DocumentClassifier(object): | ||||
|         labels_tags_unique = set([tag for tags in labels_tags for tag in tags]) | ||||
|  | ||||
|         num_tags = len(labels_tags_unique) | ||||
|  | ||||
|         # subtract 1 since -1 (null) is also part of the classes. | ||||
|         num_correspondents = len(set(labels_correspondent)) - 1 | ||||
|         num_document_types = len(set(labels_document_type)) - 1 | ||||
|  | ||||
|         # union with {-1} accounts for cases where all documents have | ||||
|         # correspondents and types assigned, so -1 isn't part of labels_x, which | ||||
|         # it usually is. | ||||
|         num_correspondents = len(set(labels_correspondent) | {-1}) - 1 | ||||
|         num_document_types = len(set(labels_document_type) | {-1}) - 1 | ||||
|  | ||||
|         logging.getLogger(__name__).debug( | ||||
|             "{} documents, {} tag(s), {} correspondent(s), " | ||||
| @@ -145,12 +153,23 @@ class DocumentClassifier(object): | ||||
|         ) | ||||
|         data_vectorized = self.data_vectorizer.fit_transform(data) | ||||
|  | ||||
|         self.tags_binarizer = MultiLabelBinarizer() | ||||
|         labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags) | ||||
|  | ||||
|         # Step 3: train the classifiers | ||||
|         if num_tags > 0: | ||||
|             logging.getLogger(__name__).debug("Training tags classifier...") | ||||
|  | ||||
|             if num_tags == 1: | ||||
|                 # Special case where only one tag has auto: | ||||
|                 # Fallback to binary classification. | ||||
|                 labels_tags = [label[0] if len(label) == 1 else -1 | ||||
|                                for label in labels_tags] | ||||
|                 self.tags_binarizer = LabelBinarizer() | ||||
|                 labels_tags_vectorized = self.tags_binarizer.fit_transform( | ||||
|                     labels_tags).ravel() | ||||
|             else: | ||||
|                 self.tags_binarizer = MultiLabelBinarizer() | ||||
|                 labels_tags_vectorized = self.tags_binarizer.fit_transform( | ||||
|                     labels_tags) | ||||
|  | ||||
|             self.tags_classifier = MLPClassifier(tol=0.01) | ||||
|             self.tags_classifier.fit(data_vectorized, labels_tags_vectorized) | ||||
|         else: | ||||
| @@ -222,6 +241,16 @@ class DocumentClassifier(object): | ||||
|             X = self.data_vectorizer.transform([preprocess_content(content)]) | ||||
|             y = self.tags_classifier.predict(X) | ||||
|             tags_ids = self.tags_binarizer.inverse_transform(y)[0] | ||||
|             return tags_ids | ||||
|             if type_of_target(y).startswith('multilabel'): | ||||
|                 # the usual case when there are multiple tags. | ||||
|                 return list(tags_ids) | ||||
|             elif type_of_target(y) == 'binary' and tags_ids != -1: | ||||
|                 # This is for when we have binary classification with only one | ||||
|                 # tag and the result is to assign this tag. | ||||
|                 return [tags_ids] | ||||
|             else: | ||||
|                 # Usually binary as well with -1 as the result, but we're | ||||
|                 # going to catch everything else here as well. | ||||
|                 return [] | ||||
|         else: | ||||
|             return [] | ||||
|   | ||||
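As an aside on the single-tag fallback introduced above: a minimal sketch (not part of the diff; it only assumes scikit-learn is installed) of why `MultiLabelBinarizer` and `LabelBinarizer` produce differently shaped targets, which is what the `type_of_target` branch at prediction time accounts for.

```python
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.utils.multiclass import type_of_target

# Several auto tags: one indicator column per tag, so a document can
# belong to any subset of tags.
mlb = MultiLabelBinarizer()
y_multi = mlb.fit_transform([[1, 2], [2], []])
print(type_of_target(y_multi))   # 'multilabel-indicator'

# Exactly one auto tag: collapse each label set to "the tag id" or -1
# (no tag) and binarize into a single column, as the fallback does.
lb = LabelBinarizer()
y_binary = lb.fit_transform([1, -1, 1]).ravel()
print(type_of_target(y_binary))  # 'binary'

# inverse_transform now yields plain ids (or -1) instead of id tuples,
# hence the extra branching when predicting tags.
print(lb.inverse_transform(y_binary))
```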
| @@ -8,14 +8,15 @@ from asgiref.sync import async_to_sync | ||||
| from channels.layers import get_channel_layer | ||||
| from django.conf import settings | ||||
| from django.db import transaction | ||||
| from django.db.models import Q | ||||
| from django.utils import timezone | ||||
|  | ||||
| from paperless.db import GnuPG | ||||
| from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | ||||
| from .file_handling import generate_filename, create_source_path_directory | ||||
| from .file_handling import create_source_path_directory | ||||
| from .loggers import LoggingMixin | ||||
| from .models import Document, FileInfo, Correspondent, DocumentType, Tag | ||||
| from .parsers import ParseError, get_parser_class_for_mime_type | ||||
| from .parsers import ParseError, get_parser_class_for_mime_type, \ | ||||
|     get_supported_file_extensions, parse_date | ||||
| from .signals import ( | ||||
|     document_consumption_finished, | ||||
|     document_consumption_started | ||||
| @@ -58,21 +59,10 @@ class Consumer(LoggingMixin): | ||||
|             raise ConsumerError("Cannot consume {}: It is not a file".format( | ||||
|                 self.path)) | ||||
|  | ||||
|     def pre_check_consumption_dir(self): | ||||
|         if not settings.CONSUMPTION_DIR: | ||||
|             raise ConsumerError( | ||||
|                 "The CONSUMPTION_DIR settings variable does not appear to be " | ||||
|                 "set.") | ||||
|  | ||||
|         if not os.path.isdir(settings.CONSUMPTION_DIR): | ||||
|             raise ConsumerError( | ||||
|                 "Consumption directory {} does not exist".format( | ||||
|                     settings.CONSUMPTION_DIR)) | ||||
|  | ||||
|     def pre_check_duplicate(self): | ||||
|         with open(self.path, "rb") as f: | ||||
|             checksum = hashlib.md5(f.read()).hexdigest() | ||||
|         if Document.objects.filter(checksum=checksum).exists(): | ||||
|         if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501 | ||||
|             if settings.CONSUMER_DELETE_DUPLICATES: | ||||
|                 os.unlink(self.path) | ||||
|             raise ConsumerError( | ||||
| @@ -83,6 +73,7 @@ class Consumer(LoggingMixin): | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) | ||||
|  | ||||
|     def try_consume_file(self, | ||||
|                          path, | ||||
| @@ -110,7 +101,6 @@ class Consumer(LoggingMixin): | ||||
|         # Make sure that preconditions for consuming the file are met. | ||||
|  | ||||
|         self.pre_check_file_exists() | ||||
|         self.pre_check_consumption_dir() | ||||
|         self.pre_check_directories() | ||||
|         self.pre_check_duplicate() | ||||
|  | ||||
| @@ -145,7 +135,7 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         # This doesn't parse the document yet, but gives us a parser. | ||||
|  | ||||
|         document_parser = parser_class(self.path, self.logging_group, progress_callback) | ||||
|         document_parser = parser_class(self.logging_group, progress_callback) | ||||
|  | ||||
|         # However, this already created working directories which we have to | ||||
|         # clean up. | ||||
| @@ -153,19 +143,30 @@ class Consumer(LoggingMixin): | ||||
|         # Parse the document. This may take some time. | ||||
|  | ||||
|         try: | ||||
|             self.log("debug", f"Generating thumbnail for {self.filename}...") | ||||
|             self._send_progress(self.filename, 10, 100, 'WORKING', | ||||
|                                 'Generating thumbnail...') | ||||
|             thumbnail = document_parser.get_optimised_thumbnail() | ||||
|             self.log("debug", "Parsing {}...".format(self.filename)) | ||||
|             self._send_progress(self.filename, 20, 100, 'WORKING', | ||||
|                                 'Getting text from document...') | ||||
|                                 'Parsing document...') | ||||
|             self.log("debug", "Parsing {}...".format(self.filename)) | ||||
|             document_parser.parse(self.path, mime_type) | ||||
|  | ||||
|             self.log("debug", f"Generating thumbnail for {self.filename}...") | ||||
|             self._send_progress(self.filename, 70, 100, 'WORKING', | ||||
|                                 'Generating thumbnail...') | ||||
|             thumbnail = document_parser.get_optimised_thumbnail( | ||||
|                 self.path, mime_type) | ||||
|  | ||||
|             text = document_parser.get_text() | ||||
|             self._send_progress(self.filename, 80, 100, 'WORKING', | ||||
|                                 'Getting date from document...') | ||||
|             date = document_parser.get_date() | ||||
|             if not date: | ||||
|                 self._send_progress(self.filename, 90, 100, 'WORKING', | ||||
|                                     'Getting date from document...') | ||||
|                 date = parse_date(self.filename, text) | ||||
|             archive_path = document_parser.get_archive_path() | ||||
|  | ||||
|         except ParseError as e: | ||||
|             document_parser.cleanup() | ||||
|             self.log( | ||||
|                 "error", | ||||
|                 f"Error while consuming document {self.filename}: {e}") | ||||
|             self._send_progress(self.filename, 100, 100, 'FAILED', | ||||
|                                 "Failed: {}".format(e)) | ||||
|             raise ConsumerError(e) | ||||
| @@ -183,7 +184,7 @@ class Consumer(LoggingMixin): | ||||
|             logging.getLogger(__name__).warning( | ||||
|                 "Cannot classify documents: {}.".format(e)) | ||||
|             classifier = None | ||||
|         self._send_progress(self.filename, 85, 100, 'WORKING', | ||||
|         self._send_progress(self.filename, 95, 100, 'WORKING', | ||||
|                             'Storing the document...') | ||||
|         # now that everything is done, we can start to store the document | ||||
|         # in the system. This will be a transaction and reasonably fast. | ||||
| @@ -200,9 +201,6 @@ class Consumer(LoggingMixin): | ||||
|                 # If we get here, it was successful. Proceed with post-consume | ||||
|                 # hooks. If they fail, nothing will get changed. | ||||
|  | ||||
|                 self._send_progress(self.filename, 90, 100, 'WORKING', | ||||
|                                     'Performing post-consumption tasks...') | ||||
|  | ||||
|                 document_consumption_finished.send( | ||||
|                     sender=self.__class__, | ||||
|                     document=document, | ||||
| @@ -213,14 +211,41 @@ class Consumer(LoggingMixin): | ||||
|                 # After everything is in the database, copy the files into | ||||
|                 # place. If this fails, we'll also rollback the transaction. | ||||
|  | ||||
|                 # TODO: not required, since this is done by the file handling | ||||
|                 #  logic | ||||
|                 create_source_path_directory(document.source_path) | ||||
|                 self._write(document, self.path, document.source_path) | ||||
|                 self._write(document, thumbnail, document.thumbnail_path) | ||||
|  | ||||
|                 self._write(document.storage_type, | ||||
|                             self.path, document.source_path) | ||||
|  | ||||
|                 self._write(document.storage_type, | ||||
|                             thumbnail, document.thumbnail_path) | ||||
|  | ||||
|                 if archive_path and os.path.isfile(archive_path): | ||||
|                     self._write(document.storage_type, | ||||
|                                 archive_path, document.archive_path) | ||||
|  | ||||
|                     with open(archive_path, 'rb') as f: | ||||
|                         document.archive_checksum = hashlib.md5( | ||||
|                             f.read()).hexdigest() | ||||
|                         document.save() | ||||
|  | ||||
|                 # After performing all database operations and moving files | ||||
|                 # into place, tell paperless where the file is. | ||||
|                 document.filename = os.path.basename(document.source_path) | ||||
|                 # Saving the document now will trigger the filename handling | ||||
|                 # logic. | ||||
|                 document.save() | ||||
|  | ||||
|                 # Delete the file only if it was successfully consumed | ||||
|                 self.log("debug", "Deleting file {}".format(self.path)) | ||||
|                 os.unlink(self.path) | ||||
|         except Exception as e: | ||||
|             self.log( | ||||
|                 "error", | ||||
|                 f"The following error occured while consuming " | ||||
|                 f"{self.filename}: {e}" | ||||
|             ) | ||||
|             self._send_progress(self.filename, 100, 100, 'FAILED', | ||||
|                                 "Failed: {}".format(e)) | ||||
|             raise ConsumerError(e) | ||||
| @@ -250,10 +275,7 @@ class Consumer(LoggingMixin): | ||||
|         created = file_info.created or date or timezone.make_aware( | ||||
|             datetime.datetime.fromtimestamp(stats.st_mtime)) | ||||
|  | ||||
|         if settings.PASSPHRASE: | ||||
|             storage_type = Document.STORAGE_TYPE_GPG | ||||
|         else: | ||||
|             storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|         storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|  | ||||
|         with open(self.path, "rb") as f: | ||||
|             document = Document.objects.create( | ||||
| @@ -275,12 +297,6 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         self.apply_overrides(document) | ||||
|  | ||||
|         document.filename = generate_filename(document) | ||||
|  | ||||
|         # We need to save the document twice, since we need the PK of the | ||||
|         # document in order to create its filename above. | ||||
|         document.save() | ||||
|  | ||||
|         return document | ||||
|  | ||||
|     def apply_overrides(self, document): | ||||
| @@ -299,11 +315,7 @@ class Consumer(LoggingMixin): | ||||
|             for tag_id in self.override_tag_ids: | ||||
|                 document.tags.add(Tag.objects.get(pk=tag_id)) | ||||
|  | ||||
|     def _write(self, document, source, target): | ||||
|     def _write(self, storage_type, source, target): | ||||
|         with open(source, "rb") as read_file: | ||||
|             with open(target, "wb") as write_file: | ||||
|                 if document.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: | ||||
|                     write_file.write(read_file.read()) | ||||
|                     return | ||||
|                 self.log("debug", "Encrypting") | ||||
|                 write_file.write(GnuPG.encrypted(read_file)) | ||||
|                 write_file.write(read_file.read()) | ||||
|   | ||||
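For illustration, here is a condensed, hypothetical helper equivalent to the widened duplicate check above (the `is_duplicate` name is made up; `Document` is the model imported in the diff). The point of the change is that a file counts as a duplicate if its MD5 matches either a stored original or a stored archive version.

```python
import hashlib

from django.db.models import Q

from documents.models import Document


def is_duplicate(path):
    # Hash the incoming file once...
    with open(path, "rb") as f:
        checksum = hashlib.md5(f.read()).hexdigest()
    # ...then match against both the original checksum and the checksum
    # of the archived (e.g. OCRed PDF) version of every known document.
    return Document.objects.filter(
        Q(checksum=checksum) | Q(archive_checksum=checksum)
    ).exists()
```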
| @@ -1,7 +1,9 @@ | ||||
| import datetime | ||||
| import logging | ||||
| import os | ||||
| from collections import defaultdict | ||||
|  | ||||
| import pathvalidate | ||||
| from django.conf import settings | ||||
| from django.template.defaultfilters import slugify | ||||
|  | ||||
| @@ -10,10 +12,13 @@ def create_source_path_directory(source_path): | ||||
|     os.makedirs(os.path.dirname(source_path), exist_ok=True) | ||||
|  | ||||
|  | ||||
| def delete_empty_directories(directory): | ||||
| def delete_empty_directories(directory, root): | ||||
|     if not os.path.isdir(directory): | ||||
|         return | ||||
|  | ||||
|     # Go up in the directory hierarchy and try to delete all directories | ||||
|     directory = os.path.normpath(directory) | ||||
|     root = os.path.normpath(settings.ORIGINALS_DIR) | ||||
|     root = os.path.normpath(root) | ||||
|  | ||||
|     if not directory.startswith(root + os.path.sep): | ||||
|         # don't do anything outside our originals folder. | ||||
| @@ -72,14 +77,31 @@ def generate_filename(doc): | ||||
|         if settings.PAPERLESS_FILENAME_FORMAT is not None: | ||||
|             tags = defaultdict(lambda: slugify(None), | ||||
|                                many_to_dictionary(doc.tags)) | ||||
|  | ||||
|             if doc.correspondent: | ||||
|                 correspondent = pathvalidate.sanitize_filename( | ||||
|                     doc.correspondent.name, replacement_text="-" | ||||
|                 ) | ||||
|             else: | ||||
|                 correspondent = "none" | ||||
|  | ||||
|             if doc.document_type: | ||||
|                 document_type = pathvalidate.sanitize_filename( | ||||
|                     doc.document_type.name, replacement_text="-" | ||||
|                 ) | ||||
|             else: | ||||
|                 document_type = "none" | ||||
|  | ||||
|             path = settings.PAPERLESS_FILENAME_FORMAT.format( | ||||
|                 correspondent=slugify(doc.correspondent), | ||||
|                 title=slugify(doc.title), | ||||
|                 created=slugify(doc.created), | ||||
|                 title=pathvalidate.sanitize_filename( | ||||
|                     doc.title, replacement_text="-"), | ||||
|                 correspondent=correspondent, | ||||
|                 document_type=document_type, | ||||
|                 created=datetime.date.isoformat(doc.created), | ||||
|                 created_year=doc.created.year if doc.created else "none", | ||||
|                 created_month=doc.created.month if doc.created else "none", | ||||
|                 created_day=doc.created.day if doc.created else "none", | ||||
|                 added=slugify(doc.added), | ||||
|                 added=datetime.date.isoformat(doc.added), | ||||
|                 added_year=doc.added.year if doc.added else "none", | ||||
|                 added_month=doc.added.month if doc.added else "none", | ||||
|                 added_day=doc.added.day if doc.added else "none", | ||||
| @@ -101,3 +123,8 @@ def generate_filename(doc): | ||||
|         filename += ".gpg" | ||||
|  | ||||
|     return filename | ||||
|  | ||||
|  | ||||
| def archive_name_from_filename(filename): | ||||
|  | ||||
|     return os.path.splitext(filename)[0] + ".pdf" | ||||
|   | ||||
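A quick illustration of the switch from `slugify` to `pathvalidate.sanitize_filename` in `generate_filename` above (the example title is made up; `django.utils.text.slugify` is the function `django.template.defaultfilters.slugify` delegates to):

```python
import pathvalidate
from django.utils.text import slugify

title = "Invoice: ACME/2020"

# slugify flattens the whole string into a lowercase URL-style slug...
print(slugify(title))  # invoice-acme2020

# ...whereas sanitize_filename only replaces characters that are illegal
# in file names, keeping the title readable in the resulting filename.
print(pathvalidate.sanitize_filename(title, replacement_text="-"))
```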
| @@ -1,59 +0,0 @@ | ||||
| import os | ||||
| import tempfile | ||||
| from datetime import datetime | ||||
| from time import mktime | ||||
|  | ||||
| import magic | ||||
| from django import forms | ||||
| from django.conf import settings | ||||
| from django_q.tasks import async_task | ||||
| from pathvalidate import validate_filename, ValidationError | ||||
|  | ||||
| from documents.parsers import is_mime_type_supported | ||||
|  | ||||
|  | ||||
| class UploadForm(forms.Form): | ||||
|  | ||||
|     document = forms.FileField() | ||||
|  | ||||
|     def clean_document(self): | ||||
|         document_name = self.cleaned_data.get("document").name | ||||
|  | ||||
|         try: | ||||
|             validate_filename(document_name) | ||||
|         except ValidationError: | ||||
|             raise forms.ValidationError("That filename is suspicious.") | ||||
|  | ||||
|         document_data = self.cleaned_data.get("document").read() | ||||
|  | ||||
|         mime_type = magic.from_buffer(document_data, mime=True) | ||||
|  | ||||
|         if not is_mime_type_supported(mime_type): | ||||
|             raise forms.ValidationError("This mime type is not supported.") | ||||
|  | ||||
|         return document_name, document_data | ||||
|  | ||||
|     def save(self): | ||||
|         """ | ||||
|         Since the consumer already does a lot of work, it's easier just to save | ||||
|         to-be-consumed files to the consumption directory rather than have the | ||||
|         form do that as well.  Think of it as a poor-man's queue server. | ||||
|         """ | ||||
|  | ||||
|         original_filename, data = self.cleaned_data.get("document") | ||||
|  | ||||
|         t = int(mktime(datetime.now().timetuple())) | ||||
|  | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|  | ||||
|         with tempfile.NamedTemporaryFile(prefix="paperless-upload-", | ||||
|                                          dir=settings.SCRATCH_DIR, | ||||
|                                          delete=False) as f: | ||||
|  | ||||
|             f.write(data) | ||||
|             os.utime(f.name, times=(t, t)) | ||||
|  | ||||
|             async_task("documents.tasks.consume_file", | ||||
|                        f.name, | ||||
|                        override_filename=original_filename, | ||||
|                        task_name=os.path.basename(original_filename)[:100]) | ||||
| @@ -4,10 +4,11 @@ from contextlib import contextmanager | ||||
|  | ||||
| from django.conf import settings | ||||
| from whoosh import highlight | ||||
| from whoosh.fields import Schema, TEXT, NUMERIC | ||||
| from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME | ||||
| from whoosh.highlight import Formatter, get_text | ||||
| from whoosh.index import create_in, exists_in, open_dir | ||||
| from whoosh.qparser import MultifieldParser | ||||
| from whoosh.qparser.dateparse import DateParserPlugin | ||||
| from whoosh.writing import AsyncWriter | ||||
|  | ||||
|  | ||||
| @@ -59,33 +60,49 @@ def get_schema(): | ||||
|         id=NUMERIC(stored=True, unique=True, numtype=int), | ||||
|         title=TEXT(stored=True), | ||||
|         content=TEXT(), | ||||
|         correspondent=TEXT(stored=True) | ||||
|         correspondent=TEXT(stored=True), | ||||
|         tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True), | ||||
|         type=TEXT(stored=True), | ||||
|         created=DATETIME(stored=True, sortable=True), | ||||
|         modified=DATETIME(stored=True, sortable=True), | ||||
|         added=DATETIME(stored=True, sortable=True), | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def open_index(recreate=False): | ||||
|     if exists_in(settings.INDEX_DIR) and not recreate: | ||||
|         return open_dir(settings.INDEX_DIR) | ||||
|     else: | ||||
|         # TODO: this is not thread safe. If 2 instances try to create the index | ||||
|         #  at the same time, this fails. This currently prevents parallel | ||||
|         #  tests. | ||||
|         if not os.path.isdir(settings.INDEX_DIR): | ||||
|             os.makedirs(settings.INDEX_DIR, exist_ok=True) | ||||
|         return create_in(settings.INDEX_DIR, get_schema()) | ||||
|     try: | ||||
|         if exists_in(settings.INDEX_DIR) and not recreate: | ||||
|             return open_dir(settings.INDEX_DIR, schema=get_schema()) | ||||
|     except Exception as e: | ||||
|         logger.error(f"Error while opening the index: {e}, recreating.") | ||||
|  | ||||
|     if not os.path.isdir(settings.INDEX_DIR): | ||||
|         os.makedirs(settings.INDEX_DIR, exist_ok=True) | ||||
|     return create_in(settings.INDEX_DIR, get_schema()) | ||||
|  | ||||
|  | ||||
| def update_document(writer, doc): | ||||
|     # TODO: this line caused many issues all around, since we need to | ||||
|     #  make sure that this method does not get called with deserialized | ||||
|     #  documents (i.e., document objects that don't come from Django's | ||||
|     #  ORM interfaces directly). | ||||
|     logger.debug("Indexing {}...".format(doc)) | ||||
|     tags = ",".join([t.name for t in doc.tags.all()]) | ||||
|     writer.update_document( | ||||
|         id=doc.pk, | ||||
|         title=doc.title, | ||||
|         content=doc.content, | ||||
|         correspondent=doc.correspondent.name if doc.correspondent else None | ||||
|         correspondent=doc.correspondent.name if doc.correspondent else None, | ||||
|         tag=tags if tags else None, | ||||
|         type=doc.document_type.name if doc.document_type else None, | ||||
|         created=doc.created, | ||||
|         added=doc.added, | ||||
|         modified=doc.modified, | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def remove_document(writer, doc): | ||||
|     # TODO: see above. | ||||
|     logger.debug("Removing {} from index...".format(doc)) | ||||
|     writer.delete_by_term('id', doc.pk) | ||||
|  | ||||
| @@ -103,16 +120,27 @@ def remove_document_from_index(document): | ||||
|  | ||||
|  | ||||
| @contextmanager | ||||
| def query_page(ix, query, page): | ||||
| def query_page(ix, querystring, page): | ||||
|     searcher = ix.searcher() | ||||
|     try: | ||||
|         query_parser = MultifieldParser(["content", "title", "correspondent"], | ||||
|                                         ix.schema).parse(query) | ||||
|         result_page = searcher.search_page(query_parser, page) | ||||
|         qp = MultifieldParser( | ||||
|             ["content", "title", "correspondent", "tag", "type"], | ||||
|             ix.schema) | ||||
|         qp.add_plugin(DateParserPlugin()) | ||||
|  | ||||
|         q = qp.parse(querystring) | ||||
|         result_page = searcher.search_page(q, page) | ||||
|         result_page.results.fragmenter = highlight.ContextFragmenter( | ||||
|             surround=50) | ||||
|         result_page.results.formatter = JsonFormatter() | ||||
|         yield result_page | ||||
|  | ||||
|         corrected = searcher.correct_query(q, querystring) | ||||
|         if corrected.query != q: | ||||
|             corrected_query = corrected.string | ||||
|         else: | ||||
|             corrected_query = None | ||||
|  | ||||
|         yield result_page, corrected_query | ||||
|     finally: | ||||
|         searcher.close() | ||||
|  | ||||
|   | ||||
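To make the `query_page()` changes concrete, a self-contained sketch of a Whoosh parser with the date plugin enabled (the schema is trimmed to three fields, not the full one above):

```python
from whoosh.fields import DATETIME, TEXT, Schema
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin

schema = Schema(title=TEXT(stored=True), content=TEXT(),
                created=DATETIME(stored=True, sortable=True))

# Search several fields at once; the plugin lets queries contain date
# expressions such as created:2020 or created:yesterday.
qp = MultifieldParser(["content", "title"], schema)
qp.add_plugin(DateParserPlugin())
print(qp.parse("invoice created:2020"))
```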
| @@ -1,9 +1,14 @@ | ||||
| import logging | ||||
| import uuid | ||||
|  | ||||
| from django.conf import settings | ||||
|  | ||||
|  | ||||
| class PaperlessHandler(logging.Handler): | ||||
|     def emit(self, record): | ||||
|         if settings.DISABLE_DBHANDLER: | ||||
|             return | ||||
|  | ||||
|         # We have to do the import here or Django will barf when it tries to | ||||
|         # load this because the apps aren't loaded at that point | ||||
|         from .models import Log | ||||
| @@ -23,10 +28,10 @@ class LoggingMixin: | ||||
|     def renew_logging_group(self): | ||||
|         self.logging_group = uuid.uuid4() | ||||
|  | ||||
|     def log(self, level, message): | ||||
|     def log(self, level, message, **kwargs): | ||||
|         target = ".".join([self.__class__.__module__, self.__class__.__name__]) | ||||
|         logger = logging.getLogger(target) | ||||
|  | ||||
|         getattr(logger, level)(message, extra={ | ||||
|             "group": self.logging_group | ||||
|         }) | ||||
|         }, **kwargs) | ||||
|   | ||||
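The new `**kwargs` pass-through lets callers forward standard `logging` keyword arguments. A condensed, runnable sketch (the mixin body is copied from the diff; `DemoConsumer` is hypothetical):

```python
import logging
import uuid

logging.basicConfig(level=logging.DEBUG)


class LoggingMixin:
    def renew_logging_group(self):
        self.logging_group = uuid.uuid4()

    def log(self, level, message, **kwargs):
        target = ".".join([self.__class__.__module__, self.__class__.__name__])
        getattr(logging.getLogger(target), level)(
            message, extra={"group": self.logging_group}, **kwargs)


class DemoConsumer(LoggingMixin):
    pass


consumer = DemoConsumer()
consumer.renew_logging_group()
try:
    raise ValueError("boom")
except ValueError:
    # exc_info=True attaches the current traceback to the log record.
    consumer.log("error", "Something failed", exc_info=True)
```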
| @@ -17,16 +17,6 @@ class Command(BaseCommand): | ||||
|  | ||||
|     def add_arguments(self, parser): | ||||
|  | ||||
|         parser.add_argument( | ||||
|             "from", | ||||
|             choices=("gpg", "unencrypted"), | ||||
|             help="The state you want to change your documents from" | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             "to", | ||||
|             choices=("gpg", "unencrypted"), | ||||
|             help="The state you want to change your documents to" | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             "--passphrase", | ||||
|             help="If PAPERLESS_PASSPHRASE isn't set already, you need to " | ||||
| @@ -50,11 +40,6 @@ class Command(BaseCommand): | ||||
|         except KeyboardInterrupt: | ||||
|             return | ||||
|  | ||||
|         if options["from"] == options["to"]: | ||||
|             raise CommandError( | ||||
|                 'The "from" and "to" values can\'t be the same.' | ||||
|             ) | ||||
|  | ||||
|         passphrase = options["passphrase"] or settings.PASSPHRASE | ||||
|         if not passphrase: | ||||
|             raise CommandError( | ||||
| @@ -62,10 +47,7 @@ class Command(BaseCommand): | ||||
|                 "by declaring it in your environment or your config." | ||||
|             ) | ||||
|  | ||||
|         if options["from"] == "gpg" and options["to"] == "unencrypted": | ||||
|             self.__gpg_to_unencrypted(passphrase) | ||||
|         elif options["from"] == "unencrypted" and options["to"] == "gpg": | ||||
|             self.__unencrypted_to_gpg(passphrase) | ||||
|         self.__gpg_to_unencrypted(passphrase) | ||||
|  | ||||
|     @staticmethod | ||||
|     def __gpg_to_unencrypted(passphrase): | ||||
| @@ -79,42 +61,28 @@ class Command(BaseCommand): | ||||
|                 document).encode('utf-8'), "green")) | ||||
|  | ||||
|             old_paths = [document.source_path, document.thumbnail_path] | ||||
|  | ||||
|             raw_document = GnuPG.decrypted(document.source_file, passphrase) | ||||
|             raw_thumb = GnuPG.decrypted(document.thumbnail_file, passphrase) | ||||
|  | ||||
|             document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|  | ||||
|             ext = os.path.splitext(document.filename)[1] | ||||
|  | ||||
|             if not ext == '.gpg': | ||||
|                 raise CommandError( | ||||
|                     f"Abort: encrypted file {document.source_path} does not " | ||||
|                     f"end with .gpg") | ||||
|  | ||||
|             document.filename = os.path.splitext(document.filename)[0] | ||||
|  | ||||
|             with open(document.source_path, "wb") as f: | ||||
|                 f.write(raw_document) | ||||
|  | ||||
|             with open(document.thumbnail_path, "wb") as f: | ||||
|                 f.write(raw_thumb) | ||||
|  | ||||
|             document.save(update_fields=("storage_type",)) | ||||
|  | ||||
|             for path in old_paths: | ||||
|                 os.unlink(path) | ||||
|  | ||||
|     @staticmethod | ||||
|     def __unencrypted_to_gpg(passphrase): | ||||
|  | ||||
|         unencrypted_files = Document.objects.filter( | ||||
|             storage_type=Document.STORAGE_TYPE_UNENCRYPTED) | ||||
|  | ||||
|         for document in unencrypted_files: | ||||
|  | ||||
|             print(coloured("Encrypting {}".format(document), "green")) | ||||
|  | ||||
|             old_paths = [document.source_path, document.thumbnail_path] | ||||
|             with open(document.source_path, "rb") as raw_document: | ||||
|                 with open(document.thumbnail_path, "rb") as raw_thumb: | ||||
|                     document.storage_type = Document.STORAGE_TYPE_GPG | ||||
|                     with open(document.source_path, "wb") as f: | ||||
|                         f.write(GnuPG.encrypted(raw_document, passphrase)) | ||||
|                     with open(document.thumbnail_path, "wb") as f: | ||||
|                         f.write(GnuPG.encrypted(raw_thumb, passphrase)) | ||||
|  | ||||
|             document.save(update_fields=("storage_type",)) | ||||
|             document.save(update_fields=("storage_type", "filename")) | ||||
|  | ||||
|             for path in old_paths: | ||||
|                 os.unlink(path) | ||||
|   | ||||
| src/documents/management/commands/document_archiver.py (new file, 128 additions) | ||||
						| @@ -0,0 +1,128 @@ | ||||
| import hashlib | ||||
| import multiprocessing | ||||
|  | ||||
| import logging | ||||
| import os | ||||
| import shutil | ||||
| import uuid | ||||
|  | ||||
| import tqdm | ||||
| from django import db | ||||
| from django.conf import settings | ||||
| from django.core.management.base import BaseCommand | ||||
| from django.db import transaction | ||||
| from whoosh.writing import AsyncWriter | ||||
|  | ||||
| from documents.models import Document | ||||
| from ... import index | ||||
| from ...file_handling import create_source_path_directory | ||||
| from ...mixins import Renderable | ||||
| from ...parsers import get_parser_class_for_mime_type | ||||
|  | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def handle_document(document_id): | ||||
|     document = Document.objects.get(id=document_id) | ||||
|  | ||||
|     mime_type = document.mime_type | ||||
|  | ||||
|     parser_class = get_parser_class_for_mime_type(mime_type) | ||||
|  | ||||
|     parser = parser_class(logging_group=uuid.uuid4()) | ||||
|  | ||||
|     try: | ||||
|         parser.parse(document.source_path, mime_type) | ||||
|  | ||||
|         if parser.get_archive_path(): | ||||
|             with transaction.atomic(): | ||||
|                 with open(parser.get_archive_path(), 'rb') as f: | ||||
|                     checksum = hashlib.md5(f.read()).hexdigest() | ||||
|                 # I'm going to save first so that, in case the file move | ||||
|                 # fails, the database is rolled back. | ||||
|                 # We also don't use save() since that triggers the file | ||||
|                 # handling logic, and we don't want that yet (the file is | ||||
|                 # not yet in place). | ||||
|                 Document.objects.filter(pk=document.pk).update( | ||||
|                     archive_checksum=checksum, | ||||
|                     content=parser.get_text() | ||||
|                 ) | ||||
|                 create_source_path_directory(document.archive_path) | ||||
|                 shutil.move(parser.get_archive_path(), document.archive_path) | ||||
|  | ||||
|         with AsyncWriter(index.open_index()) as writer: | ||||
|             index.update_document(writer, document) | ||||
|  | ||||
|     except Exception as e: | ||||
|         logger.error(f"Error while parsing document {document}: {str(e)}") | ||||
|     finally: | ||||
|         parser.cleanup() | ||||
|  | ||||
|  | ||||
| class Command(Renderable, BaseCommand): | ||||
|  | ||||
|     help = """ | ||||
|         Using the current classification model, assigns correspondents, tags | ||||
|         and document types to all documents, effectively allowing you to | ||||
|         back-tag all previously indexed documents with metadata created (or | ||||
|         modified) after their initial import. | ||||
|     """.replace("    ", "") | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         self.verbosity = 0 | ||||
|         BaseCommand.__init__(self, *args, **kwargs) | ||||
|  | ||||
|     def add_arguments(self, parser): | ||||
|         parser.add_argument( | ||||
|             "-f", "--overwrite", | ||||
|             default=False, | ||||
|             action="store_true", | ||||
|             help="Recreates the archived document for documents that already " | ||||
|                  "have an archived version." | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             "-d", "--document", | ||||
|             default=None, | ||||
|             type=int, | ||||
|             required=False, | ||||
|             help="Specify the ID of a document, and this command will only " | ||||
|                  "run on this specific document." | ||||
|         ) | ||||
|  | ||||
|     def handle(self, *args, **options): | ||||
|  | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|  | ||||
|         overwrite = options["overwrite"] | ||||
|  | ||||
|         if options['document']: | ||||
|             documents = Document.objects.filter(pk=options['document']) | ||||
|         else: | ||||
|             documents = Document.objects.all() | ||||
|  | ||||
|         document_ids = list(map( | ||||
|             lambda doc: doc.id, | ||||
|             filter( | ||||
|                 lambda d: overwrite or not d.archive_checksum, | ||||
|                 documents | ||||
|             ) | ||||
|         )) | ||||
|  | ||||
|         # Note to future self: this prevents django from reusing database | ||||
|         # connections between processes, which is bad and does not work | ||||
|         # with postgres. | ||||
|         db.connections.close_all() | ||||
|  | ||||
|         try: | ||||
|  | ||||
|             logging.getLogger().handlers[0].level = logging.ERROR | ||||
|             with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool: | ||||
|                 list(tqdm.tqdm( | ||||
|                     pool.imap_unordered( | ||||
|                         handle_document, | ||||
|                         document_ids | ||||
|                     ), | ||||
|                     total=len(document_ids) | ||||
|                 )) | ||||
|         except KeyboardInterrupt: | ||||
|             print("Aborting...") | ||||
| @@ -1,37 +1,104 @@ | ||||
| import logging | ||||
| import os | ||||
| from pathlib import Path | ||||
| from time import sleep | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.core.management.base import BaseCommand | ||||
| from django.core.management.base import BaseCommand, CommandError | ||||
| from django.utils.text import slugify | ||||
| from django_q.tasks import async_task | ||||
| from watchdog.events import FileSystemEventHandler | ||||
| from watchdog.observers import Observer | ||||
| from watchdog.observers.polling import PollingObserver | ||||
|  | ||||
| from documents.models import Tag | ||||
| from documents.parsers import is_file_ext_supported | ||||
|  | ||||
| try: | ||||
|     from inotify_simple import INotify, flags | ||||
|     from inotifyrecursive import INotify, flags | ||||
| except ImportError: | ||||
|     INotify = flags = None | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def _tags_from_path(filepath): | ||||
|     """Walk up the directory tree from filepath to CONSUMPTION_DIr | ||||
|        and get or create Tag IDs for every directory. | ||||
|     """ | ||||
|     tag_ids = set() | ||||
|     path_parts = Path(filepath).relative_to( | ||||
|                 settings.CONSUMPTION_DIR).parent.parts | ||||
|     for part in path_parts: | ||||
|         tag_ids.add(Tag.objects.get_or_create( | ||||
|             slug=slugify(part), | ||||
|             defaults={"name": part}, | ||||
|         )[0].pk) | ||||
|  | ||||
|     return tag_ids | ||||
|  | ||||
|  | ||||
| def _consume(filepath): | ||||
|     if os.path.isdir(filepath): | ||||
|         return | ||||
|  | ||||
|     if not os.path.isfile(filepath): | ||||
|         logger.debug( | ||||
|             f"Not consuming file {filepath}: File has moved.") | ||||
|         return | ||||
|  | ||||
|     if not is_file_ext_supported(os.path.splitext(filepath)[1]): | ||||
|         logger.debug( | ||||
|             f"Not consuming file {filepath}: Unknown file extension.") | ||||
|         return | ||||
|  | ||||
|     tag_ids = None | ||||
|     try: | ||||
|         if settings.CONSUMER_SUBDIRS_AS_TAGS: | ||||
|             tag_ids = _tags_from_path(filepath) | ||||
|     except Exception as e: | ||||
|         logger.error( | ||||
|             "Error creating tags from path: {}".format(e)) | ||||
|  | ||||
|     try: | ||||
|         async_task("documents.tasks.consume_file", | ||||
|                    filepath, | ||||
|                    override_tag_ids=tag_ids if tag_ids else None, | ||||
|                    task_name=os.path.basename(filepath)[:100]) | ||||
|     except Exception as e: | ||||
|         # Catch all so that the consumer won't crash. | ||||
|         # This is also what the test case is listening for to check for | ||||
|         # errors. | ||||
|         logger.error( | ||||
|             "Error while consuming document: {}".format(e)) | ||||
|  | ||||
|  | ||||
| def _consume_wait_unmodified(file, num_tries=20, wait_time=1): | ||||
|     mtime = -1 | ||||
|     current_try = 0 | ||||
|     while current_try < num_tries: | ||||
|         try: | ||||
|             new_mtime = os.stat(file).st_mtime | ||||
|         except FileNotFoundError: | ||||
|             logger.debug(f"File {file} moved while waiting for it to remain " | ||||
|                          f"unmodified.") | ||||
|             return | ||||
|         if new_mtime == mtime: | ||||
|             _consume(file) | ||||
|             return | ||||
|         mtime = new_mtime | ||||
|         sleep(wait_time) | ||||
|         current_try += 1 | ||||
|  | ||||
|     logger.error(f"Timeout while waiting on file {file} to remain unmodified.") | ||||
|  | ||||
|  | ||||
| class Handler(FileSystemEventHandler): | ||||
|  | ||||
|     def _consume(self, file): | ||||
|         if os.path.isfile(file): | ||||
|             try: | ||||
|                 async_task("documents.tasks.consume_file", | ||||
|                            file, | ||||
|                            task_name=os.path.basename(file)[:100]) | ||||
|             except Exception as e: | ||||
|                 # Catch all so that the consumer won't crash. | ||||
|                 logging.getLogger(__name__).error( | ||||
|                     "Error while consuming document: {}".format(e)) | ||||
|  | ||||
|     def on_created(self, event): | ||||
|         self._consume(event.src_path) | ||||
|         _consume_wait_unmodified(event.src_path) | ||||
|  | ||||
|     def on_moved(self, event): | ||||
|         self._consume(event.src_path) | ||||
|         _consume_wait_unmodified(event.dest_path) | ||||
|  | ||||
|  | ||||
| class Command(BaseCommand): | ||||
| @@ -40,12 +107,15 @@ class Command(BaseCommand): | ||||
|     consumption directory. | ||||
|     """ | ||||
|  | ||||
|     # This is here primarily for the tests and is irrelevant in production. | ||||
|     stop_flag = False | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|  | ||||
|         self.verbosity = 0 | ||||
|         self.logger = logging.getLogger(__name__) | ||||
|  | ||||
|         BaseCommand.__init__(self, *args, **kwargs) | ||||
|         self.observer = None | ||||
|  | ||||
|     def add_arguments(self, parser): | ||||
|         parser.add_argument( | ||||
| @@ -54,38 +124,81 @@ class Command(BaseCommand): | ||||
|             nargs="?", | ||||
|             help="The consumption directory." | ||||
|         ) | ||||
|  | ||||
|     def handle(self, *args, **options): | ||||
|  | ||||
|         self.verbosity = options["verbosity"] | ||||
|         directory = options["directory"] | ||||
|  | ||||
|         logging.getLogger(__name__).info( | ||||
|             "Starting document consumer at {}".format( | ||||
|                 directory | ||||
|             ) | ||||
|         parser.add_argument( | ||||
|             "--oneshot", | ||||
|             action="store_true", | ||||
|             help="Run only once." | ||||
|         ) | ||||
|  | ||||
|         # Consume all files as this is not done initially by the watchdog | ||||
|         for entry in os.scandir(directory): | ||||
|             if entry.is_file(): | ||||
|                 async_task("documents.tasks.consume_file", | ||||
|                            entry.path, | ||||
|                            task_name=os.path.basename(entry.path)[:100]) | ||||
|     def handle(self, *args, **options): | ||||
|         directory = options["directory"] | ||||
|         recursive = settings.CONSUMER_RECURSIVE | ||||
|  | ||||
|         # Start the watchdog. Woof! | ||||
|         if settings.CONSUMER_POLLING > 0: | ||||
|             logging.getLogger(__name__).info( | ||||
|                 "Using polling instead of file system notifications.") | ||||
|             observer = PollingObserver(timeout=settings.CONSUMER_POLLING) | ||||
|         if not directory: | ||||
|             raise CommandError( | ||||
|                 "CONSUMPTION_DIR does not appear to be set." | ||||
|             ) | ||||
|  | ||||
|         if not os.path.isdir(directory): | ||||
|             raise CommandError( | ||||
|                 f"Consumption directory {directory} does not exist") | ||||
|  | ||||
|         if recursive: | ||||
|             for dirpath, _, filenames in os.walk(directory): | ||||
|                 for filename in filenames: | ||||
|                     filepath = os.path.join(dirpath, filename) | ||||
|                     _consume(filepath) | ||||
|         else: | ||||
|             observer = Observer() | ||||
|         event_handler = Handler() | ||||
|         observer.schedule(event_handler, directory, recursive=True) | ||||
|         observer.start() | ||||
|             for entry in os.scandir(directory): | ||||
|                 _consume(entry.path) | ||||
|  | ||||
|         if options["oneshot"]: | ||||
|             return | ||||
|  | ||||
|         if settings.CONSUMER_POLLING == 0 and INotify: | ||||
|             self.handle_inotify(directory, recursive) | ||||
|         else: | ||||
|             self.handle_polling(directory, recursive) | ||||
|  | ||||
|         logger.debug("Consumer exiting.") | ||||
|  | ||||
|     def handle_polling(self, directory, recursive): | ||||
|         logging.getLogger(__name__).info( | ||||
|             f"Polling directory for changes: {directory}") | ||||
|         self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING) | ||||
|         self.observer.schedule(Handler(), directory, recursive=recursive) | ||||
|         self.observer.start() | ||||
|         try: | ||||
|             while observer.is_alive(): | ||||
|                 observer.join(1) | ||||
|             while self.observer.is_alive(): | ||||
|                 self.observer.join(1) | ||||
|                 if self.stop_flag: | ||||
|                     self.observer.stop() | ||||
|         except KeyboardInterrupt: | ||||
|             observer.stop() | ||||
|         observer.join() | ||||
|             self.observer.stop() | ||||
|         self.observer.join() | ||||
|  | ||||
|     def handle_inotify(self, directory, recursive): | ||||
|         logging.getLogger(__name__).info( | ||||
|             f"Using inotify to watch directory for changes: {directory}") | ||||
|  | ||||
|         inotify = INotify() | ||||
|         inotify_flags = flags.CLOSE_WRITE | flags.MOVED_TO | ||||
|         if recursive: | ||||
|             descriptor = inotify.add_watch_recursive(directory, inotify_flags) | ||||
|         else: | ||||
|             descriptor = inotify.add_watch(directory, inotify_flags) | ||||
|  | ||||
|         try: | ||||
|             while not self.stop_flag: | ||||
|                 for event in inotify.read(timeout=1000): | ||||
|                     if recursive: | ||||
|                         path = inotify.get_path(event.wd) | ||||
|                     else: | ||||
|                         path = directory | ||||
|                     filepath = os.path.join(path, event.name) | ||||
|                     _consume(filepath) | ||||
|         except KeyboardInterrupt: | ||||
|             pass | ||||
|  | ||||
|         inotify.rm_watch(descriptor) | ||||
|         inotify.close() | ||||
|   | ||||
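For context on `_tags_from_path` above, a standalone sketch of the path-to-tags idea (paths are made up; the real code additionally slugifies each part and runs `Tag.objects.get_or_create`):

```python
from pathlib import Path

consumption_dir = "/consume"  # stand-in for settings.CONSUMPTION_DIR
filepath = "/consume/taxes/2020/receipt.pdf"

# Every directory level between the consumption dir and the file itself
# becomes one tag.
parts = Path(filepath).relative_to(consumption_dir).parent.parts
print(parts)  # ('taxes', '2020')
```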
| @@ -7,7 +7,8 @@ from django.core import serializers | ||||
| from django.core.management.base import BaseCommand, CommandError | ||||
|  | ||||
| from documents.models import Document, Correspondent, Tag, DocumentType | ||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME | ||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \ | ||||
|     EXPORTER_ARCHIVE_NAME | ||||
| from paperless.db import GnuPG | ||||
| from ...mixins import Renderable | ||||
|  | ||||
| @@ -22,13 +23,6 @@ class Command(Renderable, BaseCommand): | ||||
|  | ||||
|     def add_arguments(self, parser): | ||||
|         parser.add_argument("target") | ||||
|         parser.add_argument( | ||||
|             "--legacy", | ||||
|             action="store_true", | ||||
|             help="Don't try to export all of the document data, just dump the " | ||||
|                  "original document files out in a format that makes " | ||||
|                  "re-consuming them easy." | ||||
|         ) | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         BaseCommand.__init__(self, *args, **kwargs) | ||||
| @@ -44,10 +38,10 @@ class Command(Renderable, BaseCommand): | ||||
|         if not os.access(self.target, os.W_OK): | ||||
|             raise CommandError("That path doesn't appear to be writable") | ||||
|  | ||||
|         if options["legacy"]: | ||||
|             self.dump_legacy() | ||||
|         else: | ||||
|             self.dump() | ||||
|         if os.listdir(self.target): | ||||
|             raise CommandError("That directory is not empty.") | ||||
|  | ||||
|         self.dump() | ||||
|  | ||||
|     def dump(self): | ||||
|  | ||||
| @@ -63,34 +57,56 @@ class Command(Renderable, BaseCommand): | ||||
|  | ||||
|             document = document_map[document_dict["pk"]] | ||||
|  | ||||
|             unique_filename = f"{document.pk:07}_{document.file_name}" | ||||
|             print(f"Exporting: {document}") | ||||
|  | ||||
|             file_target = os.path.join(self.target, unique_filename) | ||||
|             filename_counter = 0 | ||||
|             while True: | ||||
|                 original_name = document.get_public_filename( | ||||
|                     counter=filename_counter) | ||||
|                 original_target = os.path.join(self.target, original_name) | ||||
|  | ||||
|             thumbnail_name = unique_filename + "-thumbnail.png" | ||||
|                 if not os.path.exists(original_target): | ||||
|                     break | ||||
|                 else: | ||||
|                     filename_counter += 1 | ||||
|  | ||||
|             thumbnail_name = original_name + "-thumbnail.png" | ||||
|             thumbnail_target = os.path.join(self.target, thumbnail_name) | ||||
|  | ||||
|             document_dict[EXPORTER_FILE_NAME] = unique_filename | ||||
|             document_dict[EXPORTER_FILE_NAME] = original_name | ||||
|             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name | ||||
|  | ||||
|             print(f"Exporting: {file_target}") | ||||
|             if os.path.exists(document.archive_path): | ||||
|                 archive_name = document.get_public_filename( | ||||
|                     archive=True, counter=filename_counter, suffix="_archive") | ||||
|                 archive_target = os.path.join(self.target, archive_name) | ||||
|                 document_dict[EXPORTER_ARCHIVE_NAME] = archive_name | ||||
|             else: | ||||
|                 archive_target = None | ||||
|  | ||||
|             t = int(time.mktime(document.created.timetuple())) | ||||
|             if document.storage_type == Document.STORAGE_TYPE_GPG: | ||||
|  | ||||
|                 with open(file_target, "wb") as f: | ||||
|                 with open(original_target, "wb") as f: | ||||
|                     f.write(GnuPG.decrypted(document.source_file)) | ||||
|                     os.utime(file_target, times=(t, t)) | ||||
|                     os.utime(original_target, times=(t, t)) | ||||
|  | ||||
|                 with open(thumbnail_target, "wb") as f: | ||||
|                     f.write(GnuPG.decrypted(document.thumbnail_file)) | ||||
|                     os.utime(thumbnail_target, times=(t, t)) | ||||
|  | ||||
|                 if archive_target: | ||||
|                     with open(archive_target, "wb") as f: | ||||
|                         f.write(GnuPG.decrypted(document.archive_path)) | ||||
|                         os.utime(archive_target, times=(t, t)) | ||||
|             else: | ||||
|  | ||||
|                 shutil.copy(document.source_path, file_target) | ||||
|                 shutil.copy(document.source_path, original_target) | ||||
|                 shutil.copy(document.thumbnail_path, thumbnail_target) | ||||
|  | ||||
|                 if archive_target: | ||||
|                     shutil.copy(document.archive_path, archive_target) | ||||
|  | ||||
|         manifest += json.loads( | ||||
|             serializers.serialize("json", Correspondent.objects.all())) | ||||
|  | ||||
| @@ -102,33 +118,3 @@ class Command(Renderable, BaseCommand): | ||||
|  | ||||
|         with open(os.path.join(self.target, "manifest.json"), "w") as f: | ||||
|             json.dump(manifest, f, indent=2) | ||||
|  | ||||
|     def dump_legacy(self): | ||||
|  | ||||
|         for document in Document.objects.all(): | ||||
|  | ||||
|             target = os.path.join( | ||||
|                 self.target, self._get_legacy_file_name(document)) | ||||
|  | ||||
|             print("Exporting: {}".format(target)) | ||||
|  | ||||
|             with open(target, "wb") as f: | ||||
|                 f.write(GnuPG.decrypted(document.source_file)) | ||||
|                 t = int(time.mktime(document.created.timetuple())) | ||||
|                 os.utime(target, times=(t, t)) | ||||
|  | ||||
|     @staticmethod | ||||
|     def _get_legacy_file_name(doc): | ||||
|  | ||||
|         if not doc.correspondent and not doc.title: | ||||
|             return os.path.basename(doc.source_path) | ||||
|  | ||||
|         created = doc.created.strftime("%Y%m%d%H%M%SZ") | ||||
|         tags = ",".join([t.slug for t in doc.tags.all()]) | ||||
|  | ||||
|         if tags: | ||||
|             return "{} - {} - {} - {}{}".format( | ||||
|                 created, doc.correspondent, doc.title, tags, doc.file_type) | ||||
|  | ||||
|         return "{} - {} - {}{}".format( | ||||
|             created, doc.correspondent, doc.title, doc.file_type) | ||||
|   | ||||
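The exporter's new naming loop boils down to probing with an increasing counter until a free name is found. A generic sketch of that pattern (the helper name and signature are illustrative, not from the codebase):

```python
import os


def find_free_name(target_dir, make_name):
    # make_name(counter) builds a candidate file name; bump the counter
    # until no file with that name exists in target_dir yet.
    counter = 0
    while os.path.exists(os.path.join(target_dir, make_name(counter))):
        counter += 1
    return make_name(counter), counter
```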
| @@ -7,8 +7,8 @@ from django.core.management import call_command | ||||
| from django.core.management.base import BaseCommand, CommandError | ||||
|  | ||||
| from documents.models import Document | ||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME | ||||
| from paperless.db import GnuPG | ||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \ | ||||
|     EXPORTER_ARCHIVE_NAME | ||||
| from ...file_handling import generate_filename, create_source_path_directory | ||||
| from ...mixins import Renderable | ||||
|  | ||||
| @@ -79,25 +79,41 @@ class Command(Renderable, BaseCommand): | ||||
|                     'appear to be in the source directory.'.format(doc_file) | ||||
|                 ) | ||||
|  | ||||
|             if EXPORTER_ARCHIVE_NAME in record: | ||||
|                 archive_file = record[EXPORTER_ARCHIVE_NAME] | ||||
|                 if not os.path.exists(os.path.join(self.source, archive_file)): | ||||
|                     raise CommandError( | ||||
|                         f"The manifest file refers to {archive_file} which " | ||||
|                         f"does not appear to be in the source directory." | ||||
|                     ) | ||||
|  | ||||
|     def _import_files_from_manifest(self): | ||||
|  | ||||
|         storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|         if settings.PASSPHRASE: | ||||
|             storage_type = Document.STORAGE_TYPE_GPG | ||||
|         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) | ||||
|  | ||||
|         for record in self.manifest: | ||||
|  | ||||
|             if not record["model"] == "documents.document": | ||||
|                 continue | ||||
|  | ||||
|             doc_file = record[EXPORTER_FILE_NAME] | ||||
|             thumb_file = record[EXPORTER_THUMBNAIL_NAME] | ||||
|             document = Document.objects.get(pk=record["pk"]) | ||||
|  | ||||
|             doc_file = record[EXPORTER_FILE_NAME] | ||||
|             document_path = os.path.join(self.source, doc_file) | ||||
|  | ||||
|             thumb_file = record[EXPORTER_THUMBNAIL_NAME] | ||||
|             thumbnail_path = os.path.join(self.source, thumb_file) | ||||
|  | ||||
|             document.storage_type = storage_type | ||||
|             if EXPORTER_ARCHIVE_NAME in record: | ||||
|                 archive_file = record[EXPORTER_ARCHIVE_NAME] | ||||
|                 archive_path = os.path.join(self.source, archive_file) | ||||
|             else: | ||||
|                 archive_path = None | ||||
|  | ||||
|             document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|  | ||||
|             document.filename = generate_filename(document) | ||||
|  | ||||
|             if os.path.isfile(document.source_path): | ||||
| @@ -105,23 +121,10 @@ class Command(Renderable, BaseCommand): | ||||
|  | ||||
|             create_source_path_directory(document.source_path) | ||||
|  | ||||
|             if settings.PASSPHRASE: | ||||
|  | ||||
|                 with open(document_path, "rb") as unencrypted: | ||||
|                     with open(document.source_path, "wb") as encrypted: | ||||
|                         print("Encrypting {} and saving it to {}".format( | ||||
|                             doc_file, document.source_path)) | ||||
|                         encrypted.write(GnuPG.encrypted(unencrypted)) | ||||
|  | ||||
|                 with open(thumbnail_path, "rb") as unencrypted: | ||||
|                     with open(document.thumbnail_path, "wb") as encrypted: | ||||
|                         print("Encrypting {} and saving it to {}".format( | ||||
|                             thumb_file, document.thumbnail_path)) | ||||
|                         encrypted.write(GnuPG.encrypted(unencrypted)) | ||||
|  | ||||
|             else: | ||||
|                 print(f"Moving {document_path} to {document.source_path}") | ||||
|                 shutil.copy(document_path, document.source_path) | ||||
|                 shutil.copy(thumbnail_path, document.thumbnail_path) | ||||
|             print(f"Moving {document_path} to {document.source_path}") | ||||
|             shutil.copy(document_path, document.source_path) | ||||
|             shutil.copy(thumbnail_path, document.thumbnail_path) | ||||
|             if archive_path: | ||||
|                 shutil.copy(archive_path, document.archive_path) | ||||
|  | ||||
|             document.save() | ||||
|   | ||||
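As a usage sketch of the round trip (assuming paperless's document_exporter and document_importer command names; the export directory is a placeholder):

    from django.core.management import call_command

    # Writes manifest.json plus originals, thumbnails and archived versions.
    call_command("document_exporter", "/tmp/export")
    # Copies everything back; the archive file is restored when the manifest
    # record carries an __exported_archive_name__ entry.
    call_command("document_importer", "/tmp/export")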
| @@ -5,23 +5,6 @@ from django.db import migrations, models | ||||
| import django.db.models.deletion | ||||
|  | ||||
|  | ||||
| def make_index(apps, schema_editor): | ||||
|     Document = apps.get_model("documents", "Document") | ||||
|     documents = Document.objects.all() | ||||
|     print() | ||||
|     try: | ||||
|         print("  --> Creating document index...") | ||||
|         from whoosh.writing import AsyncWriter | ||||
|         from documents import index | ||||
|         ix = index.open_index(recreate=True) | ||||
|         with AsyncWriter(ix) as writer: | ||||
|             for document in documents: | ||||
|                 index.update_document(writer, document) | ||||
|     except ImportError: | ||||
|         # index may not be relevant anymore | ||||
|         print("  --> Cannot create document index.") | ||||
|  | ||||
|  | ||||
| def logs_set_default_group(apps, schema_editor): | ||||
|     Log = apps.get_model('documents', 'Log') | ||||
|     for log in Log.objects.all(): | ||||
| @@ -99,8 +82,4 @@ class Migration(migrations.Migration): | ||||
|             code=django.db.migrations.operations.special.RunPython.noop, | ||||
|             reverse_code=logs_set_default_group | ||||
|         ), | ||||
|         migrations.RunPython( | ||||
|             code=make_index, | ||||
|             reverse_code=django.db.migrations.operations.special.RunPython.noop, | ||||
|         ), | ||||
|     ] | ||||
|   | ||||
src/documents/migrations/1004_sanity_check_schedule.py (new file, 26 lines)
						| @@ -0,0 +1,26 @@ | ||||
| # Generated by Django 3.1.3 on 2020-11-25 14:53 | ||||
|  | ||||
| from django.db import migrations | ||||
| from django.db.migrations import RunPython | ||||
| from django_q.models import Schedule | ||||
| from django_q.tasks import schedule | ||||
|  | ||||
|  | ||||
| def add_schedules(apps, schema_editor): | ||||
|     schedule('documents.tasks.sanity_check', name="Perform sanity check", schedule_type=Schedule.WEEKLY) | ||||
|  | ||||
|  | ||||
| def remove_schedules(apps, schema_editor): | ||||
|     Schedule.objects.filter(func='documents.tasks.sanity_check').delete() | ||||
|  | ||||
|  | ||||
| class Migration(migrations.Migration): | ||||
|  | ||||
|     dependencies = [ | ||||
|         ('documents', '1003_mime_types'), | ||||
|         ('django_q', '0013_task_attempt_count'), | ||||
|     ] | ||||
|  | ||||
|     operations = [ | ||||
|         RunPython(add_schedules, remove_schedules) | ||||
|     ] | ||||
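To verify the schedule was created (for example from manage.py shell), one can filter on the same func string that remove_schedules() uses; this is a quick inspection sketch, not part of the migration:

    from django_q.models import Schedule

    # True once migration 1004 has run; the reverse migration deletes the row.
    Schedule.objects.filter(func='documents.tasks.sanity_check').exists()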
src/documents/migrations/1005_checksums.py (new file, 23 lines)
						| @@ -0,0 +1,23 @@ | ||||
| # Generated by Django 3.1.3 on 2020-11-29 00:48 | ||||
|  | ||||
| from django.db import migrations, models | ||||
|  | ||||
|  | ||||
| class Migration(migrations.Migration): | ||||
|  | ||||
|     dependencies = [ | ||||
|         ('documents', '1004_sanity_check_schedule'), | ||||
|     ] | ||||
|  | ||||
|     operations = [ | ||||
|         migrations.AddField( | ||||
|             model_name='document', | ||||
|             name='archive_checksum', | ||||
|             field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True), | ||||
|         ), | ||||
|         migrations.AlterField( | ||||
|             model_name='document', | ||||
|             name='checksum', | ||||
|             field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True), | ||||
|         ), | ||||
|     ] | ||||
| @@ -1,17 +1,21 @@ | ||||
| # coding=utf-8 | ||||
|  | ||||
| import datetime | ||||
| import logging | ||||
| import mimetypes | ||||
| import os | ||||
| import re | ||||
| from collections import OrderedDict | ||||
|  | ||||
| import pathvalidate | ||||
|  | ||||
| import dateutil.parser | ||||
| from django.conf import settings | ||||
| from django.db import models | ||||
| from django.utils import timezone | ||||
| from django.utils.text import slugify | ||||
|  | ||||
| from documents.file_handling import archive_name_from_filename | ||||
| from documents.parsers import get_default_file_extension | ||||
|  | ||||
|  | ||||
| class MatchingModel(models.Model): | ||||
|  | ||||
| @@ -157,9 +161,15 @@ class Document(models.Model): | ||||
|         max_length=32, | ||||
|         editable=False, | ||||
|         unique=True, | ||||
|         help_text="The checksum of the original document (before it was " | ||||
|                   "encrypted).  We use this to prevent duplicate document " | ||||
|                   "imports." | ||||
|         help_text="The checksum of the original document." | ||||
|     ) | ||||
|  | ||||
|     archive_checksum = models.CharField( | ||||
|         max_length=32, | ||||
|         editable=False, | ||||
|         blank=True, | ||||
|         null=True, | ||||
|         help_text="The checksum of the archived document." | ||||
|     ) | ||||
|  | ||||
|     created = models.DateTimeField( | ||||
| @@ -198,13 +208,11 @@ class Document(models.Model): | ||||
|         ordering = ("correspondent", "title") | ||||
|  | ||||
|     def __str__(self): | ||||
|         created = self.created.strftime("%Y%m%d%H%M%S") | ||||
|         created = datetime.date.isoformat(self.created) | ||||
|         if self.correspondent and self.title: | ||||
|             return "{}: {} - {}".format( | ||||
|                 created, self.correspondent, self.title) | ||||
|         if self.correspondent or self.title: | ||||
|             return "{}: {}".format(created, self.correspondent or self.title) | ||||
|         return str(created) | ||||
|             return f"{created} {self.correspondent} {self.title}" | ||||
|         else: | ||||
|             return f"{created} {self.title}" | ||||
|  | ||||
|     @property | ||||
|     def source_path(self): | ||||
| @@ -225,12 +233,40 @@ class Document(models.Model): | ||||
|         return open(self.source_path, "rb") | ||||
|  | ||||
|     @property | ||||
|     def file_name(self): | ||||
|         return slugify(str(self)) + self.file_type | ||||
|     def archive_path(self): | ||||
|         if self.filename: | ||||
|             fname = archive_name_from_filename(self.filename) | ||||
|         else: | ||||
|             fname = "{:07}.pdf".format(self.pk) | ||||
|  | ||||
|         return os.path.join( | ||||
|             settings.ARCHIVE_DIR, | ||||
|             fname | ||||
|         ) | ||||
|  | ||||
|     @property | ||||
|     def archive_file(self): | ||||
|         return open(self.archive_path, "rb") | ||||
|  | ||||
|     def get_public_filename(self, archive=False, counter=0, suffix=None): | ||||
|         result = str(self) | ||||
|  | ||||
|         if counter: | ||||
|             result += f"_{counter:02}" | ||||
|  | ||||
|         if suffix: | ||||
|             result += suffix | ||||
|  | ||||
|         if archive: | ||||
|             result += ".pdf" | ||||
|         else: | ||||
|             result += self.file_type | ||||
|  | ||||
|         return pathvalidate.sanitize_filename(result, replacement_text="-") | ||||
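To illustrate how get_public_filename() composes its parts (the example document and the resulting names are hypothetical; the prefix comes from Document.__str__):

    # A PDF titled "Invoice" from correspondent "ACME", created 2020-11-29:
    doc.get_public_filename()                # "2020-11-29 ACME Invoice.pdf"
    doc.get_public_filename(counter=2)       # "2020-11-29 ACME Invoice_02.pdf"
    doc.get_public_filename(suffix="_copy")  # "2020-11-29 ACME Invoice_copy.pdf"
    doc.get_public_filename(archive=True)    # archived version always gets ".pdf"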
|  | ||||
|     @property | ||||
|     def file_type(self): | ||||
|         return mimetypes.guess_extension(str(self.mime_type)) | ||||
|         return get_default_file_extension(self.mime_type) | ||||
|  | ||||
|     @property | ||||
|     def thumbnail_path(self): | ||||
|   | ||||
| @@ -1,4 +1,5 @@ | ||||
| import logging | ||||
| import mimetypes | ||||
| import os | ||||
| import re | ||||
| import shutil | ||||
| @@ -42,6 +43,40 @@ def is_mime_type_supported(mime_type): | ||||
|     return get_parser_class_for_mime_type(mime_type) is not None | ||||
|  | ||||
|  | ||||
| def get_default_file_extension(mime_type): | ||||
|     for response in document_consumer_declaration.send(None): | ||||
|         parser_declaration = response[1] | ||||
|         supported_mime_types = parser_declaration["mime_types"] | ||||
|  | ||||
|         if mime_type in supported_mime_types: | ||||
|             return supported_mime_types[mime_type] | ||||
|  | ||||
|     ext = mimetypes.guess_extension(mime_type) | ||||
|     if ext: | ||||
|         return ext | ||||
|     else: | ||||
|         return "" | ||||
|  | ||||
|  | ||||
| def is_file_ext_supported(ext): | ||||
|     if ext: | ||||
|         return ext.lower() in get_supported_file_extensions() | ||||
|     else: | ||||
|         return False | ||||
|  | ||||
|  | ||||
| def get_supported_file_extensions(): | ||||
|     extensions = set() | ||||
|     for response in document_consumer_declaration.send(None): | ||||
|         parser_declaration = response[1] | ||||
|         supported_mime_types = parser_declaration["mime_types"] | ||||
|  | ||||
|         for mime_type in supported_mime_types: | ||||
|             extensions.update(mimetypes.guess_all_extensions(mime_type)) | ||||
|  | ||||
|     return extensions | ||||
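A few illustrative calls for the helpers above; actual results depend on which parsers are registered and on the host's mimetypes database:

    get_default_file_extension("application/pdf")  # ".pdf"
    get_default_file_extension("bogus/type")       # "" (no parser, no mimetypes guess)
    is_file_ext_supported(".PDF")                  # True, lower-cased before the lookup
    is_file_ext_supported("")                      # False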
|  | ||||
|  | ||||
| def get_parser_class_for_mime_type(mime_type): | ||||
|  | ||||
|     options = [] | ||||
| @@ -107,21 +142,59 @@ def run_convert(input_file, | ||||
|         raise ParseError("Convert failed at {}".format(args)) | ||||
|  | ||||
|  | ||||
| def run_unpaper(pnm, logging_group=None): | ||||
|     pnm_out = pnm.replace(".pnm", ".unpaper.pnm") | ||||
| def parse_date(filename, text): | ||||
|     """ | ||||
|     Returns the date of the document. | ||||
|     """ | ||||
|  | ||||
|     command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm, | ||||
|                     pnm_out) | ||||
|     def __parser(ds, date_order): | ||||
|         """ | ||||
|         Call dateparser.parse with a particular date ordering | ||||
|         """ | ||||
|         return dateparser.parse( | ||||
|             ds, | ||||
|             settings={ | ||||
|                 "DATE_ORDER": date_order, | ||||
|                 "PREFER_DAY_OF_MONTH": "first", | ||||
|                 "RETURN_AS_TIMEZONE_AWARE": | ||||
|                 True | ||||
|             } | ||||
|         ) | ||||
|  | ||||
|     logger.debug(f"Execute: {' '.join(command_args)}", | ||||
|                  extra={'group': logging_group}) | ||||
|     date = None | ||||
|  | ||||
|     if not subprocess.Popen(command_args, | ||||
|                             stdout=subprocess.DEVNULL, | ||||
|                             stderr=subprocess.DEVNULL).wait() == 0: | ||||
|         raise ParseError(f"Unpaper failed at {command_args}") | ||||
|     next_year = timezone.now().year + 5  # Arbitrary 5 year future limit | ||||
|  | ||||
|     return pnm_out | ||||
|     # if filename date parsing is enabled, search there first: | ||||
|     if settings.FILENAME_DATE_ORDER: | ||||
|         for m in re.finditer(DATE_REGEX, filename): | ||||
|             date_string = m.group(0) | ||||
|  | ||||
|             try: | ||||
|                 date = __parser(date_string, settings.FILENAME_DATE_ORDER) | ||||
|             except (TypeError, ValueError): | ||||
|                 # Skip all matches that do not parse to a proper date | ||||
|                 continue | ||||
|  | ||||
|             if date is not None and next_year > date.year > 1900: | ||||
|                 return date | ||||
|  | ||||
|     # Iterate through all regex matches in text and try to parse the date | ||||
|     for m in re.finditer(DATE_REGEX, text): | ||||
|         date_string = m.group(0) | ||||
|  | ||||
|         try: | ||||
|             date = __parser(date_string, settings.DATE_ORDER) | ||||
|         except (TypeError, ValueError): | ||||
|             # Skip all matches that do not parse to a proper date | ||||
|             continue | ||||
|  | ||||
|         if date is not None and next_year > date.year > 1900: | ||||
|             break | ||||
|         else: | ||||
|             date = None | ||||
|  | ||||
|     return date | ||||
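A usage sketch for parse_date(); both arguments are scanned with DATE_REGEX, the filename only when FILENAME_DATE_ORDER is set (the file name and text here are made up):

    # Returns a timezone-aware datetime, or None when no match parses to a
    # plausible date (1900 < year < current year + 5).
    date = parse_date("scan_2020-11-29.pdf", "Amount due by 29.11.2020.")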
|  | ||||
|  | ||||
| class ParseError(Exception): | ||||
| @@ -134,27 +207,36 @@ class DocumentParser(LoggingMixin): | ||||
|     `paperless_tesseract.parsers` for inspiration. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, path, logging_group, progress_callback): | ||||
|     def __init__(self, logging_group, progress_callback): | ||||
|         super().__init__() | ||||
|         self.logging_group = logging_group | ||||
|         self.document_path = path | ||||
|         self.tempdir = tempfile.mkdtemp( | ||||
|             prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||
|  | ||||
|         self.archive_path = None | ||||
|         self.text = None | ||||
|         self.date = None | ||||
|         self.progress_callback = progress_callback | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def parse(self, document_path, mime_type): | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def get_archive_path(self): | ||||
|         return self.archive_path | ||||
|  | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         """ | ||||
|         Returns the path to a file we can use as a thumbnail for this document. | ||||
|         """ | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def optimise_thumbnail(self, in_path): | ||||
|  | ||||
|     def get_optimised_thumbnail(self, document_path, mime_type): | ||||
|         thumbnail = self.get_thumbnail(document_path, mime_type) | ||||
|         if settings.OPTIMIZE_THUMBNAILS: | ||||
|             out_path = os.path.join(self.tempdir, "optipng.png") | ||||
|             out_path = os.path.join(self.tempdir, "thumb_optipng.png") | ||||
|  | ||||
|             args = (settings.OPTIPNG_BINARY, | ||||
|                     "-silent", "-o5", in_path, "-out", out_path) | ||||
|                     "-silent", "-o5", thumbnail, "-out", out_path) | ||||
|  | ||||
|             self.log('debug', f"Execute: {' '.join(args)}") | ||||
|  | ||||
| @@ -163,97 +245,13 @@ class DocumentParser(LoggingMixin): | ||||
|  | ||||
|             return out_path | ||||
|         else: | ||||
|             return in_path | ||||
|  | ||||
|     def get_optimised_thumbnail(self): | ||||
|         return self.optimise_thumbnail(self.get_thumbnail()) | ||||
|             return thumbnail | ||||
|  | ||||
|     def get_text(self): | ||||
|         """ | ||||
|         Returns the text from the document and only the text. | ||||
|         """ | ||||
|         raise NotImplementedError() | ||||
|         return self.text | ||||
|  | ||||
|     def get_date(self): | ||||
|         """ | ||||
|         Returns the date of the document. | ||||
|         """ | ||||
|  | ||||
|         def __parser(ds, date_order): | ||||
|             """ | ||||
|             Call dateparser.parse with a particular date ordering | ||||
|             """ | ||||
|             return dateparser.parse( | ||||
|                 ds, | ||||
|                 settings={ | ||||
|                     "DATE_ORDER": date_order, | ||||
|                     "PREFER_DAY_OF_MONTH": "first", | ||||
|                     "RETURN_AS_TIMEZONE_AWARE": | ||||
|                     True | ||||
|                 } | ||||
|             ) | ||||
|  | ||||
|         date = None | ||||
|         date_string = None | ||||
|  | ||||
|         next_year = timezone.now().year + 5  # Arbitrary 5 year future limit | ||||
|         title = os.path.basename(self.document_path) | ||||
|  | ||||
|         # if filename date parsing is enabled, search there first: | ||||
|         if settings.FILENAME_DATE_ORDER: | ||||
|             self.log("info", "Checking document title for date") | ||||
|             for m in re.finditer(DATE_REGEX, title): | ||||
|                 date_string = m.group(0) | ||||
|  | ||||
|                 try: | ||||
|                     date = __parser(date_string, settings.FILENAME_DATE_ORDER) | ||||
|                 except (TypeError, ValueError): | ||||
|                     # Skip all matches that do not parse to a proper date | ||||
|                     continue | ||||
|  | ||||
|                 if date is not None and next_year > date.year > 1900: | ||||
|                     self.log( | ||||
|                         "info", | ||||
|                         "Detected document date {} based on string {} " | ||||
|                         "from document title" | ||||
|                         "".format(date.isoformat(), date_string) | ||||
|                     ) | ||||
|                     return date | ||||
|  | ||||
|         try: | ||||
|             # getting text after checking filename will save time if only | ||||
|             # looking at the filename instead of the whole text | ||||
|             text = self.get_text() | ||||
|         except ParseError: | ||||
|             return None | ||||
|  | ||||
|         # Iterate through all regex matches in text and try to parse the date | ||||
|         for m in re.finditer(DATE_REGEX, text): | ||||
|             date_string = m.group(0) | ||||
|  | ||||
|             try: | ||||
|                 date = __parser(date_string, settings.DATE_ORDER) | ||||
|             except (TypeError, ValueError): | ||||
|                 # Skip all matches that do not parse to a proper date | ||||
|                 continue | ||||
|  | ||||
|             if date is not None and next_year > date.year > 1900: | ||||
|                 break | ||||
|             else: | ||||
|                 date = None | ||||
|  | ||||
|         if date is not None: | ||||
|             self.log( | ||||
|                 "info", | ||||
|                 "Detected document date {} based on string {}".format( | ||||
|                     date.isoformat(), | ||||
|                     date_string | ||||
|                 ) | ||||
|             ) | ||||
|         else: | ||||
|             self.log("info", "Unable to detect date for document") | ||||
|  | ||||
|         return date | ||||
|         return self.date | ||||
|  | ||||
|     def cleanup(self): | ||||
|         self.log("debug", "Deleting directory {}".format(self.tempdir)) | ||||
|   | ||||
src/documents/sanity_checker.py (new file, 117 lines)
						| @@ -0,0 +1,117 @@ | ||||
| import hashlib | ||||
| import os | ||||
|  | ||||
| from django.conf import settings | ||||
|  | ||||
| from documents.models import Document | ||||
|  | ||||
|  | ||||
| class SanityMessage: | ||||
|     message = None | ||||
|  | ||||
|  | ||||
| class SanityWarning(SanityMessage): | ||||
|     def __init__(self, message): | ||||
|         self.message = message | ||||
|  | ||||
|     def __str__(self): | ||||
|         return f"Warning: {self.message}" | ||||
|  | ||||
|  | ||||
| class SanityError(SanityMessage): | ||||
|     def __init__(self, message): | ||||
|         self.message = message | ||||
|  | ||||
|     def __str__(self): | ||||
|         return f"ERROR: {self.message}" | ||||
|  | ||||
|  | ||||
| class SanityFailedError(Exception): | ||||
|  | ||||
|     def __init__(self, messages): | ||||
|         self.messages = messages | ||||
|  | ||||
|     def __str__(self): | ||||
|         message_string = "\n".join([str(m) for m in self.messages]) | ||||
|         return ( | ||||
|             f"The following issuse were found by the sanity checker:\n" | ||||
|             f"{message_string}\n\n===============\n\n") | ||||
|  | ||||
|  | ||||
| def check_sanity(): | ||||
|     messages = [] | ||||
|  | ||||
|     present_files = [] | ||||
|     for root, subdirs, files in os.walk(settings.MEDIA_ROOT): | ||||
|         for f in files: | ||||
|             present_files.append(os.path.normpath(os.path.join(root, f))) | ||||
|  | ||||
|     for doc in Document.objects.all(): | ||||
|         # Check sanity of the thumbnail | ||||
|         if not os.path.isfile(doc.thumbnail_path): | ||||
|             messages.append(SanityError( | ||||
|                 f"Thumbnail of document {doc.pk} does not exist.")) | ||||
|         else: | ||||
|             present_files.remove(os.path.normpath(doc.thumbnail_path)) | ||||
|             try: | ||||
|                 with doc.thumbnail_file as f: | ||||
|                     f.read() | ||||
|             except OSError as e: | ||||
|                 messages.append(SanityError( | ||||
|                     f"Cannot read thumbnail file of document {doc.pk}: {e}" | ||||
|                 )) | ||||
|  | ||||
|         # Check sanity of the original file | ||||
|         # TODO: extract method | ||||
|         if not os.path.isfile(doc.source_path): | ||||
|             messages.append(SanityError( | ||||
|                 f"Original of document {doc.pk} does not exist.")) | ||||
|         else: | ||||
|             present_files.remove(os.path.normpath(doc.source_path)) | ||||
|             try: | ||||
|                 with doc.source_file as f: | ||||
|                     checksum = hashlib.md5(f.read()).hexdigest() | ||||
|             except OSError as e: | ||||
|                 messages.append(SanityError( | ||||
|                     f"Cannot read original file of document {doc.pk}: {e}")) | ||||
|             else: | ||||
|                 if not checksum == doc.checksum: | ||||
|                     messages.append(SanityError( | ||||
|                         f"Checksum mismatch of document {doc.pk}. " | ||||
|                         f"Stored: {doc.checksum}, actual: {checksum}." | ||||
|                     )) | ||||
|  | ||||
|         # Check sanity of the archive file. | ||||
|         if doc.archive_checksum: | ||||
|             if not os.path.isfile(doc.archive_path): | ||||
|                 messages.append(SanityError( | ||||
|                     f"Archived version of document {doc.pk} does not exist." | ||||
|                 )) | ||||
|             else: | ||||
|                 present_files.remove(os.path.normpath(doc.archive_path)) | ||||
|                 try: | ||||
|                     with doc.archive_file as f: | ||||
|                         checksum = hashlib.md5(f.read()).hexdigest() | ||||
|                 except OSError as e: | ||||
|                     messages.append(SanityError( | ||||
|                         f"Cannot read archive file of document {doc.pk}: {e}" | ||||
|                     )) | ||||
|                 else: | ||||
|                     if not checksum == doc.archive_checksum: | ||||
|                         messages.append(SanityError( | ||||
|                             f"Checksum mismatch of archive {doc.pk}. " | ||||
|                             f"Stored: {doc.checksum}, actual: {checksum}." | ||||
|                         )) | ||||
|  | ||||
|         # other document checks | ||||
|         if not doc.content: | ||||
|             messages.append(SanityWarning( | ||||
|                 f"Document {doc.pk} has no content." | ||||
|             )) | ||||
|  | ||||
|     for extra_file in present_files: | ||||
|         messages.append(SanityWarning( | ||||
|             f"Orphaned file in media dir: {extra_file}" | ||||
|         )) | ||||
|  | ||||
|     return messages | ||||
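A minimal sketch of consuming the checker's result, mirroring what documents.tasks.sanity_check does later in this diff:

    messages = check_sanity()
    for message in messages:
        print(message)  # rendered as "ERROR: ..." or "Warning: ..." via __str__
    if messages:
        raise SanityFailedError(messages)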
| @@ -1,6 +1,9 @@ | ||||
| import magic | ||||
| from pathvalidate import validate_filename, ValidationError | ||||
| from rest_framework import serializers | ||||
|  | ||||
| from .models import Correspondent, Tag, Document, Log, DocumentType | ||||
| from .parsers import is_mime_type_supported | ||||
|  | ||||
|  | ||||
| class CorrespondentSerializer(serializers.HyperlinkedModelSerializer): | ||||
| @@ -76,11 +79,9 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField): | ||||
|  | ||||
| class DocumentSerializer(serializers.ModelSerializer): | ||||
|  | ||||
|     correspondent_id = CorrespondentField( | ||||
|         allow_null=True, source='correspondent') | ||||
|     tags_id = TagsField(many=True, source='tags') | ||||
|     document_type_id = DocumentTypeField( | ||||
|         allow_null=True, source='document_type') | ||||
|     correspondent = CorrespondentField(allow_null=True) | ||||
|     tags = TagsField(many=True) | ||||
|     document_type = DocumentTypeField(allow_null=True) | ||||
|  | ||||
|     class Meta: | ||||
|         model = Document | ||||
| @@ -88,19 +89,13 @@ class DocumentSerializer(serializers.ModelSerializer): | ||||
|         fields = ( | ||||
|             "id", | ||||
|             "correspondent", | ||||
|             "correspondent_id", | ||||
|             "document_type", | ||||
|             "document_type_id", | ||||
|             "title", | ||||
|             "content", | ||||
|             "mime_type", | ||||
|             "tags", | ||||
|             "tags_id", | ||||
|             "checksum", | ||||
|             "created", | ||||
|             "modified", | ||||
|             "added", | ||||
|             "file_name", | ||||
|             "archive_serial_number" | ||||
|         ) | ||||
|  | ||||
| @@ -116,3 +111,82 @@ class LogSerializer(serializers.ModelSerializer): | ||||
|             "group", | ||||
|             "level" | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class PostDocumentSerializer(serializers.Serializer): | ||||
|  | ||||
|     document = serializers.FileField( | ||||
|         label="Document", | ||||
|         write_only=True, | ||||
|     ) | ||||
|  | ||||
|     title = serializers.CharField( | ||||
|         label="Title", | ||||
|         write_only=True, | ||||
|         required=False, | ||||
|     ) | ||||
|  | ||||
|     correspondent = serializers.PrimaryKeyRelatedField( | ||||
|         queryset=Correspondent.objects.all(), | ||||
|         label="Correspondent", | ||||
|         allow_null=True, | ||||
|         write_only=True, | ||||
|         required=False, | ||||
|     ) | ||||
|  | ||||
|     document_type = serializers.PrimaryKeyRelatedField( | ||||
|         queryset=DocumentType.objects.all(), | ||||
|         label="Document type", | ||||
|         allow_null=True, | ||||
|         write_only=True, | ||||
|         required=False, | ||||
|     ) | ||||
|  | ||||
|     tags = serializers.PrimaryKeyRelatedField( | ||||
|         many=True, | ||||
|         queryset=Tag.objects.all(), | ||||
|         label="Tags", | ||||
|         write_only=True, | ||||
|         required=False, | ||||
|     ) | ||||
|  | ||||
|     def validate_document(self, document): | ||||
|  | ||||
|         try: | ||||
|             validate_filename(document.name) | ||||
|         except ValidationError: | ||||
|             raise serializers.ValidationError("Invalid filename.") | ||||
|  | ||||
|         document_data = document.file.read() | ||||
|         mime_type = magic.from_buffer(document_data, mime=True) | ||||
|  | ||||
|         if not is_mime_type_supported(mime_type): | ||||
|             raise serializers.ValidationError( | ||||
|                 "This file type is not supported.") | ||||
|  | ||||
|         return document.name, document_data | ||||
|  | ||||
|     def validate_title(self, title): | ||||
|         if title: | ||||
|             return title | ||||
|         else: | ||||
|             # do not return empty strings. | ||||
|             return None | ||||
|  | ||||
|     def validate_correspondent(self, correspondent): | ||||
|         if correspondent: | ||||
|             return correspondent.id | ||||
|         else: | ||||
|             return None | ||||
|  | ||||
|     def validate_document_type(self, document_type): | ||||
|         if document_type: | ||||
|             return document_type.id | ||||
|         else: | ||||
|             return None | ||||
|  | ||||
|     def validate_tags(self, tags): | ||||
|         if tags: | ||||
|             return [tag.id for tag in tags] | ||||
|         else: | ||||
|             return None | ||||
|   | ||||
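For reference, a hedged sketch of the upload request this serializer validates; the endpoint path appears in the tests further down, while the host, credentials and metadata values are placeholders:

    import requests

    with open("simple.pdf", "rb") as f:
        response = requests.post(
            "http://localhost:8000/api/documents/post_document/",
            files={"document": f},
            data={"title": "My invoice"},  # optional, like the other metadata fields
            auth=("user", "password"),
        )
    response.raise_for_status()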
| @@ -2,3 +2,4 @@ | ||||
| # for exporting/importing commands | ||||
| EXPORTER_FILE_NAME = "__exported_file_name__" | ||||
| EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__" | ||||
| EXPORTER_ARCHIVE_NAME = "__exported_archive_name__" | ||||
|   | ||||
| @@ -9,10 +9,11 @@ from django.contrib.contenttypes.models import ContentType | ||||
| from django.db import models, DatabaseError | ||||
| from django.dispatch import receiver | ||||
| from django.utils import timezone | ||||
| from rest_framework.reverse import reverse | ||||
|  | ||||
| from .. import index, matching | ||||
| from ..file_handling import delete_empty_directories, generate_filename, \ | ||||
|     create_source_path_directory | ||||
|     create_source_path_directory, archive_name_from_filename | ||||
| from ..models import Document, Tag | ||||
|  | ||||
|  | ||||
| @@ -156,11 +157,11 @@ def run_post_consume_script(sender, document, **kwargs): | ||||
|     Popen(( | ||||
|         settings.POST_CONSUME_SCRIPT, | ||||
|         str(document.pk), | ||||
|         document.file_name, | ||||
|         document.source_path, | ||||
|         document.thumbnail_path, | ||||
|         None, | ||||
|         None, | ||||
|         document.get_public_filename(), | ||||
|         os.path.normpath(document.source_path), | ||||
|         os.path.normpath(document.thumbnail_path), | ||||
|         reverse("document-download", kwargs={"pk": document.pk}), | ||||
|         reverse("document-thumb", kwargs={"pk": document.pk}), | ||||
|         str(document.correspondent), | ||||
|         str(",".join(document.tags.all().values_list("slug", flat=True))) | ||||
|     )).wait() | ||||
| @@ -168,13 +169,46 @@ def run_post_consume_script(sender, document, **kwargs): | ||||
|  | ||||
| @receiver(models.signals.post_delete, sender=Document) | ||||
| def cleanup_document_deletion(sender, instance, using, **kwargs): | ||||
|     for f in (instance.source_path, instance.thumbnail_path): | ||||
|         try: | ||||
|             os.unlink(f) | ||||
|         except FileNotFoundError: | ||||
|             pass  # The file's already gone, so we're cool with it. | ||||
|     for f in (instance.source_path, | ||||
|               instance.archive_path, | ||||
|               instance.thumbnail_path): | ||||
|         if os.path.isfile(f): | ||||
|             try: | ||||
|                 os.unlink(f) | ||||
|                 logging.getLogger(__name__).debug( | ||||
|                     f"Deleted file {f}.") | ||||
|             except OSError as e: | ||||
|                 logging.getLogger(__name__).warning( | ||||
|                     f"While deleting document {str(instance)}, the file " | ||||
|                     f"{f} could not be deleted: {e}" | ||||
|                 ) | ||||
|  | ||||
|     delete_empty_directories(os.path.dirname(instance.source_path)) | ||||
|     delete_empty_directories( | ||||
|         os.path.dirname(instance.source_path), | ||||
|         root=settings.ORIGINALS_DIR | ||||
|     ) | ||||
|  | ||||
|     delete_empty_directories( | ||||
|         os.path.dirname(instance.archive_path), | ||||
|         root=settings.ARCHIVE_DIR | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def validate_move(instance, old_path, new_path): | ||||
|     if not os.path.isfile(old_path): | ||||
|         # Can't do anything if the old file does not exist anymore. | ||||
|         logging.getLogger(__name__).fatal( | ||||
|             f"Document {str(instance)}: File {old_path} has gone.") | ||||
|         return False | ||||
|  | ||||
|     if os.path.isfile(new_path): | ||||
|         # Can't do anything if the new file already exists. Skip updating file. | ||||
|         logging.getLogger(__name__).warning( | ||||
|             f"Document {str(instance)}: Cannot rename file " | ||||
|             f"since target path {new_path} already exists.") | ||||
|         return False | ||||
|  | ||||
|     return True | ||||
|  | ||||
|  | ||||
| @receiver(models.signals.m2m_changed, sender=Document.tags.through) | ||||
| @@ -182,51 +216,91 @@ def cleanup_document_deletion(sender, instance, using, **kwargs): | ||||
| def update_filename_and_move_files(sender, instance, **kwargs): | ||||
|  | ||||
|     if not instance.filename: | ||||
|         # Can't update the filename if there is not filename to begin with | ||||
|         # This happens after the consumer creates a new document. | ||||
|         # The PK needs to be set first by saving the document once. When this | ||||
|         # happens, the file is not yet in the ORIGINALS_DIR, and thus can't be | ||||
|         # renamed anyway. In all other cases, instance.filename will be set. | ||||
|         # Can't update the filename if there is no filename to begin with | ||||
|         # This happens when the consumer creates a new document. | ||||
|         # The document is modified and saved multiple times, and only after | ||||
|         # everything is done (i.e., the generated filename is final), | ||||
|         # filename will be set to the location where the consumer has put | ||||
|         # the file. | ||||
|         # | ||||
|         # This will in turn cause this logic to move the file where it belongs. | ||||
|         return | ||||
|  | ||||
|     old_filename = instance.filename | ||||
|     old_path = instance.source_path | ||||
|     new_filename = generate_filename(instance) | ||||
|  | ||||
|     if new_filename == instance.filename: | ||||
|         # Don't do anything if its the same. | ||||
|         return | ||||
|  | ||||
|     new_path = os.path.join(settings.ORIGINALS_DIR, new_filename) | ||||
|     old_source_path = instance.source_path | ||||
|     new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename) | ||||
|  | ||||
|     if not os.path.isfile(old_path): | ||||
|         # Can't do anything if the old file does not exist anymore. | ||||
|         logging.getLogger(__name__).fatal( | ||||
|             f"Document {str(instance)}: File {old_path} has gone.") | ||||
|     if not validate_move(instance, old_source_path, new_source_path): | ||||
|         return | ||||
|  | ||||
|     if os.path.isfile(new_path): | ||||
|         # Can't do anything if the new file already exists. Skip updating file. | ||||
|         logging.getLogger(__name__).warning( | ||||
|             f"Document {str(instance)}: Cannot rename file " | ||||
|             f"since target path {new_path} already exists.") | ||||
|         return | ||||
|     # archive files are optional, archive checksum tells us if we have one, | ||||
|     # since this is None for documents without archived files. | ||||
|     if instance.archive_checksum: | ||||
|         new_archive_filename = archive_name_from_filename(new_filename) | ||||
|         old_archive_path = instance.archive_path | ||||
|         new_archive_path = os.path.join(settings.ARCHIVE_DIR, | ||||
|                                         new_archive_filename) | ||||
|  | ||||
|     create_source_path_directory(new_path) | ||||
|         if not validate_move(instance, old_archive_path, new_archive_path): | ||||
|             return | ||||
|  | ||||
|         create_source_path_directory(new_archive_path) | ||||
|     else: | ||||
|         old_archive_path = None | ||||
|         new_archive_path = None | ||||
|  | ||||
|     create_source_path_directory(new_source_path) | ||||
|  | ||||
|     try: | ||||
|         os.rename(old_path, new_path) | ||||
|         os.rename(old_source_path, new_source_path) | ||||
|         if instance.archive_checksum: | ||||
|             os.rename(old_archive_path, new_archive_path) | ||||
|         instance.filename = new_filename | ||||
|         instance.save() | ||||
|         # Don't save here to prevent infinite recursion. | ||||
|         Document.objects.filter(pk=instance.pk).update(filename=new_filename) | ||||
|  | ||||
|         logging.getLogger(__name__).debug( | ||||
|             f"Moved file {old_source_path} to {new_source_path}.") | ||||
|  | ||||
|         if instance.archive_checksum: | ||||
|             logging.getLogger(__name__).debug( | ||||
|                 f"Moved file {old_archive_path} to {new_archive_path}.") | ||||
|  | ||||
|     except OSError as e: | ||||
|         instance.filename = old_filename | ||||
|         # this happens when we can't move a file. If that's the case for the | ||||
|         # archive file, we try our best to revert the changes. | ||||
|         try: | ||||
|             os.rename(new_source_path, old_source_path) | ||||
|             os.rename(new_archive_path, old_archive_path) | ||||
|         except Exception as e: | ||||
|             # This is fine, since: | ||||
|             # A: if we managed to move the source from A to B, we will also | ||||
|             #    manage to move it back from B to A. If not, we have a serious | ||||
|             #    issue that the sanity checker will catch. All files remain in | ||||
|             #    place and are never overwritten, so this is not the end of | ||||
|             #    the world. | ||||
|             # B: if moving the original file failed, nothing has changed anyway. | ||||
|             pass | ||||
|     except DatabaseError as e: | ||||
|         os.rename(new_path, old_path) | ||||
|         os.rename(new_source_path, old_source_path) | ||||
|         if instance.archive_checksum: | ||||
|             os.rename(new_archive_path, old_archive_path) | ||||
|         instance.filename = old_filename | ||||
|  | ||||
|     if not os.path.isfile(old_path): | ||||
|         delete_empty_directories(os.path.dirname(old_path)) | ||||
|     if not os.path.isfile(old_source_path): | ||||
|         delete_empty_directories(os.path.dirname(old_source_path), | ||||
|                                  root=settings.ORIGINALS_DIR) | ||||
|  | ||||
|     if old_archive_path and not os.path.isfile(old_archive_path): | ||||
|         delete_empty_directories(os.path.dirname(old_archive_path), | ||||
|                                  root=settings.ARCHIVE_DIR) | ||||
|  | ||||
|  | ||||
| def set_log_entry(sender, document=None, logging_group=None, **kwargs): | ||||
|   | ||||
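One design note on the rename handler above: persisting the new filename with QuerySet.update() instead of instance.save() matters, because update() issues a single SQL UPDATE and fires no signals, so the post_save/m2m_changed handler is not re-entered:

    # Signal-safe persistence of the new filename; does not recurse.
    Document.objects.filter(pk=instance.pk).update(filename=new_filename)
    # instance.save()  # would re-trigger update_filename_and_move_files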
| @@ -3,15 +3,18 @@ import logging | ||||
| from django.conf import settings | ||||
| from whoosh.writing import AsyncWriter | ||||
|  | ||||
| from documents import index | ||||
| from documents import index, sanity_checker | ||||
| from documents.classifier import DocumentClassifier, \ | ||||
|     IncompatibleClassifierVersionError | ||||
| from documents.consumer import Consumer, ConsumerError | ||||
| from documents.models import Document | ||||
| from documents.sanity_checker import SanityFailedError | ||||
|  | ||||
|  | ||||
| def index_optimize(): | ||||
|     index.open_index().optimize() | ||||
|     ix = index.open_index() | ||||
|     writer = AsyncWriter(ix) | ||||
|     writer.commit(optimize=True) | ||||
|  | ||||
|  | ||||
| def index_reindex(): | ||||
| @@ -74,3 +77,12 @@ def consume_file(path, | ||||
|     else: | ||||
|         raise ConsumerError("Unknown error: Returned document was null, but " | ||||
|                             "no error message was given.") | ||||
|  | ||||
|  | ||||
| def sanity_check(): | ||||
|     messages = sanity_checker.check_sanity() | ||||
|  | ||||
|     if len(messages) > 0: | ||||
|         raise SanityFailedError(messages) | ||||
|     else: | ||||
|         return "No issues detected." | ||||
|   | ||||
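On the index_optimize() change above: AsyncWriter transparently waits for the index lock in a background thread when another writer holds it, so the optimize commit no longer fails outright in that case. The pattern, restated as a standalone sketch:

    from whoosh.writing import AsyncWriter
    from documents import index

    ix = index.open_index()
    writer = AsyncWriter(ix)      # defers to a thread if the lock is taken
    writer.commit(optimize=True)  # merges index segments during the commit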
(modified image file: 32 KiB before, 32 KiB after)
src/documents/tests/samples/documents/archive/0000001.pdf (new binary file)
src/documents/tests/samples/documents/originals/0000002.pdf.gpg (new binary file)
src/documents/tests/samples/documents/thumbnails/0000001.png (new binary file, 7.7 KiB)
src/documents/tests/samples/documents/thumbnails/0000002.png.gpg (new binary file)
src/documents/tests/samples/simple.pdf (new binary file)
src/documents/tests/samples/simple.zip (new binary file)
						| @@ -1,40 +1,25 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from unittest import mock | ||||
|  | ||||
| from django.contrib.auth.models import User | ||||
| from django.test import override_settings | ||||
| from pathvalidate import ValidationError | ||||
| from rest_framework.test import APITestCase | ||||
| from whoosh.writing import AsyncWriter | ||||
|  | ||||
| from documents import index | ||||
| from documents.models import Document, Correspondent, DocumentType, Tag | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| class DocumentApiTest(APITestCase): | ||||
| class TestDocumentApi(DirectoriesMixin, APITestCase): | ||||
|  | ||||
|     def setUp(self): | ||||
|         self.scratch_dir = tempfile.mkdtemp() | ||||
|         self.media_dir = tempfile.mkdtemp() | ||||
|         self.originals_dir = os.path.join(self.media_dir, "documents", "originals") | ||||
|         self.thumbnail_dir = os.path.join(self.media_dir, "documents", "thumbnails") | ||||
|  | ||||
|         os.makedirs(self.originals_dir, exist_ok=True) | ||||
|         os.makedirs(self.thumbnail_dir, exist_ok=True) | ||||
|  | ||||
|         override_settings( | ||||
|             SCRATCH_DIR=self.scratch_dir, | ||||
|             MEDIA_ROOT=self.media_dir, | ||||
|             ORIGINALS_DIR=self.originals_dir, | ||||
|             THUMBNAIL_DIR=self.thumbnail_dir | ||||
|         ).enable() | ||||
|         super(TestDocumentApi, self).setUp() | ||||
|  | ||||
|         user = User.objects.create_superuser(username="temp_admin") | ||||
|         self.client.force_login(user=user) | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.scratch_dir, ignore_errors=True) | ||||
|         shutil.rmtree(self.media_dir, ignore_errors=True) | ||||
|  | ||||
|     def testDocuments(self): | ||||
|  | ||||
|         response = self.client.get("/api/documents/").data | ||||
| @@ -56,20 +41,13 @@ class DocumentApiTest(APITestCase): | ||||
|         returned_doc = response.data['results'][0] | ||||
|         self.assertEqual(returned_doc['id'], doc.id) | ||||
|         self.assertEqual(returned_doc['title'], doc.title) | ||||
|         self.assertEqual(returned_doc['correspondent']['name'], c.name) | ||||
|         self.assertEqual(returned_doc['document_type']['name'], dt.name) | ||||
|         self.assertEqual(returned_doc['correspondent']['id'], c.id) | ||||
|         self.assertEqual(returned_doc['document_type']['id'], dt.id) | ||||
|         self.assertEqual(returned_doc['correspondent']['id'], returned_doc['correspondent_id']) | ||||
|         self.assertEqual(returned_doc['document_type']['id'], returned_doc['document_type_id']) | ||||
|         self.assertEqual(len(returned_doc['tags']), 1) | ||||
|         self.assertEqual(returned_doc['tags'][0]['name'], tag.name) | ||||
|         self.assertEqual(returned_doc['tags'][0]['id'], tag.id) | ||||
|         self.assertListEqual(returned_doc['tags_id'], [tag.id]) | ||||
|         self.assertEqual(returned_doc['correspondent'], c.id) | ||||
|         self.assertEqual(returned_doc['document_type'], dt.id) | ||||
|         self.assertListEqual(returned_doc['tags'], [tag.id]) | ||||
|  | ||||
|         c2 = Correspondent.objects.create(name="c2") | ||||
|  | ||||
|         returned_doc['correspondent_id'] = c2.pk | ||||
|         returned_doc['correspondent'] = c2.pk | ||||
|         returned_doc['title'] = "the new title" | ||||
|  | ||||
|         response = self.client.put('/api/documents/{}/'.format(doc.pk), returned_doc, format='json') | ||||
| @@ -87,7 +65,7 @@ class DocumentApiTest(APITestCase): | ||||
|  | ||||
|     def test_document_actions(self): | ||||
|  | ||||
|         _, filename = tempfile.mkstemp(dir=self.originals_dir) | ||||
|         _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir) | ||||
|  | ||||
|         content = b"This is a test" | ||||
|         content_thumbnail = b"thumbnail content" | ||||
| @@ -97,7 +75,7 @@ class DocumentApiTest(APITestCase): | ||||
|  | ||||
|         doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf") | ||||
|  | ||||
|         with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f: | ||||
|         with open(os.path.join(self.dirs.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f: | ||||
|             f.write(content_thumbnail) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/download/'.format(doc.pk)) | ||||
| @@ -115,6 +93,44 @@ class DocumentApiTest(APITestCase): | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content_thumbnail) | ||||
|  | ||||
|     def test_download_with_archive(self): | ||||
|  | ||||
|         _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir) | ||||
|  | ||||
|         content = b"This is a test" | ||||
|         content_archive = b"This is the same test but archived" | ||||
|  | ||||
|         with open(filename, "wb") as f: | ||||
|             f.write(content) | ||||
|  | ||||
|         filename = os.path.basename(filename) | ||||
|  | ||||
|         doc = Document.objects.create(title="none", filename=filename, | ||||
|                                       mime_type="application/pdf") | ||||
|  | ||||
|         with open(doc.archive_path, "wb") as f: | ||||
|             f.write(content_archive) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/download/'.format(doc.pk)) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content_archive) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/download/?original=true'.format(doc.pk)) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/preview/'.format(doc.pk)) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content_archive) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/preview/?original=true'.format(doc.pk)) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content) | ||||
|  | ||||
|     def test_document_actions_not_existing_file(self): | ||||
|  | ||||
|         doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf") | ||||
| @@ -179,6 +195,109 @@ class DocumentApiTest(APITestCase): | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(len(results), 3) | ||||
|  | ||||
|     def test_search_no_query(self): | ||||
|         response = self.client.get("/api/search/") | ||||
|         results = response.data['results'] | ||||
|  | ||||
|         self.assertEqual(len(results), 0) | ||||
|  | ||||
|     def test_search(self): | ||||
|         d1=Document.objects.create(title="invoice", content="the thing i bought at a shop and paid with bank account", checksum="A", pk=1) | ||||
|         d2=Document.objects.create(title="bank statement 1", content="things i paid for in august", pk=2, checksum="B") | ||||
|         d3=Document.objects.create(title="bank statement 3", content="things i paid for in september", pk=3, checksum="C") | ||||
|         with AsyncWriter(index.open_index()) as writer: | ||||
|             # Note to future self: there is a reason we don't use a model signal handler to update the index: some operations edit many documents at once | ||||
|             # (retagger, renamer) and we don't want to open a writer for each of these, but rather perform the entire operation with one writer. | ||||
|             # That's why we can't open the writer in a model on_save handler or something. | ||||
|             index.update_document(writer, d1) | ||||
|             index.update_document(writer, d2) | ||||
|             index.update_document(writer, d3) | ||||
|         response = self.client.get("/api/search/?query=bank") | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(response.data['count'], 3) | ||||
|         self.assertEqual(response.data['page'], 1) | ||||
|         self.assertEqual(response.data['page_count'], 1) | ||||
|         self.assertEqual(len(results), 3) | ||||
|  | ||||
|         response = self.client.get("/api/search/?query=september") | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(response.data['count'], 1) | ||||
|         self.assertEqual(response.data['page'], 1) | ||||
|         self.assertEqual(response.data['page_count'], 1) | ||||
|         self.assertEqual(len(results), 1) | ||||
|  | ||||
|         response = self.client.get("/api/search/?query=statement") | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(response.data['count'], 2) | ||||
|         self.assertEqual(response.data['page'], 1) | ||||
|         self.assertEqual(response.data['page_count'], 1) | ||||
|         self.assertEqual(len(results), 2) | ||||
|  | ||||
|         response = self.client.get("/api/search/?query=sfegdfg") | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(response.data['count'], 0) | ||||
|         self.assertEqual(response.data['page'], 0) | ||||
|         self.assertEqual(response.data['page_count'], 0) | ||||
|         self.assertEqual(len(results), 0) | ||||
|  | ||||
|     def test_search_multi_page(self): | ||||
|         with AsyncWriter(index.open_index()) as writer: | ||||
|             for i in range(55): | ||||
|                 doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content="content") | ||||
|                 index.update_document(writer, doc) | ||||
|  | ||||
|         # This is here so that we test that no document gets returned twice (might happen if the paging is not working) | ||||
|         seen_ids = [] | ||||
|  | ||||
|         for i in range(1, 6): | ||||
|             response = self.client.get(f"/api/search/?query=content&page={i}") | ||||
|             results = response.data['results'] | ||||
|             self.assertEqual(response.data['count'], 55) | ||||
|             self.assertEqual(response.data['page'], i) | ||||
|             self.assertEqual(response.data['page_count'], 6) | ||||
|             self.assertEqual(len(results), 10) | ||||
|  | ||||
|             for result in results: | ||||
|                 self.assertNotIn(result['id'], seen_ids) | ||||
|                 seen_ids.append(result['id']) | ||||
|  | ||||
|         response = self.client.get(f"/api/search/?query=content&page=6") | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(response.data['count'], 55) | ||||
|         self.assertEqual(response.data['page'], 6) | ||||
|         self.assertEqual(response.data['page_count'], 6) | ||||
|         self.assertEqual(len(results), 5) | ||||
|  | ||||
|         for result in results: | ||||
|             self.assertNotIn(result['id'], seen_ids) | ||||
|             seen_ids.append(result['id']) | ||||
|  | ||||
|         response = self.client.get(f"/api/search/?query=content&page=7") | ||||
|         results = response.data['results'] | ||||
|         self.assertEqual(response.data['count'], 55) | ||||
|         self.assertEqual(response.data['page'], 6) | ||||
|         self.assertEqual(response.data['page_count'], 6) | ||||
|         self.assertEqual(len(results), 5) | ||||
|  | ||||
|     def test_search_invalid_page(self): | ||||
|         with AsyncWriter(index.open_index()) as writer: | ||||
|             for i in range(15): | ||||
|                 doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content="content") | ||||
|                 index.update_document(writer, doc) | ||||
|  | ||||
|         first_page = self.client.get(f"/api/search/?query=content&page=1").data | ||||
|         second_page = self.client.get(f"/api/search/?query=content&page=2").data | ||||
|         should_be_first_page_1 = self.client.get(f"/api/search/?query=content&page=0").data | ||||
|         should_be_first_page_2 = self.client.get(f"/api/search/?query=content&page=dgfd").data | ||||
|         should_be_first_page_3 = self.client.get(f"/api/search/?query=content&page=").data | ||||
|         should_be_first_page_4 = self.client.get(f"/api/search/?query=content&page=-7868").data | ||||
|  | ||||
|         self.assertDictEqual(first_page, should_be_first_page_1) | ||||
|         self.assertDictEqual(first_page, should_be_first_page_2) | ||||
|         self.assertDictEqual(first_page, should_be_first_page_3) | ||||
|         self.assertDictEqual(first_page, should_be_first_page_4) | ||||
|         self.assertNotEqual(len(first_page['results']), len(second_page['results'])) | ||||
|  | ||||
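|     # A sketch (not the actual view code) of the page handling the two tests | ||||
|     # above assume: missing, non-numeric, or out-of-range page values resolve | ||||
|     # to the nearest valid page. `clamp_page` is a hypothetical helper named | ||||
|     # here only for illustration: | ||||
|     # | ||||
|     #     def clamp_page(raw, page_count): | ||||
|     #         try: | ||||
|     #             page = int(raw) | ||||
|     #         except (TypeError, ValueError): | ||||
|     #             page = 1 | ||||
|     #         return max(1, min(page, max(page_count, 1))) | ||||
|     # | ||||
|     # clamp_page("0", 6) == clamp_page("dgfd", 6) == clamp_page("-7868", 6) == 1 | ||||
|  | ||||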
|     @mock.patch("documents.index.autocomplete") | ||||
|     def test_search_autocomplete(self, m): | ||||
|         m.side_effect = lambda ix, term, limit: [term for _ in range(limit)] | ||||
| @@ -201,6 +320,22 @@ class DocumentApiTest(APITestCase): | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(len(response.data), 10) | ||||
|  | ||||
|     def test_search_spelling_correction(self): | ||||
|         with AsyncWriter(index.open_index()) as writer: | ||||
|             for i in range(55): | ||||
|                 doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content=f"Things document {i+1}") | ||||
|                 index.update_document(writer, doc) | ||||
|  | ||||
|         response = self.client.get("/api/search/?query=thing") | ||||
|         correction = response.data['corrected_query'] | ||||
|  | ||||
|         self.assertEqual(correction, "things") | ||||
|  | ||||
|         response = self.client.get("/api/search/?query=things") | ||||
|         correction = response.data['corrected_query'] | ||||
|  | ||||
|         self.assertIsNone(correction) | ||||
|  | ||||
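|     # A sketch of how `corrected_query` could be produced with Whoosh's query | ||||
|     # correction; the "content" field name and the None-when-unchanged | ||||
|     # behaviour are assumptions based on what the test above asserts: | ||||
|     # | ||||
|     #     q = QueryParser("content", ix.schema).parse(query_string) | ||||
|     #     corrected = searcher.correct_query(q, query_string) | ||||
|     #     if corrected.string != query_string: | ||||
|     #         return corrected.string   # e.g. "thing" -> "things" | ||||
|     #     return None                   # already spelled correctly | ||||
|  | ||||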
|     def test_statistics(self): | ||||
|  | ||||
|         doc1 = Document.objects.create(title="none1", checksum="A") | ||||
| @@ -215,3 +350,128 @@ class DocumentApiTest(APITestCase): | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.data['documents_total'], 3) | ||||
|         self.assertEqual(response.data['documents_inbox'], 1) | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload(self, m): | ||||
|  | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f}) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|  | ||||
|         m.assert_called_once() | ||||
|  | ||||
|         args, kwargs = m.call_args | ||||
|         self.assertEqual(kwargs['override_filename'], "simple.pdf") | ||||
|         self.assertIsNone(kwargs['override_title']) | ||||
|         self.assertIsNone(kwargs['override_correspondent_id']) | ||||
|         self.assertIsNone(kwargs['override_document_type_id']) | ||||
|         self.assertIsNone(kwargs['override_tag_ids']) | ||||
|  | ||||
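|     # The upload tests here all inspect the same call shape; a sketch, | ||||
|     # assuming the view enqueues consumption roughly like this (the task name | ||||
|     # and temp-file handling are illustrative, not verbatim): | ||||
|     # | ||||
|     #     async_task("documents.tasks.consume_file", | ||||
|     #                temp_path,                       # copy of the upload | ||||
|     #                override_filename="simple.pdf", | ||||
|     #                override_title=None, | ||||
|     #                override_correspondent_id=None, | ||||
|     #                override_document_type_id=None, | ||||
|     #                override_tag_ids=None) | ||||
|  | ||||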
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_invalid_form(self, m): | ||||
|  | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"documenst": f}) | ||||
|         self.assertEqual(response.status_code, 400) | ||||
|         m.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_invalid_file(self, m): | ||||
|  | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.zip"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f}) | ||||
|         self.assertEqual(response.status_code, 400) | ||||
|         m.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     @mock.patch("documents.serialisers.validate_filename") | ||||
|     def test_upload_invalid_filename(self, validate_filename, async_task): | ||||
|         validate_filename.side_effect = ValidationError() | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f}) | ||||
|         self.assertEqual(response.status_code, 400) | ||||
|  | ||||
|         async_task.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_title(self, async_task): | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f, "title": "my custom title"}) | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|  | ||||
|         async_task.assert_called_once() | ||||
|  | ||||
|         args, kwargs = async_task.call_args | ||||
|  | ||||
|         self.assertEqual(kwargs['override_title'], "my custom title") | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_correspondent(self, async_task): | ||||
|         c = Correspondent.objects.create(name="test-corres") | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": c.id}) | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|  | ||||
|         async_task.assert_called_once() | ||||
|  | ||||
|         args, kwargs = async_task.call_args | ||||
|  | ||||
|         self.assertEqual(kwargs['override_correspondent_id'], c.id) | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_invalid_correspondent(self, async_task): | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": 3456}) | ||||
|         self.assertEqual(response.status_code, 400) | ||||
|  | ||||
|         async_task.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_document_type(self, async_task): | ||||
|         dt = DocumentType.objects.create(name="invoice") | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": dt.id}) | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|  | ||||
|         async_task.assert_called_once() | ||||
|  | ||||
|         args, kwargs = async_task.call_args | ||||
|  | ||||
|         self.assertEqual(kwargs['override_document_type_id'], dt.id) | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_invalid_document_type(self, async_task): | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": 34578}) | ||||
|         self.assertEqual(response.status_code, 400) | ||||
|  | ||||
|         async_task.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_tags(self, async_task): | ||||
|         t1 = Tag.objects.create(name="tag1") | ||||
|         t2 = Tag.objects.create(name="tag2") | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post( | ||||
|                 "/api/documents/post_document/", | ||||
|                 {"document": f, "tags": [t2.id, t1.id]}) | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|  | ||||
|         async_task.assert_called_once() | ||||
|  | ||||
|         args, kwargs = async_task.call_args | ||||
|  | ||||
|         self.assertCountEqual(kwargs['override_tag_ids'], [t1.id, t2.id]) | ||||
|  | ||||
|     @mock.patch("documents.views.async_task") | ||||
|     def test_upload_with_invalid_tags(self, async_task): | ||||
|         t1 = Tag.objects.create(name="tag1") | ||||
|         t2 = Tag.objects.create(name="tag2") | ||||
|         with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: | ||||
|             response = self.client.post( | ||||
|                 "/api/documents/post_document/", | ||||
|                 {"document": f, "tags": [t2.id, t1.id, 734563]}) | ||||
|         self.assertEqual(response.status_code, 400) | ||||
|  | ||||
|         async_task.assert_not_called() | ||||
|   | ||||
| @@ -1,24 +1,29 @@ | ||||
| import tempfile | ||||
| from time import sleep | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.classifier import DocumentClassifier | ||||
| from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError | ||||
| from documents.models import Correspondent, Document, Tag, DocumentType | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| class TestClassifier(TestCase): | ||||
| class TestClassifier(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def setUp(self): | ||||
|  | ||||
|         super(TestClassifier, self).setUp() | ||||
|         self.classifier = DocumentClassifier() | ||||
|  | ||||
|     def generate_test_data(self): | ||||
|         self.c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO) | ||||
|         self.c2 = Correspondent.objects.create(name="c2") | ||||
|         self.c3 = Correspondent.objects.create(name="c3", matching_algorithm=Correspondent.MATCH_AUTO) | ||||
|         self.t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|         self.t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_ANY, pk=34, is_inbox_tag=True) | ||||
|         self.t3 = Tag.objects.create(name="t3", matching_algorithm=Tag.MATCH_AUTO, pk=45) | ||||
|         self.dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO) | ||||
|         self.dt2 = DocumentType.objects.create(name="dt2", matching_algorithm=DocumentType.MATCH_AUTO) | ||||
|  | ||||
|         self.doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=self.c1, checksum="A", document_type=self.dt) | ||||
|         self.doc2 = Document.objects.create(title="doc1", content="this is another document, but from c2", correspondent=self.c2, checksum="B") | ||||
| @@ -59,8 +64,8 @@ class TestClassifier(TestCase): | ||||
|         self.classifier.train() | ||||
|         self.assertEqual(self.classifier.predict_correspondent(self.doc1.content), self.c1.pk) | ||||
|         self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None) | ||||
|         self.assertTupleEqual(self.classifier.predict_tags(self.doc1.content), (self.t1.pk,)) | ||||
|         self.assertTupleEqual(self.classifier.predict_tags(self.doc2.content), (self.t1.pk, self.t3.pk)) | ||||
|         self.assertListEqual(self.classifier.predict_tags(self.doc1.content), [self.t1.pk]) | ||||
|         self.assertListEqual(self.classifier.predict_tags(self.doc2.content), [self.t1.pk, self.t3.pk]) | ||||
|         self.assertEqual(self.classifier.predict_document_type(self.doc1.content), self.dt.pk) | ||||
|         self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None) | ||||
|  | ||||
| @@ -71,6 +76,44 @@ class TestClassifier(TestCase): | ||||
|         self.assertTrue(self.classifier.train()) | ||||
|         self.assertFalse(self.classifier.train()) | ||||
|  | ||||
|     def testVersionIncreased(self): | ||||
|  | ||||
|         self.generate_test_data() | ||||
|         self.assertTrue(self.classifier.train()) | ||||
|         self.assertFalse(self.classifier.train()) | ||||
|  | ||||
|         self.classifier.save_classifier() | ||||
|  | ||||
|         classifier2 = DocumentClassifier() | ||||
|  | ||||
|         current_ver = DocumentClassifier.FORMAT_VERSION | ||||
|         with mock.patch("documents.classifier.DocumentClassifier.FORMAT_VERSION", current_ver+1): | ||||
|             # ensure that we won't load old classifiers. | ||||
|             self.assertRaises(IncompatibleClassifierVersionError, classifier2.reload) | ||||
|  | ||||
|             self.classifier.save_classifier() | ||||
|  | ||||
|             # ensure that we can load the classifier after saving it. | ||||
|             classifier2.reload() | ||||
|  | ||||
|     def testReload(self): | ||||
|  | ||||
|         self.generate_test_data() | ||||
|         self.assertTrue(self.classifier.train()) | ||||
|         self.classifier.save_classifier() | ||||
|  | ||||
|         classifier2 = DocumentClassifier() | ||||
|         classifier2.reload() | ||||
|         v1 = classifier2.classifier_version | ||||
|  | ||||
|         # save the classifier again after a moment so that the file's mtime changes. | ||||
|         sleep(1) | ||||
|         self.classifier.save_classifier() | ||||
|  | ||||
|         classifier2.reload() | ||||
|         v2 = classifier2.classifier_version | ||||
|         self.assertNotEqual(v1, v2) | ||||
|  | ||||
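|     # testReload hinges on the model file's mtime changing between saves; a | ||||
|     # sketch of the reload guard, given that the classifier remembers the | ||||
|     # mtime it last loaded (see the classifier diff; names are illustrative): | ||||
|     # | ||||
|     #     mtime = os.path.getmtime(model_file) | ||||
|     #     if mtime != self.classifier_version: | ||||
|     #         ...  # reload pickled state, then self.classifier_version = mtime | ||||
|  | ||||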
|     @override_settings(DATA_DIR=tempfile.mkdtemp()) | ||||
|     def testSaveClassifier(self): | ||||
|  | ||||
| @@ -83,3 +126,112 @@ class TestClassifier(TestCase): | ||||
|         new_classifier = DocumentClassifier() | ||||
|         new_classifier.reload() | ||||
|         self.assertFalse(new_classifier.train()) | ||||
|  | ||||
|     def test_one_correspondent_predict(self): | ||||
|         c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO) | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A") | ||||
|  | ||||
|         self.classifier.train() | ||||
|         self.assertEqual(self.classifier.predict_correspondent(doc1.content), c1.pk) | ||||
|  | ||||
|     def test_one_correspondent_predict_manydocs(self): | ||||
|         c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO) | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A") | ||||
|         doc2 = Document.objects.create(title="doc2", content="this is a document from noone", checksum="B") | ||||
|  | ||||
|         self.classifier.train() | ||||
|         self.assertEqual(self.classifier.predict_correspondent(doc1.content), c1.pk) | ||||
|         self.assertIsNone(self.classifier.predict_correspondent(doc2.content)) | ||||
|  | ||||
|     def test_one_type_predict(self): | ||||
|         dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", | ||||
|                                             checksum="A", document_type=dt) | ||||
|  | ||||
|         self.classifier.train() | ||||
|         self.assertEqual(self.classifier.predict_document_type(doc1.content), dt.pk) | ||||
|  | ||||
|     def test_one_type_predict_manydocs(self): | ||||
|         dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", | ||||
|                                             checksum="A", document_type=dt) | ||||
|  | ||||
|         doc2 = Document.objects.create(title="doc1", content="this is a document from c2", | ||||
|                                             checksum="B") | ||||
|  | ||||
|         self.classifier.train() | ||||
|         self.assertEqual(self.classifier.predict_document_type(doc1.content), dt.pk) | ||||
|         self.assertIsNone(self.classifier.predict_document_type(doc2.content)) | ||||
|  | ||||
|     def test_one_tag_predict(self): | ||||
|         t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A") | ||||
|  | ||||
|         doc1.tags.add(t1) | ||||
|         self.classifier.train() | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk]) | ||||
|  | ||||
|     def test_one_tag_predict_unassigned(self): | ||||
|         t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A") | ||||
|  | ||||
|         self.classifier.train() | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc1.content), []) | ||||
|  | ||||
|     def test_two_tags_predict_singledoc(self): | ||||
|         t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|         t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121) | ||||
|  | ||||
|         doc4 = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D") | ||||
|  | ||||
|         doc4.tags.add(t1) | ||||
|         doc4.tags.add(t2) | ||||
|         self.classifier.train() | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc4.content), [t1.pk, t2.pk]) | ||||
|  | ||||
|     def test_two_tags_predict(self): | ||||
|         t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|         t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A") | ||||
|         doc2 = Document.objects.create(title="doc1", content="this is a document from c2", checksum="B") | ||||
|         doc3 = Document.objects.create(title="doc1", content="this is a document from c3", checksum="C") | ||||
|         doc4 = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D") | ||||
|  | ||||
|         doc1.tags.add(t1) | ||||
|         doc2.tags.add(t2) | ||||
|  | ||||
|         doc4.tags.add(t1) | ||||
|         doc4.tags.add(t2) | ||||
|         self.classifier.train() | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk]) | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc2.content), [t2.pk]) | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc3.content), []) | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc4.content), [t1.pk, t2.pk]) | ||||
|  | ||||
|     def test_one_tag_predict_multi(self): | ||||
|         t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A") | ||||
|         doc2 = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B") | ||||
|  | ||||
|         doc1.tags.add(t1) | ||||
|         doc2.tags.add(t1) | ||||
|         self.classifier.train() | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk]) | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc2.content), [t1.pk]) | ||||
|  | ||||
|     def test_one_tag_predict_multi_2(self): | ||||
|         t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12) | ||||
|  | ||||
|         doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A") | ||||
|         doc2 = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B") | ||||
|  | ||||
|         doc1.tags.add(t1) | ||||
|         self.classifier.train() | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk]) | ||||
|         self.assertListEqual(self.classifier.predict_tags(doc2.content), []) | ||||
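|  | ||||
|     # With a single auto tag, training falls back to binary labels (the lone | ||||
|     # tag's pk vs. -1 for "no tag", see the classifier diff); a sketch of | ||||
|     # mapping such a prediction back to a tag list under that assumption: | ||||
|     # | ||||
|     #     def tags_from_binary_prediction(pred): | ||||
|     #         return [] if pred == -1 else [pred] | ||||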
|   | ||||
| @@ -7,6 +7,7 @@ from unittest.mock import MagicMock | ||||
|  | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from .utils import DirectoriesMixin | ||||
| from ..consumer import Consumer, ConsumerError | ||||
| from ..models import FileInfo, Tag, Correspondent, DocumentType, Document | ||||
| from ..parsers import DocumentParser, ParseError | ||||
| @@ -364,35 +365,36 @@ class TestFieldPermutations(TestCase): | ||||
|  | ||||
| class DummyParser(DocumentParser): | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         # not important during tests | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def __init__(self, path, logging_group, scratch_dir): | ||||
|         super(DummyParser, self).__init__(path, logging_group) | ||||
|     def __init__(self, logging_group, scratch_dir, archive_path): | ||||
|         super(DummyParser, self).__init__(logging_group) | ||||
|         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) | ||||
|         self.archive_path = archive_path | ||||
|  | ||||
|     def get_optimised_thumbnail(self): | ||||
|     def get_optimised_thumbnail(self, document_path, mime_type): | ||||
|         return self.fake_thumb | ||||
|  | ||||
|     def get_text(self): | ||||
|         return "The Text" | ||||
|     def parse(self, document_path, mime_type): | ||||
|         self.text = "The Text" | ||||
|  | ||||
|  | ||||
| class FaultyParser(DocumentParser): | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         # not important during tests | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def __init__(self, path, logging_group, scratch_dir): | ||||
|         super(FaultyParser, self).__init__(path, logging_group) | ||||
|     def __init__(self, logging_group, scratch_dir): | ||||
|         super(FaultyParser, self).__init__(logging_group) | ||||
|         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) | ||||
|  | ||||
|     def get_optimised_thumbnail(self): | ||||
|     def get_optimised_thumbnail(self, document_path, mime_type): | ||||
|         return self.fake_thumb | ||||
|  | ||||
|     def get_text(self): | ||||
|     def parse(self, document_path, mime_type): | ||||
|         raise ParseError("Does not compute.") | ||||
|  | ||||
|  | ||||
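| # Inferred from the two test doubles above: the reworked parser interface | ||||
| # takes (document_path, mime_type) per call instead of binding a path in | ||||
| # __init__. A sketch of the assumed base-class surface (not the real one): | ||||
| # | ||||
| #     class DocumentParser: | ||||
| #         def parse(self, document_path, mime_type): ...        # sets self.text | ||||
| #         def get_thumbnail(self, document_path, mime_type): ... | ||||
| #         def get_optimised_thumbnail(self, document_path, mime_type): ... | ||||
|  | ||||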
| @@ -408,32 +410,22 @@ def fake_magic_from_file(file, mime=False): | ||||
|  | ||||
|  | ||||
| @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) | ||||
| class TestConsumer(TestCase): | ||||
| class TestConsumer(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def make_dummy_parser(self, path, logging_group): | ||||
|         return DummyParser(path, logging_group, self.scratch_dir) | ||||
|     def make_dummy_parser(self, logging_group): | ||||
|         return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file()) | ||||
|  | ||||
|     def make_faulty_parser(self, path, logging_group): | ||||
|         return FaultyParser(path, logging_group, self.scratch_dir) | ||||
|     def make_faulty_parser(self, logging_group): | ||||
|         return FaultyParser(logging_group, self.dirs.scratch_dir) | ||||
|  | ||||
|     def setUp(self): | ||||
|         self.scratch_dir = tempfile.mkdtemp() | ||||
|         self.media_dir = tempfile.mkdtemp() | ||||
|         self.consumption_dir = tempfile.mkdtemp() | ||||
|  | ||||
|         override_settings( | ||||
|             SCRATCH_DIR=self.scratch_dir, | ||||
|             MEDIA_ROOT=self.media_dir, | ||||
|             ORIGINALS_DIR=os.path.join(self.media_dir, "documents", "originals"), | ||||
|             THUMBNAIL_DIR=os.path.join(self.media_dir, "documents", "thumbnails"), | ||||
|             CONSUMPTION_DIR=self.consumption_dir | ||||
|         ).enable() | ||||
|         super(TestConsumer, self).setUp() | ||||
|  | ||||
|         patcher = mock.patch("documents.parsers.document_consumer_declaration.send") | ||||
|         m = patcher.start() | ||||
|         m.return_value = [(None, { | ||||
|             "parser": self.make_dummy_parser, | ||||
|             "mime_types": ["application/pdf"], | ||||
|             "mime_types": {"application/pdf": ".pdf"}, | ||||
|             "weight": 0 | ||||
|         })] | ||||
|  | ||||
| @@ -441,15 +433,19 @@ class TestConsumer(TestCase): | ||||
|  | ||||
|         self.consumer = Consumer() | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.scratch_dir, ignore_errors=True) | ||||
|         shutil.rmtree(self.media_dir, ignore_errors=True) | ||||
|         shutil.rmtree(self.consumption_dir, ignore_errors=True) | ||||
|  | ||||
|     def get_test_file(self): | ||||
|         fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir) | ||||
|         return f | ||||
|         src = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf") | ||||
|         dst = os.path.join(self.dirs.scratch_dir, "sample.pdf") | ||||
|         shutil.copy(src, dst) | ||||
|         return dst | ||||
|  | ||||
|     def get_test_archive_file(self): | ||||
|         src = os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf") | ||||
|         dst = os.path.join(self.dirs.scratch_dir, "sample_archive.pdf") | ||||
|         shutil.copy(src, dst) | ||||
|         return dst | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT=None) | ||||
|     def testNormalOperation(self): | ||||
|  | ||||
|         filename = self.get_test_file() | ||||
| @@ -469,6 +465,13 @@ class TestConsumer(TestCase): | ||||
|             document.thumbnail_path | ||||
|         )) | ||||
|  | ||||
|         self.assertTrue(os.path.isfile( | ||||
|             document.archive_path | ||||
|         )) | ||||
|  | ||||
|         self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1") | ||||
|         self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b") | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(filename)) | ||||
|  | ||||
|     def testOverrideFilename(self): | ||||
| @@ -516,27 +519,7 @@ class TestConsumer(TestCase): | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     @override_settings(CONSUMPTION_DIR=None) | ||||
|     def testConsumptionDirUnset(self): | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertEqual(str(e), "The CONSUMPTION_DIR settings variable does not appear to be set.") | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     @override_settings(CONSUMPTION_DIR="asd") | ||||
|     def testNoConsumptionDir(self): | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertEqual(str(e), "Consumption directory asd does not exist") | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     def testDuplicates(self): | ||||
|     def testDuplicates1(self): | ||||
|         self.consumer.try_consume_file(self.get_test_file()) | ||||
|  | ||||
|         try: | ||||
| @@ -547,6 +530,21 @@ class TestConsumer(TestCase): | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     def testDuplicates2(self): | ||||
|         self.consumer.try_consume_file(self.get_test_file()) | ||||
|  | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_archive_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertTrue(str(e).endswith("It is a duplicate.")) | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     def testDuplicates3(self): | ||||
|         self.consumer.try_consume_file(self.get_test_archive_file()) | ||||
|         self.consumer.try_consume_file(self.get_test_file()) | ||||
|  | ||||
|     @mock.patch("documents.parsers.document_consumer_declaration.send") | ||||
|     def testNoParsers(self, m): | ||||
|         m.return_value = [] | ||||
| @@ -554,7 +552,7 @@ class TestConsumer(TestCase): | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertTrue(str(e).startswith("No parsers abvailable")) | ||||
|             self.assertTrue("No parsers abvailable for" in str(e)) | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
| @@ -563,7 +561,7 @@ class TestConsumer(TestCase): | ||||
|     def testFaultyParser(self, m): | ||||
|         m.return_value = [(None, { | ||||
|             "parser": self.make_faulty_parser, | ||||
|             "mime_types": ["application/pdf"], | ||||
|             "mime_types": {"application/pdf": ".pdf"}, | ||||
|             "weight": 0 | ||||
|         })] | ||||
|  | ||||
| @@ -598,12 +596,33 @@ class TestConsumer(TestCase): | ||||
|  | ||||
|         document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs") | ||||
|  | ||||
|         self.assertEqual(document.title, "new docs") | ||||
|         self.assertEqual(document.correspondent.name, "Bank") | ||||
|         self.assertEqual(document.filename, "Bank/new docs-0000001.pdf") | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     @mock.patch("documents.signals.handlers.generate_filename") | ||||
|     def testFilenameHandlingUnstableFormat(self, m): | ||||
|  | ||||
|         filenames = ["this", "that", "now this", "i cant decide"] | ||||
|  | ||||
|         def get_filename(): | ||||
|             f = filenames.pop() | ||||
|             filenames.insert(0, f) | ||||
|             return f | ||||
|  | ||||
|         m.side_effect = lambda f: get_filename() | ||||
|  | ||||
|         filename = self.get_test_file() | ||||
|  | ||||
|         Tag.objects.create(name="test", is_inbox_tag=True) | ||||
|  | ||||
|         document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs") | ||||
|  | ||||
|         self.assertEqual(document.title, "new docs") | ||||
|         self.assertEqual(document.correspondent.name, "Bank") | ||||
|         self.assertEqual(document.filename, "bank/new-docs-0000001.pdf") | ||||
|         self.assertTrue(os.path.isfile(document.source_path)) | ||||
|  | ||||
|     @mock.patch("documents.consumer.DocumentClassifier") | ||||
|     def testClassifyDocument(self, m): | ||||
|   | ||||
src/documents/tests/test_date_parsing.py (new file, 140 lines)
						| @@ -0,0 +1,140 @@ | ||||
| import datetime | ||||
| import os | ||||
| import shutil | ||||
| from unittest import mock | ||||
| from uuid import uuid4 | ||||
|  | ||||
| from dateutil import tz | ||||
| from django.conf import settings | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.parsers import parse_date | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser | ||||
|  | ||||
|  | ||||
| class TestDate(TestCase): | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "../../paperless_tesseract/tests/samples") | ||||
|     SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) | ||||
|  | ||||
|     def setUp(self): | ||||
|         os.makedirs(self.SCRATCH, exist_ok=True) | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.SCRATCH) | ||||
|  | ||||
|     def test_date_format_1(self): | ||||
|         text = "lorem ipsum 130218 lorem ipsum" | ||||
|         self.assertEqual(parse_date("", text), None) | ||||
|  | ||||
|     def test_date_format_2(self): | ||||
|         text = "lorem ipsum 2018 lorem ipsum" | ||||
|         self.assertEqual(parse_date("", text), None) | ||||
|  | ||||
|     def test_date_format_3(self): | ||||
|         text = "lorem ipsum 20180213 lorem ipsum" | ||||
|         self.assertEqual(parse_date("", text), None) | ||||
|  | ||||
|     def test_date_format_4(self): | ||||
|         text = "lorem ipsum 13.02.2018 lorem ipsum" | ||||
|         date = parse_date("", text) | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2018, 2, 13, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     def test_date_format_5(self): | ||||
|         text = ( | ||||
|             "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " | ||||
|             "ipsum" | ||||
|         ) | ||||
|         date = parse_date("", text) | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2018, 2, 13, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     def test_date_format_6(self): | ||||
|         text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "Wohnort\n" | ||||
|             "3100\n" | ||||
|             "IBAN\n" | ||||
|             "AT87 4534\n" | ||||
|             "1234\n" | ||||
|             "1234 5678\n" | ||||
|             "BIC\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         self.assertEqual(parse_date("", text), None) | ||||
|  | ||||
|     def test_date_format_7(self): | ||||
|         text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "März 2019\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         date = parse_date("", text) | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2019, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     def test_date_format_8(self): | ||||
|         text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "Wohnort\n" | ||||
|             "3100\n" | ||||
|             "IBAN\n" | ||||
|             "AT87 4534\n" | ||||
|             "1234\n" | ||||
|             "1234 5678\n" | ||||
|             "BIC\n" | ||||
|             "lorem ipsum\n" | ||||
|             "März 2020" | ||||
|         ) | ||||
|         self.assertEqual( | ||||
|             parse_date("", text), | ||||
|             datetime.datetime( | ||||
|                 2020, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_9(self): | ||||
|         text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "27. Nullmonth 2020\n" | ||||
|             "März 2020\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         self.assertEqual( | ||||
|             parse_date("", text), | ||||
|             datetime.datetime( | ||||
|                 2020, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     def test_crazy_date_past(self, *args): | ||||
|         self.assertIsNone(parse_date("", "01-07-0590 00:00:00")) | ||||
|  | ||||
|     def test_crazy_date_future(self, *args): | ||||
|         self.assertIsNone(parse_date("", "01-07-2350 00:00:00")) | ||||
|  | ||||
|     def test_crazy_date_with_spaces(self, *args): | ||||
|         self.assertIsNone(parse_date("", "20 408000l 2475")) | ||||
|  | ||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||
|     def test_filename_date_parse_invalid(self, *args): | ||||
|         self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")) | ||||
| @@ -1,12 +1,29 @@ | ||||
| import shutil | ||||
| import tempfile | ||||
| from datetime import datetime | ||||
| from pathlib import Path | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from ..models import Document, Correspondent | ||||
|  | ||||
|  | ||||
| class TestDocument(TestCase): | ||||
|  | ||||
|     def setUp(self) -> None: | ||||
|         self.originals_dir = tempfile.mkdtemp() | ||||
|         self.thumb_dir = tempfile.mkdtemp() | ||||
|  | ||||
|         override_settings( | ||||
|             ORIGINALS_DIR=self.originals_dir, | ||||
|             THUMBNAIL_DIR=self.thumb_dir, | ||||
|         ).enable() | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         shutil.rmtree(self.originals_dir) | ||||
|         shutil.rmtree(self.thumb_dir) | ||||
|  | ||||
|     def test_file_deletion(self): | ||||
|         document = Document.objects.create( | ||||
|             correspondent=Correspondent.objects.create(name="Test0"), | ||||
| @@ -19,8 +36,31 @@ class TestDocument(TestCase): | ||||
|         file_path = document.source_path | ||||
|         thumb_path = document.thumbnail_path | ||||
|  | ||||
|         Path(file_path).touch() | ||||
|         Path(thumb_path).touch() | ||||
|  | ||||
|         with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink: | ||||
|             document.delete() | ||||
|             mock_unlink.assert_any_call(file_path) | ||||
|             mock_unlink.assert_any_call(thumb_path) | ||||
|             self.assertEqual(mock_unlink.call_count, 2) | ||||
|  | ||||
|     def test_file_name(self): | ||||
|  | ||||
|         doc = Document(mime_type="application/pdf", title="test", created=datetime(2020, 12, 25)) | ||||
|         self.assertEqual(doc.get_public_filename(), "2020-12-25 test.pdf") | ||||
|  | ||||
|     def test_file_name_jpg(self): | ||||
|  | ||||
|         doc = Document(mime_type="image/jpeg", title="test", created=datetime(2020, 12, 25)) | ||||
|         self.assertEqual(doc.get_public_filename(), "2020-12-25 test.jpg") | ||||
|  | ||||
|     def test_file_name_unknown(self): | ||||
|  | ||||
|         doc = Document(mime_type="application/zip", title="test", created=datetime(2020, 12, 25)) | ||||
|         self.assertEqual(doc.get_public_filename(), "2020-12-25 test.zip") | ||||
|  | ||||
|     def test_file_name_invalid_type(self): | ||||
|  | ||||
|         doc = Document(mime_type="image/jpegasd", title="test", created=datetime(2020, 12, 25)) | ||||
|         self.assertEqual(doc.get_public_filename(), "2020-12-25 test") | ||||
|   | ||||
| @@ -1,32 +1,18 @@ | ||||
| import datetime | ||||
| import os | ||||
| import shutil | ||||
| from pathlib import Path | ||||
| from uuid import uuid4 | ||||
| from unittest import mock | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.db import DatabaseError | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from .utils import DirectoriesMixin | ||||
| from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories | ||||
| from ..models import Document, Correspondent | ||||
| from ..signals.handlers import update_filename_and_move_files | ||||
|  | ||||
|  | ||||
| class TestDate(TestCase): | ||||
|     deletion_list = [] | ||||
|  | ||||
|     def add_to_deletion_list(self, dirname): | ||||
|         self.deletion_list.append(dirname) | ||||
|  | ||||
|     def setUp(self): | ||||
|         folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) | ||||
|         os.makedirs(folder + "/documents/originals") | ||||
|         override_settings(MEDIA_ROOT=folder).enable() | ||||
|         override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable() | ||||
|         self.add_to_deletion_list(folder) | ||||
|  | ||||
|     def tearDown(self): | ||||
|         for dirname in self.deletion_list: | ||||
|             shutil.rmtree(dirname, ignore_errors=True) | ||||
| class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="") | ||||
|     def test_generate_source_filename(self): | ||||
| @@ -103,7 +89,7 @@ class TestDate(TestCase): | ||||
|         document.save() | ||||
|  | ||||
|         # Check proper handling of files | ||||
|         self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|         self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|         self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||
|  | ||||
|         os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) | ||||
| @@ -133,18 +119,14 @@ class TestDate(TestCase): | ||||
|         document.correspondent = Correspondent.objects.get_or_create( | ||||
|             name="test")[0] | ||||
|  | ||||
|         # This will cause save() to fail. | ||||
|         document.checksum = document1.checksum | ||||
|         with mock.patch("documents.signals.handlers.Document.objects.filter") as m: | ||||
|             m.side_effect = DatabaseError() | ||||
|             document.save() | ||||
|  | ||||
|         # Assume saving the document initially works, this gets called. | ||||
|         # After renaming, an error occurs, and filename is not saved: | ||||
|         # document should still be available at document.filename. | ||||
|         update_filename_and_move_files(None, document) | ||||
|  | ||||
|         # Check proper handling of files | ||||
|         self.assertTrue(os.path.isfile(document.source_path)) | ||||
|         self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|         self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||
|             # Check proper handling of files | ||||
|             self.assertTrue(os.path.isfile(document.source_path)) | ||||
|             self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|             self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") | ||||
|     def test_document_delete(self): | ||||
| @@ -199,8 +181,8 @@ class TestDate(TestCase): | ||||
|         document.save() | ||||
|  | ||||
|         # Check proper handling of files | ||||
|         self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True) | ||||
|         self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True) | ||||
|         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True) | ||||
|         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True) | ||||
|         self.assertTrue(os.path.isfile(important_file)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") | ||||
| @@ -318,13 +300,12 @@ class TestDate(TestCase): | ||||
|         # Create our working directory | ||||
|         tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty") | ||||
|         os.makedirs(tmp) | ||||
|         self.add_to_deletion_list(tmp) | ||||
|  | ||||
|         os.makedirs(os.path.join(tmp, "notempty")) | ||||
|         Path(os.path.join(tmp, "notempty", "file")).touch() | ||||
|         os.makedirs(os.path.join(tmp, "notempty", "empty")) | ||||
|  | ||||
|         delete_empty_directories(os.path.join(tmp, "notempty", "empty")) | ||||
|         delete_empty_directories(os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR) | ||||
|         self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) | ||||
|         self.assertEqual(os.path.isfile( | ||||
|             os.path.join(tmp, "notempty", "file")), True) | ||||
| @@ -348,3 +329,179 @@ class TestDate(TestCase): | ||||
|         document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|  | ||||
|         self.assertEqual(generate_filename(document), "0000001.pdf") | ||||
|  | ||||
|  | ||||
| class TestFileHandlingWithArchive(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT=None) | ||||
|     def test_create_no_format(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_create_with_format(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(original)) | ||||
|         self.assertFalse(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|         self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc-0000001.pdf")) | ||||
|         self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_move_archive_gone(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         # Path(archive).touch() is deliberately omitted: the archive file is gone. | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertFalse(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertFalse(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_move_archive_exists(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none")) | ||||
|         Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     @mock.patch("documents.signals.handlers.os.rename") | ||||
|     def test_move_archive_error(self, m): | ||||
|  | ||||
|         def fake_rename(src, dst): | ||||
|             if "archive" in src: | ||||
|                 raise OSError() | ||||
|             else: | ||||
|                 os.remove(src) | ||||
|                 Path(dst).touch() | ||||
|  | ||||
|         m.side_effect = fake_rename | ||||
|  | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_move_file_gone(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         # Path(original).touch() is deliberately omitted: the original file is gone. | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertFalse(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     @mock.patch("documents.signals.handlers.os.rename") | ||||
|     def test_move_file_error(self, m): | ||||
|  | ||||
|         def fake_rename(src, dst): | ||||
|             if "original" in src: | ||||
|                 raise OSError() | ||||
|             else: | ||||
|                 os.remove(src) | ||||
|                 Path(dst).touch() | ||||
|  | ||||
|         m.side_effect = fake_rename | ||||
|  | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     def test_archive_deleted(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|         doc.delete() | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(original)) | ||||
|         self.assertFalse(os.path.isfile(archive)) | ||||
|         self.assertFalse(os.path.isfile(doc.source_path)) | ||||
|         self.assertFalse(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_database_error(self): | ||||
|  | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|         with mock.patch("documents.signals.handlers.Document.objects.filter") as m: | ||||
|             m.side_effect = DatabaseError() | ||||
|             doc.save() | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
| class TestFilenameGeneration(TestCase): | ||||
|  | ||||
|     @override_settings( | ||||
|         PAPERLESS_FILENAME_FORMAT="{title}" | ||||
|     ) | ||||
|     def test_invalid_characters(self): | ||||
|  | ||||
|         doc = Document.objects.create(title="This. is the title.", mime_type="application/pdf", pk=1, checksum="1") | ||||
|         self.assertEqual(generate_filename(doc), "This. is the title-0000001.pdf") | ||||
|  | ||||
|         doc = Document.objects.create(title="my\\invalid/../title:yay", mime_type="application/pdf", pk=2, checksum="2") | ||||
|         self.assertEqual(generate_filename(doc), "my-invalid-..-title-yay-0000002.pdf") | ||||
|  | ||||
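|     # A sketch of the sanitisation the test above asserts: backslashes, | ||||
|     # slashes and colons become dashes while dots survive (the exact | ||||
|     # character set is inferred from the expected filenames): | ||||
|     # | ||||
|     #     re.sub(r"[\\/:]", "-", "my\\invalid/../title:yay") | ||||
|     #     # -> "my-invalid-..-title-yay" | ||||
|  | ||||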
|     @override_settings( | ||||
|         PAPERLESS_FILENAME_FORMAT="{created}" | ||||
|     ) | ||||
|     def test_date(self): | ||||
|         doc = Document.objects.create(title="does not matter", created=datetime.datetime(2020,5,21, 7,36,51, 153), mime_type="application/pdf", pk=2, checksum="2") | ||||
|         self.assertEqual(generate_filename(doc), "2020-05-21-0000002.pdf") | ||||
|   | ||||
| @@ -2,7 +2,7 @@ import logging | ||||
| import uuid | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from ..models import Log | ||||
|  | ||||
| @@ -14,6 +14,7 @@ class TestPaperlessLog(TestCase): | ||||
|         self.logger = logging.getLogger( | ||||
|             "documents.management.commands.document_consumer") | ||||
|  | ||||
|     @override_settings(DISABLE_DBHANDLER=False) | ||||
|     def test_that_it_saves_at_all(self): | ||||
|  | ||||
|         kw = {"group": uuid.uuid4()} | ||||
| @@ -38,6 +39,7 @@ class TestPaperlessLog(TestCase): | ||||
|             self.logger.critical("This is a critical message", extra=kw) | ||||
|             self.assertEqual(Log.objects.all().count(), 5) | ||||
|  | ||||
|     @override_settings(DISABLE_DBHANDLER=False) | ||||
|     def test_groups(self): | ||||
|  | ||||
|         kw1 = {"group": uuid.uuid4()} | ||||
|   | ||||
src/documents/tests/test_management_archiver.py (new file, 42 lines)
						| @@ -0,0 +1,42 @@ | ||||
| import filecmp | ||||
| import os | ||||
| import shutil | ||||
|  | ||||
| from django.core.management import call_command | ||||
| from django.test import TestCase | ||||
|  | ||||
| from documents.management.commands.document_archiver import handle_document | ||||
| from documents.models import Document | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") | ||||
|  | ||||
|  | ||||
| class TestArchiver(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def make_models(self): | ||||
|         self.d1 = Document.objects.create(checksum="A", title="A", content="first document", pk=1, mime_type="application/pdf") | ||||
|  | ||||
|     def test_archiver(self): | ||||
|  | ||||
|         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||
|         self.make_models() | ||||
|  | ||||
|         call_command('document_archiver') | ||||
|  | ||||
|     def test_handle_document(self): | ||||
|  | ||||
|         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||
|         self.make_models() | ||||
|  | ||||
|         handle_document(self.d1.pk) | ||||
|  | ||||
|         doc = Document.objects.get(id=self.d1.id) | ||||
|  | ||||
|         self.assertIsNotNone(doc.checksum) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(filecmp.cmp(sample_file, doc.source_path)) | ||||
src/documents/tests/test_management_consumer.py (new file, 262 lines)
						| @@ -0,0 +1,262 @@ | ||||
| import filecmp | ||||
| import os | ||||
| import shutil | ||||
| from threading import Thread | ||||
| from time import sleep | ||||
| from unittest import mock | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.core.management import call_command, CommandError | ||||
| from django.test import override_settings, TransactionTestCase | ||||
|  | ||||
| from documents.models import Tag | ||||
| from documents.consumer import ConsumerError | ||||
| from documents.management.commands import document_consumer | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| class ConsumerThread(Thread): | ||||
|  | ||||
|     def __init__(self): | ||||
|         super().__init__() | ||||
|         self.cmd = document_consumer.Command() | ||||
|  | ||||
|     def run(self) -> None: | ||||
|         self.cmd.handle(directory=settings.CONSUMPTION_DIR, oneshot=False) | ||||
|  | ||||
|     def stop(self): | ||||
|         # Consumer checks this every second. | ||||
|         self.cmd.stop_flag = True | ||||
|  | ||||
|  | ||||
| def chunked(size, source): | ||||
|     for i in range(0, len(source), size): | ||||
|         yield source[i:i+size] | ||||
|  | ||||
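The chunked helper above yields consecutive fixed-size slices of a sequence, which the slow-write tests below use to drip a PDF onto disk; for example:

    # The last chunk is whatever remains, so it may be shorter than `size`.
    assert list(chunked(3, b"abcdefgh")) == [b"abc", b"def", b"gh"]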
|  | ||||
| class ConsumerMixin: | ||||
|  | ||||
|     sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") | ||||
|  | ||||
|     def setUp(self) -> None: | ||||
|         super(ConsumerMixin, self).setUp() | ||||
|         self.t = None | ||||
|         patcher = mock.patch("documents.management.commands.document_consumer.async_task") | ||||
|         self.task_mock = patcher.start() | ||||
|         self.addCleanup(patcher.stop) | ||||
|  | ||||
|     def t_start(self): | ||||
|         self.t = ConsumerThread() | ||||
|         self.t.start() | ||||
|         # give the consumer some time to do initial work | ||||
|         sleep(1) | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         if self.t: | ||||
|             # set the stop flag | ||||
|             self.t.stop() | ||||
|             # wait for the consumer to exit. | ||||
|             self.t.join() | ||||
|  | ||||
|         super(ConsumerMixin, self).tearDown() | ||||
|  | ||||
|     def wait_for_task_mock_call(self): | ||||
|         n = 0 | ||||
|         while n < 100: | ||||
|             if self.task_mock.call_count > 0: | ||||
|                 # give task_mock some time to finish and raise errors | ||||
|                 sleep(1) | ||||
|                 return | ||||
|             n += 1 | ||||
|             sleep(0.1) | ||||
|  | ||||
|     # A bogus async_task that will simply check the file for | ||||
|     # completeness and raise an exception otherwise. | ||||
|     def bogus_task(self, func, filename, **kwargs): | ||||
|         eq = filecmp.cmp(filename, self.sample_file, shallow=False) | ||||
|         if not eq: | ||||
|             print("Consumed an INVALID file.") | ||||
|             raise ConsumerError("Incomplete File READ FAILED") | ||||
|         else: | ||||
|             print("Consumed a perfectly valid file.") | ||||
|  | ||||
|     def slow_write_file(self, target, incomplete=False): | ||||
|         with open(self.sample_file, 'rb') as f: | ||||
|             pdf_bytes = f.read() | ||||
|  | ||||
|         if incomplete: | ||||
|             pdf_bytes = pdf_bytes[:len(pdf_bytes) - 100] | ||||
|  | ||||
|         with open(target, 'wb') as f: | ||||
|             # this will take 2 seconds, since the file is about 20k. | ||||
|             print("Start writing file.") | ||||
|             for b in chunked(1000, pdf_bytes): | ||||
|                 f.write(b) | ||||
|                 sleep(0.1) | ||||
|             print("file completed.") | ||||
|  | ||||
|  | ||||
| class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase): | ||||
|  | ||||
|     def test_consume_file(self): | ||||
|         self.t_start() | ||||
|  | ||||
|         f = os.path.join(self.dirs.consumption_dir, "my_file.pdf") | ||||
|         shutil.copy(self.sample_file, f) | ||||
|  | ||||
|         self.wait_for_task_mock_call() | ||||
|  | ||||
|         self.task_mock.assert_called_once() | ||||
|  | ||||
|         args, kwargs = self.task_mock.call_args | ||||
|         self.assertEqual(args[1], f) | ||||
|  | ||||
|     def test_consume_file_invalid_ext(self): | ||||
|         self.t_start() | ||||
|  | ||||
|         f = os.path.join(self.dirs.consumption_dir, "my_file.wow") | ||||
|         shutil.copy(self.sample_file, f) | ||||
|  | ||||
|         self.wait_for_task_mock_call() | ||||
|  | ||||
|         self.task_mock.assert_not_called() | ||||
|  | ||||
|     def test_consume_existing_file(self): | ||||
|         f = os.path.join(self.dirs.consumption_dir, "my_file.pdf") | ||||
|         shutil.copy(self.sample_file, f) | ||||
|  | ||||
|         self.t_start() | ||||
|         self.task_mock.assert_called_once() | ||||
|  | ||||
|         args, kwargs = self.task_mock.call_args | ||||
|         self.assertEqual(args[1], f) | ||||
|  | ||||
|     @mock.patch("documents.management.commands.document_consumer.logger.error") | ||||
|     def test_slow_write_pdf(self, error_logger): | ||||
|  | ||||
|         self.task_mock.side_effect = self.bogus_task | ||||
|  | ||||
|         self.t_start() | ||||
|  | ||||
|         fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf") | ||||
|  | ||||
|         self.slow_write_file(fname) | ||||
|  | ||||
|         self.wait_for_task_mock_call() | ||||
|  | ||||
|         error_logger.assert_not_called() | ||||
|  | ||||
|         self.task_mock.assert_called_once() | ||||
|  | ||||
|         args, kwargs = self.task_mock.call_args | ||||
|         self.assertEqual(args[1], fname) | ||||
|  | ||||
|     @mock.patch("documents.management.commands.document_consumer.logger.error") | ||||
|     def test_slow_write_and_move(self, error_logger): | ||||
|  | ||||
|         self.task_mock.side_effect = self.bogus_task | ||||
|  | ||||
|         self.t_start() | ||||
|  | ||||
|         fname = os.path.join(self.dirs.consumption_dir, "my_file.~df") | ||||
|         fname2 = os.path.join(self.dirs.consumption_dir, "my_file.pdf") | ||||
|  | ||||
|         self.slow_write_file(fname) | ||||
|         shutil.move(fname, fname2) | ||||
|  | ||||
|         self.wait_for_task_mock_call() | ||||
|  | ||||
|         self.task_mock.assert_called_once() | ||||
|  | ||||
|         args, kwargs = self.task_mock.call_args | ||||
|         self.assertEqual(args[1], fname2) | ||||
|  | ||||
|         error_logger.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.management.commands.document_consumer.logger.error") | ||||
|     def test_slow_write_incomplete(self, error_logger): | ||||
|  | ||||
|         self.task_mock.side_effect = self.bogus_task | ||||
|  | ||||
|         self.t_start() | ||||
|  | ||||
|         fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf") | ||||
|         self.slow_write_file(fname, incomplete=True) | ||||
|  | ||||
|         self.wait_for_task_mock_call() | ||||
|  | ||||
|         self.task_mock.assert_called_once() | ||||
|         args, kwargs = self.task_mock.call_args | ||||
|         self.assertEqual(args[1], fname) | ||||
|  | ||||
|         # assert that an error was logged for this invalid file. | ||||
|         error_logger.assert_called_once() | ||||
|  | ||||
|     @override_settings(CONSUMPTION_DIR="does_not_exist") | ||||
|     def test_consumption_directory_invalid(self): | ||||
|  | ||||
|         self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot') | ||||
|  | ||||
|     @override_settings(CONSUMPTION_DIR="") | ||||
|     def test_consumption_directory_unset(self): | ||||
|  | ||||
|         self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot') | ||||
|  | ||||
|  | ||||
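The subclasses below rerun the entire TestConsumer suite under polling and recursive modes simply by overriding settings. The same pattern extends to any other mode worth covering, e.g. a slower polling interval (hypothetical example):

    @override_settings(CONSUMER_POLLING=2)
    class TestConsumerSlowPolling(TestConsumer):
        # Reruns every TestConsumer test with a 2-second polling interval.
        pass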
| @override_settings(CONSUMER_POLLING=1) | ||||
| class TestConsumerPolling(TestConsumer): | ||||
|     # just do all the tests with polling | ||||
|     pass | ||||
|  | ||||
|  | ||||
| @override_settings(CONSUMER_RECURSIVE=True) | ||||
| class TestConsumerRecursive(TestConsumer): | ||||
|     # just do all the tests with recursive | ||||
|     pass | ||||
|  | ||||
|  | ||||
| @override_settings(CONSUMER_RECURSIVE=True) | ||||
| @override_settings(CONSUMER_POLLING=1) | ||||
| class TestConsumerRecursivePolling(TestConsumer): | ||||
|     # just do all the tests with polling and recursive | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase): | ||||
|  | ||||
|     @override_settings(CONSUMER_RECURSIVE=True) | ||||
|     @override_settings(CONSUMER_SUBDIRS_AS_TAGS=True) | ||||
|     def test_consume_file_with_path_tags(self): | ||||
|  | ||||
|         tag_names = ("existingTag", "Space Tag") | ||||
|         # Create a Tag prior to consuming a file using it in path | ||||
|         tag_ids = [Tag.objects.create(name=tag_names[0]).pk,] | ||||
|  | ||||
|         self.t_start() | ||||
|  | ||||
|         path = os.path.join(self.dirs.consumption_dir, *tag_names) | ||||
|         os.makedirs(path, exist_ok=True) | ||||
|         f = os.path.join(path, "my_file.pdf") | ||||
|         # Wait at least inotify read_delay for recursive watchers | ||||
|         # to be created for the new directories | ||||
|         sleep(1) | ||||
|         shutil.copy(self.sample_file, f) | ||||
|  | ||||
|         self.wait_for_task_mock_call() | ||||
|  | ||||
|         self.task_mock.assert_called_once() | ||||
|  | ||||
|         # Add the pk of the Tag created by _consume() | ||||
|         tag_ids.append(Tag.objects.get(name=tag_names[1]).pk) | ||||
|  | ||||
|         args, kwargs = self.task_mock.call_args | ||||
|         self.assertEqual(args[1], f) | ||||
|  | ||||
|         # assertCountEqual has a bad name, but it tests that the first | ||||
|         # sequence contains the same elements as the second, regardless | ||||
|         # of their order. | ||||
|         self.assertCountEqual(kwargs["override_tag_ids"], tag_ids) | ||||
|  | ||||
|     @override_settings(CONSUMER_POLLING=1) | ||||
|     def test_consume_file_with_path_tags_polling(self): | ||||
|         self.test_consume_file_with_path_tags() | ||||
src/documents/tests/test_management_decrypt.py (new file, 57 lines)
						| @@ -0,0 +1,57 @@ | ||||
| import hashlib | ||||
| import json | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from unittest import mock | ||||
|  | ||||
| from django.core.management import call_command | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.management.commands import document_exporter | ||||
| from documents.models import Document, Tag, DocumentType, Correspondent | ||||
|  | ||||
|  | ||||
| class TestDecryptDocuments(TestCase): | ||||
|  | ||||
|     @override_settings( | ||||
|         ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"), | ||||
|         THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"), | ||||
|         PASSPHRASE="test", | ||||
|         PAPERLESS_FILENAME_FORMAT=None | ||||
|     ) | ||||
|     @mock.patch("documents.management.commands.decrypt_documents.input") | ||||
|     def test_decrypt(self, m): | ||||
|  | ||||
|         media_dir = tempfile.mkdtemp() | ||||
|         originals_dir = os.path.join(media_dir, "documents", "originals") | ||||
|         thumb_dir = os.path.join(media_dir, "documents", "thumbnails") | ||||
|         os.makedirs(originals_dir, exist_ok=True) | ||||
|         os.makedirs(thumb_dir, exist_ok=True) | ||||
|  | ||||
|         override_settings( | ||||
|             ORIGINALS_DIR=originals_dir, | ||||
|             THUMBNAIL_DIR=thumb_dir, | ||||
|             PASSPHRASE="test" | ||||
|         ).enable() | ||||
|  | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg")) | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000002.png.gpg"), os.path.join(thumb_dir, "0000002.png.gpg")) | ||||
|  | ||||
|         Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) | ||||
|  | ||||
|         call_command('decrypt_documents') | ||||
|  | ||||
|         doc = Document.objects.get(id=2) | ||||
|  | ||||
|         self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED) | ||||
|         self.assertEqual(doc.filename, "0000002.pdf") | ||||
|         self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000002.pdf"))) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(os.path.join(thumb_dir, "0000002.png"))) | ||||
|         self.assertTrue(os.path.isfile(doc.thumbnail_path)) | ||||
|  | ||||
|         with doc.source_file as f: | ||||
|             checksum = hashlib.md5(f.read()).hexdigest() | ||||
|             self.assertEqual(checksum, doc.checksum) | ||||
|  | ||||
src/documents/tests/test_management_exporter.py (new file, 74 lines)
						| @@ -0,0 +1,74 @@ | ||||
| import hashlib | ||||
| import json | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
|  | ||||
| from django.core.management import call_command | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.management.commands import document_exporter | ||||
| from documents.models import Document, Tag, DocumentType, Correspondent | ||||
| from documents.sanity_checker import check_sanity | ||||
| from documents.tests.utils import DirectoriesMixin, paperless_environment | ||||
|  | ||||
|  | ||||
| class TestExportImport(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     @override_settings( | ||||
|         PASSPHRASE="test" | ||||
|     ) | ||||
|     def test_exporter(self): | ||||
|         shutil.rmtree(os.path.join(self.dirs.media_dir, "documents")) | ||||
|         shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents")) | ||||
|  | ||||
|         file = os.path.join(self.dirs.originals_dir, "0000001.pdf") | ||||
|  | ||||
|         Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf") | ||||
|         Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) | ||||
|         Tag.objects.create(name="t") | ||||
|         DocumentType.objects.create(name="dt") | ||||
|         Correspondent.objects.create(name="c") | ||||
|  | ||||
|         target = tempfile.mkdtemp() | ||||
|  | ||||
|         call_command('document_exporter', target) | ||||
|  | ||||
|         with open(os.path.join(target, "manifest.json")) as f: | ||||
|             manifest = json.load(f) | ||||
|  | ||||
|         self.assertEqual(len(manifest), 5) | ||||
|  | ||||
|         for element in manifest: | ||||
|             if element['model'] == 'documents.document': | ||||
|                 fname = os.path.join(target, element[document_exporter.EXPORTER_FILE_NAME]) | ||||
|                 self.assertTrue(os.path.exists(fname)) | ||||
|                 self.assertTrue(os.path.exists(os.path.join(target, element[document_exporter.EXPORTER_THUMBNAIL_NAME]))) | ||||
|  | ||||
|                 with open(fname, "rb") as f: | ||||
|                     checksum = hashlib.md5(f.read()).hexdigest() | ||||
|                 self.assertEqual(checksum, element['fields']['checksum']) | ||||
|  | ||||
|                 if document_exporter.EXPORTER_ARCHIVE_NAME in element: | ||||
|                     fname = os.path.join(target, element[document_exporter.EXPORTER_ARCHIVE_NAME]) | ||||
|                     self.assertTrue(os.path.exists(fname)) | ||||
|  | ||||
|                     with open(fname, "rb") as f: | ||||
|                         checksum = hashlib.md5(f.read()).hexdigest() | ||||
|                     self.assertEqual(checksum, element['fields']['archive_checksum']) | ||||
|  | ||||
|         with paperless_environment() as dirs: | ||||
|             call_command('document_importer', target) | ||||
|             messages = check_sanity() | ||||
|             # everything is alright after the test | ||||
|             self.assertEqual(len(messages), 0, str([str(m) for m in messages])) | ||||
|  | ||||
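A manifest produced by document_exporter can be post-processed with the same keys the test relies on; a minimal sketch, assuming only what the assertions above exercise:

    import json
    import os

    from documents.management.commands import document_exporter

    def list_exported_files(target):
        # Print where each exported document's original file landed, using
        # the same manifest keys the test above reads.
        with open(os.path.join(target, "manifest.json")) as f:
            manifest = json.load(f)
        for element in manifest:
            if element["model"] == "documents.document":
                print(element["fields"]["checksum"],
                      element[document_exporter.EXPORTER_FILE_NAME])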
|     def test_export_missing_files(self): | ||||
|  | ||||
|         target = tempfile.mkdtemp() | ||||
|         Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf") | ||||
|         self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target) | ||||
|  | ||||
|     def test_duplicate_titles(self): | ||||
|         # TODO | ||||
|         pass | ||||
src/documents/tests/test_management_retagger.py (new file, 58 lines)
						| @@ -0,0 +1,58 @@ | ||||
| from django.core.management import call_command | ||||
| from django.test import TestCase | ||||
|  | ||||
| from documents.models import Document, Tag, Correspondent, DocumentType | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| class TestRetagger(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def make_models(self): | ||||
|         self.d1 = Document.objects.create(checksum="A", title="A", content="first document") | ||||
|         self.d2 = Document.objects.create(checksum="B", title="B", content="second document") | ||||
|         self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document") | ||||
|  | ||||
|         self.tag_first = Tag.objects.create(name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY) | ||||
|         self.tag_second = Tag.objects.create(name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY) | ||||
|  | ||||
|         self.correspondent_first = Correspondent.objects.create( | ||||
|             name="c1", match="first", matching_algorithm=Correspondent.MATCH_ANY) | ||||
|         self.correspondent_second = Correspondent.objects.create( | ||||
|             name="c2", match="second", matching_algorithm=Correspondent.MATCH_ANY) | ||||
|  | ||||
|         self.doctype_first = DocumentType.objects.create( | ||||
|             name="dt1", match="first", matching_algorithm=DocumentType.MATCH_ANY) | ||||
|         self.doctype_second = DocumentType.objects.create( | ||||
|             name="dt2", match="second", matching_algorithm=DocumentType.MATCH_ANY) | ||||
|  | ||||
|     def get_updated_docs(self): | ||||
|         return Document.objects.get(title="A"), Document.objects.get(title="B"), Document.objects.get(title="C") | ||||
|  | ||||
|     def setUp(self) -> None: | ||||
|         super(TestRetagger, self).setUp() | ||||
|         self.make_models() | ||||
|  | ||||
|     def test_add_tags(self): | ||||
|         call_command('document_retagger', '--tags') | ||||
|         d_first, d_second, d_unrelated = self.get_updated_docs() | ||||
|  | ||||
|         self.assertEqual(d_first.tags.count(), 1) | ||||
|         self.assertEqual(d_second.tags.count(), 1) | ||||
|         self.assertEqual(d_unrelated.tags.count(), 0) | ||||
|  | ||||
|         self.assertEqual(d_first.tags.first(), self.tag_first) | ||||
|         self.assertEqual(d_second.tags.first(), self.tag_second) | ||||
|  | ||||
|     def test_add_type(self): | ||||
|         call_command('document_retagger', '--document_type') | ||||
|         d_first, d_second, d_unrelated = self.get_updated_docs() | ||||
|  | ||||
|         self.assertEqual(d_first.document_type, self.doctype_first) | ||||
|         self.assertEqual(d_second.document_type, self.doctype_second) | ||||
|  | ||||
|     def test_add_correspondent(self): | ||||
|         call_command('document_retagger', '--correspondent') | ||||
|         d_first, d_second, d_unrelated = self.get_updated_docs() | ||||
|  | ||||
|         self.assertEqual(d_first.correspondent, self.correspondent_first) | ||||
|         self.assertEqual(d_second.correspondent, self.correspondent_second) | ||||
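Each test above passes a single flag to document_retagger; presumably the flags can also be combined to reassign everything in one pass (an assumption, since the tests only exercise them individually):

    from django.core.management import call_command

    # Hypothetical combined invocation; flag names are taken from the tests above.
    call_command('document_retagger', '--tags', '--correspondent', '--document_type')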
| @@ -1,3 +1,5 @@ | ||||
| import shutil | ||||
| import tempfile | ||||
| from random import randint | ||||
|  | ||||
| from django.contrib.admin.models import LogEntry | ||||
| @@ -215,6 +217,13 @@ class TestDocumentConsumptionFinishedSignal(TestCase): | ||||
|         self.doc_contains = Document.objects.create( | ||||
|             content="I contain the keyword.", mime_type="application/pdf") | ||||
|  | ||||
|         self.index_dir = tempfile.mkdtemp() | ||||
|         # TODO: we should not need the index here. | ||||
|         override_settings(INDEX_DIR=self.index_dir).enable() | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         shutil.rmtree(self.index_dir, ignore_errors=True) | ||||
|  | ||||
|     def test_tag_applied_any(self): | ||||
|         t1 = Tag.objects.create( | ||||
|             name="test", match="keyword", matching_algorithm=Tag.MATCH_ANY) | ||||
|   | ||||
| @@ -1,10 +1,15 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from tempfile import TemporaryDirectory | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.parsers import get_parser_class | ||||
| from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \ | ||||
|     get_parser_class_for_mime_type, DocumentParser, is_file_ext_supported | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser | ||||
| from paperless_text.parsers import TextDocumentParser | ||||
|  | ||||
|  | ||||
| def fake_magic_from_file(file, mime=False): | ||||
| @@ -27,7 +32,7 @@ class TestParserDiscovery(TestCase): | ||||
|             pass | ||||
|  | ||||
|         m.return_value = ( | ||||
|             (None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}), | ||||
|             (None, {"weight": 0, "parser": DummyParser, "mime_types": {"application/pdf": ".pdf"}}), | ||||
|         ) | ||||
|  | ||||
|         self.assertEqual( | ||||
| @@ -45,8 +50,8 @@ class TestParserDiscovery(TestCase): | ||||
|             pass | ||||
|  | ||||
|         m.return_value = ( | ||||
|             (None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}), | ||||
|             (None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}), | ||||
|             (None, {"weight": 0, "parser": DummyParser1, "mime_types": {"application/pdf": ".pdf"}}), | ||||
|             (None, {"weight": 1, "parser": DummyParser2, "mime_types": {"application/pdf": ".pdf"}}), | ||||
|         ) | ||||
|  | ||||
|         self.assertEqual( | ||||
| @@ -61,3 +66,57 @@ class TestParserDiscovery(TestCase): | ||||
|             self.assertIsNone( | ||||
|                 get_parser_class("doc.pdf") | ||||
|             ) | ||||
|  | ||||
|  | ||||
| def fake_get_thumbnail(self, path, mimetype): | ||||
|     return os.path.join(os.path.dirname(__file__), "examples", "no-text.png") | ||||
|  | ||||
|  | ||||
| class TestBaseParser(TestCase): | ||||
|  | ||||
|     def setUp(self) -> None: | ||||
|  | ||||
|         self.scratch = tempfile.mkdtemp() | ||||
|         override_settings( | ||||
|             SCRATCH_DIR=self.scratch | ||||
|         ).enable() | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         shutil.rmtree(self.scratch) | ||||
|  | ||||
|     @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail) | ||||
|     @override_settings(OPTIMIZE_THUMBNAILS=True) | ||||
|     def test_get_optimised_thumbnail(self): | ||||
|         parser = DocumentParser(None) | ||||
|  | ||||
|         parser.get_optimised_thumbnail("any", "not important") | ||||
|  | ||||
|     @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail) | ||||
|     @override_settings(OPTIMIZE_THUMBNAILS=False) | ||||
|     def test_get_optimised_thumb_disabled(self): | ||||
|         parser = DocumentParser(None) | ||||
|  | ||||
|         path = parser.get_optimised_thumbnail("any", "not important") | ||||
|         self.assertEqual(path, fake_get_thumbnail(None, None, None)) | ||||
|  | ||||
|  | ||||
| class TestParserAvailability(TestCase): | ||||
|  | ||||
|     def test_file_extensions(self): | ||||
|  | ||||
|         for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]: | ||||
|             self.assertIn(ext, get_supported_file_extensions()) | ||||
|         self.assertEqual(get_default_file_extension('application/pdf'), ".pdf") | ||||
|         self.assertEqual(get_default_file_extension('image/png'), ".png") | ||||
|         self.assertEqual(get_default_file_extension('image/jpeg'), ".jpg") | ||||
|         self.assertEqual(get_default_file_extension('text/plain'), ".txt") | ||||
|         self.assertEqual(get_default_file_extension('text/csv'), ".csv") | ||||
|         self.assertEqual(get_default_file_extension('application/zip'), ".zip") | ||||
|         self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), "") | ||||
|  | ||||
|         self.assertEqual(get_parser_class_for_mime_type('application/pdf'), RasterisedDocumentParser) | ||||
|         self.assertEqual(get_parser_class_for_mime_type('text/plain'), TextDocumentParser) | ||||
|         self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None) | ||||
|  | ||||
|         self.assertTrue(is_file_ext_supported('.pdf')) | ||||
|         self.assertFalse(is_file_ext_supported('.hsdfh')) | ||||
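The parser declarations above changed mime_types from a list into a mapping of MIME type to default extension, which is what makes get_default_file_extension possible. A sketch of how such a lookup could work under that assumption (not the actual documents.parsers code):

    def default_extension(parsers, mime_type):
        # `parsers` is assumed to be a list of dicts shaped like the
        # declarations above: {"weight": ..., "parser": ..., "mime_types": {mime: ext}}.
        for p in parsers:
            if mime_type in p["mime_types"]:
                return p["mime_types"][mime_type]
        return ""

    assert default_extension(
        [{"weight": 0, "parser": None, "mime_types": {"application/pdf": ".pdf"}}],
        "application/pdf") == ".pdf"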
|   | ||||
src/documents/tests/test_post_consume_handlers.py (new file, 56 lines)
						| @@ -0,0 +1,56 @@ | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.models import Document, Tag, Correspondent | ||||
| from documents.signals.handlers import run_post_consume_script | ||||
|  | ||||
|  | ||||
| class PostConsumeTestCase(TestCase): | ||||
|  | ||||
|     @mock.patch("documents.signals.handlers.Popen") | ||||
|     @override_settings(POST_CONSUME_SCRIPT=None) | ||||
|     def test_no_post_consume_script(self, m): | ||||
|         doc = Document.objects.create(title="Test", mime_type="application/pdf") | ||||
|         tag1 = Tag.objects.create(name="a") | ||||
|         tag2 = Tag.objects.create(name="b") | ||||
|         doc.tags.add(tag1) | ||||
|         doc.tags.add(tag2) | ||||
|  | ||||
|         run_post_consume_script(None, doc) | ||||
|  | ||||
|         m.assert_not_called() | ||||
|  | ||||
|     @mock.patch("documents.signals.handlers.Popen") | ||||
|     @override_settings(POST_CONSUME_SCRIPT="script") | ||||
|     def test_post_consume_script_simple(self, m): | ||||
|         doc = Document.objects.create(title="Test", mime_type="application/pdf") | ||||
|  | ||||
|         run_post_consume_script(None, doc) | ||||
|  | ||||
|         m.assert_called_once() | ||||
|  | ||||
|     @mock.patch("documents.signals.handlers.Popen") | ||||
|     @override_settings(POST_CONSUME_SCRIPT="script") | ||||
|     def test_post_consume_script_with_correspondent(self, m): | ||||
|         c = Correspondent.objects.create(name="my_bank") | ||||
|         doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c) | ||||
|         tag1 = Tag.objects.create(name="a") | ||||
|         tag2 = Tag.objects.create(name="b") | ||||
|         doc.tags.add(tag1) | ||||
|         doc.tags.add(tag2) | ||||
|  | ||||
|         run_post_consume_script(None, doc) | ||||
|  | ||||
|         m.assert_called_once() | ||||
|  | ||||
|         args, kwargs = m.call_args | ||||
|  | ||||
|         command = args[0] | ||||
|  | ||||
|         self.assertEqual(command[0], "script") | ||||
|         self.assertEqual(command[1], str(doc.pk)) | ||||
|         self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/") | ||||
|         self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/") | ||||
|         self.assertEqual(command[7], "my_bank") | ||||
|         self.assertCountEqual(command[8].split(","), ["a", "b"]) | ||||
src/documents/tests/test_sanity_check.py (new file, 87 lines)
						| @@ -0,0 +1,87 @@ | ||||
| import os | ||||
| import shutil | ||||
| from pathlib import Path | ||||
|  | ||||
| from django.test import TestCase | ||||
|  | ||||
| from documents.models import Document | ||||
| from documents.sanity_checker import check_sanity, SanityFailedError | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| class TestSanityCheck(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def make_test_data(self): | ||||
|  | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf"), os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf"), os.path.join(self.dirs.archive_dir, "0000001.pdf")) | ||||
|         shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), os.path.join(self.dirs.thumbnail_dir, "0000001.png")) | ||||
|  | ||||
|         return Document.objects.create(title="test", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", content="test", pk=1, filename="0000001.pdf", mime_type="application/pdf") | ||||
|  | ||||
|     def test_no_docs(self): | ||||
|         self.assertEqual(len(check_sanity()), 0) | ||||
|  | ||||
|     def test_success(self): | ||||
|         self.make_test_data() | ||||
|         self.assertEqual(len(check_sanity()), 0) | ||||
|  | ||||
|     def test_no_thumbnail(self): | ||||
|         doc = self.make_test_data() | ||||
|         os.remove(doc.thumbnail_path) | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
|     def test_thumbnail_no_access(self): | ||||
|         doc = self.make_test_data() | ||||
|         os.chmod(doc.thumbnail_path, 0o000) | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|         os.chmod(doc.thumbnail_path, 0o777) | ||||
|  | ||||
|     def test_no_original(self): | ||||
|         doc = self.make_test_data() | ||||
|         os.remove(doc.source_path) | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
|     def test_original_no_access(self): | ||||
|         doc = self.make_test_data() | ||||
|         os.chmod(doc.source_path, 0o000) | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|         os.chmod(doc.source_path, 0o777) | ||||
|  | ||||
|     def test_original_checksum_mismatch(self): | ||||
|         doc = self.make_test_data() | ||||
|         doc.checksum = "WOW" | ||||
|         doc.save() | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
|     def test_no_archive(self): | ||||
|         doc = self.make_test_data() | ||||
|         os.remove(doc.archive_path) | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
|     def test_archive_no_access(self): | ||||
|         doc = self.make_test_data() | ||||
|         os.chmod(doc.archive_path, 0o000) | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|         os.chmod(doc.archive_path, 0o777) | ||||
|  | ||||
|     def test_archive_checksum_mismatch(self): | ||||
|         doc = self.make_test_data() | ||||
|         doc.archive_checksum = "WOW" | ||||
|         doc.save() | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
|     def test_empty_content(self): | ||||
|         doc = self.make_test_data() | ||||
|         doc.content = "" | ||||
|         doc.save() | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
|     def test_orphaned_file(self): | ||||
|         doc = self.make_test_data() | ||||
|         Path(self.dirs.originals_dir, "orphaned").touch() | ||||
|         self.assertEqual(len(check_sanity()), 1) | ||||
|  | ||||
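Outside the tests, the checker can be run the same way; a minimal sketch using only the behavior asserted above (a list of messages, each convertible to str):

    from documents.sanity_checker import check_sanity

    for message in check_sanity():
        print(str(message))  # one line per detected problem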
|     def test_all(self): | ||||
|         Document.objects.create(title="test", checksum="dgfhj", archive_checksum="dfhg", content="", pk=1, filename="0000001.pdf") | ||||
|         string = str(SanityFailedError(check_sanity())) | ||||
src/documents/tests/test_tasks.py (new file, 24 lines)
						| @@ -0,0 +1,24 @@ | ||||
| from datetime import datetime | ||||
|  | ||||
| from django.test import TestCase | ||||
| from django.utils import timezone | ||||
|  | ||||
| from documents import tasks | ||||
| from documents.models import Document | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| class TestTasks(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def test_index_reindex(self): | ||||
|         Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(), created=timezone.now(), modified=timezone.now()) | ||||
|  | ||||
|         tasks.index_reindex() | ||||
|  | ||||
|     def test_index_optimize(self): | ||||
|         Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(), created=timezone.now(), modified=timezone.now()) | ||||
|  | ||||
|         tasks.index_optimize() | ||||
|  | ||||
|     def test_train_classifier(self): | ||||
|         tasks.train_classifier() | ||||
src/documents/tests/utils.py (new file, 76 lines)
						| @@ -0,0 +1,76 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from collections import namedtuple | ||||
| from contextlib import contextmanager | ||||
|  | ||||
| from django.test import override_settings | ||||
|  | ||||
|  | ||||
| def setup_directories(): | ||||
|  | ||||
|     dirs = namedtuple("Dirs", ()) | ||||
|  | ||||
|     dirs.data_dir = tempfile.mkdtemp() | ||||
|     dirs.scratch_dir = tempfile.mkdtemp() | ||||
|     dirs.media_dir = tempfile.mkdtemp() | ||||
|     dirs.consumption_dir = tempfile.mkdtemp() | ||||
|     dirs.index_dir = os.path.join(dirs.data_dir, "index") | ||||
|     dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals") | ||||
|     dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails") | ||||
|     dirs.archive_dir = os.path.join(dirs.media_dir, "documents", "archive") | ||||
|  | ||||
|     os.makedirs(dirs.index_dir, exist_ok=True) | ||||
|     os.makedirs(dirs.originals_dir, exist_ok=True) | ||||
|     os.makedirs(dirs.thumbnail_dir, exist_ok=True) | ||||
|     os.makedirs(dirs.archive_dir, exist_ok=True) | ||||
|  | ||||
|     dirs.settings_override = override_settings( | ||||
|         DATA_DIR=dirs.data_dir, | ||||
|         SCRATCH_DIR=dirs.scratch_dir, | ||||
|         MEDIA_ROOT=dirs.media_dir, | ||||
|         ORIGINALS_DIR=dirs.originals_dir, | ||||
|         THUMBNAIL_DIR=dirs.thumbnail_dir, | ||||
|         ARCHIVE_DIR=dirs.archive_dir, | ||||
|         CONSUMPTION_DIR=dirs.consumption_dir, | ||||
|         INDEX_DIR=dirs.index_dir, | ||||
|         MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle") | ||||
|  | ||||
|     ) | ||||
|     dirs.settings_override.enable() | ||||
|  | ||||
|     return dirs | ||||
|  | ||||
|  | ||||
| def remove_dirs(dirs): | ||||
|     shutil.rmtree(dirs.media_dir, ignore_errors=True) | ||||
|     shutil.rmtree(dirs.data_dir, ignore_errors=True) | ||||
|     shutil.rmtree(dirs.scratch_dir, ignore_errors=True) | ||||
|     shutil.rmtree(dirs.consumption_dir, ignore_errors=True) | ||||
|     dirs.settings_override.disable() | ||||
|  | ||||
|  | ||||
| @contextmanager | ||||
| def paperless_environment(): | ||||
|     dirs = None | ||||
|     try: | ||||
|         dirs = setup_directories() | ||||
|         yield dirs | ||||
|     finally: | ||||
|         if dirs: | ||||
|             remove_dirs(dirs) | ||||
|  | ||||
|  | ||||
| class DirectoriesMixin: | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         super().__init__(*args, **kwargs) | ||||
|         self.dirs = None | ||||
|  | ||||
|     def setUp(self) -> None: | ||||
|         self.dirs = setup_directories() | ||||
|         super(DirectoriesMixin, self).setUp() | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         super(DirectoriesMixin, self).tearDown() | ||||
|         remove_dirs(self.dirs) | ||||
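For code that cannot use the mixin, paperless_environment offers the same throwaway directory setup as a context manager; for example:

    from documents.tests.utils import paperless_environment

    with paperless_environment() as dirs:
        # All paperless directory settings point at temporary directories
        # here; everything is removed again when the block exits.
        print(dirs.originals_dir)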
| @@ -1,8 +1,16 @@ | ||||
| import os | ||||
| import tempfile | ||||
| from datetime import datetime | ||||
| from time import mktime | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.db.models import Count, Max | ||||
| from django.http import HttpResponse, HttpResponseBadRequest, Http404 | ||||
| from django.views.decorators.cache import cache_control | ||||
| from django.views.generic import TemplateView | ||||
| from django_filters.rest_framework import DjangoFilterBackend | ||||
| from django_q.tasks import async_task | ||||
| from rest_framework import parsers | ||||
| from rest_framework.decorators import action | ||||
| from rest_framework.filters import OrderingFilter, SearchFilter | ||||
| from rest_framework.mixins import ( | ||||
| @@ -30,14 +38,14 @@ from .filters import ( | ||||
|     DocumentTypeFilterSet, | ||||
|     LogFilterSet | ||||
| ) | ||||
| from .forms import UploadForm | ||||
| from .models import Correspondent, Document, Log, Tag, DocumentType | ||||
| from .serialisers import ( | ||||
|     CorrespondentSerializer, | ||||
|     DocumentSerializer, | ||||
|     LogSerializer, | ||||
|     TagSerializer, | ||||
|     DocumentTypeSerializer | ||||
|     DocumentTypeSerializer, | ||||
|     PostDocumentSerializer | ||||
| ) | ||||
|  | ||||
|  | ||||
| @@ -126,36 +134,54 @@ class DocumentViewSet(RetrieveModelMixin, | ||||
|         index.remove_document_from_index(self.get_object()) | ||||
|         return super(DocumentViewSet, self).destroy(request, *args, **kwargs) | ||||
|  | ||||
|     def file_response(self, pk, disposition): | ||||
|     @staticmethod | ||||
|     def original_requested(request): | ||||
|         return ( | ||||
|             'original' in request.query_params and | ||||
|             request.query_params['original'] == 'true' | ||||
|         ) | ||||
|  | ||||
|     def file_response(self, pk, request, disposition): | ||||
|         doc = Document.objects.get(id=pk) | ||||
|  | ||||
|         if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: | ||||
|             file_handle = doc.source_file | ||||
|         if not self.original_requested(request) and os.path.isfile(doc.archive_path):  # NOQA: E501 | ||||
|             file_handle = doc.archive_file | ||||
|             filename = doc.get_public_filename(archive=True) | ||||
|             mime_type = 'application/pdf' | ||||
|         else: | ||||
|             file_handle = GnuPG.decrypted(doc.source_file) | ||||
|             file_handle = doc.source_file | ||||
|             filename = doc.get_public_filename() | ||||
|             mime_type = doc.mime_type | ||||
|  | ||||
|         response = HttpResponse(file_handle, content_type=doc.mime_type) | ||||
|         if doc.storage_type == Document.STORAGE_TYPE_GPG: | ||||
|             file_handle = GnuPG.decrypted(file_handle) | ||||
|  | ||||
|         response = HttpResponse(file_handle, content_type=mime_type) | ||||
|         response["Content-Disposition"] = '{}; filename="{}"'.format( | ||||
|             disposition, doc.file_name) | ||||
|             disposition, filename) | ||||
|         return response | ||||
|  | ||||
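With this change, download and preview serve the archived PDF whenever one exists; clients that want the file exactly as consumed pass original=true, per original_requested above. A sketch (host, port, and document id are placeholders; authentication omitted):

    import requests

    base = "http://localhost:8000/api/documents/17/download/"
    archived = requests.get(base)                               # archived PDF if present
    original = requests.get(base, params={"original": "true"})  # file as consumed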
|     @action(methods=['post'], detail=False) | ||||
|     def post_document(self, request, pk=None): | ||||
|         # TODO: is this a good implementation? | ||||
|         form = UploadForm(data=request.POST, files=request.FILES) | ||||
|         if form.is_valid(): | ||||
|             form.save() | ||||
|             return Response("OK") | ||||
|         else: | ||||
|             return HttpResponseBadRequest(str(form.errors)) | ||||
|     @action(methods=['get'], detail=True) | ||||
|     def metadata(self, request, pk=None): | ||||
|         try: | ||||
|             doc = Document.objects.get(pk=pk) | ||||
|             return Response({ | ||||
|                 "paperless__checksum": doc.checksum, | ||||
|                 "paperless__mime_type": doc.mime_type, | ||||
|                 "paperless__filename": doc.filename, | ||||
|                 "paperless__has_archive_version": | ||||
|                     os.path.isfile(doc.archive_path) | ||||
|             }) | ||||
|         except Document.DoesNotExist: | ||||
|             raise Http404() | ||||
|  | ||||
|     @action(methods=['get'], detail=True) | ||||
|     def preview(self, request, pk=None): | ||||
|         try: | ||||
|             response = self.file_response(pk, "inline") | ||||
|             response = self.file_response( | ||||
|                 pk, request, "inline") | ||||
|             return response | ||||
|         except FileNotFoundError: | ||||
|             raise Http404("Document source file does not exist") | ||||
|         except (FileNotFoundError, Document.DoesNotExist): | ||||
|             raise Http404() | ||||
|  | ||||
|     @action(methods=['get'], detail=True) | ||||
|     @cache_control(public=False, max_age=315360000) | ||||
| @@ -163,15 +189,16 @@ class DocumentViewSet(RetrieveModelMixin, | ||||
|         try: | ||||
|             return HttpResponse(Document.objects.get(id=pk).thumbnail_file, | ||||
|                                 content_type='image/png') | ||||
|         except FileNotFoundError: | ||||
|             raise Http404("Document thumbnail does not exist") | ||||
|         except (FileNotFoundError, Document.DoesNotExist): | ||||
|             raise Http404() | ||||
|  | ||||
|     @action(methods=['get'], detail=True) | ||||
|     def download(self, request, pk=None): | ||||
|         try: | ||||
|             return self.file_response(pk, "attachment") | ||||
|         except FileNotFoundError: | ||||
|             raise Http404("Document source file does not exist") | ||||
|             return self.file_response( | ||||
|                 pk, request, "attachment") | ||||
|         except (FileNotFoundError, Document.DoesNotExist): | ||||
|             raise Http404() | ||||
|  | ||||
|  | ||||
| class LogViewSet(ReadOnlyModelViewSet): | ||||
| @@ -186,11 +213,62 @@ class LogViewSet(ReadOnlyModelViewSet): | ||||
|     ordering_fields = ("created",) | ||||
|  | ||||
|  | ||||
| class PostDocumentView(APIView): | ||||
|  | ||||
|     permission_classes = (IsAuthenticated,) | ||||
|     serializer_class = PostDocumentSerializer | ||||
|     parser_classes = (parsers.MultiPartParser,) | ||||
|  | ||||
|     def get_serializer_context(self): | ||||
|         return { | ||||
|             'request': self.request, | ||||
|             'format': self.format_kwarg, | ||||
|             'view': self | ||||
|         } | ||||
|  | ||||
|     def get_serializer(self, *args, **kwargs): | ||||
|         kwargs['context'] = self.get_serializer_context() | ||||
|         return self.serializer_class(*args, **kwargs) | ||||
|  | ||||
|     def post(self, request, *args, **kwargs): | ||||
|  | ||||
|         serializer = self.get_serializer(data=request.data) | ||||
|         serializer.is_valid(raise_exception=True) | ||||
|  | ||||
|         doc_name, doc_data = serializer.validated_data.get('document') | ||||
|         correspondent_id = serializer.validated_data.get('correspondent') | ||||
|         document_type_id = serializer.validated_data.get('document_type') | ||||
|         tag_ids = serializer.validated_data.get('tags') | ||||
|         title = serializer.validated_data.get('title') | ||||
|  | ||||
|         t = int(mktime(datetime.now().timetuple())) | ||||
|  | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|  | ||||
|         with tempfile.NamedTemporaryFile(prefix="paperless-upload-", | ||||
|                                          dir=settings.SCRATCH_DIR, | ||||
|                                          delete=False) as f: | ||||
|             f.write(doc_data) | ||||
|             os.utime(f.name, times=(t, t)) | ||||
|  | ||||
|             async_task("documents.tasks.consume_file", | ||||
|                        f.name, | ||||
|                        override_filename=doc_name, | ||||
|                        override_title=title, | ||||
|                        override_correspondent_id=correspondent_id, | ||||
|                        override_document_type_id=document_type_id, | ||||
|                        override_tag_ids=tag_ids, | ||||
|                        task_name=os.path.basename(doc_name)[:100]) | ||||
|         return Response("OK") | ||||
|  | ||||
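A sketch of uploading a document through this endpoint, assuming the API is mounted under /api/ as in the URL configuration further down; host and token are placeholders:

    import requests

    response = requests.post(
        "http://localhost:8000/api/documents/post_document/",
        headers={"Authorization": "Token <token>"},
        files={"document": open("invoice.pdf", "rb")},  # field name from the serializer
        data={"title": "Invoice"},  # optional override, per validated_data above
    )
    print(response.text)  # "OK" on success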
|  | ||||
| class SearchView(APIView): | ||||
|  | ||||
|     permission_classes = (IsAuthenticated,) | ||||
|  | ||||
|     ix = index.open_index() | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         super(SearchView, self).__init__(*args, **kwargs) | ||||
|         self.ix = index.open_index() | ||||
|  | ||||
|     def add_infos_to_hit(self, r): | ||||
|         doc = Document.objects.get(id=r['id']) | ||||
| @@ -203,33 +281,42 @@ class SearchView(APIView): | ||||
|                 } | ||||
|  | ||||
|     def get(self, request, format=None): | ||||
|         if 'query' in request.query_params: | ||||
|             query = request.query_params['query'] | ||||
|             try: | ||||
|                 page = int(request.query_params.get('page', 1)) | ||||
|             except (ValueError, TypeError): | ||||
|                 page = 1 | ||||
|  | ||||
|             with index.query_page(self.ix, query, page) as result_page: | ||||
|                 return Response( | ||||
|                     {'count': len(result_page), | ||||
|                      'page': result_page.pagenum, | ||||
|                      'page_count': result_page.pagecount, | ||||
|                      'results': list(map(self.add_infos_to_hit, result_page))}) | ||||
|  | ||||
|         else: | ||||
|         if 'query' not in request.query_params: | ||||
|             return Response({ | ||||
|                 'count': 0, | ||||
|                 'page': 0, | ||||
|                 'page_count': 0, | ||||
|                 'results': []}) | ||||
|  | ||||
|         query = request.query_params['query'] | ||||
|         try: | ||||
|             page = int(request.query_params.get('page', 1)) | ||||
|         except (ValueError, TypeError): | ||||
|             page = 1 | ||||
|  | ||||
|         if page < 1: | ||||
|             page = 1 | ||||
|  | ||||
|         try: | ||||
|             with index.query_page(self.ix, query, page) as (result_page, | ||||
|                                                             corrected_query): | ||||
|                 return Response( | ||||
|                     {'count': len(result_page), | ||||
|                      'page': result_page.pagenum, | ||||
|                      'page_count': result_page.pagecount, | ||||
|                      'corrected_query': corrected_query, | ||||
|                      'results': list(map(self.add_infos_to_hit, result_page))}) | ||||
|         except Exception as e: | ||||
|             return HttpResponseBadRequest(str(e)) | ||||
|  | ||||
|  | ||||
| class SearchAutoCompleteView(APIView): | ||||
|  | ||||
|     permission_classes = (IsAuthenticated,) | ||||
|  | ||||
|     ix = index.open_index() | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         super(SearchAutoCompleteView, self).__init__(*args, **kwargs) | ||||
|         self.ix = index.open_index() | ||||
|  | ||||
|     def get(self, request, format=None): | ||||
|         if 'term' in request.query_params: | ||||
|   | ||||
| @@ -1,8 +1,19 @@ | ||||
| from django.conf import settings | ||||
| from django.contrib.auth.models import User | ||||
| from django.utils.deprecation import MiddlewareMixin | ||||
| from rest_framework import authentication | ||||
|  | ||||
|  | ||||
| class AutoLoginMiddleware(MiddlewareMixin): | ||||
|  | ||||
|     def process_request(self, request): | ||||
|         try: | ||||
|             request.user = User.objects.get( | ||||
|                 username=settings.AUTO_LOGIN_USERNAME) | ||||
|         except User.DoesNotExist: | ||||
|             pass | ||||
|  | ||||
|  | ||||
| class AngularApiAuthenticationOverride(authentication.BaseAuthentication): | ||||
|     """ This class is here to provide authentication to the angular dev server | ||||
|         during development. This is disabled in production. | ||||
|   | ||||
| @@ -57,7 +57,6 @@ def binaries_check(app_configs, **kwargs): | ||||
|     binaries = ( | ||||
|         settings.CONVERT_BINARY, | ||||
|         settings.OPTIPNG_BINARY, | ||||
|         settings.UNPAPER_BINARY, | ||||
|         "tesseract" | ||||
|     ) | ||||
|  | ||||
|   | ||||
| @@ -17,16 +17,3 @@ class GnuPG: | ||||
|             passphrase = settings.PASSPHRASE | ||||
|  | ||||
|         return cls.gpg.decrypt_file(file_handle, passphrase=passphrase).data | ||||
|  | ||||
|     @classmethod | ||||
|     def encrypted(cls, file_handle, passphrase=None): | ||||
|  | ||||
|         if not passphrase: | ||||
|             passphrase = settings.PASSPHRASE | ||||
|  | ||||
|         return cls.gpg.encrypt_file( | ||||
|             file_handle, | ||||
|             recipients=None, | ||||
|             passphrase=passphrase, | ||||
|             symmetric=True | ||||
|         ).data | ||||
|   | ||||
| @@ -49,6 +49,7 @@ STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "sta | ||||
|  | ||||
| MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media")) | ||||
| ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals") | ||||
| ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive") | ||||
| THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") | ||||
|  | ||||
| DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data")) | ||||
| @@ -85,6 +86,7 @@ INSTALLED_APPS = [ | ||||
|     "django.contrib.admin", | ||||
|  | ||||
|     "rest_framework", | ||||
|     "rest_framework.authtoken", | ||||
|     "django_filters", | ||||
|  | ||||
|     "django_q", | ||||
| @@ -96,7 +98,8 @@ INSTALLED_APPS = [ | ||||
| REST_FRAMEWORK = { | ||||
|     'DEFAULT_AUTHENTICATION_CLASSES': [ | ||||
|         'rest_framework.authentication.BasicAuthentication', | ||||
|         'rest_framework.authentication.SessionAuthentication' | ||||
|         'rest_framework.authentication.SessionAuthentication', | ||||
|         'rest_framework.authentication.TokenAuthentication' | ||||
|     ] | ||||
| } | ||||
|  | ||||
| @@ -156,6 +159,15 @@ CHANNEL_LAYERS = { | ||||
| # Security                                                                    # | ||||
| ############################################################################### | ||||
|  | ||||
| AUTO_LOGIN_USERNAME = os.getenv("PAPERLESS_AUTO_LOGIN_USERNAME") | ||||
|  | ||||
| if AUTO_LOGIN_USERNAME: | ||||
|     _index = MIDDLEWARE.index('django.contrib.auth.middleware.AuthenticationMiddleware') | ||||
|     # This overrides everything the auth middleware is doing but still allows | ||||
|     # regular login in case the provided user does not exist. | ||||
|     MIDDLEWARE.insert(_index+1, 'paperless.auth.AutoLoginMiddleware') | ||||
|  | ||||
|  | ||||
| if DEBUG: | ||||
|     X_FRAME_OPTIONS = '' | ||||
|     # this should really be 'allow-from uri' but it's not supported in any major | ||||
| @@ -253,29 +265,48 @@ USE_TZ = True | ||||
| # Logging                                                                     # | ||||
| ############################################################################### | ||||
|  | ||||
| DISABLE_DBHANDLER = __get_boolean("PAPERLESS_DISABLE_DBHANDLER") | ||||
|  | ||||
| LOGGING = { | ||||
|     "version": 1, | ||||
|     "disable_existing_loggers": False, | ||||
|     'formatters': { | ||||
|         'verbose': { | ||||
|             'format': '{levelname} {asctime} {module} {message}', | ||||
|             'style': '{', | ||||
|         }, | ||||
|         'simple': { | ||||
|             'format': '{levelname} {message}', | ||||
|             'style': '{', | ||||
|         }, | ||||
|     }, | ||||
|     "handlers": { | ||||
|         "dbhandler": { | ||||
|         "db": { | ||||
|             "level": "DEBUG", | ||||
|             "class": "documents.loggers.PaperlessHandler", | ||||
|         }, | ||||
|         "streamhandler": { | ||||
|             "class": "logging.StreamHandler" | ||||
|         "console": { | ||||
|             "level": "INFO", | ||||
|             "class": "logging.StreamHandler", | ||||
|             "formatter": "verbose", | ||||
|         } | ||||
|     }, | ||||
|     "root": { | ||||
|         "handlers": ["console"], | ||||
|         "level": "DEBUG", | ||||
|     }, | ||||
|     "loggers": { | ||||
|         "documents": { | ||||
|             "handlers": ["dbhandler", "streamhandler"], | ||||
|             "level": "DEBUG" | ||||
|             "handlers": ["db"], | ||||
|             "propagate": True, | ||||
|         }, | ||||
|         "paperless_mail": { | ||||
|             "handlers": ["dbhandler", "streamhandler"], | ||||
|             "level": "DEBUG" | ||||
|             "handlers": ["db"], | ||||
|             "propagate": True, | ||||
|         }, | ||||
|         "paperless_tesseract": { | ||||
|             "handlers": ["dbhandler", "streamhandler"], | ||||
|             "level": "DEBUG" | ||||
|             "handlers": ["db"], | ||||
|             "propagate": True, | ||||
|         }, | ||||
|     }, | ||||
| } | ||||
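With propagate set to True and no level on the named loggers, a record emitted in the documents app reaches both the database handler and, via the root logger, the console:

    import logging

    # Stored as a Log row by the "db" handler and printed by the root
    # "console" handler (which filters at INFO).
    logging.getLogger("documents").info("consumed a file")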
| @@ -332,6 +363,10 @@ CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0)) | ||||
|  | ||||
| CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES") | ||||
|  | ||||
| CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE") | ||||
|  | ||||
| CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS") | ||||
|  | ||||
| OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") | ||||
|  | ||||
| OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0)) | ||||
| @@ -340,9 +375,17 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0)) | ||||
| # documents.  It should be a 3-letter language code consistent with ISO 639. | ||||
| OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | ||||
|  | ||||
| # OCRmyPDF --output-type options are available. | ||||
| # TODO: validate this setting. | ||||
| OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") | ||||
|  | ||||
| # OCR all documents? | ||||
| OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false") | ||||
| # skip, skip_noarchive, redo, force | ||||
| # TODO: validate this. | ||||
| OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") | ||||
|  | ||||
| OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") | ||||
|  | ||||
| OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") | ||||
|  | ||||
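
OCR_USER_ARGS is kept as a raw JSON string here and parsed later in this diff, where it is merged into the OCRmyPDF keyword arguments. A hypothetical value (deskew and rotate_pages are documented OCRmyPDF options):

    import json

    # e.g. PAPERLESS_OCR_USER_ARGS='{"deskew": true, "rotate_pages": true}'
    user_args = json.loads('{"deskew": true, "rotate_pages": true}')
    # -> {'deskew': True, 'rotate_pages': True}
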
| # GNUPG needs a home directory for some reason | ||||
| GNUPG_HOME = os.getenv("HOME", "/tmp") | ||||
| @@ -351,11 +394,10 @@ GNUPG_HOME = os.getenv("HOME", "/tmp") | ||||
| CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert") | ||||
| CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") | ||||
| CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") | ||||
| CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300)) | ||||
|  | ||||
| GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") | ||||
|  | ||||
| OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng") | ||||
| UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") | ||||
|  | ||||
|  | ||||
| # Pre-2.x versions of Paperless stored your documents locally with GPG | ||||
|   | ||||
| @@ -4,6 +4,7 @@ from django.contrib.auth.decorators import login_required | ||||
| from django.urls import path, re_path | ||||
| from django.views.decorators.csrf import csrf_exempt | ||||
| from django.views.generic import RedirectView | ||||
| from rest_framework.authtoken import views | ||||
| from rest_framework.routers import DefaultRouter | ||||
|  | ||||
| from paperless.consumers import StatusConsumer | ||||
| @@ -16,7 +17,8 @@ from documents.views import ( | ||||
|     SearchView, | ||||
|     IndexView, | ||||
|     SearchAutoCompleteView, | ||||
|     StatisticsView | ||||
|     StatisticsView, | ||||
|     PostDocumentView | ||||
| ) | ||||
| from paperless.views import FaviconView | ||||
|  | ||||
| @@ -46,6 +48,11 @@ urlpatterns = [ | ||||
|                 StatisticsView.as_view(), | ||||
|                 name="statistics"), | ||||
|  | ||||
|         re_path(r"^documents/post_document/", PostDocumentView.as_view(), | ||||
|                 name="post_document"), | ||||
|  | ||||
|         path('token/', views.obtain_auth_token) | ||||
|  | ||||
|     ] + api_router.urls)), | ||||
|  | ||||
|     re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"), | ||||
|   | ||||
| @@ -1 +1 @@ | ||||
| __version__ = (0, 9, 1) | ||||
| __version__ = (0, 9, 5) | ||||
|   | ||||
| @@ -4,6 +4,7 @@ from datetime import timedelta, date | ||||
|  | ||||
| import magic | ||||
| from django.conf import settings | ||||
| from django.db import DatabaseError | ||||
| from django.utils.text import slugify | ||||
| from django_q.tasks import async_task | ||||
| from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \ | ||||
| @@ -86,46 +87,6 @@ def make_criterias(rule): | ||||
|     return {**criterias, **get_rule_action(rule).get_criteria()} | ||||
|  | ||||
|  | ||||
| def get_title(message, att, rule): | ||||
|     if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT: | ||||
|         title = message.subject | ||||
|     elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME: | ||||
|         title = os.path.splitext(os.path.basename(att.filename))[0] | ||||
|     else: | ||||
|         raise ValueError("Unknown title selector.") | ||||
|  | ||||
|     return title | ||||
|  | ||||
|  | ||||
| def get_correspondent(message, rule): | ||||
|     if rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NOTHING: | ||||
|         correspondent = None | ||||
|     elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_EMAIL: | ||||
|         correspondent_name = message.from_ | ||||
|         correspondent = Correspondent.objects.get_or_create( | ||||
|             name=correspondent_name, defaults={ | ||||
|                 "slug": slugify(correspondent_name) | ||||
|             })[0] | ||||
|     elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NAME: | ||||
|         if message.from_values and \ | ||||
|            'name' in message.from_values \ | ||||
|            and message.from_values['name']: | ||||
|             correspondent_name = message.from_values['name'] | ||||
|         else: | ||||
|             correspondent_name = message.from_ | ||||
|  | ||||
|         correspondent = Correspondent.objects.get_or_create( | ||||
|             name=correspondent_name, defaults={ | ||||
|                 "slug": slugify(correspondent_name) | ||||
|             })[0] | ||||
|     elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_CUSTOM: | ||||
|         correspondent = rule.assign_correspondent | ||||
|     else: | ||||
|         raise ValueError("Unknwown correspondent selector") | ||||
|  | ||||
|     return correspondent | ||||
|  | ||||
|  | ||||
| def get_mailbox(server, port, security): | ||||
|     if security == MailAccount.IMAP_SECURITY_NONE: | ||||
|         mailbox = MailBoxUnencrypted(server, port) | ||||
| @@ -140,6 +101,51 @@ def get_mailbox(server, port, security): | ||||
|  | ||||
| class MailAccountHandler(LoggingMixin): | ||||
|  | ||||
|     def _correspondent_from_name(self, name): | ||||
|         try: | ||||
|             return Correspondent.objects.get_or_create( | ||||
|                 name=name, defaults={ | ||||
|                     "slug": slugify(name) | ||||
|                 })[0] | ||||
|         except DatabaseError as e: | ||||
|             self.log( | ||||
|                 "error", | ||||
|                 f"Error while retrieving correspondent {name}: {e}" | ||||
|             ) | ||||
|             return None | ||||
|  | ||||
|     def get_title(self, message, att, rule): | ||||
|         if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT: | ||||
|             return message.subject | ||||
|  | ||||
|         elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME: | ||||
|             return os.path.splitext(os.path.basename(att.filename))[0] | ||||
|  | ||||
|         else: | ||||
|             raise ValueError("Unknown title selector.") | ||||
|  | ||||
|     def get_correspondent(self, message, rule): | ||||
|         c_from = rule.assign_correspondent_from | ||||
|  | ||||
|         if c_from == MailRule.CORRESPONDENT_FROM_NOTHING: | ||||
|             return None | ||||
|  | ||||
|         elif c_from == MailRule.CORRESPONDENT_FROM_EMAIL: | ||||
|             return self._correspondent_from_name(message.from_) | ||||
|  | ||||
|         elif c_from == MailRule.CORRESPONDENT_FROM_NAME: | ||||
|             if message.from_values and 'name' in message.from_values and message.from_values['name']:  # NOQA: E501 | ||||
|                 return self._correspondent_from_name( | ||||
|                     message.from_values['name']) | ||||
|             else: | ||||
|                 return self._correspondent_from_name(message.from_) | ||||
|  | ||||
|         elif c_from == MailRule.CORRESPONDENT_FROM_CUSTOM: | ||||
|             return rule.assign_correspondent | ||||
|  | ||||
|         else: | ||||
|             raise ValueError("Unknwown correspondent selector") | ||||
|  | ||||
|     def handle_mail_account(self, account): | ||||
|  | ||||
|         self.renew_logging_group() | ||||
| @@ -156,79 +162,89 @@ class MailAccountHandler(LoggingMixin): | ||||
|                 M.login(account.username, account.password) | ||||
|             except Exception: | ||||
|                 raise MailError( | ||||
|                     f"Error while authenticating account {account.name}") | ||||
|                     f"Error while authenticating account {account}") | ||||
|  | ||||
|             self.log('debug', f"Account {account}: Processing " | ||||
|                               f"{account.rules.count()} rule(s)") | ||||
|  | ||||
|             for rule in account.rules.order_by('order'): | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Account {account}: Processing rule {rule.name}") | ||||
|  | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Rule {account}.{rule}: Selecting folder {rule.folder}") | ||||
|  | ||||
|                 try: | ||||
|                     M.folder.set(rule.folder) | ||||
|                 except MailboxFolderSelectError: | ||||
|                     raise MailError( | ||||
|                         f"Rule {rule.name}: Folder {rule.folder} " | ||||
|                         f"does not exist in account {account.name}") | ||||
|                     total_processed_files += self.handle_mail_rule(M, rule) | ||||
|                 except Exception as e: | ||||
|                     self.log( | ||||
|                         "error", | ||||
|                         f"Rule {rule}: Error while processing rule: {e}", | ||||
|                         exc_info=True | ||||
|                     ) | ||||
|  | ||||
|                 criterias = make_criterias(rule) | ||||
|         return total_processed_files | ||||
|  | ||||
|     def handle_mail_rule(self, M, rule): | ||||
|  | ||||
|         self.log( | ||||
|             'debug', | ||||
|             f"Rule {rule}: Selecting folder {rule.folder}") | ||||
|  | ||||
|         try: | ||||
|             M.folder.set(rule.folder) | ||||
|         except MailboxFolderSelectError: | ||||
|             raise MailError( | ||||
|                 f"Rule {rule}: Folder {rule.folder} " | ||||
|                 f"does not exist in account {rule.account}") | ||||
|  | ||||
|         criterias = make_criterias(rule) | ||||
|  | ||||
|         self.log( | ||||
|             'debug', | ||||
|             f"Rule {rule}: Searching folder with criteria " | ||||
|             f"{str(AND(**criterias))}") | ||||
|  | ||||
|         try: | ||||
|             messages = M.fetch(criteria=AND(**criterias), | ||||
|                                mark_seen=False) | ||||
|         except Exception: | ||||
|             raise MailError( | ||||
|                 f"Rule {rule}: Error while fetching folder {rule.folder}") | ||||
|  | ||||
|         post_consume_messages = [] | ||||
|  | ||||
|         mails_processed = 0 | ||||
|         total_processed_files = 0 | ||||
|  | ||||
|         for message in messages: | ||||
|             try: | ||||
|                 processed_files = self.handle_message(message, rule) | ||||
|                 if processed_files > 0: | ||||
|                     post_consume_messages.append(message.uid) | ||||
|  | ||||
|                 total_processed_files += processed_files | ||||
|                 mails_processed += 1 | ||||
|             except Exception as e: | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Rule {account}.{rule}: Searching folder with criteria " | ||||
|                     f"{str(AND(**criterias))}") | ||||
|                     "error", | ||||
|                     f"Rule {rule}: Error while processing mail " | ||||
|                     f"{message.uid}: {e}", | ||||
|                     exc_info=True) | ||||
|  | ||||
|                 try: | ||||
|                     messages = M.fetch(criteria=AND(**criterias), | ||||
|                                        mark_seen=False) | ||||
|                 except Exception: | ||||
|                     raise MailError( | ||||
|                         f"Rule {rule.name}: Error while fetching folder " | ||||
|                         f"{rule.folder} of account {account.name}") | ||||
|         self.log( | ||||
|             'debug', | ||||
|             f"Rule {rule}: Processed {mails_processed} matching mail(s)") | ||||
|  | ||||
|                 post_consume_messages = [] | ||||
|         self.log( | ||||
|             'debug', | ||||
|             f"Rule {rule}: Running mail actions on " | ||||
|             f"{len(post_consume_messages)} mails") | ||||
|  | ||||
|                 mails_processed = 0 | ||||
|         try: | ||||
|             get_rule_action(rule).post_consume( | ||||
|                 M, | ||||
|                 post_consume_messages, | ||||
|                 rule.action_parameter) | ||||
|  | ||||
|                 for message in messages: | ||||
|                     try: | ||||
|                         processed_files = self.handle_message(message, rule) | ||||
|                     except Exception: | ||||
|                         raise MailError( | ||||
|                             f"Rule {rule.name}: Error while processing mail " | ||||
|                             f"{message.uid} of account {account.name}") | ||||
|                     if processed_files > 0: | ||||
|                         post_consume_messages.append(message.uid) | ||||
|  | ||||
|                     total_processed_files += processed_files | ||||
|                     mails_processed += 1 | ||||
|  | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Rule {account}.{rule}: Processed {mails_processed} " | ||||
|                     f"matching mail(s)") | ||||
|  | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Rule {account}.{rule}: Running mail actions on " | ||||
|                     f"{len(post_consume_messages)} mails") | ||||
|  | ||||
|                 try: | ||||
|                     get_rule_action(rule).post_consume( | ||||
|                         M, | ||||
|                         post_consume_messages, | ||||
|                         rule.action_parameter) | ||||
|  | ||||
|                 except Exception: | ||||
|                     raise MailError( | ||||
|                         f"Rule {rule.name}: Error while processing " | ||||
|                         f"post-consume actions for account {account.name}") | ||||
|         except Exception as e: | ||||
|             raise MailError( | ||||
|                 f"Rule {rule}: Error while processing post-consume actions: " | ||||
|                 f"{e}") | ||||
|  | ||||
|         return total_processed_files | ||||
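
The refactor splits the previous monolithic loop into three levels, each of which catches errors itself, so one bad rule or mail no longer aborts the whole account. The resulting structure, read off the diff:

    # handle_mail_account(account)       # logs in, iterates rules, logs per-rule errors
    #   handle_mail_rule(M, rule)        # selects folder, fetches, logs per-mail errors
    #     handle_message(message, rule)  # consumes attachments, returns file count
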
|  | ||||
| @@ -238,11 +254,11 @@ class MailAccountHandler(LoggingMixin): | ||||
|  | ||||
|         self.log( | ||||
|             'debug', | ||||
|             f"Rule {rule.account}.{rule}: " | ||||
|             f"Rule {rule}: " | ||||
|             f"Processing mail {message.subject} from {message.from_} with " | ||||
|             f"{len(message.attachments)} attachment(s)") | ||||
|  | ||||
|         correspondent = get_correspondent(message, rule) | ||||
|         correspondent = self.get_correspondent(message, rule) | ||||
|         tag = rule.assign_tag | ||||
|         doc_type = rule.assign_document_type | ||||
|  | ||||
| @@ -253,12 +269,12 @@ class MailAccountHandler(LoggingMixin): | ||||
|             if not att.content_disposition == "attachment": | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Rule {rule.account}.{rule}: " | ||||
|                     f"Rule {rule}: " | ||||
|                     f"Skipping attachment {att.filename} " | ||||
|                     f"with content disposition inline") | ||||
|                     f"with content disposition {att.content_disposition}") | ||||
|                 continue | ||||
|  | ||||
|             title = get_title(message, att, rule) | ||||
|             title = self.get_title(message, att, rule) | ||||
|  | ||||
|             # don't trust the content type of the attachment. Could be | ||||
|             # generic application/octet-stream. | ||||
| @@ -274,7 +290,7 @@ class MailAccountHandler(LoggingMixin): | ||||
|  | ||||
|                 self.log( | ||||
|                     'info', | ||||
|                     f"Rule {rule.account}.{rule}: " | ||||
|                     f"Rule {rule}: " | ||||
|                     f"Consuming attachment {att.filename} from mail " | ||||
|                     f"{message.subject} from {message.from_}") | ||||
|  | ||||
| @@ -293,7 +309,7 @@ class MailAccountHandler(LoggingMixin): | ||||
|             else: | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Rule {rule.account}.{rule}: " | ||||
|                     f"Rule {rule}: " | ||||
|                     f"Skipping attachment {att.filename} " | ||||
|                     f"since guessed mime type {mime_type} is not supported " | ||||
|                     f"by paperless") | ||||
|   | ||||
| @@ -139,4 +139,4 @@ class MailRule(models.Model): | ||||
|     ) | ||||
|  | ||||
|     def __str__(self): | ||||
|         return self.name | ||||
|         return f"{self.account.name}.{self.name}" | ||||
|   | ||||
| @@ -1,14 +1,20 @@ | ||||
| import logging | ||||
|  | ||||
| from paperless_mail.mail import MailAccountHandler | ||||
| from paperless_mail.mail import MailAccountHandler, MailError | ||||
| from paperless_mail.models import MailAccount | ||||
|  | ||||
|  | ||||
| def process_mail_accounts(): | ||||
|     total_new_documents = 0 | ||||
|     for account in MailAccount.objects.all(): | ||||
|         total_new_documents += MailAccountHandler().handle_mail_account( | ||||
|             account) | ||||
|         try: | ||||
|             total_new_documents += MailAccountHandler().handle_mail_account( | ||||
|                 account) | ||||
|         except MailError as e: | ||||
|             logging.getLogger(__name__).error( | ||||
|                 f"Error while processing mail account {account}: {e}", | ||||
|                 exc_info=True | ||||
|             ) | ||||
|  | ||||
|     if total_new_documents > 0: | ||||
|         return f"Added {total_new_documents} document(s)." | ||||
| @@ -17,8 +23,8 @@ def process_mail_accounts(): | ||||
|  | ||||
|  | ||||
| def process_mail_account(name): | ||||
|     account = MailAccount.objects.find(name=name) | ||||
|     if account: | ||||
|     try: | ||||
|         account = MailAccount.objects.get(name=name) | ||||
|         MailAccountHandler().handle_mail_account(account) | ||||
|     else: | ||||
|         logging.error("Unknown mail acccount: {}".format(name)) | ||||
|     except MailAccount.DoesNotExist: | ||||
|         logging.getLogger(__name__).error(f"Unknown mail account: {name}") | ||||
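
This rewrite also fixes a latent bug: the old code called MailAccount.objects.find(), which is not a Django manager method and would have raised AttributeError. Usage stays the same:

    from paperless_mail import tasks

    tasks.process_mail_accounts()              # all accounts; errors are logged
    tasks.process_mail_account("my-account")   # single account; name is a placeholder
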
|   | ||||
| @@ -3,11 +3,14 @@ from collections import namedtuple | ||||
| from typing import ContextManager | ||||
| from unittest import mock | ||||
|  | ||||
| from django.core.management import call_command | ||||
| from django.db import DatabaseError | ||||
| from django.test import TestCase | ||||
| from imap_tools import MailMessageFlags, MailboxFolderSelectError | ||||
|  | ||||
| from documents.models import Correspondent | ||||
| from paperless_mail.mail import MailError, MailAccountHandler, get_correspondent, get_title | ||||
| from paperless_mail import tasks | ||||
| from paperless_mail.mail import MailError, MailAccountHandler | ||||
| from paperless_mail.models import MailRule, MailAccount | ||||
|  | ||||
|  | ||||
| @@ -163,28 +166,30 @@ class TestMail(TestCase): | ||||
|         me_localhost = Correspondent.objects.create(name=message2.from_) | ||||
|         someone_else = Correspondent.objects.create(name="someone else") | ||||
|  | ||||
|         handler = MailAccountHandler() | ||||
|  | ||||
|         rule = MailRule(name="a", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING) | ||||
|         self.assertIsNone(get_correspondent(message, rule)) | ||||
|         self.assertIsNone(handler.get_correspondent(message, rule)) | ||||
|  | ||||
|         rule = MailRule(name="b", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL) | ||||
|         c = get_correspondent(message, rule) | ||||
|         c = handler.get_correspondent(message, rule) | ||||
|         self.assertIsNotNone(c) | ||||
|         self.assertEqual(c.name, "someone@somewhere.com") | ||||
|         c = get_correspondent(message2, rule) | ||||
|         c = handler.get_correspondent(message2, rule) | ||||
|         self.assertIsNotNone(c) | ||||
|         self.assertEqual(c.name, "me@localhost.com") | ||||
|         self.assertEqual(c.id, me_localhost.id) | ||||
|  | ||||
|         rule = MailRule(name="c", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME) | ||||
|         c = get_correspondent(message, rule) | ||||
|         c = handler.get_correspondent(message, rule) | ||||
|         self.assertIsNotNone(c) | ||||
|         self.assertEqual(c.name, "Someone!") | ||||
|         c = get_correspondent(message2, rule) | ||||
|         c = handler.get_correspondent(message2, rule) | ||||
|         self.assertIsNotNone(c) | ||||
|         self.assertEqual(c.id, me_localhost.id) | ||||
|  | ||||
|         rule = MailRule(name="d", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else) | ||||
|         c = get_correspondent(message, rule) | ||||
|         c = handler.get_correspondent(message, rule) | ||||
|         self.assertEqual(c, someone_else) | ||||
|  | ||||
|     def test_get_title(self): | ||||
| @@ -192,10 +197,13 @@ class TestMail(TestCase): | ||||
|         message.subject = "the message title" | ||||
|         att = namedtuple('Attachment', []) | ||||
|         att.filename = "this_is_the_file.pdf" | ||||
|  | ||||
|         handler = MailAccountHandler() | ||||
|  | ||||
|         rule = MailRule(name="a", assign_title_from=MailRule.TITLE_FROM_FILENAME) | ||||
|         self.assertEqual(get_title(message, att, rule), "this_is_the_file") | ||||
|         self.assertEqual(handler.get_title(message, att, rule), "this_is_the_file") | ||||
|         rule = MailRule(name="b", assign_title_from=MailRule.TITLE_FROM_SUBJECT) | ||||
|         self.assertEqual(get_title(message, att, rule), "the message title") | ||||
|         self.assertEqual(handler.get_title(message, att, rule), "the message title") | ||||
|  | ||||
|     def test_handle_message(self): | ||||
|         message = create_message(subject="the message title", from_="Myself", num_attachments=2) | ||||
| @@ -317,7 +325,7 @@ class TestMail(TestCase): | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages), 2) | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages_spam), 1) | ||||
|  | ||||
|     def test_errors(self): | ||||
|     def test_error_login(self): | ||||
|         account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong") | ||||
|  | ||||
|         try: | ||||
| @@ -327,26 +335,84 @@ class TestMail(TestCase): | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
|     def test_error_skip_account(self): | ||||
|         account_faulty = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wroasdng") | ||||
|  | ||||
|         account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret") | ||||
|         rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh") | ||||
|         rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, | ||||
|                                        action_parameter="spam", filter_subject="Claim") | ||||
|  | ||||
|         tasks.process_mail_accounts() | ||||
|         self.assertEqual(self.async_task.call_count, 1) | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages), 2) | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages_spam), 1) | ||||
|  | ||||
|     def test_error_skip_rule(self): | ||||
|  | ||||
|         account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret") | ||||
|         rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, | ||||
|                                        action_parameter="spam", filter_subject="Claim", order=1, folder="uuuhhhh") | ||||
|         rule2 = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE, | ||||
|                                        action_parameter="spam", filter_subject="Claim", order=2) | ||||
|  | ||||
|         self.mail_account_handler.handle_mail_account(account) | ||||
|         self.assertEqual(self.async_task.call_count, 1) | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages), 2) | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages_spam), 1) | ||||
|  | ||||
|  | ||||
|     @mock.patch("paperless_mail.mail.MailAccountHandler.get_correspondent") | ||||
|     def test_error_skip_mail(self, m): | ||||
|  | ||||
|         def get_correspondent_fake(message, rule): | ||||
|             if message.from_ == 'amazon@amazon.de': | ||||
|                 raise ValueError("Does not compute.") | ||||
|             else: | ||||
|                 return None | ||||
|  | ||||
|         m.side_effect = get_correspondent_fake | ||||
|  | ||||
|         account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret") | ||||
|         rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="spam") | ||||
|  | ||||
|         self.mail_account_handler.handle_mail_account(account) | ||||
|  | ||||
|         # test that we still consume mail even if some mails throw errors. | ||||
|         self.assertEqual(self.async_task.call_count, 2) | ||||
|  | ||||
|         # faulty mail still in inbox, untouched | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages), 1) | ||||
|         self.assertEqual(self.bogus_mailbox.messages[0].from_, 'amazon@amazon.de') | ||||
|  | ||||
|     def test_error_create_correspondent(self): | ||||
|  | ||||
|         account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret") | ||||
|         rule = MailRule.objects.create( | ||||
|             name="testrule", filter_from="amazon@amazon.de", | ||||
|             account=account, action=MailRule.ACTION_MOVE, action_parameter="spam", | ||||
|             assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL) | ||||
|  | ||||
|         self.mail_account_handler.handle_mail_account(account) | ||||
|  | ||||
|         self.async_task.assert_called_once() | ||||
|         args, kwargs = self.async_task.call_args | ||||
|  | ||||
|         c = Correspondent.objects.get(name="amazon@amazon.de") | ||||
|         # should work | ||||
|         self.assertEquals(kwargs['override_correspondent_id'], c.id) | ||||
|  | ||||
|         self.async_task.reset_mock() | ||||
|         self.reset_bogus_mailbox() | ||||
|  | ||||
|         with mock.patch("paperless_mail.mail.Correspondent.objects.get_or_create") as m: | ||||
|             m.side_effect = DatabaseError() | ||||
|  | ||||
|         try: | ||||
|             self.mail_account_handler.handle_mail_account(account) | ||||
|         except MailError as e: | ||||
|             self.assertTrue("uuuh does not exist" in str(e)) | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
|         account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret") | ||||
|         args, kwargs = self.async_task.call_args | ||||
|         self.async_task.assert_called_once() | ||||
|         self.assertEquals(kwargs['override_correspondent_id'], None) | ||||
|  | ||||
|         rule = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim") | ||||
|  | ||||
|         try: | ||||
|             self.mail_account_handler.handle_mail_account(account) | ||||
|         except MailError as e: | ||||
|             self.assertTrue("Error while processing post-consume actions" in str(e)) | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
|     def test_filters(self): | ||||
|  | ||||
| @@ -390,3 +456,43 @@ class TestMail(TestCase): | ||||
|         self.mail_account_handler.handle_mail_account(account) | ||||
|         self.assertEqual(len(self.bogus_mailbox.messages), 2) | ||||
|         self.assertEqual(self.async_task.call_count, 5) | ||||
|  | ||||
| class TestManagementCommand(TestCase): | ||||
|  | ||||
|     @mock.patch("paperless_mail.management.commands.mail_fetcher.tasks.process_mail_accounts") | ||||
|     def test_mail_fetcher(self, m): | ||||
|  | ||||
|         call_command("mail_fetcher") | ||||
|  | ||||
|         m.assert_called_once() | ||||
|  | ||||
| class TestTasks(TestCase): | ||||
|  | ||||
|     @mock.patch("paperless_mail.tasks.MailAccountHandler.handle_mail_account") | ||||
|     def test_all_accounts(self, m): | ||||
|         m.side_effect = lambda account: 6 | ||||
|  | ||||
|         MailAccount.objects.create(name="A", imap_server="A", username="A", password="A") | ||||
|         MailAccount.objects.create(name="B", imap_server="A", username="A", password="A") | ||||
|  | ||||
|         result = tasks.process_mail_accounts() | ||||
|  | ||||
|         self.assertEqual(m.call_count, 2) | ||||
|         self.assertIn("Added 12", result) | ||||
|  | ||||
|         m.side_effect = lambda account: 0 | ||||
|         result = tasks.process_mail_accounts() | ||||
|         self.assertIn("No new", result) | ||||
|  | ||||
|     @mock.patch("paperless_mail.tasks.MailAccountHandler.handle_mail_account") | ||||
|     def test_single_accounts(self, m): | ||||
|  | ||||
|         MailAccount.objects.create(name="A", imap_server="A", username="A", password="A") | ||||
|  | ||||
|         tasks.process_mail_account("A") | ||||
|  | ||||
|         m.assert_called_once() | ||||
|         m.reset_mock() | ||||
|  | ||||
|         tasks.process_mail_account("B") | ||||
|         m.assert_not_called() | ||||
|   | ||||
| @@ -0,0 +1,2 @@ | ||||
| # this is here so that django finds the checks. | ||||
| from .checks import * | ||||
|   | ||||
src/paperless_tesseract/checks.py (new file, 34 lines)
						| @@ -0,0 +1,34 @@ | ||||
| import subprocess | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.core.checks import Error, Warning, register | ||||
|  | ||||
|  | ||||
| def get_tesseract_langs(): | ||||
|     with subprocess.Popen(['tesseract', '--list-langs'], | ||||
|                           stdout=subprocess.PIPE) as p: | ||||
|         stdout, stderr = p.communicate() | ||||
|  | ||||
|     return stdout.decode().strip().split("\n")[1:] | ||||
|  | ||||
|  | ||||
| @register() | ||||
| def check_default_language_available(app_configs, **kwargs): | ||||
|     installed_langs = get_tesseract_langs() | ||||
|  | ||||
|     if not settings.OCR_LANGUAGE: | ||||
|         return [Warning( | ||||
|             "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. " | ||||
|             "This means that tesseract will fallback to english." | ||||
|         )] | ||||
|  | ||||
|     specified_langs = settings.OCR_LANGUAGE.split("+") | ||||
|  | ||||
|     for lang in specified_langs: | ||||
|         if lang not in installed_langs: | ||||
|             return [Error( | ||||
|                 f"The selected ocr language {lang} is " | ||||
|                 f"not installed. Paperless cannot OCR your documents " | ||||
|                 f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")] | ||||
|  | ||||
|     return [] | ||||
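
The @register() decorator hooks this into Django's system check framework, so it runs automatically with most manage.py commands; it can also be triggered explicitly:

    from django.core.management import call_command

    call_command("check")  # runs all registered checks, including the one above
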
| @@ -1,23 +1,15 @@ | ||||
| import itertools | ||||
| import json | ||||
| import os | ||||
| import re | ||||
| import subprocess | ||||
| from multiprocessing.pool import ThreadPool | ||||
|  | ||||
| import langdetect | ||||
| import ocrmypdf | ||||
| import pdftotext | ||||
| import pyocr | ||||
| from PIL import Image | ||||
| from django.conf import settings | ||||
| from pyocr import PyocrException | ||||
| from ocrmypdf import InputFileError, EncryptedPdfError | ||||
|  | ||||
| from documents.parsers import DocumentParser, ParseError, run_unpaper, \ | ||||
|     run_convert | ||||
| from .languages import ISO639 | ||||
|  | ||||
|  | ||||
| class OCRError(Exception): | ||||
|     pass | ||||
| from documents.parsers import DocumentParser, ParseError, run_convert | ||||
|  | ||||
|  | ||||
| class RasterisedDocumentParser(DocumentParser): | ||||
| @@ -26,11 +18,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, path, logging_group, progress_callback): | ||||
|         super().__init__(path, logging_group, progress_callback) | ||||
|         self._text = None | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         """ | ||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||
|         """ | ||||
| @@ -43,8 +31,8 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                         scale="500x5000>", | ||||
|                         alpha="remove", | ||||
|                         strip=True, | ||||
|                         trim=True, | ||||
|                         input_file="{}[0]".format(self.document_path), | ||||
|                         trim=False, | ||||
|                         input_file="{}[0]".format(document_path), | ||||
|                         output_file=out_path, | ||||
|                         logging_group=self.logging_group) | ||||
|         except ParseError: | ||||
| @@ -59,7 +47,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                    "-q", | ||||
|                    "-sDEVICE=pngalpha", | ||||
|                    "-o", gs_out_path, | ||||
|                    self.document_path] | ||||
|                    document_path] | ||||
|             if not subprocess.Popen(cmd).wait() == 0: | ||||
|                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) | ||||
|             # then run convert on the output from gs | ||||
| @@ -67,187 +55,160 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                         scale="500x5000>", | ||||
|                         alpha="remove", | ||||
|                         strip=True, | ||||
|                         trim=True, | ||||
|                         trim=False, | ||||
|                         input_file=gs_out_path, | ||||
|                         output_file=out_path, | ||||
|                         logging_group=self.logging_group) | ||||
|  | ||||
|         return out_path | ||||
|  | ||||
|     def _is_ocred(self): | ||||
|  | ||||
|         # Extract text from PDF using pdftotext | ||||
|         text = get_text_from_pdf(self.document_path) | ||||
|  | ||||
|         # We assume, that a PDF with at least 50 characters contains text | ||||
|         # (so no OCR required) | ||||
|         return len(text) > 50 | ||||
|  | ||||
|     def get_text(self): | ||||
|  | ||||
|         if self._text is not None: | ||||
|             return self._text | ||||
|  | ||||
|         if not settings.OCR_ALWAYS and self._is_ocred(): | ||||
|             self.log("debug", "Skipping OCR, using Text from PDF") | ||||
|             self._text = get_text_from_pdf(self.document_path) | ||||
|             return self._text | ||||
|  | ||||
|         self.progress_callback(0, 1, "Making greyscale images.") | ||||
|         images = self._get_greyscale() | ||||
|  | ||||
|         if not images: | ||||
|             raise ParseError("Empty document, nothing to do.") | ||||
|     def is_image(self, mime_type): | ||||
|         return mime_type in [ | ||||
|             "image/png", | ||||
|             "image/jpeg", | ||||
|             "image/tiff", | ||||
|             "image/bmp", | ||||
|             "image/gif", | ||||
|         ] | ||||
|  | ||||
|     def get_dpi(self, image): | ||||
|         try: | ||||
|  | ||||
|             sample_page_index = int(len(images) / 2) | ||||
|             self.log( | ||||
|                 "debug", | ||||
|                 f"Attempting language detection on page " | ||||
|                 f"{sample_page_index + 1} of {len(images)}...") | ||||
|             self.progress_callback(0.4, 1, "Language Detection.") | ||||
|             sample_page_text = self._ocr([images[sample_page_index]], | ||||
|                                          settings.OCR_LANGUAGE)[0] | ||||
|             guessed_language = self._guess_language(sample_page_text) | ||||
|             self.progress_callback(0.6, 1, "OCR all the pages.") | ||||
|  | ||||
|             if not guessed_language or guessed_language not in ISO639: | ||||
|                 self.log("warning", "Language detection failed.") | ||||
|                 ocr_pages = self._complete_ocr_default_language( | ||||
|                     images, sample_page_index, sample_page_text) | ||||
|  | ||||
|             elif ISO639[guessed_language] == settings.OCR_LANGUAGE: | ||||
|                 self.log( | ||||
|                     "debug", | ||||
|                     f"Detected language: {guessed_language} " | ||||
|                     f"(default language)") | ||||
|                 ocr_pages = self._complete_ocr_default_language( | ||||
|                     images, sample_page_index, sample_page_text) | ||||
|  | ||||
|             elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():  # NOQA: E501 | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     f"Detected language {guessed_language} is not available " | ||||
|                     f"on this system.") | ||||
|                 ocr_pages = self._complete_ocr_default_language( | ||||
|                     images, sample_page_index, sample_page_text) | ||||
|  | ||||
|             else: | ||||
|                 self.log("debug", f"Detected language: {guessed_language}") | ||||
|                 ocr_pages = self._ocr( | ||||
|                     images, ISO639[guessed_language], report_progress=True) | ||||
|  | ||||
|             self.log("debug", "OCR completed.") | ||||
|             self._text = strip_excess_whitespace(" ".join(ocr_pages)) | ||||
|             return self._text | ||||
|  | ||||
|         except OCRError as e: | ||||
|             raise ParseError(e) | ||||
|  | ||||
|     def _get_greyscale(self): | ||||
|         """ | ||||
|         Greyscale images are easier for Tesseract to OCR | ||||
|         """ | ||||
|  | ||||
|         # Convert PDF to multiple PNMs | ||||
|         input_file = self.document_path | ||||
|  | ||||
|         if settings.OCR_PAGES == 1: | ||||
|             input_file += "[0]" | ||||
|         elif settings.OCR_PAGES > 1: | ||||
|             input_file += f"[0-{settings.OCR_PAGES - 1}]" | ||||
|  | ||||
|         self.log( | ||||
|             "debug", | ||||
|             f"Converting document {input_file} into greyscale images") | ||||
|  | ||||
|         output_files = os.path.join(self.tempdir, "convert-%04d.pnm") | ||||
|  | ||||
|         run_convert(density=settings.CONVERT_DENSITY, | ||||
|                     depth="8", | ||||
|                     type="grayscale", | ||||
|                     input_file=input_file, | ||||
|                     output_file=output_files, | ||||
|                     logging_group=self.logging_group) | ||||
|  | ||||
|         # Get a list of converted images | ||||
|         pnms = [] | ||||
|         for f in os.listdir(self.tempdir): | ||||
|             if f.endswith(".pnm"): | ||||
|                 pnms.append(os.path.join(self.tempdir, f)) | ||||
|  | ||||
|         self.log("debug", f"Running unpaper on {len(pnms)} pages...") | ||||
|  | ||||
|         self.progress_callback(0.2,1, "Running unpaper on {} pages...".format(len(pnms))) | ||||
|  | ||||
|         # Run unpaper in parallel on converted images | ||||
|         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: | ||||
|             pnms = pool.map(run_unpaper, pnms) | ||||
|  | ||||
|         return sorted(filter(lambda __: os.path.isfile(__), pnms)) | ||||
|  | ||||
|     def _guess_language(self, text): | ||||
|         try: | ||||
|             guess = langdetect.detect(text) | ||||
|             return guess | ||||
|             with Image.open(image) as im: | ||||
|                 x, y = im.info['dpi'] | ||||
|                 return x | ||||
|         except Exception as e: | ||||
|             self.log('warning', f"Language detection failed with: {e}") | ||||
|             self.log( | ||||
|                 'warning', | ||||
|                 f"Error while getting DPI from image {image}: {e}") | ||||
|             return None | ||||
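
get_dpi relies on Pillow exposing DPI metadata, when present, through Image.info. A minimal sketch (the file name is hypothetical):

    from PIL import Image

    with Image.open("scan.png") as im:
        # e.g. (300, 300); the key is absent for files without DPI metadata
        print(im.info.get("dpi"))
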
|  | ||||
|     def _ocr(self, imgs, lang, report_progress=False): | ||||
|         self.log( | ||||
|             "debug", | ||||
|             f"Performing OCR on {len(imgs)} page(s) with language {lang}") | ||||
|         r = [] | ||||
|         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: | ||||
|             # r = pool.map(image_to_string, itertools.product(imgs, [lang])) | ||||
|             for i, page in enumerate(pool.imap(image_to_string, itertools.product(imgs, [lang]))): | ||||
|                 if report_progress: | ||||
|                     self.progress_callback(0.6 + (i / len(imgs)) * 0.4, 1, "OCR'ed {} pages".format(i+1)) | ||||
|                 r += [page] | ||||
|         return r | ||||
|     def parse(self, document_path, mime_type): | ||||
|         mode = settings.OCR_MODE | ||||
|  | ||||
|     def _complete_ocr_default_language(self, | ||||
|                                        images, | ||||
|                                        sample_page_index, | ||||
|                                        sample_page): | ||||
|         images_copy = list(images) | ||||
|         del images_copy[sample_page_index] | ||||
|         if images_copy: | ||||
|             self.log('debug', "Continuing ocr with default language.") | ||||
|             ocr_pages = self._ocr( | ||||
|                 images_copy, settings.OCR_LANGUAGE, report_progress=True) | ||||
|             ocr_pages.insert(sample_page_index, sample_page) | ||||
|             return ocr_pages | ||||
|         text_original = get_text_from_pdf(document_path) | ||||
|         has_text = text_original and len(text_original) > 50 | ||||
|  | ||||
|         if mode == "skip_noarchive" and has_text: | ||||
|             self.log("debug", | ||||
|                      "Document has text, skipping OCRmyPDF entirely.") | ||||
|             self.text = text_original | ||||
|             return | ||||
|  | ||||
|         if mode in ['skip', 'skip_noarchive'] and not has_text: | ||||
|             # upgrade to redo, since there appears to be no text in the | ||||
|             # document. This happens to some weird encrypted documents or | ||||
|             # documents with failed OCR attempts for which OCRmyPDF will | ||||
|             # still report that there actually is text in them. | ||||
|             self.log("debug", | ||||
|                      "No text was found in the document and skip is " | ||||
|                      "specified. Upgrading OCR mode to redo.") | ||||
|             mode = "redo" | ||||
|  | ||||
|         archive_path = os.path.join(self.tempdir, "archive.pdf") | ||||
|  | ||||
|         ocr_args = { | ||||
|             'input_file': document_path, | ||||
|             'output_file': archive_path, | ||||
|             'use_threads': True, | ||||
|             'jobs': settings.THREADS_PER_WORKER, | ||||
|             'language': settings.OCR_LANGUAGE, | ||||
|             'output_type': settings.OCR_OUTPUT_TYPE, | ||||
|             'progress_bar': False, | ||||
|             'clean': True | ||||
|         } | ||||
|  | ||||
|         if settings.OCR_PAGES > 0: | ||||
|             ocr_args['pages'] = f"1-{settings.OCR_PAGES}" | ||||
|  | ||||
|         # Mode selection. | ||||
|  | ||||
|         if mode in ['skip', 'skip_noarchive']: | ||||
|             ocr_args['skip_text'] = True | ||||
|         elif mode == 'redo': | ||||
|             ocr_args['redo_ocr'] = True | ||||
|         elif mode == 'force': | ||||
|             ocr_args['force_ocr'] = True | ||||
|         else: | ||||
|             return [sample_page] | ||||
|             raise ParseError( | ||||
|                 f"Invalid ocr mode: {mode}") | ||||
|  | ||||
|         if self.is_image(mime_type): | ||||
|             dpi = self.get_dpi(document_path) | ||||
|             if dpi: | ||||
|                 self.log( | ||||
|                     "debug", | ||||
|                     f"Detected DPI for image {document_path}: {dpi}" | ||||
|                 ) | ||||
|                 ocr_args['image_dpi'] = dpi | ||||
|             elif settings.OCR_IMAGE_DPI: | ||||
|                 ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI | ||||
|             else: | ||||
|                 raise ParseError( | ||||
|                     f"Cannot produce archive PDF for image {document_path}, " | ||||
|                     f"no DPI information is present in this image and " | ||||
|                     f"OCR_IMAGE_DPI is not set.") | ||||
|  | ||||
|         if settings.OCR_USER_ARGS: | ||||
|             try: | ||||
|                 user_args = json.loads(settings.OCR_USER_ARGS) | ||||
|                 ocr_args = {**ocr_args, **user_args} | ||||
|             except Exception as e: | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " | ||||
|                     f"they will not be used: {e}") | ||||
|  | ||||
|         # This forces tesseract to use one core per page. | ||||
|         os.environ['OMP_THREAD_LIMIT'] = "1" | ||||
|  | ||||
|         try: | ||||
|             self.log("debug", | ||||
|                      f"Calling OCRmyPDF with {str(ocr_args)}") | ||||
|             ocrmypdf.ocr(**ocr_args) | ||||
|             # success! announce results | ||||
|             self.archive_path = archive_path | ||||
|             self.text = get_text_from_pdf(archive_path) | ||||
|  | ||||
|         except (InputFileError, EncryptedPdfError) as e: | ||||
|  | ||||
|             self.log("debug", | ||||
|                      f"Encountered an error: {e}. Trying to use text from " | ||||
|                      f"original.") | ||||
|             # This happens with some PDFs when used with the redo_ocr option. | ||||
|             # This is not the end of the world, we'll just use what we already | ||||
|             # have in the document. | ||||
|             self.text = text_original | ||||
|             # Also, no archived file. | ||||
|             if not self.text: | ||||
|                 # However, if we don't have anything, fail: | ||||
|                 raise ParseError(e) | ||||
|  | ||||
|         except Exception as e: | ||||
|             # Anything else is probably serious. | ||||
|             raise ParseError(e) | ||||
|  | ||||
|         if not self.text: | ||||
|             # This may happen for files that don't have any text. | ||||
|             self.log( | ||||
|                 'warning', | ||||
|                 f"Document {document_path} does not have any text." | ||||
|                 f"This is probably an error or you tried to add an image " | ||||
|                 f"without text, or something is wrong with this document.") | ||||
|             self.text = "" | ||||
|  | ||||
|  | ||||
| def strip_excess_whitespace(text): | ||||
|     if not text: | ||||
|         return None | ||||
|  | ||||
|     collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) | ||||
|     no_leading_whitespace = re.sub( | ||||
|         r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) | ||||
|     no_trailing_whitespace = re.sub( | ||||
|         r"([^\S\n\r]+)$", '', no_leading_whitespace) | ||||
|     return no_trailing_whitespace | ||||
|  | ||||
|  | ||||
| def image_to_string(args): | ||||
|     img, lang = args | ||||
|     ocr = pyocr.get_available_tools()[0] | ||||
|     with Image.open(img) as f: | ||||
|         if ocr.can_detect_orientation(): | ||||
|             try: | ||||
|                 orientation = ocr.detect_orientation(f, lang=lang) | ||||
|                 f = f.rotate(orientation["angle"], expand=1) | ||||
|             except Exception: | ||||
|                 # Rotation not possible, ignore | ||||
|                 pass | ||||
|         try: | ||||
|             return ocr.image_to_string(f, lang=lang) | ||||
|         except PyocrException as e: | ||||
|             raise OCRError(e) | ||||
|     # TODO: this needs a rework | ||||
|     return no_trailing_whitespace.strip() | ||||
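
Assuming the regexes above behave as written: interior runs of spaces and tabs collapse to one space, indentation after line breaks is dropped, and the result is stripped:

    strip_excess_whitespace("one \t two\n   three\n")
    # -> "one two\nthree"
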
|  | ||||
|  | ||||
| def get_text_from_pdf(pdf_file): | ||||
| @@ -256,6 +217,9 @@ def get_text_from_pdf(pdf_file): | ||||
|         try: | ||||
|             pdf = pdftotext.PDF(f) | ||||
|         except pdftotext.Error: | ||||
|             return "" | ||||
|             # might not be a PDF file | ||||
|             return None | ||||
|  | ||||
|     return "\n".join(pdf) | ||||
|     text = "\n".join(pdf) | ||||
|  | ||||
|     return strip_excess_whitespace(text) | ||||
|   | ||||
| @@ -5,9 +5,12 @@ def tesseract_consumer_declaration(sender, **kwargs): | ||||
|     return { | ||||
|         "parser": RasterisedDocumentParser, | ||||
|         "weight": 0, | ||||
|         "mime_types": [ | ||||
|             "application/pdf", | ||||
|             "image/jpeg", | ||||
|             "image/png" | ||||
|         ] | ||||
|         "mime_types": { | ||||
|             "application/pdf": ".pdf", | ||||
|             "image/jpeg": ".jpg", | ||||
|             "image/png": ".png", | ||||
|             "image/tiff": ".tif", | ||||
|             "image/gif": ".gif", | ||||
|             "image/bmp": ".bmp", | ||||
|         } | ||||
|     } | ||||
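
The mime_types value changed from a plain list to a mapping from mime type to a default file extension, so consumers can look up both. A small sketch:

    declaration = tesseract_consumer_declaration(sender=None)
    print(declaration["mime_types"]["image/tiff"])  # -> ".tif"
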
|   | ||||
Binary sample files added (src/paperless_tesseract/tests/samples/):
  multi-page-digital.pdf (new), multi-page-images.pdf (new),
  no-text-alpha.png (new, 32 KiB), simple-alpha.png (new, 8.2 KiB),
  simple-digital.pdf (new), simple-no-dpi.png (new, 6.8 KiB),
  simple.bmp (new, 1.7 MiB), simple.gif (new, 18 KiB),
  simple.jpg (new, 19 KiB), one modified image (7.7 KiB before, 7.2 KiB after),
  simple.tif (new), with-form.pdf (new)
						| @@ -1,193 +0,0 @@ | ||||
| import datetime | ||||
| import os | ||||
| import shutil | ||||
| from unittest import mock | ||||
| from uuid import uuid4 | ||||
|  | ||||
| from dateutil import tz | ||||
| from django.conf import settings | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from ..parsers import RasterisedDocumentParser | ||||
|  | ||||
|  | ||||
| class TestDate(TestCase): | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|     SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) | ||||
|  | ||||
|     def setUp(self): | ||||
|         os.makedirs(self.SCRATCH, exist_ok=True) | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.SCRATCH) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_1(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = "lorem ipsum 130218 lorem ipsum" | ||||
|         self.assertEqual(document.get_date(), None) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_2(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = "lorem ipsum 2018 lorem ipsum" | ||||
|         self.assertEqual(document.get_date(), None) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_3(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = "lorem ipsum 20180213 lorem ipsum" | ||||
|         self.assertEqual(document.get_date(), None) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_4(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = "lorem ipsum 13.02.2018 lorem ipsum" | ||||
|         date = document.get_date() | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2018, 2, 13, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_5(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " | ||||
|             "ipsum" | ||||
|         ) | ||||
|         date = document.get_date() | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2018, 2, 13, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_6(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "Wohnort\n" | ||||
|             "3100\n" | ||||
|             "IBAN\n" | ||||
|             "AT87 4534\n" | ||||
|             "1234\n" | ||||
|             "1234 5678\n" | ||||
|             "BIC\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         self.assertEqual(document.get_date(), None) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_7(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "März 2019\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         date = document.get_date() | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2019, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_8(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "Wohnort\n" | ||||
|             "3100\n" | ||||
|             "IBAN\n" | ||||
|             "AT87 4534\n" | ||||
|             "1234\n" | ||||
|             "1234 5678\n" | ||||
|             "BIC\n" | ||||
|             "lorem ipsum\n" | ||||
|             "März 2020" | ||||
|         ) | ||||
|         self.assertEqual( | ||||
|             document.get_date(), | ||||
|             datetime.datetime( | ||||
|                 2020, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_9(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "27. Nullmonth 2020\n" | ||||
|             "März 2020\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         self.assertEqual( | ||||
|             document.get_date(), | ||||
|             datetime.datetime( | ||||
|                 2020, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", | ||||
|         return_value="01-07-0590 00:00:00" | ||||
|     ) | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_crazy_date_past(self, *args): | ||||
|         document = RasterisedDocumentParser("/dev/null", None) | ||||
|         document.get_text() | ||||
|         self.assertIsNone(document.get_date()) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", | ||||
|         return_value="01-07-2350 00:00:00" | ||||
|     ) | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_crazy_date_future(self, *args): | ||||
|         document = RasterisedDocumentParser("/dev/null", None) | ||||
|         document.get_text() | ||||
|         self.assertIsNone(document.get_date()) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", | ||||
|         return_value="20 408000l 2475" | ||||
|     ) | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_crazy_date_with_spaces(self, *args): | ||||
|         document = RasterisedDocumentParser("/dev/null", None) | ||||
|         document.get_text() | ||||
|         self.assertIsNone(document.get_date()) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", | ||||
|         return_value="No date in here" | ||||
|     ) | ||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_filename_date_parse_invalid(self, *args): | ||||
|         document = RasterisedDocumentParser("/tmp/20 408000l 2475 - test.pdf", None) | ||||
|         document.get_text() | ||||
|         self.assertIsNone(document.get_date()) | ||||
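The deleted tests above pin down the date-detection contract: bare digit runs
like "130218" or "20180213" never match, dotted dates and German month names
do, and implausible years (0590, 2350) are rejected. A minimal sketch of that
filtering, covering only the dd.mm.yyyy case (the regex and function name are
illustrative assumptions, not the project's actual implementation):

    import re
    from datetime import datetime

    # Only one of the several formats the real parser understands.
    DATE_PATTERN = re.compile(r"\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b")

    def find_date(text, earliest=1900):
        for day, month, year in DATE_PATTERN.findall(text):
            try:
                candidate = datetime(int(year), int(month), int(day))
            except ValueError:
                continue  # e.g. 32.13.2018 is not a real date
            # reject "crazy" dates such as the years 0590 or 2350 above
            if earliest <= candidate.year <= datetime.now().year:
                return candidate
        return None

    assert find_date("lorem ipsum 13.02.2018 lorem ipsum").year == 2018
    assert find_date("lorem ipsum 130218 lorem ipsum") is None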
| @@ -1,76 +0,0 @@ | ||||
| import os | ||||
| from unittest import mock, skipIf | ||||
|  | ||||
| import pyocr | ||||
| from django.test import TestCase | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
|  | ||||
| from ..parsers import image_to_string, strip_excess_whitespace | ||||
|  | ||||
|  | ||||
| class FakeTesseract(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def can_detect_orientation(): | ||||
|         return True | ||||
|  | ||||
|     @staticmethod | ||||
|     def detect_orientation(file_handle, lang): | ||||
|         raise OtherTesseractError("arbitrary status", "message") | ||||
|  | ||||
|     @staticmethod | ||||
|     def image_to_string(file_handle, lang): | ||||
|         return "This is test text" | ||||
|  | ||||
|  | ||||
| class FakePyOcr(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_available_tools(): | ||||
|         return [FakeTesseract] | ||||
|  | ||||
|  | ||||
| class TestOCR(TestCase): | ||||
|  | ||||
|     text_cases = [ | ||||
|         ("simple     string", "simple string"), | ||||
|         ( | ||||
|             "simple    newline\n   testing string", | ||||
|             "simple newline\ntesting string" | ||||
|         ), | ||||
|         ( | ||||
|             "utf-8   строка с пробелами в конце  ", | ||||
|             "utf-8 строка с пробелами в конце" | ||||
|         ) | ||||
|     ] | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|     TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) | ||||
|  | ||||
|     def test_strip_excess_whitespace(self): | ||||
|         for source, result in self.text_cases: | ||||
|             actual_result = strip_excess_whitespace(source) | ||||
|             self.assertEqual( | ||||
|                 result, | ||||
|                 actual_result, | ||||
|                 "strip_excess_whitespace({}) != '{}', but '{}'".format( | ||||
|                     source, | ||||
|                     result, | ||||
|                     actual_result | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|     @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") | ||||
|     @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) | ||||
|     def test_image_to_string_with_text_free_page(self): | ||||
|         """ | ||||
|         This test is sort of silly, since it's really just reproducing an odd | ||||
|         exception thrown by pyocr when it encounters a page with no text. | ||||
|         Actually running this test against an installation of Tesseract results | ||||
|         in a segmentation fault rooted somewhere deep inside pyocr where I | ||||
|         don't care to dig.  Regardless, if you run the consumer normally, | ||||
|         text-free pages are now handled correctly so long as we work around | ||||
|         this weird exception. | ||||
|         """ | ||||
|         image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"]) | ||||
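For reference, the whitespace behaviour the text_cases above encode (collapse
runs of spaces within a line, drop spaces around newlines, trim the ends) can
be reproduced with two regular expressions. A sketch under those assumptions,
not necessarily the project's own implementation:

    import re

    def strip_excess_whitespace_sketch(text):
        # collapse runs of spaces/tabs (but not newlines) to one space
        collapsed = re.sub(r"[^\S\n]+", " ", text)
        # drop the spaces hugging a newline, then trim both ends
        return re.sub(r" ?\n ?", "\n", collapsed).strip()

    assert strip_excess_whitespace_sketch("simple     string") == "simple string"
    assert (strip_excess_whitespace_sketch("simple    newline\n   testing string")
            == "simple newline\ntesting string")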
| @@ -1,46 +1,17 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| import uuid | ||||
| from typing import ContextManager | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase, override_settings | ||||
| from pyocr.error import TesseractError | ||||
|  | ||||
| from documents.parsers import ParseError, run_convert | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, strip_excess_whitespace | ||||
|  | ||||
| image_to_string_calls = [] | ||||
|  | ||||
|  | ||||
| class FakeTesseract(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def can_detect_orientation(): | ||||
|         return True | ||||
|  | ||||
|     @staticmethod | ||||
|     def detect_orientation(file_handle, lang): | ||||
|         raise TesseractError("arbitrary status", "message") | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_available_languages(): | ||||
|         return ['eng', 'deu'] | ||||
|  | ||||
|     @staticmethod | ||||
|     def image_to_string(file_handle, lang): | ||||
|         image_to_string_calls.append((file_handle.name, lang)) | ||||
|         return file_handle.read() | ||||
|  | ||||
|  | ||||
| class FakePyOcr(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_available_tools(): | ||||
|         return [FakeTesseract] | ||||
|  | ||||
|  | ||||
| def fake_convert(input_file, output_file, **kwargs): | ||||
|     with open(input_file) as f: | ||||
|         lines = f.readlines() | ||||
| @@ -50,12 +21,6 @@ def fake_convert(input_file, output_file, **kwargs): | ||||
|             f2.write(line.strip()) | ||||
|  | ||||
|  | ||||
| def fake_unpaper(pnm): | ||||
|     output = pnm + ".unpaper.pnm" | ||||
|     shutil.copy(pnm, output) | ||||
|     return output | ||||
|  | ||||
|  | ||||
| class FakeImageFile(ContextManager): | ||||
|     def __init__(self, fname): | ||||
|         self.fname = fname | ||||
| @@ -67,142 +32,50 @@ class FakeImageFile(ContextManager): | ||||
|         return os.path.basename(self.fname) | ||||
|  | ||||
|  | ||||
| fake_image = FakeImageFile | ||||
|  | ||||
|  | ||||
| @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) | ||||
| @mock.patch("paperless_tesseract.parsers.run_convert", fake_convert) | ||||
| @mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper) | ||||
| @mock.patch("paperless_tesseract.parsers.Image.open", open) | ||||
| class TestRasterisedDocumentParser(TestCase): | ||||
| class TestParser(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def setUp(self): | ||||
|         self.scratch = tempfile.mkdtemp() | ||||
|     def assertContainsStrings(self, content, strings): | ||||
|         # Asserts that all strings appear in content, in the given order. | ||||
|         indices = [content.index(s) for s in strings] | ||||
|         self.assertListEqual(indices, sorted(indices)) | ||||
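Stand-alone illustration of the ordered-substring check this helper performs
(hypothetical values; content.index() raises ValueError for a missing string,
and the sorted() comparison enforces the order of appearance):

    content = "page 1 ... page 2 ... page 3"
    indices = [content.index(s) for s in ["page 1", "page 3"]]
    assert indices == sorted(indices)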
|  | ||||
|         global image_to_string_calls | ||||
|     text_cases = [ | ||||
|         ("simple     string", "simple string"), | ||||
|         ( | ||||
|             "simple    newline\n   testing string", | ||||
|             "simple newline\ntesting string" | ||||
|         ), | ||||
|         ( | ||||
|             "utf-8   строка с пробелами в конце  ", | ||||
|             "utf-8 строка с пробелами в конце" | ||||
|         ) | ||||
|     ] | ||||
|  | ||||
|         image_to_string_calls = [] | ||||
|  | ||||
|         override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable() | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.scratch) | ||||
|  | ||||
|     def get_input_file(self, pages): | ||||
|         _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch) | ||||
|         with open(fname, "w") as f: | ||||
|             f.writelines([f"line {p}\n" for p in range(pages)]) | ||||
|         return fname | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") | ||||
|     def test_parse_text_simple_language_match(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") | ||||
|     def test_parse_text_2_pages(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") | ||||
|     def test_parse_text_3_pages(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None) | ||||
|     def test_parse_text_lang_detect_failed(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it") | ||||
|     def test_parse_text_lang_not_installed(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2 line 3") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") | ||||
|     def test_parse_text_lang_mismatch(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") | ||||
|     def test_parse_empty_doc(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4()) | ||||
|         try: | ||||
|             parser.get_text() | ||||
|         except ParseError as e: | ||||
|             self.assertEqual("Empty document, nothing to do.", str(e)) | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
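The deleted language tests above fix a fallback policy: OCR in the configured
language first, then switch only when detection succeeds and the detected
language is actually installed. A sketch of that decision (the function name
and the ISO-639 mapping are assumptions for illustration):

    def choose_ocr_language(detected, installed, default="eng"):
        if not detected:
            return default                    # detection failed
        iso3 = {"en": "eng", "de": "deu", "it": "ita"}.get(detected)
        if iso3 and iso3 in installed and iso3 != default:
            return iso3                       # re-OCR in the detected language
        return default                        # e.g. "it" without "ita" installed

    assert choose_ocr_language("de", ["eng", "deu"]) == "deu"
    assert choose_ocr_language("it", ["eng", "deu"]) == "eng"
    assert choose_ocr_language(None, ["eng", "deu"]) == "eng"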
|  | ||||
| class TestAuxilliaryFunctions(TestCase): | ||||
|  | ||||
|     def setUp(self): | ||||
|         self.scratch = tempfile.mkdtemp() | ||||
|  | ||||
|         override_settings(SCRATCH_DIR=self.scratch).enable() | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.scratch) | ||||
|     def test_strip_excess_whitespace(self): | ||||
|         for source, result in self.text_cases: | ||||
|             actual_result = strip_excess_whitespace(source) | ||||
|             self.assertEqual( | ||||
|                 result, | ||||
|                 actual_result, | ||||
|                 "strip_excess_whitespace({}) != '{}', but '{}'".format( | ||||
|                     source, | ||||
|                     result, | ||||
|                     actual_result | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|  | ||||
|     def test_get_text_from_pdf(self): | ||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf')) | ||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf')) | ||||
|  | ||||
|         self.assertEqual(text.strip(), "This is a test document.") | ||||
|  | ||||
|     def test_get_text_from_pdf_error(self): | ||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png')) | ||||
|  | ||||
|         self.assertEqual(text.strip(), "") | ||||
|  | ||||
|     def test_image_to_string(self): | ||||
|         text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng")) | ||||
|  | ||||
|         self.assertEqual(text, "This is a test document.") | ||||
|  | ||||
|     def test_image_to_string_language_unavailable(self): | ||||
|         try: | ||||
|             image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita")) | ||||
|         except OCRError as e: | ||||
|             self.assertTrue("Failed loading language" in str(e)) | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
|     @override_settings(OCR_ALWAYS=False) | ||||
|     @mock.patch("paperless_tesseract.parsers.get_text_from_pdf") | ||||
|     @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale") | ||||
|     def test_is_ocred(self, m2, m): | ||||
|         parser = RasterisedDocumentParser("", uuid.uuid4()) | ||||
|         m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \ | ||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " \ | ||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " | ||||
|         parser.get_text() | ||||
|         self.assertEqual(m.call_count, 2) | ||||
|         self.assertEqual(m2.call_count, 0) | ||||
|         self.assertContainsStrings(text.strip(), ["This is a test document."]) | ||||
|  | ||||
|     def test_thumbnail(self): | ||||
|         parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) | ||||
|         parser.get_thumbnail() | ||||
|         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||
|         parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") | ||||
|         # No reliable way to inspect the thumbnail; just call it and assert that it does not raise. | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.run_convert") | ||||
| @@ -216,6 +89,191 @@ class TestAuxilliaryFunctions(TestCase): | ||||
|  | ||||
|         m.side_effect = call_convert | ||||
|  | ||||
|         parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) | ||||
|         parser.get_thumbnail() | ||||
|         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||
|         parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") | ||||
|         # No reliable way to inspect the thumbnail; just call it and assert that it does not raise. | ||||
|  | ||||
|     def test_get_dpi(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png")) | ||||
|         self.assertEqual(dpi, None) | ||||
|  | ||||
|         dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple.png")) | ||||
|         self.assertEqual(dpi, 72) | ||||
|  | ||||
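A minimal sketch of what test_get_dpi exercises, assuming Pillow (im.info
carries a "dpi" tuple only when the image declares one; the function name
here is illustrative):

    from PIL import Image

    def get_dpi_sketch(image_path):
        with Image.open(image_path) as im:
            dpi = im.info.get("dpi")   # absent for simple-no-dpi.png
            return round(dpi[0]) if dpi else None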
|     def test_simple_digital(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["This is a test document."]) | ||||
|  | ||||
|     def test_with_form(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||
|  | ||||
|     @override_settings(OCR_MODE="redo") | ||||
|     def test_with_form_error(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertIsNone(parser.archive_path) | ||||
|         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||
|  | ||||
|     @override_settings(OCR_MODE="redo") | ||||
|     @mock.patch("paperless_tesseract.parsers.get_text_from_pdf", lambda _: None) | ||||
|     def test_with_form_error_notext(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         def f(): | ||||
|             parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertRaises(ParseError, f) | ||||
|  | ||||
|     @override_settings(OCR_MODE="force") | ||||
|     def test_with_form_force(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||
|  | ||||
|     def test_image_simple(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.png"), "image/png") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["This is a test document."]) | ||||
|  | ||||
|     def test_image_simple_alpha_fail(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         def f(): | ||||
|             parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png") | ||||
|  | ||||
|         self.assertRaises(ParseError, f) | ||||
|  | ||||
|  | ||||
|     def test_image_no_dpi_fail(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         def f(): | ||||
|             parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||
|  | ||||
|         self.assertRaises(ParseError, f) | ||||
|  | ||||
|     @override_settings(OCR_IMAGE_DPI=72) | ||||
|     def test_image_no_dpi_default(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["this is a test document."]) | ||||
|  | ||||
|     def test_multi_page(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="skip") | ||||
|     def test_multi_page_pages_skip(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="redo") | ||||
|     def test_multi_page_pages_redo(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="force") | ||||
|     def test_multi_page_pages_force(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip") | ||||
|     def test_multi_page_analog_pages_skip(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="redo") | ||||
|     def test_multi_page_analog_pages_redo(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"]) | ||||
|         self.assertFalse("page 3" in parser.get_text().lower()) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=1, OCR_MODE="force") | ||||
|     def test_multi_page_analog_pages_force(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1"]) | ||||
|         self.assertFalse("page 2" in parser.get_text().lower()) | ||||
|         self.assertFalse("page 3" in parser.get_text().lower()) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip_noarchive") | ||||
|     def test_skip_noarchive_withtext(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertIsNone(parser.archive_path) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip_noarchive") | ||||
|     def test_skip_noarchive_notext(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
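Taken together, the OCR_MODE/OCR_PAGES tests above describe a small decision
table: "skip" keeps an existing text layer untouched, "redo" and "force"
re-run OCR, and OCR_PAGES then caps how many pages are processed. A sketch of
that logic (names assumed, not the parser's actual code):

    def pages_to_ocr(mode, has_text_layer, total_pages, ocr_pages=0):
        if mode == "skip" and has_text_layer:
            return []                         # keep the digital text as-is
        limit = ocr_pages or total_pages
        return list(range(min(limit, total_pages)))

    assert pages_to_ocr("skip", True, 3) == []
    assert pages_to_ocr("redo", False, 3, ocr_pages=2) == [0, 1]
    assert pages_to_ocr("force", False, 3, ocr_pages=1) == [0]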
|  | ||||
| class TestParserFileTypes(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|  | ||||
|     def test_bmp(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertTrue("this is a test document" in parser.get_text().lower()) | ||||
|  | ||||
|     def test_jpg(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertTrue("this is a test document" in parser.get_text().lower()) | ||||
|  | ||||
|     @override_settings(OCR_IMAGE_DPI=200) | ||||
|     def test_gif(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertTrue("this is a test document" in parser.get_text().lower()) | ||||
|  | ||||
|     def test_tiff(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertTrue("this is a test document" in parser.get_text().lower()) | ||||
|   | ||||
| @@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser): | ||||
|     This parser directly parses a text document (.txt, .md, or .csv) | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, path, logging_group): | ||||
|         super().__init__(path, logging_group) | ||||
|         self._text = None | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         """ | ||||
|         The thumbnail of a text file is just a 500px wide image of the text | ||||
|         rendered onto a letter-sized page. | ||||
| @@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser): | ||||
|             ) | ||||
|  | ||||
|         def read_text(): | ||||
|             with open(self.document_path, 'r') as src: | ||||
|             with open(document_path, 'r') as src: | ||||
|                 lines = [line.strip() for line in src.readlines()] | ||||
|                 text = "\n".join([line for line in lines[:n_lines]]) | ||||
|                 return text.replace('"', "'") | ||||
| @@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser): | ||||
|  | ||||
|         return out_path | ||||
|  | ||||
|     def get_text(self): | ||||
|  | ||||
|         if self._text is not None: | ||||
|             return self._text | ||||
|  | ||||
|         with open(self.document_path, 'r') as f: | ||||
|             self._text = f.read() | ||||
|  | ||||
|         return self._text | ||||
|     def parse(self, document_path, mime_type): | ||||
|         with open(document_path, 'r') as f: | ||||
|             self.text = f.read() | ||||
|  | ||||
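Under the reworked contract shown in this hunk, callers invoke parse() with
the path and mime type and read the result from the text attribute. A hedged
usage sketch (the import path and a file at that location are assumptions):

    from paperless_text.parsers import TextDocumentParser

    parser = TextDocumentParser(logging_group=None)
    parser.parse("/tmp/example.txt", "text/plain")
    print(parser.text)   # parse() stores the raw file contents here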
|  | ||||
| def run_command(*args): | ||||
|   | ||||
| @@ -5,8 +5,8 @@ def text_consumer_declaration(sender, **kwargs): | ||||
|     return { | ||||
|         "parser": TextDocumentParser, | ||||
|         "weight": 10, | ||||
|         "mime_types": [ | ||||
|             "text/plain", | ||||
|             "text/comma-separated-values" | ||||
|         ] | ||||
|         "mime_types": { | ||||
|             "text/plain": ".txt", | ||||
|             "text/csv": ".csv", | ||||
|         } | ||||
|     } | ||||
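With mime_types now a dict, a consumer can resolve the default file extension
for a detected mime type directly. An illustrative lookup (the import path is
an assumption):

    from paperless_text.signals import text_consumer_declaration

    declaration = text_consumer_declaration(sender=None)
    assert declaration["mime_types"]["text/csv"] == ".csv"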
|   | ||||
| @@ -3,10 +3,9 @@ exclude = migrations, paperless/settings.py, .tox, */tests/* | ||||
|  | ||||
| [tool:pytest] | ||||
| DJANGO_SETTINGS_MODULE=paperless.settings | ||||
| addopts = --pythonwarnings=all | ||||
| addopts = --pythonwarnings=all --cov --cov-report=html -n auto | ||||
| env = | ||||
|   PAPERLESS_SECRET=paperless | ||||
|   PAPERLESS_EMAIL_SECRET=paperless | ||||
|   PAPERLESS_DISABLE_DBHANDLER=true | ||||
|  | ||||
|  | ||||
| [coverage:run] | ||||
|   | ||||
jonaswinkler