Merge branch 'dev'

2025-06-06 14:07:26 -05:00 · 2018-09-12 17:20:12 +02:00 · 2018-09-12 17:20:12 +02:00 · fb1a2ee577
commit fb1a2ee577
parent e72735c4f0 7c589f71a4
37 changed files with 56648 additions and 316 deletions
--- a/.gitignore
+++ b/.gitignore
@ -81,3 +81,8 @@ docker-compose.env
 scripts/import-for-development
 scripts/nuke

+# Static files collected by the collectstatic command
+static/
+
+# Classification Models
+models/
--- a/models/.keep
+++ b/models/.keep
--- a/paperless.conf.example
+++ b/paperless.conf.example
@ -3,6 +3,16 @@
 # As this file contains passwords it should only be readable by the user
 # running paperless.

+###############################################################################
+####                        Database Settings                              ####
+###############################################################################
+
+# By default, sqlite is used as the database backend. This can be changed here.
+#PAPERLESS_DBENGINE="django.db.backends.postgresql_psycopg2"
+#PAPERLESS_DBNAME="paperless"
+#PAPERLESS_DBUSER="paperless"
+#PAPERLESS_DBPASS="paperless"
+

 ###############################################################################
 ####                         Paths & Folders                               ####
@ -38,6 +48,13 @@ PAPERLESS_CONSUMPTION_DIR=""
 #PAPERLESS_STATIC_URL="/static/"


+# You can specify where the document classification model file should be
+# stored. Make sure that this file is writeable by the user executing the
+# management command "document_create_classifier" and that the path exists.
+# The default location is /models/model.pickle within the install folder.
+#PAPERLESS_MODEL_FILE=/path/to/model/file
+
+
 # These values are required if you want paperless to check a particular email
 # box every 10 minutes and attempt to consume documents from there.  If you
 # don't define a HOST, mail checking will just be disabled.
@ -59,6 +76,11 @@ PAPERLESS_EMAIL_SECRET=""
 ####                              Security                                 ####
 ###############################################################################

+# Controls whether Django's debug mode is enabled. Disable this on production
+# systems. Debug mode is enabled by default.
+PAPERLESS_DEBUG="false"
+
+
 # Paperless can be instructed to attempt to encrypt your PDF files with GPG
 # using the PAPERLESS_PASSPHRASE specified below.  If however you're not
 # concerned about encrypting these files (for example if you have disk
@ -203,3 +225,8 @@ PAPERLESS_EMAIL_SECRET=""
 # positive integer, but if you don't define one in paperless.conf, a default of
 # 100 will be used.
 #PAPERLESS_LIST_PER_PAGE=100
+
+
+# The number of years for which a correspondent will be included in the recent
+# correspondents filter.
+#PAPERLESS_RECENT_CORRESPONDENT_YEARS=1
--- a/requirements.txt
+++ b/requirements.txt
@ -45,6 +45,7 @@ pytz==2018.5
 regex==2018.8.29
 requests==2.19.1
 six==1.11.0
+scikit-learn==0.19.2
 termcolor==1.1.0
 text-unidecode==1.2
 tzlocal==1.5.1
--- a/src/documents/actions.py
+++ b/src/documents/actions.py
@ -0,0 +1,139 @@
+from django.contrib import messages
+from django.contrib.admin import helpers
+from django.contrib.admin.utils import model_ngettext
+from django.core.exceptions import PermissionDenied
+from django.template.response import TemplateResponse
+
+from documents.classifier import DocumentClassifier
+from documents.models import Tag, Correspondent, DocumentType
+
+
+def select_action(modeladmin, request, queryset, title, action, modelclass, success_message="", document_action=None, queryset_action=None):
+    """
+    Shared implementation for admin actions that need the user to pick an
+    object (of type ``modelclass``) before the action is applied.
+
+    Without a truthy "post" field in the POST data, an intermediate page
+    ("admin/<app_label>/<model_name>/select_object.html") is rendered that
+    lists all ``modelclass`` objects. With it, the object whose id is given in
+    the "obj_id" POST field is looked up and applied to the selection.
+
+    :param document_action: optional callable ``(document, selected_object)``
+        invoked once per selected document.
+    :param queryset_action: optional callable ``(queryset, selected_object)``
+        invoked once for the whole selection, after the per-document pass.
+    :param success_message: %-style template that receives the keys
+        ``selected_object``, ``count`` and ``items``.
+    :return: ``None`` to show the change list again, or a
+        ``TemplateResponse`` for the selection page.
+    :raises PermissionDenied: if the user lacks change permission.
+    """
+    opts = modeladmin.model._meta
+    app_label = opts.app_label
+
+    if not modeladmin.has_change_permission(request):
+        raise PermissionDenied
+
+    if request.POST.get('post'):
+        # A missing, stale or malformed obj_id must not crash the admin;
+        # report it to the user and go back to the change list instead.
+        try:
+            selected_object = modelclass.objects.get(id=request.POST.get('obj_id'))
+        except (modelclass.DoesNotExist, ValueError):
+            modeladmin.message_user(request, "The selected object does not exist.", messages.ERROR)
+            return None
+
+        n = queryset.count()
+        if n:
+            for document in queryset:
+                if document_action:
+                    document_action(document, selected_object)
+                document_display = str(document)
+                modeladmin.log_change(request, document, document_display)
+            if queryset_action:
+                queryset_action(queryset, selected_object)
+
+            modeladmin.message_user(request, success_message % {
+                "selected_object": selected_object.name, "count": n, "items": model_ngettext(modeladmin.opts, n)
+            }, messages.SUCCESS)
+
+        # Return None to display the change list page again.
+        return None
+
+    context = dict(
+        modeladmin.admin_site.each_context(request),
+        title=title,
+        queryset=queryset,
+        opts=opts,
+        action_checkbox_name=helpers.ACTION_CHECKBOX_NAME,
+        media=modeladmin.media,
+        action=action,
+        objects=modelclass.objects.all(),
+        itemname=model_ngettext(modelclass, 1)
+    )
+
+    request.current_app = modeladmin.admin_site.name
+
+    return TemplateResponse(
+        request,
+        "admin/%s/%s/select_object.html" % (app_label, opts.model_name),
+        context
+    )
+
+
+def simple_action(modeladmin, request, queryset, success_message="", document_action=None, queryset_action=None):
+    """
+    Shared implementation for admin actions that need no further user input.
+
+    ``document_action`` (if given) is called with each selected document and
+    ``queryset_action`` (if given) is called once with the whole selection.
+    Every document gets a change log entry, and ``success_message`` is shown
+    with the keys ``count`` and ``items`` filled in. Nothing happens for an
+    empty selection.
+
+    :return: always ``None``, so the change list is displayed again.
+    :raises PermissionDenied: if the user lacks change permission.
+    """
+    if not modeladmin.has_change_permission(request):
+        raise PermissionDenied
+
+    count = queryset.count()
+    if not count:
+        return None
+
+    for doc in queryset:
+        if document_action:
+            document_action(doc)
+        modeladmin.log_change(request, doc, str(doc))
+
+    if queryset_action:
+        queryset_action(queryset)
+
+    message_args = {"count": count, "items": model_ngettext(modeladmin.opts, count)}
+    modeladmin.message_user(request, success_message % message_args, messages.SUCCESS)
+
+    return None
+
+
+def add_tag_to_selected(modeladmin, request, queryset):
+    """Admin action: let the user pick a tag and add it to each selected document."""
+    def attach_tag(doc, tag):
+        doc.tags.add(tag)
+
+    return select_action(
+        modeladmin=modeladmin,
+        request=request,
+        queryset=queryset,
+        title="Add tag to multiple documents",
+        action="add_tag_to_selected",
+        modelclass=Tag,
+        success_message="Successfully added tag %(selected_object)s to %(count)d %(items)s.",
+        document_action=attach_tag,
+    )
+add_tag_to_selected.short_description = "Add tag to selected documents"
+
+
+def remove_tag_from_selected(modeladmin, request, queryset):
+    """Admin action: let the user pick a tag and remove it from each selected document."""
+    def detach_tag(doc, tag):
+        doc.tags.remove(tag)
+
+    return select_action(
+        modeladmin=modeladmin,
+        request=request,
+        queryset=queryset,
+        title="Remove tag from multiple documents",
+        action="remove_tag_from_selected",
+        modelclass=Tag,
+        success_message="Successfully removed tag %(selected_object)s from %(count)d %(items)s.",
+        document_action=detach_tag,
+    )
+remove_tag_from_selected.short_description = "Remove tag from selected documents"
+
+
+def set_correspondent_on_selected(modeladmin, request, queryset):
+    """Admin action: let the user pick a correspondent and set it on the whole selection."""
+    def assign_correspondent(selection, correspondent):
+        # A single bulk update for the whole selection.
+        selection.update(correspondent=correspondent)
+
+    return select_action(
+        modeladmin=modeladmin,
+        request=request,
+        queryset=queryset,
+        title="Set correspondent on multiple documents",
+        action="set_correspondent_on_selected",
+        modelclass=Correspondent,
+        success_message="Successfully set correspondent %(selected_object)s on %(count)d %(items)s.",
+        queryset_action=assign_correspondent,
+    )
+set_correspondent_on_selected.short_description = "Set correspondent on selected documents"
+
+
+def remove_correspondent_from_selected(modeladmin, request, queryset):
+    """Admin action: clear the correspondent on the whole selection."""
+    def clear_correspondent(selection):
+        # A single bulk update for the whole selection.
+        selection.update(correspondent=None)
+
+    return simple_action(
+        modeladmin=modeladmin,
+        request=request,
+        queryset=queryset,
+        success_message="Successfully removed correspondent from %(count)d %(items)s.",
+        queryset_action=clear_correspondent,
+    )
+remove_correspondent_from_selected.short_description = "Remove correspondent from selected documents"
+
+
+def set_document_type_on_selected(modeladmin, request, queryset):
+    """Admin action: let the user pick a document type and set it on the whole selection."""
+    def assign_document_type(selection, document_type):
+        # A single bulk update for the whole selection.
+        selection.update(document_type=document_type)
+
+    return select_action(
+        modeladmin=modeladmin,
+        request=request,
+        queryset=queryset,
+        title="Set document type on multiple documents",
+        action="set_document_type_on_selected",
+        modelclass=DocumentType,
+        success_message="Successfully set document type %(selected_object)s on %(count)d %(items)s.",
+        queryset_action=assign_document_type,
+    )
+set_document_type_on_selected.short_description = "Set document type on selected documents"
+
+
+def remove_document_type_from_selected(modeladmin, request, queryset):
+    """Admin action: clear the document type on the whole selection."""
+    def clear_document_type(selection):
+        # A single bulk update for the whole selection.
+        selection.update(document_type=None)
+
+    return simple_action(
+        modeladmin=modeladmin,
+        request=request,
+        queryset=queryset,
+        success_message="Successfully removed document type from %(count)d %(items)s.",
+        queryset_action=clear_document_type,
+    )
+remove_document_type_from_selected.short_description = "Remove document type from selected documents"
+
+
+def run_document_classifier_on_selected(modeladmin, request, queryset):
+    """
+    Admin action: run the stored document classifier on each selected
+    document, assigning correspondent, tags and document type.
+
+    If the classifier model file cannot be found, an error message is shown
+    and no document is touched.
+    """
+    # Only loading the model is guarded: a FileNotFoundError raised later,
+    # while documents are being processed, is a different problem and must
+    # not be reported as a missing model file.
+    try:
+        clf = DocumentClassifier.load_classifier()
+    except FileNotFoundError:
+        modeladmin.message_user(request, "Classifier model file not found.", messages.ERROR)
+        return None
+
+    def classify(doc):
+        clf.classify_document(doc, classify_correspondent=True, classify_tags=True, classify_document_type=True)
+
+    return simple_action(modeladmin=modeladmin, request=request, queryset=queryset,
+                         success_message="Successfully applied document classifier to %(count)d %(items)s.",
+                         document_action=classify)
+run_document_classifier_on_selected.short_description = "Run document classifier on selected"
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@ -1,42 +1,24 @@
-from datetime import datetime
+from datetime import datetime, timedelta

 from django.conf import settings
-from django.contrib import admin
+from django.contrib import admin, messages
+from django.contrib.admin.templatetags.admin_urls import add_preserved_filters
 from django.contrib.auth.models import User, Group
+from django.http import HttpResponseRedirect
 try:
    from django.core.urlresolvers import reverse
 except ImportError:
    from django.urls import reverse
 from django.templatetags.static import static
-from django.utils.safestring import mark_safe
 from django.utils.html import format_html, format_html_join
+from django.utils.http import urlquote
+from django.utils.safestring import mark_safe
+from django.db import models

-from .models import Correspondent, Tag, Document, Log
-
-
-class MonthListFilter(admin.SimpleListFilter):
-
-    title = "Month"
-
-    # Parameter for the filter that will be used in the URL query.
-    parameter_name = "month"
-
-    def lookups(self, request, model_admin):
-        r = []
-        for document in Document.objects.all():
-            r.append((
-                document.created.strftime("%Y-%m"),
-                document.created.strftime("%B %Y")
-            ))
-        return sorted(set(r), key=lambda x: x[0], reverse=True)
-
-    def queryset(self, request, queryset):
-
-        if not self.value():
-            return None
-
-        year, month = self.value().split("-")
-        return queryset.filter(created__year=year, created__month=month)
+from documents.actions import add_tag_to_selected, remove_tag_from_selected, set_correspondent_on_selected, \
+    remove_correspondent_from_selected, set_document_type_on_selected, remove_document_type_from_selected, \
+    run_document_classifier_on_selected
+from .models import Correspondent, Tag, Document, Log, DocumentType


 class FinancialYearFilter(admin.SimpleListFilter):
@ -104,48 +86,97 @@ class FinancialYearFilter(admin.SimpleListFilter):
                               created__lte=self._fy_end(end))


+class RecentCorrespondentFilter(admin.RelatedFieldListFilter):
+    """
+    Correspondent list filter that only offers correspondents having at least
+    one document created within the last
+    ``settings.PAPERLESS_RECENT_CORRESPONDENT_YEARS`` years.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.title = "correspondent (recent)"
+
+    def field_choices(self, field, request, model_admin):
+        """Return ``(id, name)`` pairs for the recently used correspondents."""
+        # Imported locally so this block does not depend on a module-level
+        # import. timezone.now() yields an aware datetime when time zone
+        # support is enabled and a naive one otherwise, so the comparison
+        # against the stored "created" values is consistent in both setups.
+        from django.utils import timezone
+
+        years = settings.PAPERLESS_RECENT_CORRESPONDENT_YEARS
+        # A year is approximated as 365 days.
+        date_limit = timezone.now() - timedelta(days=365 * years)
+        recent = Correspondent.objects.filter(documents__created__gte=date_limit).distinct()
+        return [(c.id, c.name) for c in recent]
+
+
 class CommonAdmin(admin.ModelAdmin):
    list_per_page = settings.PAPERLESS_LIST_PER_PAGE


 class CorrespondentAdmin(CommonAdmin):

-    list_display = ("name", "match", "matching_algorithm", "document_count")
-    list_filter = ("matching_algorithm",)
-    list_editable = ("match", "matching_algorithm")
+    list_display = ("name", "automatic_classification", "document_count", "last_correspondence")
+    list_editable = ("automatic_classification",)
+
+    def get_queryset(self, request):
+        qs = super(CorrespondentAdmin, self).get_queryset(request)
+        qs = qs.annotate(document_count=models.Count("documents"), last_correspondence=models.Max("documents__created"))
+        return qs

    def document_count(self, obj):
-        return obj.documents.count()
+        return obj.document_count
+    document_count.admin_order_field = "document_count"
+
+    def last_correspondence(self, obj):
+        return obj.last_correspondence
+    last_correspondence.admin_order_field = "last_correspondence"


 class TagAdmin(CommonAdmin):

-    list_display = ("name", "colour", "match", "matching_algorithm",
-                    "document_count")
-    list_filter = ("colour", "matching_algorithm")
-    list_editable = ("colour", "match", "matching_algorithm")
+    list_display = ("name", "colour", "automatic_classification", "document_count")
+    list_filter = ("colour",)
+    list_editable = ("colour", "automatic_classification")
+
+    def get_queryset(self, request):
+        qs = super(TagAdmin, self).get_queryset(request)
+        qs = qs.annotate(document_count=models.Count("documents"))
+        return qs

    def document_count(self, obj):
-        return obj.documents.count()
+        return obj.document_count
+    document_count.admin_order_field = "document_count"


+class DocumentTypeAdmin(CommonAdmin):
+    """Admin for document types, listing each type with its document count."""
+
+    list_display = ("name", "automatic_classification", "document_count")
+    list_editable = ("automatic_classification",)
+
+    def get_queryset(self, request):
+        """Annotate each document type with the number of its documents."""
+        base = super().get_queryset(request)
+        return base.annotate(document_count=models.Count("documents"))
+
+    def document_count(self, obj):
+        """Expose the annotated count as a list column."""
+        return obj.document_count
+    document_count.admin_order_field = "document_count"
+
 class DocumentAdmin(CommonAdmin):

    class Media:
        css = {
            "all": ("paperless.css",)
+
        }

    search_fields = ("correspondent__name", "title", "content", "tags__name")
    readonly_fields = ("added",)
    list_display = ("title", "created", "added", "thumbnail", "correspondent",
-                    "tags_")
-    list_filter = ("tags", "correspondent", FinancialYearFilter,
-                   MonthListFilter)
+                    "tags_", "archive_serial_number", "document_type")
+    list_filter = ("document_type", "tags", ('correspondent', RecentCorrespondentFilter), "correspondent", FinancialYearFilter)
+
    filter_horizontal = ("tags",)

    ordering = ["-created", "correspondent"]

+    actions = [add_tag_to_selected, remove_tag_from_selected, set_correspondent_on_selected, remove_correspondent_from_selected, set_document_type_on_selected, remove_document_type_from_selected, run_document_classifier_on_selected]
+
+    date_hierarchy = 'created'
+
+    document_queue = None
+
    def has_add_permission(self, request):
        return False

@ -153,6 +184,57 @@ class DocumentAdmin(CommonAdmin):
        return obj.created.date().strftime("%Y-%m-%d")
    created_.short_description = "Created"

+    def changelist_view(self, request, extra_context=None):
+        """
+        Render the change list and, for GET requests, remember the ids of the
+        listed documents in ``self.document_queue``.
+        """
+        # NOTE(review): the queue is stored on the admin object itself;
+        # confirm it cannot leak between users or concurrent requests.
+        response = super().changelist_view(request, extra_context)
+
+        if request.method != 'GET':
+            return response
+
+        changelist = self.get_changelist_instance(request)
+        self.document_queue = [entry.id for entry in changelist.queryset]
+
+        return response
+
+    def change_view(self, request, object_id=None, form_url='', extra_context=None):
+        """
+        Render the change form with extra template context: the document's
+        ``download_url`` and ``file_type`` and, when the document is part of
+        the remembered queue and is not its last entry, the id of the next
+        document as ``next_object``.
+
+        An unknown or malformed ``object_id`` adds no extra context and is
+        left to the parent implementation to handle.
+        """
+        extra_context = extra_context or {}
+
+        # Do not let the lookup crash the view; the parent change_view has
+        # its own handling for ids that do not resolve to an object.
+        try:
+            doc = Document.objects.get(id=object_id)
+        except (Document.DoesNotExist, ValueError, TypeError):
+            doc = None
+
+        if doc is not None:
+            extra_context['download_url'] = doc.download_url
+            extra_context['file_type'] = doc.file_type
+
+            queue = self.document_queue
+            if queue and doc.id in queue:
+                # There is a queue of documents
+                current_index = queue.index(doc.id)
+                if current_index < len(queue) - 1:
+                    # ... and there are still documents in the queue
+                    extra_context['next_object'] = queue[current_index + 1]
+
+        return super(DocumentAdmin, self).change_view(
+            request, object_id, form_url, extra_context=extra_context,
+        )
+
+    def response_change(self, request, obj):
+        """
+        Handle the "save and edit next" button: when the POST data contains
+        "_saveandeditnext" together with a "_next_object" id, show a success
+        message and redirect to that object's change page, keeping the
+        preserved list filters. Every other case is delegated to the parent
+        implementation.
+        """
+        next_object = request.POST.get('_next_object')
+
+        # Without a target id there is nothing to redirect to; fall back to
+        # the default behaviour instead of failing on the missing key.
+        if "_saveandeditnext" not in request.POST or not next_object:
+            return super().response_change(request, obj)
+
+        # This is mostly copied from ModelAdmin.response_change()
+        opts = self.model._meta
+        preserved_filters = self.get_preserved_filters(request)
+
+        msg_dict = {
+            'name': opts.verbose_name,
+            'obj': format_html('<a href="{}">{}</a>', urlquote(request.path), obj),
+        }
+        msg = format_html(
+            'The {name} "{obj}" was changed successfully. Editing next object.',
+            **msg_dict
+        )
+        self.message_user(request, msg, messages.SUCCESS)
+        redirect_url = reverse('admin:%s_%s_change' %
+                               (opts.app_label, opts.model_name),
+                               args=(next_object,),
+                               current_app=self.admin_site.name)
+        redirect_url = add_preserved_filters({'preserved_filters': preserved_filters, 'opts': opts}, redirect_url)
+        return HttpResponseRedirect(redirect_url)
+
    @mark_safe
    def thumbnail(self, obj):
        return self._html_tag(
@ -221,6 +303,7 @@ class LogAdmin(CommonAdmin):

 admin.site.register(Correspondent, CorrespondentAdmin)
 admin.site.register(Tag, TagAdmin)
+admin.site.register(DocumentType, DocumentTypeAdmin)
 admin.site.register(Document, DocumentAdmin)
 admin.site.register(Log, LogAdmin)

--- a/src/documents/apps.py
+++ b/src/documents/apps.py
@ -11,8 +11,8 @@ class DocumentsConfig(AppConfig):
        from .signals import document_consumption_started
        from .signals import document_consumption_finished
        from .signals.handlers import (
-            set_correspondent,
-            set_tags,
+            classify_document,
+            add_inbox_tags,
            run_pre_consume_script,
            run_post_consume_script,
            cleanup_document_deletion,
@ -21,8 +21,8 @@ class DocumentsConfig(AppConfig):

        document_consumption_started.connect(run_pre_consume_script)

-        document_consumption_finished.connect(set_tags)
-        document_consumption_finished.connect(set_correspondent)
+        document_consumption_finished.connect(classify_document)
+        document_consumption_finished.connect(add_inbox_tags)
        document_consumption_finished.connect(set_log_entry)
        document_consumption_finished.connect(run_post_consume_script)

--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@ -0,0 +1,167 @@
+import logging
+import os
+import pickle
+
+from sklearn.neural_network import MLPClassifier
+
+from documents.models import Correspondent, DocumentType, Tag, Document
+from paperless import settings
+
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
+
+
+def preprocess_content(content):
+    """
+    Normalise document text: lower-case it, strip surrounding whitespace,
+    turn line breaks into spaces and collapse runs of spaces into one.
+    """
+    text = content.lower().strip()
+    for line_break in ("\n", "\r"):
+        text = text.replace(line_break, " ")
+    while "  " in text:
+        text = text.replace("  ", " ")
+    return text
+
+
+class DocumentClassifier(object):
+    """
+    Text classifier that predicts tags, correspondent and document type of a
+    document from its content.
+
+    The fitted vectorizer, label binarizers and classifiers are stored in and
+    loaded from the pickle file at ``settings.MODEL_FILE``.
+    """
+
+    # Modification time of the model file at the last load; None until loaded.
+    classifier_version = None
+
+    # Vectorizer that turns preprocessed content into feature vectors.
+    data_vectorizer = None
+
+    # Binarizers mapping database ids to label vectors and back.
+    tags_binarizer = None
+    correspondent_binarizer = None
+    document_type_binarizer = None
+
+    # One classifier per prediction target; None when not trained.
+    tags_classifier = None
+    correspondent_classifier = None
+    document_type_classifier = None
+
+    @staticmethod
+    def load_classifier():
+        """Create a classifier and load its state from the model file."""
+        clf = DocumentClassifier()
+        clf.reload()
+        return clf
+
+    def reload(self):
+        """
+        Load the model file if nothing has been loaded yet or the file has
+        been modified since the last load.
+        """
+        if self.classifier_version is None or os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
+            logging.getLogger(__name__).info("Reloading classifier models")
+            # NOTE(review): unpickling can execute arbitrary code, so the model
+            # file must only be writable by trusted users.
+            with open(settings.MODEL_FILE, "rb") as f:
+                # The read order must mirror the write order in save_classifier().
+                self.data_vectorizer = pickle.load(f)
+                self.tags_binarizer = pickle.load(f)
+                self.correspondent_binarizer = pickle.load(f)
+                self.document_type_binarizer = pickle.load(f)
+
+                self.tags_classifier = pickle.load(f)
+                self.correspondent_classifier = pickle.load(f)
+                self.document_type_classifier = pickle.load(f)
+            self.classifier_version = os.path.getmtime(settings.MODEL_FILE)
+
+    def save_classifier(self):
+        """Write vectorizer, binarizers and classifiers to the model file."""
+        with open(settings.MODEL_FILE, "wb") as f:
+            # The write order must mirror the read order in reload().
+            pickle.dump(self.data_vectorizer, f)
+
+            pickle.dump(self.tags_binarizer, f)
+            pickle.dump(self.correspondent_binarizer, f)
+            pickle.dump(self.document_type_binarizer, f)
+
+            pickle.dump(self.tags_classifier, f)
+            pickle.dump(self.correspondent_classifier, f)
+            pickle.dump(self.document_type_classifier, f)
+
+    def train(self):
+        """
+        Fit the vectorizer, binarizers and classifiers on all documents that
+        do not carry an inbox tag.
+        """
+        data = list()
+        labels_tags = list()
+        labels_correspondent = list()
+        labels_document_type = list()
+
+        # Step 1: Extract and preprocess training data from the database.
+        logging.getLogger(__name__).info("Gathering data from database...")
+        for doc in Document.objects.exclude(tags__is_inbox_tag=True):
+            data.append(preprocess_content(doc.content))
+            # -1 is the label for "none", also used when the assigned object
+            # has automatic classification disabled.
+            labels_document_type.append(doc.document_type.id if doc.document_type is not None and doc.document_type.automatic_classification else -1)
+            labels_correspondent.append(doc.correspondent.id if doc.correspondent is not None and doc.correspondent.automatic_classification else -1)
+            tags = [tag.id for tag in doc.tags.filter(automatic_classification=True)]
+            labels_tags.append(tags)
+
+        labels_tags_unique = set([tag for tags in labels_tags for tag in tags])
+        logging.getLogger(__name__).info("{} documents, {} tag(s), {} correspondent(s), {} document type(s).".format(len(data), len(labels_tags_unique), len(set(labels_correspondent)), len(set(labels_document_type))))
+
+        # Step 2: vectorize data
+        logging.getLogger(__name__).info("Vectorizing data...")
+        # Features are counts of character n-grams, 3 to 5 characters long.
+        self.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 5), min_df=0.1)
+        data_vectorized = self.data_vectorizer.fit_transform(data)
+
+        self.tags_binarizer = MultiLabelBinarizer()
+        labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags)
+
+        self.correspondent_binarizer = LabelBinarizer()
+        labels_correspondent_vectorized = self.correspondent_binarizer.fit_transform(labels_correspondent)
+
+        self.document_type_binarizer = LabelBinarizer()
+        labels_document_type_vectorized = self.document_type_binarizer.fit_transform(labels_document_type)
+
+        # Step 3: train the classifiers
+        # A classifier is only trained when its binarizer saw at least one class.
+        if len(self.tags_binarizer.classes_) > 0:
+            logging.getLogger(__name__).info("Training tags classifier...")
+            self.tags_classifier = MLPClassifier(verbose=True)
+            self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
+        else:
+            self.tags_classifier = None
+            logging.getLogger(__name__).info("There are no tags. Not training tags classifier.")
+
+        if len(self.correspondent_binarizer.classes_) > 0:
+            logging.getLogger(__name__).info("Training correspondent classifier...")
+            self.correspondent_classifier = MLPClassifier(verbose=True)
+            self.correspondent_classifier.fit(data_vectorized, labels_correspondent_vectorized)
+        else:
+            self.correspondent_classifier = None
+            logging.getLogger(__name__).info("There are no correspondents. Not training correspondent classifier.")
+
+        if len(self.document_type_binarizer.classes_) > 0:
+            logging.getLogger(__name__).info("Training document type classifier...")
+            self.document_type_classifier = MLPClassifier(verbose=True)
+            self.document_type_classifier.fit(data_vectorized, labels_document_type_vectorized)
+        else:
+            self.document_type_classifier = None
+            logging.getLogger(__name__).info("There are no document types. Not training document type classifier.")
+
+    def classify_document(self, document, classify_correspondent=False, classify_document_type=False, classify_tags=False, replace_tags=False):
+        """
+        Predict and assign the requested properties of ``document``.
+
+        Each ``classify_*`` flag enables one prediction, which is skipped when
+        the matching classifier is None. With ``replace_tags`` the existing
+        tags are cleared before the predicted ones are added. Predicted ids
+        that no longer exist in the database are logged and ignored. The
+        document is saved with only the changed fields in ``update_fields``.
+        """
+        X = self.data_vectorizer.transform([preprocess_content(document.content)])
+
+        # Names of the model fields that were changed and need to be saved.
+        update_fields=()
+
+        if classify_correspondent and self.correspondent_classifier is not None:
+            y_correspondent = self.correspondent_classifier.predict(X)
+            correspondent_id = self.correspondent_binarizer.inverse_transform(y_correspondent)[0]
+            try:
+                # An id of -1 means "no correspondent".
+                correspondent = Correspondent.objects.get(id=correspondent_id) if correspondent_id != -1 else None
+                logging.getLogger(__name__).info("Detected correspondent: {}".format(correspondent.name if correspondent else "-"))
+                document.correspondent = correspondent
+                update_fields = update_fields + ("correspondent",)
+            except Correspondent.DoesNotExist:
+                logging.getLogger(__name__).warning("Detected correspondent with id {} does not exist anymore! Did you delete it?".format(correspondent_id))
+
+        if classify_document_type and self.document_type_classifier is not None:
+            y_type = self.document_type_classifier.predict(X)
+            type_id = self.document_type_binarizer.inverse_transform(y_type)[0]
+            try:
+                # An id of -1 means "no document type".
+                document_type = DocumentType.objects.get(id=type_id) if type_id != -1 else None
+                logging.getLogger(__name__).info("Detected document type: {}".format(document_type.name if document_type else "-"))
+                document.document_type = document_type
+                update_fields = update_fields + ("document_type",)
+            except DocumentType.DoesNotExist:
+                logging.getLogger(__name__).warning("Detected document type with id {} does not exist anymore! Did you delete it?".format(type_id))
+
+        if classify_tags and self.tags_classifier is not None:
+            y_tags = self.tags_classifier.predict(X)
+            tags_ids = self.tags_binarizer.inverse_transform(y_tags)[0]
+            if replace_tags:
+                document.tags.clear()
+            for tag_id in tags_ids:
+                try:
+                    tag = Tag.objects.get(id=tag_id)
+                    document.tags.add(tag)
+                    logging.getLogger(__name__).info("Detected tag: {}".format(tag.name))
+                except Tag.DoesNotExist:
+                    logging.getLogger(__name__).warning("Detected tag with id {} does not exist anymore! Did you delete it?".format(tag_id))
+
+        document.save(update_fields=update_fields)
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@ -221,12 +221,6 @@ class Consumer:
                storage_type=self.storage_type
            )

-        relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
-        if relevant_tags:
-            tag_names = ", ".join([t.slug for t in relevant_tags])
-            self.log("debug", "Tagging with {}".format(tag_names))
-            document.tags.add(*relevant_tags)
-
        self._write(document, doc, document.source_path)
        self._write(document, thumbnail, document.thumbnail_path)

--- a/src/documents/filters.py
+++ b/src/documents/filters.py
@ -1,6 +1,6 @@
 from django_filters.rest_framework import CharFilter, FilterSet, BooleanFilter

-from .models import Correspondent, Document, Tag
+from .models import Correspondent, Document, Tag, DocumentType


 class CorrespondentFilterSet(FilterSet):
@ -29,6 +29,19 @@ class TagFilterSet(FilterSet):
        }


+class DocumentTypeFilterSet(FilterSet):
+    """Filter set for document types, filterable by name and slug."""
+
+    class Meta(object):
+        model = DocumentType
+        fields = dict(
+            name=[
+                "startswith", "endswith", "contains",
+                "istartswith", "iendswith", "icontains",
+            ],
+            slug=["istartswith", "iendswith", "icontains"],
+        )
+
+
 class DocumentFilterSet(FilterSet):

    CHAR_KWARGS = {
@ -52,6 +65,10 @@ class DocumentFilterSet(FilterSet):
        field_name="tags__slug", **CHAR_KWARGS)
    tags__empty = BooleanFilter(
        field_name="tags", lookup_expr="isnull", distinct=True)
+    # Use the "field_name" keyword, as the other filters of this class do;
+    # django-filter 2.0 renamed the "name" argument to "field_name".
+    document_type__name = CharFilter(
+        field_name="document_type__name", **CHAR_KWARGS)
+    document_type__slug = CharFilter(
+        field_name="document_type__slug", **CHAR_KWARGS)

    class Meta:
        model = Document
--- a/src/documents/management/commands/document_correspondents.py
+++ b/src/documents/management/commands/document_correspondents.py
@ -1,82 +0,0 @@
-import sys
-
-from django.core.management.base import BaseCommand
-
-from documents.models import Correspondent, Document
-
-from ...mixins import Renderable
-
-
-class Command(Renderable, BaseCommand):
-
-    help = """
-        Using the current set of correspondent rules, apply said rules to all
-        documents in the database, effectively allowing you to back-tag all
-        previously indexed documents with correspondent created (or modified)
-        after their initial import.
-    """.replace("    ", "")
-
-    TOO_MANY_CONTINUE = (
-        "Detected {} potential correspondents for {}, so we've opted for {}")
-    TOO_MANY_SKIP = (
-        "Detected {} potential correspondents for {}, so we're skipping it")
-    CHANGE_MESSAGE = (
-        'Document {}: "{}" was given the correspondent id {}: "{}"')
-
-    def __init__(self, *args, **kwargs):
-        self.verbosity = 0
-        BaseCommand.__init__(self, *args, **kwargs)
-
-    def add_arguments(self, parser):
-        parser.add_argument(
-            "--use-first",
-            default=False,
-            action="store_true",
-            help="By default this command won't try to assign a correspondent "
-                 "if more than one matches the document.  Use this flag if "
-                 "you'd rather it just pick the first one it finds."
-        )
-
-    def handle(self, *args, **options):
-
-        self.verbosity = options["verbosity"]
-
-        for document in Document.objects.filter(correspondent__isnull=True):
-
-            potential_correspondents = list(
-                Correspondent.match_all(document.content))
-
-            if not potential_correspondents:
-                continue
-
-            potential_count = len(potential_correspondents)
-            correspondent = potential_correspondents[0]
-
-            if potential_count > 1:
-                if not options["use_first"]:
-                    print(
-                        self.TOO_MANY_SKIP.format(potential_count, document),
-                        file=sys.stderr
-                    )
-                    continue
-                print(
-                    self.TOO_MANY_CONTINUE.format(
-                        potential_count,
-                        document,
-                        correspondent
-                    ),
-                    file=sys.stderr
-                )
-
-            document.correspondent = correspondent
-            document.save(update_fields=("correspondent",))
-
-            print(
-                self.CHANGE_MESSAGE.format(
-                    document.pk,
-                    document.title,
-                    correspondent.pk,
-                    correspondent.name
-                ),
-                file=sys.stderr
-            )
--- a/src/documents/management/commands/document_create_classifier.py
+++ b/src/documents/management/commands/document_create_classifier.py
@ -0,0 +1,27 @@
+import logging
+import os.path
+import pickle
+
+from django.core.management.base import BaseCommand
+from documents.classifier import  DocumentClassifier
+from paperless import settings
+from ...mixins import Renderable
+
+
+class Command(Renderable, BaseCommand):
+    """Management command that trains the document classifier and saves it."""
+
+    help = """
+        Trains the document classifier and stores the resulting model in the
+        file configured as PAPERLESS_MODEL_FILE.
+    """.replace("    ", "")
+
+    def __init__(self, *args, **kwargs):
+        BaseCommand.__init__(self, *args, **kwargs)
+
+    def handle(self, *args, **options):
+        # Train first, then persist; the log line names the destination file.
+        clf = DocumentClassifier()
+        clf.train()
+        # Lazy %-formatting also works when MODEL_FILE is not a plain str.
+        logging.getLogger(__name__).info("Saving models to %s...", settings.MODEL_FILE)
+        clf.save_classifier()
--- a/src/documents/management/commands/document_create_dataset.py
+++ b/src/documents/management/commands/document_create_dataset.py
@ -0,0 +1,40 @@
+from django.core.management.base import BaseCommand
+
+from documents.classifier import preprocess_content
+from documents.models import Document
+from ...mixins import Renderable
+
+
+class Command(Renderable, BaseCommand):
+    """Export plain-text data sets for tags, document types and correspondents."""
+
+    help = """
+        Writes dataset_tags.txt, dataset_types.txt and
+        dataset_correspondents.txt to the current working directory, one
+        "labels;content" line per document that carries no inbox tag.
+    """.replace("    ", "")
+
+    def __init__(self, *args, **kwargs):
+        BaseCommand.__init__(self, *args, **kwargs)
+
+    @staticmethod
+    def _label(obj):
+        # "-" stands for "no automatically classified value on this document".
+        if obj is not None and obj.automatic_classification:
+            return obj.name
+        return "-"
+
+    @staticmethod
+    def _write_dataset(filename, label_for):
+        # Explicit UTF-8 keeps the export independent of the system locale.
+        with open(filename, "w", encoding="utf-8") as f:
+            for doc in Document.objects.exclude(tags__is_inbox_tag=True):
+                f.write(label_for(doc) + ";" + preprocess_content(doc.content) + "\n")
+
+    def handle(self, *args, **options):
+        def tag_labels(doc):
+            # Comma-separated names of the document's auto-classified tags.
+            tags = doc.tags.filter(automatic_classification=True)
+            return ",".join(tag.name for tag in tags)
+
+        self._write_dataset("dataset_tags.txt", tag_labels)
+        self._write_dataset("dataset_types.txt", lambda doc: self._label(doc.document_type))
+        self._write_dataset("dataset_correspondents.txt", lambda doc: self._label(doc.correspondent))
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@ -6,7 +6,7 @@ import shutil
 from django.core.management.base import BaseCommand, CommandError
 from django.core import serializers

-from documents.models import Document, Correspondent, Tag
+from documents.models import Document, Correspondent, Tag, DocumentType
 from paperless.db import GnuPG

 from ...mixins import Renderable
@ -91,6 +91,9 @@ class Command(Renderable, BaseCommand):
        manifest += json.loads(serializers.serialize(
            "json", Tag.objects.all()))

+        manifest += json.loads(serializers.serialize(
+            "json", DocumentType.objects.all()))
+
        with open(os.path.join(self.target, "manifest.json"), "w") as f:
            json.dump(manifest, f, indent=2)

--- a/src/documents/management/commands/document_retagger.py
+++ b/src/documents/management/commands/document_retagger.py
@ -1,5 +1,8 @@
+import logging
+
 from django.core.management.base import BaseCommand

+from documents.classifier import DocumentClassifier
 from documents.models import Document, Tag

 from ...mixins import Renderable
@ -8,25 +11,51 @@ from ...mixins import Renderable
 class Command(Renderable, BaseCommand):

    help = """
-        Using the current set of tagging rules, apply said rules to all
-        documents in the database, effectively allowing you to back-tag all
-        previously indexed documents with tags created (or modified) after
-        their initial import.
+        There is no help. #TODO
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

+    def add_arguments(self, parser):
+        # All options are boolean switches (default False) read by handle().
+        parser.add_argument(
+            "-c", "--correspondent",
+            action="store_true",
+            help="Run correspondent classification on the selected documents."
+        )
+        parser.add_argument(
+            "-T", "--tags",
+            action="store_true",
+            help="Run tag classification on the selected documents."
+        )
+        parser.add_argument(
+            "-t", "--type",
+            action="store_true",
+            help="Run document type classification on the selected documents."
+        )
+        parser.add_argument(
+            "-i", "--inbox-only",
+            action="store_true",
+            help="Only process documents that carry an inbox tag."
+        )
+        parser.add_argument(
+            "-r", "--replace-tags",
+            action="store_true",
+            help="Passed to the classifier as its replace_tags option."
+        )
+
    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]

-        for document in Document.objects.all():
+        # Documents carrying an archive tag are always left out; --inbox-only
+        # additionally restricts the run to documents with an inbox tag.
+        if options['inbox_only']:
+            documents = Document.objects.filter(tags__is_inbox_tag=True).exclude(tags__is_archived_tag=True).distinct()
+        else:
+            documents = Document.objects.all().exclude(tags__is_archived_tag=True).distinct()

-            tags = Tag.objects.exclude(
-                pk__in=document.tags.values_list("pk", flat=True))
+        logger = logging.getLogger(__name__)
+        logger.info("Loading classifier")
+        try:
+            clf = DocumentClassifier.load_classifier()
+        except FileNotFoundError:
+            # Without a model file there is nothing to apply: report and stop.
+            logger.critical("Cannot classify documents, classifier model file was not found.")
+            return

-            for tag in Tag.match_all(document.content, tags):
-                print('Tagging {} with "{}"'.format(document, tag))
-                document.tags.add(tag)
+        for document in documents:
+            logger.info("Processing document %s", document.title)
+            clf.classify_document(
+                document,
+                classify_document_type=options['type'],
+                classify_tags=options['tags'],
+                classify_correspondent=options['correspondent'],
+                replace_tags=options['replace_tags']
+            )
--- a/src/documents/migrations/0011_auto_20160303_1929.py
+++ b/src/documents/migrations/0011_auto_20160303_1929.py
--- a/src/documents/migrations/0022_workflow_improvements.py
+++ b/src/documents/migrations/0022_workflow_improvements.py
@ -0,0 +1,28 @@
+# Generated by Django 2.0.7 on 2018-07-12 09:52
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Adds Document.archive_serial_number and the two Tag flags
+    # is_archived_tag and is_inbox_tag.
+
+    dependencies = [
+        ('documents', '0021_document_storage_type'),
+    ]
+
+    operations = [
+        # Optional, unique and indexed serial number on documents.
+        migrations.AddField(
+            model_name='document',
+            name='archive_serial_number',
+            field=models.IntegerField(blank=True, db_index=True, help_text='The position of this document in your physical document archive.', null=True, unique=True),
+        ),
+        # Both tag flags default to False, so existing tags are unaffected.
+        migrations.AddField(
+            model_name='tag',
+            name='is_archived_tag',
+            field=models.BooleanField(default=False, help_text='Marks this tag as an archive tag: All documents tagged with archive tags will never be modified automatically (i.e., modifying tags by matching rules)'),
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='is_inbox_tag',
+            field=models.BooleanField(default=False, help_text='Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.'),
+        ),
+    ]
--- a/src/documents/migrations/0023_auto_20180823_1155.py
+++ b/src/documents/migrations/0023_auto_20180823_1155.py
@ -0,0 +1,33 @@
+# Generated by Django 2.0.7 on 2018-08-23 11:55
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+    # Creates the DocumentType model and links Document to it through a
+    # nullable foreign key.
+
+    dependencies = [
+        ('documents', '0022_workflow_improvements'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='DocumentType',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('name', models.CharField(max_length=128, unique=True)),
+                ('slug', models.SlugField(blank=True)),
+                ('match', models.CharField(blank=True, max_length=256)),
+                ('matching_algorithm', models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF.  Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided.  A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF.  (If you don\'t know what a regex is, you probably don\'t want this option.)  Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.')),
+                ('is_insensitive', models.BooleanField(default=True)),
+            ],
+            options={
+                'abstract': False,
+            },
+        ),
+        # SET_NULL: deleting a document type keeps its documents and only
+        # clears their document_type reference.
+        migrations.AddField(
+            model_name='document',
+            name='document_type',
+            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='documents', to='documents.DocumentType'),
+        ),
+    ]
--- a/src/documents/migrations/0024_auto_20180904_1425.py
+++ b/src/documents/migrations/0024_auto_20180904_1425.py
@ -0,0 +1,77 @@
+# Generated by Django 2.0.8 on 2018-09-04 14:25
+
+from django.db import migrations, models
+
+
+def transfer_automatic_classification(apps, schema_editor):
+    # For every tag, correspondent and document type, set
+    # automatic_classification to True exactly when "match" is non-empty.
+    for name in ("Tag", "Correspondent", "DocumentType"):
+        historical_model = apps.get_model("documents", name)
+        for instance in historical_model.objects.all():
+            instance.automatic_classification = bool(instance.match)
+            instance.save()
+
+
+def reverse_automatic_classification(apps, schema_editor):
+    # Intentionally a no-op: it only exists so that the RunPython step
+    # below has a reverse function.
+    pass
+
+
+class Migration(migrations.Migration):
+    # Replaces the match / matching_algorithm / is_insensitive fields of
+    # Correspondent, DocumentType and Tag with a single
+    # automatic_classification flag.
+
+    dependencies = [
+        ('documents', '0023_auto_20180823_1155'),
+    ]
+
+    operations = [
+        # 1. Add the new flag (default False) to all three models.
+        migrations.AddField(
+            model_name='correspondent',
+            name='automatic_classification',
+            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
+        ),
+        migrations.AddField(
+            model_name='documenttype',
+            name='automatic_classification',
+            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='automatic_classification',
+            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
+        ),
+        # 2. Derive the flag from "match" while that field still exists.
+        migrations.RunPython(transfer_automatic_classification, reverse_automatic_classification),
+        # 3. Drop the old matching fields from all three models.
+        migrations.RemoveField(
+            model_name='correspondent',
+            name='is_insensitive',
+        ),
+        migrations.RemoveField(
+            model_name='correspondent',
+            name='match',
+        ),
+        migrations.RemoveField(
+            model_name='correspondent',
+            name='matching_algorithm',
+        ),
+        migrations.RemoveField(
+            model_name='documenttype',
+            name='is_insensitive',
+        ),
+        migrations.RemoveField(
+            model_name='documenttype',
+            name='match',
+        ),
+        migrations.RemoveField(
+            model_name='documenttype',
+            name='matching_algorithm',
+        ),
+        migrations.RemoveField(
+            model_name='tag',
+            name='is_insensitive',
+        ),
+        migrations.RemoveField(
+            model_name='tag',
+            name='match',
+        ),
+        migrations.RemoveField(
+            model_name='tag',
+            name='matching_algorithm',
+        ),
+    ]
--- a/src/documents/models.py
+++ b/src/documents/models.py
@ -23,42 +23,10 @@ except ImportError:

 class MatchingModel(models.Model):

-    MATCH_ANY = 1
-    MATCH_ALL = 2
-    MATCH_LITERAL = 3
-    MATCH_REGEX = 4
-    MATCH_FUZZY = 5
-    MATCHING_ALGORITHMS = (
-        (MATCH_ANY, "Any"),
-        (MATCH_ALL, "All"),
-        (MATCH_LITERAL, "Literal"),
-        (MATCH_REGEX, "Regular Expression"),
-        (MATCH_FUZZY, "Fuzzy Match"),
-    )
-
    name = models.CharField(max_length=128, unique=True)
    slug = models.SlugField(blank=True)

-    match = models.CharField(max_length=256, blank=True)
-    matching_algorithm = models.PositiveIntegerField(
-        choices=MATCHING_ALGORITHMS,
-        default=MATCH_ANY,
-        help_text=(
-            "Which algorithm you want to use when matching text to the OCR'd "
-            "PDF.  Here, \"any\" looks for any occurrence of any word "
-            "provided in the PDF, while \"all\" requires that every word "
-            "provided appear in the PDF, albeit not in the order provided.  A "
-            "\"literal\" match means that the text you enter must appear in "
-            "the PDF exactly as you've entered it, and \"regular expression\" "
-            "uses a regex to match the PDF.  (If you don't know what a regex "
-            "is, you probably don't want this option.)  Finally, a \"fuzzy "
-            "match\" looks for words or phrases that are mostly—but not "
-            "exactly—the same, which can be useful for matching against "
-            "documents containg imperfections that foil accurate OCR."
-        )
-    )
-
-    is_insensitive = models.BooleanField(default=True)
+    automatic_classification = models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.')

    class Meta:
        abstract = True
@ -67,87 +35,8 @@ class MatchingModel(models.Model):
    def __str__(self):
        return self.name

-    @property
-    def conditions(self):
-        return "{}: \"{}\" ({})".format(
-            self.name, self.match, self.get_matching_algorithm_display())
-
-    @classmethod
-    def match_all(cls, text, tags=None):
-
-        if tags is None:
-            tags = cls.objects.all()
-
-        text = text.lower()
-        for tag in tags:
-            if tag.matches(text):
-                yield tag
-
-    def matches(self, text):
-
-        search_kwargs = {}
-
-        # Check that match is not empty
-        if self.match.strip() == "":
-            return False
-
-        if self.is_insensitive:
-            search_kwargs = {"flags": re.IGNORECASE}
-
-        if self.matching_algorithm == self.MATCH_ALL:
-            for word in self._split_match():
-                search_result = re.search(
-                    r"\b{}\b".format(word), text, **search_kwargs)
-                if not search_result:
-                    return False
-            return True
-
-        if self.matching_algorithm == self.MATCH_ANY:
-            for word in self._split_match():
-                if re.search(r"\b{}\b".format(word), text, **search_kwargs):
-                    return True
-            return False
-
-        if self.matching_algorithm == self.MATCH_LITERAL:
-            return bool(re.search(
-                r"\b{}\b".format(self.match), text, **search_kwargs))
-
-        if self.matching_algorithm == self.MATCH_REGEX:
-            return bool(re.search(
-                re.compile(self.match, **search_kwargs), text))
-
-        if self.matching_algorithm == self.MATCH_FUZZY:
-            match = re.sub(r'[^\w\s]', '', self.match)
-            text = re.sub(r'[^\w\s]', '', text)
-            if self.is_insensitive:
-                match = match.lower()
-                text = text.lower()
-
-            return True if fuzz.partial_ratio(match, text) >= 90 else False
-
-        raise NotImplementedError("Unsupported matching algorithm")
-
-    def _split_match(self):
-        """
-        Splits the match to individual keywords, getting rid of unnecessary
-        spaces and grouping quoted words together.
-
-        Example:
-          '  some random  words "with   quotes  " and   spaces'
-            ==>
-          ["some", "random", "words", "with+quotes", "and", "spaces"]
-        """
-        findterms = re.compile(r'"([^"]+)"|(\S+)').findall
-        normspace = re.compile(r"\s+").sub
-        return [
-            normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
-            for t in findterms(self.match)
-        ]
-
    def save(self, *args, **kwargs):

-        self.match = self.match.lower()
-
        if not self.slug:
            self.slug = slugify(self.name)

@ -184,6 +73,19 @@ class Tag(MatchingModel):

    colour = models.PositiveIntegerField(choices=COLOURS, default=1)

+    is_inbox_tag = models.BooleanField(
+        default=False,
+        help_text="Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.")
+
+    is_archived_tag = models.BooleanField(
+        default=False,
+        help_text="Marks this tag as an archive tag: All documents tagged with archive tags will never be modified automatically (i.e., modifying tags by matching rules)")
+
+
+class DocumentType(MatchingModel):
+    # Adds no fields of its own; everything comes from MatchingModel.
+
+    pass
+

 class Document(models.Model):

@ -215,6 +117,14 @@ class Document(models.Model):

    title = models.CharField(max_length=128, blank=True, db_index=True)

+    document_type = models.ForeignKey(
+        DocumentType,
+        blank=True,
+        null=True,
+        related_name="documents",
+        on_delete=models.SET_NULL
+    )
+
    content = models.TextField(
        db_index=True,
        blank=True,
@ -255,6 +165,13 @@ class Document(models.Model):
    added = models.DateTimeField(
        default=timezone.now, editable=False, db_index=True)

+    archive_serial_number = models.IntegerField(
+        blank=True,
+        null=True,
+        unique=True,
+        db_index=True,
+        help_text="The position of this document in your physical document archive.")
+
    class Meta:
        ordering = ("correspondent", "title")

--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@ -1,13 +1,20 @@
 from rest_framework import serializers

-from .models import Correspondent, Tag, Document, Log
+from .models import Correspondent, Tag, Document, Log, DocumentType


 class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):

    class Meta:
        model = Correspondent
-        fields = ("id", "slug", "name")
+        fields = ("id", "slug", "name", "automatic_classification")
+
+
+class DocumentTypeSerializer(serializers.HyperlinkedModelSerializer):
+    """Serialises a DocumentType as id, slug, name and automatic_classification."""
+
+    class Meta:
+        model = DocumentType
+        fields = ("id", "slug", "name", "automatic_classification")


 class TagSerializer(serializers.HyperlinkedModelSerializer):
@ -15,7 +22,7 @@ class TagSerializer(serializers.HyperlinkedModelSerializer):
    class Meta:
        model = Tag
        fields = (
-            "id", "slug", "name", "colour", "match", "matching_algorithm")
+            "id", "slug", "name", "colour", "automatic_classification")


 class CorrespondentField(serializers.HyperlinkedRelatedField):
@ -28,17 +35,25 @@ class TagsField(serializers.HyperlinkedRelatedField):
        return Tag.objects.all()


+class DocumentTypeField(serializers.HyperlinkedRelatedField):
+    # Related field whose candidate objects are all DocumentType rows.
+    def get_queryset(self):
+        return DocumentType.objects.all()
+
+
 class DocumentSerializer(serializers.ModelSerializer):

    correspondent = CorrespondentField(
        view_name="drf:correspondent-detail", allow_null=True)
    tags = TagsField(view_name="drf:tag-detail", many=True)
+    document_type = DocumentTypeField(
+        view_name="drf:documenttype-detail", allow_null=True)

    class Meta:
        model = Document
        fields = (
            "id",
            "correspondent",
+            "document_type",
            "title",
            "content",
            "file_type",
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@ -8,57 +8,29 @@ from django.contrib.auth.models import User
 from django.contrib.contenttypes.models import ContentType
 from django.utils import timezone

-from ..models import Correspondent, Document, Tag
+from documents.classifier import DocumentClassifier
+from ..models import Document, Tag


 def logger(message, group):
    logging.getLogger(__name__).debug(message, extra={"group": group})


-def set_correspondent(sender, document=None, logging_group=None, **kwargs):
-
-    # No sense in assigning a correspondent when one is already set.
-    if document.correspondent:
-        return
-
-    # No matching correspondents, so no need to continue
-    potential_correspondents = list(Correspondent.match_all(document.content))
-    if not potential_correspondents:
-        return
-
-    potential_count = len(potential_correspondents)
-    selected = potential_correspondents[0]
-    if potential_count > 1:
-        message = "Detected {} potential correspondents, so we've opted for {}"
-        logger(
-            message.format(potential_count, selected),
-            logging_group
-        )
-
-    logger(
-        'Assigning correspondent "{}" to "{}" '.format(selected, document),
-        logging_group
-    )
-
-    document.correspondent = selected
-    document.save(update_fields=("correspondent",))
+classifier = DocumentClassifier()


-def set_tags(sender, document=None, logging_group=None, **kwargs):
+def classify_document(sender, document=None, logging_group=None, **kwargs):
+    """
+    Reload the module-level classifier and let it assign correspondent,
+    tags and document type to ``document``.
+
+    A FileNotFoundError (no classifier model file) is logged and swallowed.
+    """
+    # ``classifier`` is only read here, so no ``global`` statement is needed.
+    try:
+        classifier.reload()
+        classifier.classify_document(
+            document,
+            classify_correspondent=True,
+            classify_tags=True,
+            classify_document_type=True
+        )
+    except FileNotFoundError:
+        logging.getLogger(__name__).critical(
+            "Cannot classify document, classifier model file was not found.")

-    current_tags = set(document.tags.all())
-    relevant_tags = set(Tag.match_all(document.content)) - current_tags

-    if not relevant_tags:
-        return
-
-    message = 'Tagging "{}" with "{}"'
-    logger(
-        message.format(document, ", ".join([t.slug for t in relevant_tags])),
-        logging_group
-    )
-
-    document.tags.add(*relevant_tags)
+def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
+    """Attach every tag flagged as an inbox tag to ``document``."""
+    document.tags.add(*Tag.objects.filter(is_inbox_tag=True))


 def run_pre_consume_script(sender, filename, **kwargs):
--- a/src/documents/static/documents/js/pdf.js
+++ b/src/documents/static/documents/js/pdf.js
--- a/src/documents/static/documents/js/pdf.js.map
+++ b/src/documents/static/documents/js/pdf.js.map
--- a/src/documents/static/documents/js/pdf.worker.js
+++ b/src/documents/static/documents/js/pdf.worker.js
--- a/src/documents/static/documents/js/pdf.worker.js.map
+++ b/src/documents/static/documents/js/pdf.worker.js.map
--- a/src/documents/static/paperless.css
+++ b/src/documents/static/paperless.css
@ -21,3 +21,16 @@ td a.tag {
  width: 90%;
  height: 5em;
 }
+
+/* Two-column layout of the document change form: a flex container with
+   the form column first and the viewer column second. */
+#change_form_twocolumn_parent {
+  display: flex;
+}
+/* Form column. */
+#change_form_form_parent {
+  flex:50%;
+  margin-right: 10px;
+}
+/* Viewer column; its content is centred. */
+#change_form_viewer_parent {
+  flex:50%;
+  margin-left: 10px;
+  text-align: center;
+}
--- a/src/documents/templates/admin/documents/document/change_form.html
+++ b/src/documents/templates/admin/documents/document/change_form.html
@ -1,5 +1,41 @@
 {% extends 'admin/change_form.html' %}

+{% block content %}
+
+{{ block.super }}
+
+{# "in" on a string is a substring test, so this matches pdf, jpg and png. #}
+{% if file_type in "pdf jpg png" %}
+
+	<div id="change_form_twocolumn_parent">
+		<div id="change_form_form_parent"></div>
+		<div id="change_form_viewer_parent">
+			{% if file_type == "pdf" %}
+				{% include "admin/documents/document/viewers/viewer_pdf.html" %}
+			{% endif %}
+			{% if file_type in "jpg png" %}
+				{% include "admin/documents/document/viewers/viewer_image.html" %}
+			{% endif %}
+		</div>
+	</div>
+
+	{# Move the admin form into the first column, then move the whole two-column wrapper into #content-main. #}
+	<script>
+		django.jQuery("#change_form_form_parent").append(django.jQuery("#document_form"));
+		django.jQuery("#content-main").append(django.jQuery("#change_form_twocolumn_parent"));
+	</script>
+
+	{# When next_object is set, add a "Save and edit next" button and a hidden field carrying its value. #}
+	{% if next_object %}
+		<script type="text/javascript">//<![CDATA[
+			(function($){
+				$('<input type="submit" value="Save and edit next" name="_saveandeditnext" />')
+				.prependTo('div.submit-row');
+				$('<input type="hidden" value="{{next_object}}" name="_next_object" />')
+				.prependTo('div.submit-row');
+			})(django.jQuery);
+		//]]></script>
+	{% endif %}
+{% endif %}
+
+{% endblock content %}

 {% block footer %}

--- a/src/documents/templates/admin/documents/document/change_list_results.html
+++ b/src/documents/templates/admin/documents/document/change_list_results.html
@ -24,7 +24,8 @@
    border: 1px solid #cccccc;
    border-radius: 2%;
    overflow: hidden;
-    height: 300px;
+    height: 350px;
+    position: relative;
  }
  .result .header {
    padding: 5px;
@ -60,6 +61,11 @@
  .result a.tag {
    color: #ffffff;
  }
+  .result .documentType {
+    padding: 5px;
+    background-color: #eeeeee;
+    text-align: center;
+  }
  .result .date {
    padding: 5px;
  }
@ -79,6 +85,15 @@
  .result .image img {
    width: 100%;
  }
+  .result .footer {
+    position: absolute;
+    bottom: 0;
+    right: 0;
+    border-left: 1px solid #cccccc;
+    border-top: 1px solid #cccccc;
+    padding: 4px 10px 4px 10px;
+    background: white;
+  }

  .grid {
    margin-right: 260px;
@ -152,7 +167,9 @@
    {# 4: Image #}
    {# 5: Correspondent #}
    {# 6: Tags #}
-    {# 7: Document edit url #}
+    {# 7: Archive serial number #}
+    {# 8: Document type #}
+    {# 9: Document edit url #}
    <div class="box">
      <div class="result">
        <div class="header">
@ -166,7 +183,7 @@
            selection would not be possible with mouse click + drag. Instead,
            the underlying link would be dragged.
          {% endcomment %}
-          <div class="headerLink" onclick="location.href='{{ result.7 }}';"></div>
+          <div class="headerLink" onclick="location.href='{{ result.9 }}';"></div>
          <div class="checkbox">{{ result.0 }}</div>
          <div class="info">
            {{ result.5 }}
@ -174,10 +191,14 @@
          {{ result.1 }}
          <div style="clear: both;"></div>
        </div>
+        {% if '>-<' not in result.8 %}<div class="documentType">{{ result.8 }}</div>{% endif %}
        <div class="tags">{{ result.6 }}</div>
        <div class="date">{{ result.2 }}</div>
        <div style="clear: both;"></div>
        <div class="image">{{ result.4 }}</div>
+        {# Only show the archive serial number if it is set on the document. #}
+        {# checking for >-< (i.e., will a dash be displayed) doesn't feel like a very good solution to me. #}
+        {% if '>-<' not in result.7 %}<div class="footer">#{{ result.7 }}</div>{% endif %}
      </div>
    </div>
  {% endfor %}
--- a/src/documents/templates/admin/documents/document/select_object.html
+++ b/src/documents/templates/admin/documents/document/select_object.html
@ -0,0 +1,46 @@
+{% extends "admin/base_site.html" %}
+{% load i18n l10n admin_urls static %}
+{% load staticfiles %}
+
+{% block extrahead %}
+{{ block.super }}
+{{ media }}
+<script type="text/javascript" src="{% static 'admin/js/cancel.js' %}"></script>
+
+{% endblock %}
+
+{% block bodyclass %}{{ block.super }} app-{{ opts.app_label }} model-{{ opts.model_name }} delete-confirmation delete-selected-confirmation{% endblock %}
+
+{% block breadcrumbs %}
+<div class="breadcrumbs">
+    <a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
+    &rsaquo; <a href="{% url 'admin:app_list' app_label=opts.app_label %}">{{ opts.app_config.verbose_name }}</a>
+    &rsaquo; <a href="{% url opts|admin_urlname:'changelist' %}">{{ opts.verbose_name_plural|capfirst }}</a>
+    &rsaquo; {{title}}
+</div>
+{% endblock %}
+
+{% block content %}
+{# Asks the user to pick one object, then posts the pick with the current selection and the action name. #}
+<p>Please select the {{itemname}}.</p>
+<form method="post">{% csrf_token %}
+    <div>
+        {# Carry the selected items over as hidden inputs. #}
+        {% for obj in queryset %}
+        <input type="hidden" name="{{ action_checkbox_name }}" value="{{ obj.pk|unlocalize }}"/>
+        {% endfor %}
+        <p>
+            <select name="obj_id">
+                {% for obj in objects %}
+                <option value="{{obj.id}}">{{obj.name}}</option>
+                {% endfor %}
+            </select>
+        </p>
+
+        <input type="hidden" name="action" value="{{action}}"/>
+        <input type="hidden" name="post" value="yes"/>
+        <p>
+            {# No leading space in the msgid: it would show up in the label and in the translation lookup. #}
+            <input type="submit" value="{% trans "Confirm" %}" />
+            <a href="#" class="button cancel-link">{% trans "Go back" %}</a>
+        </p>
+    </div>
+</form>
+{% endblock %}
--- a/src/documents/templates/admin/documents/document/viewers/viewer_image.html
+++ b/src/documents/templates/admin/documents/document/viewers/viewer_image.html
@ -0,0 +1,2 @@
+<img src="{{download_url}}" style="max-width: 100%">
+
--- a/src/documents/templates/admin/documents/document/viewers/viewer_pdf.html
+++ b/src/documents/templates/admin/documents/document/viewers/viewer_pdf.html
@ -0,0 +1,119 @@
+{% load static %}
+
+<div>
+    <input id="prev" value="Previous" class="default" type="button">
+    <input id="next" value="Next" class="default" type="button">
+  &nbsp; &nbsp;
+  <span>Page: <span id="page_num"></span> / <span id="page_count"></span></span>
+  &nbsp; &nbsp;
+    <input id="zoomin" value="+" class="default" type="button">
+    <input id="zoomout" value="-" class="default" type="button">
+</div>
+
+<div style="width: 100%; overflow: auto;">
+<canvas id="the-canvas"></canvas>
+    </div>
+<script type="text/javascript" src="{% static 'documents/js/pdf.js' %}"></script>
+<script type="text/javascript" src="{% static 'documents/js/pdf.worker.js' %}"></script>
+	{# Load and display PDF document#}
+	<script>
+var pdfjsLib = window['pdfjs-dist/build/pdf'];
+
+var pdfDoc = null,
+      pageNum = 1,
+      pageRendering = false,
+      pageNumPending = null,
+      scale = 1.0,
+      canvas = document.getElementById('the-canvas'),
+      ctx = canvas.getContext('2d');
+
+/**
+   * Get page info from document, resize canvas accordingly, and render page.
+   * @param num Page number.
+   */
+  function renderPage(num) {
+    pageRendering = true;
+    // Using promise to fetch the page
+    pdfDoc.getPage(num).then(function(page) {
+      var viewport = page.getViewport(scale);
+      canvas.height = viewport.height;
+      canvas.width = viewport.width;
+      // Render PDF page into canvas context
+      var renderContext = {
+        canvasContext: ctx,
+        viewport: viewport
+      };
+      var renderTask = page.render(renderContext);
+      // Wait for rendering to finish
+      renderTask.promise.then(function () {
+        pageRendering = false;
+        if (pageNumPending !== null) {
+          // New page rendering is pending
+          renderPage(pageNumPending);
+          pageNumPending = null;
+        }
+      });
+    });
+    // Update page counters
+    document.getElementById('page_num').textContent = num;
+  }
+  /**
+   * If another page rendering in progress, waits until the rendering is
+   * finished. Otherwise, executes rendering immediately.
+   */
+  function queueRenderPage(num) {
+    if (pageRendering) {
+      pageNumPending = num;
+    } else {
+      renderPage(num);
+    }
+  }
+  /**
+   * Displays previous page.
+   */
+  function onPrevPage() {
+    if (pageNum <= 1) {
+      return;
+    }
+    pageNum--;
+    queueRenderPage(pageNum);
+  }
+  document.getElementById('prev').addEventListener('click', onPrevPage);
+  /**
+   * Displays next page.
+   */
+  function onNextPage() {
+    if (pageNum >= pdfDoc.numPages) {
+      return;
+    }
+    pageNum++;
+    queueRenderPage(pageNum);
+  }
+  document.getElementById('next').addEventListener('click', onNextPage);
+  /**
+   * Zooms in by a factor of 1.2 and re-renders the current page.
+   */
+  function onZoomIn() {
+    scale *= 1.2;
+    queueRenderPage(pageNum);
+  }
+  document.getElementById('zoomin').addEventListener('click', onZoomIn);
+  /**
+   * Zooms out by a factor of 1.2 and re-renders the current page.
+   */
+  function onZoomOut() {
+    scale /= 1.2;
+    queueRenderPage(pageNum);
+  }
+  document.getElementById('zoomout').addEventListener('click', onZoomOut);
+  /**
+   * Asynchronously downloads PDF.
+   */
+  pdfjsLib.getDocument("{{download_url}}").then(function (pdfDoc_) {
+    pdfDoc = pdfDoc_;
+    document.getElementById('page_count').textContent = pdfDoc.numPages;
+    // Initial/first page rendering
+    renderPage(pageNum);
+  });
+	</script>
+
--- a/src/documents/views.py
+++ b/src/documents/views.py
@ -20,15 +20,15 @@ from rest_framework.viewsets import (
    ReadOnlyModelViewSet
 )

-from .filters import CorrespondentFilterSet, DocumentFilterSet, TagFilterSet
+from .filters import CorrespondentFilterSet, DocumentFilterSet, TagFilterSet, DocumentTypeFilterSet
 from .forms import UploadForm
-from .models import Correspondent, Document, Log, Tag
+from .models import Correspondent, Document, Log, Tag, DocumentType
 from .serialisers import (
    CorrespondentSerializer,
    DocumentSerializer,
    LogSerializer,
-    TagSerializer
-)
+    TagSerializer,
+    DocumentTypeSerializer)


 class IndexView(TemplateView):
@ -116,6 +116,17 @@ class TagViewSet(ModelViewSet):
    ordering_fields = ("name", "slug")


+class DocumentTypeViewSet(ModelViewSet):
+    model = DocumentType
+    queryset = DocumentType.objects.all()
+    serializer_class = DocumentTypeSerializer
+    pagination_class = StandardPagination
+    permission_classes = (IsAuthenticated,)
+    filter_backends = (DjangoFilterBackend, OrderingFilter)
+    filter_class = DocumentTypeFilterSet
+    ordering_fields = ("name", "slug")
+
+
 class DocumentViewSet(RetrieveModelMixin,
                      UpdateModelMixin,
                      DestroyModelMixin,
--- a/src/manage.py
+++ b/src/manage.py
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@ -22,12 +22,12 @@ elif os.path.exists("/usr/local/etc/paperless.conf"):
    load_dotenv("/usr/local/etc/paperless.conf")


-def __get_boolean(key):
+def __get_boolean(key, default="NO"):
    """
    Return a boolean value based on whatever the user has supplied in the
    environment based on whether the value "looks like" it's True or not.
    """
-    return bool(os.getenv(key, "NO").lower() in ("yes", "y", "1", "t", "true"))
+    return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))


 # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
@ -47,7 +47,7 @@ SECRET_KEY = os.getenv(


 # SECURITY WARNING: don't run with debug turned on in production!
-DEBUG = True
+DEBUG = __get_boolean("PAPERLESS_DEBUG", "YES")

 LOGIN_URL = "admin:login"

@ -81,7 +81,7 @@ INSTALLED_APPS = [

    "rest_framework",
    "crispy_forms",
-    "django_filters",
+    "django_filters"

 ]

@ -144,9 +144,9 @@ DATABASES = {
    }
 }

-if os.getenv("PAPERLESS_DBUSER") and os.getenv("PAPERLESS_DBPASS"):
+if os.getenv("PAPERLESS_DBENGINE"):
    DATABASES["default"] = {
-        "ENGINE": "django.db.backends.postgresql_psycopg2",
+        "ENGINE": os.getenv("PAPERLESS_DBENGINE"),
        "NAME": os.getenv("PAPERLESS_DBNAME", "paperless"),
        "USER": os.getenv("PAPERLESS_DBUSER"),
        "PASSWORD": os.getenv("PAPERLESS_DBPASS")
@ -198,6 +198,11 @@ STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", "/static/")
 MEDIA_URL = os.getenv("PAPERLESS_MEDIA_URL", "/media/")


+# Document classification models location
+MODEL_FILE = os.getenv(
+    "PAPERLESS_MODEL_FILE", os.path.join(BASE_DIR, "..", "models", "model.pickle"))
+
+
 # Paperless-specific stuff
 # You shouldn't have to edit any of these values.  Rather, you can set these
 # values in /etc/paperless.conf instead.
@ -292,3 +297,5 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")

 # Specify the default date order (for autodetected dates)
 DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
+
+PAPERLESS_RECENT_CORRESPONDENT_YEARS = int(os.getenv("PAPERLESS_RECENT_CORRESPONDENT_YEARS", 1))
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@ -12,12 +12,13 @@ from documents.views import (
    FetchView,
    LogViewSet,
    PushView,
-    TagViewSet
-)
+    TagViewSet,
+    DocumentTypeViewSet)
 from reminders.views import ReminderViewSet

 router = DefaultRouter()
 router.register(r"correspondents", CorrespondentViewSet)
+router.register(r"document_types", DocumentTypeViewSet)
 router.register(r"documents", DocumentViewSet)
 router.register(r"logs", LogViewSet)
 router.register(r"reminders", ReminderViewSet)
--- a/src/reminders/models.py
+++ b/src/reminders/models.py
				`@ -0,0 +1,2 @@`
				`<img src="{{download_url}}" style="max-width: 100%">`