From cebc44f2c98653a7977d8c675fd791934564a44e Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Tue, 16 Feb 2016 09:28:34 +0000
Subject: [PATCH 01/71] API is halfway there

---
 src/documents/admin.py       |  5 ++++-
 src/documents/serialisers.py | 38 ++++++++++++++++++++++++++++++++++++
 src/documents/views.py       | 20 ++++++++++++++++++-
 src/paperless/settings.py    |  2 ++
 src/paperless/urls.py        | 16 ++++++++++++---
 5 files changed, 76 insertions(+), 5 deletions(-)
 create mode 100644 src/documents/serialisers.py

diff --git a/src/documents/admin.py b/src/documents/admin.py
index 635b9ddf8..d3bdd3ba4 100644
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -46,10 +46,13 @@ class DocumentAdmin(admin.ModelAdmin):
         }
 
     search_fields = ("sender__name", "title", "content")
-    list_display = ("created", "sender", "title", "tags_", "document")
+    list_display = ("created_", "sender", "title", "tags_", "document")
     list_filter = ("tags", "sender", MonthListFilter)
     list_per_page = 25
 
+    def created_(self, obj):
+        return obj.created.date().strftime("%Y-%m-%d")
+
     def tags_(self, obj):
         r = ""
         for tag in obj.tags.all():
diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py
new file mode 100644
index 000000000..209c778a1
--- /dev/null
+++ b/src/documents/serialisers.py
@@ -0,0 +1,38 @@
+from rest_framework import serializers
+
+from .models import Sender, Tag, Document
+
+
+class SenderSerializer(serializers.ModelSerializer):
+
+    class Meta(object):
+        model = Sender
+        fields = ("id", "slug", "name")
+
+
+class TagSerializer(serializers.ModelSerializer):
+
+    class Meta(object):
+        model = Tag
+        fields = ("id", "slug", "name", "colour", "match", "matching_algorithm")
+
+
+class DocumentSerializer(serializers.ModelSerializer):
+    """Serialise a Document, linking to its sender and tags by URL."""
+
+    # A bare HyperlinkedModelSerializer has no Meta and cannot act as a field;
+    # hyperlinked relations are declared with HyperlinkedRelatedField instead.
+    sender = serializers.HyperlinkedRelatedField(
+        read_only=True, view_name="sender-detail")
+    tags = serializers.HyperlinkedRelatedField(
+        read_only=True, view_name="tag-detail", many=True)
+
+    class Meta(object):
+        model = Document
+        fields = (
+            "id", "sender",
+            "title", "content",
+            "file_type", "tags",
+            "created", "modified",
+            "file_name", "download_url",
+        )
diff --git a/src/documents/views.py b/src/documents/views.py
index c92b6af09..45caf50e9 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -3,10 +3,13 @@ from django.template.defaultfilters import slugify
 from django.views.decorators.csrf import csrf_exempt
 from django.views.generic import FormView, DetailView
 
+from rest_framework.viewsets import ModelViewSet
+
 from paperless.db import GnuPG
 
-from .models import Document
 from .forms import UploadForm
+from .models import Sender, Tag, Document
+from .serialisers import SenderSerializer, TagSerializer, DocumentSerializer
 
 
 class PdfView(DetailView):
@@ -52,3 +55,21 @@ class PushView(FormView):
 
     def form_invalid(self, form):
         return HttpResponse("0")
+
+
+class SenderViewSet(ModelViewSet):
+    """CRUD API endpoints for ``Sender`` objects."""
+    queryset = Sender.objects.all()
+    serializer_class = SenderSerializer
+
+
+class TagViewSet(ModelViewSet):
+    """CRUD API endpoints for ``Tag`` objects."""
+    queryset = Tag.objects.all()
+    serializer_class = TagSerializer
+
+
+class DocumentViewSet(ModelViewSet):
+    """CRUD API endpoints for ``Document`` objects."""
+    queryset = Document.objects.all()
+    serializer_class = DocumentSerializer
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 444989990..d31879110 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -44,6 +44,8 @@ INSTALLED_APPS = [
     "documents",
     "logger",
 
+    "rest_framework",
+
 ]
 
 MIDDLEWARE_CLASSES = [
diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index 060953676..d8a48995d 100644
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -15,14 +15,24 @@ Including another URLconf
     3. Add a URL to urlpatterns:  url(r'^blog/', include(blog_urls))
 """
 from django.conf import settings
-from django.conf.urls import url, static
+from django.conf.urls import url, static, include
 from django.contrib import admin
 
-from documents.views import PdfView, PushView
+from rest_framework.routers import DefaultRouter
+
+from documents.views import (
+    PdfView, PushView, SenderViewSet, TagViewSet, DocumentViewSet)
+
+router = DefaultRouter()
+router.register(r'senders', SenderViewSet)
+router.register(r'tags', TagViewSet)
+router.register(r'documents', DocumentViewSet)
 
 urlpatterns = [
+    url(r"^api/auth/", include('rest_framework.urls', namespace='rest_framework')),
+    url(r"^api/", include(router.urls)),
     url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"),
-    url(r'', admin.site.urls),
+    url(r"", admin.site.urls),
 ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
 
 if settings.UPLOAD_SHARED_SECRET:

From eb01bcf98b168d160fb667a7f53b2119c4c143bb Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Wed, 17 Feb 2016 23:06:35 +0000
Subject: [PATCH 02/71] The Log class needed a __str__() method

---
 src/logger/models.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/logger/models.py b/src/logger/models.py
index 48774c199..f7f2c421a 100644
--- a/src/logger/models.py
+++ b/src/logger/models.py
@@ -29,6 +29,9 @@ class Log(models.Model):
     class Meta(object):
         ordering = ("-time",)
 
+    def __str__(self):
+        return self.message
+
     @classmethod
     def error(cls, message, component):
         cls.objects.create(

From 1e7ece81ee7afe44342c3b649687550fde702e15 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Wed, 17 Feb 2016 23:07:54 +0000
Subject: [PATCH 03/71] Fixes #45

---
 src/documents/consumer.py                            |  6 +-----
 .../management/commands/document_retagger.py         |  9 +++++----
 src/documents/models.py                              | 12 ++++++++++++
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 5ca42813b..98fedde09 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -273,11 +273,7 @@ class Consumer(object):
     def _store(self, text, doc):
 
         sender, title, tags, file_type = self._guess_attributes_from_name(doc)
-        tags = list(tags)
-
-        lower_text = text.lower()
-        relevant_tags = set(
-            [t for t in Tag.objects.all() if t.matches(lower_text)] + tags)
+        relevant_tags = set(list(Tag.match_all(text)) + list(tags))
 
         stats = os.stat(doc)
 
diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py
index d7519f53b..09a3fb917 100644
--- a/src/documents/management/commands/document_retagger.py
+++ b/src/documents/management/commands/document_retagger.py
@@ -23,9 +23,10 @@ class Command(Renderable, BaseCommand):
         self.verbosity = options["verbosity"]
 
         for document in Document.objects.all():
+
             tags = Tag.objects.exclude(
                 pk__in=document.tags.values_list("pk", flat=True))
-            for tag in tags:
-                if tag.matches(document.content):
-                    print('Tagging {} with "{}"'.format(document, tag))
-                    document.tags.add(tag)
+
+            for tag in Tag.match_all(document.content, tags):
+                print('Tagging {} with "{}"'.format(document, tag))
+                document.tags.add(tag)
diff --git a/src/documents/models.py b/src/documents/models.py
index 447beaa66..03758eff5 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -86,7 +86,19 @@ class Tag(SluggedModel):
         return "{}: \"{}\" ({})".format(
             self.name, self.match, self.get_matching_algorithm_display())
 
+    @classmethod
+    def match_all(cls, text, tags=None):
+
+        if tags is None:
+            tags = cls.objects.all()
+
+        text = text.lower()
+        for tag in tags:
+            if tag.matches(text):
+                yield tag
+
     def matches(self, text):
+
         # Check that match is not empty
         if self.match.strip() == "":
             return False

From c34d57a872859e8f6799dceb41022b043490c6bd Mon Sep 17 00:00:00 2001
From: Pit Kleyersburg <pitkley@googlemail.com>
Date: Thu, 18 Feb 2016 09:37:13 +0100
Subject: [PATCH 04/71] Detect image orientation if the OCR supports it

Fixes issue #47.
---
 src/documents/consumer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 98fedde09..12761e992 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -28,6 +28,9 @@ from .languages import ISO639
 def image_to_string(args):
     self, png, lang = args
     with Image.open(os.path.join(self.SCRATCH, png)) as f:
+        if self.OCR.can_detect_orientation():
+            orientation = self.OCR.detect_orientation(f, lang=lang)
+            f = f.rotate(orientation["angle"], expand=1)
         return self.OCR.image_to_string(f, lang=lang)
 
 

From 724afa59c75853bf71e735650133e4d414558dfa Mon Sep 17 00:00:00 2001
From: Pit Kleyersburg <pitkley@googlemail.com>
Date: Wed, 17 Feb 2016 18:45:04 +0100
Subject: [PATCH 05/71] Add Dockerfile for application and documentation

This commit adds a `Dockerfile` to the root of the project, accompanied
by a `docker-compose.yml.example` for simplified deployment. The
`Dockerfile` is agnostic to whether it will be the webserver, the
consumer, or if it is run for a one-off command (i.e. creation of a
superuser, migration of the database, document export, ...).

The container's entrypoint is the `scripts/docker-entrypoint.sh` script.
This script verifies that the required permissions are set, remaps the
default user and/or group id if required and installs additional
languages if the user wishes to.

After initialization, it analyzes the command the user supplied:

  - If the command starts with a slash, it is expected that the user
    wants to execute a binary file and the command will be executed
    without further intervention. (Using `exec` to effectively replace
    the started shell-script and not have any reaping-issues.)

  - If the command does not start with a slash, the command will be
    passed directly to the `manage.py` script without further
    modification. (Again using `exec`.)

The default command is set to `--help`.

If the user wants to execute a command that is not meant for `manage.py`
but doesn't start with a slash, the Docker `--entrypoint` parameter can
be used to circumvent the mechanics of `docker-entrypoint.sh`.

Further information can be found in `docs/setup.rst` and in
`docs/migrating.rst`.

For additional convenience, a `Dockerfile` has been added to the `docs/`
directory which allows for easy building and serving of the
documentation. This is documented in `docs/requirements.rst`.
---
 .gitignore                                    |   1 +
 Dockerfile                                    |  43 +++++
 docker-compose.env                            |  15 ++
 docker-compose.yml.example                    |  31 ++++
 docs/Dockerfile                               |  18 ++
 docs/migrating.rst                            |  95 ++++++++++
 docs/requirements.rst                         |  13 ++
 docs/setup.rst                                | 167 +++++++++++++++++-
 scripts/docker-entrypoint.sh                  |  74 ++++++++
 .../management/commands/loaddata_stdin.py     |  23 +++
 10 files changed, 474 insertions(+), 6 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 docker-compose.env
 create mode 100644 docker-compose.yml.example
 create mode 100644 docs/Dockerfile
 create mode 100644 scripts/docker-entrypoint.sh
 create mode 100644 src/documents/management/commands/loaddata_stdin.py

diff --git a/.gitignore b/.gitignore
index 908fa9748..2c65f8dcd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -68,6 +68,7 @@ db.sqlite3
 # Other stuff that doesn't belong
 virtualenv
 .vagrant
+docker-compose.yml
 
 # Used for development
 scripts/import-for-development
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 000000000..dade863ca
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,43 @@
+FROM python:3.5.1
+MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
+
+# Install dependencies
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        sudo \
+        tesseract-ocr tesseract-ocr-eng imagemagick ghostscript \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install python dependencies
+RUN mkdir -p /usr/src/paperless
+WORKDIR /usr/src/paperless
+COPY requirements.txt /usr/src/paperless/
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application
+RUN mkdir -p /usr/src/paperless/src
+COPY src/ /usr/src/paperless/src/
+
+# Set consumption directory
+ENV PAPERLESS_CONSUME /consume
+RUN mkdir -p $PAPERLESS_CONSUME
+
+# Migrate database
+WORKDIR /usr/src/paperless/src
+RUN mkdir /usr/src/paperless/data
+RUN ./manage.py migrate
+
+# Create user
+RUN groupadd -g 1000 paperless \
+    && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
+    && chown -Rh paperless:paperless /usr/src/paperless
+
+# Setup entrypoint
+COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
+RUN chmod 755 /sbin/docker-entrypoint.sh
+
+# Mount volumes
+VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume"]
+
+ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
+CMD ["--help"]
diff --git a/docker-compose.env b/docker-compose.env
new file mode 100644
index 000000000..13c74b6ab
--- /dev/null
+++ b/docker-compose.env
@@ -0,0 +1,15 @@
+# Environment variables to set for Paperless
+# Commented out variables will be replaced by a default within Paperless.
+
+# Passphrase Paperless uses to encrypt and decrypt your documents
+PAPERLESS_PASSPHRASE=CHANGE_ME
+
+# The number of threads to use for text recognition
+# PAPERLESS_OCR_THREADS=4
+
+# Additional languages to install for text recognition
+# PAPERLESS_OCR_LANGUAGES=deu ita
+
+# You can change the default user and group id to a custom one
+# USERMAP_UID=1000
+# USERMAP_GID=1000
diff --git a/docker-compose.yml.example b/docker-compose.yml.example
new file mode 100644
index 000000000..f8e9b5b93
--- /dev/null
+++ b/docker-compose.yml.example
@@ -0,0 +1,31 @@
+version: '2'
+
+services:
+    webserver:
+        image: paperless
+        ports:
+            # You can adapt the port you want Paperless to listen on by
+            # modifying the part before the `:`.
+            - "8000:8000"
+        volumes:
+            - paperless-data:/usr/src/paperless/data
+            - paperless-media:/usr/src/paperless/media
+        env_file: docker-compose.env
+        environment:
+            - PAPERLESS_OCR_LANGUAGES=
+        command: ["runserver", "0.0.0.0:8000"]
+
+    consumer:
+        image: paperless
+        volumes:
+            - paperless-data:/usr/src/paperless/data
+            - paperless-media:/usr/src/paperless/media
+            # You have to adapt the local path you want the consumption
+            # directory to mount to by modifying the part before the ':'.
+            - /path/to/arbitrary/place:/consume
+        env_file: docker-compose.env
+        command: ["document_consumer"]
+
+volumes:
+    paperless-data:
+    paperless-media:
diff --git a/docs/Dockerfile b/docs/Dockerfile
new file mode 100644
index 000000000..ee63aebb4
--- /dev/null
+++ b/docs/Dockerfile
@@ -0,0 +1,18 @@
+FROM python:3.5.1
+MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
+
+# Install Sphinx and Pygments
+RUN pip install Sphinx Pygments
+
+# Setup directories, copy data
+RUN mkdir /build
+COPY . /build
+WORKDIR /build/docs
+
+# Build documentation
+RUN make html
+
+# Start webserver
+WORKDIR /build/docs/_build/html
+EXPOSE 8000/tcp
+CMD ["python3", "-m", "http.server"]
diff --git a/docs/migrating.rst b/docs/migrating.rst
index 46083533a..1e03bb3cb 100644
--- a/docs/migrating.rst
+++ b/docs/migrating.rst
@@ -30,6 +30,20 @@ as part of the update:
 Note that it's possible (even likely) that while ``git pull`` may update some
 files, the ``migrate`` step may not update anything.  This is totally normal.
 
+If you are :ref:`using Docker <setup-installation-docker>` the update process
+requires only one additional step:
+
+.. code-block:: shell-session
+
+    $ cd /path/to/project
+    $ git pull
+    $ docker build -t paperless .
+    $ docker-compose up -d
+    $ docker-compose run --rm webserver migrate
+
+If ``git pull`` doesn't report any changes, there is no need to continue with
+the remaining steps.
+
 
 .. _migrating-backup:
 
@@ -53,6 +67,65 @@ with Django's ``dumpdata`` command, which produces JSON output.
     $ ./manage.py document_export /path/to/arbitrary/place/
     $ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json
 
+If you are :ref:`using Docker <setup-installation-docker>`, exporting your tags
+as JSON is almost as easy:
+
+.. code-block:: shell-session
+
+    $ docker-compose run --rm webserver dumpdata documents.Tag > /path/to/arbitrary/place/tags.json
+
+Exporting the documents though is a little more involved, since docker-compose
+doesn't support mounting additional volumes with the ``run`` command. You have
+three general options:
+
+1. Use the consumption directory if you happen to already have it mounted to a
+   host directory.
+
+   .. code-block:: console
+
+       $ # Stop the consumer so that it doesn't consume the exported documents
+       $ docker-compose stop consumer
+       $ # Export into the consumption directory
+       $ docker-compose run --rm consumer document_exporter /consume
+
+2. Add another volume to ``docker-compose.yml`` for exports and use
+   ``docker-compose run``:
+
+   .. code-block:: diff
+
+      diff --git a/docker-compose.yml b/docker-compose.yml
+      --- a/docker-compose.yml
+      +++ b/docker-compose.yml
+      @@ -17,9 +18,8 @@ services:
+               volumes:
+                   - paperless-data:/usr/src/paperless/data
+                   - paperless-media:/usr/src/paperless/media
+                   - /consume
+      +            - /path/to/arbitrary/place:/export
+
+   .. code-block:: shell-session
+
+       $ docker-compose run --rm consumer document_exporter /export
+
+3. Use ``docker run`` directly, supplying the necessary commandline options:
+
+   .. code-block:: shell-session
+
+       $ # Identify your containers
+       $ docker-compose ps
+               Name                       Command                State     Ports
+       -------------------------------------------------------------------------
+       paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0
+       paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0
+
+       $ # Make sure to replace your passphrase and remove or adapt the id mapping
+       $ docker run --rm \
+           --volumes-from paperless_webserver_1 \
+           --volume /path/to/arbitrary/place:/export \
+           -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \
+           -e USERMAP_UID=1000 -e USERMAP_GID=1000 \
+           paperless document_exporter /export
+
 
 .. _migrating-restoring:
 
@@ -77,3 +150,25 @@ exported documents into the consumption directory and start up the consumer.
     $ cp /path/to/exported/docs/* /path/to/consumption/dir/
     $ ./manage.py document_consumer
 
+Importing your data if you are :ref:`using Docker <setup-installation-docker>`
+is almost as simple:
+
+.. code-block:: shell-session
+
+    $ # Stop and remove your current containers
+    $ docker-compose stop
+    $ docker-compose rm -f
+
+    $ # Recreate them, add the superuser
+    $ docker-compose up -d
+    $ docker-compose run --rm webserver createsuperuser
+
+    $ # Load the tags
+    $ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin -
+
+    $ # Load your exported documents into the consumption directory
+    $ # (How you do this highly depends on how you have set this up)
+    $ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/
+
+After loading the documents into the consumption directory the consumer will
+immediately start consuming the documents.
diff --git a/docs/requirements.rst b/docs/requirements.rst
index 1c4f989db..ee287d835 100644
--- a/docs/requirements.rst
+++ b/docs/requirements.rst
@@ -101,3 +101,16 @@ you'd like to generate your own docs locally, you'll need to:
     $ pip install sphinx
 
 and then cd into the ``docs`` directory and type ``make html``.
+
+If you are using Docker, you can use the following commands to build the
+documentation and run a webserver serving it on `port 8001`_:
+
+.. code:: bash
+
+    $ pwd
+    /path/to/paperless
+
+    $ docker build -t paperless:docs -f docs/Dockerfile .
+    $ docker run --rm -it -p "8001:8000" paperless:docs
+
+.. _port 8001: http://127.0.0.1:8001
diff --git a/docs/setup.rst b/docs/setup.rst
index 24a9b9fa2..796de88e6 100644
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -37,11 +37,18 @@ or just download the tarball and go that route:
 Installation & Configuration
 ----------------------------
 
-You can go two routes with setting up and running Paperless.  The *Vagrant*
-route is quick & easy, but means you're running a VM which comes with memory
-consumption etc.  Alternatively the standard, "bare metal" approach is a little
-more complicated.
+You can go multiple routes with setting up and running Paperless. The `Vagrant
+route`_ is quick & easy, but means you're running a VM which comes with memory
+consumption etc. We also `support Docker`_, which you can use natively under
+Linux and in a VM with `Docker Machine`_ (this guide was written for native
+Docker usage under Linux; you might have to adapt it for Docker Machine).
+Alternatively the standard, `bare metal`_ approach is a little more complicated.
 
+.. _Vagrant route: setup-installation-vagrant_
+.. _support Docker: setup-installation-docker_
+.. _bare metal: setup-installation-standard_
+
+.. _Docker Machine: https://docs.docker.com/machine/
 
 .. _setup-installation-standard:
 
@@ -118,6 +125,150 @@ Vagrant Method
 .. _Paperless server: http://172.28.128.4:8000
 
 
+.. _setup-installation-docker:
+
+Docker Method
+.............
+
+1. Install `Docker`_.
+
+   .. caution::
+
+      As mentioned earlier, this guide assumes that you use Docker natively
+      under Linux. If you are using `Docker Machine`_ under Mac OS X or Windows,
+      you will have to adapt IP addresses, volume-mounting, command execution
+      and maybe more.
+
+2. Install `docker-compose`_. [#compose]_
+
+   .. caution::
+
+       If you want to use the included ``docker-compose.yml.example`` file, you
+       need to have at least Docker version **1.10.0** and docker-compose
+       version **1.6.0**.
+
+       See the `Docker installation guide`_ on how to install the current
+       version of Docker for your operating system or Linux distribution of
+       choice. To get an up-to-date version of docker-compose, follow the
+       `docker-compose installation guide`_ if your package repository doesn't
+       include it.
+
+       .. _Docker installation guide: https://docs.docker.com/engine/installation/
+       .. _docker-compose installation guide: https://docs.docker.com/compose/install/
+
+3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml``.
+4. Modify ``docker-compose.env`` and adapt the following environment variables:
+
+   ``PAPERLESS_PASSPHRASE``
+     This is the passphrase Paperless uses to encrypt/decrypt the original
+     document.
+
+   ``PAPERLESS_OCR_THREADS``
+     This is the number of threads the OCR process will spawn to process
+     document pages in parallel. If the variable is not set, Python determines
+     the core-count of your CPU and uses that value.
+
+   ``PAPERLESS_OCR_LANGUAGES``
+     If you want the OCR to recognize other languages in addition to the default
+     English, set this parameter to a space separated list of three-letter
+     language-codes after `ISO 639-2/T`_. For a list of available languages --
+     including their three letter codes -- see the `Debian packagelist`_.
+
+   ``USERMAP_UID`` and ``USERMAP_GID``
+     If you want to mount the consumption volume (directory ``/consume`` within
+     the containers) to a host-directory -- which you probably want to do --
+     access rights might be an issue. The default user and group ``paperless``
+     in the containers have an id of 1000. The containers will enforce that the
+     owning group of the consumption directory will be ``paperless`` to be able
+     to delete consumed documents. If your host-system has a group with an id of
+     1000 and you don't want this group to have access rights to the consumption
+     directory, you can use ``USERMAP_GID`` to change the id in the container
+     and thus the one of the consumption directory. Furthermore, you can change
+     the id of the default user as well using ``USERMAP_UID``.
+
+5. Run ``docker-compose up -d``. This will create and start the necessary
+   containers.
+6. To be able to log in, you will need a super user. To create it, execute the
+   following command:
+
+   .. code-block:: shell-session
+
+       $ docker-compose run --rm webserver createsuperuser
+
+   This will prompt you to set a username (default ``paperless``), an optional
+   e-mail address and finally a password.
+7. The default ``docker-compose.yml`` exposes the webserver on your local port
+   8000. If you haven't adapted this, you should now be able to visit your
+   `Paperless webserver <http://127.0.0.1:8000>`_. You can log in with the
+   user and password you just created.
+8. Add files to the consumption directory in whichever way you prefer. The
+   following are two possible options:
+
+   1. Mount the consumption directory to a local host path by modifying your
+      ``docker-compose.yml``:
+
+      .. code-block:: diff
+
+         diff --git a/docker-compose.yml b/docker-compose.yml
+         --- a/docker-compose.yml
+         +++ b/docker-compose.yml
+         @@ -17,9 +18,8 @@ services:
+                  volumes:
+                      - paperless-data:/usr/src/paperless/data
+                      - paperless-media:/usr/src/paperless/media
+         -            - /consume
+         +            - /local/path/you/choose:/consume
+
+      .. danger::
+
+          While the consumption container will ensure at startup that it can
+          **delete** a consumed file from a host-mounted directory, it might not
+          be able to **read** the document in the first place if the access
+          rights to the file are incorrect.
+
+          Make sure that the documents you put into the consumption directory
+          will either be readable by everyone (``chmod o+r file.pdf``) or
+          readable by the default user or group id 1000 (or the one you have set
+          with ``USERMAP_UID`` or ``USERMAP_GID`` respectively).
+
+   2. Use ``docker cp`` to copy your files directly into the container:
+
+      .. code-block:: shell-session
+
+         $ # Identify your containers
+         $ docker-compose ps
+                 Name                       Command                State     Ports
+         -------------------------------------------------------------------------
+         paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0
+         paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0
+
+         $ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume
+
+      ``docker cp`` is a one-shot-command, just like ``cp``. This means that
+      every time you want to consume a new document, you will have to execute
+      ``docker cp`` again. You can of course automate this process, but option 1
+      is generally the preferred one.
+
+      .. danger::
+
+          ``docker cp`` will change the owning user and group of a copied file
+          to the acting user at the destination, which will be ``root``.
+
+          You therefore need to ensure that the documents you want to copy into
+          the container are readable by everyone (``chmod o+r file.pdf``) before
+          copying them.
+
+
+.. _Docker: https://www.docker.com/
+.. _docker-compose: https://docs.docker.com/compose/install/
+.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
+.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr-
+
+.. [#compose] You of course don't have to use docker-compose, but it
+   simplifies deployment immensely. If you know your way around Docker, feel
+   free to tinker around without using compose!
+
+
 .. _making-things-a-little-more-permanent:
 
 Making Things a Little more Permanent
@@ -126,5 +277,9 @@ Making Things a Little more Permanent
 Once you've tested things and are happy with the work flow, you can automate the
 process of starting the webserver and consumer automatically.  If you're running
 on a bare metal system that's using Systemd, you can use the service unit files
-in the ``scripts`` directory to set this up.  If you're on a SysV or other
-startup system (like the Vagrant box), then you're currently on your own.
+in the ``scripts`` directory to set this up.  If you're on another startup
+system or are using a Vagrant box, then you're currently on your own. If you are
+using Docker, you can set a restart-policy_ in the ``docker-compose.yml`` to
+have the containers automatically start with the Docker daemon.
+
+.. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart
diff --git a/scripts/docker-entrypoint.sh b/scripts/docker-entrypoint.sh
new file mode 100644
index 000000000..9001574a1
--- /dev/null
+++ b/scripts/docker-entrypoint.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+set -e
+
+# Remap paperless' uid/gid to USERMAP_*; source: https://github.com/sameersbn/docker-gitlab/
+map_uidgid() {
+    USERMAP_ORIG_UID=$(id -u paperless)  # current uid of the paperless user
+    USERMAP_ORIG_GID=$(id -g paperless)  # current gid of the paperless user
+    USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}}
+    USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
+    if [[ ${USERMAP_UID} != ${USERMAP_ORIG_UID} || ${USERMAP_GID} != ${USERMAP_ORIG_GID} ]]; then
+        echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
+        groupmod -g ${USERMAP_GID} paperless
+        sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
+    fi
+}
+
+set_permissions() {
+    # Let the paperless group enter the consumption directory and delete from it
+    chgrp paperless "$PAPERLESS_CONSUME"
+    chmod g+wx "$PAPERLESS_CONSUME"
+
+    # Hand the application directory (code, data, media) to the paperless user
+    chown -Rh paperless:paperless /usr/src/paperless
+}
+
+initialize() {
+    map_uidgid
+    set_permissions
+}
+
+install_languages() {
+    local langs="$1"
+    read -ra langs <<<"$langs"
+
+    # Nothing requested (empty or whitespace-only value)
+    if [ ${#langs[@]} -eq 0 ]; then
+        return
+    fi
+
+    # Package lists are purged at image build time, so refresh them first
+    apt-get update
+
+    # Install each requested tesseract language pack
+    for lang in "${langs[@]}"; do
+        pkg="tesseract-ocr-$lang"
+        if dpkg -s "$pkg" > /dev/null 2>&1; then  # already installed
+            continue
+        fi
+
+        if ! apt-cache show "$pkg" > /dev/null 2>&1; then  # unknown package: skip
+            continue
+        fi
+
+        apt-get install -y "$pkg"
+    done
+
+    # Remove apt lists again to keep the container small
+    rm -rf /var/lib/apt/lists/*
+}
+
+
+if [[ "$1" != "/"* ]]; then
+    initialize
+
+    # Install additional languages if specified
+    if [ ! -z "$PAPERLESS_OCR_LANGUAGES"  ]; then
+        install_languages "$PAPERLESS_OCR_LANGUAGES"
+    fi
+
+    exec sudo -HEu paperless "/usr/src/paperless/src/manage.py" "$@"
+fi
+
+exec "$@"
+
diff --git a/src/documents/management/commands/loaddata_stdin.py b/src/documents/management/commands/loaddata_stdin.py
new file mode 100644
index 000000000..b6848f1eb
--- /dev/null
+++ b/src/documents/management/commands/loaddata_stdin.py
@@ -0,0 +1,23 @@
+"""
+Source:
+    https://gist.github.com/bmispelon/ad5a2c333443b3a1d051
+
+License:
+    MIT
+    Copyright (c) 2016 Baptiste Mispelon
+"""
+import sys
+
+from django.core.management.commands.loaddata import Command as LoadDataCommand
+
+
+class Command(LoadDataCommand):
+    def parse_name(self, fixture_name):
+        self.compression_formats['stdin'] = (lambda x,y: sys.stdin, None)
+        if fixture_name == '-':
+            return '-', 'json', 'stdin'
+
+    def find_fixtures(self, fixture_label):
+        if fixture_label == '-':  # "-" stands for the fixture on stdin
+            return [('-', None, '-')]
+        return super(Command, self).find_fixtures(fixture_label)

From ec88ea73f67e8f8b1d8f36da5d10296e75a26b4c Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Fri, 19 Feb 2016 00:45:02 +0000
Subject: [PATCH 06/71] #48: make the tag matching smarter

---
 src/documents/models.py          |   8 +--
 src/documents/tests/test_tags.py | 120 +++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+), 4 deletions(-)
 create mode 100644 src/documents/tests/test_tags.py

diff --git a/src/documents/models.py b/src/documents/models.py
index 03758eff5..d4d95aa38 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -105,21 +105,21 @@ class Tag(SluggedModel):
 
         if self.matching_algorithm == self.MATCH_ALL:
             for word in self.match.split(" "):
-                if word not in text:
+                if not re.search(r"\b{}\b".format(word), text):
                     return False
             return True
 
         if self.matching_algorithm == self.MATCH_ANY:
             for word in self.match.split(" "):
-                if word in text:
+                if re.search(r"\b{}\b".format(word), text):
                     return True
             return False
 
         if self.matching_algorithm == self.MATCH_LITERAL:
-            return self.match in text
+            return bool(re.search(r"\b{}\b".format(self.match), text))
 
         if self.matching_algorithm == self.MATCH_REGEX:
-            return re.search(re.compile(self.match), text)
+            return bool(re.search(re.compile(self.match), text))
 
         raise NotImplementedError("Unsupported matching algorithm")
 
diff --git a/src/documents/tests/test_tags.py b/src/documents/tests/test_tags.py
new file mode 100644
index 000000000..f3518e012
--- /dev/null
+++ b/src/documents/tests/test_tags.py
@@ -0,0 +1,120 @@
+from django.test import TestCase
+
+from ..models import Tag
+
+
+class TestTagMatching(TestCase):  # Tag.matches() per matching algorithm
+
+    def test_match_all(self):
+        """MATCH_ALL: every word must occur as a whole word."""
+        t = Tag.objects.create(
+            name="Test 0",
+            match="alpha charlie gamma",
+            matching_algorithm=Tag.MATCH_ALL
+        )
+        self.assertFalse(t.matches("I have alpha in me"))
+        self.assertFalse(t.matches("I have charlie in me"))
+        self.assertFalse(t.matches("I have gamma in me"))
+        self.assertFalse(t.matches("I have alpha and charlie in me"))
+        self.assertTrue(t.matches("I have alpha, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas in me"))
+        self.assertFalse(t.matches("I have bravo in me"))
+
+        t = Tag.objects.create(
+            name="Test 1",
+            match="12 34 56",
+            matching_algorithm=Tag.MATCH_ALL
+        )
+        self.assertFalse(t.matches("I have 12 in me"))
+        self.assertFalse(t.matches("I have 34 in me"))
+        self.assertFalse(t.matches("I have 56 in me"))
+        self.assertFalse(t.matches("I have 12 and 34 in me"))
+        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
+        self.assertFalse(t.matches("I have 120, 34, and 56 in me"))
+        self.assertFalse(t.matches("I have 123456 in me"))
+        self.assertFalse(t.matches("I have 01234567 in me"))
+
+    def test_match_any(self):
+        """MATCH_ANY: one whole-word occurrence is enough."""
+        t = Tag.objects.create(
+            name="Test 0",
+            match="alpha charlie gamma",
+            matching_algorithm=Tag.MATCH_ANY
+        )
+
+        self.assertTrue(t.matches("I have alpha in me"))
+        self.assertTrue(t.matches("I have charlie in me"))
+        self.assertTrue(t.matches("I have gamma in me"))
+        self.assertTrue(t.matches("I have alpha and charlie in me"))
+        self.assertFalse(t.matches("I have alphas in me"))
+        self.assertFalse(t.matches("I have bravo in me"))
+
+        t = Tag.objects.create(
+            name="Test 1",
+            match="12 34 56",
+            matching_algorithm=Tag.MATCH_ANY
+        )
+        self.assertTrue(t.matches("I have 12 in me"))
+        self.assertTrue(t.matches("I have 34 in me"))
+        self.assertTrue(t.matches("I have 56 in me"))
+        self.assertTrue(t.matches("I have 12 and 34 in me"))
+        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
+        self.assertTrue(t.matches("I have 120, 34, and 560 in me"))
+        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
+        self.assertFalse(t.matches("I have 123456 in me"))
+        self.assertFalse(t.matches("I have 01234567 in me"))
+
+    def test_match_literal(self):
+        """MATCH_LITERAL: the exact phrase must occur."""
+        t = Tag.objects.create(
+            name="Test 0",
+            match="alpha charlie gamma",
+            matching_algorithm=Tag.MATCH_LITERAL
+        )
+
+        self.assertFalse(t.matches("I have alpha in me"))
+        self.assertFalse(t.matches("I have charlie in me"))
+        self.assertFalse(t.matches("I have gamma in me"))
+        self.assertFalse(t.matches("I have alpha and charlie in me"))
+        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
+        self.assertTrue(t.matches("I have 'alpha charlie gamma' in me"))
+        self.assertFalse(t.matches("I have alphas in me"))
+        self.assertFalse(t.matches("I have bravo in me"))
+
+        t = Tag.objects.create(
+            name="Test 1",
+            match="12 34 56",
+            matching_algorithm=Tag.MATCH_LITERAL
+        )
+        self.assertFalse(t.matches("I have 12 in me"))
+        self.assertFalse(t.matches("I have 34 in me"))
+        self.assertFalse(t.matches("I have 56 in me"))
+        self.assertFalse(t.matches("I have 12 and 34 in me"))
+        self.assertFalse(t.matches("I have 12 34, and 56 in me"))
+        self.assertFalse(t.matches("I have 120, 34, and 560 in me"))
+        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
+        self.assertFalse(t.matches("I have 123456 in me"))
+        self.assertFalse(t.matches("I have 01234567 in me"))
+        self.assertTrue(t.matches("I have 12 34 56 in me"))
+
+    def test_match_regex(self):
+        """MATCH_REGEX: the pattern is searched for in the text."""
+        t = Tag.objects.create(
+            name="Test 0",
+            match=r"alpha\w+gamma",  # raw string: \w is a regex class
+            matching_algorithm=Tag.MATCH_REGEX
+        )
+
+        self.assertFalse(t.matches("I have alpha in me"))
+        self.assertFalse(t.matches("I have gamma in me"))
+        self.assertFalse(t.matches("I have alpha and charlie in me"))
+        self.assertTrue(t.matches("I have alpha_and_gamma in me"))
+        self.assertTrue(t.matches("I have alphas_and_gamma in me"))
+        self.assertFalse(t.matches("I have alpha,and,gamma in me"))
+        self.assertFalse(t.matches("I have alpha and gamma in me"))
+        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas in me"))
+

From c45f951ca017f1fa94c87d15768e6ed06d99ca15 Mon Sep 17 00:00:00 2001
From: Pit Kleyersburg <pitkley@googlemail.com>
Date: Fri, 19 Feb 2016 09:52:32 +0100
Subject: [PATCH 07/71] Ignore error if orientation detection fails

Fixes an additional issue that came up in #48.
---
 src/documents/consumer.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 12761e992..21484036b 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -17,6 +17,7 @@ from PIL import Image
 from django.conf import settings
 from django.utils import timezone
 from django.template.defaultfilters import slugify
+from pyocr.tesseract import TesseractError
 
 from logger.models import Log
 from paperless.db import GnuPG
@@ -29,8 +30,11 @@ def image_to_string(args):
     self, png, lang = args
     with Image.open(os.path.join(self.SCRATCH, png)) as f:
         if self.OCR.can_detect_orientation():
-            orientation = self.OCR.detect_orientation(f, lang=lang)
-            f = f.rotate(orientation["angle"], expand=1)
+            try:
+                orientation = self.OCR.detect_orientation(f, lang=lang)
+                f = f.rotate(orientation["angle"], expand=1)
+            except TesseractError:
+                pass
         return self.OCR.image_to_string(f, lang=lang)
 
 

From 3a8755e4c8e8ea09a091985852da6bdba5355ed3 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Fri, 19 Feb 2016 17:26:40 +0000
Subject: [PATCH 08/71] Document the retagger

Fixes #54
---
 docs/utilities.rst | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/docs/utilities.rst b/docs/utilities.rst
index 2b795d31a..f5b452a6f 100644
--- a/docs/utilities.rst
+++ b/docs/utilities.rst
@@ -105,3 +105,30 @@ import, so should you can now safely delete the entire project directly,
 database, encrypted PDFs and all, and later create it all again simply by
 running the consumer again and dumping all of these files into
 ``CONSUMPTION_DIR``.
+
+
+.. _utilities-retagger:
+
+The Re-tagger
+-------------
+
+Say you've imported a few hundred documents and now want to introduce a tag
+and apply its matching to all of the currently-imported docs.  This problem is
+common enough that there's a tool for it.
+
+
+.. _utilities-retagger-howto:
+
+How to Use It
+.............
+
+This too is done via the ``manage.py`` script:
+
+.. code:: bash
+
+    $ /path/to/paperless/src/manage.py document_retagger
+
+That's it.  It'll loop over all of the documents in your database and attempt
+to match all of your tags to them.  If one matches, it'll be applied.  And
+don't worry, you can run this as often as you like, it won't double-tag
+a document.

From 147f8f72a2b76f3118d7f28fe316a2c7e49412fe Mon Sep 17 00:00:00 2001
From: Tikitu de Jager <tikitu@minddistrict.com>
Date: Fri, 19 Feb 2016 09:48:43 +0200
Subject: [PATCH 09/71] Simplify instructions for exporting with docker

The export workflow reusing the `/consume` volume is complex and error-
prone, and not at all necessary if the `docker-compose.yml` file has a
volume for `/export` from the beginning.
---
 docker-compose.yml.example |  6 ++++
 docs/migrating.rst         | 68 ++++++++++++++------------------------
 2 files changed, 30 insertions(+), 44 deletions(-)

diff --git a/docker-compose.yml.example b/docker-compose.yml.example
index f8e9b5b93..7e3557aa8 100644
--- a/docker-compose.yml.example
+++ b/docker-compose.yml.example
@@ -23,6 +23,12 @@ services:
             # You have to adapt the local path you want the consumption
             # directory to mount to by modifying the part before the ':'.
             - /path/to/arbitrary/place:/consume
+            # Likewise, you can add a local path to mount a directory for
+            # exporting. This is not strictly needed for paperless to
+            # function, only if you're exporting your files: uncomment
+            # it and fill in a local path if you know you're going to 
+            # want to export your documents.
+            # - /path/to/another/arbitrary/place:/export
         env_file: docker-compose.env
         command: ["document_consumer"]
 
diff --git a/docs/migrating.rst b/docs/migrating.rst
index 1e03bb3cb..491eeace4 100644
--- a/docs/migrating.rst
+++ b/docs/migrating.rst
@@ -74,57 +74,37 @@ as JSON is almost as easy:
 
     $ docker-compose run --rm webserver dumpdata documents.Tag > /path/to/arbitrary/place/tags.json
 
-Exporting the documents though is a little more involved, since docker-compose
-doesn't support mounting additional volumes with the ``run`` command. You have
-three general options:
+To export the documents you can either use ``docker run`` directly, specifying all
+the commandline options by hand, or (more simply) mount a second volume for export.
 
-1. Use the consumption directory if you happen to already have it mounted to a
-   host directory.
+To mount a volume for exports, follow the instructions in the
+``docker-compose.yml.example`` file for the ``/export`` volume (making the changes
+in your own ``docker-compose.yml`` file, of course). Once you have the
+volume mounted, the command to run an export is:
 
-   .. code-block:: console
+.. code-block:: console
 
-       $ # Stop the consumer so that it doesn't consume the exported documents
-       $ docker-compose stop consumer
-       $ # Export into the consumption directory
-       $ docker-compose run --rm consumer document_exporter /consume
+   $ docker-compose run --rm consumer document_exporter /export
 
-2. Add another volume to ``docker-compose.yml`` for exports and use
-   ``docker-compose run``:
+If you prefer to use ``docker run`` directly, supplying the necessary commandline
+options:
 
-   .. code-block:: diff
+.. code-block:: shell-session
 
-      diff --git a/docker-compose.yml b/docker-compose.yml
-      --- a/docker-compose.yml
-      +++ b/docker-compose.yml
-      @@ -17,9 +18,8 @@ services:
-               volumes:
-                   - paperless-data:/usr/src/paperless/data
-                   - paperless-media:/usr/src/paperless/media
-                   - /consume
-      +            - /path/to/arbitrary/place:/export
+   $ # Identify your containers
+   $ docker-compose ps
+           Name                       Command                State     Ports
+   -------------------------------------------------------------------------
+   paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0
+   paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0
 
-   .. code-block:: shell-session
-
-       $ docker-compose run --rm consumer document_exporter /export
-
-3. Use ``docker run`` directly, supplying the necessary commandline options:
-
-   .. code-block:: shell-session
-
-       $ # Identify your containers
-       $ docker-compose ps
-               Name                       Command                State     Ports
-       -------------------------------------------------------------------------
-       paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0
-       paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0
-
-       $ # Make sure to replace your passphrase and remove or adapt the id mapping
-       $ docker run --rm \
-           --volumes-from paperless_data_1 \
-           --volume /path/to/arbitrary/place:/export \
-           -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \
-           -e USERMAP_UID=1000 -e USERMAP_GID=1000 \
-           paperless document_exporter /export
+   $ # Make sure to replace your passphrase and remove or adapt the id mapping
+   $ docker run --rm \
+       --volumes-from paperless_data_1 \
+       --volume /path/to/arbitrary/place:/export \
+       -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \
+       -e USERMAP_UID=1000 -e USERMAP_GID=1000 \
+       paperless document_exporter /export
 
 
 .. _migrating-restoring:

From 438b161a25d6d26fd8c5bc0b3aa9d20ea2f6376a Mon Sep 17 00:00:00 2001
From: Tikitu de Jager <tikitu@minddistrict.com>
Date: Fri, 19 Feb 2016 22:51:49 +0200
Subject: [PATCH 10/71] Move `docker-compose.env` to
 `docker-compose.env.example` & adjust docs

This file, like `docker-compose.yml`, should be edited by the user. To
avoid merge conflicts when pulling updates, the edited version should
not be committed to the repository.
---
 .gitignore                                    |  1 +
 ...-compose.env => docker-compose.env.example |  0
 docs/setup.rst                                | 19 +++++++++++++------
 3 files changed, 14 insertions(+), 6 deletions(-)
 rename docker-compose.env => docker-compose.env.example (100%)

diff --git a/.gitignore b/.gitignore
index 2c65f8dcd..d4c3fe38e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,6 +69,7 @@ db.sqlite3
 virtualenv
 .vagrant
 docker-compose.yml
+docker-compose.env
 
 # Used for development
 scripts/import-for-development
diff --git a/docker-compose.env b/docker-compose.env.example
similarity index 100%
rename from docker-compose.env
rename to docker-compose.env.example
diff --git a/docs/setup.rst b/docs/setup.rst
index 796de88e6..be8a349d8 100644
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -156,8 +156,15 @@ Docker Method
        .. _Docker installation guide: https://docs.docker.com/engine/installation/
        .. _docker-compose installation guide: https://docs.docker.com/compose/install/
 
-3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml``.
-4. Modify ``docker-compose.env`` and adapt the following environment variables:
+3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml`` and
+   a copy of ``docker-compose.env.example`` as ``docker-compose.env``. You'll be
+   editing both these files: taking a copy ensures that you can ``git pull`` to 
+   receive updates without risking merge conflicts with your modified versions 
+   of the configuration files.
+4. Modify ``docker-compose.yml`` to your preferences, following the instructions
+   in comments in the file. The only change that is a hard requirement is to 
+   specify where the consumption directory should mount.
+5. Modify ``docker-compose.env`` and adapt the following environment variables:
 
    ``PAPERLESS_PASSPHRASE``
      This is the passphrase Paperless uses to encrypt/decrypt the original
@@ -186,9 +193,9 @@ Docker Method
      and thus the one of the consumption directory. Furthermore, you can change
      the id of the default user as well using ``USERMAP_UID``.
 
-5. Run ``docker-compose up -d``. This will create and start the necessary
+6. Run ``docker-compose up -d``. This will create and start the necessary
    containers.
-6. To be able to login, you will need a super user. To create it, execute the
+7. To be able to login, you will need a super user. To create it, execute the
    following command:
 
    .. code-block:: shell-session
@@ -197,11 +204,11 @@ Docker Method
 
    This will prompt you to set a username (default ``paperless``), an optional
    e-mail address and finally a password.
-7. The default ``docker-compose.yml`` exports the webserver on your local port
+8. The default ``docker-compose.yml`` exports the webserver on your local port
    8000. If you haven't adapted this, you should now be able to visit your
    `Paperless webserver`_ at ``http://127.0.0.1:8000``. You can login with the
    user and password you just created.
-8. Add files to consumption directory the way you prefer to. Following are two
+9. Add files to consumption directory the way you prefer to. Following are two
    possible options:
 
    1. Mount the consumption directory to a local host path by modifying your

From 51b19f4c19fc38e45712c12f41fa86c8a7dac75f Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sat, 20 Feb 2016 22:30:01 +0000
Subject: [PATCH 11/71] Issue #57

---
 src/documents/consumer.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 21484036b..d6818cf5d 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -128,9 +128,11 @@ class Consumer(object):
             except OCRError:
                 self._ignore.append(doc)
                 Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
+                self._cleanup_tempdir(tempdir)
                 continue
-            finally:
-                self._cleanup(tempdir, doc)
+            else:
+                self._cleanup_tempdir(tempdir)
+                self._cleanup_doc(doc)
 
     def _get_greyscale(self, tempdir, doc):
 
@@ -146,8 +148,12 @@ class Consumer(object):
             "-type", "grayscale", doc, png
         )).wait()
 
-        pngs = [os.path.join(tempdir, f) for f in os.listdir(tempdir) if f.startswith("convert")]
-        return sorted(filter(lambda f: os.path.isfile(f), pngs))
+        pngs = []
+        for f in os.listdir(tempdir):
+            if f.startswith("convert"):
+                pngs.append(os.path.join(tempdir, f))
+
+        return sorted(filter(lambda __: os.path.isfile(__), pngs))
 
     @staticmethod
     def _guess_language(text):
@@ -308,12 +314,13 @@ class Consumer(object):
                 Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
                 encrypted.write(GnuPG.encrypted(unencrypted))
 
-    def _cleanup(self, tempdir, doc):
-        # Remove temporary directory recursively
-        Log.debug("Deleting directory {}".format(tempdir), Log.COMPONENT_CONSUMER)
-        shutil.rmtree(tempdir)
+    @staticmethod
+    def _cleanup_tempdir(d):
+        Log.debug("Deleting directory {}".format(d), Log.COMPONENT_CONSUMER)
+        shutil.rmtree(d)
 
-        # Remove doc
+    @staticmethod
+    def _cleanup_doc(doc):
         Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
         os.unlink(doc)
 

From 422ae9303ac72dfad3fe53c598b1b53fc4d616c1 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 00:14:50 +0000
Subject: [PATCH 12/71] pep8

---
 src/documents/admin.py                        | 52 +++++++++++++------
 src/documents/consumer.py                     | 11 ++--
 src/documents/forms.py                        |  8 +--
 src/documents/languages.py                    |  6 +--
 .../management/commands/document_retagger.py  |  4 +-
 .../management/commands/loaddata_stdin.py     |  2 +-
 src/documents/mixins.py                       |  4 +-
 src/documents/models.py                       |  8 +--
 src/documents/serialisers.py                  |  3 +-
 src/documents/tests/test_consumer.py          |  4 +-
 src/documents/tests/test_tags.py              |  1 -
 src/paperless/urls.py                         | 12 ++++-
 tox.ini                                       | 14 +++++
 13 files changed, 89 insertions(+), 40 deletions(-)
 create mode 100644 tox.ini

diff --git a/src/documents/admin.py b/src/documents/admin.py
index d3bdd3ba4..42c3fc968 100644
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -56,26 +56,35 @@ class DocumentAdmin(admin.ModelAdmin):
     def tags_(self, obj):
         r = ""
         for tag in obj.tags.all():
-            r += '<a class="tag" style="background-color: {};" href="{}">{}</a>'.format(
-                tag.get_colour_display(),
-                "{}?tags__id__exact={}".format(
-                    reverse("admin:documents_document_changelist"),
-                    tag.pk
-                ),
-                tag.slug
+            colour = tag.get_colour_display()
+            r += html_tag(
+                "a",
+                tag.slug,
+                **{
+                    "class": "tag",
+                    "style": "background-color: {};".format(colour),
+                    "href": "{}?tags__id__exact={}".format(
+                        reverse("admin:documents_document_changelist"),
+                        tag.pk
+                    )
+                }
             )
         return r
     tags_.allow_tags = True
 
     def document(self, obj):
-        return '<a href="{}">' \
-                 '<img src="{}" width="22" height="22" alt="{} icon" title="{}">' \
-               '</a>'.format(
-                    obj.download_url,
-                    static("documents/img/{}.png".format(obj.file_type)),
-                    obj.file_type,
-                    obj.file_name
-                )
+        return html_tag(
+            "a",
+            html_tag(
+                "img",
+                src=static("documents/img/{}.png".format(obj.file_type)),
+                width=22,
+                height=22,
+                alt=obj.file_type,
+                title=obj.file_name
+            ),
+            href=obj.download_url
+        )
     document.allow_tags = True
 
 admin.site.register(Sender)
@@ -85,3 +94,16 @@ admin.site.register(Document, DocumentAdmin)
 # Unless we implement multi-user, these default registrations don't make sense.
 admin.site.unregister(Group)
 admin.site.unregister(User)
+
+
+def html_tag(kind, inside=None, **kwargs):
+    """Build a raw HTML element string; values are not escaped."""
+    rendered = " ".join(
+        '{}="{}"'.format(name, value) for name, value in kwargs.items()
+    )
+
+    if inside is None:
+        # No content given: emit the self-closing form.
+        return "<{} {}/>".format(kind, rendered)
+
+    return "<{} {}>{}</{}>".format(kind, rendered, inside, kind)
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index d6818cf5d..6cf3b3d9d 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -127,7 +127,8 @@ class Consumer(object):
                 self._store(text, doc)
             except OCRError:
                 self._ignore.append(doc)
-                Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
+                Log.error(
+                    "OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
                 self._cleanup_tempdir(tempdir)
                 continue
             else:
@@ -190,8 +191,8 @@ class Consumer(object):
             Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER)
             if settings.FORGIVING_OCR:
                 Log.warning(
-                    "As FORGIVING_OCR is enabled, we're going to make the best "
-                    "with what we have.",
+                    "As FORGIVING_OCR is enabled, we're going to make the "
+                    "best with what we have.",
                     Log.COMPONENT_CONSUMER
                 )
                 raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
@@ -246,8 +247,8 @@ class Consumer(object):
 
     def _guess_attributes_from_name(self, parseable):
         """
-        We use a crude naming convention to make handling the sender, title, and
-        tags easier:
+        We use a crude naming convention to make handling the sender, title,
+        and tags easier:
           "<sender> - <title> - <tags>.<suffix>"
           "<sender> - <title>.<suffix>"
           "<title>.<suffix>"
diff --git a/src/documents/forms.py b/src/documents/forms.py
index d544917b4..404be1763 100644
--- a/src/documents/forms.py
+++ b/src/documents/forms.py
@@ -26,15 +26,17 @@ class UploadForm(forms.Form):
     sender = forms.CharField(
         max_length=Sender._meta.get_field("name").max_length, required=False)
     title = forms.CharField(
-        max_length=Document._meta.get_field("title").max_length, required=False)
+        max_length=Document._meta.get_field("title").max_length,
+        required=False
+    )
     document = forms.FileField()
     signature = forms.CharField(max_length=256)
 
     def clean_sender(self):
         """
         I suppose it might look cleaner to use .get_or_create() here, but that
-        would also allow someone to fill up the db with bogus senders before all
-        validation was met.
+        would also allow someone to fill up the db with bogus senders before
+        all validation was met.
         """
         sender = self.cleaned_data.get("sender")
         if not sender:
diff --git a/src/documents/languages.py b/src/documents/languages.py
index 2bfafe08a..5ea560654 100644
--- a/src/documents/languages.py
+++ b/src/documents/languages.py
@@ -185,10 +185,10 @@ ISO639 = {
     "yo": "yor",
     "za": "zha",
 
-    # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra".  I have
-    # no idea which one is better, so I just picked the bigger file.
+    # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra".  I
+    # have no idea which one is better, so I just picked the bigger file.
     "zh": "chi_tra",
 
     "zu": "zul"
 
-}
\ No newline at end of file
+}
diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py
index 09a3fb917..8f56e1eea 100644
--- a/src/documents/management/commands/document_retagger.py
+++ b/src/documents/management/commands/document_retagger.py
@@ -10,8 +10,8 @@ class Command(Renderable, BaseCommand):
     help = """
         Using the current set of tagging rules, apply said rules to all
         documents in the database, effectively allowing you to back-tag all
-        previously indexed documents with tags created (or modified) after their
-        initial import.
+        previously indexed documents with tags created (or modified) after
+        their initial import.
     """.replace("    ", "")
 
     def __init__(self, *args, **kwargs):
diff --git a/src/documents/management/commands/loaddata_stdin.py b/src/documents/management/commands/loaddata_stdin.py
index b6848f1eb..ca0b9ef7b 100644
--- a/src/documents/management/commands/loaddata_stdin.py
+++ b/src/documents/management/commands/loaddata_stdin.py
@@ -13,7 +13,7 @@ from django.core.management.commands.loaddata import Command as LoadDataCommand
 
 class Command(LoadDataCommand):
     def parse_name(self, fixture_name):
-        self.compression_formats['stdin'] = (lambda x,y: sys.stdin, None)
+        self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)
         if fixture_name == '-':
             return '-', 'json', 'stdin'
 
diff --git a/src/documents/mixins.py b/src/documents/mixins.py
index 881589fa3..4d4e9783f 100644
--- a/src/documents/mixins.py
+++ b/src/documents/mixins.py
@@ -1,7 +1,7 @@
 class Renderable(object):
     """
-    A handy mixin to make it easier/cleaner to print output based on a verbosity
-    value.
+    A handy mixin to make it easier/cleaner to print output based on a
+    verbosity value.
     """
 
     def _render(self, text, verbosity):
diff --git a/src/documents/models.py b/src/documents/models.py
index d4d95aa38..267bebffe 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -36,7 +36,7 @@ class Sender(SluggedModel):
 
 
 class Tag(SluggedModel):
-    
+
     COLOURS = (
         (1, "#a6cee3"),
         (2, "#1f78b4"),
@@ -71,9 +71,9 @@ class Tag(SluggedModel):
         default=MATCH_ANY,
         help_text=(
             "Which algorithm you want to use when matching text to the OCR'd "
-            "PDF.  Here, \"any\" looks for any occurrence of any word provided "
-            "in the PDF, while \"all\" requires that every word provided "
-            "appear in the PDF, albeit not in the order provided.  A "
+            "PDF.  Here, \"any\" looks for any occurrence of any word "
+            "provided in the PDF, while \"all\" requires that every word "
+            "provided appear in the PDF, albeit not in the order provided.  A "
             "\"literal\" match means that the text you enter must appear in "
             "the PDF exactly as you've entered it, and \"regular expression\" "
             "uses a regex to match the PDF.  If you don't know what a regex "
diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py
index 209c778a1..f23a482c6 100644
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -14,7 +14,8 @@ class TagSerializer(serializers.ModelSerializer):
 
     class Meta(object):
         model = Tag
-        fields = ("id", "slug", "name", "colour", "match", "matching_algorithm")
+        fields = (
+            "id", "slug", "name", "colour", "match", "matching_algorithm")
 
 
 class DocumentSerializer(serializers.ModelSerializer):
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
index 7cee524c3..6db501e02 100644
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -4,10 +4,10 @@ from ..consumer import Consumer
 
 
 class TestAttachment(TestCase):
-    
+
     TAGS = ("tag1", "tag2", "tag3")
     CONSUMER = Consumer()
-    
+
     def _test_guess_attributes_from_name(self, path, sender, title, tags):
         for suffix in ("pdf", "png", "jpg", "jpeg", "gif"):
             f = path.format(suffix)
diff --git a/src/documents/tests/test_tags.py b/src/documents/tests/test_tags.py
index f3518e012..e0ab43244 100644
--- a/src/documents/tests/test_tags.py
+++ b/src/documents/tests/test_tags.py
@@ -117,4 +117,3 @@ class TestTagMatching(TestCase):
         self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
         self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
         self.assertFalse(t.matches("I have alphas in me"))
-
diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index d8a48995d..5803a6685 100644
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -29,10 +29,20 @@ router.register(r'tags', TagViewSet)
 router.register(r'documents', DocumentViewSet)
 
 urlpatterns = [
-    url(r"^api/auth/", include('rest_framework.urls', namespace='rest_framework')),
+
+    # API
+    url(
+        r"^api/auth/",
+        include('rest_framework.urls', namespace='rest_framework')
+    ),
     url(r"^api/", include(router.urls)),
+
+    # File downloads
     url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"),
+
+    # The Django admin
     url(r"", admin.site.urls),
+
 ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
 
 if settings.UPLOAD_SHARED_SECRET:
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 000000000..360385de8
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,14 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+#[tox]
+#envlist = py34, py35
+
+#[testenv]
+#commands = {envpython} src/manage.py test
+#deps =
+
+[pep8]
+exclude=migrations,src/paperless/settings.py

From 440614eddc3f35db448f40426b4302f9cc218387 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 00:29:21 +0000
Subject: [PATCH 13/71] Got tox working

---
 src/tox.ini | 23 +++++++++++++++++++++++
 tox.ini     | 14 --------------
 2 files changed, 23 insertions(+), 14 deletions(-)
 create mode 100644 src/tox.ini
 delete mode 100644 tox.ini

diff --git a/src/tox.ini b/src/tox.ini
new file mode 100644
index 000000000..962e39a19
--- /dev/null
+++ b/src/tox.ini
@@ -0,0 +1,23 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+skipsdist = True
+envlist = py34, py35, pep8
+
+[testenv]
+commands = {envpython} manage.py test
+deps = -r{toxinidir}/../requirements.txt
+setenv =
+    PAPERLESS_CONSUME=/tmp/paperless/consume
+    PAPERLESS_PASSPHRASE=THISISNOTASECRET
+    PAPERLESS_SECRET=paperless
+
+[testenv:pep8]
+commands=pep8
+deps=pep8
+
+[pep8]
+exclude=.tox,migrations,paperless/settings.py
diff --git a/tox.ini b/tox.ini
deleted file mode 100644
index 360385de8..000000000
--- a/tox.ini
+++ /dev/null
@@ -1,14 +0,0 @@
-# Tox (http://tox.testrun.org/) is a tool for running tests
-# in multiple virtualenvs. This configuration file will run the
-# test suite on all supported python versions. To use it, "pip install tox"
-# and then run "tox" from this directory.
-
-#[tox]
-#envlist = py34, py35
-
-#[testenv]
-#commands = {envpython} src/manage.py test
-#deps =
-
-[pep8]
-exclude=migrations,src/paperless/settings.py

From 809fb8fa1ff58c83e262d5470e8990a000c676a1 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 00:29:59 +0000
Subject: [PATCH 14/71] Moved the default GNUPG home to /tmp for
 tox-friendliness

---
 requirements.txt          | 1 +
 src/paperless/settings.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 6dd8b32b5..25f4a0a40 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 Django==1.9
 django-extensions==1.6.1
+djangorestframework==3.3.2
 filemagic==1.6
 langdetect==1.0.5
 Pillow==3.0.0
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index d31879110..07918882f 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -158,7 +158,7 @@ OCR_THREADS = os.environ.get("PAPERLESS_OCR_THREADS")
 FORGIVING_OCR = True
 
 # GNUPG needs a home directory for some reason
-GNUPG_HOME = os.environ.get("HOME", "/dev/null")
+GNUPG_HOME = os.environ.get("HOME", "/tmp")
 
 # Convert is part of the Imagemagick package
 CONVERT_BINARY = "/usr/bin/convert"

From 17d3a449525248e77327f22e1b141c76c2adfabc Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 00:55:38 +0000
Subject: [PATCH 15/71] A crude API is in place

---
 src/documents/serialisers.py | 10 ++++++----
 src/documents/views.py       | 13 +++++++++++++
 src/paperless/urls.py        |  4 ++--
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py
index f23a482c6..345fa166d 100644
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -3,14 +3,14 @@ from rest_framework import serializers
 from .models import Sender, Tag, Document
 
 
-class SenderSerializer(serializers.ModelSerializer):
+class SenderSerializer(serializers.HyperlinkedModelSerializer):
 
     class Meta(object):
         model = Sender
         fields = ("id", "slug", "name")
 
 
-class TagSerializer(serializers.ModelSerializer):
+class TagSerializer(serializers.HyperlinkedModelSerializer):
 
     class Meta(object):
         model = Tag
@@ -20,8 +20,10 @@ class TagSerializer(serializers.ModelSerializer):
 
 class DocumentSerializer(serializers.ModelSerializer):
 
-    sender = serializers.HyperlinkedModelSerializer(read_only=True)
-    tags = serializers.HyperlinkedModelSerializer(read_only=True)
+    sender = serializers.HyperlinkedRelatedField(
+        read_only=True, view_name="drf:sender-detail", allow_null=True)
+    tags = serializers.HyperlinkedRelatedField(
+        read_only=True, view_name="drf:tag-detail", many=True)
 
     class Meta(object):
         model = Document
diff --git a/src/documents/views.py b/src/documents/views.py
index 45caf50e9..bcce6f677 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -3,6 +3,7 @@ from django.template.defaultfilters import slugify
 from django.views.decorators.csrf import csrf_exempt
 from django.views.generic import FormView, DetailView
 
+from rest_framework.pagination import PageNumberPagination
 from rest_framework.viewsets import ModelViewSet
 
 from paperless.db import GnuPG
@@ -57,16 +58,28 @@ class PushView(FormView):
         return HttpResponse("0")
 
 
+class StandardPagination(PageNumberPagination):
+    page_size = 25
+    page_size_query_param = "page-size"
+    max_page_size = 100000
+
+
 class SenderViewSet(ModelViewSet):
     model = Sender
+    queryset = Sender.objects.all()
     serializer_class = SenderSerializer
+    pagination_class = StandardPagination
 
 
 class TagViewSet(ModelViewSet):
     model = Tag
+    queryset = Tag.objects.all()
     serializer_class = TagSerializer
+    pagination_class = StandardPagination
 
 
 class DocumentViewSet(ModelViewSet):
     model = Document
+    queryset = Document.objects.all()
     serializer_class = DocumentSerializer
+    pagination_class = StandardPagination
diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index 5803a6685..b7ffe17dc 100644
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -33,9 +33,9 @@ urlpatterns = [
     # API
     url(
         r"^api/auth/",
-        include('rest_framework.urls', namespace='rest_framework')
+        include('rest_framework.urls', namespace="rest_framework")
     ),
-    url(r"^api/", include(router.urls)),
+    url(r"^api/", include(router.urls, namespace="drf")),
 
     # File downloads
     url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"),

From 0d466430269d87555ab8ae7473106f11098b36a7 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 01:24:30 +0000
Subject: [PATCH 16/71] Version bump

---
 docs/changelog.rst       | 36 ++++++++++++++++++++++++++++++++++--
 src/paperless/version.py |  2 +-
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/docs/changelog.rst b/docs/changelog.rst
index c56e7a367..5fdd2143b 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,10 +1,28 @@
 Changelog
 #########
 
+* 0.1.0
+
+  * Docker support!  Big thanks to `Wayne Werner`_, `Brian Conn`_, and
+    `Tikitu de Jager`_ for this one, and especially to `Pit`_
+    who spearheaded this effort.
+  * A simple REST API is in place, but it should be considered unstable.
+  * Cleaned up the consumer to use temporary directories instead of a single
+    scratch space.  (Thanks `Pit`_)
+  * Improved the efficiency of the consumer by parsing pages more intelligently
+    and introducing a threaded OCR process (thanks again `Pit`_).
+  * `#45`_: Cleaned up the logic for tag matching.  Reported by `darkmatter`_.
+  * `#47`_: Auto-rotate landscape documents.  Reported by `Paul`_ and fixed by
+    `Pit`_.
+  * `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
+  * `#54`_: Documented the re-tagger (`zedster`_)
+  * `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
+  * Added tox with pep8 checking
+
 * 0.0.6
 
-  * Added support for parallel OCR (significant work from pitkley)
-  * Sped up the language detection (significant work from pitkley)
+  * Added support for parallel OCR (significant work from `Pit`_)
+  * Sped up the language detection (significant work from `Pit`_)
   * Added simple logging
 
 * 0.0.5
@@ -35,3 +53,17 @@ Changelog
 * 0.0.1
 
   * Initial release
+
+.. _Wayne Werner: https://github.com/waynew
+.. _Brian Conn: https://github.com/TheConnMan
+.. _Tikitu de Jager: https://github.com/tikitu
+.. _Pit: https://github.com/pitkley
+.. _Paul: https://github.com/polo2ro
+.. _darkmatter: https://github.com/darkmatter
+.. _zedster: https://github.com/zedster
+
+.. _#45: https://github.com/danielquinn/paperless/issues/45
+.. _#47: https://github.com/danielquinn/paperless/issues/47
+.. _#48: https://github.com/danielquinn/paperless/issues/48
+.. _#54: https://github.com/danielquinn/paperless/issues/54
+.. _#57: https://github.com/danielquinn/paperless/issues/57
diff --git a/src/paperless/version.py b/src/paperless/version.py
index 7afad8b77..8e2c2d9ea 100644
--- a/src/paperless/version.py
+++ b/src/paperless/version.py
@@ -1 +1 @@
-__version__ = (0, 0, 6)
+__version__ = (0, 1, 0)

From c7787bc076b70e897f57bf137d224c80af08c840 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 01:37:57 +0000
Subject: [PATCH 17/71] Let's see if I can get Travis CI working on the first
 try

---
 src/.travis.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 src/.travis.yml

diff --git a/src/.travis.yml b/src/.travis.yml
new file mode 100644
index 000000000..cd0985dd3
--- /dev/null
+++ b/src/.travis.yml
@@ -0,0 +1,10 @@
+language: python
+sudo: false
+env:
+  - TOXENV=py34
+  - TOXENV=py35
+  - TOXENV=pep8
+install:
+  - pip install --requirement ../requirements.txt
+  - pip install tox
+script: tox

From 55a7dc2444db87bb74aafc9d452fc03f1be58b35 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 01:43:48 +0000
Subject: [PATCH 18/71] pep8

---
 src/paperless/settings.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 07918882f..5d7cc3b2f 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -170,7 +170,8 @@ SCRATCH_DIR = "/tmp/paperless"
 CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME")
 
 # If you want to use IMAP mail consumption, populate this with useful values.
-# If you leave HOST set to None, we assume you're not going to use this feature.
+# If you leave HOST set to None, we assume you're not going to use this
+# feature.
 MAIL_CONSUMPTION = {
     "HOST": os.environ.get("PAPERLESS_CONSUME_MAIL_HOST"),
     "PORT": os.environ.get("PAPERLESS_CONSUME_MAIL_PORT"),
@@ -180,8 +181,8 @@ MAIL_CONSUMPTION = {
     "INBOX": "INBOX"  # The name of the inbox on the server
 }
 
-# This is used to encrypt the original documents and decrypt them later when you
-# want to download them.  Set it and change the permissions on this file to
+# This is used to encrypt the original documents and decrypt them later when
+# you want to download them.  Set it and change the permissions on this file to
 # 0600, or set it to `None` and you'll be prompted for the passphrase at
 # runtime.  The default looks for an environment variable.
 # DON'T FORGET TO SET THIS as leaving it blank may cause some strange things
@@ -189,7 +190,7 @@ MAIL_CONSUMPTION = {
 # files.
 PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")
 
-# If you intend to use the "API" to push files into the consumer, you'll need to
-# provide a shared secret here.  Leaving this as the default will disable the
-# API.
+# If you intend to use the "API" to push files into the consumer, you'll need
+# to provide a shared secret here.  Leaving this as the default will disable
+# the API.
 UPLOAD_SHARED_SECRET = os.environ.get("PAPERLESS_SECRET", "")

From 6f7169d2d67f445637d12982a9f44c872bc7de89 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 01:46:49 +0000
Subject: [PATCH 19/71] Travis integration: take 3

---
 src/.travis.yml => .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename src/.travis.yml => .travis.yml (85%)

diff --git a/src/.travis.yml b/.travis.yml
similarity index 85%
rename from src/.travis.yml
rename to .travis.yml
index cd0985dd3..1d461d255 100644
--- a/src/.travis.yml
+++ b/.travis.yml
@@ -7,4 +7,4 @@ env:
 install:
   - pip install --requirement ../requirements.txt
   - pip install tox
-script: tox
+script: tox -c src/tox.ini

From e0b2d27e01090db5935887d9641b38411416aaeb Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 01:50:04 +0000
Subject: [PATCH 20/71] Travis integration: take 4

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 1d461d255..feba6c290 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,6 +5,6 @@ env:
   - TOXENV=py35
   - TOXENV=pep8
 install:
-  - pip install --requirement ../requirements.txt
+  - pip install --requirement requirements.txt
   - pip install tox
 script: tox -c src/tox.ini

From 300dc97e83adba66d330e9ed531bfa9f81a79856 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 01:53:10 +0000
Subject: [PATCH 21/71] Travis integration: take 5

---
 .travis.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index feba6c290..a83352daa 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,8 @@
 language: python
 sudo: false
+python:
+  - "3.4"
+  - "3.5"
 env:
   - TOXENV=py34
   - TOXENV=py35

From 5f0962bc3edf0987c79bb79143ca4e6e0cdfcb3c Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 01:58:09 +0000
Subject: [PATCH 22/71] Travis integration: take 6

---
 .travis.yml | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index a83352daa..6d3d5d217 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,13 +1,17 @@
 language: python
+
 sudo: false
-python:
-  - "3.4"
-  - "3.5"
-env:
-  - TOXENV=py34
-  - TOXENV=py35
-  - TOXENV=pep8
+
+matrix:
+    include:
+        - python: 3.4
+          env: TOXENV=py34
+        - python: 3.5
+          env: TOXENV=py35
+        - env: TOXENV=pep8
+
 install:
-  - pip install --requirement requirements.txt
-  - pip install tox
+    - pip install --requirement requirements.txt
+    - pip install tox
+
 script: tox -c src/tox.ini

From 3a7923e32dba6d76949788cecd361e6f19df04d4 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 02:24:05 +0000
Subject: [PATCH 23/71] Moved pyocr.get_available_tools() into a method

---
 .travis.yml               |  3 ++-
 src/documents/consumer.py | 27 +++++++++++++--------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 6d3d5d217..dcaaeab8d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,7 +8,8 @@ matrix:
           env: TOXENV=py34
         - python: 3.5
           env: TOXENV=py35
-        - env: TOXENV=pep8
+        - python: 3.5
+          env: TOXENV=pep8
 
 install:
     - pip install --requirement requirements.txt
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 6cf3b3d9d..2bd47c6da 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -26,18 +26,6 @@ from .models import Sender, Tag, Document
 from .languages import ISO639
 
 
-def image_to_string(args):
-    self, png, lang = args
-    with Image.open(os.path.join(self.SCRATCH, png)) as f:
-        if self.OCR.can_detect_orientation():
-            try:
-                orientation = self.OCR.detect_orientation(f, lang=lang)
-                f = f.rotate(orientation["angle"], expand=1)
-            except TesseractError:
-                pass
-        return self.OCR.image_to_string(f, lang=lang)
-
-
 class OCRError(Exception):
     pass
 
@@ -61,7 +49,6 @@ class Consumer(object):
     CONSUME = settings.CONSUMPTION_DIR
     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 
-    OCR = pyocr.get_available_tools()[0]
     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
 
     REGEX_TITLE = re.compile(
@@ -239,12 +226,24 @@ class Consumer(object):
 
         with Pool(processes=self.THREADS) as pool:
             r = pool.map(
-                image_to_string, itertools.product([self], pngs, [lang]))
+                self.image_to_string, itertools.product(pngs, [lang]))
             r = " ".join(r)
 
         # Strip out excess white space to allow matching to go smoother
         return re.sub(r"\s+", " ", r)
 
+    def image_to_string(self, args):
+        png, lang = args
+        ocr = pyocr.get_available_tools()[0]
+        with Image.open(os.path.join(self.SCRATCH, png)) as f:
+            if ocr.can_detect_orientation():
+                try:
+                    orientation = ocr.detect_orientation(f, lang=lang)
+                    f = f.rotate(orientation["angle"], expand=1)
+                except TesseractError:
+                    pass
+            return ocr.image_to_string(f, lang=lang)
+
     def _guess_attributes_from_name(self, parseable):
         """
         We use a crude naming convention to make handling the sender, title,

From 312cb523d817da7122444dbb7dd6abbdc72a183f Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 02:30:39 +0000
Subject: [PATCH 24/71] /tmp is probably better than /tmp/paperless/consume

---
 src/tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tox.ini b/src/tox.ini
index 962e39a19..1840b507e 100644
--- a/src/tox.ini
+++ b/src/tox.ini
@@ -11,7 +11,7 @@ envlist = py34, py35, pep8
 commands = {envpython} manage.py test
 deps = -r{toxinidir}/../requirements.txt
 setenv =
-    PAPERLESS_CONSUME=/tmp/paperless/consume
+    PAPERLESS_CONSUME=/tmp
     PAPERLESS_PASSPHRASE=THISISNOTASECRET
     PAPERLESS_SECRET=paperless
 

From b400c24dc8cb233c496b14d56f8df8c739df4975 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 21 Feb 2016 02:32:47 +0000
Subject: [PATCH 25/71] Adding travis badge

---
 README.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index cf9d387cc..0aba0545e 100644
--- a/README.rst
+++ b/README.rst
@@ -3,6 +3,7 @@ Paperless
 
 |Documentation|
 |Chat|
+|Travis|
 
 Scan, index, and archive all of your paper documents
 
@@ -105,4 +106,5 @@ home.
 .. |Chat| image:: https://badges.gitter.im/danielquinn/paperless.svg
    :alt: Join the chat at https://gitter.im/danielquinn/paperless
    :target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
-
+.. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master
+   :target: https://travis-ci.org/danielquinn/paperless

From 1aecb1e63a9fede4998612df8024990ba940cbfe Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Tue, 23 Feb 2016 20:15:13 +0000
Subject: [PATCH 26/71] Compensate for case and format of jpg vs. jpeg

---
 src/documents/consumer.py            | 17 ++++++++++++++---
 src/documents/tests/test_consumer.py | 12 ++++++++++--
 src/documents/views.py               |  2 +-
 src/paperless/urls.py                |  4 ++--
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 2bd47c6da..ddbe474a7 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -264,6 +264,12 @@ class Consumer(object):
                     Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
             return tuple(r)
 
+        def get_suffix(suffix):
+            suffix = suffix.lower()
+            if suffix == "jpeg":
+                return "jpg"
+            return suffix
+
         # First attempt: "<sender> - <title> - <tags>.<suffix>"
         m = re.match(self.REGEX_SENDER_TITLE_TAGS, parseable)
         if m:
@@ -271,17 +277,22 @@ class Consumer(object):
                 get_sender(m.group(1)),
                 m.group(2),
                 get_tags(m.group(3)),
-                m.group(4)
+                get_suffix(m.group(4))
             )
 
         # Second attempt: "<sender> - <title>.<suffix>"
         m = re.match(self.REGEX_SENDER_TITLE, parseable)
         if m:
-            return get_sender(m.group(1)), m.group(2), (), m.group(3)
+            return (
+                get_sender(m.group(1)),
+                m.group(2),
+                (),
+                get_suffix(m.group(3))
+            )
 
         # That didn't work, so we assume sender and tags are None
         m = re.match(self.REGEX_TITLE, parseable)
-        return None, m.group(1), (), m.group(2)
+        return None, m.group(1), (), get_suffix(m.group(2))
 
     def _store(self, text, doc):
 
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
index 6db501e02..04f92f98c 100644
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -7,15 +7,23 @@ class TestAttachment(TestCase):
 
     TAGS = ("tag1", "tag2", "tag3")
     CONSUMER = Consumer()
+    SUFFIXES = (
+        "pdf", "png", "jpg", "jpeg", "gif",
+        "PDF", "PNG", "JPG", "JPEG", "GIF",
+        "PdF", "PnG", "JpG", "JPeG", "GiF",
+    )
 
     def _test_guess_attributes_from_name(self, path, sender, title, tags):
-        for suffix in ("pdf", "png", "jpg", "jpeg", "gif"):
+        for suffix in self.SUFFIXES:
             f = path.format(suffix)
             results = self.CONSUMER._guess_attributes_from_name(f)
             self.assertEqual(results[0].name, sender, f)
             self.assertEqual(results[1], title, f)
             self.assertEqual(tuple([t.slug for t in results[2]]), tags, f)
-            self.assertEqual(results[3], suffix, f)
+            if suffix.lower() == "jpeg":
+                self.assertEqual(results[3], "jpg", f)
+            else:
+                self.assertEqual(results[3], suffix.lower(), f)
 
     def test_guess_attributes_from_name0(self):
         self._test_guess_attributes_from_name(
diff --git a/src/documents/views.py b/src/documents/views.py
index bcce6f677..26642c9fc 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -13,7 +13,7 @@ from .models import Sender, Tag, Document
 from .serialisers import SenderSerializer, TagSerializer, DocumentSerializer
 
 
-class PdfView(DetailView):
+class FetchView(DetailView):
 
     model = Document
 
diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index b7ffe17dc..fd1af065d 100644
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -21,7 +21,7 @@ from django.contrib import admin
 from rest_framework.routers import DefaultRouter
 
 from documents.views import (
-    PdfView, PushView, SenderViewSet, TagViewSet, DocumentViewSet)
+    FetchView, PushView, SenderViewSet, TagViewSet, DocumentViewSet)
 
 router = DefaultRouter()
 router.register(r'senders', SenderViewSet)
@@ -38,7 +38,7 @@ urlpatterns = [
     url(r"^api/", include(router.urls, namespace="drf")),
 
     # File downloads
-    url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"),
+    url(r"^fetch/(?P<pk>\d+)$", FetchView.as_view(), name="fetch"),
 
     # The Django admin
     url(r"", admin.site.urls),

From e149baec4ecf70f791981dd516f7eb0e212811fd Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Tue, 23 Feb 2016 20:30:33 +0000
Subject: [PATCH 27/71] Update for #53

---
 docs/changelog.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/changelog.rst b/docs/changelog.rst
index 5fdd2143b..86f365653 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,6 +1,11 @@
 Changelog
 #########
 
+* 0.1.1 (master)
+
+  * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
+    to be imported but made unavailable.
+
 * 0.1.0
 
   * Docker support!  Big thanks to `Wayne Werner`_, `Brian Conn`_, and
@@ -65,5 +70,6 @@ Changelog
 .. _#45: https://github.com/danielquinn/paperless/issues/45
 .. _#47: https://github.com/danielquinn/paperless/issues/47
 .. _#48: https://github.com/danielquinn/paperless/issues/48
+.. _#53: https://github.com/danielquinn/paperless/issues/53
 .. _#54: https://github.com/danielquinn/paperless/issues/54
 .. _#57: https://github.com/danielquinn/paperless/issues/57

From df1741e1fa1d43931ee21e0eb71ed497a97da512 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Fri, 26 Feb 2016 11:21:14 +0000
Subject: [PATCH 28/71] Added a time to the logger admin

---
 src/logger/admin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/logger/admin.py b/src/logger/admin.py
index dc9446821..b949f1908 100644
--- a/src/logger/admin.py
+++ b/src/logger/admin.py
@@ -5,7 +5,7 @@ from .models import Log
 
 class LogAdmin(admin.ModelAdmin):
 
-    list_display = ("message", "level", "component")
+    list_display = ("message", "time", "level", "component")
     list_filter = ("level", "component",)
 
 

From 7b9e55d2085fd2830743b72c1a38b28176679360 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Fri, 26 Feb 2016 11:21:24 +0000
Subject: [PATCH 29/71] Software updates

---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 25f4a0a40..810af8ec2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
-Django==1.9
+Django==1.9.2
 django-extensions==1.6.1
 djangorestframework==3.3.2
 filemagic==1.6
 langdetect==1.0.5
-Pillow==3.0.0
+Pillow==3.1.1
 pyocr==0.3.1
 python-dateutil==2.4.2
 python-gnupg==0.3.8

From 2fe9b0cbc129929127482771f5c4053a6ae09cc6 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sat, 27 Feb 2016 20:18:50 +0000
Subject: [PATCH 30/71] New logging appears to work

---
 docs/changelog.rst                            |   2 +
 src/documents/admin.py                        |  43 +++---
 src/documents/consumer.py                     | 109 ++++++++-------
 src/documents/loggers.py                      |  30 +++++
 src/documents/mail.py                         |  48 ++++---
 .../migrations/0010_log.py}                   |  18 +--
 src/documents/models.py                       |  32 +++++
 src/documents/tests/test_logger.py            | 124 ++++++++++++++++++
 src/logger/__init__.py                        |   0
 src/logger/admin.py                           |  12 --
 src/logger/apps.py                            |   5 -
 src/logger/migrations/__init__.py             |   0
 src/logger/models.py                          |  53 --------
 src/logger/tests.py                           |   3 -
 src/logger/views.py                           |   3 -
 src/paperless/settings.py                     |  52 +++++---
 16 files changed, 346 insertions(+), 188 deletions(-)
 create mode 100644 src/documents/loggers.py
 rename src/{logger/migrations/0001_initial.py => documents/migrations/0010_log.py} (57%)
 create mode 100644 src/documents/tests/test_logger.py
 delete mode 100644 src/logger/__init__.py
 delete mode 100644 src/logger/admin.py
 delete mode 100644 src/logger/apps.py
 delete mode 100644 src/logger/migrations/__init__.py
 delete mode 100644 src/logger/models.py
 delete mode 100644 src/logger/tests.py
 delete mode 100644 src/logger/views.py

diff --git a/docs/changelog.rst b/docs/changelog.rst
index 86f365653..cdb720926 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -3,6 +3,7 @@ Changelog
 
 * 0.1.1 (master)
 
+  * `#60`_: Setup logging to actually use the Python native logging framework.
   * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
     to be imported but made unavailable.
 
@@ -73,3 +74,4 @@ Changelog
 .. _#53: https://github.com/danielquinn/paperless/issues/53
 .. _#54: https://github.com/danielquinn/paperless/issues/54
 .. _#57: https://github.com/danielquinn/paperless/issues/57
+.. _#60: https://github.com/danielquinn/paperless/issues/60
diff --git a/src/documents/admin.py b/src/documents/admin.py
index 42c3fc968..118a295eb 100644
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -3,7 +3,7 @@ from django.contrib.auth.models import User, Group
 from django.core.urlresolvers import reverse
 from django.templatetags.static import static
 
-from .models import Sender, Tag, Document
+from .models import Sender, Tag, Document, Log
 
 
 class MonthListFilter(admin.SimpleListFilter):
@@ -57,7 +57,7 @@ class DocumentAdmin(admin.ModelAdmin):
         r = ""
         for tag in obj.tags.all():
             colour = tag.get_colour_display()
-            r += html_tag(
+            r += self._html_tag(
                 "a",
                 tag.slug,
                 **{
@@ -73,9 +73,9 @@ class DocumentAdmin(admin.ModelAdmin):
     tags_.allow_tags = True
 
     def document(self, obj):
-        return html_tag(
+        return self._html_tag(
             "a",
-            html_tag(
+            self._html_tag(
                 "img",
                 src=static("documents/img/{}.png".format(obj.file_type)),
                 width=22,
@@ -87,23 +87,32 @@ class DocumentAdmin(admin.ModelAdmin):
         )
     document.allow_tags = True
 
+    @staticmethod
+    def _html_tag(kind, inside=None, **kwargs):
+
+        attributes = []
+        for lft, rgt in kwargs.items():
+            attributes.append('{}="{}"'.format(lft, rgt))
+
+        if inside is not None:
+            return "<{kind} {attributes}>{inside}</{kind}>".format(
+                kind=kind, attributes=" ".join(attributes), inside=inside)
+
+        return "<{} {}/>".format(kind, " ".join(attributes))
+
+
+class LogAdmin(admin.ModelAdmin):
+
+    list_display = ("message", "level", "component")
+    list_filter = ("level", "component",)
+
+
 admin.site.register(Sender)
 admin.site.register(Tag, TagAdmin)
 admin.site.register(Document, DocumentAdmin)
+admin.site.register(Log, LogAdmin)
+
 
 # Unless we implement multi-user, these default registrations don't make sense.
 admin.site.unregister(Group)
 admin.site.unregister(User)
-
-
-def html_tag(kind, inside=None, **kwargs):
-
-    attributes = []
-    for lft, rgt in kwargs.items():
-        attributes.append('{}="{}"'.format(lft, rgt))
-
-    if inside is not None:
-        return "<{kind} {attributes}>{inside}</{kind}>".format(
-            kind=kind, attributes=" ".join(attributes), inside=inside)
-
-    return "<{} {}/>".format(kind, " ".join(attributes))
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index ddbe474a7..37b348495 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,5 +1,8 @@
 import datetime
+import logging
 import tempfile
+import uuid
+
 from multiprocessing.pool import Pool
 
 import itertools
@@ -19,10 +22,9 @@ from django.utils import timezone
 from django.template.defaultfilters import slugify
 from pyocr.tesseract import TesseractError
 
-from logger.models import Log
 from paperless.db import GnuPG
 
-from .models import Sender, Tag, Document
+from .models import Sender, Tag, Document, Log
 from .languages import ISO639
 
 
@@ -67,6 +69,8 @@ class Consumer(object):
     def __init__(self, verbosity=1):
 
         self.verbosity = verbosity
+        self.logger = logging.getLogger(__name__)
+        self.logging_group = None
 
         try:
             os.makedirs(self.SCRATCH)
@@ -86,6 +90,12 @@ class Consumer(object):
             raise ConsumerError(
                 "Consumption directory {} does not exist".format(self.CONSUME))
 
+    def log(self, level, message):
+        getattr(self.logger, level)(message, extra={
+            "group": self.logging_group,
+            "component": Log.COMPONENT_CONSUMER
+        })
+
     def consume(self):
 
         for doc in os.listdir(self.CONSUME):
@@ -104,7 +114,9 @@ class Consumer(object):
             if self._is_ready(doc):
                 continue
 
-            Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)
+            self.logging_group = uuid.uuid4()
+
+            self.log("info", "Consuming {}".format(doc))
 
             tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
             pngs = self._get_greyscale(tempdir, doc)
@@ -114,8 +126,7 @@ class Consumer(object):
                 self._store(text, doc)
             except OCRError:
                 self._ignore.append(doc)
-                Log.error(
-                    "OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
+                self.log("error", "OCR FAILURE: {}".format(doc))
                 self._cleanup_tempdir(tempdir)
                 continue
             else:
@@ -124,10 +135,7 @@ class Consumer(object):
 
     def _get_greyscale(self, tempdir, doc):
 
-        Log.debug(
-            "Generating greyscale image from {}".format(doc),
-            Log.COMPONENT_CONSUMER
-        )
+        self.log("info", "Generating greyscale image from {}".format(doc))
 
         png = os.path.join(tempdir, "convert-%04d.jpg")
 
@@ -143,18 +151,13 @@ class Consumer(object):
 
         return sorted(filter(lambda __: os.path.isfile(__), pngs))
 
-    @staticmethod
-    def _guess_language(text):
+    def _guess_language(self, text):
         try:
             guess = langdetect.detect(text)
-            Log.debug(
-                "Language detected: {}".format(guess),
-                Log.COMPONENT_CONSUMER
-            )
+            self.log("debug", "Language detected: {}".format(guess))
             return guess
         except Exception as e:
-            Log.warning(
-                "Language detection error: {}".format(e), Log.COMPONENT_MAIL)
+            self.log("warning", "Language detection error: {}".format(e))
 
     def _get_ocr(self, pngs):
         """
@@ -165,7 +168,7 @@ class Consumer(object):
         if not pngs:
             raise OCRError
 
-        Log.debug("OCRing the document", Log.COMPONENT_CONSUMER)
+        self.log("info", "OCRing the document")
 
         # Since the division gets rounded down by int, this calculation works
         # for every edge-case, i.e. 1
@@ -175,12 +178,12 @@ class Consumer(object):
         guessed_language = self._guess_language(raw_text)
 
         if not guessed_language or guessed_language not in ISO639:
-            Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER)
+            self.log("warning", "Language detection failed!")
             if settings.FORGIVING_OCR:
-                Log.warning(
+                self.log(
+                    "warning",
                     "As FORGIVING_OCR is enabled, we're going to make the "
-                    "best with what we have.",
-                    Log.COMPONENT_CONSUMER
+                    "best with what we have."
                 )
                 raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
                 return raw_text
@@ -194,12 +197,12 @@ class Consumer(object):
             return self._ocr(pngs, ISO639[guessed_language])
         except pyocr.pyocr.tesseract.TesseractError:
             if settings.FORGIVING_OCR:
-                Log.warning(
+                self.log(
+                    "warning",
                     "OCR for {} failed, but we're going to stick with what "
                     "we've got since FORGIVING_OCR is enabled.".format(
                         guessed_language
-                    ),
-                    Log.COMPONENT_CONSUMER
+                    )
                 )
                 raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
                 return raw_text
@@ -222,28 +225,15 @@ class Consumer(object):
         if not pngs:
             return ""
 
-        Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER)
+        self.log("info", "Parsing for {}".format(lang))
 
         with Pool(processes=self.THREADS) as pool:
-            r = pool.map(
-                self.image_to_string, itertools.product(pngs, [lang]))
+            r = pool.map(image_to_string, itertools.product(pngs, [lang]))
             r = " ".join(r)
 
         # Strip out excess white space to allow matching to go smoother
         return re.sub(r"\s+", " ", r)
 
-    def image_to_string(self, args):
-        png, lang = args
-        ocr = pyocr.get_available_tools()[0]
-        with Image.open(os.path.join(self.SCRATCH, png)) as f:
-            if ocr.can_detect_orientation():
-                try:
-                    orientation = ocr.detect_orientation(f, lang=lang)
-                    f = f.rotate(orientation["angle"], expand=1)
-                except TesseractError:
-                    pass
-            return ocr.image_to_string(f, lang=lang)
-
     def _guess_attributes_from_name(self, parseable):
         """
         We use a crude naming convention to make handling the sender, title,
@@ -301,7 +291,7 @@ class Consumer(object):
 
         stats = os.stat(doc)
 
-        Log.debug("Saving record to database", Log.COMPONENT_CONSUMER)
+        self.log("debug", "Saving record to database")
 
         document = Document.objects.create(
             sender=sender,
@@ -316,23 +306,22 @@ class Consumer(object):
 
         if relevant_tags:
             tag_names = ", ".join([t.slug for t in relevant_tags])
-            Log.debug(
-                "Tagging with {}".format(tag_names), Log.COMPONENT_CONSUMER)
+            self.log("debug", "Tagging with {}".format(tag_names))
             document.tags.add(*relevant_tags)
 
         with open(doc, "rb") as unencrypted:
             with open(document.source_path, "wb") as encrypted:
-                Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
+                self.log("debug", "Encrypting")
                 encrypted.write(GnuPG.encrypted(unencrypted))
 
-    @staticmethod
-    def _cleanup_tempdir(d):
-        Log.debug("Deleting directory {}".format(d), Log.COMPONENT_CONSUMER)
+        self.log("info", "Completed")
+
+    def _cleanup_tempdir(self, d):
+        self.log("debug", "Deleting directory {}".format(d))
         shutil.rmtree(d)
 
-    @staticmethod
-    def _cleanup_doc(doc):
-        Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
+    def _cleanup_doc(self, doc):
+        self.log("debug", "Deleting document {}".format(doc))
         os.unlink(doc)
 
     def _is_ready(self, doc):
@@ -350,3 +339,23 @@ class Consumer(object):
         self.stats[doc] = t
 
         return False
+
+
+def image_to_string(args):
+    """
+    I have no idea why, but if this function were a method of Consumer, it
+    would explode with:
+
+      `TypeError: cannot serialize '_io.TextIOWrapper' object`.
+    """
+
+    png, lang = args
+    ocr = pyocr.get_available_tools()[0]
+    with Image.open(os.path.join(Consumer.SCRATCH, png)) as f:
+        if ocr.can_detect_orientation():
+            try:
+                orientation = ocr.detect_orientation(f, lang=lang)
+                f = f.rotate(orientation["angle"], expand=1)
+            except TesseractError:
+                pass
+        return ocr.image_to_string(f, lang=lang)
diff --git a/src/documents/loggers.py b/src/documents/loggers.py
new file mode 100644
index 000000000..3464478cc
--- /dev/null
+++ b/src/documents/loggers.py
@@ -0,0 +1,30 @@
+import logging
+
+
+class PaperlessLogger(logging.StreamHandler):
+    """
+    A logger smart enough to know to log some kinds of messages to the database
+    for later retrieval in a pretty interface.
+    """
+
+    def emit(self, record):
+
+        logging.StreamHandler.emit(self, record)
+
+        if not hasattr(record, "component"):
+            return
+
+        # We have to do the import here or Django will barf when it tries to
+        # load this because the apps aren't loaded at that point
+        from .models import Log
+
+        kwargs = {
+            "message": record.msg,
+            "component": record.component,
+            "level": record.levelno,
+        }
+
+        if hasattr(record, "group"):
+            kwargs["group"] = record.group
+
+        Log.objects.create(**kwargs)
diff --git a/src/documents/mail.py b/src/documents/mail.py
index 384567e60..cc987bf64 100644
--- a/src/documents/mail.py
+++ b/src/documents/mail.py
@@ -1,8 +1,10 @@
 import datetime
 import imaplib
+import logging
 import os
 import re
 import time
+import uuid
 
 from base64 import b64decode
 from email import policy
@@ -11,10 +13,8 @@ from dateutil import parser
 
 from django.conf import settings
 
-from logger.models import Log
-
 from .consumer import Consumer
-from .models import Sender
+from .models import Sender, Log
 
 
 class MailFetcherError(Exception):
@@ -25,7 +25,20 @@ class InvalidMessageError(Exception):
     pass
 
 
-class Message(object):
+class Loggable(object):
+
+    def __init__(self, group=None):
+        self.logger = logging.getLogger(__name__)
+        self.logging_group = group or uuid.uuid4()
+
+    def log(self, level, message):
+        getattr(self.logger, level)(message, extra={
+            "group": self.logging_group,
+            "component": Log.COMPONENT_MAIL
+        })
+
+
+class Message(Loggable):
     """
     A crude, but simple email message class.  We assume that there's a subject
     and n attachments, and that we don't care about the message body.
@@ -33,13 +46,13 @@ class Message(object):
 
     SECRET = settings.UPLOAD_SHARED_SECRET
 
-    def __init__(self, data, verbosity=1):
+    def __init__(self, data, group=None):
         """
         Cribbed heavily from
         https://www.ianlewis.org/en/parsing-email-attachments-python
         """
 
-        self.verbosity = verbosity
+        Loggable.__init__(self, group=group)
 
         self.subject = None
         self.time = None
@@ -54,8 +67,7 @@ class Message(object):
 
         self._set_time(message)
 
-        Log.info(
-            'Importing email: "{}"'.format(self.subject), Log.COMPONENT_MAIL)
+        self.log("info", 'Importing email: "{}"'.format(self.subject))
 
         attachments = []
         for part in message.walk():
@@ -134,9 +146,11 @@ class Attachment(object):
         return self.data
 
 
-class MailFetcher(object):
+class MailFetcher(Loggable):
 
-    def __init__(self, verbosity=1):
+    def __init__(self):
+
+        Loggable.__init__(self)
 
         self._connection = None
         self._host = settings.MAIL_CONSUMPTION["HOST"]
@@ -148,7 +162,6 @@ class MailFetcher(object):
         self._enabled = bool(self._host)
 
         self.last_checked = datetime.datetime.now()
-        self.verbosity = verbosity
 
     def pull(self):
         """
@@ -159,14 +172,11 @@ class MailFetcher(object):
 
         if self._enabled:
 
-            Log.info("Checking mail", Log.COMPONENT_MAIL)
+            self.log("info", "Checking mail")
 
             for message in self._get_messages():
 
-                Log.debug(
-                    'Storing email: "{}"'.format(message.subject),
-                    Log.COMPONENT_MAIL
-                )
+                self.log("info", 'Storing email: "{}"'.format(message.subject))
 
                 t = int(time.mktime(message.time.timetuple()))
                 file_name = os.path.join(Consumer.CONSUME, message.file_name)
@@ -193,7 +203,7 @@ class MailFetcher(object):
             self._connection.logout()
 
         except Exception as e:
-            Log.error(e, Log.COMPONENT_MAIL)
+            self.log("error", str(e))
 
         return r
 
@@ -218,9 +228,9 @@ class MailFetcher(object):
 
             message = None
             try:
-                message = Message(data[0][1], self.verbosity)
+                message = Message(data[0][1], self.logging_group)
             except InvalidMessageError as e:
-                Log.error(e, Log.COMPONENT_MAIL)
+                self.log("error", str(e))
             else:
                 self._connection.store(num, "+FLAGS", "\\Deleted")
 
diff --git a/src/logger/migrations/0001_initial.py b/src/documents/migrations/0010_log.py
similarity index 57%
rename from src/logger/migrations/0001_initial.py
rename to src/documents/migrations/0010_log.py
index 029fe43c2..57cf804b7 100644
--- a/src/logger/migrations/0001_initial.py
+++ b/src/documents/migrations/0010_log.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Generated by Django 1.9 on 2016-02-14 16:08
+# Generated by Django 1.9 on 2016-02-27 17:54
 from __future__ import unicode_literals
 
 from django.db import migrations, models
@@ -7,9 +7,8 @@ from django.db import migrations, models
 
 class Migration(migrations.Migration):
 
-    initial = True
-
     dependencies = [
+        ('documents', '0009_auto_20160214_0040'),
     ]
 
     operations = [
@@ -17,14 +16,15 @@ class Migration(migrations.Migration):
             name='Log',
             fields=[
                 ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('time', models.DateTimeField(auto_now_add=True)),
+                ('group', models.UUIDField(blank=True)),
                 ('message', models.TextField()),
-                ('level', models.PositiveIntegerField(choices=[(1, 'Error'), (2, 'Warning'), (3, 'Informational'), (4, 'Debugging')], default=3)),
+                ('level', models.PositiveIntegerField(choices=[(10, 'Debugging'), (20, 'Informational'), (30, 'Warning'), (40, 'Error'), (50, 'Critical')], default=20)),
                 ('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])),
+                ('created', models.DateTimeField(auto_now_add=True)),
+                ('modified', models.DateTimeField(auto_now=True)),
             ],
-        ),
-        migrations.AlterModelOptions(
-            name='log',
-            options={'ordering': ('-time',)},
+            options={
+                'ordering': ('-modified',),
+            },
         ),
     ]
diff --git a/src/documents/models.py b/src/documents/models.py
index 267bebffe..91dd458ea 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -1,3 +1,4 @@
+import logging
 import os
 import re
 
@@ -187,3 +188,34 @@ class Document(models.Model):
     @property
     def download_url(self):
         return reverse("fetch", kwargs={"pk": self.pk})
+
+
+class Log(models.Model):
+
+    LEVELS = (
+        (logging.DEBUG, "Debugging"),
+        (logging.INFO, "Informational"),
+        (logging.WARNING, "Warning"),
+        (logging.ERROR, "Error"),
+        (logging.CRITICAL, "Critical"),
+    )
+
+    COMPONENT_CONSUMER = 1
+    COMPONENT_MAIL = 2
+    COMPONENTS = (
+        (COMPONENT_CONSUMER, "Consumer"),
+        (COMPONENT_MAIL, "Mail Fetcher")
+    )
+
+    group = models.UUIDField(blank=True)
+    message = models.TextField()
+    level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO)
+    component = models.PositiveIntegerField(choices=COMPONENTS)
+    created = models.DateTimeField(auto_now_add=True)
+    modified = models.DateTimeField(auto_now=True)
+
+    class Meta(object):
+        ordering = ("-modified",)
+
+    def __str__(self):
+        return self.message
diff --git a/src/documents/tests/test_logger.py b/src/documents/tests/test_logger.py
new file mode 100644
index 000000000..d5527d7c6
--- /dev/null
+++ b/src/documents/tests/test_logger.py
@@ -0,0 +1,124 @@
+import logging
+import uuid
+
+from unittest import mock
+
+from django.test import TestCase
+
+from ..models import Log
+
+
+class TestPaperlessLog(TestCase):
+
+    def __init__(self, *args, **kwargs):
+        TestCase.__init__(self, *args, **kwargs)
+        self.logger = logging.getLogger(
+            "documents.management.commands.document_consumer")
+
+    def test_ignored(self):
+        with mock.patch("logging.StreamHandler.emit") as __:
+            self.assertEqual(Log.objects.all().count(), 0)
+            self.logger.info("This is an informational message")
+            self.logger.warning("This is an informational message")
+            self.logger.error("This is an informational message")
+            self.logger.critical("This is an informational message")
+            self.assertEqual(Log.objects.all().count(), 0)
+
+    def test_that_it_saves_at_all(self):
+
+        kw = {
+            "group": uuid.uuid4(),
+            "component": Log.COMPONENT_MAIL
+        }
+
+        self.assertEqual(Log.objects.all().count(), 0)
+
+        with mock.patch("logging.StreamHandler.emit") as __:
+
+            # Debug messages are ignored by default
+            self.logger.debug("This is a debugging message", extra=kw)
+            self.assertEqual(Log.objects.all().count(), 0)
+
+            self.logger.info("This is an informational message", extra=kw)
+            self.assertEqual(Log.objects.all().count(), 1)
+
+            self.logger.warning("This is an warning message", extra=kw)
+            self.assertEqual(Log.objects.all().count(), 2)
+
+            self.logger.error("This is an error message", extra=kw)
+            self.assertEqual(Log.objects.all().count(), 3)
+
+            self.logger.critical("This is a critical message", extra=kw)
+            self.assertEqual(Log.objects.all().count(), 4)
+
+    def test_groups(self):
+
+        kw1 = {
+            "group": uuid.uuid4(),
+            "component": Log.COMPONENT_MAIL
+        }
+        kw2 = {
+            "group": uuid.uuid4(),
+            "component": Log.COMPONENT_MAIL
+        }
+
+        self.assertEqual(Log.objects.all().count(), 0)
+
+        with mock.patch("logging.StreamHandler.emit") as __:
+
+            # Debug messages are ignored by default
+            self.logger.debug("This is a debugging message", extra=kw1)
+            self.assertEqual(Log.objects.all().count(), 0)
+
+            self.logger.info("This is an informational message", extra=kw2)
+            self.assertEqual(Log.objects.all().count(), 1)
+            self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 1)
+
+            self.logger.warning("This is an warning message", extra=kw1)
+            self.assertEqual(Log.objects.all().count(), 2)
+            self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 1)
+
+            self.logger.error("This is an error message", extra=kw2)
+            self.assertEqual(Log.objects.all().count(), 3)
+            self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 2)
+
+            self.logger.critical("This is a critical message", extra=kw1)
+            self.assertEqual(Log.objects.all().count(), 4)
+            self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 2)
+
+    def test_components(self):
+
+        c1 = Log.COMPONENT_CONSUMER
+        c2 = Log.COMPONENT_MAIL
+        kw1 = {
+            "group": uuid.uuid4(),
+            "component": c1
+        }
+        kw2 = {
+            "group": kw1["group"],
+            "component": c2
+        }
+
+        self.assertEqual(Log.objects.all().count(), 0)
+
+        with mock.patch("logging.StreamHandler.emit") as __:
+
+            # Debug messages are ignored by default
+            self.logger.debug("This is a debugging message", extra=kw1)
+            self.assertEqual(Log.objects.all().count(), 0)
+
+            self.logger.info("This is an informational message", extra=kw2)
+            self.assertEqual(Log.objects.all().count(), 1)
+            self.assertEqual(Log.objects.filter(component=c2).count(), 1)
+
+            self.logger.warning("This is an warning message", extra=kw1)
+            self.assertEqual(Log.objects.all().count(), 2)
+            self.assertEqual(Log.objects.filter(component=c1).count(), 1)
+
+            self.logger.error("This is an error message", extra=kw2)
+            self.assertEqual(Log.objects.all().count(), 3)
+            self.assertEqual(Log.objects.filter(component=c2).count(), 2)
+
+            self.logger.critical("This is a critical message", extra=kw1)
+            self.assertEqual(Log.objects.all().count(), 4)
+            self.assertEqual(Log.objects.filter(component=c1).count(), 2)
diff --git a/src/logger/__init__.py b/src/logger/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/logger/admin.py b/src/logger/admin.py
deleted file mode 100644
index dc9446821..000000000
--- a/src/logger/admin.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from django.contrib import admin
-
-from .models import Log
-
-
-class LogAdmin(admin.ModelAdmin):
-
-    list_display = ("message", "level", "component")
-    list_filter = ("level", "component",)
-
-
-admin.site.register(Log, LogAdmin)
diff --git a/src/logger/apps.py b/src/logger/apps.py
deleted file mode 100644
index 2c1a7d735..000000000
--- a/src/logger/apps.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from django.apps import AppConfig
-
-
-class LoggerConfig(AppConfig):
-    name = 'logger'
diff --git a/src/logger/migrations/__init__.py b/src/logger/migrations/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/logger/models.py b/src/logger/models.py
deleted file mode 100644
index f7f2c421a..000000000
--- a/src/logger/models.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from django.db import models
-
-
-class Log(models.Model):
-
-    LEVEL_ERROR = 1
-    LEVEL_WARNING = 2
-    LEVEL_INFO = 3
-    LEVEL_DEBUG = 4
-    LEVELS = (
-        (LEVEL_ERROR, "Error"),
-        (LEVEL_WARNING, "Warning"),
-        (LEVEL_INFO, "Informational"),
-        (LEVEL_DEBUG, "Debugging"),
-    )
-
-    COMPONENT_CONSUMER = 1
-    COMPONENT_MAIL = 2
-    COMPONENTS = (
-        (COMPONENT_CONSUMER, "Consumer"),
-        (COMPONENT_MAIL, "Mail Fetcher")
-    )
-
-    time = models.DateTimeField(auto_now_add=True)
-    message = models.TextField()
-    level = models.PositiveIntegerField(choices=LEVELS, default=LEVEL_INFO)
-    component = models.PositiveIntegerField(choices=COMPONENTS)
-
-    class Meta(object):
-        ordering = ("-time",)
-
-    def __str__(self):
-        return self.message
-
-    @classmethod
-    def error(cls, message, component):
-        cls.objects.create(
-            message=message, level=cls.LEVEL_ERROR, component=component)
-
-    @classmethod
-    def warning(cls, message, component):
-        cls.objects.create(
-            message=message, level=cls.LEVEL_WARNING, component=component)
-
-    @classmethod
-    def info(cls, message, component):
-        cls.objects.create(
-            message=message, level=cls.LEVEL_INFO, component=component)
-
-    @classmethod
-    def debug(cls, message, component):
-        cls.objects.create(
-            message=message, level=cls.LEVEL_DEBUG, component=component)
diff --git a/src/logger/tests.py b/src/logger/tests.py
deleted file mode 100644
index 7ce503c2d..000000000
--- a/src/logger/tests.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.
diff --git a/src/logger/views.py b/src/logger/views.py
deleted file mode 100644
index 91ea44a21..000000000
--- a/src/logger/views.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.shortcuts import render
-
-# Create your views here.
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 5d7cc3b2f..1f7bb6d0a 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -42,7 +42,6 @@ INSTALLED_APPS = [
     "django_extensions",
 
     "documents",
-    "logger",
 
     "rest_framework",
 
@@ -89,12 +88,12 @@ DATABASES = {
         "NAME": os.path.join(BASE_DIR, "..", "data", "db.sqlite3"),
     }
 }
-if os.environ.get("PAPERLESS_DBUSER") and os.environ.get("PAPERLESS_DBPASS"):
+if os.getenv("PAPERLESS_DBUSER") and os.getenv("PAPERLESS_DBPASS"):
     DATABASES["default"] = {
         "ENGINE": "django.db.backends.postgresql_psycopg2",
-        "NAME": os.environ.get("PAPERLESS_DBNAME", "paperless"),
-        "USER": os.environ.get("PAPERLESS_DBUSER"),
-        "PASSWORD": os.environ.get("PAPERLESS_DBPASS")
+        "NAME": os.getenv("PAPERLESS_DBNAME", "paperless"),
+        "USER": os.getenv("PAPERLESS_DBUSER"),
+        "PASSWORD": os.getenv("PAPERLESS_DBPASS")
     }
 
 
@@ -141,6 +140,25 @@ STATIC_URL = '/static/'
 MEDIA_URL = "/media/"
 
 
+# Logging
+
+LOGGING = {
+    "version": 1,
+    "disable_existing_loggers": False,
+    "handlers": {
+        "consumer": {
+            "class": "documents.loggers.PaperlessLogger",
+        }
+    },
+    "loggers": {
+        "documents": {
+            "handlers": ["consumer"],
+            "level": os.getenv("PAPERLESS_CONSUMER_LOG_LEVEL", "INFO"),
+        },
+    },
+}
+
+
 # Paperless-specific stuffs
 # Change these paths if yours are different
 # ----------------------------------------------------------------------------
@@ -150,15 +168,15 @@ MEDIA_URL = "/media/"
 OCR_LANGUAGE = "eng"
 
 # The amount of threads to use for OCR
-OCR_THREADS = os.environ.get("PAPERLESS_OCR_THREADS")
+OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
 
-# If this is true, any failed attempts to OCR a PDF will result in the PDF being
-# indexed anyway, with whatever we could get.  If it's False, the file will
-# simply be left in the CONSUMPTION_DIR.
+# If this is true, any failed attempts to OCR a PDF will result in the PDF
+# being indexed anyway, with whatever we could get.  If it's False, the file
+# will simply be left in the CONSUMPTION_DIR.
 FORGIVING_OCR = True
 
 # GNUPG needs a home directory for some reason
-GNUPG_HOME = os.environ.get("HOME", "/tmp")
+GNUPG_HOME = os.getenv("HOME", "/tmp")
 
 # Convert is part of the Imagemagick package
 CONVERT_BINARY = "/usr/bin/convert"
@@ -167,16 +185,16 @@ CONVERT_BINARY = "/usr/bin/convert"
 SCRATCH_DIR = "/tmp/paperless"
 
 # This is where Paperless will look for PDFs to index
-CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME")
+CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUME")
 
 # If you want to use IMAP mail consumption, populate this with useful values.
 # If you leave HOST set to None, we assume you're not going to use this
 # feature.
 MAIL_CONSUMPTION = {
-    "HOST": os.environ.get("PAPERLESS_CONSUME_MAIL_HOST"),
-    "PORT": os.environ.get("PAPERLESS_CONSUME_MAIL_PORT"),
-    "USERNAME": os.environ.get("PAPERLESS_CONSUME_MAIL_USER"),
-    "PASSWORD": os.environ.get("PAPERLESS_CONSUME_MAIL_PASS"),
+    "HOST": os.getenv("PAPERLESS_CONSUME_MAIL_HOST"),
+    "PORT": os.getenv("PAPERLESS_CONSUME_MAIL_PORT"),
+    "USERNAME": os.getenv("PAPERLESS_CONSUME_MAIL_USER"),
+    "PASSWORD": os.getenv("PAPERLESS_CONSUME_MAIL_PASS"),
     "USE_SSL": True,  # If True, use SSL/TLS to connect
     "INBOX": "INBOX"  # The name of the inbox on the server
 }
@@ -188,9 +206,9 @@ MAIL_CONSUMPTION = {
 # DON'T FORGET TO SET THIS as leaving it blank may cause some strange things
 # with GPG, including an interesting case where it may "encrypt" zero-byte
 # files.
-PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")
+PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE")
 
 # If you intend to use the "API" to push files into the consumer, you'll need
 # to provide a shared secret here.  Leaving this as the default will disable
 # the API.
-UPLOAD_SHARED_SECRET = os.environ.get("PAPERLESS_SECRET", "")
+UPLOAD_SHARED_SECRET = os.getenv("PAPERLESS_SECRET", "")

From 51173d80cf1a153cab5f5ec91461960b5aacfbe9 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sat, 27 Feb 2016 20:19:09 +0000
Subject: [PATCH 31/71] License clarification

---
 src/documents/management/commands/loaddata_stdin.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/documents/management/commands/loaddata_stdin.py b/src/documents/management/commands/loaddata_stdin.py
index ca0b9ef7b..9cce7a047 100644
--- a/src/documents/management/commands/loaddata_stdin.py
+++ b/src/documents/management/commands/loaddata_stdin.py
@@ -1,17 +1,14 @@
-"""
-Source:
-    https://gist.github.com/bmispelon/ad5a2c333443b3a1d051
-
-License:
-    MIT
-    Copyright (c) 2016 Baptiste Mispelon
-"""
 import sys
 
 from django.core.management.commands.loaddata import Command as LoadDataCommand
 
 
 class Command(LoadDataCommand):
+    """
+    Allow the loading of data from standard in.  Sourced originally from:
+    https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 (MIT licensed)
+    """
+
     def parse_name(self, fixture_name):
         self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)
         if fixture_name == '-':

From a4d89ed1244f27b89905b7adc6ae3410aeb9c858 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sat, 27 Feb 2016 20:50:48 +0000
Subject: [PATCH 32/71] Fix the test to ignore verbosity

---
 src/documents/tests/test_mail.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/documents/tests/test_mail.py b/src/documents/tests/test_mail.py
index 9a9480db4..366dc97b9 100644
--- a/src/documents/tests/test_mail.py
+++ b/src/documents/tests/test_mail.py
@@ -27,7 +27,7 @@ class TestMessage(TestCase):
 
         with open(self.sample, "rb") as f:
 
-            message = Message(f.read(), verbosity=0)
+            message = Message(f.read())
 
             self.assertTrue(message)
             self.assertEqual(message.subject, "Test 0")

From 631aa99d9299ce17c382cfd9b206a15c7ef2f186 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 28 Feb 2016 00:39:40 +0000
Subject: [PATCH 33/71] No need to pass verbosity around anymore

---
 src/documents/consumer.py                              | 3 +--
 src/documents/management/commands/document_consumer.py | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 37b348495..f3d5b71cb 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -66,9 +66,8 @@ class Consumer(object):
         flags=re.IGNORECASE
     )
 
-    def __init__(self, verbosity=1):
+    def __init__(self):
 
-        self.verbosity = verbosity
         self.logger = logging.getLogger(__name__)
         self.logging_group = None
 
diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py
index ae72381e2..0eae5c80c 100644
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@@ -34,7 +34,7 @@ class Command(BaseCommand):
         self.verbosity = options["verbosity"]
 
         try:
-            self.file_consumer = Consumer(verbosity=self.verbosity)
+            self.file_consumer = Consumer()
             self.mail_fetcher = MailFetcher()
         except (ConsumerError, MailFetcherError) as e:
             raise CommandError(e)

From d686aba9ae497fd83216870a278657351c3b17d2 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 28 Feb 2016 00:40:08 +0000
Subject: [PATCH 34/71] Reset the group id for every pull

---
 src/documents/mail.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/documents/mail.py b/src/documents/mail.py
index cc987bf64..a5f5416cb 100644
--- a/src/documents/mail.py
+++ b/src/documents/mail.py
@@ -172,6 +172,9 @@ class MailFetcher(Loggable):
 
         if self._enabled:
 
+            # Reset the grouping id for each fetch
+            self.logging_group = uuid.uuid4()
+
             self.log("info", "Checking mail")
 
             for message in self._get_messages():

From 5a8e75112f7cc59358e85699caa6836b2f3a4451 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 28 Feb 2016 00:41:03 +0000
Subject: [PATCH 35/71] Added a custom manager for grouped logs

---
 src/documents/managers.py | 70 +++++++++++++++++++++++++++++++++++++++
 src/documents/models.py   |  4 +++
 2 files changed, 74 insertions(+)
 create mode 100644 src/documents/managers.py

diff --git a/src/documents/managers.py b/src/documents/managers.py
new file mode 100644
index 000000000..d7e7225eb
--- /dev/null
+++ b/src/documents/managers.py
@@ -0,0 +1,70 @@
+from django.conf import settings
+
+from django.db import models
+from django.db.models.aggregates import Max
+
+
+class Concat(models.Aggregate):
+    """
+    Theoretically, this should work in Sqlite, PostgreSQL, and MySQL, but I've
+    only ever tested it in Sqlite.
+    """
+
+    ENGINE_SQLITE = 1
+    ENGINE_POSTGRESQL = 2
+    ENGINE_MYSQL = 3
+    ENGINES = {
+        "django.db.backends.sqlite3": ENGINE_SQLITE,
+        "django.db.backends.postgresql_psycopg2": ENGINE_POSTGRESQL,
+        "django.db.backends.postgresql": ENGINE_POSTGRESQL,
+        "django.db.backends.mysql": ENGINE_MYSQL
+    }
+
+    def __init__(self, expression, separator="\n", **extra):
+
+        self.engine = self._get_engine()
+        self.function = self._get_function()
+        self.template = self._get_template(separator)
+
+        models.Aggregate.__init__(
+            self,
+            expression,
+            output_field=models.CharField(),
+            **extra
+        )
+
+    def _get_engine(self):
+        engine = settings.DATABASES["default"]["ENGINE"]
+        try:
+            return self.ENGINES[engine]
+        except KeyError:
+            raise NotImplementedError(
+                "There's currently no support for {} when it comes to group "
+                "concatenation in Paperless".format(engine)
+            )
+
+    def _get_function(self):
+        if self.engine == self.ENGINE_POSTGRESQL:
+            return "STRING_AGG"
+        return "GROUP_CONCAT"
+
+    def _get_template(self, separator):
+        if self.engine == self.ENGINE_MYSQL:
+            return "%(function)s(%(expressions)s, SEPARATOR '{}')".format(
+                separator)
+        return "%(function)s(%(expressions)s, '{}')".format(separator)
+
+
+class LogQuerySet(models.query.QuerySet):
+
+    def by_group(self):
+        return self.values("group").annotate(
+            time=Max("modified"),
+            messages=Concat("message"),
+        ).order_by("-time")
+
+
+class LogManager(models.Manager):
+
+    def get_queryset(self):
+        return LogQuerySet(self.model, using=self._db)
diff --git a/src/documents/models.py b/src/documents/models.py
index 91dd458ea..e5556534a 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -8,6 +8,8 @@ from django.db import models
 from django.template.defaultfilters import slugify
 from django.utils import timezone
 
+from .managers import LogManager
+
 
 class SluggedModel(models.Model):
 
@@ -214,6 +216,8 @@ class Log(models.Model):
     created = models.DateTimeField(auto_now_add=True)
     modified = models.DateTimeField(auto_now=True)
 
+    objects = LogManager()
+
     class Meta(object):
         ordering = ("-modified",)
 

From 86878923322484cdbdf9f8898924d68df7575fb5 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 28 Feb 2016 00:52:44 +0000
Subject: [PATCH 36/71] Don't print to standard out during a test

---
 src/documents/tests/test_mail.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/documents/tests/test_mail.py b/src/documents/tests/test_mail.py
index 366dc97b9..256c77231 100644
--- a/src/documents/tests/test_mail.py
+++ b/src/documents/tests/test_mail.py
@@ -3,6 +3,7 @@ import os
 import magic
 
 from hashlib import md5
+from unittest import mock
 
 from django.conf import settings
 from django.test import TestCase
@@ -27,7 +28,8 @@ class TestMessage(TestCase):
 
         with open(self.sample, "rb") as f:
 
-            message = Message(f.read())
+            with mock.patch("logging.StreamHandler.emit") as __:
+                message = Message(f.read())
 
             self.assertTrue(message)
             self.assertEqual(message.subject, "Test 0")

From 9379e95446869fb01718ef21a1e2f26a9b730b6b Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 28 Feb 2016 00:53:18 +0000
Subject: [PATCH 37/71] Added a test for the new by_group() feature

---
 src/documents/tests/test_logger.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/documents/tests/test_logger.py b/src/documents/tests/test_logger.py
index d5527d7c6..8d31fe4ec 100644
--- a/src/documents/tests/test_logger.py
+++ b/src/documents/tests/test_logger.py
@@ -122,3 +122,21 @@ class TestPaperlessLog(TestCase):
             self.logger.critical("This is a critical message", extra=kw1)
             self.assertEqual(Log.objects.all().count(), 4)
             self.assertEqual(Log.objects.filter(component=c1).count(), 2)
+
+    def test_groupped_query(self):
+
+        kw = {
+            "group": uuid.uuid4(),
+            "component": Log.COMPONENT_MAIL
+        }
+        with mock.patch("logging.StreamHandler.emit") as __:
+            self.logger.info("Message 0", extra=kw)
+            self.logger.info("Message 1", extra=kw)
+            self.logger.info("Message 2", extra=kw)
+            self.logger.info("Message 3", extra=kw)
+
+        self.assertEqual(Log.objects.all().by_group().count(), 1)
+        self.assertEqual(
+            Log.objects.all().by_group()[0]["Messages"],
+            "Message 0\nMessage 1\nMessage 2\nMessage 3"
+        )

From 85f59638519168b7b85a8889a24b99478acad880 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 28 Feb 2016 15:02:18 +0000
Subject: [PATCH 38/71] Fixt capitalisation

---
 src/documents/tests/test_logger.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/documents/tests/test_logger.py b/src/documents/tests/test_logger.py
index 8d31fe4ec..23cea13e7 100644
--- a/src/documents/tests/test_logger.py
+++ b/src/documents/tests/test_logger.py
@@ -137,6 +137,6 @@ class TestPaperlessLog(TestCase):
 
         self.assertEqual(Log.objects.all().by_group().count(), 1)
         self.assertEqual(
-            Log.objects.all().by_group()[0]["Messages"],
+            Log.objects.all().by_group()[0]["messages"],
             "Message 0\nMessage 1\nMessage 2\nMessage 3"
         )

From 26fc27da9bf21e3e3de1edb605a63ab59aaf4492 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Tue, 1 Mar 2016 18:57:12 +0000
Subject: [PATCH 39/71] Setting appropriate permissions

---
 src/documents/serialisers.py | 15 ++++++++++++++-
 src/documents/views.py       | 35 ++++++++++++++++++++++++++++-------
 src/paperless/urls.py        |  3 ++-
 3 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py
index 345fa166d..f9b29f790 100644
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -1,6 +1,6 @@
 from rest_framework import serializers
 
-from .models import Sender, Tag, Document
+from .models import Sender, Tag, Document, Log
 
 
 class SenderSerializer(serializers.HyperlinkedModelSerializer):
@@ -39,3 +39,16 @@ class DocumentSerializer(serializers.ModelSerializer):
             "file_name",
             "download_url"
         )
+
+
+class LogSerializer(serializers.ModelSerializer):
+
+    time = serializers.DateTimeField()
+    messages = serializers.CharField()
+
+    class Meta(object):
+        model = Log
+        fields = (
+            "time",
+            "messages"
+        )
diff --git a/src/documents/views.py b/src/documents/views.py
index 26642c9fc..1bfba3ee7 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -1,19 +1,25 @@
+from django.contrib.auth.mixins import LoginRequiredMixin
 from django.http import HttpResponse
 from django.template.defaultfilters import slugify
 from django.views.decorators.csrf import csrf_exempt
 from django.views.generic import FormView, DetailView
 
+from rest_framework.mixins import (
+    RetrieveModelMixin, UpdateModelMixin, DestroyModelMixin, ListModelMixin)
 from rest_framework.pagination import PageNumberPagination
-from rest_framework.viewsets import ModelViewSet
+from rest_framework.permissions import IsAuthenticated
+from rest_framework.viewsets import (
+    ModelViewSet, ReadOnlyModelViewSet, GenericViewSet)
 
 from paperless.db import GnuPG
 
 from .forms import UploadForm
-from .models import Sender, Tag, Document
-from .serialisers import SenderSerializer, TagSerializer, DocumentSerializer
+from .models import Sender, Tag, Document, Log
+from .serialisers import (
+    SenderSerializer, TagSerializer, DocumentSerializer, LogSerializer)
 
 
-class FetchView(DetailView):
+class FetchView(LoginRequiredMixin, DetailView):
 
     model = Document
 
@@ -40,9 +46,9 @@ class FetchView(DetailView):
         return response
 
 
-class PushView(FormView):
+class PushView(LoginRequiredMixin, FormView):
     """
-    A crude REST API for creating documents.
+    A crude REST-ish API for creating documents.
     """
 
     form_class = UploadForm
@@ -69,6 +75,7 @@ class SenderViewSet(ModelViewSet):
     queryset = Sender.objects.all()
     serializer_class = SenderSerializer
     pagination_class = StandardPagination
+    permission_classes = (IsAuthenticated,)
 
 
 class TagViewSet(ModelViewSet):
@@ -76,10 +83,24 @@ class TagViewSet(ModelViewSet):
     queryset = Tag.objects.all()
     serializer_class = TagSerializer
     pagination_class = StandardPagination
+    permission_classes = (IsAuthenticated,)
 
 
-class DocumentViewSet(ModelViewSet):
+class DocumentViewSet(RetrieveModelMixin,
+                      UpdateModelMixin,
+                      DestroyModelMixin,
+                      ListModelMixin,
+                      GenericViewSet):
     model = Document
     queryset = Document.objects.all()
     serializer_class = DocumentSerializer
     pagination_class = StandardPagination
+    permission_classes = (IsAuthenticated,)
+
+
+class LogViewSet(ReadOnlyModelViewSet):
+    model = Log
+    queryset = Log.objects.all().by_group()
+    serializer_class = LogSerializer
+    pagination_class = StandardPagination
+    permission_classes = (IsAuthenticated,)
diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index fd1af065d..2f4c63f17 100644
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -21,12 +21,13 @@ from django.contrib import admin
 from rest_framework.routers import DefaultRouter
 
 from documents.views import (
-    FetchView, PushView, SenderViewSet, TagViewSet, DocumentViewSet)
+    FetchView, PushView, SenderViewSet, TagViewSet, DocumentViewSet, LogViewSet)
 
 router = DefaultRouter()
 router.register(r'senders', SenderViewSet)
 router.register(r'tags', TagViewSet)
 router.register(r'documents', DocumentViewSet)
+router.register(r'logs', LogViewSet)
 
 urlpatterns = [
 

From 7d1aa1175f93f46b6e2b376a238f19b6f35a0a29 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Tue, 1 Mar 2016 19:03:28 +0000
Subject: [PATCH 40/71] pep8

---
 src/paperless/urls.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index 2f4c63f17..55563c6c5 100644
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -21,7 +21,8 @@ from django.contrib import admin
 from rest_framework.routers import DefaultRouter
 
 from documents.views import (
-    FetchView, PushView, SenderViewSet, TagViewSet, DocumentViewSet, LogViewSet)
+    FetchView, PushView, SenderViewSet, TagViewSet, DocumentViewSet, LogViewSet
+)
 
 router = DefaultRouter()
 router.register(r'senders', SenderViewSet)

From 26c378135079c80d716aa9d656c9e6ae6720286a Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Tue, 1 Mar 2016 22:37:42 +0000
Subject: [PATCH 41/71] #44: Harmonise environment variables with constant
 names

---
 docs/changelog.rst        |  2 ++
 src/documents/forms.py    |  2 +-
 src/documents/mail.py     |  4 ++--
 src/paperless/settings.py | 47 +++++++++++++++++++++++++++++++++------
 src/paperless/urls.py     |  2 +-
 src/paperless/version.py  |  2 +-
 6 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/docs/changelog.rst b/docs/changelog.rst
index cdb720926..5b8029780 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -3,6 +3,7 @@ Changelog
 
 * 0.1.1 (master)
 
+  * `#44`_: Harmonise environment variable names with constant names.
   * `#60`_: Setup logging to actually use the Python native logging framework.
   * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
     to be imported but made unavailable.
@@ -68,6 +69,7 @@ Changelog
 .. _darkmatter: https://github.com/darkmatter
 .. _zedster: https://github.com/zedster
 
+.. _#44: https://github.com/danielquinn/paperless/issues/44
 .. _#45: https://github.com/danielquinn/paperless/issues/45
 .. _#47: https://github.com/danielquinn/paperless/issues/47
 .. _#48: https://github.com/danielquinn/paperless/issues/48
diff --git a/src/documents/forms.py b/src/documents/forms.py
index 404be1763..8eb7b8381 100644
--- a/src/documents/forms.py
+++ b/src/documents/forms.py
@@ -14,7 +14,7 @@ from .consumer import Consumer
 
 class UploadForm(forms.Form):
 
-    SECRET = settings.UPLOAD_SHARED_SECRET
+    SECRET = settings.SHARED_SECRET
     TYPE_LOOKUP = {
         "application/pdf": Document.TYPE_PDF,
         "image/png": Document.TYPE_PNG,
diff --git a/src/documents/mail.py b/src/documents/mail.py
index a5f5416cb..0bc3ce94f 100644
--- a/src/documents/mail.py
+++ b/src/documents/mail.py
@@ -44,7 +44,7 @@ class Message(Loggable):
     and n attachments, and that we don't care about the message body.
     """
 
-    SECRET = settings.UPLOAD_SHARED_SECRET
+    SECRET = settings.SHARED_SECRET
 
     def __init__(self, data, group=None):
         """
@@ -175,7 +175,7 @@ class MailFetcher(Loggable):
             # Reset the grouping id for each fetch
             self.logging_group = uuid.uuid4()
 
-            self.log("info", "Checking mail")
+            self.log("debug", "Checking mail")
 
             for message in self._get_messages():
 
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 1f7bb6d0a..67f6c4a0c 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -159,7 +159,7 @@ LOGGING = {
 }
 
 
-# Paperless-specific stuffs
+# Paperless-specific stuff
 # Change these paths if yours are different
 # ----------------------------------------------------------------------------
 
@@ -173,19 +173,19 @@ OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
 # If this is true, any failed attempts to OCR a PDF will result in the PDF
 # being indexed anyway, with whatever we could get.  If it's False, the file
 # will simply be left in the CONSUMPTION_DIR.
-FORGIVING_OCR = True
+FORGIVING_OCR = bool(os.getenv("PAPERLESS_FORGIVING_OCR", "YES").lower() in ("yes", "y", "1", "t", "true"))
 
 # GNUPG needs a home directory for some reason
 GNUPG_HOME = os.getenv("HOME", "/tmp")
 
-# Convert is part of the Imagemagick package
-CONVERT_BINARY = "/usr/bin/convert"
+# Convert is part of the ImageMagick package
+CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY")
 
 # This will be created if it doesn't exist
-SCRATCH_DIR = "/tmp/paperless"
+SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")
 
 # This is where Paperless will look for PDFs to index
-CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUME")
+CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR")
 
 # If you want to use IMAP mail consumption, populate this with useful values.
 # If you leave HOST set to None, we assume you're not going to use this
@@ -211,4 +211,37 @@ PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE")
 # If you intend to use the "API" to push files into the consumer, you'll need
 # to provide a shared secret here.  Leaving this as the default will disable
 # the API.
-UPLOAD_SHARED_SECRET = os.getenv("PAPERLESS_SECRET", "")
+SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "")
+
+#
+# TODO: Remove after 1.2
+#
+# This logic is here to address issue #44, wherein we were using inconsistent
+# constant names vs. environment variables.  If you're using Paperless for the
+# first time, you can safely ignore everything from here on, so long as you're
+# correctly defining the variables as per the documentation.
+#
+
+
+def deprecated(before, after):
+    print(
+        "\n\n"
+        "WARNING: {before} has been renamed to {after}.\n"
+        "WARNING: Use of {before} will not work as of version 1.2."
+        "\n\n".format(
+            before=before,
+            after=after
+        )
+    )
+
+if not CONVERT_BINARY:
+    deprecated("PAPERLESS_CONVERT", "PAPERLESS_CONVERT_BINARY")
+    CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT", "convert")
+
+if not CONSUMPTION_DIR and os.getenv("PAPERLESS_CONSUME"):
+    deprecated("PAPERLESS_CONSUME", "PAPERLESS_CONSUMPTION_DIR")
+    CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUME")
+
+if not SHARED_SECRET and os.getenv("PAPERLESS_SECRET"):
+    deprecated("PAPERLESS_SECRET", "PAPERLESS_SHARED_SECRET")
+    SHARED_SECRET = os.getenv("PAPERLESS_SECRET", "")
diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index 55563c6c5..eb302638f 100644
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -47,5 +47,5 @@ urlpatterns = [
 
 ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
 
-if settings.UPLOAD_SHARED_SECRET:
+if settings.SHARED_SECRET:
     urlpatterns.insert(0, url(r"^push$", PushView.as_view(), name="push"))
diff --git a/src/paperless/version.py b/src/paperless/version.py
index 8e2c2d9ea..d61abb655 100644
--- a/src/paperless/version.py
+++ b/src/paperless/version.py
@@ -1 +1 @@
-__version__ = (0, 1, 0)
+__version__ = (0, 1, 1)

From 857c7ac65426907925c89e3c62d3d73455367230 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Tue, 1 Mar 2016 22:39:40 +0000
Subject: [PATCH 42/71] #44: Harmonise environment variables with constant
 names

---
 src/paperless/settings.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 67f6c4a0c..1599a08e8 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -235,8 +235,10 @@ def deprecated(before, after):
     )
 
 if not CONVERT_BINARY:
-    deprecated("PAPERLESS_CONVERT", "PAPERLESS_CONVERT_BINARY")
-    CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT", "convert")
+    CONVERT_BINARY = "convert"
+    if os.getenv("PAPERLESS_CONVERT"):
+        deprecated("PAPERLESS_CONVERT", "PAPERLESS_CONVERT_BINARY")
+        CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT", CONVERT_BINARY)
 
 if not CONSUMPTION_DIR and os.getenv("PAPERLESS_CONSUME"):
     deprecated("PAPERLESS_CONSUME", "PAPERLESS_CONSUMPTION_DIR")

From 21cd4e9f14a7b74bec3b5d35dcc56a31ebed0372 Mon Sep 17 00:00:00 2001
From: Pit Kleyersburg <pitkley@googlemail.com>
Date: Wed, 2 Mar 2016 09:05:51 +0100
Subject: [PATCH 43/71] Update env-var in Dockerfile, fix volume names

---
 Dockerfile                   |  4 ++--
 docker-compose.yml.example   | 14 +++++++-------
 scripts/docker-entrypoint.sh |  4 ++--
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index dade863ca..fec76ee37 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,8 +19,8 @@ RUN mkdir -p /usr/src/paperless/src
 COPY src/ /usr/src/paperless/src/
 
 # Set consumption directory
-ENV PAPERLESS_CONSUME /consume
-RUN mkdir -p $PAPERLESS_CONSUME
+ENV PAPERLESS_CONSUMPTION_DIR /consume
+RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR
 
 # Migrate database
 WORKDIR /usr/src/paperless/src
diff --git a/docker-compose.yml.example b/docker-compose.yml.example
index 7e3557aa8..488fc83d2 100644
--- a/docker-compose.yml.example
+++ b/docker-compose.yml.example
@@ -8,8 +8,8 @@ services:
             # modifying the part before the `:`.
             - "8000:8000"
         volumes:
-            - paperless-data:/usr/src/paperless/data
-            - paperless-media:/usr/src/paperless/media
+            - data:/usr/src/paperless/data
+            - media:/usr/src/paperless/media
         env_file: docker-compose.env
         environment:
             - PAPERLESS_OCR_LANGUAGES=
@@ -18,20 +18,20 @@ services:
     consumer:
         image: paperless
         volumes:
-            - paperless-data:/usr/src/paperless/data
-            - paperless-media:/usr/src/paperless/media
+            - data:/usr/src/paperless/data
+            - media:/usr/src/paperless/media
             # You have to adapt the local path you want the consumption
             # directory to mount to by modifying the part before the ':'.
             - /path/to/arbitrary/place:/consume
             # Likewise, you can add a local path to mount a directory for
             # exporting. This is not strictly needed for paperless to
             # function, only if you're exporting your files: uncomment
-            # it and fill in a local path if you know you're going to 
+            # it and fill in a local path if you know you're going to
             # want to export your documents.
             # - /path/to/another/arbitrary/place:/export
         env_file: docker-compose.env
         command: ["document_consumer"]
 
 volumes:
-    paperless-data:
-    paperless-media:
+    data:
+    media:
diff --git a/scripts/docker-entrypoint.sh b/scripts/docker-entrypoint.sh
index 9001574a1..14d385469 100644
--- a/scripts/docker-entrypoint.sh
+++ b/scripts/docker-entrypoint.sh
@@ -16,8 +16,8 @@ map_uidgid() {
 
 set_permissions() {
     # Set permissions for consumption directory
-    chgrp paperless "$PAPERLESS_CONSUME"
-    chmod g+x "$PAPERLESS_CONSUME"
+    chgrp paperless "$PAPERLESS_CONSUMPTION_DIR"
+    chmod g+x "$PAPERLESS_CONSUMPTION_DIR"
 
     # Set permissions for application directory
     chown -Rh paperless:paperless /usr/src/paperless

From dd3bdcb9568838408296bddbac43c97659de41e2 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Thu, 3 Mar 2016 11:00:46 +0000
Subject: [PATCH 44/71] Updated the Vagrant tools to use environment variables

---
 docs/changelog.rst        |  2 ++
 docs/setup.rst            | 46 ++++++++++++++++-------------
 scripts/vagrant-provision | 62 ++++++++++++++++++++++++++++++++-------
 3 files changed, 79 insertions(+), 31 deletions(-)

diff --git a/docs/changelog.rst b/docs/changelog.rst
index 5b8029780..ce2a4edab 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -3,6 +3,8 @@ Changelog
 
 * 0.1.1 (master)
 
+  * Refactored the Vagrant installation process to use environment variables
+    rather than asking the user to modify ``settings.py``.
   * `#44`_: Harmonise environment variable names with constant names.
   * `#60`_: Setup logging to actually use the Python native logging framework.
   * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
diff --git a/docs/setup.rst b/docs/setup.rst
index be8a349d8..077ce135c 100644
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -98,27 +98,31 @@ Vagrant Method
 2. Run ``vagrant up``.  An instance will start up for you.  When it's ready and
    provisioned...
 3. Run ``vagrant ssh`` and once inside your new vagrant box, edit
-   ``/opt/paperless/src/paperless/settings.py`` and set the values for:
-    * ``CONSUMPTION_DIR``: this is where your documents will be dumped to be
-      consumed by Paperless.
-    * ``PASSPHRASE``: this is the passphrase Paperless uses to encrypt/decrypt
-      the original document.  The default value attempts to source the
-      passphrase from the environment, so if you don't set it to a static value
-      here, you must set ``PAPERLESS_PASSPHRASE=some-secret-string`` on the
-      command line whenever invoking the consumer or webserver.
-4. Initialise the database with ``/opt/paperless/src/manage.py migrate``.
-5. Still inside your vagrant box, create a user for your Paperless instance with
-   ``/opt/paperless/src/manage.py createsuperuser``. Follow the prompts to
+   ``/etc/paperless.conf`` and set the values for:
+    * ``PAPERLESS_CONSUMPTION_DIR``: this is where your documents will be
+      dumped to be consumed by Paperless.
+    * ``PAPERLESS_PASSPHRASE``: this is the passphrase Paperless uses to
+      encrypt/decrypt the original document.
+    * ``PAPERLESS_SHARED_SECRET``: this is the "magic word" used when consuming
+      documents from mail or via the API.  If you don't use either, leaving it
+      blank is just fine.
+4. Exit the vagrant box and re-enter it with ``vagrant ssh`` again.  This
+   updates the environment to make use of the changes you made to the config
+   file.
+5. Initialise the database with ``/opt/paperless/src/manage.py migrate``.
+6. Still inside your vagrant box, create a user for your Paperless instance
+   with ``/opt/paperless/src/manage.py createsuperuser``. Follow the prompts to
    create your user.
-6. Start the webserver with ``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``.
-   You should now be able to visit your (empty) `Paperless webserver`_ at
-   ``172.28.128.4:8000``.  You can login with the user/pass you created in #5.
-7. In a separate window, run ``vagrant ssh`` again, but this time once inside
+7. Start the webserver with
+   ``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``. You should now be
+   able to visit your (empty) `Paperless webserver`_ at ``172.28.128.4:8000``.
+   You can login with the user/pass you created in #6.
+8. In a separate window, run ``vagrant ssh`` again, but this time once inside
    your vagrant instance, you should start the consumer script with
    ``/opt/paperless/src/manage.py document_consumer``.
-8. Scan something.  Put it in the ``CONSUMPTION_DIR``.
-9. Wait a few minutes
-10. Visit the document list on your webserver, and it should be there, indexed
+9. Scan something.  Put it in the ``PAPERLESS_CONSUMPTION_DIR``.
+10. Wait a few minutes
+11. Visit the document list on your webserver, and it should be there, indexed
     and downloadable.
 
 .. _Vagrant: https://vagrantup.com/
@@ -158,11 +162,11 @@ Docker Method
 
 3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml`` and
    a copy of ``docker-compose.env.example`` as ``docker-compose.env``. You'll be
-   editing both these files: taking a copy ensures that you can ``git pull`` to 
-   receive updates without risking merge conflicts with your modified versions 
+   editing both these files: taking a copy ensures that you can ``git pull`` to
+   receive updates without risking merge conflicts with your modified versions
    of the configuration files.
 4. Modify ``docker-compose.yml`` to your preferences, following the instructions
-   in comments in the file. The only change that is a hard requirement is to 
+   in comments in the file. The only change that is a hard requirement is to
    specify where the consumption directory should mount.
 5. Modify ``docker-compose.env`` and adapt the following environment variables:
 
diff --git a/scripts/vagrant-provision b/scripts/vagrant-provision
index aa6ca5e14..c746e7fc1 100644
--- a/scripts/vagrant-provision
+++ b/scripts/vagrant-provision
@@ -1,13 +1,55 @@
 #!/bin/bash
 
-# install packages
-sudo apt-get update
-sudo apt-get build-dep -y python-imaging
-sudo apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
-sudo apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
-sudo apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick
+# Install packages
+apt-get update
+apt-get build-dep -y python-imaging
+apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
+apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
+apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick
 
-# setup python project
-pushd /opt/paperless
-sudo pip3 install -r requirements.txt
-popd
+# Python dependencies
+pip3 install -r /opt/paperless/requirements.txt
+
+# Create the environment file
+echo "
+# This is where your documents should go to be consumed.  Make sure that it exists
+# before you start Paperless.
+export PAPERLESS_CONSUMPTION_DIR='/home/vagrant/consumption'
+
+# This is the secret passphrase used to encrypt the documents once they have
+# been consumed.  Change it to whatever you like, but you shouldn't change it
+# after it has been used to consume a document or you won't be able to read
+# that document again.
+export PAPERLESS_PASSPHRASE='secret'
+
+# This is the secret string used to verify PDFs sent by mail or consumed via
+# the API.  If you don't plan to use either of these, you can safely leave it
+# blank
+export PAPERLESS_SHARED_SECRET=''
+" > /tmp/paperless.conf
+chmod 0640 /tmp/paperless.conf
+chown root:vagrant /tmp/paperless.conf
+mv /tmp/paperless.conf /etc/
+
+# Create the consumption directory
+mkdir /home/vagrant/consumption
+chown vagrant:vagrant /home/vagrant/consumption
+
+# Create environment wrapper
+echo "
+
+
+# Setup the paperless environment variables
+. /etc/paperless.conf
+" >> /home/vagrant/.bashrc
+
+echo "
+
+
+Now follow the remaining steps in the Vagrant section of the setup
+documentation to complete the process:
+
+http://paperless.readthedocs.org/en/latest/setup.html#setup-installation-vagrant
+
+
+"

From 0aead1fbe6578240476bb36a135859b5126f4966 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Thu, 3 Mar 2016 17:59:27 +0000
Subject: [PATCH 45/71] #68: Using dotenv for a proper unix config file

---
 docs/changelog.rst                  |  3 +++
 requirements.txt                    |  1 +
 scripts/paperless-consumer.service  |  3 +--
 scripts/paperless-webserver.service |  1 -
 src/paperless/settings.py           | 16 ++++++++++++----
 5 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/docs/changelog.rst b/docs/changelog.rst
index ce2a4edab..772e30dc0 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -3,6 +3,8 @@ Changelog
 
 * 0.1.1 (master)
 
+  * `#68`_: Added support for using a proper config file at
+    ``/etc/paperless.conf``.
   * Refactored the Vagrant installation process to use environment variables
     rather than asking the user to modify ``settings.py``.
   * `#44`_: Harmonise environment variable names with constant names.
@@ -79,3 +81,4 @@ Changelog
 .. _#54: https://github.com/danielquinn/paperless/issues/54
 .. _#57: https://github.com/danielquinn/paperless/issues/57
 .. _#60: https://github.com/danielquinn/paperless/issues/60
+.. _#68: https://github.com/danielquinn/paperless/issues/68
diff --git a/requirements.txt b/requirements.txt
index 810af8ec2..6a133327a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 Django==1.9.2
 django-extensions==1.6.1
 djangorestframework==3.3.2
+python-dotenv==0.3.0
 filemagic==1.6
 langdetect==1.0.5
 Pillow==3.1.1
diff --git a/scripts/paperless-consumer.service b/scripts/paperless-consumer.service
index 34d65dedb..79a27d3ce 100644
--- a/scripts/paperless-consumer.service
+++ b/scripts/paperless-consumer.service
@@ -2,10 +2,9 @@
 Description=Paperless consumer
 
 [Service]
-EnvironmentFile=/etc/conf.d/paperless
 User=paperless
 Group=paperless
-ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer -v $PAPERLESS_CONSUMPTION_VERBOSITY
+ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer
 
 [Install]
 WantedBy=multi-user.target
diff --git a/scripts/paperless-webserver.service b/scripts/paperless-webserver.service
index 1a2386471..9d20f5a1c 100644
--- a/scripts/paperless-webserver.service
+++ b/scripts/paperless-webserver.service
@@ -2,7 +2,6 @@
 Description=Paperless webserver
 
 [Service]
-EnvironmentFile=/etc/conf.d/paperless
 User=paperless
 Group=paperless
 ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py runserver 0.0.0.0:8000
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 1599a08e8..f2fb41941 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -12,6 +12,8 @@ https://docs.djangoproject.com/en/1.9/ref/settings/
 
 import os
 
+from dotenv import load_dotenv
+
 # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
@@ -140,6 +142,16 @@ STATIC_URL = '/static/'
 MEDIA_URL = "/media/"
 
 
+# Paperless-specific stuff
+# You shouldn't have to edit any of these values.  Rather, you can set these
+# values in /etc/paperless.conf instead.
+# ----------------------------------------------------------------------------
+
+# Tap paperless.conf if it's available
+if os.path.exists("/etc/paperless.conf"):
+    load_dotenv("/etc/paperless.conf")
+
+
 # Logging
 
 LOGGING = {
@@ -159,10 +171,6 @@ LOGGING = {
 }
 
 
-# Paperless-specific stuff
-# Change these paths if yours are different
-# ----------------------------------------------------------------------------
-
 # The default language that tesseract will attempt to use when parsing
 # documents.  It should be a 3-letter language code consistent with ISO 639.
 OCR_LANGUAGE = "eng"

From 66d4407565ce4ff431b50c9dbd01b7598625e62e Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Thu, 3 Mar 2016 18:01:02 +0000
Subject: [PATCH 46/71] #68: Using dotenv for a proper unix config file

---
 scripts/vagrant-provision | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/scripts/vagrant-provision b/scripts/vagrant-provision
index c746e7fc1..2a744d5d3 100644
--- a/scripts/vagrant-provision
+++ b/scripts/vagrant-provision
@@ -14,18 +14,18 @@ pip3 install -r /opt/paperless/requirements.txt
 echo "
 # This where your documents should go to be consumed.  Make sure that it exists
 # before you start Paperless.
-export PAPERLESS_CONSUMPTION_DIR='/home/vagrant/consumption'
+PAPERLESS_CONSUMPTION_DIR='/home/vagrant/consumption'
 
 # This is the secret passphrase used to encrypt the documents once they have
 # been consumed.  Change it to whatever you like, but you shouldn't change it
 # after it has been used to consume a document or you won't be able to read
 # that document again.
-export PAPERLESS_PASSPHRASE='secret'
+PAPERLESS_PASSPHRASE='secret'
 
 # This is the secret string used to verify PDFs sent by mail or consumed via
 # the API.  If you don't plan to use either of these, you can safely leave it
 # blank
-export PAPERLESS_SHARED_SECRET=''
+PAPERLESS_SHARED_SECRET=''
 " > /tmp/paperless.conf
 chmod 0640 /tmp/paperless.conf
 chown root:vagrant /tmp/paperless.conf
@@ -35,14 +35,6 @@ mv /tmp/paperless.conf /etc/
 mkdir /home/vagrant/consumption
 chown vagrant:vagrant /home/vagrant/consumption
 
-# Create environment wrapper
-echo "
-
-
-# Setup the paperless environment variables
-. /etc/paperless.conf
-" >> /home/vagrant/.bashrc
-
 echo "
 
 

From b8be20b5658d428aa66e2156de588778f96e9f43 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Thu, 3 Mar 2016 18:09:10 +0000
Subject: [PATCH 47/71] Preparing for a proper UI

---
 src/documents/templates/documents/index.html | 10 ++++++++++
 src/documents/views.py                       | 13 ++++++++++++-
 src/paperless/urls.py                        |  9 +++++++--
 3 files changed, 29 insertions(+), 3 deletions(-)
 create mode 100644 src/documents/templates/documents/index.html

diff --git a/src/documents/templates/documents/index.html b/src/documents/templates/documents/index.html
new file mode 100644
index 000000000..ccde2d389
--- /dev/null
+++ b/src/documents/templates/documents/index.html
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+
+<html lang="en-gb">
+  <head>
+    <title>Paperless</title>
+    <meta charset="utf-8">
+  </head>
+  <body>
+  </body>
+</html>
diff --git a/src/documents/views.py b/src/documents/views.py
index 1bfba3ee7..0b2b50926 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -2,7 +2,7 @@ from django.contrib.auth.mixins import LoginRequiredMixin
 from django.http import HttpResponse
 from django.template.defaultfilters import slugify
 from django.views.decorators.csrf import csrf_exempt
-from django.views.generic import FormView, DetailView
+from django.views.generic import FormView, DetailView, TemplateView
 
 from rest_framework.mixins import (
     RetrieveModelMixin, UpdateModelMixin, DestroyModelMixin, ListModelMixin)
@@ -19,6 +19,17 @@ from .serialisers import (
     SenderSerializer, TagSerializer, DocumentSerializer, LogSerializer)
 
 
+class IndexView(TemplateView):
+
+    template_name = "documents/index.html"
+
+    def get_context_data(self, **kwargs):
+        print(kwargs)
+        print(self.request.GET)
+        print(self.request.POST)
+        return TemplateView.get_context_data(self, **kwargs)
+
+
 class FetchView(LoginRequiredMixin, DetailView):
 
     model = Document
diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index eb302638f..6fa7e65ef 100644
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -21,7 +21,8 @@ from django.contrib import admin
 from rest_framework.routers import DefaultRouter
 
 from documents.views import (
-    FetchView, PushView, SenderViewSet, TagViewSet, DocumentViewSet, LogViewSet
+    IndexView, FetchView, PushView,
+    SenderViewSet, TagViewSet, DocumentViewSet, LogViewSet
 )
 
 router = DefaultRouter()
@@ -39,11 +40,15 @@ urlpatterns = [
     ),
     url(r"^api/", include(router.urls, namespace="drf")),
 
+    # Normal pages (coming soon)
+    # url(r"^$", IndexView.as_view(), name="index"),
+
     # File downloads
     url(r"^fetch/(?P<pk>\d+)$", FetchView.as_view(), name="fetch"),
 
     # The Django admin
-    url(r"", admin.site.urls),
+    url(r"admin", admin.site.urls),
+    url(r"", admin.site.urls),  # This is going away
 
 ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
 

From 55dcbcc47f944d67f5ab2c0b5c83bdf097683ce2 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Thu, 3 Mar 2016 18:18:38 +0000
Subject: [PATCH 48/71] Forgot a slash

---
 src/paperless/urls.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index 6fa7e65ef..24a495810 100644
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -47,7 +47,7 @@ urlpatterns = [
     url(r"^fetch/(?P<pk>\d+)$", FetchView.as_view(), name="fetch"),
 
     # The Django admin
-    url(r"admin", admin.site.urls),
+    url(r"admin/", admin.site.urls),
     url(r"", admin.site.urls),  # This is going away
 
 ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)

From fad466477b3ccf5e6f433871e2fd89a840b738eb Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Thu, 3 Mar 2016 18:18:48 +0000
Subject: [PATCH 49/71] More verbose error logging

---
 src/documents/consumer.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index f3d5b71cb..5617ed550 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -123,9 +123,9 @@ class Consumer(object):
             try:
                 text = self._get_ocr(pngs)
                 self._store(text, doc)
-            except OCRError:
+            except OCRError as e:
                 self._ignore.append(doc)
-                self.log("error", "OCR FAILURE: {}".format(doc))
+                self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
                 self._cleanup_tempdir(tempdir)
                 continue
             else:
@@ -165,7 +165,7 @@ class Consumer(object):
         """
 
         if not pngs:
-            raise OCRError
+            raise OCRError("No images found")
 
         self.log("info", "OCRing the document")
 
@@ -186,7 +186,7 @@ class Consumer(object):
                 )
                 raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
                 return raw_text
-            raise OCRError
+            raise OCRError("Language detection failed")
 
         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
             raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
@@ -205,7 +205,10 @@ class Consumer(object):
                 )
                 raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
                 return raw_text
-            raise OCRError
+            raise OCRError(
+                "The guessed language is not available in this instance of "
+                "Tesseract."
+            )
 
     def _assemble_ocr_sections(self, pngs, middle, text):
         """

From 070463b85a396c4895e6473e63af70af6406b539 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Thu, 3 Mar 2016 20:52:42 +0000
Subject: [PATCH 50/71] s/Sender/Correspondent & reworked the (im|ex)porter

---
 docs/consumption.rst                          |  38 ++--
 docs/migrating.rst                            | 177 +++++++-----------
 docs/utilities.rst                            |  90 ++++++++-
 src/documents/admin.py                        |   4 +-
 src/documents/consumer.py                     |   4 +-
 src/documents/forms.py                        |  10 +-
 src/documents/mail.py                         |   4 +-
 .../management/commands/document_exporter.py  |  29 ++-
 .../management/commands/document_importer.py  | 110 +++++++++++
 .../migrations/0011_auto_20160303_1929.py     |  19 ++
 src/documents/models.py                       |  16 +-
 src/documents/serialisers.py                  |   6 +-
 src/documents/views.py                        |  15 +-
 src/paperless/urls.py                         |   4 +-
 14 files changed, 342 insertions(+), 184 deletions(-)
 create mode 100644 src/documents/management/commands/document_importer.py
 create mode 100644 src/documents/migrations/0011_auto_20160303_1929.py

diff --git a/docs/consumption.rst b/docs/consumption.rst
index 8b9b35433..0f8ff7ca5 100644
--- a/docs/consumption.rst
+++ b/docs/consumption.rst
@@ -44,10 +44,10 @@ Any document you put into the consumption directory will be consumed, but if you
 name the file right, it'll automatically set some values in the database for
 you.  This is is the logic the consumer follows:
 
-1. Try to find the sender, title, and tags in the file name following the
-   pattern: ``Sender - Title - tag,tag,tag.pdf``.
-2. If that doesn't work, try to find the sender and title in the file name
-   following the pattern:  ``Sender - Title.pdf``.
+1. Try to find the correspondent, title, and tags in the file name following
+   the pattern: ``Correspondent - Title - tag,tag,tag.pdf``.
+2. If that doesn't work, try to find the correspondent and title in the file
+   name following the pattern:  ``Correspondent - Title.pdf``.
 3. If that doesn't work, just assume that the name of the file is the title.
 
 So given the above, the following examples would work as you'd expect:
@@ -97,9 +97,9 @@ So, with all that in mind, here's what you do to get it running:
    the configured email account every 10 minutes for something new and pull down
    whatever it finds.
 4. Send yourself an email!  Note that the subject is treated as the file name,
-   so if you set the subject to ``Sender - Title - tag,tag,tag``, you'll get
-   what you expect.  Also, you must include the aforementioned secret string in
-   every email so the fetcher knows that it's safe to import.
+   so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll
+   get what you expect.  Also, you must include the aforementioned secret
+   string in every email so the fetcher knows that it's safe to import.
 5. After a few minutes, the consumer will poll your mailbox, pull down the
    message, and place the attachment in the consumption directory with the
    appropriate name.  A few minutes later, the consumer will import it like any
@@ -118,16 +118,16 @@ a real API, it's just a URL that accepts an HTTP POST.
 To push your document to *Paperless*, send an HTTP POST to the server with the
 following name/value pairs:
 
-* ``sender``: The name of the document's sender.  Note that there are
-  restrictions on what characters you can use here.  Specifically, alphanumeric
-  characters, `-`, `,`, `.`, and `'` are ok, everything else it out.  You also
-  can't use the sequence ` - ` (space, dash, space).
+* ``correspondent``: The name of the document's correspondent.  Note that there
+  are restrictions on what characters you can use here.  Specifically,
+  alphanumeric characters, `-`, `,`, `.`, and `'` are ok, everything else is
+  out.  You also can't use the sequence ` - ` (space, dash, space).
 * ``title``: The title of the document.  The rules for characters is the same
-  here as the sender.
-* ``signature``: For security reasons, we have the sender send a signature using
-  a "shared secret" method to make sure that random strangers don't start
-  uploading stuff to your server.  The means of generating this signature is
-  defined below.
+  here as the correspondent.
+* ``signature``: For security reasons, we have the correspondent send a
+  signature using a "shared secret" method to make sure that random strangers
+  don't start uploading stuff to your server.  The means of generating this
+  signature is defined below.
 
 Specify ``enctype="multipart/form-data"``, and then POST your file with:::
 
@@ -146,12 +146,12 @@ verification.
 
 In the case of *Paperless*, you configure the server with the secret by setting
 ``UPLOAD_SHARED_SECRET``.  Then on your client, you generate your signature by
-concatenating the sender, title, and the secret, and then using sha256 to
-generate a hexdigest.
+concatenating the correspondent, title, and the secret, and then using sha256
+to generate a hexdigest.
 
 If you're using Python, this is what that looks like:
 
 .. code:: python
 
     from hashlib import sha256
-    signature = sha256(sender + title + secret).hexdigest()
+    signature = sha256(correspondent + title + secret).hexdigest()
diff --git a/docs/migrating.rst b/docs/migrating.rst
index 491eeace4..d659620ac 100644
--- a/docs/migrating.rst
+++ b/docs/migrating.rst
@@ -4,10 +4,68 @@ Migrating, Updates, and Backups
 ===============================
 
 As *Paperless* is still under active development, there's a lot that can change
-as software updates roll out.  The thing you just need to remember for all of
-this is that for the most part, **the database is expendable** so long as you
-have your files.  This is because the file name of the exported files includes
-the name of the sender, the title, and the tags (if any) on each file.
+as software updates roll out.  You should backup often, so if anything goes
+wrong during an update, you at least have a means of restoring to something
+usable.  Thankfully, there are automated ways of backing up, restoring, and
+updating the software.
+
+
+.. _migrating-backup:
+
+Backing Up
+----------
+
+So you're bored of this whole project, or you want to make a remote backup of
+the unencrypted files for whatever reason.  This is easy to do, simply use the
+:ref:`exporter <utilities-exporter>` to dump your documents and database out
+into an arbitrary directory.
+
+
+.. _migrating-restoring:
+
+Restoring
+---------
+
+Restoring your data is just as easy, since nearly all of your data exists either
+in the file names, or in the contents of the files themselves.  You just need to
+create an empty database (just follow the
+:ref:`installation instructions <setup-installation>` again) and then import the
+``tags.json`` file you created as part of your backup.  Lastly, copy your
+exported documents into the consumption directory and start up the consumer.
+
+.. code-block:: shell-session
+
+    $ cd /path/to/project
+    $ rm data/db.sqlite3  # Delete the database
+    $ cd src
+    $ ./manage.py migrate  # Create the database
+    $ ./manage.py createsuperuser
+    $ ./manage.py loaddata /path/to/arbitrary/place/tags.json
+    $ cp /path/to/exported/docs/* /path/to/consumption/dir/
+    $ ./manage.py document_consumer
+
+Importing your data if you are :ref:`using Docker <setup-installation-docker>`
+is almost as simple:
+
+.. code-block:: shell-session
+
+    # Stop and remove your current containers
+    $ docker-compose stop
+    $ docker-compose rm -f
+
+    # Recreate them, add the superuser
+    $ docker-compose up -d
+    $ docker-compose run --rm webserver createsuperuser
+
+    # Load the tags
+    $ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin -
+
+    # Load your exported documents into the consumption directory
+    # (How you do this highly depends on how you have set this up)
+    $ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/
+
+After loading the documents into the consumption directory the consumer will
+immediately start consuming the documents.
 
 
 .. _migrating-updates:
@@ -20,7 +78,7 @@ on the directory containing the project files, and then use Django's ``migrate``
 command to execute any database schema updates that might have been rolled in
 as part of the update:
 
-.. code:: bash
+.. code-block:: shell-session
 
     $ cd /path/to/project
     $ git pull
@@ -43,112 +101,3 @@ requires only one additional step:
 
 If ``git pull`` doesn't report any changes, there is no need to continue with
 the remaining steps.
-
-
-.. _migrating-backup:
-
-Backing Up
-----------
-
-So you're bored of this whole project, or you want to make a remote backup of
-the unencrypted files for whatever reason.  This is easy to do, simply use the
-:ref:`exporter <utilities-exporter>` to dump your documents out into an
-arbitrary directory.
-
-Additionally however, you'll need to back up the tags themselves.  The file
-names contain the tag names, but you still need to define the tags and their
-matching algorithms in the database for things to work properly.  We do this
-with Django's ``dumpdata`` command, which produces JSON output.
-
-.. code:: bash
-
-    $ cd /path/to/project
-    $ cd src
-    $ ./manage.py document_export /path/to/arbitrary/place/
-    $ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json
-
-If you are :ref:`using Docker <setup-installation-docker>`, exporting your tags
-as JSON is almost as easy:
-
-.. code-block:: shell-session
-
-    $ docker-compose run --rm webserver dumpdata documents.Tag > /path/to/arbitrary/place/tags.json
-
-To export the documents you can either use ``docker run`` directly, specifying all
-the commandline options by hand, or (more simply) mount a second volume for export.
-
-To mount a volume for exports, follow the instructions in the
-``docker-compose.yml.example`` file for the ``/export`` volume (making the changes
-in your own ``docker-compose.yml`` file, of course). Once you have the
-volume mounted, the command to run an export is:
-
-.. code-block:: console
-
-   $ docker-compose run --rm consumer document_exporter /export
-
-If you prefer to use ``docker run`` directly, supplying the necessary commandline
-options:
-
-.. code-block:: shell-session
-
-   $ # Identify your containers
-   $ docker-compose ps
-           Name                       Command                State     Ports
-   -------------------------------------------------------------------------
-   paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0
-   paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0
-
-   $ # Make sure to replace your passphrase and remove or adapt the id mapping
-   $ docker run --rm \
-       --volumes-from paperless_data_1 \
-       --volume /path/to/arbitrary/place:/export \
-       -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \
-       -e USERMAP_UID=1000 -e USERMAP_GID=1000 \
-       paperless document_exporter /export
-
-
-.. _migrating-restoring:
-
-Restoring
----------
-
-Restoring your data is just as easy, since nearly all of your data exists either
-in the file names, or in the contents of the files themselves.  You just need to
-create an empty database (just follow the
-:ref:`installation instructions <setup-installation>` again) and then import the
-``tags.json`` file you created as part of your backup.  Lastly, copy your
-exported documents into the consumption directory and start up the consumer.
-
-.. code:: bash
-
-    $ cd /path/to/project
-    $ rm data/db.sqlite3  # Delete the database
-    $ cd src
-    $ ./manage.py migrate  # Create the database
-    $ ./manage.py createsuperuser
-    $ ./manage.py loaddata /path/to/arbitrary/place/tags.json
-    $ cp /path/to/exported/docs/* /path/to/consumption/dir/
-    $ ./manage.py document_consumer
-
-Importing your data if you are :ref:`using Docker <setup-installation-docker>`
-is almost as simple:
-
-.. code-block:: shell-session
-
-    $ # Stop and remove your current containers
-    $ docker-compose stop
-    $ docker-compose rm -f
-
-    $ # Recreate them, add the superuser
-    $ docker-compose up -d
-    $ docker-compose run --rm webserver createsuperuser
-
-    $ # Load the tags
-    $ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin -
-
-    $ # Load your exported documents into the consumption directory
-    $ # (How you do this highly depends on how you have set this up)
-    $ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/
-
-After loading the documents into the consumption directory the consumer will
-immediately start consuming the documents.
diff --git a/docs/utilities.rst b/docs/utilities.rst
index f5b452a6f..ce3555b73 100644
--- a/docs/utilities.rst
+++ b/docs/utilities.rst
@@ -26,7 +26,7 @@ How to Use It
 
 The webserver is started via the ``manage.py`` script:
 
-.. code:: bash
+.. code-block:: shell-session
 
     $ /path/to/paperless/src/manage.py runserver
 
@@ -64,7 +64,7 @@ How to Use It
 
 The consumer is started via the ``manage.py`` script:
 
-.. code:: bash
+.. code-block:: shell-session
 
     $ /path/to/paperless/src/manage.py document_consumer
 
@@ -95,16 +95,86 @@ How to Use It
 
 This too is done via the ``manage.py`` script:
 
-.. code:: bash
+.. code-block:: shell-session
 
-    $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere
+    $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/
 
-This will dump all of your PDFs into ``/path/to/somewhere`` for you to do with
-as you please.  The naming scheme on export is identical to that used for
-import, so should you can now safely delete the entire project directly,
-database, encrypted PDFs and all, and later create it all again simply by
-running the consumer again and dumping all of these files into
-``CONSUMPTION_DIR``.
+This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you
+to do with as you please.  The files are accompanied with a special file,
+``manifest.json`` which can be used to
+:ref:`import the files <utilities-importer>` at a later date if you wish.
+
+
+.. _utilities-exporter-howto-docker:
+
+Docker
+______
+
+If you are :ref:`using Docker <setup-installation-docker>`, running the
+exporter is almost as easy.  To mount a volume for exports, follow the
+instructions in the ``docker-compose.yml.example`` file for the ``/export``
+volume (making the changes in your own ``docker-compose.yml`` file, of course).
+Once you have the volume mounted, the command to run an export is:
+
+.. code-block:: shell-session
+
+   $ docker-compose run --rm consumer document_exporter /export
+
+If you prefer to use ``docker run`` directly, supplying the necessary commandline
+options:
+
+.. code-block:: shell-session
+
+   $ # Identify your containers
+   $ docker-compose ps
+           Name                       Command                State     Ports
+   -------------------------------------------------------------------------
+   paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0
+   paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0
+
+   $ # Make sure to replace your passphrase and remove or adapt the id mapping
+   $ docker run --rm \
+       --volumes-from paperless_data_1 \
+       --volume /path/to/arbitrary/place:/export \
+       -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \
+       -e USERMAP_UID=1000 -e USERMAP_GID=1000 \
+       paperless document_exporter /export
+
+
+.. _utilities-importer:
+
+The Importer
+------------
+
+Looking to transfer Paperless data from one instance to another, or just want
+to restore from a backup?  This is your go-to toy.
+
+
+.. _utilities-importer-howto:
+
+How to Use It
+.............
+
+The importer works just like the exporter.  You point it at a directory, and
+the script does the rest of the work:
+
+.. code-block:: shell-session
+
+    $ /path/to/paperless/src/manage.py document_importer /path/to/somewhere/
+
+Docker
+______
+
+Assuming that you've already gone through the steps above in the
+:ref:`export <utilities-exporter-howto-docker>` section, then the easiest thing
+to do is just re-use the ``/export`` path you already set up:
+
+.. code-block:: shell-session
+
+   $ docker-compose run --rm consumer document_importer /export
+
+Similarly, if you're not using docker-compose, you can adjust the export
+instructions above to do the import.
 
 
 .. _utilities-retagger:
diff --git a/src/documents/admin.py b/src/documents/admin.py
index 118a295eb..3baad817b 100644
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -3,7 +3,7 @@ from django.contrib.auth.models import User, Group
 from django.core.urlresolvers import reverse
 from django.templatetags.static import static
 
-from .models import Sender, Tag, Document, Log
+from .models import Correspondent, Tag, Document, Log
 
 
 class MonthListFilter(admin.SimpleListFilter):
@@ -107,7 +107,7 @@ class LogAdmin(admin.ModelAdmin):
     list_filter = ("level", "component",)
 
 
-admin.site.register(Sender)
+admin.site.register(Correspondent)
 admin.site.register(Tag, TagAdmin)
 admin.site.register(Document, DocumentAdmin)
 admin.site.register(Log, LogAdmin)
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 5617ed550..4233cded8 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError
 
 from paperless.db import GnuPG
 
-from .models import Sender, Tag, Document, Log
+from .models import Correspondent, Tag, Document, Log
 from .languages import ISO639
 
 
@@ -246,7 +246,7 @@ class Consumer(object):
         """
 
         def get_sender(sender_name):
-            return Sender.objects.get_or_create(
+            return Correspondent.objects.get_or_create(
                 name=sender_name, defaults={"slug": slugify(sender_name)})[0]
 
         def get_tags(tags):
diff --git a/src/documents/forms.py b/src/documents/forms.py
index 8eb7b8381..d8960f88b 100644
--- a/src/documents/forms.py
+++ b/src/documents/forms.py
@@ -8,7 +8,7 @@ from time import mktime
 from django import forms
 from django.conf import settings
 
-from .models import Document, Sender
+from .models import Document, Correspondent
 from .consumer import Consumer
 
 
@@ -24,7 +24,9 @@ class UploadForm(forms.Form):
     }
 
     sender = forms.CharField(
-        max_length=Sender._meta.get_field("name").max_length, required=False)
+        max_length=Correspondent._meta.get_field("name").max_length,
+        required=False
+    )
     title = forms.CharField(
         max_length=Document._meta.get_field("title").max_length,
         required=False
@@ -41,7 +43,7 @@ class UploadForm(forms.Form):
         sender = self.cleaned_data.get("sender")
         if not sender:
             return None
-        if not Sender.SAFE_REGEX.match(sender) or " - " in sender:
+        if not Correspondent.SAFE_REGEX.match(sender) or " - " in sender:
             raise forms.ValidationError("That sender name is suspicious.")
         return sender
 
@@ -49,7 +51,7 @@ class UploadForm(forms.Form):
         title = self.cleaned_data.get("title")
         if not title:
             return None
-        if not Sender.SAFE_REGEX.match(title) or " - " in title:
+        if not Correspondent.SAFE_REGEX.match(title) or " - " in title:
             raise forms.ValidationError("That title is suspicious.")
 
     def clean_document(self):
diff --git a/src/documents/mail.py b/src/documents/mail.py
index 0bc3ce94f..5bacb5b5f 100644
--- a/src/documents/mail.py
+++ b/src/documents/mail.py
@@ -14,7 +14,7 @@ from dateutil import parser
 from django.conf import settings
 
 from .consumer import Consumer
-from .models import Sender, Log
+from .models import Correspondent, Log
 
 
 class MailFetcherError(Exception):
@@ -103,7 +103,7 @@ class Message(Loggable):
     def check_subject(self):
         if self.subject is None:
             raise InvalidMessageError("Message does not have a subject")
-        if not Sender.SAFE_REGEX.match(self.subject):
+        if not Correspondent.SAFE_REGEX.match(self.subject):
             raise InvalidMessageError("Message subject is unsafe: {}".format(
                 self.subject))
 
diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py
index ac448d8e8..87ed804a2 100644
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -1,10 +1,12 @@
+import json
 import os
 import time
 
 from django.conf import settings
 from django.core.management.base import BaseCommand, CommandError
+from django.core import serializers
 
-from documents.models import Document
+from documents.models import Document, Correspondent, Tag
 from paperless.db import GnuPG
 
 from ...mixins import Renderable
@@ -14,21 +16,19 @@ class Command(Renderable, BaseCommand):
 
     help = """
         Decrypt and rename all files in our collection into a given target
-        directory.  Note that we don't export any of the parsed data since
-        that can always be re-collected via the consumer.
+        directory.  And include a manifest file containing document data for
+        easy import.
     """.replace("    ", "")
 
     def add_arguments(self, parser):
         parser.add_argument("target")
 
     def __init__(self, *args, **kwargs):
-        self.verbosity = 0
-        self.target = None
         BaseCommand.__init__(self, *args, **kwargs)
+        self.target = None
 
     def handle(self, *args, **options):
 
-        self.verbosity = options["verbosity"]
         self.target = options["target"]
 
         if not os.path.exists(self.target):
@@ -40,9 +40,15 @@ class Command(Renderable, BaseCommand):
         if not settings.PASSPHRASE:
             settings.PASSPHRASE = input("Please enter the passphrase: ")
 
-        for document in Document.objects.all():
+        documents = Document.objects.all()
+        document_map = {d.pk: d for d in documents}
+        manifest = json.loads(serializers.serialize("json", documents))
+        for document_dict in manifest:
+
+            document = document_map[document_dict["pk"]]
 
             target = os.path.join(self.target, document.file_name)
+            document_dict["__exported_file_name__"] = target
 
             print("Exporting: {}".format(target))
 
@@ -50,3 +56,12 @@ class Command(Renderable, BaseCommand):
                 f.write(GnuPG.decrypted(document.source_file))
                 t = int(time.mktime(document.created.timetuple()))
                 os.utime(target, times=(t, t))
+
+        manifest += json.loads(
+            serializers.serialize("json", Correspondent.objects.all()))
+
+        manifest += json.loads(serializers.serialize(
+            "json", Tag.objects.all()))
+
+        with open(os.path.join(self.target, "manifest.json"), "w") as f:
+            json.dump(manifest, f, indent=2)
diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py
new file mode 100644
index 000000000..213c049e4
--- /dev/null
+++ b/src/documents/management/commands/document_importer.py
@@ -0,0 +1,110 @@
+import json
+import os
+
+from django.conf import settings
+from django.core.management.base import BaseCommand, CommandError
+from django.core.management import call_command
+
+from documents.models import Document
+from paperless.db import GnuPG
+
+from ...mixins import Renderable
+
+
+class Command(Renderable, BaseCommand):
+
+    help = """
+        Using a manifest.json file, load the data from there, and import the
+        documents it refers to.
+    """.replace("    ", "")
+
+    def add_arguments(self, parser):
+        parser.add_argument("source")
+        parser.add_argument(
+            '--ignore-absent',
+            action='store_true',
+            default=False,
+            help="If the manifest refers to a document that doesn't exist, "
+                 "ignore it and attempt to import what it can"
+        )
+
+    def __init__(self, *args, **kwargs):
+        BaseCommand.__init__(self, *args, **kwargs)
+        self.source = None
+        self.manifest = None
+
+    def handle(self, *args, **options):
+
+        self.source = options["source"]
+
+        if not os.path.exists(self.source):
+            raise CommandError("That path doesn't exist")
+
+        if not os.access(self.source, os.R_OK):
+            raise CommandError("That path doesn't appear to be readable")
+
+        manifest_path = os.path.join(self.source, "manifest.json")
+        self._check_manifest_exists(manifest_path)
+
+        with open(manifest_path) as f:
+            self.manifest = json.load(f)
+
+        self._check_manifest()
+
+        if not settings.PASSPHRASE:
+            raise CommandError(
+                "You need to define a passphrase before continuing.  Please "
+                "consult the documentation for setting up Paperless."
+            )
+
+        # Fill up the database with whatever is in the manifest
+        call_command("loaddata", manifest_path)
+
+        self._import_files_from_manifest()
+
+    @staticmethod
+    def _check_manifest_exists(path):
+        if not os.path.exists(path):
+            raise CommandError(
+                "That directory doesn't appear to contain a manifest.json "
+                "file."
+            )
+
+    def _check_manifest(self):
+
+        for record in self.manifest:
+
+            if not record["model"] == "documents.document":
+                continue
+
+            if "__exported_file_name__" not in record:
+                raise CommandError(
+                    'The manifest file contains a record which does not '
+                    'refer to an actual document file.  If you want to import '
+                    'the rest anyway (skipping such references) call the '
+                    'importer with --ignore-absent'
+                )
+
+            doc_file = record["__exported_file_name__"]
+            if not os.path.exists(os.path.join(self.source, doc_file)):
+                raise CommandError(
+                    'The manifest file refers to "{}" which does not '
+                    'appear to be in the source directory.  If you want to '
+                    'import the rest anyway (skipping such references) call '
+                    'the importer with --ignore-absent'.format(doc_file)
+                )
+
+    def _import_files_from_manifest(self):
+
+        for record in self.manifest:
+
+            if not record["model"] == "documents.document":
+                continue
+
+            doc_file = record["__exported_file_name__"]
+            document = Document.objects.get(pk=record["pk"])
+            with open(doc_file, "rb") as unencrypted:
+                with open(document.source_path, "wb") as encrypted:
+                    print("Encrypting {} and saving it to {}".format(
+                        doc_file, document.source_path))
+                    encrypted.write(GnuPG.encrypted(unencrypted))
diff --git a/src/documents/migrations/0011_auto_20160303_1929.py b/src/documents/migrations/0011_auto_20160303_1929.py
new file mode 100644
index 000000000..a9aefddaf
--- /dev/null
+++ b/src/documents/migrations/0011_auto_20160303_1929.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9.2 on 2016-03-03 19:29
+from __future__ import unicode_literals
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '0010_log'),
+    ]
+
+    operations = [
+        migrations.RenameModel(
+            old_name='Sender',
+            new_name='Correspondent',
+        ),
+    ]
diff --git a/src/documents/models.py b/src/documents/models.py
index e5556534a..0fb6489c4 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -28,7 +28,7 @@ class SluggedModel(models.Model):
         return self.name
 
 
-class Sender(SluggedModel):
+class Correspondent(SluggedModel):
 
     # This regex is probably more restrictive than it needs to be, but it's
     # better safe than sorry.
@@ -141,7 +141,7 @@ class Document(models.Model):
     TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
 
     sender = models.ForeignKey(
-        Sender, blank=True, null=True, related_name="documents")
+        Correspondent, blank=True, null=True, related_name="documents")
     title = models.CharField(max_length=128, blank=True, db_index=True)
     content = models.TextField(db_index=True)
     file_type = models.CharField(
@@ -158,9 +158,9 @@ class Document(models.Model):
         ordering = ("sender", "title")
 
     def __str__(self):
-        created = self.created.strftime("%Y-%m-%d")
+        created = self.created.strftime("%Y%m%d%H%M%S")
         if self.sender and self.title:
-            return "{}: {}, {}".format(created, self.sender, self.title)
+            return "{}: {} - {}".format(created, self.sender, self.title)
         if self.sender or self.title:
             return "{}: {}".format(created, self.sender or self.title)
         return str(created)
@@ -179,13 +179,7 @@ class Document(models.Model):
 
     @property
     def file_name(self):
-        if self.sender and self.title:
-            tags = ",".join([t.slug for t in self.tags.all()])
-            if tags:
-                return "{} - {} - {}.{}".format(
-                    self.sender, self.title, tags, self.file_type)
-            return "{} - {}.{}".format(self.sender, self.title, self.file_type)
-        return os.path.basename(self.source_path)
+        return slugify(str(self)) + "." + self.file_type
 
     @property
     def download_url(self):
diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py
index f9b29f790..340fdaa25 100644
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -1,12 +1,12 @@
 from rest_framework import serializers
 
-from .models import Sender, Tag, Document, Log
+from .models import Correspondent, Tag, Document, Log
 
 
-class SenderSerializer(serializers.HyperlinkedModelSerializer):
+class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):
 
     class Meta(object):
-        model = Sender
+        model = Correspondent
         fields = ("id", "slug", "name")
 
 
diff --git a/src/documents/views.py b/src/documents/views.py
index 0b2b50926..ff7c4ce05 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -1,6 +1,5 @@
 from django.contrib.auth.mixins import LoginRequiredMixin
 from django.http import HttpResponse
-from django.template.defaultfilters import slugify
 from django.views.decorators.csrf import csrf_exempt
 from django.views.generic import FormView, DetailView, TemplateView
 
@@ -14,9 +13,9 @@ from rest_framework.viewsets import (
 from paperless.db import GnuPG
 
 from .forms import UploadForm
-from .models import Sender, Tag, Document, Log
+from .models import Correspondent, Tag, Document, Log
 from .serialisers import (
-    SenderSerializer, TagSerializer, DocumentSerializer, LogSerializer)
+    CorrespondentSerializer, TagSerializer, DocumentSerializer, LogSerializer)
 
 
 class IndexView(TemplateView):
@@ -52,7 +51,7 @@ class FetchView(LoginRequiredMixin, DetailView):
             content_type=content_types[self.object.file_type]
         )
         response["Content-Disposition"] = 'attachment; filename="{}"'.format(
-            slugify(str(self.object)) + "." + self.object.file_type)
+            self.object.file_name)
 
         return response
 
@@ -81,10 +80,10 @@ class StandardPagination(PageNumberPagination):
     max_page_size = 100000
 
 
-class SenderViewSet(ModelViewSet):
-    model = Sender
-    queryset = Sender.objects.all()
-    serializer_class = SenderSerializer
+class CorrespondentViewSet(ModelViewSet):
+    model = Correspondent
+    queryset = Correspondent.objects.all()
+    serializer_class = CorrespondentSerializer
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
 
diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index 24a495810..e81d4dcf9 100644
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -22,11 +22,11 @@ from rest_framework.routers import DefaultRouter
 
 from documents.views import (
     IndexView, FetchView, PushView,
-    SenderViewSet, TagViewSet, DocumentViewSet, LogViewSet
+    CorrespondentViewSet, TagViewSet, DocumentViewSet, LogViewSet
 )
 
 router = DefaultRouter()
-router.register(r'senders', SenderViewSet)
+router.register(r'senders', CorrespondentViewSet)
 router.register(r'tags', TagViewSet)
 router.register(r'documents', DocumentViewSet)
 router.register(r'logs', LogViewSet)

From ba7878b9aa5b115ad91daddf387433a3948c7619 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Thu, 3 Mar 2016 21:25:08 +0000
Subject: [PATCH 51/71] Added some tests for the importer

---
 .../management/commands/document_importer.py  | 15 ++------
 src/documents/tests/test_importer.py          | 36 +++++++++++++++++++
 2 files changed, 38 insertions(+), 13 deletions(-)
 create mode 100644 src/documents/tests/test_importer.py

diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py
index 213c049e4..63c961815 100644
--- a/src/documents/management/commands/document_importer.py
+++ b/src/documents/management/commands/document_importer.py
@@ -20,13 +20,6 @@ class Command(Renderable, BaseCommand):
 
     def add_arguments(self, parser):
         parser.add_argument("source")
-        parser.add_argument(
-            '--ignore-absent',
-            action='store_true',
-            default=False,
-            help="If the manifest refers to a document that doesn't exist, "
-                 "ignore it and attempt to import what it can"
-        )
 
     def __init__(self, *args, **kwargs):
         BaseCommand.__init__(self, *args, **kwargs)
@@ -80,18 +73,14 @@ class Command(Renderable, BaseCommand):
             if "__exported_file_name__" not in record:
                 raise CommandError(
                     'The manifest file contains a record which does not '
-                    'refer to an actual document file.  If you want to import '
-                    'the rest anyway (skipping such references) call the '
-                    'importer with --ignore-absent'
+                    'refer to an actual document file.'
                 )
 
             doc_file = record["__exported_file_name__"]
             if not os.path.exists(os.path.join(self.source, doc_file)):
                 raise CommandError(
                     'The manifest file refers to "{}" which does not '
-                    'appear to be in the source directory.  If you want to '
-                    'import the rest anyway (skipping such references) call '
-                    'the importer with --ignore-absent'.format(doc_file)
+                    'appear to be in the source directory.'.format(doc_file)
                 )
 
     def _import_files_from_manifest(self):
diff --git a/src/documents/tests/test_importer.py b/src/documents/tests/test_importer.py
new file mode 100644
index 000000000..8880aba66
--- /dev/null
+++ b/src/documents/tests/test_importer.py
@@ -0,0 +1,36 @@
+from django.core.management.base import CommandError
+from django.test import TestCase
+
+from ..management.commands.document_importer import Command
+
+
+class TestImporter(TestCase):
+
+    def __init__(self, *args, **kwargs):
+        TestCase.__init__(self, *args, **kwargs)
+
+    def test_check_manifest_exists(self):
+        cmd = Command()
+        self.assertRaises(
+            CommandError, cmd._check_manifest_exists, "/tmp/manifest.json")
+
+    def test_check_manifest(self):
+
+        cmd = Command()
+        cmd.source = "/tmp"
+
+        cmd.manifest = [{"model": "documents.document"}]
+        with self.assertRaises(CommandError) as cm:
+            cmd._check_manifest()
+        self.assertTrue(
+            'The manifest file contains a record' in str(cm.exception))
+
+        cmd.manifest = [{
+            "model": "documents.document",
+            "__exported_file_name__": "noexist.pdf"
+        }]
+        # self.assertRaises(CommandError, cmd._check_manifest)
+        with self.assertRaises(CommandError) as cm:
+            cmd._check_manifest()
+        self.assertTrue(
+            'The manifest file refers to "noexist.pdf"' in str(cm.exception))

From 5d4587ef8b599fbe91c74740ded81e35d1b711f8 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Fri, 4 Mar 2016 09:14:50 +0000
Subject: [PATCH 52/71] Accounted for .sender in a few places

---
 src/documents/admin.py                        |  6 +--
 src/documents/consumer.py                     | 34 ++++++++--------
 src/documents/forms.py                        | 29 +++++++-------
 .../management/commands/document_exporter.py  | 39 +++++++++++++++++++
 .../migrations/0011_auto_20160303_1929.py     |  9 +++++
 src/documents/models.py                       | 13 ++++---
 src/documents/serialisers.py                  |  6 +--
 7 files changed, 95 insertions(+), 41 deletions(-)

diff --git a/src/documents/admin.py b/src/documents/admin.py
index 3baad817b..a5b523492 100644
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -45,9 +45,9 @@ class DocumentAdmin(admin.ModelAdmin):
             "all": ("paperless.css",)
         }
 
-    search_fields = ("sender__name", "title", "content")
-    list_display = ("created_", "sender", "title", "tags_", "document")
-    list_filter = ("tags", "sender", MonthListFilter)
+    search_fields = ("correspondent__name", "title", "content")
+    list_display = ("created_", "correspondent", "title", "tags_", "document")
+    list_filter = ("tags", "correspondent", MonthListFilter)
     list_per_page = 25
 
     def created_(self, obj):
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 4233cded8..eeb42cdf1 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -57,11 +57,11 @@ class Consumer(object):
         r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
         flags=re.IGNORECASE
     )
-    REGEX_SENDER_TITLE = re.compile(
+    REGEX_CORRESPONDENT_TITLE = re.compile(
         r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
         flags=re.IGNORECASE
     )
-    REGEX_SENDER_TITLE_TAGS = re.compile(
+    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
         r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
         flags=re.IGNORECASE
     )
@@ -238,16 +238,18 @@ class Consumer(object):
 
     def _guess_attributes_from_name(self, parseable):
         """
-        We use a crude naming convention to make handling the sender, title,
-        and tags easier:
-          "<sender> - <title> - <tags>.<suffix>"
-          "<sender> - <title>.<suffix>"
+        We use a crude naming convention to make handling the correspondent,
+        title, and tags easier:
+          "<correspondent> - <title> - <tags>.<suffix>"
+          "<correspondent> - <title>.<suffix>"
           "<title>.<suffix>"
         """
 
-        def get_sender(sender_name):
+        def get_correspondent(correspondent_name):
             return Correspondent.objects.get_or_create(
-                name=sender_name, defaults={"slug": slugify(sender_name)})[0]
+                name=correspondent_name,
+                defaults={"slug": slugify(correspondent_name)}
+            )[0]
 
         def get_tags(tags):
             r = []
@@ -262,27 +264,27 @@ class Consumer(object):
                 return "jpg"
             return suffix
 
-        # First attempt: "<sender> - <title> - <tags>.<suffix>"
-        m = re.match(self.REGEX_SENDER_TITLE_TAGS, parseable)
+        # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
+        m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable)
         if m:
             return (
-                get_sender(m.group(1)),
+                get_correspondent(m.group(1)),
                 m.group(2),
                 get_tags(m.group(3)),
                 get_suffix(m.group(4))
             )
 
-        # Second attempt: "<sender> - <title>.<suffix>"
-        m = re.match(self.REGEX_SENDER_TITLE, parseable)
+        # Second attempt: "<correspondent> - <title>.<suffix>"
+        m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable)
         if m:
             return (
-                get_sender(m.group(1)),
+                get_correspondent(m.group(1)),
                 m.group(2),
                 (),
                 get_suffix(m.group(3))
             )
 
-        # That didn't work, so we assume sender and tags are None
+        # That didn't work, so we assume correspondent and tags are None
         m = re.match(self.REGEX_TITLE, parseable)
         return None, m.group(1), (), get_suffix(m.group(2))
 
@@ -296,7 +298,7 @@ class Consumer(object):
         self.log("debug", "Saving record to database")
 
         document = Document.objects.create(
-            sender=sender,
+            correspondent=sender,
             title=title,
             content=text,
             file_type=file_type,
diff --git a/src/documents/forms.py b/src/documents/forms.py
index d8960f88b..d4c01745a 100644
--- a/src/documents/forms.py
+++ b/src/documents/forms.py
@@ -23,7 +23,7 @@ class UploadForm(forms.Form):
         "image/tiff": Document.TYPE_TIF,
     }
 
-    sender = forms.CharField(
+    correspondent = forms.CharField(
         max_length=Correspondent._meta.get_field("name").max_length,
         required=False
     )
@@ -34,18 +34,19 @@ class UploadForm(forms.Form):
     document = forms.FileField()
     signature = forms.CharField(max_length=256)
 
-    def clean_sender(self):
+    def clean_correspondent(self):
         """
         I suppose it might look cleaner to use .get_or_create() here, but that
-        would also allow someone to fill up the db with bogus senders before
-        all validation was met.
+        would also allow someone to fill up the db with bogus correspondents
+        before all validation was met.
         """
-        sender = self.cleaned_data.get("sender")
-        if not sender:
+        corresp = self.cleaned_data.get("correspondent")
+        if not corresp:
             return None
-        if not Correspondent.SAFE_REGEX.match(sender) or " - " in sender:
-            raise forms.ValidationError("That sender name is suspicious.")
-        return sender
+        if not Correspondent.SAFE_REGEX.match(corresp) or " - " in corresp:
+            raise forms.ValidationError(
+                "That correspondent name is suspicious.")
+        return corresp
 
     def clean_title(self):
         title = self.cleaned_data.get("title")
@@ -63,10 +64,10 @@ class UploadForm(forms.Form):
         return document, self.TYPE_LOOKUP[file_type]
 
     def clean(self):
-        sender = self.clened_data("sender")
+        corresp = self.clened_data("correspondent")
         title = self.cleaned_data("title")
         signature = self.cleaned_data("signature")
-        if sha256(sender + title + self.SECRET).hexdigest() == signature:
+        if sha256(corresp + title + self.SECRET).hexdigest() == signature:
             return True
         return False
 
@@ -77,13 +78,15 @@ class UploadForm(forms.Form):
         form do that as well.  Think of it as a poor-man's queue server.
         """
 
-        sender = self.clened_data("sender")
+        correspondent = self.clened_data("correspondent")
         title = self.cleaned_data("title")
         document, file_type = self.cleaned_data.get("document")
 
         t = int(mktime(datetime.now()))
         file_name = os.path.join(
-            Consumer.CONSUME, "{} - {}.{}".format(sender, title, file_type))
+            Consumer.CONSUME,
+            "{} - {}.{}".format(correspondent, title, file_type)
+        )
 
         with open(file_name, "wb") as f:
             f.write(document)
diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py
index 87ed804a2..913f7ae79 100644
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -22,6 +22,13 @@ class Command(Renderable, BaseCommand):
 
     def add_arguments(self, parser):
         parser.add_argument("target")
+        parser.add_argument(
+            "--legacy",
+            action="store_true",
+            help="Don't try to export all of the document data, just dump the "
+                 "original document files out in a format that makes "
+                 "re-consuming them easy."
+        )
 
     def __init__(self, *args, **kwargs):
         BaseCommand.__init__(self, *args, **kwargs)
@@ -40,6 +47,13 @@ class Command(Renderable, BaseCommand):
         if not settings.PASSPHRASE:
             settings.PASSPHRASE = input("Please enter the passphrase: ")
 
+        if options["legacy"]:
+            self.dump_legacy()
+        else:
+            self.dump()
+
+    def dump(self):
+
         documents = Document.objects.all()
         document_map = {d.pk: d for d in documents}
         manifest = json.loads(serializers.serialize("json", documents))
@@ -65,3 +79,28 @@ class Command(Renderable, BaseCommand):
 
         with open(os.path.join(self.target, "manifest.json"), "w") as f:
             json.dump(manifest, f, indent=2)
+
+    def dump_legacy(self):
+
+        for document in Document.objects.all():
+
+            target = os.path.join(
+                self.target, self._get_legacy_file_name(document))
+
+            print("Exporting: {}".format(target))
+
+            with open(target, "wb") as f:
+                f.write(GnuPG.decrypted(document.source_file))
+                t = int(time.mktime(document.created.timetuple()))
+                os.utime(target, times=(t, t))
+
+    @staticmethod
+    def _get_legacy_file_name(doc):
+        if doc.correspondent and doc.title:
+            tags = ",".join([t.slug for t in doc.tags.all()])
+            if tags:
+                return "{} - {} - {}.{}".format(
+                    doc.correspondent, doc.title, tags, doc.file_type)
+            return "{} - {}.{}".format(
+                doc.correspondent, doc.title, doc.file_type)
+        return os.path.basename(doc.source_path)
diff --git a/src/documents/migrations/0011_auto_20160303_1929.py b/src/documents/migrations/0011_auto_20160303_1929.py
index a9aefddaf..af4ee4c66 100644
--- a/src/documents/migrations/0011_auto_20160303_1929.py
+++ b/src/documents/migrations/0011_auto_20160303_1929.py
@@ -16,4 +16,13 @@ class Migration(migrations.Migration):
             old_name='Sender',
             new_name='Correspondent',
         ),
+        migrations.AlterModelOptions(
+            name='document',
+            options={'ordering': ('correspondent', 'title')},
+        ),
+        migrations.RenameField(
+            model_name='document',
+            old_name='sender',
+            new_name='correspondent',
+        ),
     ]
diff --git a/src/documents/models.py b/src/documents/models.py
index 0fb6489c4..a82f7643f 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -140,7 +140,7 @@ class Document(models.Model):
     TYPE_TIF = "tiff"
     TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
 
-    sender = models.ForeignKey(
+    correspondent = models.ForeignKey(
         Correspondent, blank=True, null=True, related_name="documents")
     title = models.CharField(max_length=128, blank=True, db_index=True)
     content = models.TextField(db_index=True)
@@ -155,14 +155,15 @@ class Document(models.Model):
     modified = models.DateTimeField(auto_now=True, editable=False)
 
     class Meta(object):
-        ordering = ("sender", "title")
+        ordering = ("correspondent", "title")
 
     def __str__(self):
         created = self.created.strftime("%Y%m%d%H%M%S")
-        if self.sender and self.title:
-            return "{}: {} - {}".format(created, self.sender, self.title)
-        if self.sender or self.title:
-            return "{}: {}".format(created, self.sender or self.title)
+        if self.correspondent and self.title:
+            return "{}: {} - {}".format(
+                created, self.correspondent, self.title)
+        if self.correspondent or self.title:
+            return "{}: {}".format(created, self.correspondent or self.title)
         return str(created)
 
     @property
diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py
index 340fdaa25..c2b2ae7fd 100644
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -20,8 +20,8 @@ class TagSerializer(serializers.HyperlinkedModelSerializer):
 
 class DocumentSerializer(serializers.ModelSerializer):
 
-    sender = serializers.HyperlinkedRelatedField(
-        read_only=True, view_name="drf:sender-detail", allow_null=True)
+    correspondent = serializers.HyperlinkedRelatedField(
+        read_only=True, view_name="drf:correspondent-detail", allow_null=True)
     tags = serializers.HyperlinkedRelatedField(
         read_only=True, view_name="drf:tag-detail", many=True)
 
@@ -29,7 +29,7 @@ class DocumentSerializer(serializers.ModelSerializer):
         model = Document
         fields = (
             "id",
-            "sender",
+            "correspondent",
             "title",
             "content",
             "file_type",

From 13c2ed66e13c493c25ca460f29f43aa1f0f5815d Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Fri, 4 Mar 2016 17:53:54 +0000
Subject: [PATCH 53/71] Better bare metal explanation

---
 docs/setup.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/setup.rst b/docs/setup.rst
index 077ce135c..9992418c1 100644
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -42,12 +42,13 @@ route`_ is quick & easy, but means you're running a VM which comes with memory
 consumption etc. We also `support Docker`_, which you can use natively under
 Linux and in a VM with `Docker Machine`_ (this guide was written for native
 Docker usage under Linux, you might have to adapt it for Docker Machine.)
-Alternatively the standard, `bare metal`_ approach is a little more complicated.
+Alternatively the standard, `bare metal`_ approach is a little more complicated,
+but worth it because it makes things easier should you want to contribute some
+code back.
 
 .. _Vagrant route: setup-installation-vagrant_
 .. _support Docker: setup-installation-docker_
 .. _bare metal: setup-installation-standard_
-
 .. _Docker Machine: https://docs.docker.com/machine/
 
 .. _setup-installation-standard:

From 94a7914073f1ba449f3c23b314be87e7418e90d4 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Fri, 4 Mar 2016 23:20:22 +0000
Subject: [PATCH 54/71] More descriptive

---
 docs/changelog.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/changelog.rst b/docs/changelog.rst
index 772e30dc0..d135d3564 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -4,7 +4,7 @@ Changelog
 * 0.1.1 (master)
 
   * `#68`_: Added support for using a proper config file at
-    ``/etc/paperless.conf``.
+    ``/etc/paperless.conf`` and modified the systemd unit files to use it.
   * Refactored the Vagrant installation process to use environment variables
     rather than asking the user to modify ``settings.py``.
   * `#44`_: Harmonise environment variable names with constant names.

From d24cfbb24652972b6c72f70a3eca4b78f22817f7 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Fri, 4 Mar 2016 23:22:57 +0000
Subject: [PATCH 55/71] Added the bit about s/sender/correspondent/g

---
 docs/changelog.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/changelog.rst b/docs/changelog.rst
index d135d3564..2228c9be1 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -3,6 +3,10 @@ Changelog
 
 * 0.1.1 (master)
 
+  * Potentially **Breaking Change**: All references to "sender" in the code
+    have been renamed to "correspondent" to better reflect the nature of the
+    property (one could quite reasonably scan a document before sending it to
+    someone.)
   * `#68`_: Added support for using a proper config file at
     ``/etc/paperless.conf`` and modified the systemd unit files to use it.
   * Refactored the Vagrant installation process to use environment variables

From 1ffce8f52d90e27dbfcb4863c57448a6cb5c5666 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Fri, 4 Mar 2016 23:59:13 +0000
Subject: [PATCH 56/71] Documented the API and added some help for the config
 file

---
 docs/api.rst              | 23 +++++++++++++++++++++++
 docs/consumption.rst      | 13 ++++++-------
 paperless.conf.example    | 32 ++++++++++++++++++++++++++++++++
 scripts/vagrant-provision | 22 +++-------------------
 4 files changed, 64 insertions(+), 26 deletions(-)
 create mode 100644 docs/api.rst
 create mode 100644 paperless.conf.example

diff --git a/docs/api.rst b/docs/api.rst
new file mode 100644
index 000000000..15ca9bc44
--- /dev/null
+++ b/docs/api.rst
@@ -0,0 +1,23 @@
+.. _api:
+
+The REST API
+############
+
+Paperless makes use of the `Django REST Framework`_ standard API interface
+because of its inherent awesomeness.  Conveniently, the system is also
+self-documenting, so to learn more about the access points, schema, what's
+accepted and what isn't, you need only visit ``/api`` on your local Paperless
+installation.
+
+.. _Django REST Framework: http://django-rest-framework.org/
+
+
+.. _api-uploading:
+
+Uploading
+---------
+
+File uploads in an API are hard and so far as I've been able to tell, there's
+no standard way of accepting them, so rather than crowbar file uploads into the
+REST API and endure that headache, I've left that process to a simple HTTP
+POST, documented on the :ref:`consumption page <consumption-http>`.
diff --git a/docs/consumption.rst b/docs/consumption.rst
index 0f8ff7ca5..eadf12823 100644
--- a/docs/consumption.rst
+++ b/docs/consumption.rst
@@ -40,9 +40,9 @@ follow the :ref:`consumer <utilities-consumer>` instructions to get it running.
 A Note on File Naming
 ---------------------
 
-Any document you put into the consumption directory will be consumed, but if you
-name the file right, it'll automatically set some values in the database for
-you.  This is is the logic the consumer follows:
+Any document you put into the consumption directory will be consumed, but if
+you name the file right, it'll automatically set some values in the database
+for you.  This is the logic the consumer follows:
 
 1. Try to find the correspondent, title, and tags in the file name following
    the pattern: ``Correspondent - Title - tag,tag,tag.pdf``.
@@ -111,11 +111,10 @@ So, with all that in mind, here's what you do to get it running:
 HTTP POST
 =========
 
-Currently, the API is limited to only handling file uploads, it doesn't do tags
-yet, and the URL schema isn't concrete, but it's a start.  It's also not much of
-a real API, it's just a URL that accepts an HTTP POST.
+You can also submit a document via HTTP POST.  It doesn't do tags yet, and the
+URL schema isn't concrete, but it's a start.
 
-To push your document to *Paperless*, send an HTTP POST to the server with the
+To push your document to Paperless, send an HTTP POST to the server with the
 following name/value pairs:
 
 * ``correspondent``: The name of the document's correspondent.  Note that there
diff --git a/paperless.conf.example b/paperless.conf.example
new file mode 100644
index 000000000..fa65c35b9
--- /dev/null
+++ b/paperless.conf.example
@@ -0,0 +1,32 @@
+# Sample paperless.conf
+# Copy this file to /etc/paperless.conf and modify it to suit your needs.
+
+# This is where your documents should go to be consumed.  Make sure that it exists
+# and that the user running the paperless service can read/write its contents
+# before you start Paperless.
+PAPERLESS_CONSUMPTION_DIR=""
+
+# These values are required if you want paperless to check a particular email
+# box every 10 minutes and attempt to consume documents from there.  If you
+# dont define a HOST, mail checking will just be disabled.
+PAPERLESS_CONSUME_MAIL_HOST=""
+PAPERLESS_CONSUME_MAIL_PORT=""
+PAPERLESS_CONSUME_MAIL_USER=""
+PAPERLESS_CONSUME_MAIL_PASS=""
+
+# You must have a passphrase in order for Paperless to work at all.  If you set
+# this to "", GNUGPG will "encrypt" your PDF by writing it out as a zero-byte
+# file.
+# The passphrase you use here will be used when storing your documents in
+# Paperless, but you can always export them in an unencrypted format by using
+# document exporter.  See the documentaiton for more information.
+#
+# One final note about the passphrase.  Once you've consumed a document with
+# one passphrase, DON'T CHANGE IT.  Paperless assumes this to be a constant and
+# can't properly export documents that were encrypted with an old passphrase if
+# you've since changed it to a new one.
+PAPERLESS_PASSPHRASE="secret"
+
+# If you intend to consume documents either via HTTP POST or by email, you must
+# have a shared secret here.
+PAPERLESS_SHARED_SECRET=""
diff --git a/scripts/vagrant-provision b/scripts/vagrant-provision
index 2a744d5d3..0a09058e4 100644
--- a/scripts/vagrant-provision
+++ b/scripts/vagrant-provision
@@ -11,25 +11,9 @@ apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick
 pip3 install -r /opt/paperless/requirements.txt
 
 # Create the environment file
-echo "
-# This where your documents should go to be consumed.  Make sure that it exists
-# before you start Paperless.
-PAPERLESS_CONSUMPTION_DIR='/home/vagrant/consumption'
-
-# This is the secret passphrase used to encrypt the documents once they have
-# been consumed.  Change it to whatever you like, but you shouldn't change it
-# after it has been used to consume a document or you won't be able to read
-# that document again.
-PAPERLESS_PASSPHRASE='secret'
-
-# This is the secret string used to verify PDFs sent by mail or consumed via
-# the API.  If you don't plan to use either of these, you can safely leave it
-# blank
-PAPERLESS_SHARED_SECRET=''
-" > /tmp/paperless.conf
-chmod 0640 /tmp/paperless.conf
-chown root:vagrant /tmp/paperless.conf
-mv /tmp/paperless.conf /etc/
+cat /opt/paperless/paperless.conf.example | sed -e 's#CONSUMPTION_DIR=""#CONSUMPTION_DIR="/home/vagrant/consumption"#' > /etc/paperless.conf
+chmod 0640 /etc/paperless.conf
+chown root:vagrant /etc/paperless.conf
 
 # Create the consumption directory
 mkdir /home/vagrant/consumption

From eb05707f2788db885d33be800fd22392a1b6692c Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sat, 5 Mar 2016 00:01:09 +0000
Subject: [PATCH 57/71] Added link to the api doc page

---
 docs/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/index.rst b/docs/index.rst
index fc78f6f23..47710d376 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -30,6 +30,7 @@ Contents
    requirements
    setup
    consumption
+   api
    utilities
    migrating
    changelog

From 8b5416896d80695a4cfbea125bba3950baf8ff57 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sat, 5 Mar 2016 00:03:45 +0000
Subject: [PATCH 58/71] Grammar & formatting

---
 paperless.conf.example | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paperless.conf.example b/paperless.conf.example
index fa65c35b9..3ee429ea8 100644
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -8,7 +8,7 @@ PAPERLESS_CONSUMPTION_DIR=""
 
 # These values are required if you want paperless to check a particular email
 # box every 10 minutes and attempt to consume documents from there.  If you
-# dont define a HOST, mail checking will just be disabled.
+# don't define a HOST, mail checking will just be disabled.
 PAPERLESS_CONSUME_MAIL_HOST=""
 PAPERLESS_CONSUME_MAIL_PORT=""
 PAPERLESS_CONSUME_MAIL_USER=""
@@ -17,6 +17,7 @@ PAPERLESS_CONSUME_MAIL_PASS=""
 # You must have a passphrase in order for Paperless to work at all.  If you set
 # this to "", GNUGPG will "encrypt" your PDF by writing it out as a zero-byte
 # file.
+#
 # The passphrase you use here will be used when storing your documents in
 # Paperless, but you can always export them in an unencrypted format by using
 # document exporter.  See the documentaiton for more information.

From 5c41e717f0cc718f86c91c6179cafbe4b3e9bd56 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sat, 5 Mar 2016 00:23:14 +0000
Subject: [PATCH 59/71] Missed one case of 'sender'

---
 src/paperless/urls.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index e81d4dcf9..4b73dc88e 100644
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -26,7 +26,7 @@ from documents.views import (
 )
 
 router = DefaultRouter()
-router.register(r'senders', CorrespondentViewSet)
+router.register(r'correspondents', CorrespondentViewSet)
 router.register(r'tags', TagViewSet)
 router.register(r'documents', DocumentViewSet)
 router.register(r'logs', LogViewSet)

From 52f15b4de14c3dbaa4969fb6f4e5382d47c230a5 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sat, 5 Mar 2016 01:57:49 +0000
Subject: [PATCH 60/71] The first stages of getting thumbnails back

---
 .gitignore                                    |   5 +-
 media/documents/originals/.keep               |   0
 media/documents/thumbnails/.keep              |   0
 .../migrations/0012_auto_20160305_0040.py     | 101 ++++++++++++++++++
 src/documents/models.py                       |  20 +++-
 src/documents/views.py                        |   8 +-
 src/paperless/urls.py                         |   6 +-
 7 files changed, 135 insertions(+), 5 deletions(-)
 create mode 100644 media/documents/originals/.keep
 create mode 100644 media/documents/thumbnails/.keep
 create mode 100644 src/documents/migrations/0012_auto_20160305_0040.py

diff --git a/.gitignore b/.gitignore
index d4c3fe38e..3c8b8ffea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -57,7 +57,9 @@ docs/_build/
 target/
 
 # Stored PDFs
-media/*
+media/documents/*.gpg
+media/documents/thumbnails/*.gpg
+media/documents/originals/*.gpg
 
 # Sqlite database
 db.sqlite3
@@ -74,4 +76,3 @@ docker-compose.env
 # Used for development
 scripts/import-for-development
 environment
-
diff --git a/media/documents/originals/.keep b/media/documents/originals/.keep
new file mode 100644
index 000000000..e69de29bb
diff --git a/media/documents/thumbnails/.keep b/media/documents/thumbnails/.keep
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py
new file mode 100644
index 000000000..e42c6cde5
--- /dev/null
+++ b/src/documents/migrations/0012_auto_20160305_0040.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9.2 on 2016-03-05 00:40
+from __future__ import unicode_literals
+
+import gnupg
+import os
+import re
+import shutil
+import subprocess
+import tempfile
+
+from django.conf import settings
+from django.db import migrations
+
+
+class GnuPG(object):
+    """
+    A handy singleton to use when handling encrypted files.
+    """
+
+    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
+
+    @classmethod
+    def decrypted(cls, file_handle):
+        return cls.gpg.decrypt_file(
+            file_handle, passphrase=settings.PASSPHRASE).data
+
+    @classmethod
+    def encrypted(cls, file_handle):
+        return cls.gpg.encrypt_file(
+            file_handle,
+            recipients=None,
+            passphrase=settings.PASSPHRASE,
+            symmetric=True
+        ).data
+
+
+def move_documents_and_create_thumbnails(apps, schema_editor):
+
+    documents = os.listdir(os.path.join(settings.MEDIA_ROOT, "documents"))
+
+    if not documents:
+        return
+
+    print("\n")
+
+    for f in sorted(documents):
+
+        if not f.endswith("gpg"):
+            continue
+
+        print("    * Generating a thumbnail for {}".format(f))
+
+        thumb_temp = tempfile.mkdtemp(
+            prefix="paperless", dir=settings.SCRATCH_DIR)
+        orig_temp = tempfile.mkdtemp(
+            prefix="paperless", dir=settings.SCRATCH_DIR)
+
+        orig_source = os.path.join(settings.MEDIA_ROOT, "documents", f)
+        orig_target = os.path.join(orig_temp, f.replace(".gpg", ""))
+
+        with open(orig_source, "rb") as encrypted:
+            with open(orig_target, "wb") as unencrypted:
+                unencrypted.write(GnuPG.decrypted(encrypted))
+
+        subprocess.Popen((
+            settings.CONVERT_BINARY,
+            "-scale", "500x500",
+            orig_target,
+            os.path.join(thumb_temp, "convert-%04d.jpg")
+        )).wait()
+
+        thumb_source = os.path.join(thumb_temp, "convert-0000.jpg")
+        thumb_target = os.path.join(
+            settings.MEDIA_ROOT,
+            "documents",
+            "thumbnails",
+            re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.jpg\\2", f)
+        )
+        with open(thumb_source, "rb") as unencrypted:
+            with open(thumb_target, "wb") as encrypted:
+                encrypted.write(GnuPG.encrypted(unencrypted))
+
+        shutil.rmtree(thumb_temp)
+        shutil.rmtree(orig_temp)
+
+        shutil.move(
+            os.path.join(settings.MEDIA_ROOT, "documents", f),
+            os.path.join(settings.MEDIA_ROOT, "documents", "originals", f),
+        )
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '0011_auto_20160303_1929'),
+    ]
+
+    operations = [
+        migrations.RunPython(move_documents_and_create_thumbnails),
+    ]
diff --git a/src/documents/models.py b/src/documents/models.py
index a82f7643f..a3ffb8a74 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -171,6 +171,7 @@ class Document(models.Model):
         return os.path.join(
             settings.MEDIA_ROOT,
             "documents",
+            "originals",
             "{:07}.{}.gpg".format(self.pk, self.file_type)
         )
 
@@ -184,7 +185,24 @@ class Document(models.Model):
 
     @property
     def download_url(self):
-        return reverse("fetch", kwargs={"pk": self.pk})
+        return reverse("fetch", kwargs={"kind": "doc", "pk": self.pk})
+
+    @property
+    def thumbnail_path(self):
+        return os.path.join(
+            settings.MEDIA_ROOT,
+            "documents",
+            "thumbnails",
+            "{:07}.jpg.gpg".format(self.pk)
+        )
+
+    @property
+    def thumbnail_file(self):
+        return open(self.thumbnail_path, "rb")
+
+    @property
+    def thumbnail_url(self):
+        return reverse("fetch", kwargs={"kind": "thumb", "pk": self.pk})
 
 
 class Log(models.Model):
diff --git a/src/documents/views.py b/src/documents/views.py
index ff7c4ce05..4a4a060bf 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -35,7 +35,7 @@ class FetchView(LoginRequiredMixin, DetailView):
 
     def render_to_response(self, context, **response_kwargs):
         """
-        Override the default to return the unencrypted PDF as raw data.
+        Override the default to return the unencrypted image/PDF as raw data.
         """
 
         content_types = {
@@ -46,6 +46,12 @@ class FetchView(LoginRequiredMixin, DetailView):
             Document.TYPE_TIF: "image/tiff",
         }
 
+        if self.kwargs["kind"] == "thumb":
+            return HttpResponse(
+                GnuPG.decrypted(self.object.thumb_file),
+                content_type=content_types[Document.TYPE_JPG]
+            )
+
         response = HttpResponse(
             GnuPG.decrypted(self.object.source_file),
             content_type=content_types[self.object.file_type]
diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index 4b73dc88e..a7775a588 100644
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -44,7 +44,11 @@ urlpatterns = [
     # url(r"^$", IndexView.as_view(), name="index"),
 
     # File downloads
-    url(r"^fetch/(?P<pk>\d+)$", FetchView.as_view(), name="fetch"),
+    url(
+        r"^fetch/(?P<kind>doc|thumb)/(?P<pk>\d+)$",
+        FetchView.as_view(),
+        name="fetch"
+    ),
 
     # The Django admin
     url(r"admin/", admin.site.urls),

From 8a9ea4664c01f104436cfc89119f7429050841dd Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sat, 5 Mar 2016 02:15:26 +0000
Subject: [PATCH 61/71] Cleaned up the thumbnails by switching to .png

---
 src/documents/migrations/0012_auto_20160305_0040.py | 9 +++++----
 src/documents/models.py                             | 2 +-
 src/documents/views.py                              | 6 +++---
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py
index e42c6cde5..876c2c68e 100644
--- a/src/documents/migrations/0012_auto_20160305_0040.py
+++ b/src/documents/migrations/0012_auto_20160305_0040.py
@@ -65,17 +65,18 @@ def move_documents_and_create_thumbnails(apps, schema_editor):
 
         subprocess.Popen((
             settings.CONVERT_BINARY,
-            "-scale", "500x500",
+            "-scale", "500x5000",
+            "-alpha", "remove",
             orig_target,
-            os.path.join(thumb_temp, "convert-%04d.jpg")
+            os.path.join(thumb_temp, "convert-%04d.png")
         )).wait()
 
-        thumb_source = os.path.join(thumb_temp, "convert-0000.jpg")
+        thumb_source = os.path.join(thumb_temp, "convert-0000.png")
         thumb_target = os.path.join(
             settings.MEDIA_ROOT,
             "documents",
             "thumbnails",
-            re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.jpg\\2", f)
+            re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f)
         )
         with open(thumb_source, "rb") as unencrypted:
             with open(thumb_target, "wb") as encrypted:
diff --git a/src/documents/models.py b/src/documents/models.py
index a3ffb8a74..b8baea7f8 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -193,7 +193,7 @@ class Document(models.Model):
             settings.MEDIA_ROOT,
             "documents",
             "thumbnails",
-            "{:07}.jpg.gpg".format(self.pk)
+            "{:07}.png.gpg".format(self.pk)
         )
 
     @property
diff --git a/src/documents/views.py b/src/documents/views.py
index 4a4a060bf..1dc23aa4f 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -29,7 +29,7 @@ class IndexView(TemplateView):
         return TemplateView.get_context_data(self, **kwargs)
 
 
-class FetchView(LoginRequiredMixin, DetailView):
+class FetchView(DetailView):
 
     model = Document
 
@@ -48,8 +48,8 @@ class FetchView(LoginRequiredMixin, DetailView):
 
         if self.kwargs["kind"] == "thumb":
             return HttpResponse(
-                GnuPG.decrypted(self.object.thumb_file),
-                content_type=content_types[Document.TYPE_JPG]
+                GnuPG.decrypted(self.object.thumbnail_file),
+                content_type=content_types[Document.TYPE_PNG]
             )
 
         response = HttpResponse(

From 495ed1c36c9c8ebb120449ad0bdac9be27255f3c Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sat, 5 Mar 2016 12:09:06 +0000
Subject: [PATCH 62/71] Added thumbnail generation to the consumer

---
 src/documents/consumer.py | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index eeb42cdf1..5cfc20852 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -119,10 +119,11 @@ class Consumer(object):
 
             tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
             pngs = self._get_greyscale(tempdir, doc)
+            thumbnail = self._get_thumbnail(tempdir, doc)
 
             try:
                 text = self._get_ocr(pngs)
-                self._store(text, doc)
+                self._store(text, doc, thumbnail)
             except OCRError as e:
                 self._ignore.append(doc)
                 self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
@@ -133,6 +134,9 @@ class Consumer(object):
                 self._cleanup_doc(doc)
 
     def _get_greyscale(self, tempdir, doc):
+        """
+        Greyscale images are easier for Tesseract to OCR
+        """
 
         self.log("info", "Generating greyscale image from {}".format(doc))
 
@@ -150,6 +154,23 @@ class Consumer(object):
 
         return sorted(filter(lambda __: os.path.isfile(__), pngs))
 
+    def _get_thumbnail(self, tempdir, doc):
+        """
+        The thumbnail of a PDF is just a 500px wide image of the first page.
+        """
+
+        self.log("info", "Generating the thumbnail")
+
+        subprocess.Popen((
+            self.CONVERT,
+            "-scale", "500x5000",
+            "-alpha", "remove",
+            doc,
+            os.path.join(tempdir, "convert-%04d.png")
+        )).wait()
+
+        return os.path.join(tempdir, "convert-0000.png")
+
     def _guess_language(self, text):
         try:
             guess = langdetect.detect(text)
@@ -288,7 +309,7 @@ class Consumer(object):
         m = re.match(self.REGEX_TITLE, parseable)
         return None, m.group(1), (), get_suffix(m.group(2))
 
-    def _store(self, text, doc):
+    def _store(self, text, doc, thumbnail):
 
         sender, title, tags, file_type = self._guess_attributes_from_name(doc)
         relevant_tags = set(list(Tag.match_all(text)) + list(tags))
@@ -313,9 +334,16 @@ class Consumer(object):
             self.log("debug", "Tagging with {}".format(tag_names))
             document.tags.add(*relevant_tags)
 
+        # Encrypt and store the actual document
         with open(doc, "rb") as unencrypted:
             with open(document.source_path, "wb") as encrypted:
-                self.log("debug", "Encrypting")
+                self.log("debug", "Encrypting the document")
+                encrypted.write(GnuPG.encrypted(unencrypted))
+
+        # Encrypt and store the thumbnail
+        with open(thumbnail, "rb") as unencrypted:
+            with open(document.thumbnail_path, "wb") as encrypted:
+                self.log("debug", "Encrypting the thumbnail")
                 encrypted.write(GnuPG.encrypted(unencrypted))
 
         self.log("info", "Completed")

From ac40aee805a7289a721469b50f146d5c3801cdfe Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sat, 5 Mar 2016 12:31:43 +0000
Subject: [PATCH 63/71] Added some nice output so the migration is less scary

---
 .../migrations/0012_auto_20160305_0040.py        | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py
index 876c2c68e..618ace5d8 100644
--- a/src/documents/migrations/0012_auto_20160305_0040.py
+++ b/src/documents/migrations/0012_auto_20160305_0040.py
@@ -11,6 +11,7 @@ import tempfile
 
 from django.conf import settings
 from django.db import migrations
+from django.utils.termcolors import colorize as colourise  # Spelling hurts me
 
 
 class GnuPG(object):
@@ -42,14 +43,25 @@ def move_documents_and_create_thumbnails(apps, schema_editor):
     if not documents:
         return
 
-    print("\n")
+    print(colourise(
+        "\n\n"
+        "  This is a one-time only migration to generate thumbnails for all of your\n"
+        "  documents so that future UIs will have something to work with.  If you have\n"
+        "  a lot of documents though, this may take a while, so a coffee break may be\n"
+        "  in order."
+        "\n", opts=("bold",)
+    ))
 
     for f in sorted(documents):
 
         if not f.endswith("gpg"):
             continue
 
-        print("    * Generating a thumbnail for {}".format(f))
+        print("    {} {} {}".format(
+            colourise("*", fg="green"),
+            colourise("Generating a thumbnail for", fg="white"),
+            colourise(f, fg="cyan")
+        ))
 
         thumb_temp = tempfile.mkdtemp(
             prefix="paperless", dir=settings.SCRATCH_DIR)

From 034b96277cbb15ca178b3c60ea10a1db0da0c782 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sat, 5 Mar 2016 12:34:26 +0000
Subject: [PATCH 64/71] Added thumbnail_url to the API

---
 src/documents/serialisers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py
index c2b2ae7fd..db50d34ba 100644
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -37,7 +37,8 @@ class DocumentSerializer(serializers.ModelSerializer):
             "created",
             "modified",
             "file_name",
-            "download_url"
+            "download_url",
+            "thumbnail_url",
         )
 
 

From bfad4560e139257ec81ccd284984001ee53bfce9 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sat, 5 Mar 2016 12:43:05 +0000
Subject: [PATCH 65/71] Fixed the check for empty installations

---
 src/documents/migrations/0012_auto_20160305_0040.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py
index 618ace5d8..62a5c65bc 100644
--- a/src/documents/migrations/0012_auto_20160305_0040.py
+++ b/src/documents/migrations/0012_auto_20160305_0040.py
@@ -40,7 +40,7 @@ def move_documents_and_create_thumbnails(apps, schema_editor):
 
     documents = os.listdir(os.path.join(settings.MEDIA_ROOT, "documents"))
 
-    if not documents:
+    if set(documents) == {"originals", "thumbnails"}:
         return
 
     print(colourise(

From 9180ad78c4a475f50d8e90d2051545c6f5ff8942 Mon Sep 17 00:00:00 2001
From: Pit Kleyersburg <pitkley@googlemail.com>
Date: Sun, 6 Mar 2016 14:39:28 +0100
Subject: [PATCH 66/71] Update Dockerfile to match latest version

---
 Dockerfile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index fec76ee37..eb9fa90dd 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -16,7 +16,11 @@ RUN pip install --no-cache-dir -r requirements.txt
 
 # Copy application
 RUN mkdir -p /usr/src/paperless/src
+RUN mkdir -p /usr/src/paperless/data
+RUN mkdir -p /usr/src/paperless/media
 COPY src/ /usr/src/paperless/src/
+COPY data/ /usr/src/paperless/data/
+COPY media/ /usr/src/paperless/media/
 
 # Set consumption directory
 ENV PAPERLESS_CONSUMPTION_DIR /consume
@@ -24,7 +28,6 @@ RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR
 
 # Migrate database
 WORKDIR /usr/src/paperless/src
-RUN mkdir /usr/src/paperless/data
 RUN ./manage.py migrate
 
 # Create user

From fb36a49c2681aa5362e30f266e85c89565a310c3 Mon Sep 17 00:00:00 2001
From: Pit Kleyersburg <pitkley@googlemail.com>
Date: Tue, 16 Feb 2016 10:49:55 +0100
Subject: [PATCH 67/71] Add unpaper as another pre-processing step

---
 Dockerfile                |  2 +-
 docs/requirements.rst     |  2 +
 scripts/vagrant-provision |  2 +-
 src/documents/consumer.py | 80 ++++++++++++++++++++++-----------------
 src/paperless/settings.py |  3 ++
 5 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index eb9fa90dd..a13fa7b3f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,7 +5,7 @@ MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
 RUN apt-get update \
     && apt-get install -y --no-install-recommends \
         sudo \
-        tesseract-ocr tesseract-ocr-eng imagemagick ghostscript \
+        tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
     && rm -rf /var/lib/apt/lists/*
 
 # Install python dependencies
diff --git a/docs/requirements.rst b/docs/requirements.rst
index ee287d835..36bc234c0 100644
--- a/docs/requirements.rst
+++ b/docs/requirements.rst
@@ -10,11 +10,13 @@ should work) that has the following software installed on it:
 * `GNU Privacy Guard`_
 * `Tesseract`_
 * `Imagemagick`_
+* `unpaper`_
 
 .. _Python3: https://python.org/
 .. _GNU Privacy Guard: https://gnupg.org
 .. _Tesseract: https://github.com/tesseract-ocr
 .. _Imagemagick: http://imagemagick.org/
+.. _unpaper: https://www.flameeyes.eu/projects/unpaper
 
 Notably, you should confirm how you access your Python3 installation.  Many
 Linux distributions will install Python3 in parallel to Python2, using the names
diff --git a/scripts/vagrant-provision b/scripts/vagrant-provision
index 0a09058e4..940bf476c 100644
--- a/scripts/vagrant-provision
+++ b/scripts/vagrant-provision
@@ -5,7 +5,7 @@ apt-get update
 apt-get build-dep -y python-imaging
 apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
 apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
-apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick
+apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick unpaper
 
 # Python dependencies
 pip3 install -r /opt/paperless/requirements.txt
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 5cfc20852..fbdbbc276 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -39,8 +39,8 @@ class ConsumerError(Exception):
 class Consumer(object):
     """
     Loop over every file found in CONSUMPTION_DIR and:
-      1. Convert it to a greyscale png
-      2. Use tesseract on the png
+      1. Convert it to a greyscale pnm
+      2. Use tesseract on the pnm
       3. Encrypt and store the document in the MEDIA_ROOT
       4. Store the OCR'd text in the database
       5. Delete the document and image(s)
@@ -48,6 +48,7 @@ class Consumer(object):
 
     SCRATCH = settings.SCRATCH_DIR
     CONVERT = settings.CONVERT_BINARY
+    UNPAPER = settings.UNPAPER_BINARY
     CONSUME = settings.CONSUMPTION_DIR
     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 
@@ -118,11 +119,11 @@ class Consumer(object):
             self.log("info", "Consuming {}".format(doc))
 
             tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
-            pngs = self._get_greyscale(tempdir, doc)
+            imgs = self._get_greyscale(tempdir, doc)
             thumbnail = self._get_thumbnail(tempdir, doc)
 
             try:
-                text = self._get_ocr(pngs)
+                text = self._get_ocr(imgs)
                 self._store(text, doc, thumbnail)
             except OCRError as e:
                 self._ignore.append(doc)
@@ -140,19 +141,30 @@ class Consumer(object):
 
         self.log("info", "Generating greyscale image from {}".format(doc))
 
-        png = os.path.join(tempdir, "convert-%04d.jpg")
-
+        # Convert PDF to multiple PNMs
+        pnm = os.path.join(tempdir, "convert-%04d.pnm")
         subprocess.Popen((
             self.CONVERT, "-density", "300", "-depth", "8",
-            "-type", "grayscale", doc, png
+            "-type", "grayscale", doc, pnm
         )).wait()
 
-        pngs = []
+        # Get a list of converted images
+        pnms = []
         for f in os.listdir(tempdir):
-            if f.startswith("convert"):
-                pngs.append(os.path.join(tempdir, f))
+            if f.endswith(".pnm"):
+                pnms.append(os.path.join(tempdir, f))
 
-        return sorted(filter(lambda __: os.path.isfile(__), pngs))
+        # Run unpaper in parallel on converted images
+        with Pool(processes=self.THREADS) as pool:
+            pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
+
+        # Return list of converted images, processed with unpaper
+        pnms = []
+        for f in os.listdir(tempdir):
+            if f.endswith(".unpaper.pnm"):
+                pnms.append(os.path.join(tempdir, f))
+
+        return sorted(filter(lambda __: os.path.isfile(__), pnms))
 
     def _get_thumbnail(self, tempdir, doc):
         """
@@ -179,21 +191,21 @@ class Consumer(object):
         except Exception as e:
             self.log("warning", "Language detection error: {}".format(e))
 
-    def _get_ocr(self, pngs):
+    def _get_ocr(self, imgs):
         """
         Attempts to do the best job possible OCR'ing the document based on
         simple language detection trial & error.
         """
 
-        if not pngs:
+        if not imgs:
             raise OCRError("No images found")
 
         self.log("info", "OCRing the document")
 
         # Since the division gets rounded down by int, this calculation works
         # for every edge-case, i.e. 1
-        middle = int(len(pngs) / 2)
-        raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)
+        middle = int(len(imgs) / 2)
+        raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
 
         guessed_language = self._guess_language(raw_text)
 
@@ -205,16 +217,16 @@ class Consumer(object):
                     "As FORGIVING_OCR is enabled, we're going to make the "
                     "best with what we have."
                 )
-                raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
+                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
                 return raw_text
             raise OCRError("Language detection failed")
 
         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
-            raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
+            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
             return raw_text
 
         try:
-            return self._ocr(pngs, ISO639[guessed_language])
+            return self._ocr(imgs, ISO639[guessed_language])
         except pyocr.pyocr.tesseract.TesseractError:
             if settings.FORGIVING_OCR:
                 self.log(
@@ -224,34 +236,34 @@ class Consumer(object):
                         guessed_language
                     )
                 )
-                raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
+                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
                 return raw_text
             raise OCRError(
                 "The guessed language is not available in this instance of "
                 "Tesseract."
             )
 
-    def _assemble_ocr_sections(self, pngs, middle, text):
+    def _assemble_ocr_sections(self, imgs, middle, text):
         """
         Given a `middle` value and the text that middle page represents, we OCR
         the remainder of the document and return the whole thing.
         """
-        text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
-        text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
+        text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
+        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
         return text
 
-    def _ocr(self, pngs, lang):
+    def _ocr(self, imgs, lang):
         """
         Performs a single OCR attempt.
         """
 
-        if not pngs:
+        if not imgs:
             return ""
 
         self.log("info", "Parsing for {}".format(lang))
 
         with Pool(processes=self.THREADS) as pool:
-            r = pool.map(image_to_string, itertools.product(pngs, [lang]))
+            r = pool.map(image_to_string, itertools.product(imgs, [lang]))
             r = " ".join(r)
 
         # Strip out excess white space to allow matching to go smoother
@@ -374,16 +386,9 @@ class Consumer(object):
 
 
 def image_to_string(args):
-    """
-    I have no idea why, but if this function were a method of Consumer, it
-    would explode with:
-
-      `TypeError: cannot serialize '_io.TextIOWrapper' object`.
-    """
-
-    png, lang = args
+    img, lang = args
     ocr = pyocr.get_available_tools()[0]
-    with Image.open(os.path.join(Consumer.SCRATCH, png)) as f:
+    with Image.open(os.path.join(Consumer.SCRATCH, img)) as f:
         if ocr.can_detect_orientation():
             try:
                 orientation = ocr.detect_orientation(f, lang=lang)
@@ -391,3 +396,10 @@ def image_to_string(args):
             except TesseractError:
                 pass
         return ocr.image_to_string(f, lang=lang)
+
+
+def run_unpaper(args):
+    unpaper, pnm = args
+    subprocess.Popen((
+        unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm")
+    )).wait()
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index f2fb41941..b7daecaf8 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -189,6 +189,9 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
 # Convert is part of the ImageMagick package
 CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY")
 
+# Unpaper
+UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
+
 # This will be created if it doesn't exist
 SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")
 

From 2fba41ad7530699e30fa2a0f1306c5811070665f Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 6 Mar 2016 16:03:02 +0000
Subject: [PATCH 68/71] Added the use of unpaper to the README

---
 README.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.rst b/README.rst
index 0aba0545e..80043ff7a 100644
--- a/README.rst
+++ b/README.rst
@@ -56,6 +56,7 @@ powerful tools.
 
 * `ImageMagick`_ converts the images between colour and greyscale.
 * `Tesseract`_ does the character recognition.
+* `Unpaper`_ despeckles and deskews the scanned image.
 * `GNU Privacy Guard`_ is used as the encryption backend.
 * `Python 3`_ is the language of the project.
 
@@ -93,6 +94,7 @@ home.
 .. _this one: http://www.brother.ca/en-CA/Scanners/11/ProductDetail/ADS1500W?ProductDetail=productdetail
 .. _ImageMagick: http://imagemagick.org/
 .. _Tesseract: https://github.com/tesseract-ocr
+.. _Unpaper: https://www.flameeyes.eu/projects/unpaper
 .. _GNU Privacy Guard: https://gnupg.org/
 .. _Python 3: https://python.org/
 .. _Pillow: https://pypi.python.org/pypi/pillowfight/

From 3b278c3a242752dcff3e878bdf8cf6c3b28332dc Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 6 Mar 2016 17:26:07 +0000
Subject: [PATCH 69/71] Added an informational log message for consumer start

---
 .../management/commands/document_consumer.py       |  9 +++++++++
 src/documents/managers.py                          |  4 ++--
 src/documents/models.py                            | 14 ++++++++++++++
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py
index 0eae5c80c..8116303b5 100644
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@@ -1,10 +1,12 @@
 import datetime
+import logging
 import os
 import time
 
 from django.conf import settings
 from django.core.management.base import BaseCommand, CommandError
 
+from ...models import Log
 from ...consumer import Consumer, ConsumerError
 from ...mail import MailFetcher, MailFetcherError
 
@@ -44,6 +46,13 @@ class Command(BaseCommand):
         except FileExistsError:
             pass
 
+        logging.getLogger(__name__).info(
+            "Starting document consumer at {}".format(
+                settings.CONSUMPTION_DIR
+            ),
+            extra={"component": Log.COMPONENT_CONSUMER}
+        )
+
         try:
             while True:
                 self.loop()
diff --git a/src/documents/managers.py b/src/documents/managers.py
index d7e7225eb..e7b0751ca 100644
--- a/src/documents/managers.py
+++ b/src/documents/managers.py
@@ -4,7 +4,7 @@ from django.db import models
 from django.db.models.aggregates import Max
 
 
-class Concat(models.Aggregate):
+class GroupConcat(models.Aggregate):
     """
     Theoretically, this should work in Sqlite, PostgreSQL, and MySQL, but I've
     only ever tested it in Sqlite.
@@ -60,7 +60,7 @@ class LogQuerySet(models.query.QuerySet):
     def by_group(self):
         return self.values("group").annotate(
             time=Max("modified"),
-            messages=Concat("message"),
+            messages=GroupConcat("message"),
         ).order_by("-time")
 
 
diff --git a/src/documents/models.py b/src/documents/models.py
index b8baea7f8..0d79dba0a 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -1,6 +1,7 @@
 import logging
 import os
 import re
+import uuid
 
 from django.conf import settings
 from django.core.urlresolvers import reverse
@@ -236,3 +237,16 @@ class Log(models.Model):
 
     def __str__(self):
         return self.message
+
+    def save(self, *args, **kwargs):
+        """
+        To allow for the case where we don't want to group the message, we
+        shouldn't force the caller to specify a one-time group value.  However,
+        allowing group=None means that the manager can't differentiate the
+        different un-grouped messages, so instead we set a random one here.
+        """
+
+        if not self.group:
+            self.group = uuid.uuid4()
+
+        models.Model.save(self, *args, **kwargs)

From f7e96eab724d8baa147d52c165047f234ae55856 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Sun, 6 Mar 2016 17:36:39 +0000
Subject: [PATCH 70/71] Put the lid on the changelog for 0.1.1

---
 docs/changelog.rst | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/docs/changelog.rst b/docs/changelog.rst
index 2228c9be1..f2ab6cabc 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,12 +1,20 @@
 Changelog
 #########
 
-* 0.1.1 (master)
+* 0.1.1
 
   * Potentially **Breaking Change**: All references to "sender" in the code
     have been renamed to "correspondent" to better reflect the nature of the
     property (one could quite reasonably scan a document before sending it to
     someone.)
+  * `#67`_: Rewrote the document exporter and added a new importer that allows
+    for full metadata retention without depending on the file name and
+    modification time.  A big thanks to `Tikitu de Jager`_, `Pit`_,
+    `Florian Jung`_, and `Christopher Luu`_ for their code snippets and
+    contributing conversation that led to this change.
+  * `#20`_: Added *unpaper* support to help in cleaning up the scanned image
+    before it's OCR'd.  Thanks to `Pit`_ for this one.
+  * `#71`_: Added (encrypted) thumbnails in anticipation of a proper UI.
   * `#68`_: Added support for using a proper config file at
     ``/etc/paperless.conf`` and modified the systemd unit files to use it.
   * Refactored the Vagrant installation process to use environment variables
@@ -69,14 +77,17 @@ Changelog
 
   * Initial release
 
-.. _Wayne Werner: https://github.com/waynew
 .. _Brian Conn: https://github.com/TheConnMan
+.. _Christopher Luu: https://github.com/nuudles
+.. _Florian Jung: https://github.com/the01
 .. _Tikitu de Jager: https://github.com/tikitu
-.. _Pit: https://github.com/pitkley
 .. _Paul: https://github.com/polo2ro
+.. _Pit: https://github.com/pitkley
+.. _Wayne Werner: https://github.com/waynew
 .. _darkmatter: https://github.com/darkmatter
 .. _zedster: https://github.com/zedster
 
+.. _#20: https://github.com/danielquinn/paperless/issues/20
 .. _#44: https://github.com/danielquinn/paperless/issues/44
 .. _#45: https://github.com/danielquinn/paperless/issues/45
 .. _#47: https://github.com/danielquinn/paperless/issues/47
@@ -85,4 +96,6 @@ Changelog
 .. _#54: https://github.com/danielquinn/paperless/issues/54
 .. _#57: https://github.com/danielquinn/paperless/issues/57
 .. _#60: https://github.com/danielquinn/paperless/issues/60
+.. _#67: https://github.com/danielquinn/paperless/issues/67
 .. _#68: https://github.com/danielquinn/paperless/issues/68
+.. _#71: https://github.com/danielquinn/paperless/issues/71

From 6ca389c28a2cd51294014e3ca6c7bdb65ab144d6 Mon Sep 17 00:00:00 2001
From: Daniel Quinn <code@danielquinn.org>
Date: Mon, 7 Mar 2016 10:12:55 +0000
Subject: [PATCH 71/71] #76

---
 src/documents/migrations/0012_auto_20160305_0040.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py
index 62a5c65bc..91d384c22 100644
--- a/src/documents/migrations/0012_auto_20160305_0040.py
+++ b/src/documents/migrations/0012_auto_20160305_0040.py
@@ -52,6 +52,11 @@ def move_documents_and_create_thumbnails(apps, schema_editor):
         "\n", opts=("bold",)
     ))
 
+    try:
+        os.makedirs(settings.SCRATCH_DIR)
+    except FileExistsError:
+        pass
+
     for f in sorted(documents):
 
         if not f.endswith("gpg"):