Merge branch 'master' into feature/api

2026-02-01 23:19:00 -06:00 · 2016-02-20 22:55:42 +00:00
parent cebc44f2c9 224f4acdc3
commit a5124cade6
17 changed files with 678 additions and 37 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -68,6 +68,8 @@ db.sqlite3
 # Other stuff that doesn't belong
 virtualenv
 .vagrant
+docker-compose.yml
+docker-compose.env

 # Used for development
 scripts/import-for-development
--- a/43
+++ b/43
@@ -0,0 +1,43 @@
+FROM python:3.5.1
+LABEL maintainer="Pit Kleyersburg <pitkley@googlemail.com>"
+
+# Install OS dependencies (sudo is used by the entrypoint to drop privileges)
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        sudo \
+        tesseract-ocr tesseract-ocr-eng imagemagick ghostscript \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install python dependencies (only the manifest is copied for this layer)
+RUN mkdir -p /usr/src/paperless
+WORKDIR /usr/src/paperless
+COPY requirements.txt /usr/src/paperless/
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application
+RUN mkdir -p /usr/src/paperless/src
+COPY src/ /usr/src/paperless/src/
+
+# Set consumption directory (also read by the entrypoint script)
+ENV PAPERLESS_CONSUME=/consume
+RUN mkdir -p "$PAPERLESS_CONSUME"
+
+# Migrate database (runs as root; ownership is handed over in the next step)
+WORKDIR /usr/src/paperless/src
+RUN mkdir /usr/src/paperless/data
+RUN ./manage.py migrate
+
+# Create user and give it the application directory
+RUN groupadd -g 1000 paperless \
+    && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
+    && chown -Rh paperless:paperless /usr/src/paperless
+
+# Setup entrypoint
+COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
+RUN chmod 755 /sbin/docker-entrypoint.sh
+
+# Mount volumes (declared after the directories above have been populated)
+VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume"]
+
+ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
+CMD ["--help"]
--- a/docker-compose.env.example
+++ b/docker-compose.env.example
@@ -0,0 +1,15 @@
+# Environment variables to set for Paperless
+# Commented out variables will be replaced by a default within Paperless.
+
+# Passphrase Paperless uses to encrypt and decrypt your documents
+PAPERLESS_PASSPHRASE=CHANGE_ME
+
+# The number of threads to use for text recognition
+# PAPERLESS_OCR_THREADS=4
+
+# Additional languages to install for text recognition
+# PAPERLESS_OCR_LANGUAGES=deu ita
+
+# You can change the default user and group id to a custom one
+# USERMAP_UID=1000
+# USERMAP_GID=1000
--- a/docker-compose.yml.example
+++ b/docker-compose.yml.example
@@ -0,0 +1,37 @@
+version: '2'
+
+services:
+    webserver:
+        image: paperless
+        ports:
+            # You can adapt the port you want Paperless to listen on by
+            # modifying the part before the `:`.
+            - "8000:8000"
+        volumes:
+            - paperless-data:/usr/src/paperless/data
+            - paperless-media:/usr/src/paperless/media
+        env_file: docker-compose.env
+        environment:
+            - PAPERLESS_OCR_LANGUAGES=
+        command: ["runserver", "0.0.0.0:8000"]
+
+    consumer:
+        image: paperless
+        volumes:
+            - paperless-data:/usr/src/paperless/data
+            - paperless-media:/usr/src/paperless/media
+            # You have to adapt the local path you want the consumption
+            # directory to mount to by modifying the part before the ':'.
+            - /path/to/arbitrary/place:/consume
+            # Likewise, you can add a local path to mount a directory for
+            # exporting. This is not strictly needed for Paperless to
+            # function; it is only required for exporting your files.
+            # Uncomment the line below and fill in a local path if you
+            # know you're going to want to export your documents.
+            # - /path/to/another/arbitrary/place:/export
+        env_file: docker-compose.env
+        command: ["document_consumer"]
+
+volumes:
+    paperless-data:
+    paperless-media:
--- a/docs/Dockerfile
+++ b/docs/Dockerfile
@@ -0,0 +1,18 @@
+FROM python:3.5.1
+LABEL maintainer="Pit Kleyersburg <pitkley@googlemail.com>"
+
+# Install Sphinx and Pygments (no pip cache is kept in the image layer)
+RUN pip install --no-cache-dir Sphinx Pygments
+
+# Setup directories, copy the build context into the image
+RUN mkdir /build
+COPY . /build
+WORKDIR /build/docs
+
+# Build documentation
+RUN make html
+
+# Serve the generated HTML directory with Python's built-in http.server
+WORKDIR /build/docs/_build/html
+EXPOSE 8000/tcp
+CMD ["python3", "-m", "http.server"]
--- a/docs/migrating.rst
+++ b/docs/migrating.rst
@@ -30,6 +30,20 @@ as part of the update:
 Note that it's possible (even likely) that while ``git pull`` may update some
 files, the ``migrate`` step may not update anything.  This is totally normal.

+If you are :ref:`using Docker <setup-installation-docker>` the update process
+requires only one additional step:
+
+.. code-block:: shell-session
+
+    $ cd /path/to/project
+    $ git pull
+    $ docker build -t paperless .
+    $ docker-compose up -d
+    $ docker-compose run --rm webserver migrate
+
+If ``git pull`` doesn't report any changes, there is no need to continue with
+the remaining steps.
+

 .. _migrating-backup:

@@ -53,6 +67,45 @@ with Django's ``dumpdata`` command, which produces JSON output.
    $ ./manage.py document_export /path/to/arbitrary/place/
    $ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json

+If you are :ref:`using Docker <setup-installation-docker>`, exporting your tags
+as JSON is almost as easy:
+
+.. code-block:: shell-session
+
+    $ docker-compose run --rm webserver dumpdata documents.Tag > /path/to/arbitrary/place/tags.json
+
+To export the documents you can either use ``docker run`` directly, specifying all
+the commandline options by hand, or (more simply) mount a second volume for export.
+
+To mount a volume for exports, follow the instructions in the
+``docker-compose.yml.example`` file for the ``/export`` volume (making the changes
+in your own ``docker-compose.yml`` file, of course). Once you have the
+volume mounted, the command to run an export is:
+
+.. code-block:: console
+
+   $ docker-compose run --rm consumer document_exporter /export
+
+If you prefer to use ``docker run`` directly, supplying the necessary commandline
+options:
+
+.. code-block:: shell-session
+
+   $ # Identify your containers
+   $ docker-compose ps
+           Name                       Command                State     Ports
+   -------------------------------------------------------------------------
+   paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0
+   paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0
+
+   $ # Make sure to replace your passphrase and remove or adapt the id mapping
+   $ docker run --rm \
+       --volumes-from paperless_webserver_1 \
+       --volume /path/to/arbitrary/place:/export \
+       -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \
+       -e USERMAP_UID=1000 -e USERMAP_GID=1000 \
+       paperless document_exporter /export
+

 .. _migrating-restoring:

@@ -77,3 +130,25 @@ exported documents into the consumption directory and start up the consumer.
    $ cp /path/to/exported/docs/* /path/to/consumption/dir/
    $ ./manage.py document_consumer

+Importing your data if you are :ref:`using Docker <setup-installation-docker>`
+is almost as simple:
+
+.. code-block:: shell-session
+
+    $ # Stop and remove your current containers
+    $ docker-compose stop
+    $ docker-compose rm -f
+
+    $ # Recreate them, add the superuser
+    $ docker-compose up -d
+    $ docker-compose run --rm webserver createsuperuser
+
+    $ # Load the tags
+    $ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin -
+
+    $ # Load your exported documents into the consumption directory
+    $ # (How you do this highly depends on how you have set this up)
+    $ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/
+
+After loading the documents into the consumption directory the consumer will
+immediately start consuming the documents.
--- a/docs/requirements.rst
+++ b/docs/requirements.rst
@@ -101,3 +101,16 @@ you'd like to generate your own docs locally, you'll need to:
    $ pip install sphinx

 and then cd into the ``docs`` directory and type ``make html``.
+
+If you are using Docker, you can use the following commands to build the
+documentation and run a webserver serving it on `port 8001`_:
+
+.. code:: bash
+
+    $ pwd
+    /path/to/paperless
+
+    $ docker build -t paperless:docs -f docs/Dockerfile .
+    $ docker run --rm -it -p "8001:8000" paperless:docs
+
+.. _port 8001: http://127.0.0.1:8001
--- a/docs/setup.rst
+++ b/docs/setup.rst
@@ -37,11 +37,18 @@ or just download the tarball and go that route:
 Installation & Configuration
 ----------------------------

-You can go two routes with setting up and running Paperless.  The *Vagrant*
-route is quick & easy, but means you're running a VM which comes with memory
-consumption etc.  Alternatively the standard, "bare metal" approach is a little
-more complicated.
+You can go multiple routes with setting up and running Paperless. The `Vagrant
+route`_ is quick & easy, but means you're running a VM which comes with memory
+consumption etc. We also `support Docker`_, which you can use natively under
+Linux and in a VM with `Docker Machine`_ (this guide was written for native
+Docker usage under Linux, you might have to adapt it for Docker Machine.)
+Alternatively the standard, `bare metal`_ approach is a little more complicated.

+.. _Vagrant route: setup-installation-vagrant_
+.. _support Docker: setup-installation-docker_
+.. _bare metal: setup-installation-standard_
+
+.. _Docker Machine: https://docs.docker.com/machine/

 .. _setup-installation-standard:

@@ -118,6 +125,157 @@ Vagrant Method
 .. _Paperless server: http://172.28.128.4:8000


+.. _setup-installation-docker:
+
+Docker Method
+.............
+
+1. Install `Docker`_.
+
+   .. caution::
+
+      As mentioned earlier, this guide assumes that you use Docker natively
+      under Linux. If you are using `Docker Machine`_ under Mac OS X or Windows,
+      you will have to adapt IP addresses, volume-mounting, command execution
+      and maybe more.
+
+2. Install `docker-compose`_. [#compose]_
+
+   .. caution::
+
+       If you want to use the included ``docker-compose.yml.example`` file, you
+       need to have at least Docker version **1.10.0** and docker-compose
+       version **1.6.0**.
+
+       See the `Docker installation guide`_ on how to install the current
+       version of Docker for your operating system or Linux distribution of
+       choice. To get an up-to-date version of docker-compose, follow the
+       `docker-compose installation guide`_ if your package repository doesn't
+       include it.
+
+       .. _Docker installation guide: https://docs.docker.com/engine/installation/
+       .. _docker-compose installation guide: https://docs.docker.com/compose/install/
+
+3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml`` and
+   a copy of ``docker-compose.env.example`` as ``docker-compose.env``. You'll be
+   editing both these files: taking a copy ensures that you can ``git pull`` to 
+   receive updates without risking merge conflicts with your modified versions 
+   of the configuration files.
+4. Modify ``docker-compose.yml`` to your preferences, following the instructions
+   in comments in the file. The only change that is a hard requirement is to 
+   specify where the consumption directory should mount.
+5. Modify ``docker-compose.env`` and adapt the following environment variables:
+
+   ``PAPERLESS_PASSPHRASE``
+     This is the passphrase Paperless uses to encrypt/decrypt the original
+     document.
+
+   ``PAPERLESS_OCR_THREADS``
+     This is the number of threads the OCR process will spawn to process
+     document pages in parallel. If the variable is not set, Python determines
+     the core-count of your CPU and uses that value.
+
+   ``PAPERLESS_OCR_LANGUAGES``
+     If you want the OCR to recognize other languages in addition to the default
+     English, set this parameter to a space separated list of three-letter
+     language-codes after `ISO 639-2/T`_. For a list of available languages --
+     including their three letter codes -- see the `Debian packagelist`_.
+
+   ``USERMAP_UID`` and ``USERMAP_GID``
+     If you want to mount the consumption volume (directory ``/consume`` within
+     the containers) to a host-directory -- which you probably want to do --
+     access rights might be an issue. The default user and group ``paperless``
+     in the containers have an id of 1000. The containers will enforce that the
+     owning group of the consumption directory will be ``paperless`` to be able
+     to delete consumed documents. If your host-system has a group with an id of
+     1000 and you don't want this group to have access rights to the consumption
+     directory, you can use ``USERMAP_GID`` to change the id in the container
+     and thus the one of the consumption directory. Furthermore, you can change
+     the id of the default user as well using ``USERMAP_UID``.
+
+6. Run ``docker-compose up -d``. This will create and start the necessary
+   containers.
+7. To be able to log in, you will need a superuser. To create it, execute the
+   following command:
+
+   .. code-block:: shell-session
+
+       $ docker-compose run --rm webserver createsuperuser
+
+   This will prompt you to set a username (default ``paperless``), an optional
+   e-mail address and finally a password.
+8. The default ``docker-compose.yml`` exposes the webserver on your local port
+   8000. If you haven't adapted this, you should now be able to visit your
+   Paperless webserver at http://127.0.0.1:8000. You can log in with the
+   user and password you just created.
+9. Add files to the consumption directory in whichever way you prefer. The
+   following are two possible options:
+
+   1. Mount the consumption directory to a local host path by modifying your
+      ``docker-compose.yml``:
+
+      .. code-block:: diff
+
+         diff --git a/docker-compose.yml b/docker-compose.yml
+         --- a/docker-compose.yml
+         +++ b/docker-compose.yml
+         @@ -17,9 +18,8 @@ services:
+                  volumes:
+                      - paperless-data:/usr/src/paperless/data
+                      - paperless-media:/usr/src/paperless/media
+         -            - /consume
+         +            - /local/path/you/choose:/consume
+
+      .. danger::
+
+          While the consumption container will ensure at startup that it can
+          **delete** a consumed file from a host-mounted directory, it might not
+          be able to **read** the document in the first place if the access
+          rights to the file are incorrect.
+
+          Make sure that the documents you put into the consumption directory
+          will either be readable by everyone (``chmod o+r file.pdf``) or
+          readable by the default user or group id 1000 (or the one you have set
+          with ``USERMAP_UID`` or ``USERMAP_GID`` respectively).
+
+   2. Use ``docker cp`` to copy your files directly into the container:
+
+      .. code-block:: shell-session
+
+         $ # Identify your containers
+         $ docker-compose ps
+                 Name                       Command                State     Ports
+         -------------------------------------------------------------------------
+         paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0
+         paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0
+
+         $ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume
+
+      ``docker cp`` is a one-shot-command, just like ``cp``. This means that
+      every time you want to consume a new document, you will have to execute
+      ``docker cp`` again. You can of course automate this process, but option 1
+      is generally the preferred one.
+
+      .. danger::
+
+          ``docker cp`` will change the owning user and group of a copied file
+          to the acting user at the destination, which will be ``root``.
+
+          You therefore need to ensure that the documents you want to copy into
+          the container are readable by everyone (``chmod o+r file.pdf``) before
+          copying them.
+
+
+.. _Docker: https://www.docker.com/
+.. _docker-compose: https://docs.docker.com/compose/install/
+.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
+.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr-
+
+.. [#compose] You of course don't have to use docker-compose, but it
+   simplifies deployment immensely. If you know your way around Docker, feel
+   free to tinker around without using compose!
+
+
 .. _making-things-a-little-more-permanent:

 Making Things a Little more Permanent
@@ -126,5 +284,9 @@ Making Things a Little more Permanent
 Once you've tested things and are happy with the work flow, you can automate the
 process of starting the webserver and consumer automatically.  If you're running
 on a bare metal system that's using Systemd, you can use the service unit files
-in the ``scripts`` directory to set this up.  If you're on a SysV or other
-startup system (like the Vagrant box), then you're currently on your own.
+in the ``scripts`` directory to set this up.  If you're on another startup
+system or are using a Vagrant box, then you're currently on your own. If you are
+using Docker, you can set a restart-policy_ in the ``docker-compose.yml`` to
+have the containers automatically start with the Docker daemon.
+
+.. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart
--- a/docs/utilities.rst
+++ b/docs/utilities.rst
@@ -105,3 +105,30 @@ import, so should you can now safely delete the entire project directly,
 database, encrypted PDFs and all, and later create it all again simply by
 running the consumer again and dumping all of these files into
 ``CONSUMPTION_DIR``.
+
+
+.. _utilities-retagger:
+
+The Re-tagger
+-------------
+
+Say you've imported a few hundred documents and now want to introduce a tag
+and apply its matching to all of the currently-imported docs.  This problem is
+common enough that there's a tool for it.
+
+
+.. _utilities-retagger-howto:
+
+How to Use It
+.............
+
+This too is done via the ``manage.py`` script:
+
+.. code:: bash
+
+    $ /path/to/paperless/src/manage.py document_retagger
+
+That's it.  It'll loop over all of the documents in your database and attempt
+to match all of your tags to them.  If one matches, it'll be applied.  And
+don't worry, you can run this as often as you like, it won't double-tag
+a document.
--- a/scripts/docker-entrypoint.sh
+++ b/scripts/docker-entrypoint.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+set -e
+
+# Remap paperless UID/GID. Source: https://github.com/sameersbn/docker-gitlab/
+map_uidgid() {
+    USERMAP_ORIG_UID=$(id -u paperless)
+    USERMAP_ORIG_GID=$(id -g paperless)  # was assigned to ..._UID, clobbering it
+    USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}}  # GID defaults to UID
+    USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
+    if [[ "${USERMAP_UID}" != "${USERMAP_ORIG_UID}" || "${USERMAP_GID}" != "${USERMAP_ORIG_GID}" ]]; then
+        echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
+        groupmod -g "${USERMAP_GID}" paperless  # NOTE(review): presumably also rewrites the GID in /etc/passwd - confirm
+        sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
+    fi
+}
+
+set_permissions() {
+    # Give the paperless group the consumption directory plus search access
+    chgrp paperless "$PAPERLESS_CONSUME"
+    chmod g+x "$PAPERLESS_CONSUME"  # NOTE(review): deleting files presumably also needs g+w - confirm
+
+    # Recursively hand the application directory to paperless:paperless
+    chown -Rh paperless:paperless /usr/src/paperless
+}
+
+initialize() {
+    map_uidgid       # apply USERMAP_UID / USERMAP_GID before fixing ownership
+    set_permissions  # chgrp/chown using the (possibly remapped) paperless ids
+}
+
+install_languages() {
+    local langs="$1"  # space-separated tesseract language codes, e.g. "deu ita"
+    read -ra langs <<<"$langs"
+
+    # Nothing to do when the list contains no language codes
+    if [ ${#langs[@]} -eq 0 ]; then
+        return
+    fi
+
+    # Update apt-lists (the image is built with them removed)
+    apt-get update
+
+    # Loop over languages to be installed
+    for lang in "${langs[@]}"; do
+        pkg="tesseract-ocr-$lang"
+        if dpkg -s "$pkg" > /dev/null 2>&1; then  # already installed
+            continue
+        fi
+
+        if ! apt-cache show "$pkg" > /dev/null 2>&1; then  # no such package
+            continue
+        fi
+
+        apt-get install -y "$pkg"  # -y: never wait for confirmation at startup
+    done
+
+    # Remove apt lists
+    rm -rf /var/lib/apt/lists/*
+}
+
+
+if [[ "$1" != /* ]]; then
+    initialize
+
+    # A non-empty PAPERLESS_OCR_LANGUAGES requests extra OCR language packages
+    if [[ -n "$PAPERLESS_OCR_LANGUAGES" ]]; then
+        install_languages "$PAPERLESS_OCR_LANGUAGES"
+    fi
+
+    exec sudo -HEu paperless "/usr/src/paperless/src/manage.py" "$@"
+fi
+
+exec "$@"
+
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,21 +1,23 @@
 import datetime
-import glob
+import tempfile
 from multiprocessing.pool import Pool

 import itertools
+
 import langdetect
 import os
-import random
 import re
 import subprocess

 import pyocr
+import shutil

 from PIL import Image

 from django.conf import settings
 from django.utils import timezone
 from django.template.defaultfilters import slugify
+from pyocr.tesseract import TesseractError

 from logger.models import Log
 from paperless.db import GnuPG
@@ -27,6 +29,12 @@ from .languages import ISO639
 def image_to_string(args):
    self, png, lang = args
    with Image.open(os.path.join(self.SCRATCH, png)) as f:
+        if self.OCR.can_detect_orientation():
+            try:
+                orientation = self.OCR.detect_orientation(f, lang=lang)
+                f = f.rotate(orientation["angle"], expand=1)
+            except TesseractError:
+                pass
        return self.OCR.image_to_string(f, lang=lang)


@@ -111,34 +119,41 @@ class Consumer(object):

            Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)

-            pngs = self._get_greyscale(doc)
+            tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
+            pngs = self._get_greyscale(tempdir, doc)

            try:
                text = self._get_ocr(pngs)
+                self._store(text, doc)
            except OCRError:
                self._ignore.append(doc)
                Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
+                self._cleanup_tempdir(tempdir)
                continue
+            else:
+                self._cleanup_tempdir(tempdir)
+                self._cleanup_doc(doc)

-            self._store(text, doc)
-            self._cleanup(pngs, doc)
-
-    def _get_greyscale(self, doc):
+    def _get_greyscale(self, tempdir, doc):

        Log.debug(
            "Generating greyscale image from {}".format(doc),
            Log.COMPONENT_CONSUMER
        )

-        i = random.randint(1000000, 9999999)
-        png = os.path.join(self.SCRATCH, "{}.png".format(i))
+        png = os.path.join(tempdir, "convert-%04d.jpg")

        subprocess.Popen((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", doc, png
        )).wait()

-        return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
+        pngs = []
+        for f in os.listdir(tempdir):
+            if f.startswith("convert"):
+                pngs.append(os.path.join(tempdir, f))
+
+        return sorted(filter(lambda __: os.path.isfile(__), pngs))

    @staticmethod
    def _guess_language(text):
@@ -271,11 +286,7 @@ class Consumer(object):
    def _store(self, text, doc):

        sender, title, tags, file_type = self._guess_attributes_from_name(doc)
-        tags = list(tags)
-
-        lower_text = text.lower()
-        relevant_tags = set(
-            [t for t in Tag.objects.all() if t.matches(lower_text)] + tags)
+        relevant_tags = set(list(Tag.match_all(text)) + list(tags))

        stats = os.stat(doc)

@@ -303,14 +314,15 @@ class Consumer(object):
                Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
                encrypted.write(GnuPG.encrypted(unencrypted))

-    def _cleanup(self, pngs, doc):
+    @staticmethod
+    def _cleanup_tempdir(d):
+        Log.debug("Deleting directory {}".format(d), Log.COMPONENT_CONSUMER)
+        shutil.rmtree(d)

-        png_glob = os.path.join(
-            self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
-
-        for f in list(glob.glob(png_glob)) + [doc]:
-            Log.debug("Deleting {}".format(f), Log.COMPONENT_CONSUMER)
-            os.unlink(f)
+    @staticmethod
+    def _cleanup_doc(doc):
+        Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
+        os.unlink(doc)

    def _is_ready(self, doc):
        """
--- a/src/documents/management/commands/document_retagger.py
+++ b/src/documents/management/commands/document_retagger.py
@@ -23,9 +23,10 @@ class Command(Renderable, BaseCommand):
        self.verbosity = options["verbosity"]

        for document in Document.objects.all():
+
            tags = Tag.objects.exclude(
                pk__in=document.tags.values_list("pk", flat=True))
-            for tag in tags:
-                if tag.matches(document.content):
+
+            for tag in Tag.match_all(document.content, tags):
                print('Tagging {} with "{}"'.format(document, tag))
                document.tags.add(tag)
--- a/src/documents/management/commands/loaddata_stdin.py
+++ b/src/documents/management/commands/loaddata_stdin.py
@@ -0,0 +1,23 @@
+"""
+Source:
+    https://gist.github.com/bmispelon/ad5a2c333443b3a1d051
+
+License:
+    MIT
+    Copyright (c) 2016 Baptiste Mispelon
+"""
+import sys
+
+from django.core.management.commands.loaddata import Command as LoadDataCommand
+
+
+class Command(LoadDataCommand):  # ``loaddata`` that also accepts "-" (stdin)
+    def parse_name(self, fixture_name):  # "-" is a JSON fixture read from stdin
+        self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)
+        return (('-', 'json', 'stdin') if fixture_name == '-' else
+                super(Command, self).parse_name(fixture_name))
+
+    def find_fixtures(self, fixture_label):  # "-" bypasses the fixture search
+        if fixture_label == '-':
+            return [('-', None, '-')]
+        return super(Command, self).find_fixtures(fixture_label)
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -86,28 +86,40 @@ class Tag(SluggedModel):
        return "{}: \"{}\" ({})".format(
            self.name, self.match, self.get_matching_algorithm_display())

+    @classmethod
+    def match_all(cls, text, tags=None):
+        """Yield the tags (default: all) that match the lower-cased text."""
+        if tags is None:
+            tags = cls.objects.all()
+
+        text = text.lower()
+        for tag in tags:
+            if tag.matches(text):
+                yield tag
+
    def matches(self, text):
+        """Return True if this tag's matching rule applies to ``text``."""
        # Check that match is not empty
        if self.match.strip() == "":
            return False

        if self.matching_algorithm == self.MATCH_ALL:
            for word in self.match.split(" "):
-                if word not in text:
+                if not re.search(r"\b{}\b".format(re.escape(word)), text):
                    return False
            return True

        if self.matching_algorithm == self.MATCH_ANY:
            for word in self.match.split(" "):
-                if word in text:
+                if re.search(r"\b{}\b".format(re.escape(word)), text):
                    return True
            return False

        if self.matching_algorithm == self.MATCH_LITERAL:
-            return self.match in text
+            return bool(re.search(r"\b{}\b".format(re.escape(self.match)), text))

        if self.matching_algorithm == self.MATCH_REGEX:
-            return re.search(re.compile(self.match), text)
+            return bool(re.search(re.compile(self.match), text))

        raise NotImplementedError("Unsupported matching algorithm")

--- a/src/documents/tests/test_tags.py
+++ b/src/documents/tests/test_tags.py
@@ -0,0 +1,120 @@
+from django.test import TestCase
+
+from ..models import Tag
+
+
+class TestTagMatching(TestCase):
+
+    def test_match_all(self):
+
+        t = Tag.objects.create(
+            name="Test 0",
+            match="alpha charlie gamma",
+            matching_algorithm=Tag.MATCH_ALL
+        )
+        self.assertFalse(t.matches("I have alpha in me"))
+        self.assertFalse(t.matches("I have charlie in me"))
+        self.assertFalse(t.matches("I have gamma in me"))
+        self.assertFalse(t.matches("I have alpha and charlie in me"))
+        self.assertTrue(t.matches("I have alpha, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas in me"))
+        self.assertFalse(t.matches("I have bravo in me"))
+
+        t = Tag.objects.create(
+            name="Test 1",
+            match="12 34 56",
+            matching_algorithm=Tag.MATCH_ALL
+        )
+        self.assertFalse(t.matches("I have 12 in me"))
+        self.assertFalse(t.matches("I have 34 in me"))
+        self.assertFalse(t.matches("I have 56 in me"))
+        self.assertFalse(t.matches("I have 12 and 34 in me"))
+        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
+        self.assertFalse(t.matches("I have 120, 34, and 56 in me"))
+        self.assertFalse(t.matches("I have 123456 in me"))
+        self.assertFalse(t.matches("I have 01234567 in me"))
+
+    def test_match_any(self):
+
+        t = Tag.objects.create(
+            name="Test 0",
+            match="alpha charlie gamma",
+            matching_algorithm=Tag.MATCH_ANY
+        )
+
+        self.assertTrue(t.matches("I have alpha in me"))
+        self.assertTrue(t.matches("I have charlie in me"))
+        self.assertTrue(t.matches("I have gamma in me"))
+        self.assertTrue(t.matches("I have alpha and charlie in me"))
+        self.assertFalse(t.matches("I have alphas in me"))
+        self.assertFalse(t.matches("I have bravo in me"))
+
+        t = Tag.objects.create(
+            name="Test 1",
+            match="12 34 56",
+            matching_algorithm=Tag.MATCH_ANY
+        )
+        self.assertTrue(t.matches("I have 12 in me"))
+        self.assertTrue(t.matches("I have 34 in me"))
+        self.assertTrue(t.matches("I have 56 in me"))
+        self.assertTrue(t.matches("I have 12 and 34 in me"))
+        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
+        self.assertTrue(t.matches("I have 120, 34, and 560 in me"))
+        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
+        self.assertFalse(t.matches("I have 123456 in me"))
+        self.assertFalse(t.matches("I have 01234567 in me"))
+
+    def test_match_literal(self):
+
+        t = Tag.objects.create(
+            name="Test 0",
+            match="alpha charlie gamma",
+            matching_algorithm=Tag.MATCH_LITERAL
+        )
+
+        self.assertFalse(t.matches("I have alpha in me"))
+        self.assertFalse(t.matches("I have charlie in me"))
+        self.assertFalse(t.matches("I have gamma in me"))
+        self.assertFalse(t.matches("I have alpha and charlie in me"))
+        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
+        self.assertTrue(t.matches("I have 'alpha charlie gamma' in me"))
+        self.assertFalse(t.matches("I have alphas in me"))
+        self.assertFalse(t.matches("I have bravo in me"))
+
+        t = Tag.objects.create(
+            name="Test 1",
+            match="12 34 56",
+            matching_algorithm=Tag.MATCH_LITERAL
+        )
+        self.assertFalse(t.matches("I have 12 in me"))
+        self.assertFalse(t.matches("I have 34 in me"))
+        self.assertFalse(t.matches("I have 56 in me"))
+        self.assertFalse(t.matches("I have 12 and 34 in me"))
+        self.assertFalse(t.matches("I have 12 34, and 56 in me"))
+        self.assertFalse(t.matches("I have 120, 34, and 560 in me"))
+        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
+        self.assertFalse(t.matches("I have 123456 in me"))
+        self.assertFalse(t.matches("I have 01234567 in me"))
+        self.assertTrue(t.matches("I have 12 34 56 in me"))
+
+    def test_match_regex(self):
+        """MATCH_REGEX applies ``match`` as a regular expression search."""
+        t = Tag.objects.create(
+            name="Test 0",
+            match=r"alpha\w+gamma",
+            matching_algorithm=Tag.MATCH_REGEX
+        )
+
+        self.assertFalse(t.matches("I have alpha in me"))
+        self.assertFalse(t.matches("I have gamma in me"))
+        self.assertFalse(t.matches("I have alpha and charlie in me"))
+        self.assertTrue(t.matches("I have alpha_and_gamma in me"))
+        self.assertTrue(t.matches("I have alphas_and_gamma in me"))
+        self.assertFalse(t.matches("I have alpha,and,gamma in me"))
+        self.assertFalse(t.matches("I have alpha and gamma in me"))
+        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
+        self.assertFalse(t.matches("I have alphas in me"))
+
--- a/src/logger/migrations/0001_initial.py
+++ b/src/logger/migrations/0001_initial.py
@@ -23,4 +23,8 @@ class Migration(migrations.Migration):
                ('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])),
            ],
        ),
+        migrations.AlterModelOptions(
+            name='log',
+            options={'ordering': ('-time',)},
+        ),
    ]
--- a/src/logger/models.py
+++ b/src/logger/models.py
@@ -27,7 +27,10 @@ class Log(models.Model):
    component = models.PositiveIntegerField(choices=COMPONENTS)

    class Meta(object):
-        ordering = ("time",)
+        ordering = ("-time",)
+
+    def __str__(self):
+        return self.message

    @classmethod
    def error(cls, message, component):