Merge branch 'dev'

2025-06-06 14:07:26 -05:00 · 2020-11-27 17:40:05 +01:00 · 2020-11-27 17:40:05 +01:00 · 5573a84335
commit 5573a84335
parent 440a23a054 a1f5ddede8
61 changed files with 1400 additions and 436 deletions
--- a/4
+++ b/4
@ -8,6 +8,9 @@ url = "https://www.piwheels.org/simple"
 verify_ssl = true
 name = "piwheels"
 [requires]
 python_version = "3.6"
 [packages]
 dateparser = "~=0.7.6"
 django = "~=3.1.3"
@ -35,6 +38,7 @@ scikit-learn="~=0.23.2"
 whitenoise = "~=5.2.0"
 watchdog = "*"
 whoosh="~=2.7.4"
 inotify-simple = "*"
 [dev-packages]
 coveralls = "*"
--- a/Pipfile.lock
+++ b/Pipfile.lock
@ -1,10 +1,12 @@
 {
    "_meta": {
        "hash": {
-            "sha256": "ae2643b9cf0cf5741ae149fb6bc0c480de41329ce48e773eb4b5d760bc5e2244"
+            "sha256": "d6432a18280c092c108e998f00bcd377c0c55ef18f26cb0b8eb64f9618b9f383"
        },
        "pipfile-spec": 6,
-        "requires": {},
+        "requires": {
            "python_version": "3.6"
        },
        "sources": [
            {
                "name": "pypi",
@ -129,6 +131,14 @@
            "index": "pypi",
            "version": "==0.32.0"
        },
        "inotify-simple": {
            "hashes": [
                "sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128",
                "sha256:854f9ac752cc1fcff6ca34e9d3d875c9a94c9b7d6eb377f63be2d481a566c6ee"
            ],
            "index": "pypi",
            "version": "==1.3.5"
        },
        "joblib": {
            "hashes": [
                "sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72",
@ -663,11 +673,11 @@
        },
        "faker": {
            "hashes": [
-                "sha256:3f5d379e4b5ce92a8afe3c2ce59d7c43886370dd3bf9495a936b91888debfc81",
+                "sha256:5398268e1d751ffdb3ed36b8a790ed98659200599b368eec38a02eed15bce997",
-                "sha256:8c0e8a06acef4b9312902e2ce18becabe62badd3a6632180bd0680c6ee111473"
+                "sha256:d4183b8f57316de3be27cd6c3b40e9f9343d27c95c96179f027316c58c2c239e"
            ],
            "markers": "python_version >= '3.5'",
-            "version": "==4.17.0"
+            "version": "==4.17.1"
        },
        "filelock": {
            "hashes": [
@ -693,6 +703,22 @@
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==1.2.0"
        },
        "importlib-metadata": {
            "hashes": [
                "sha256:030f3b1bdb823ecbe4a9659e14cc861ce5af403fe99863bae173ec5fe00ab132",
                "sha256:caeee3603f5dcf567864d1be9b839b0bcfdf1383e3e7be33ce2dead8144ff19c"
            ],
            "markers": "python_version < '3.8'",
            "version": "==2.1.0"
        },
        "importlib-resources": {
            "hashes": [
                "sha256:7b51f0106c8ec564b1bef3d9c588bc694ce2b92125bbb6278f4f2f5b54ec3592",
                "sha256:a3d34a8464ce1d5d7c92b0ea4e921e696d86f2aa212e684451cb1482c8d84ed5"
            ],
            "markers": "python_version < '3.7'",
            "version": "==3.3.0"
        },
        "iniconfig": {
            "hashes": [
                "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
@ -999,11 +1025,19 @@
        },
        "virtualenv": {
            "hashes": [
-                "sha256:b0011228208944ce71052987437d3843e05690b2f23d1c7da4263fde104c97a2",
+                "sha256:07cff122e9d343140366055f31be4dcd61fd598c69d11cd33a9d9c8df4546dd7",
-                "sha256:b8d6110f493af256a40d65e29846c69340a947669eec8ce784fcf3dd3af28380"
+                "sha256:e0aac7525e880a429764cefd3aaaff54afb5d9f25c82627563603f5d7de5a6e5"
            ],
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==20.1.0"
+            "version": "==20.2.1"
        },
        "zipp": {
            "hashes": [
                "sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108",
                "sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb"
            ],
            "markers": "python_version < '3.8'",
            "version": "==3.4.0"
        }
    }
 }
--- a/docker/hub/docker-compose.postgres.yml
+++ b/docker/hub/docker-compose.postgres.yml
@ -15,7 +15,7 @@ services:
      POSTGRES_PASSWORD: paperless
  webserver:
-    image: jonaswinkler/paperless-ng:0.9.2
+    image: jonaswinkler/paperless-ng:0.9.3
    restart: always
    depends_on:
      - db
--- a/docker/hub/docker-compose.sqlite.yml
+++ b/docker/hub/docker-compose.sqlite.yml
@ -5,7 +5,7 @@ services:
    restart: always
  webserver:
-    image: jonaswinkler/paperless-ng:0.9.2
+    image: jonaswinkler/paperless-ng:0.9.3
    restart: always
    depends_on:
      - broker
--- a/docs/administration.rst
+++ b/docs/administration.rst
@ -30,7 +30,7 @@ Options available to docker installations:
    Paperless uses 3 volumes:
    *   ``paperless_media``: This is where your documents are stored.
-    *   ``paperless_data``: This is where auxilliary data is stored. This
+    *   ``paperless_data``: This is where auxiliary data is stored. This
        folder also contains the SQLite database, if you use it.
    *   ``paperless_pgdata``: Exists only if you use PostgreSQL and contains
        the database.
@ -109,7 +109,7 @@ B.  If you built the image yourself, grab the new archive and replace your curre
 .. hint::
    You can usually keep your ``docker-compose.env`` file, since this file will
-    never include mandantory configuration options. However, it is worth checking
+    never include mandatory configuration options. However, it is worth checking
    out the new version of this file, since it might have new recommendations
    on what to configure.
@ -126,8 +126,8 @@ After grabbing the new release and unpacking the contents, do the following:
        $ pip install --upgrade pipenv
        $ cd /path/to/paperless
        $ pipenv install
        $ pipenv clean
        $ pipenv install
    This creates a new virtual environment (or uses your existing environment)
    and installs all dependencies into it.
@ -247,12 +247,12 @@ your already processed documents.
 When multiple document types or correspondents match a single document,
 the retagger won't assign these to the document. Specify ``--use-first``
-to override this behaviour and just use the first correspondent or type
+to override this behavior and just use the first correspondent or type
 it finds. This option does not apply to tags, since any number of tags
 can be applied to a document.
 Finally, ``-f`` specifies that you wish to overwrite already assigned
-correspondents, types and/or tags. The default behaviour is to not
+correspondents, types and/or tags. The default behavior is to not
 assign correspondents and types to documents that have this data already
 assigned. ``-f`` works differently for tags: By default, only additional tags get
 added to documents, no tags will be removed. With ``-f``, tags that don't
@ -341,7 +341,7 @@ Documents can be stored in Paperless using GnuPG encryption.
 .. danger::
-    Encryption is depreceated since paperless-ng 0.9 and doesn't really provide any
+    Encryption is deprecated since paperless-ng 0.9 and doesn't really provide any
    additional security, since you have to store the passphrase in a configuration
    file on the same system as the encrypted documents for paperless to work.
    Furthermore, the entire text content of the documents is stored plain in the
@ -353,39 +353,23 @@ Documents can be stored in Paperless using GnuPG encryption.
    Consider running paperless on an encrypted filesystem instead, which will then
    at least provide security against physical hardware theft.
 .. code::
    change_storage_type [--passphrase PASSPHRASE] {gpg,unencrypted} {gpg,unencrypted}
    positional arguments:
      {gpg,unencrypted}     The state you want to change your documents from
      {gpg,unencrypted}     The state you want to change your documents to
    optional arguments:
      --passphrase PASSPHRASE
 Enabling encryption
 -------------------
-Basic usage to enable encryption of your document store (**USE A MORE SECURE PASSPHRASE**):
+Enabling encryption is no longer supported.
 (Note: If ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
 .. code::
    change_storage_type [--passphrase SECR3TP4SSPHRA$E] unencrypted gpg
 Disabling encryption
 --------------------
-Basic usage to enable encryption of your document store:
+Basic usage to disable encryption of your document store:
-(Note: Again, if ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
+(Note: If ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
 .. code::
-    change_storage_type [--passphrase SECR3TP4SSPHRA$E] gpg unencrypted
+    decrypt_documents [--passphrase SECR3TP4SSPHRA$E]
 .. _Pipenv: https://pipenv.pypa.io/en/latest/
--- a/docs/advanced_usage.rst
+++ b/docs/advanced_usage.rst
@ -84,6 +84,8 @@ to the filename.
   PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
 .. _advanced-matching:
 Matching tags, correspondents and document types
 ################################################
@ -145,7 +147,9 @@ America are tagged with the tag "bofa_123" and the matching algorithm of this
 tag is set to *Auto*, this neural network will examine your documents and
 automatically learn when to assign this tag.
-There are a couple caveats you need to keep in mind when using this feature:
+Paperless tries to hide much of the involved complexity with this approach.
 However, there are a couple caveats you need to keep in mind when using this
 feature:
 * Changes to your documents are not immediately reflected by the matching
  algorithm. The neural network needs to be *trained* on your documents after
@ -165,6 +169,11 @@ There are a couple caveats you need to keep in mind when using this feature:
  has the correspondent "Very obscure web shop I bought something five years
  ago", it will probably not assign this correspondent automatically if you buy
  something from them again. The more documents, the better.
 * Paperless also needs a reasonable amount of negative examples to decide when
  not to assign a certain tag, correspondent or type. This will usually be the
  case as you start filling up paperless with documents. Example: If all your
  documents are either from "Webshop" or "Bank", paperless will assign one of
  these correspondents to ANY new document, if both are set to automatic matching.
 Hooking into the consumption process
 ####################################
@ -253,7 +262,7 @@ By default, paperless stores your documents in the media directory and renames t
 using the identifier which it has assigned to each document. You will end up getting
 files like ``0000123.pdf`` in your media directory. This isn't necessarily a bad
 thing, because you normally don't have to access these files manually. However, if
-you wish to name your files differently, you can do that by adjustng the
+you wish to name your files differently, you can do that by adjusting the
 ``PAPERLESS_FILENAME_FORMAT`` settings variable.
 This variable allows you to configure the filename (folders are allowed!) using
@ -278,7 +287,7 @@ will create a directory structure as follows:
        my_new_shoes-0000004.pdf
 Paperless appends the unique identifier of each document to the filename. This
-avoides filename clashes.
+avoids filename clashes.
 .. danger::
--- a/docs/api.rst
+++ b/docs/api.rst
@ -94,7 +94,7 @@ Result object:
    }
 *   ``id``: the primary key of the found document
-*   ``highlights``: an object containing parseable highlights for the result.
+*   ``highlights``: an object containing parsable highlights for the result.
    See below.
 *   ``score``: The score assigned to the document. A higher score indicates a
    better match with the query. Search results are sorted descending by score.
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -5,6 +5,24 @@
 Changelog
 *********
 paperless-ng 0.9.3
 ##################
 * Setting ``PAPERLESS_AUTO_LOGIN_USERNAME`` replaces ``PAPERLESS_DISABLE_LOGIN``.
  You have to specify your username.
 * Added a simple sanity checker that checks your documents for missing or orphaned files,
  files with wrong checksums, inaccessible files, and documents with empty content.
 * It is no longer possible to encrypt your documents. For the time being, paperless will
  continue to operate with already encrypted documents.
 * Fixes:
  * Paperless now uses inotify again, since the watchdog was causing issues which I was not
    aware of.
  * Issue with the automatic classifier not working with only one tag.
  * A couple issues with the search index being opened too eagerly.
 * Added lots of tests for various parts of the application.
 paperless-ng 0.9.2
 ##################
@ -52,7 +70,7 @@ paperless-ng 0.9.0
 * **Added:** New frontend. Features:
  * Single page application: It's much more responsive than the django admin pages.
-  * Dashboard. Shows recently scanned documents, or todos, or other documents
+  * Dashboard. Shows recently scanned documents, or todo notes, or other documents
    at wish. Allows uploading of documents. Shows basic statistics.
  * Better document list with multiple display options.
  * Full text search with result highlighting, auto completion and scoring based
@ -102,7 +120,7 @@ paperless-ng 0.9.0
 * **Modified [breaking]:** PostgreSQL:
-  * If ``PAPERLESS_DBHOST`` is specified in the settings, paperless uses postgresql instead of sqlite.
+  * If ``PAPERLESS_DBHOST`` is specified in the settings, paperless uses PostgreSQL instead of SQLite.
    Username, database and password all default to ``paperless`` if not specified.
 * **Modified [breaking]:** document_retagger management command rework. See
@ -130,7 +148,7 @@ paperless-ng 0.9.0
    Certain language specifics such as umlauts may not get picked up properly.
  * ``PAPERLESS_DEBUG`` defaults to ``false``.
  * The presence of ``PAPERLESS_DBHOST`` now determines whether to use PostgreSQL or
-    sqlite.
+    SQLite.
  * ``PAPERLESS_OCR_THREADS`` is gone and replaced with ``PAPERLESS_TASK_WORKERS`` and
    ``PAPERLESS_THREADS_PER_WORKER``. Refer to the config example for details.
  * ``PAPERLESS_OPTIMIZE_THUMBNAILS`` allows you to disable or enable thumbnail
@ -138,8 +156,11 @@ paperless-ng 0.9.0
 * Many more small changes here and there. The usual stuff.
 Paperless
 #########
 2.7.0
-#####
+=====
 * `syntonym`_ submitted a pull request to catch IMAP connection errors `#475`_.
 * `Stéphane Brunner`_ added ``psycopg2`` to the Pipfile `#489`_.  He also fixed
@ -156,7 +177,7 @@ paperless-ng 0.9.0
 2.6.1
-#####
+=====
 * We now have a logo, complete with a favicon :-)
 * Removed some problematic tests.
@ -168,7 +189,7 @@ paperless-ng 0.9.0
 2.6.0
-#####
+=====
 * Allow an infinite number of logs to be deleted.  Thanks to `Ulli`_ for noting
  the problem in `#433`_.
@ -189,7 +210,7 @@ paperless-ng 0.9.0
 2.5.0
-#####
+=====
 * **New dependency**: Paperless now optimises thumbnail generation with
  `optipng`_, so you'll need to install that somewhere in your PATH or declare
@ -233,7 +254,7 @@ paperless-ng 0.9.0
 2.4.0
-#####
+=====
 * A new set of actions are now available thanks to `jonaswinkler`_'s very first
  pull request!  You can now do nifty things like tag documents in bulk, or set
@ -254,7 +275,7 @@ paperless-ng 0.9.0
 2.3.0
-#####
+=====
 * Support for consuming plain text & markdown documents was added by
  `Joshua Taillon`_!  This was a long-requested feature, and its addition is
@ -272,14 +293,14 @@ paperless-ng 0.9.0
 2.2.1
-#####
+=====
 * `Kyle Lucy`_ reported a bug quickly after the release of 2.2.0 where we broke
  the ``DISABLE_LOGIN`` feature: `#392`_.
 2.2.0
-#####
+=====
 * Thanks to `dadosch`_, `Wolfgang Mader`_, and `Tim Brooks`_ this is the first
  version of Paperless that supports Django 2.0!  As a result of their hard
@ -296,7 +317,7 @@ paperless-ng 0.9.0
 2.1.0
-#####
+=====
 * `Enno Lohmeier`_ added three simple features that make Paperless a lot more
  user (and developer) friendly:
@ -315,7 +336,7 @@ paperless-ng 0.9.0
 2.0.0
-#####
+=====
 This is a big release as we've changed a core-functionality of Paperless: we no
 longer encrypt files with GPG by default.
@ -347,7 +368,7 @@ Special thanks to `erikarvstedt`_, `matthewmoto`_, and `mcronce`_ who did the
 bulk of the work on this big change.
 1.4.0
-#####
+=====
 * `Quentin Dawans`_ has refactored the document consumer to allow for some
  command-line options.  Notably, you can now direct it to consume from a
@ -382,7 +403,7 @@ bulk of the work on this big change.
  to some excellent work from `erikarvstedt`_ on `#351`_
 1.3.0
-#####
+=====
 * You can now run Paperless without a login, though you'll still have to create
  at least one user.  This is thanks to a pull-request from `matthewmoto`_:
@ -405,7 +426,7 @@ bulk of the work on this big change.
  problem and helping me find where to fix it.
 1.2.0
-#####
+=====
 * New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
  and `Pit`_.  This new image is dramatically smaller than the Debian-based
@ -424,7 +445,7 @@ bulk of the work on this big change.
  in the document text.
 1.1.0
-#####
+=====
 * Fix for `#283`_, a redirect bug which broke interactions with
  paperless-desktop.  Thanks to `chris-aeviator`_ for reporting it.
@ -434,7 +455,7 @@ bulk of the work on this big change.
  `Dan Panzarella`_
 1.0.0
-#####
+=====
 * Upgrade to Django 1.11.  **You'll need to run
  ``pip install -r requirements.txt`` after the usual ``git pull`` to
@ -453,14 +474,14 @@ bulk of the work on this big change.
  `Lukas Winkler`_'s issue `#278`_
 0.8.0
-#####
+=====
 * Paperless can now run in a subdirectory on a host (``/paperless``), rather
  than always running in the root (``/``) thanks to `maphy-psd`_'s work on
  `#255`_.
 0.7.0
-#####
+=====
 * **Potentially breaking change**: As per `#235`_, Paperless will no longer
  automatically delete documents attached to correspondents when those
@ -472,7 +493,7 @@ bulk of the work on this big change.
  `Kusti Skytén`_ for posting the correct solution in the Github issue.
 0.6.0
-#####
+=====
 * Abandon the shared-secret trick we were using for the POST API in favour
  of BasicAuth or Django session.
@ -486,7 +507,7 @@ bulk of the work on this big change.
  the help with this feature.
 0.5.0
-#####
+=====
 * Support for fuzzy matching in the auto-tagger & auto-correspondent systems
  thanks to `Jake Gysland`_'s patch `#220`_.
@ -504,13 +525,13 @@ bulk of the work on this big change.
  * Amended the Django Admin configuration to have nice headers (`#230`_)
 0.4.1
-#####
+=====
 * Fix for `#206`_ wherein the pluggable parser didn't recognise files with
  all-caps suffixes like ``.PDF``
 0.4.0
-#####
+=====
 * Introducing reminders.  See `#199`_ for more information, but the short
  explanation is that you can now attach simple notes & times to documents
@ -520,7 +541,7 @@ bulk of the work on this big change.
  like to make use of this feature in his project.
 0.3.6
-#####
+=====
 * Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
  correspondent or the tags for a document.
@ -534,7 +555,7 @@ bulk of the work on this big change.
  documentation is on its way.
 0.3.5
-#####
+=====
 * A serious facelift for the documents listing page wherein we drop the
  tabular layout in favour of a tiled interface.
@ -545,7 +566,7 @@ bulk of the work on this big change.
  consumption.
 0.3.4
-#####
+=====
 * Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
  Note that you *can* use Django Suit with Paperless, but only in a
@ -558,26 +579,26 @@ bulk of the work on this big change.
  API thanks to @thomasbrueggemann.  See `#179`_.
 0.3.3
-#####
+=====
 * Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
 * Timezone, items per page, and default language are now all configurable,
  also thanks to @ekw.
 0.3.2
-#####
+=====
 * Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
  user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
  arise.
 0.3.1
-#####
+=====
 * Added a default value for ``CONVERT_BINARY``
 0.3.0
-#####
+=====
 * Updated to using django-filter 1.x
 * Added some system checks so new users aren't confused by misconfigurations.
@ -590,7 +611,7 @@ bulk of the work on this big change.
  ``PAPERLESS_SHARED_SECRET`` respectively instead.
 0.2.0
-#####
+=====
 * `#150`_: The media root is now a variable you can set in
  ``paperless.conf``.
@ -618,7 +639,7 @@ bulk of the work on this big change.
  to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
 0.1.1
-#####
+=====
 * Potentially **Breaking Change**: All references to "sender" in the code
  have been renamed to "correspondent" to better reflect the nature of the
@ -642,7 +663,7 @@ bulk of the work on this big change.
  to be imported but made unavailable.
 0.1.0
-#####
+=====
 * Docker support!  Big thanks to `Wayne Werner`_, `Brian Conn`_, and
  `Tikitu de Jager`_ for this one, and especially to `Pit`_
@ -661,14 +682,14 @@ bulk of the work on this big change.
 * Added tox with pep8 checking
 0.0.6
-#####
+=====
 * Added support for parallel OCR (significant work from `Pit`_)
 * Sped up the language detection (significant work from `Pit`_)
 * Added simple logging
 0.0.5
-#####
+=====
 * Added support for image files as documents (png, jpg, gif, tiff)
 * Added a crude means of HTTP POST for document imports
@ -677,7 +698,7 @@ bulk of the work on this big change.
 * Documentation for the above as well as data migration
 0.0.4
-#####
+=====
 * Added automated tagging based on keyword matching
 * Cleaned up the document listing page
@ -685,19 +706,19 @@ bulk of the work on this big change.
 * Added ``pytz`` to the list of requirements
 0.0.3
-#####
+=====
 * Added basic tagging
 0.0.2
-#####
+=====
 * Added language detection
 * Added datestamps to ``document_exporter``.
 * Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
 0.0.1
-#####
+=====
 * Initial release
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@ -69,7 +69,7 @@ PAPERLESS_CONSUMPTION_DIR=<path>
    Defaults to "../consume", relative to the "src" directory.
 PAPERLESS_DATA_DIR=<path>
-    This is where paperless stores all its data (search index, sqlite database,
+    This is where paperless stores all its data (search index, SQLite database,
    classification model, etc).
    Defaults to "../data", relative to the "src" directory.
@ -100,7 +100,7 @@ Hosting & Security
 ##################
 PAPERLESS_SECRET_KEY=<key>
-    Paperless uses this to make session tokens. If you exose paperless on the
+    Paperless uses this to make session tokens. If you expose paperless on the
    internet, you need to change this, since the default secret is well known.
    Use any sequence of characters. The more, the better. You don't need to
@ -141,6 +141,16 @@ PAPERLESS_STATIC_URL=<path>
    Defaults to "/static/".
 PAPERLESS_AUTO_LOGIN_USERNAME=<username>
    Specify a username here so that paperless will automatically perform login
    with the selected user.
    .. danger::
        Do not use this when exposing paperless on the internet. There are no
        checks in place that would prevent you from doing this.
    Defaults to none, which disables this feature.
 Software tweaks
 ###############
@ -220,7 +230,7 @@ PAPERLESS_CONSUMER_POLLING=<num>
    specify a polling interval in seconds here, which will then cause paperless
    to periodically check your consumption directory for changes.
-    Defaults to 0, which disables polling and uses filesystem notifiactions.
+    Defaults to 0, which disables polling and uses filesystem notifications.
 PAPERLESS_CONSUMER_DELETE_DUPLICATES=<bool>
    When the consumer detects a duplicate document, it will not touch the
@ -264,7 +274,7 @@ PAPERLESS_CONVERT_DENSITY=<num>
    Default is 300.
 PAPERLESS_OPTIMIZE_THUMBNAILS=<bool>
-    Use optipng to optimize thumbnails. This usually reduces the sice of
+    Use optipng to optimize thumbnails. This usually reduces the size of
    thumbnails by about 20%, but uses considerable compute time during
    consumption.
--- a/docs/contributing.rst
+++ b/docs/contributing.rst
@ -85,7 +85,7 @@ quoted, or triple-quoted string will do:
    problematic_string = 'This is a "string" with "quotes" in it'
 In HTML templates, please use double-quotes for tag attributes, and single
-quotes for arguments passed to Django tempalte tags:
+quotes for arguments passed to Django template tags:
 .. code:: html
--- a/docs/faq.rst
+++ b/docs/faq.rst
@ -17,7 +17,7 @@ is
 .. caution::
-    Dont mess with this folder. Don't change permissions and don't move
+    Do not mess with this folder. Don't change permissions and don't move
    files around manually. This folder is meant to be entirely managed by docker
    and paperless.
@ -36,7 +36,7 @@ file extensions do not matter.
 **A:** The short answer is yes. I've tested it on a Raspberry Pi 3 B.
 The long answer is that certain parts of
-Paperless will run very slow, such as the tesseract OCR. On Rasperry Pi,
+Paperless will run very slowly, such as the tesseract OCR. On Raspberry Pi,
 try to OCR documents before feeding them into paperless so that paperless can
 reuse the text. The web interface should be a lot snappier, since it runs
 in your browser and paperless has to do much less work to serve the data.
--- a/docs/scanners.rst
+++ b/docs/scanners.rst
@ -8,7 +8,7 @@ Scanner recommendations
 As Paperless operates by watching a folder for new files, it doesn't care what
 scanner you use, but sometimes finding a scanner that will write to an FTP,
 NFS, or SMB server can be difficult.  This page is here to help you find one
-that works right for you based on recommentations from other Paperless users.
+that works right for you based on recommendations from other Paperless users.
 +---------+----------------+-----+-----+-----+----------------+
 | Brand   | Model          | Supports        | Recommended By |
--- a/docs/screenshots.rst
+++ b/docs/screenshots.rst
@ -21,7 +21,7 @@ Extensive filtering mechanisms:
 .. image:: _static/screenshots/documents-filter.png
-Side-by-side editing of documents. Optmized for 1080p.
+Side-by-side editing of documents. Optimized for 1080p.
 .. image:: _static/screenshots/editing.png
--- a/docs/setup.rst
+++ b/docs/setup.rst
@ -265,15 +265,17 @@ Migration to paperless-ng is then performed in a few simple steps:
    ``docker-compose.env`` to your needs.
    See `docker route`_ for details on which edits are advised.
-6.  Start paperless-ng.
+6.  In order to find your existing documents with the new search feature, you need
    to invoke a one-time operation that will create the search index:
-    .. code:: bash
+    .. code:: shell-session
-        $ docker-compose up
+        $ docker-compose run --rm webserver document_index reindex
-    If you see everything working (you should see some migrations getting
+    This will migrate your database and create the search index. After that,
-    applied, for instance), you can gracefully stop paperless-ng with Ctrl-C
+    paperless will take care of maintaining the index by itself.
-    and then start paperless-ng as usual with
+
 7.  Start paperless-ng.
    .. code:: bash
@ -281,11 +283,11 @@ Migration to paperless-ng is then performed in a few simple steps:
    This will run paperless in the background and automatically start it on system boot.
-7.  Paperless installed a permanent redirect to ``admin/`` in your browser. This
+8.  Paperless installed a permanent redirect to ``admin/`` in your browser. This
    redirect is still in place and prevents access to the new UI. Clear
    browsing cache in order to fix this.
-8.  Optionally, follow the instructions below to migrate your existing data to PostgreSQL.
+9.  Optionally, follow the instructions below to migrate your existing data to PostgreSQL.
 .. _setup-sqlite_to_psql:
@ -322,7 +324,7 @@ management commands as below.
            $ cd /path/to/paperless
            $ docker-compose run --rm webserver /bin/bash
-        This will lauch the container and initialize the PostgreSQL database.
+        This will launch the container and initialize the PostgreSQL database.
    b)  Without docker, open a shell in your virtual environment, switch to
        the ``src`` directory and create the database schema:
@ -357,6 +359,35 @@ management commands as below.
 7.  Start paperless.
 Moving back to paperless
 ========================
 Let's say you migrated to Paperless-ng and used it for a while, but decided that
 you don't like it and want to move back (If you do, send me a mail about what
 part you didn't like!), you can totally do that with a few simple steps.
 Paperless-ng modified the database schema slightly, however, these changes can
 be reverted while keeping your current data, so that your current data will
 be compatible with original Paperless.
 Execute this:
 .. code:: shell-session
    $ cd /path/to/paperless
    $ docker-compose run --rm webserver migrate documents 0023
 Or without docker:
 .. code:: shell-session
    $ cd /path/to/paperless/src
    $ python3 manage.py migrate documents 0023
 After that, you need to clear your cookies (Paperless-ng comes with updated
 dependencies that do cookie-processing differently) and probably your cache
 as well.
 .. _setup-less_powerful_devices:
@ -372,7 +403,7 @@ configuring some options in paperless can help improve performance immensely:
 *   ``PAPERLESS_TASK_WORKERS`` and ``PAPERLESS_THREADS_PER_WORKER`` are configured
    to use all cores. The Raspberry Pi models 3 and up have 4 cores, meaning that
    paperless will use 2 workers and 2 threads per worker. This may result in
-    slugish response times during consumption, so you might want to lower these
+    sluggish response times during consumption, so you might want to lower these
    settings (example: 2 workers and 1 thread to always have some computing power
    left for other tasks).
 *   Keep ``PAPERLESS_OCR_ALWAYS`` at its default value 'false' and consider OCR'ing
--- a/docs/usage_overview.rst
+++ b/docs/usage_overview.rst
@ -5,13 +5,13 @@ Usage Overview
 Paperless is an application that manages your personal documents. With
 the help of a document scanner (see :ref:`scanners`), paperless transforms
 your unwieldy physical document binders into a searchable archive and
-provices many utilities for finding and managing your documents.
+provides many utilities for finding and managing your documents.
 Terms and definitions
 #####################
-Paperless esentially consists of two different parts for managing your
+Paperless essentially consists of two different parts for managing your
 documents:
 * The *consumer* watches a specified folder and adds all documents in that
@ -30,12 +30,12 @@ Each document has a couple of fields that you can assign to them:
  tag, however, a single document can also have multiple tags. This is not
  possible with folders. The reason folders are not implemented in paperless
  is simply that tags are much more versatile than folders.
-* A *document type* is used to demarkate the type of a document such as letter,
+* A *document type* is used to demarcate the type of a document such as letter,
  bank statement, invoice, contract, etc. It is used to identify what a document
  is about.
 * The *date added* of a document is the date the document was scanned into
  paperless. You cannot and should not change this date.
-* The *date created* of a document is the date the document was intially issued.
+* The *date created* of a document is the date the document was initially issued.
  This can be the date you bought a product, the date you signed a contract, or
  the date a letter was sent to you.
 * The *archive serial number* (short: ASN) of a document is the identifier of
@ -131,7 +131,7 @@ These are as follows:
    With the correct set of rules, you can completely automate your email documents.
    Create rules for every correspondent you receive digital documents from and
-    paperless will read them automatically. The default acion "mark as read" is
+    paperless will read them automatically. The default action "mark as read" is
    pretty tame and will not cause any damage or data loss whatsoever.
    You can also set up a special folder in your mail account for paperless and use
@ -182,7 +182,7 @@ Processing of the physical documents
 ====================================
 Keep a physical inbox. Whenever you receive a document that you need to
-archive, put it into your inbox. Regulary, do the following for all documents
+archive, put it into your inbox. Regularly, do the following for all documents
 in your inbox:
 1.  For each document, decide if you need to keep the document in physical
@ -217,18 +217,24 @@ Once you have scanned in a document, proceed in paperless as follows.
 1.  If the document has an ASN, assign the ASN to the document.
 2.  Assign a correspondent to the document (e.g., your employer, bank, etc.).
-    This isnt strictly necessary but helps in finding a document when you need
+    This isn't strictly necessary but helps in finding a document when you need
    it.
 3.  Assign a document type (e.g., invoice, bank statement, etc.) to the document.
-    This isnt strictly necessary but helps in finding a document when you need
+    This isn't strictly necessary but helps in finding a document when you need
    it.
 4.  Assign a proper title to the document (the name of an item you bought, the
    subject of the letter, etc)
-5.  Check that the date of the document is corrent. Paperless tries to read
+5.  Check that the date of the document is correct. Paperless tries to read
    the date from the content of the document, but this fails sometimes if the
    OCR is bad or multiple dates appear on the document.
 6.  Remove inbox tags from the documents.
 .. hint::
    You can set up manual matching rules for your correspondents and tags and
    paperless will assign them automatically. After consuming a couple of documents,
    you can even ask paperless to *learn* when to assign tags and correspondents
    by itself. For details on this feature, see :ref:`advanced-matching`.
 Task management
 ===============
--- a/paperless.conf.example
+++ b/paperless.conf.example
@ -29,6 +29,7 @@
 #PAPERLESS_CORS_ALLOWED_HOSTS=localhost:8080,example.com,localhost:8000
 #PAPERLESS_FORCE_SCRIPT_NAME=
 #PAPERLESS_STATIC_URL=/static/
 #PAPERLESS_AUTO_LOGIN_USERNAME=
 # Software tweaks
--- a/scripts/make-release.sh
+++ b/scripts/make-release.sh
@ -1,5 +1,19 @@
 #!/bin/bash
 # Release checklist
 # - wait for travis build.
 # adjust src/paperless/version.py
 # changelog in the documentation
 # adjust versions in docker/hub/*
 # If docker-compose was modified: all compose files are the same.
 # Steps:
 # run release script "dev", push
 # if it works: new tag, merge into master
 # on master: make release "latest", push
 # on master: make release "version-tag", push
 # publish release files
 set -e
--- a/src-ui/src/app/app.module.ts
+++ b/src-ui/src/app/app.module.ts
@ -23,7 +23,7 @@ import { TagEditDialogComponent } from './components/manage/tag-list/tag-edit-di
 import { DocumentTypeEditDialogComponent } from './components/manage/document-type-list/document-type-edit-dialog/document-type-edit-dialog.component';
 import { TagComponent } from './components/common/tag/tag.component';
 import { SearchComponent } from './components/search/search.component';
-import { ResultHightlightComponent } from './components/search/result-hightlight/result-hightlight.component';
+import { ResultHighlightComponent } from './components/search/result-highlight/result-highlight.component';
 import { PageHeaderComponent } from './components/common/page-header/page-header.component';
 import { AppFrameComponent } from './components/app-frame/app-frame.component';
 import { ToastsComponent } from './components/common/toasts/toasts.component';
@ -65,7 +65,7 @@ import { WidgetFrameComponent } from './components/dashboard/widgets/widget-fram
    DocumentTypeEditDialogComponent,
    TagComponent,
    SearchComponent,
-    ResultHightlightComponent,
+    ResultHighlightComponent,
    PageHeaderComponent,
    AppFrameComponent,
    ToastsComponent,
--- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
+++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
@ -11,7 +11,7 @@
          <h5 class="card-title" *ngIf="document.archive_serial_number">#{{document.archive_serial_number}}</h5>
        </div>
        <p class="card-text">
-          <app-result-hightlight *ngIf="getDetailsAsHighlight()" class="result-content" [highlights]="getDetailsAsHighlight()"></app-result-hightlight>
+          <app-result-highlight *ngIf="getDetailsAsHighlight()" class="result-content" [highlights]="getDetailsAsHighlight()"></app-result-highlight>
          <span *ngIf="getDetailsAsString()" class="result-content">{{getDetailsAsString()}}</span>
        </p>
--- a/src-ui/src/app/components/search/result-hightlight/result-hightlight.component.html
+++ b/src-ui/src/app/components/search/result-hightlight/result-hightlight.component.html
--- a/src-ui/src/app/components/search/result-hightlight/result-hightlight.component.scss
+++ b/src-ui/src/app/components/search/result-hightlight/result-hightlight.component.scss
--- a/src-ui/src/app/components/search/result-hightlight/result-hightlight.component.spec.ts
+++ b/src-ui/src/app/components/search/result-hightlight/result-hightlight.component.spec.ts
@ -1,20 +1,20 @@
 import { ComponentFixture, TestBed } from '@angular/core/testing';
-import { ResultHightlightComponent } from './result-hightlight.component';
+import { ResultHighlightComponent } from './result-highlight.component';
-describe('ResultHightlightComponent', () => {
+describe('ResultHighlightComponent', () => {
-  let component: ResultHightlightComponent;
+  let component: ResultHighlightComponent;
-  let fixture: ComponentFixture<ResultHightlightComponent>;
+  let fixture: ComponentFixture<ResultHighlightComponent>;
  beforeEach(async () => {
    await TestBed.configureTestingModule({
-      declarations: [ ResultHightlightComponent ]
+      declarations: [ ResultHighlightComponent ]
    })
    .compileComponents();
  });
  beforeEach(() => {
-    fixture = TestBed.createComponent(ResultHightlightComponent);
+    fixture = TestBed.createComponent(ResultHighlightComponent);
    component = fixture.componentInstance;
    fixture.detectChanges();
  });
--- a/src-ui/src/app/components/search/result-hightlight/result-hightlight.component.ts
+++ b/src-ui/src/app/components/search/result-hightlight/result-hightlight.component.ts
@ -2,11 +2,11 @@ import { Component, Input, OnInit } from '@angular/core';
 import { SearchHitHighlight } from 'src/app/data/search-result';
@Component({
-  selector: 'app-result-hightlight',
+  selector: 'app-result-highlight',
-  templateUrl: './result-hightlight.component.html',
+  templateUrl: './result-highlight.component.html',
-  styleUrls: ['./result-hightlight.component.scss']
+  styleUrls: ['./result-highlight.component.scss']
 })
-export class ResultHightlightComponent implements OnInit {
+export class ResultHighlightComponent implements OnInit {
  constructor() { }
--- a/src/documents/init.py
+++ b/src/documents/init.py
@ -1 +1,2 @@
-from .checks import changed_password_check
+# this is here so that django finds the checks.
 from .checks import *
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@ -4,12 +4,13 @@ import os
 import pickle
 import re
 from django.conf import settings
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.neural_network import MLPClassifier
-from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
 from sklearn.utils.multiclass import type_of_target
 from documents.models import Document, MatchingModel
 from paperless import settings
 class IncompatibleClassifierVersionError(Exception):
@ -27,7 +28,7 @@ def preprocess_content(content):
 class DocumentClassifier(object):
-    FORMAT_VERSION = 5
+    FORMAT_VERSION = 6
    def __init__(self):
        # mtime of the model file on disk. used to prevent reloading when
@ -54,6 +55,8 @@ class DocumentClassifier(object):
                        "Cannor load classifier, incompatible versions.")
                else:
                    if self.classifier_version > 0:
                        # Don't be confused by this check. It's simply here
                        # so that we won't log anything on initial reload.
                        logger.info("Classifier updated on disk, "
                                    "reloading classifier models")
                    self.data_hash = pickle.load(f)
@ -122,9 +125,14 @@ class DocumentClassifier(object):
        labels_tags_unique = set([tag for tags in labels_tags for tag in tags])
        num_tags = len(labels_tags_unique)
        # subtract 1 since -1 (null) is also part of the classes.
-        num_correspondents = len(set(labels_correspondent)) - 1
+
-        num_document_types = len(set(labels_document_type)) - 1
+        # union with {-1} accounts for cases where all documents have
        # correspondents and types assigned, so -1 isn't part of labels_x, which
        # it usually is.
        num_correspondents = len(set(labels_correspondent) | {-1}) - 1
        num_document_types = len(set(labels_document_type) | {-1}) - 1
        logging.getLogger(__name__).debug(
            "{} documents, {} tag(s), {} correspondent(s), "
@ -145,12 +153,23 @@ class DocumentClassifier(object):
        )
        data_vectorized = self.data_vectorizer.fit_transform(data)
        self.tags_binarizer = MultiLabelBinarizer()
        labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags)
        # Step 3: train the classifiers
        if num_tags > 0:
            logging.getLogger(__name__).debug("Training tags classifier...")
            if num_tags == 1:
                # Special case where only one tag has auto:
                # Fallback to binary classification.
                labels_tags = [label[0] if len(label) == 1 else -1
                               for label in labels_tags]
                self.tags_binarizer = LabelBinarizer()
                labels_tags_vectorized = self.tags_binarizer.fit_transform(
                    labels_tags).ravel()
            else:
                self.tags_binarizer = MultiLabelBinarizer()
                labels_tags_vectorized = self.tags_binarizer.fit_transform(
                    labels_tags)
            self.tags_classifier = MLPClassifier(tol=0.01)
            self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
        else:
@ -222,6 +241,16 @@ class DocumentClassifier(object):
            X = self.data_vectorizer.transform([preprocess_content(content)])
            y = self.tags_classifier.predict(X)
            tags_ids = self.tags_binarizer.inverse_transform(y)[0]
-            return tags_ids
+            if type_of_target(y).startswith('multilabel'):
                # the usual case when there are multiple tags.
                return list(tags_ids)
            elif type_of_target(y) == 'binary' and tags_ids != -1:
                # This is for when we have binary classification with only one
                # tag and the result is to assign this tag.
                return [tags_ids]
            else:
                # Usually binary as well with -1 as the result, but we're
                # going to catch everything else here as well.
                return []
        else:
            return []
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@ -8,7 +8,6 @@ from django.conf import settings
 from django.db import transaction
 from django.utils import timezone
 from paperless.db import GnuPG
 from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
 from .file_handling import generate_filename, create_source_path_directory
 from .loggers import LoggingMixin
@ -40,17 +39,6 @@ class Consumer(LoggingMixin):
            raise ConsumerError("Cannot consume {}: It is not a file".format(
                self.path))
    def pre_check_consumption_dir(self):
        if not settings.CONSUMPTION_DIR:
            raise ConsumerError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set.")
        if not os.path.isdir(settings.CONSUMPTION_DIR):
            raise ConsumerError(
                "Consumption directory {} does not exist".format(
                    settings.CONSUMPTION_DIR))
    def pre_check_duplicate(self):
        with open(self.path, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
@ -92,7 +80,6 @@ class Consumer(LoggingMixin):
        # Make sure that preconditions for consuming the file are met.
        self.pre_check_file_exists()
        self.pre_check_consumption_dir()
        self.pre_check_directories()
        self.pre_check_duplicate()
@ -208,9 +195,6 @@ class Consumer(LoggingMixin):
        created = file_info.created or date or timezone.make_aware(
            datetime.datetime.fromtimestamp(stats.st_mtime))
        if settings.PASSPHRASE:
            storage_type = Document.STORAGE_TYPE_GPG
        else:
        storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        with open(self.path, "rb") as f:
@ -260,8 +244,4 @@ class Consumer(LoggingMixin):
    def _write(self, document, source, target):
        with open(source, "rb") as read_file:
            with open(target, "wb") as write_file:
                if document.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
                write_file.write(read_file.read())
                    return
                self.log("debug", "Encrypting")
                write_file.write(GnuPG.encrypted(read_file))
--- a/src/documents/index.py
+++ b/src/documents/index.py
@ -64,12 +64,12 @@ def get_schema():
 def open_index(recreate=False):
    try:
        if exists_in(settings.INDEX_DIR) and not recreate:
            return open_dir(settings.INDEX_DIR)
-    else:
+    except Exception as e:
-        # TODO: this is not thread safe. If 2 instances try to create the index
+        logger.error(f"Error while opening the index: {e}, recreating.")
-        #  at the same time, this fails. This currently prevents parallel
+
        #  tests.
    if not os.path.isdir(settings.INDEX_DIR):
        os.makedirs(settings.INDEX_DIR, exist_ok=True)
    return create_in(settings.INDEX_DIR, get_schema())
--- a/src/documents/loggers.py
+++ b/src/documents/loggers.py
@ -1,9 +1,14 @@
 import logging
 import uuid
 from django.conf import settings
 class PaperlessHandler(logging.Handler):
    def emit(self, record):
        if settings.DISABLE_DBHANDLER:
            return
        # We have to do the import here or Django will barf when it tries to
        # load this because the apps aren't loaded at that point
        from .models import Log
--- a/src/documents/management/commands/change_storage_type.py
+++ b/src/documents/management/commands/change_storage_type.py
@ -17,16 +17,6 @@ class Command(BaseCommand):
    def add_arguments(self, parser):
        parser.add_argument(
            "from",
            choices=("gpg", "unencrypted"),
            help="The state you want to change your documents from"
        )
        parser.add_argument(
            "to",
            choices=("gpg", "unencrypted"),
            help="The state you want to change your documents to"
        )
        parser.add_argument(
            "--passphrase",
            help="If PAPERLESS_PASSPHRASE isn't set already, you need to "
@ -50,11 +40,6 @@ class Command(BaseCommand):
        except KeyboardInterrupt:
            return
        if options["from"] == options["to"]:
            raise CommandError(
                'The "from" and "to" values can\'t be the same.'
            )
        passphrase = options["passphrase"] or settings.PASSPHRASE
        if not passphrase:
            raise CommandError(
@ -62,10 +47,7 @@ class Command(BaseCommand):
                "by declaring it in your environment or your config."
            )
        if options["from"] == "gpg" and options["to"] == "unencrypted":
        self.__gpg_to_unencrypted(passphrase)
        elif options["from"] == "unencrypted" and options["to"] == "gpg":
            self.__unencrypted_to_gpg(passphrase)
    @staticmethod
    def __gpg_to_unencrypted(passphrase):
@ -79,42 +61,28 @@ class Command(BaseCommand):
                document).encode('utf-8'), "green"))
            old_paths = [document.source_path, document.thumbnail_path]
            raw_document = GnuPG.decrypted(document.source_file, passphrase)
            raw_thumb = GnuPG.decrypted(document.thumbnail_file, passphrase)
            document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
            ext = os.path.splitext(document.filename)[1]
            if not ext == '.gpg':
                raise CommandError(
                    f"Abort: encrypted file {document.source_path} does not "
                    f"end with .gpg")
            document.filename = os.path.splitext(document.filename)[0]
            with open(document.source_path, "wb") as f:
                f.write(raw_document)
            with open(document.thumbnail_path, "wb") as f:
                f.write(raw_thumb)
-            document.save(update_fields=("storage_type",))
+            document.save(update_fields=("storage_type", "filename"))
            for path in old_paths:
                os.unlink(path)
    @staticmethod
    def __unencrypted_to_gpg(passphrase):
        unencrypted_files = Document.objects.filter(
            storage_type=Document.STORAGE_TYPE_UNENCRYPTED)
        for document in unencrypted_files:
            print(coloured("Encrypting {}".format(document), "green"))
            old_paths = [document.source_path, document.thumbnail_path]
            with open(document.source_path, "rb") as raw_document:
                with open(document.thumbnail_path, "rb") as raw_thumb:
                    document.storage_type = Document.STORAGE_TYPE_GPG
                    with open(document.source_path, "wb") as f:
                        f.write(GnuPG.encrypted(raw_document, passphrase))
                    with open(document.thumbnail_path, "wb") as f:
                        f.write(GnuPG.encrypted(raw_thumb, passphrase))
            document.save(update_fields=("storage_type",))
            for path in old_paths:
                os.unlink(path)
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@ -1,11 +1,11 @@
 import logging
 import os
 from time import sleep
 from django.conf import settings
-from django.core.management.base import BaseCommand
+from django.core.management.base import BaseCommand, CommandError
 from django_q.tasks import async_task
 from watchdog.events import FileSystemEventHandler
 from watchdog.observers import Observer
 from watchdog.observers.polling import PollingObserver
 try:
@ -13,25 +13,54 @@ try:
 except ImportError:
    INotify = flags = None
 logger = logging.getLogger(__name__)
 class Handler(FileSystemEventHandler):
-    def _consume(self, file):
+def _consume(file):
        if os.path.isfile(file):
    try:
        if os.path.isfile(file):
            async_task("documents.tasks.consume_file",
                       file,
                       task_name=os.path.basename(file)[:100])
        else:
            logger.debug(
                f"Not consuming file {file}: File has moved.")
    except Exception as e:
        # Catch all so that the consumer won't crash.
-                logging.getLogger(__name__).error(
+        # This is also what the test case is listening for to check for
        # errors.
        logger.error(
            "Error while consuming document: {}".format(e))
 def _consume_wait_unmodified(file, num_tries=20, wait_time=1):
    """Consume ``file`` once its modification time stops changing.

    The mtime of ``file`` is sampled up to ``num_tries`` times, sleeping
    ``wait_time`` between samples. As soon as two consecutive samples are
    equal, the file is handed to ``_consume``. If the file disappears while
    waiting, a debug message is logged and nothing is consumed. If the
    mtime never settles within the allowed tries, an error is logged.
    """
    last_seen_mtime = -1
    tries_left = num_tries
    while tries_left > 0:
        try:
            stat_result = os.stat(file)
        except FileNotFoundError:
            logger.debug(f"File {file} moved while waiting for it to remain "
                         f"unmodified.")
            return
        current_mtime = stat_result.st_mtime
        if current_mtime != last_seen_mtime:
            # First sample, or the file changed since the previous sample:
            # remember the new mtime and look again after a pause.
            last_seen_mtime = current_mtime
            sleep(wait_time)
            tries_left -= 1
            continue
        _consume(file)
        return
    logger.error(f"Timeout while waiting on file {file} to remain unmodified.")
 class Handler(FileSystemEventHandler):
    def on_created(self, event):
-        self._consume(event.src_path)
+        _consume_wait_unmodified(event.src_path)
    def on_moved(self, event):
-        self._consume(event.src_path)
+        _consume_wait_unmodified(event.dest_path)
 class Command(BaseCommand):
@ -40,12 +69,15 @@ class Command(BaseCommand):
    consumption directory.
    """
    # This is here primarily for the tests and is irrelevant in production.
    stop_flag = False
    def __init__(self, *args, **kwargs):
        """Set up the command's own state and initialise the base command.

        ``verbosity`` starts at 0, ``logger`` is the module logger and
        ``observer`` starts out as ``None``.
        """
        self.verbosity = 0
        self.logger = logging.getLogger(__name__)
        super().__init__(*args, **kwargs)
        self.observer = None
    def add_arguments(self, parser):
        parser.add_argument(
@ -54,38 +86,66 @@ class Command(BaseCommand):
            nargs="?",
            help="The consumption directory."
        )
        parser.add_argument(
            "--oneshot",
            action="store_true",
            help="Run only once."
        )
    def handle(self, *args, **options):
        self.verbosity = options["verbosity"]
        directory = options["directory"]
-        logging.getLogger(__name__).info(
+        if not directory:
-            "Starting document consumer at {}".format(
+            raise CommandError(
-                directory
+                "CONSUMPTION_DIR does not appear to be set."
            )
            )
-        # Consume all files as this is not done initially by the watchdog
+        if not os.path.isdir(directory):
            raise CommandError(
                f"Consumption directory {directory} does not exist")
        for entry in os.scandir(directory):
-            if entry.is_file():
+            _consume(entry.path)
                async_task("documents.tasks.consume_file",
                           entry.path,
                           task_name=os.path.basename(entry.path)[:100])
-        # Start the watchdog. Woof!
+        if options["oneshot"]:
-        if settings.CONSUMER_POLLING > 0:
+            return
-            logging.getLogger(__name__).info(
+
-                "Using polling instead of file system notifications.")
+        if settings.CONSUMER_POLLING == 0 and INotify:
-            observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
+            self.handle_inotify(directory)
        else:
-            observer = Observer()
+            self.handle_polling(directory)
-        event_handler = Handler()
+
-        observer.schedule(event_handler, directory, recursive=True)
+        logger.debug("Consumer exiting.")
-        observer.start()
+
    def handle_polling(self, directory):
        logging.getLogger(__name__).info(
            f"Polling directory for changes: {directory}")
        self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
        self.observer.schedule(Handler(), directory, recursive=False)
        self.observer.start()
        try:
-            while observer.is_alive():
+            while self.observer.is_alive():
-                observer.join(1)
+                self.observer.join(1)
                if self.stop_flag:
                    self.observer.stop()
        except KeyboardInterrupt:
-            observer.stop()
+            self.observer.stop()
-        observer.join()
+        self.observer.join()
    def handle_inotify(self, directory):
        """Watch ``directory`` with inotify and consume files that show up.

        A watch for ``CLOSE_WRITE`` and ``MOVED_TO`` events is registered on
        ``directory``. Every reported event name is joined with
        ``directory`` and passed to ``_consume``. The loop runs until
        ``self.stop_flag`` is set or a ``KeyboardInterrupt`` arrives.

        The watch is removed and the inotify handle is closed on every exit
        path, including unexpected exceptions, which are still propagated.
        """
        logging.getLogger(__name__).info(
            f"Using inotify to watch directory for changes: {directory}")

        inotify = INotify()
        try:
            descriptor = inotify.add_watch(
                directory, flags.CLOSE_WRITE | flags.MOVED_TO)
            try:
                while not self.stop_flag:
                    # read() is given a timeout so that it returns regularly
                    # and the stop flag gets re-checked even without events.
                    for event in inotify.read(timeout=1000, read_delay=1000):
                        file = os.path.join(directory, event.name)
                        _consume(file)
            except KeyboardInterrupt:
                pass
            finally:
                inotify.rm_watch(descriptor)
        finally:
            # Always release the inotify file descriptor, even if adding or
            # removing the watch failed.
            inotify.close()
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@ -22,13 +22,6 @@ class Command(Renderable, BaseCommand):
    def add_arguments(self, parser):
        parser.add_argument("target")
        parser.add_argument(
            "--legacy",
            action="store_true",
            help="Don't try to export all of the document data, just dump the "
                 "original document files out in a format that makes "
                 "re-consuming them easy."
        )
    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)
@ -44,9 +37,6 @@ class Command(Renderable, BaseCommand):
        if not os.access(self.target, os.W_OK):
            raise CommandError("That path doesn't appear to be writable")
        if options["legacy"]:
            self.dump_legacy()
        else:
        self.dump()
    def dump(self):
@ -102,33 +92,3 @@ class Command(Renderable, BaseCommand):
        with open(os.path.join(self.target, "manifest.json"), "w") as f:
            json.dump(manifest, f, indent=2)
    def dump_legacy(self):
        for document in Document.objects.all():
            target = os.path.join(
                self.target, self._get_legacy_file_name(document))
            print("Exporting: {}".format(target))
            with open(target, "wb") as f:
                f.write(GnuPG.decrypted(document.source_file))
                t = int(time.mktime(document.created.timetuple()))
                os.utime(target, times=(t, t))
    @staticmethod
    def _get_legacy_file_name(doc):
        if not doc.correspondent and not doc.title:
            return os.path.basename(doc.source_path)
        created = doc.created.strftime("%Y%m%d%H%M%SZ")
        tags = ",".join([t.slug for t in doc.tags.all()])
        if tags:
            return "{} - {} - {} - {}{}".format(
                created, doc.correspondent, doc.title, tags, doc.file_type)
        return "{} - {} - {}{}".format(
            created, doc.correspondent, doc.title, doc.file_type)
--- a/src/documents/management/commands/document_importer.py
+++ b/src/documents/management/commands/document_importer.py
@ -82,8 +82,6 @@ class Command(Renderable, BaseCommand):
    def _import_files_from_manifest(self):
        storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        if settings.PASSPHRASE:
            storage_type = Document.STORAGE_TYPE_GPG
        for record in self.manifest:
@ -105,21 +103,6 @@ class Command(Renderable, BaseCommand):
            create_source_path_directory(document.source_path)
            if settings.PASSPHRASE:
                with open(document_path, "rb") as unencrypted:
                    with open(document.source_path, "wb") as encrypted:
                        print("Encrypting {} and saving it to {}".format(
                            doc_file, document.source_path))
                        encrypted.write(GnuPG.encrypted(unencrypted))
                with open(thumbnail_path, "rb") as unencrypted:
                    with open(document.thumbnail_path, "wb") as encrypted:
                        print("Encrypting {} and saving it to {}".format(
                            thumb_file, document.thumbnail_path))
                        encrypted.write(GnuPG.encrypted(unencrypted))
            else:
            print(f"Moving {document_path} to {document.source_path}")
            shutil.copy(document_path, document.source_path)
            shutil.copy(thumbnail_path, document.thumbnail_path)
--- a/src/documents/migrations/1000_update_paperless_all.py
+++ b/src/documents/migrations/1000_update_paperless_all.py
@ -5,23 +5,6 @@ from django.db import migrations, models
 import django.db.models.deletion
 def make_index(apps, schema_editor):
    Document = apps.get_model("documents", "Document")
    documents = Document.objects.all()
    print()
    try:
        print("  --> Creating document index...")
        from whoosh.writing import AsyncWriter
        from documents import index
        ix = index.open_index(recreate=True)
        with AsyncWriter(ix) as writer:
            for document in documents:
                index.update_document(writer, document)
    except ImportError:
        # index may not be relevant anymore
        print("  --> Cannot create document index.")
 def logs_set_default_group(apps, schema_editor):
    Log = apps.get_model('documents', 'Log')
    for log in Log.objects.all():
@ -99,8 +82,4 @@ class Migration(migrations.Migration):
            code=django.db.migrations.operations.special.RunPython.noop,
            reverse_code=logs_set_default_group
        ),
        migrations.RunPython(
            code=make_index,
            reverse_code=django.db.migrations.operations.special.RunPython.noop,
        ),
    ]
--- a/src/documents/migrations/1004_sanity_check_schedule.py
+++ b/src/documents/migrations/1004_sanity_check_schedule.py
@ -0,0 +1,26 @@
 # Generated by Django 3.1.3 on 2020-11-25 14:53
 from django.db import migrations
 from django.db.migrations import RunPython
 from django_q.models import Schedule
 from django_q.tasks import schedule
 def add_schedules(apps, schema_editor):
    """Forward migration step: schedule the weekly sanity check task."""
    schedule(
        'documents.tasks.sanity_check',
        name="Perform sanity check",
        schedule_type=Schedule.WEEKLY,
    )
 def remove_schedules(apps, schema_editor):
    """Reverse migration step: delete every schedule for the sanity check."""
    sanity_check_schedules = Schedule.objects.filter(
        func='documents.tasks.sanity_check')
    sanity_check_schedules.delete()
 class Migration(migrations.Migration):
    """Adds the scheduled sanity check task; reversing removes it again."""

    # The django_q dependency is listed explicitly because the operation
    # below reads and writes django_q's Schedule table.
    dependencies = [
        ('documents', '1003_mime_types'),
        ('django_q', '0013_task_attempt_count'),
    ]

    operations = [
        RunPython(code=add_schedules, reverse_code=remove_schedules),
    ]
--- a/src/documents/models.py
+++ b/src/documents/models.py
@ -230,6 +230,7 @@ class Document(models.Model):
    @property
    def file_type(self):
        """File extension guessed from this object's ``mime_type``.

        The mime type is converted to ``str`` and looked up with
        ``mimetypes.guess_extension``; the lookup result is returned as is.
        """
        # TODO: this is not stable across python versions
        mime_type_name = str(self.mime_type)
        return mimetypes.guess_extension(mime_type_name)
    @property
--- a/src/documents/sanity_checker.py
+++ b/src/documents/sanity_checker.py
@ -0,0 +1,94 @@
 import hashlib
 import os
 from django.conf import settings
 from documents.models import Document
 class SanityMessage:
    """
    Base class of everything the sanity checker reports.

    Holds the human readable text in ``message``; subclasses only decide
    how the text is rendered by ``str()``.
    """
    # Class-level fallback so that ``message`` always exists.
    message = None
    def __init__(self, message=None):
        self.message = message
 class SanityWarning(SanityMessage):
    """A finding that does not indicate damaged or missing data."""
    def __str__(self):
        return f"Warning: {self.message}"
 class SanityError(SanityMessage):
    """A finding about a missing, unreadable or modified file."""
    def __str__(self):
        return f"ERROR: {self.message}"
 class SanityFailedError(Exception):
    """
    Raised when the sanity checker reported at least one message.

    ``messages`` is the iterable of reported messages; ``str()`` renders
    one message per line followed by a separator.
    """
    def __init__(self, messages):
        self.messages = messages
    def __str__(self):
        message_string = "\n".join(str(m) for m in self.messages)
        return (
            f"The following issues were found by the sanity checker:\n"
            f"{message_string}\n\n===============\n\n")
 def check_sanity():
    """
    Compare the documents in the database with the files in the media dir.

    For every document the thumbnail and the original must exist and be
    readable, and the MD5 of the original must equal the stored checksum.
    Documents without content and files below MEDIA_ROOT that belong to no
    document are reported as warnings.

    Returns a list of SanityError / SanityWarning objects; the list is
    empty when nothing was found.
    """
    messages = []
    # Every file found below MEDIA_ROOT. Entries are dropped again as soon as
    # a document claims them, so whatever is left at the end is orphaned.
    present_files = set()
    for root, _subdirs, files in os.walk(settings.MEDIA_ROOT):
        for name in files:
            present_files.add(os.path.normpath(os.path.join(root, name)))
    for doc in Document.objects.all():
        # Check thumbnail
        if not os.path.isfile(doc.thumbnail_path):
            messages.append(SanityError(
                f"Thumbnail of document {doc.pk} does not exist."))
        else:
            # discard() rather than remove(): a file the walk above did not
            # see (e.g. stored outside MEDIA_ROOT or already claimed by
            # another document) must not abort the whole check.
            present_files.discard(os.path.normpath(doc.thumbnail_path))
            try:
                with doc.thumbnail_file as f:
                    f.read()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read thumbnail file of document {doc.pk}: {e}"
                ))
        # Check document
        if not os.path.isfile(doc.source_path):
            messages.append(SanityError(
                f"Original of document {doc.pk} does not exist."))
        else:
            present_files.discard(os.path.normpath(doc.source_path))
            # Stays None when the file could not be read, so that a read
            # error is not reported a second time as a checksum mismatch.
            checksum = None
            try:
                with doc.source_file as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read original file of document {doc.pk}: {e}"))
            if checksum is not None and checksum != doc.checksum:
                messages.append(SanityError(
                    f"Checksum mismatch of document {doc.pk}. "
                    f"Stored: {doc.checksum}, actual: {checksum}."
                ))
        if not doc.content:
            messages.append(SanityWarning(
                f"Document {doc.pk} has no content."
            ))
    # Sorted so that the report is identical between runs.
    for extra_file in sorted(present_files):
        messages.append(SanityWarning(
            f"Orphaned file in media dir: {extra_file}"
        ))
    return messages
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@ -93,14 +93,11 @@ class DocumentSerializer(serializers.ModelSerializer):
            "document_type_id",
            "title",
            "content",
            "mime_type",
            "tags",
            "tags_id",
            "checksum",
            "created",
            "modified",
            "added",
            "file_name",
            "archive_serial_number"
        )
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@ -3,11 +3,12 @@ import logging
 from django.conf import settings
 from whoosh.writing import AsyncWriter
-from documents import index
+from documents import index, sanity_checker
 from documents.classifier import DocumentClassifier, \
    IncompatibleClassifierVersionError
 from documents.consumer import Consumer, ConsumerError
 from documents.models import Document
 from documents.sanity_checker import SanityFailedError
 def index_optimize():
@ -74,3 +75,12 @@ def consume_file(path,
    else:
        raise ConsumerError("Unknown error: Returned document was null, but "
                            "no error message was given.")
 def sanity_check():
    """
    Run the sanity checker as a task.

    Returns a short success text when nothing was reported and raises
    SanityFailedError carrying all messages otherwise.
    """
    found = sanity_checker.check_sanity()
    if len(found) == 0:
        return "No issues detected."
    raise SanityFailedError(found)
--- a/src/documents/tests/samples/originals/0000001.pdf
+++ b/src/documents/tests/samples/originals/0000001.pdf
--- a/src/documents/tests/samples/originals/0000002.pdf.gpg
+++ b/src/documents/tests/samples/originals/0000002.pdf.gpg
--- a/src/documents/tests/samples/simple.pdf
+++ b/src/documents/tests/samples/simple.pdf
--- a/src/documents/tests/samples/simple.zip
+++ b/src/documents/tests/samples/simple.zip
--- a/src/documents/tests/samples/thumb/0000001.png
+++ b/src/documents/tests/samples/thumb/0000001.png
--- a/src/documents/tests/samples/thumb/0000002.png.gpg
+++ b/src/documents/tests/samples/thumb/0000002.png.gpg
--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@ -1,40 +1,24 @@
 import os
 import shutil
 import tempfile
 from unittest import mock
 from django.contrib.auth.models import User
-from django.test import override_settings
+from pathvalidate import ValidationError
 from rest_framework.test import APITestCase
 from documents import index
 from documents.models import Document, Correspondent, DocumentType, Tag
 from documents.tests.utils import DirectoriesMixin
-class DocumentApiTest(APITestCase):
+class DocumentApiTest(DirectoriesMixin, APITestCase):
    def setUp(self):
-        self.scratch_dir = tempfile.mkdtemp()
+        super(DocumentApiTest, self).setUp()
        self.media_dir = tempfile.mkdtemp()
        self.originals_dir = os.path.join(self.media_dir, "documents", "originals")
        self.thumbnail_dir = os.path.join(self.media_dir, "documents", "thumbnails")
        os.makedirs(self.originals_dir, exist_ok=True)
        os.makedirs(self.thumbnail_dir, exist_ok=True)
        override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=self.originals_dir,
            THUMBNAIL_DIR=self.thumbnail_dir
        ).enable()
        user = User.objects.create_superuser(username="temp_admin")
        self.client.force_login(user=user)
    def tearDown(self):
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)
    def testDocuments(self):
        response = self.client.get("/api/documents/").data
@ -87,7 +71,7 @@ class DocumentApiTest(APITestCase):
    def test_document_actions(self):
-        _, filename = tempfile.mkstemp(dir=self.originals_dir)
+        _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir)
        content = b"This is a test"
        content_thumbnail = b"thumbnail content"
@ -97,7 +81,7 @@ class DocumentApiTest(APITestCase):
        doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf")
-        with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
+        with open(os.path.join(self.dirs.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
            f.write(content_thumbnail)
        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
@ -179,6 +163,109 @@ class DocumentApiTest(APITestCase):
        results = response.data['results']
        self.assertEqual(len(results), 3)
    def test_search_no_query(self):
        """A search request without a query returns no results."""
        data = self.client.get("/api/search/").data
        self.assertEqual(len(data['results']), 0)
    def test_search(self):
        """Index three documents and check hit count and paging info per query."""
        docs = [
            Document.objects.create(title="invoice", content="the thing i bought at a shop and paid with bank account", checksum="A", pk=1),
            Document.objects.create(title="bank statement 1", content="things i paid for in august", pk=2, checksum="B"),
            Document.objects.create(title="bank statement 3", content="things i paid for in september", pk=3, checksum="C"),
        ]
        with index.open_index(False).writer() as writer:
            # Note to future self: there is a reason we don't use a model signal handler to update the index: some operations edit many documents at once
            # (retagger, renamer) and we don't want to open a writer for each of these, but rather perform the entire operation with one writer.
            # That's why we can't open the writer in a model on_save handler or something.
            for doc in docs:
                index.update_document(writer, doc)
        # (url, expected hit count, expected page, expected page count)
        expectations = [
            ("/api/search/?query=bank", 3, 1, 1),
            ("/api/search/?query=september", 1, 1, 1),
            ("/api/search/?query=statement", 2, 1, 1),
            ("/api/search/?query=sfegdfg", 0, 0, 0),
        ]
        for url, count, page, page_count in expectations:
            response = self.client.get(url)
            results = response.data['results']
            self.assertEqual(response.data['count'], count)
            self.assertEqual(response.data['page'], page)
            self.assertEqual(response.data['page_count'], page_count)
            self.assertEqual(len(results), count)
    def test_search_multi_page(self):
        """Page through 55 hits and verify paging info and result sizes."""
        with index.open_index(False).writer() as writer:
            for n in range(1, 56):
                doc = Document.objects.create(checksum=str(n - 1), pk=n, title=f"Document {n}", content="content")
                index.update_document(writer, doc)
        # This is here so that we test that no document gets returned twice (might happen if the paging is not working)
        seen_ids = []
        # Pages 1 to 5 are full, page 6 holds the remaining five hits.
        for page, expected_len in [(1, 10), (2, 10), (3, 10), (4, 10), (5, 10), (6, 5)]:
            response = self.client.get(f"/api/search/?query=content&page={page}")
            results = response.data['results']
            self.assertEqual(response.data['count'], 55)
            self.assertEqual(response.data['page'], page)
            self.assertEqual(response.data['page_count'], 6)
            self.assertEqual(len(results), expected_len)
            for result in results:
                self.assertNotIn(result['id'], seen_ids)
                seen_ids.append(result['id'])
        # Requesting a page past the end is expected to give page 6 again.
        response = self.client.get("/api/search/?query=content&page=7")
        results = response.data['results']
        self.assertEqual(response.data['count'], 55)
        self.assertEqual(response.data['page'], 6)
        self.assertEqual(response.data['page_count'], 6)
        self.assertEqual(len(results), 5)
    def test_search_invalid_page(self):
        """Invalid page parameters are expected to return the first page."""
        with index.open_index(False).writer() as writer:
            for n in range(1, 16):
                doc = Document.objects.create(checksum=str(n - 1), pk=n, title=f"Document {n}", content="content")
                index.update_document(writer, doc)
        first_page = self.client.get("/api/search/?query=content&page=1").data
        second_page = self.client.get("/api/search/?query=content&page=2").data
        # Zero, non-numeric, empty and negative page values.
        invalid_page_urls = (
            "/api/search/?query=content&page=0",
            "/api/search/?query=content&page=dgfd",
            "/api/search/?query=content&page=",
            "/api/search/?query=content&page=-7868",
        )
        fallback_pages = [self.client.get(url).data for url in invalid_page_urls]
        for fallback in fallback_pages:
            self.assertDictEqual(first_page, fallback)
        self.assertNotEqual(len(first_page['results']), len(second_page['results']))
    @mock.patch("documents.index.autocomplete")
    def test_search_autocomplete(self, m):
        m.side_effect = lambda ix, term, limit: [term for _ in range(limit)]
@ -215,3 +302,42 @@ class DocumentApiTest(APITestCase):
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.data['documents_total'], 3)
        self.assertEqual(response.data['documents_inbox'], 1)
    @mock.patch("documents.forms.async_task")
    def test_upload(self, m):
        """Posting a PDF succeeds and queues one task with the file name."""
        sample = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
        with open(sample, "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"document": f})
        self.assertEqual(response.status_code, 200)
        m.assert_called_once()
        _, kwargs = m.call_args
        self.assertEqual(kwargs['override_filename'], "simple.pdf")
    @mock.patch("documents.forms.async_task")
    def test_upload_invalid_form(self, m):
        """A wrong form field name is rejected with 400 and queues no task."""
        sample = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
        with open(sample, "rb") as f:
            # The misspelled field name is the point of this test.
            payload = {"documenst": f}
            response = self.client.post("/api/documents/post_document/", payload)
        self.assertEqual(response.status_code, 400)
        m.assert_not_called()
    @mock.patch("documents.forms.async_task")
    def test_upload_invalid_file(self, m):
        """Uploading the zip sample is rejected with 400 and queues no task."""
        sample = os.path.join(os.path.dirname(__file__), "samples", "simple.zip")
        with open(sample, "rb") as f:
            payload = {"document": f}
            response = self.client.post("/api/documents/post_document/", payload)
        self.assertEqual(response.status_code, 400)
        m.assert_not_called()
    @mock.patch("documents.forms.async_task")
    @mock.patch("documents.forms.validate_filename")
    def test_upload_invalid_filename(self, validate_filename, async_task):
        """A file name failing validation gives 400 and queues no task."""
        # Make the (patched) validator reject whatever name it is given.
        validate_filename.side_effect = ValidationError()
        sample = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
        with open(sample, "rb") as f:
            payload = {"document": f}
            response = self.client.post("/api/documents/post_document/", payload)
        self.assertEqual(response.status_code, 400)
        async_task.assert_not_called()
--- a/src/documents/tests/test_classifier.py
+++ b/src/documents/tests/test_classifier.py
@ -1,24 +1,29 @@
 import tempfile
 from time import sleep
 from unittest import mock
 from django.test import TestCase, override_settings
-from documents.classifier import DocumentClassifier
+from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError
 from documents.models import Correspondent, Document, Tag, DocumentType
 from documents.tests.utils import DirectoriesMixin
-class TestClassifier(TestCase):
+class TestClassifier(DirectoriesMixin, TestCase):
    def setUp(self):
-
+        super(TestClassifier, self).setUp()
        self.classifier = DocumentClassifier()
    def generate_test_data(self):
        self.c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        self.c2 = Correspondent.objects.create(name="c2")
        self.c3 = Correspondent.objects.create(name="c3", matching_algorithm=Correspondent.MATCH_AUTO)
        self.t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        self.t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_ANY, pk=34, is_inbox_tag=True)
        self.t3 = Tag.objects.create(name="t3", matching_algorithm=Tag.MATCH_AUTO, pk=45)
        self.dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)
        self.dt2 = DocumentType.objects.create(name="dt2", matching_algorithm=DocumentType.MATCH_AUTO)
        self.doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=self.c1, checksum="A", document_type=self.dt)
        self.doc2 = Document.objects.create(title="doc1", content="this is another document, but from c2", correspondent=self.c2, checksum="B")
@ -59,8 +64,8 @@ class TestClassifier(TestCase):
        self.classifier.train()
        self.assertEqual(self.classifier.predict_correspondent(self.doc1.content), self.c1.pk)
        self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None)
-        self.assertTupleEqual(self.classifier.predict_tags(self.doc1.content), (self.t1.pk,))
+        self.assertListEqual(self.classifier.predict_tags(self.doc1.content), [self.t1.pk])
-        self.assertTupleEqual(self.classifier.predict_tags(self.doc2.content), (self.t1.pk, self.t3.pk))
+        self.assertListEqual(self.classifier.predict_tags(self.doc2.content), [self.t1.pk, self.t3.pk])
        self.assertEqual(self.classifier.predict_document_type(self.doc1.content), self.dt.pk)
        self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None)
@ -71,6 +76,44 @@ class TestClassifier(TestCase):
        self.assertTrue(self.classifier.train())
        self.assertFalse(self.classifier.train())
    def testVersionIncreased(self):
        """A classifier saved with another FORMAT_VERSION must not be loaded."""
        self.generate_test_data()
        self.assertTrue(self.classifier.train())
        self.assertFalse(self.classifier.train())
        self.classifier.save_classifier()
        reloaded = DocumentClassifier()
        bumped_version = DocumentClassifier.FORMAT_VERSION + 1
        with mock.patch("documents.classifier.DocumentClassifier.FORMAT_VERSION", bumped_version):
            # assure that we won't load old classifiers.
            with self.assertRaises(IncompatibleClassifierVersionError):
                reloaded.reload()
            self.classifier.save_classifier()
            # assure that we can load the classifier after saving it.
            reloaded.reload()
    def testReload(self):
        """Reloading after the classifier was saved again changes classifier_version."""
        self.generate_test_data()
        self.assertTrue(self.classifier.train())
        self.classifier.save_classifier()
        reloaded = DocumentClassifier()
        reloaded.reload()
        version_before = reloaded.classifier_version
        # change the classifier after some time.
        sleep(1)
        self.classifier.save_classifier()
        reloaded.reload()
        version_after = reloaded.classifier_version
        self.assertNotEqual(version_before, version_after)
    @override_settings(DATA_DIR=tempfile.mkdtemp())
    def testSaveClassifier(self):
@ -83,3 +126,112 @@ class TestClassifier(TestCase):
        new_classifier = DocumentClassifier()
        new_classifier.reload()
        self.assertFalse(new_classifier.train())
    def test_one_correspondent_predict(self):
        """With one auto-matching correspondent, its document is predicted as such."""
        correspondent = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        document = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=correspondent, checksum="A")
        self.classifier.train()
        predicted = self.classifier.predict_correspondent(document.content)
        self.assertEqual(predicted, correspondent.pk)
    def test_one_correspondent_predict_manydocs(self):
        """A document without correspondent gets no correspondent predicted."""
        correspondent = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        assigned = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=correspondent, checksum="A")
        unassigned = Document.objects.create(title="doc2", content="this is a document from noone", checksum="B")
        self.classifier.train()
        predicted_assigned = self.classifier.predict_correspondent(assigned.content)
        self.assertEqual(predicted_assigned, correspondent.pk)
        predicted_unassigned = self.classifier.predict_correspondent(unassigned.content)
        self.assertIsNone(predicted_unassigned)
    def test_one_type_predict(self):
        """With one auto-matching document type, its document is predicted as such."""
        doc_type = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)
        document = Document.objects.create(
            title="doc1",
            content="this is a document from c1",
            checksum="A",
            document_type=doc_type,
        )
        self.classifier.train()
        predicted = self.classifier.predict_document_type(document.content)
        self.assertEqual(predicted, doc_type.pk)
    def test_one_type_predict_manydocs(self):
        """A document without document type gets no type predicted."""
        doc_type = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)
        typed = Document.objects.create(
            title="doc1",
            content="this is a document from c1",
            checksum="A",
            document_type=doc_type,
        )
        untyped = Document.objects.create(
            title="doc1",
            content="this is a document from c2",
            checksum="B",
        )
        self.classifier.train()
        predicted_typed = self.classifier.predict_document_type(typed.content)
        self.assertEqual(predicted_typed, doc_type.pk)
        predicted_untyped = self.classifier.predict_document_type(untyped.content)
        self.assertIsNone(predicted_untyped)
    def test_one_tag_predict(self):
        """A document carrying the only auto-matching tag gets that tag predicted."""
        tag = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        document = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        document.tags.add(tag)
        self.classifier.train()
        predicted = self.classifier.predict_tags(document.content)
        self.assertListEqual(predicted, [tag.pk])
    def test_one_tag_predict_unassigned(self):
        """An auto-matching tag that no document carries is never predicted."""
        # The tag only has to exist; it is deliberately not assigned.
        Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        document = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        self.classifier.train()
        predicted = self.classifier.predict_tags(document.content)
        self.assertListEqual(predicted, [])
    def test_two_tags_predict_singledoc(self):
        """A single document carrying both tags gets both tags predicted."""
        first_tag = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        second_tag = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121)
        document = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D")
        for tag in (first_tag, second_tag):
            document.tags.add(tag)
        self.classifier.train()
        predicted = self.classifier.predict_tags(document.content)
        self.assertListEqual(predicted, [first_tag.pk, second_tag.pk])
    def test_two_tags_predict(self):
        """Four documents with every combination of two tags are predicted correctly."""
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121)
        only_t1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        only_t2 = Document.objects.create(title="doc1", content="this is a document from c2", checksum="B")
        untagged = Document.objects.create(title="doc1", content="this is a document from c3", checksum="C")
        both = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D")
        only_t1.tags.add(t1)
        only_t2.tags.add(t2)
        both.tags.add(t1)
        both.tags.add(t2)
        self.classifier.train()
        # (document, tag pks the classifier is expected to predict)
        expectations = [
            (only_t1, [t1.pk]),
            (only_t2, [t2.pk]),
            (untagged, []),
            (both, [t1.pk, t2.pk]),
        ]
        for document, expected_tags in expectations:
            self.assertListEqual(self.classifier.predict_tags(document.content), expected_tags)
    def test_one_tag_predict_multi(self):
        """Two documents carrying the same tag both get that tag predicted."""
        tag = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        first = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        second = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B")
        first.tags.add(tag)
        second.tags.add(tag)
        self.classifier.train()
        for document in (first, second):
            self.assertListEqual(self.classifier.predict_tags(document.content), [tag.pk])
    def test_one_tag_predict_multi_2(self):
        """Only the document that carries the tag gets the tag predicted."""
        tag = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        tagged = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        untagged = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B")
        tagged.tags.add(tag)
        self.classifier.train()
        predicted_tagged = self.classifier.predict_tags(tagged.content)
        self.assertListEqual(predicted_tagged, [tag.pk])
        predicted_untagged = self.classifier.predict_tags(untagged.content)
        self.assertListEqual(predicted_untagged, [])
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@ -1,12 +1,12 @@
 import os
 import re
 import shutil
 import tempfile
 from unittest import mock
 from unittest.mock import MagicMock
 from django.test import TestCase, override_settings
 from .utils import DirectoriesMixin
 from ..consumer import Consumer, ConsumerError
 from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
 from ..parsers import DocumentParser, ParseError
@ -408,26 +408,16 @@ def fake_magic_from_file(file, mime=False):
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
-class TestConsumer(TestCase):
+class TestConsumer(DirectoriesMixin, TestCase):
    def make_dummy_parser(self, path, logging_group):
-        return DummyParser(path, logging_group, self.scratch_dir)
+        return DummyParser(path, logging_group, self.dirs.scratch_dir)
    def make_faulty_parser(self, path, logging_group):
-        return FaultyParser(path, logging_group, self.scratch_dir)
+        return FaultyParser(path, logging_group, self.dirs.scratch_dir)
    def setUp(self):
-        self.scratch_dir = tempfile.mkdtemp()
+        super(TestConsumer, self).setUp()
        self.media_dir = tempfile.mkdtemp()
        self.consumption_dir = tempfile.mkdtemp()
        override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=os.path.join(self.media_dir, "documents", "originals"),
            THUMBNAIL_DIR=os.path.join(self.media_dir, "documents", "thumbnails"),
            CONSUMPTION_DIR=self.consumption_dir
        ).enable()
        patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
        m = patcher.start()
@ -441,13 +431,8 @@ class TestConsumer(TestCase):
        self.consumer = Consumer()
    def tearDown(self):
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)
        shutil.rmtree(self.consumption_dir, ignore_errors=True)
    def get_test_file(self):
-        fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir)
+        fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.dirs.scratch_dir)
        return f
    def testNormalOperation(self):
@ -516,26 +501,6 @@ class TestConsumer(TestCase):
        self.fail("Should throw exception")
    @override_settings(CONSUMPTION_DIR=None)
    def testConsumptionDirUnset(self):
        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "The CONSUMPTION_DIR settings variable does not appear to be set.")
            return
        self.fail("Should throw exception")
    @override_settings(CONSUMPTION_DIR="asd")
    def testNoConsumptionDir(self):
        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "Consumption directory asd does not exist")
            return
        self.fail("Should throw exception")
    def testDuplicates(self):
        self.consumer.try_consume_file(self.get_test_file())
--- a/src/documents/tests/test_logger.py
+++ b/src/documents/tests/test_logger.py
@ -2,7 +2,7 @@ import logging
 import uuid
 from unittest import mock
-from django.test import TestCase
+from django.test import TestCase, override_settings
 from ..models import Log
@ -14,6 +14,7 @@ class TestPaperlessLog(TestCase):
        self.logger = logging.getLogger(
            "documents.management.commands.document_consumer")
    @override_settings(DISABLE_DBHANDLER=False)
    def test_that_it_saves_at_all(self):
        kw = {"group": uuid.uuid4()}
@ -38,6 +39,7 @@ class TestPaperlessLog(TestCase):
            self.logger.critical("This is a critical message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 5)
    @override_settings(DISABLE_DBHANDLER=False)
    def test_groups(self):
        kw1 = {"group": uuid.uuid4()}
--- a/src/documents/tests/test_management_consumer.py
+++ b/src/documents/tests/test_management_consumer.py
@ -0,0 +1,210 @@
 import filecmp
 import os
 import shutil
 from threading import Thread
 from time import sleep
 from unittest import mock
 from django.conf import settings
 from django.core.management import call_command, CommandError
 from django.test import override_settings, TestCase
 from documents.consumer import ConsumerError
 from documents.management.commands import document_consumer
 from documents.tests.utils import DirectoriesMixin
 class ConsumerThread(Thread):
    """Thread that runs the document_consumer command until stop() is called."""
    def __init__(self):
        Thread.__init__(self)
        self.cmd = document_consumer.Command()
    def run(self) -> None:
        watched_dir = settings.CONSUMPTION_DIR
        self.cmd.handle(directory=watched_dir, oneshot=False)
    def stop(self):
        # Consumer checks this every second.
        self.cmd.stop_flag = True
 def chunked(size, source):
    """Yield consecutive slices of source, each at most size items long."""
    starts = range(0, len(source), size)
    yield from (source[begin:begin + size] for begin in starts)
 class TestConsumer(DirectoriesMixin, TestCase):
    """
    Tests for the document_consumer management command.

    The command is run in a ConsumerThread while async_task inside the
    command module is replaced by a mock, so each test only checks that a
    task is queued for the expected file. Every ``*_polling`` test repeats
    its sibling with CONSUMER_POLLING=1 (presumably switching the consumer
    from inotify to polling; confirm against the command).
    """
    # Reference PDF that is copied or slowly written into the consumption dir.
    sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
    def setUp(self) -> None:
        """Patch async_task for the duration of the test."""
        super(TestConsumer, self).setUp()
        # Consumer thread; only set by tests that call t_start().
        self.t = None
        patcher = mock.patch("documents.management.commands.document_consumer.async_task")
        self.task_mock = patcher.start()
        self.addCleanup(patcher.stop)
    def t_start(self):
        """Start the consumer thread and wait a second for it to initialize."""
        self.t = ConsumerThread()
        self.t.start()
        # give the consumer some time to do initial work
        sleep(1)
    def tearDown(self) -> None:
        """Stop the consumer thread, if one was started, before cleaning up."""
        if self.t:
            # set the stop flag
            self.t.stop()
            # wait for the consumer to exit.
            self.t.join()
        super(TestConsumer, self).tearDown()
    def wait_for_task_mock_call(self):
        """
        Block until async_task was called at least once.

        Polls every 0.1 seconds, at most 100 times, and fails the test if
        no call was seen.
        """
        n = 0
        while n < 100:
            if self.task_mock.call_count > 0:
                # give task_mock some time to finish and raise errors
                sleep(1)
                return
            n += 1
            sleep(0.1)
        self.fail("async_task was never called")
    # A bogus async_task that will simply check the file for
    # completeness and raise an exception otherwise.
    def bogus_task(self, func, filename, **kwargs):
        """Raise ConsumerError unless filename is byte-identical to sample_file."""
        eq = filecmp.cmp(filename, self.sample_file, shallow=False)
        if not eq:
            print("Consumed an INVALID file.")
            raise ConsumerError("Incomplete File READ FAILED")
        else:
            print("Consumed a perfectly valid file.")
    def slow_write_file(self, target, incomplete=False):
        """
        Write the sample PDF to target in 1000 byte chunks, 0.1s apart.

        With incomplete=True the last 100 bytes are left out.
        """
        with open(self.sample_file, 'rb') as f:
            pdf_bytes = f.read()
        if incomplete:
            pdf_bytes = pdf_bytes[:len(pdf_bytes) - 100]
        with open(target, 'wb') as f:
            # this will take 2 seconds, since the file is about 20k.
            print("Start writing file.")
            for b in chunked(1000, pdf_bytes):
                f.write(b)
                sleep(0.1)
            print("file completed.")
    def test_consume_file(self):
        """A file copied into the watched directory queues exactly one task."""
        self.t_start()
        f = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        shutil.copy(self.sample_file, f)
        self.wait_for_task_mock_call()
        self.task_mock.assert_called_once()
        args, kwargs = self.task_mock.call_args
        # Second positional argument of async_task is the consumed path.
        self.assertEqual(args[1], f)
    @override_settings(CONSUMER_POLLING=1)
    def test_consume_file_polling(self):
        self.test_consume_file()
    def test_consume_existing_file(self):
        """A file present before the consumer starts is picked up on startup."""
        f = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        shutil.copy(self.sample_file, f)
        self.t_start()
        self.task_mock.assert_called_once()
        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], f)
    @override_settings(CONSUMER_POLLING=1)
    def test_consume_existing_file_polling(self):
        self.test_consume_existing_file()
    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_pdf(self, error_logger):
        """A slowly written file is only consumed once it is complete."""
        # bogus_task raises if the consumer hands over a partial file.
        self.task_mock.side_effect = self.bogus_task
        self.t_start()
        fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        self.slow_write_file(fname)
        self.wait_for_task_mock_call()
        error_logger.assert_not_called()
        self.task_mock.assert_called_once()
        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], fname)
    @override_settings(CONSUMER_POLLING=1)
    def test_slow_write_pdf_polling(self):
        self.test_slow_write_pdf()
    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_and_move(self, error_logger):
        """A file written under a temporary name is consumed after its rename."""
        self.task_mock.side_effect = self.bogus_task
        self.t_start()
        fname = os.path.join(self.dirs.consumption_dir, "my_file.~df")
        fname2 = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        self.slow_write_file(fname)
        shutil.move(fname, fname2)
        self.wait_for_task_mock_call()
        self.task_mock.assert_called_once()
        args, kwargs = self.task_mock.call_args
        # Only the final name may be consumed, never the temporary one.
        self.assertEqual(args[1], fname2)
        error_logger.assert_not_called()
    @override_settings(CONSUMER_POLLING=1)
    def test_slow_write_and_move_polling(self):
        self.test_slow_write_and_move()
    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_incomplete(self, error_logger):
        """A truncated file is still handed over once and an error is logged."""
        self.task_mock.side_effect = self.bogus_task
        self.t_start()
        fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        self.slow_write_file(fname, incomplete=True)
        self.wait_for_task_mock_call()
        self.task_mock.assert_called_once()
        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], fname)
        # assert that we have an error logged with this invalid file.
        error_logger.assert_called_once()
    @override_settings(CONSUMER_POLLING=1)
    def test_slow_write_incomplete_polling(self):
        self.test_slow_write_incomplete()
    @override_settings(CONSUMPTION_DIR="does_not_exist")
    def test_consumption_directory_invalid(self):
        """The command fails with CommandError for a missing directory."""
        self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')
    @override_settings(CONSUMPTION_DIR="")
    def test_consumption_directory_unset(self):
        """The command fails with CommandError when no directory is configured."""
        self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')
--- a/src/documents/tests/test_management_decrypt.py
+++ b/src/documents/tests/test_management_decrypt.py
@ -0,0 +1,56 @@
 import hashlib
 import json
 import os
 import shutil
 import tempfile
 from unittest import mock
 from django.core.management import call_command
 from django.test import TestCase, override_settings
 from documents.management.commands import document_exporter
 from documents.models import Document, Tag, DocumentType, Correspondent
 class TestDecryptDocuments(TestCase):
    """Tests for the ``decrypt_documents`` management command."""

    @mock.patch("documents.management.commands.decrypt_documents.input")
    def test_decrypt(self, m):
        """Decrypt one GPG-stored sample document and verify the result.

        The encrypted sample original and thumbnail are copied into a
        temporary media directory, the command is run against it, and the
        document must afterwards be stored unencrypted, with both files
        present on disk and the original matching the recorded checksum.

        ``m`` is the patched ``input`` of the command module; it is not
        inspected here.
        NOTE(review): presumably it is patched so the command's confirmation
        prompt does not block the test -- confirm against the command.
        """
        samples_dir = os.path.join(os.path.dirname(__file__), "samples")

        media_dir = tempfile.mkdtemp()
        # The scratch media directory must not outlive the test.
        self.addCleanup(shutil.rmtree, media_dir, ignore_errors=True)

        originals_dir = os.path.join(media_dir, "documents", "originals")
        thumb_dir = os.path.join(media_dir, "documents", "thumbnails")
        os.makedirs(originals_dir, exist_ok=True)
        os.makedirs(thumb_dir, exist_ok=True)

        # Scope the override with a context manager so that it is reliably
        # undone when the block exits, instead of enabling it and never
        # disabling it.
        with override_settings(
            ORIGINALS_DIR=originals_dir,
            THUMBNAIL_DIR=thumb_dir,
            PASSPHRASE="test"
        ):
            shutil.copy(
                os.path.join(samples_dir, "originals", "0000002.pdf.gpg"),
                os.path.join(originals_dir, "0000002.pdf.gpg"))
            shutil.copy(
                os.path.join(samples_dir, "thumb", "0000002.png.gpg"),
                os.path.join(thumb_dir, "0000002.png.gpg"))

            Document.objects.create(
                checksum="9c9691e51741c1f4f41a20896af31770",
                title="wow",
                filename="0000002.pdf.gpg",
                id=2,
                mime_type="application/pdf",
                storage_type=Document.STORAGE_TYPE_GPG)

            call_command('decrypt_documents')

            doc = Document.objects.get(id=2)

            self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED)
            self.assertEqual(doc.filename, "0000002.pdf")
            self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000002.pdf")))
            self.assertTrue(os.path.isfile(doc.source_path))
            self.assertTrue(os.path.isfile(os.path.join(thumb_dir, "0000002.png")))
            self.assertTrue(os.path.isfile(doc.thumbnail_path))

            with doc.source_file as f:
                checksum = hashlib.md5(f.read()).hexdigest()
                self.assertEqual(checksum, doc.checksum)
--- a/src/documents/tests/test_management_exporter.py
+++ b/src/documents/tests/test_management_exporter.py
@ -0,0 +1,53 @@
 import hashlib
 import json
 import os
 import tempfile
 from django.core.management import call_command
 from django.test import TestCase, override_settings
 from documents.management.commands import document_exporter
 from documents.models import Document, Tag, DocumentType, Correspondent
 class TestExporter(TestCase):
    """Tests for the ``document_exporter`` management command."""

    @override_settings(
        ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
        THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
        PASSPHRASE="test"
    )
    def test_exporter(self):
        """Export two documents plus a tag, type and correspondent.

        The manifest must contain five entries; every exported document must
        have its file and thumbnail on disk, and the exported file must match
        the checksum stored in the manifest. A document whose source file
        does not exist must make a subsequent export raise
        ``FileNotFoundError``.
        """
        sample_pdf = os.path.join(os.path.dirname(__file__), "samples", "originals", "0000001.pdf")

        with open(sample_pdf, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()

        Document.objects.create(checksum=checksum, title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
        Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
        Tag.objects.create(name="t")
        DocumentType.objects.create(name="dt")
        Correspondent.objects.create(name="c")

        # Use a self-cleaning temporary directory so the exported files do
        # not pile up in the system temp location after the test.
        export_dir = tempfile.TemporaryDirectory()
        self.addCleanup(export_dir.cleanup)
        target = export_dir.name

        call_command('document_exporter', target)

        with open(os.path.join(target, "manifest.json")) as f:
            manifest = json.load(f)

        self.assertEqual(len(manifest), 5)

        for element in manifest:
            if element['model'] == 'documents.document':
                fname = os.path.join(target, element[document_exporter.EXPORTER_FILE_NAME])
                self.assertTrue(os.path.exists(fname))
                self.assertTrue(os.path.exists(os.path.join(target, element[document_exporter.EXPORTER_THUMBNAIL_NAME])))

                with open(fname, "rb") as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                self.assertEqual(checksum, element['fields']['checksum'])

        # This document points at a file that is not among the samples.
        Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf")

        self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target)
--- a/src/documents/tests/test_management_retagger.py
+++ b/src/documents/tests/test_management_retagger.py
@ -0,0 +1,58 @@
 from django.core.management import call_command
 from django.test import TestCase
 from documents.models import Document, Tag, Correspondent, DocumentType
 from documents.tests.utils import DirectoriesMixin
 class TestRetagger(DirectoriesMixin, TestCase):
    """Tests for the ``document_retagger`` management command.

    Three documents are created: one containing "first", one containing
    "second" and one containing neither word. For each of tag, correspondent
    and document type there is one object matching "first" and one matching
    "second", all using the respective ``MATCH_ANY`` algorithm.
    """

    def make_models(self):
        """Create the documents and the matching objects used by the tests."""
        self.d1 = Document.objects.create(
            checksum="A", title="A", content="first document")
        self.d2 = Document.objects.create(
            checksum="B", title="B", content="second document")
        self.d3 = Document.objects.create(
            checksum="C", title="C", content="unrelated document")

        self.tag_first = Tag.objects.create(
            name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY)
        self.tag_second = Tag.objects.create(
            name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY)

        self.correspondent_first = Correspondent.objects.create(
            name="c1", match="first", matching_algorithm=Correspondent.MATCH_ANY)
        self.correspondent_second = Correspondent.objects.create(
            name="c2", match="second", matching_algorithm=Correspondent.MATCH_ANY)

        self.doctype_first = DocumentType.objects.create(
            name="dt1", match="first", matching_algorithm=DocumentType.MATCH_ANY)
        self.doctype_second = DocumentType.objects.create(
            name="dt2", match="second", matching_algorithm=DocumentType.MATCH_ANY)

    def get_updated_docs(self):
        """Re-read documents "A", "B" and "C" from the database, in that order."""
        doc_a = Document.objects.get(title="A")
        doc_b = Document.objects.get(title="B")
        doc_c = Document.objects.get(title="C")
        return doc_a, doc_b, doc_c

    def setUp(self) -> None:
        super().setUp()
        self.make_models()

    def test_add_tags(self):
        """``--tags`` gives each matching document exactly its own tag."""
        call_command('document_retagger', '--tags')
        doc_first, doc_second, doc_unrelated = self.get_updated_docs()

        self.assertEqual(doc_first.tags.count(), 1)
        self.assertEqual(doc_second.tags.count(), 1)
        self.assertEqual(doc_unrelated.tags.count(), 0)

        self.assertEqual(doc_first.tags.first(), self.tag_first)
        self.assertEqual(doc_second.tags.first(), self.tag_second)

    def test_add_type(self):
        """``--document_type`` assigns the matching document types."""
        call_command('document_retagger', '--document_type')
        doc_first, doc_second, _doc_unrelated = self.get_updated_docs()

        self.assertEqual(doc_first.document_type, self.doctype_first)
        self.assertEqual(doc_second.document_type, self.doctype_second)

    def test_add_correspondent(self):
        """``--correspondent`` assigns the matching correspondents."""
        call_command('document_retagger', '--correspondent')
        doc_first, doc_second, _doc_unrelated = self.get_updated_docs()

        self.assertEqual(doc_first.correspondent, self.correspondent_first)
        self.assertEqual(doc_second.correspondent, self.correspondent_second)
--- a/src/documents/tests/test_matchables.py
+++ b/src/documents/tests/test_matchables.py
@ -1,3 +1,5 @@
 import shutil
 import tempfile
 from random import randint
 from django.contrib.admin.models import LogEntry
@ -215,6 +217,13 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
        self.doc_contains = Document.objects.create(
            content="I contain the keyword.", mime_type="application/pdf")
        self.index_dir = tempfile.mkdtemp()
        # TODO: we should not need the index here.
        override_settings(INDEX_DIR=self.index_dir).enable()
    def tearDown(self) -> None:
        """Delete the temporary index directory, ignoring any removal errors."""
        index_dir = self.index_dir
        shutil.rmtree(index_dir, ignore_errors=True)
    def test_tag_applied_any(self):
        t1 = Tag.objects.create(
            name="test", match="keyword", matching_algorithm=Tag.MATCH_ANY)
--- a/src/documents/tests/utils.py
+++ b/src/documents/tests/utils.py
@ -0,0 +1,59 @@
 import os
 import shutil
 import tempfile
 from collections import namedtuple
 from django.test import override_settings
 def setup_directories():
    """Create temporary working directories and point the settings at them.

    Creates fresh data, scratch, media and consumption directories (plus the
    index, originals and thumbnails sub-directories) and enables an
    ``override_settings`` that redirects the corresponding settings to them.

    Returns:
        An object carrying the created paths as attributes (``data_dir``,
        ``scratch_dir``, ``media_dir``, ``consumption_dir``, ``index_dir``,
        ``originals_dir``, ``thumbnail_dir``) and the active override as
        ``settings_override``. Pass it to ``remove_dirs`` to undo everything.
    """
    # namedtuple() returns a brand new class on every call; the attributes
    # below are set on that class object, which serves as an attribute bag.
    dirs = namedtuple("Dirs", ())

    dirs.data_dir = tempfile.mkdtemp()
    dirs.scratch_dir = tempfile.mkdtemp()
    dirs.media_dir = tempfile.mkdtemp()
    dirs.consumption_dir = tempfile.mkdtemp()
    dirs.index_dir = os.path.join(dirs.data_dir, "index")
    dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals")
    dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails")

    os.makedirs(dirs.index_dir, exist_ok=True)
    os.makedirs(dirs.originals_dir, exist_ok=True)
    os.makedirs(dirs.thumbnail_dir, exist_ok=True)

    # Keep a handle on the override so remove_dirs() can disable it again;
    # otherwise the settings would keep pointing at deleted directories.
    dirs.settings_override = override_settings(
        DATA_DIR=dirs.data_dir,
        SCRATCH_DIR=dirs.scratch_dir,
        MEDIA_ROOT=dirs.media_dir,
        ORIGINALS_DIR=dirs.originals_dir,
        THUMBNAIL_DIR=dirs.thumbnail_dir,
        CONSUMPTION_DIR=dirs.consumption_dir,
        INDEX_DIR=dirs.index_dir,
        MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle")
    )
    dirs.settings_override.enable()

    return dirs


 def remove_dirs(dirs):
    """Delete the directories from ``setup_directories`` and restore settings.

    Args:
        dirs: The object returned by ``setup_directories``. An object without
            a ``settings_override`` attribute is accepted as well; in that
            case only the directories are removed.
    """
    shutil.rmtree(dirs.media_dir, ignore_errors=True)
    shutil.rmtree(dirs.data_dir, ignore_errors=True)
    shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
    shutil.rmtree(dirs.consumption_dir, ignore_errors=True)

    settings_override = getattr(dirs, "settings_override", None)
    if settings_override is not None:
        settings_override.disable()
        # Disabling an override twice fails, so forget it once it is undone.
        dirs.settings_override = None
 class DirectoriesMixin:
    """Test-case mixin that provides temporary directories as ``self.dirs``.

    ``setUp`` stores the result of ``setup_directories()`` in ``self.dirs``
    before delegating to the next class; ``tearDown`` delegates first and
    then hands ``self.dirs`` to ``remove_dirs()``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Populated by setUp().
        self.dirs = None

    def setUp(self) -> None:
        self.dirs = setup_directories()
        super().setUp()

    def tearDown(self) -> None:
        super().tearDown()
        remove_dirs(self.dirs)
--- a/src/documents/views.py
+++ b/src/documents/views.py
@ -149,13 +149,25 @@ class DocumentViewSet(RetrieveModelMixin,
        else:
            return HttpResponseBadRequest(str(form.errors))
    @action(methods=['get'], detail=True)
    def metadata(self, request, pk=None):
        """Return checksum, mime type and filename of the document ``pk``.

        Raises ``Http404`` when no document with that primary key exists.
        """
        try:
            document = Document.objects.get(pk=pk)
        except Document.DoesNotExist:
            raise Http404()

        payload = {
            "paperless__checksum": document.checksum,
            "paperless__mime_type": document.mime_type,
            "paperless__filename": document.filename,
        }
        return Response(payload)
    @action(methods=['get'], detail=True)
    def preview(self, request, pk=None):
        try:
            response = self.file_response(pk, "inline")
            return response
-        except FileNotFoundError:
+        except (FileNotFoundError, Document.DoesNotExist):
-            raise Http404("Document source file does not exist")
+            raise Http404()
    @action(methods=['get'], detail=True)
    @cache_control(public=False, max_age=315360000)
@ -163,15 +175,15 @@ class DocumentViewSet(RetrieveModelMixin,
        try:
            return HttpResponse(Document.objects.get(id=pk).thumbnail_file,
                                content_type='image/png')
-        except FileNotFoundError:
+        except (FileNotFoundError, Document.DoesNotExist):
-            raise Http404("Document thumbnail does not exist")
+            raise Http404()
    @action(methods=['get'], detail=True)
    def download(self, request, pk=None):
        try:
            return self.file_response(pk, "attachment")
-        except FileNotFoundError:
+        except (FileNotFoundError, Document.DoesNotExist):
-            raise Http404("Document source file does not exist")
+            raise Http404()
 class LogViewSet(ReadOnlyModelViewSet):
@ -190,7 +202,9 @@ class SearchView(APIView):
    permission_classes = (IsAuthenticated,)
-    ix = index.open_index()
+    def __init__(self, *args, **kwargs):
        super(SearchView, self).__init__(*args, **kwargs)
        self.ix = index.open_index()
    def add_infos_to_hit(self, r):
        doc = Document.objects.get(id=r['id'])
@ -210,6 +224,9 @@ class SearchView(APIView):
            except (ValueError, TypeError):
                page = 1
            if page < 1:
                page = 1
            with index.query_page(self.ix, query, page) as result_page:
                return Response(
                    {'count': len(result_page),
@ -229,7 +246,9 @@ class SearchAutoCompleteView(APIView):
    permission_classes = (IsAuthenticated,)
-    ix = index.open_index()
+    def __init__(self, *args, **kwargs):
        super(SearchAutoCompleteView, self).__init__(*args, **kwargs)
        self.ix = index.open_index()
    def get(self, request, format=None):
        if 'term' in request.query_params:
--- a/src/paperless/auth.py
+++ b/src/paperless/auth.py
@ -1,8 +1,19 @@
 from django.conf import settings
 from django.contrib.auth.models import User
 from django.utils.deprecation import MiddlewareMixin
 from rest_framework import authentication
 class AutoLoginMiddleware(MiddlewareMixin):
    """Set ``request.user`` to the user named by ``AUTO_LOGIN_USERNAME``.

    If no user with that username exists, the request is left untouched.
    """

    def process_request(self, request):
        username = settings.AUTO_LOGIN_USERNAME
        try:
            auto_user = User.objects.get(username=username)
        except User.DoesNotExist:
            return None
        else:
            request.user = auto_user
 class AngularApiAuthenticationOverride(authentication.BaseAuthentication):
    """ This class is here to provide authentication to the angular dev server
        during development. This is disabled in production.
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@ -144,6 +144,15 @@ TEMPLATES = [
 # Security                                                                    #
 ###############################################################################
 # Username for automatic login, read from the PAPERLESS_AUTO_LOGIN_USERNAME
 # environment variable (None when the variable is not set).
 AUTO_LOGIN_USERNAME = os.getenv("PAPERLESS_AUTO_LOGIN_USERNAME")
 if AUTO_LOGIN_USERNAME:
    # Find Django's AuthenticationMiddleware so that the auto-login
    # middleware can be inserted directly after it.
    _index = MIDDLEWARE.index('django.contrib.auth.middleware.AuthenticationMiddleware')
    # This overrides everything the auth middleware is doing but still allows
    # regular login in case the provided user does not exist.
    MIDDLEWARE.insert(_index+1, 'paperless.auth.AutoLoginMiddleware')
 if DEBUG:
    X_FRAME_OPTIONS = ''
    # this should really be 'allow-from uri' but it's not supported in any major
@ -241,6 +250,8 @@ USE_TZ = True
 # Logging                                                                     #
 ###############################################################################
 DISABLE_DBHANDLER = __get_boolean("PAPERLESS_DISABLE_DBHANDLER")
 LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,
--- a/src/paperless/version.py
+++ b/src/paperless/version.py
@ -1 +1 @@
-__version__ = (0, 9, 2)
+__version__ = (0, 9, 3)
--- a/src/paperless_tesseract/init.py
+++ b/src/paperless_tesseract/init.py
@ -0,0 +1,2 @@
 # this is here so that django finds the checks.
 from .checks import *
--- a/src/paperless_tesseract/checks.py
+++ b/src/paperless_tesseract/checks.py
@ -0,0 +1,25 @@
 import subprocess
 from django.conf import settings
 from django.core.checks import Error, register
 def get_tesseract_langs():
    """Return the languages listed by ``tesseract --list-langs``.

    The first line of the command's standard output is skipped; every
    remaining non-empty line is returned with surrounding whitespace removed,
    so a carriage return left over from CRLF line endings cannot prevent a
    language from matching.

    Returns:
        A list of strings, one per listed language.
    """
    with subprocess.Popen(['tesseract', '--list-langs'],
                          stdout=subprocess.PIPE) as p:
        # stderr is not piped, so only stdout carries data here.
        stdout, _ = p.communicate()

    listed = stdout.decode().strip().split("\n")[1:]
    return [lang.strip() for lang in listed if lang.strip()]
@register()
 def check_default_language_available(app_configs, **kwargs):
    langs = get_tesseract_langs()
    if settings.OCR_LANGUAGE not in langs:
        return [Error(
            f"The default ocr language {settings.OCR_LANGUAGE} is "
            f"not installed. Paperless cannot OCR your documents "
            f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")]
    else:
        return []
--- a/src/setup.cfg
+++ b/src/setup.cfg
@ -3,10 +3,9 @@ exclude = migrations, paperless/settings.py, .tox, */tests/*
 [tool:pytest]
 DJANGO_SETTINGS_MODULE=paperless.settings
-addopts = --pythonwarnings=all
+addopts = --pythonwarnings=all --cov --cov-report=html -n auto
 env =
-  PAPERLESS_SECRET=paperless
+  PAPERLESS_DISABLE_DBHANDLER=true
  PAPERLESS_EMAIL_SECRET=paperless
 [coverage:run]
`@ -1 +1,2 @@`
	`from .checks import changed_password_check`	`# this is here so that django finds the checks.`
		`from .checks import *`
`@ -1 +1 @@`
	`__version__ = (0, 9, 2)`	`__version__ = (0, 9, 3)`
		`@ -0,0 +1,2 @@`
							`# this is here so that django finds the checks.`
							`from .checks import *`