Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-04-02 13:45:10 -05:00)

Commit 8395bdfdf6: Merge branch 'dev'
@@ -1,3 +1,4 @@
+/src-ui/.vscode
 /src-ui/node_modules
 /src-ui/dist
 .git
@@ -5,3 +6,7 @@
 /consume
 /media
 /data
+/docs
+.pytest_cache
+/dist
+/scripts
@@ -5,23 +5,18 @@ python:
 - "3.7"
 - "3.8"
 
-services:
-- docker
-
 before_install:
 - sudo apt-get update -qq
 - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr
 
 install:
 - pip install --upgrade pipenv
-- pipenv install --dev
+- pipenv install --system --dev
 
 script:
 - cd src/
 - pipenv run pytest --cov
 - pipenv run pycodestyle
-- cd ..
-- docker build --tag=jonaswinkler/paperless-ng .
 
 after_success:
 - pipenv run coveralls
Pipfile (1 line changed):
@@ -29,6 +29,7 @@ watchdog = "*"
 pathvalidate = "*"
 django-q = "*"
 redis = "*"
+imap-tools = "*"
 
 [dev-packages]
 coveralls = "*"
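The new imap-tools dependency powers the rewritten mail consumer documented further down. As a rough sketch of the library's shape (host, credentials and folder here are placeholders, not paperless configuration):

    # Minimal imap-tools sketch: fetch attachments from an inbox.
    from imap_tools import MailBox

    with MailBox("imap.example.com").login("user", "password", initial_folder="INBOX") as mailbox:
        for msg in mailbox.fetch():
            for att in msg.attachments:
                # att.filename / att.payload are the attachment name and raw bytes.
                with open(att.filename, "wb") as f:
                    f.write(att.payload)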
Pipfile.lock (generated; 10 lines changed):
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "c0dfeedbac2e9b705267336349e6f72ba650ff9184affae06046db32299e2c87"
+            "sha256": "d6416e6844126b09200b9839a3abdcf3c24ef5cf70052b8f134d8bc804552c17"
         },
         "pipfile-spec": 6,
         "requires": {},
@@ -123,6 +123,14 @@
             "index": "pypi",
             "version": "==20.0.4"
         },
+        "imap-tools": {
+            "hashes": [
+                "sha256:070929b8ec429c0aad94588a37a2962eed656a119ab61dcf91489f20fe983f5d",
+                "sha256:6232cd43748741496446871e889eb137351fc7a7e7f4c7888cd8c0fa28e20cda"
+            ],
+            "index": "pypi",
+            "version": "==0.31.0"
+        },
         "joblib": {
             "hashes": [
                 "sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72",
BIN docs/_static/paperless-11-mail-filters.png (vendored, new file, 70 KiB; binary file not shown)
BIN docs/_static/recommended_workflow.png (vendored, new file, 67 KiB; binary file not shown)
@@ -294,10 +294,14 @@ Documents can be stored in Paperless using GnuPG encryption.
 
 .. danger::
 
-    Decryption is deprecated since paperless-ng 1.0 and doesn't really provide any
+    Decryption is deprecated since paperless-ng 0.9 and doesn't really provide any
     additional security, since you have to store the passphrase in a configuration
-    file on the same system as the encrypted documents for paperless to work. Also,
-    paperless provides transparent access to your encrypted documents.
+    file on the same system as the encrypted documents for paperless to work.
+    Furthermore, the entire text content of the documents is stored plain in the
+    database, even if your documents are encrypted. Filenames are not encrypted
+    either.
+
+    Also, the web server provides transparent access to your encrypted documents.
 
     Consider running paperless on an encrypted filesystem instead, which will then
     at least provide security against physical hardware theft.
docs/api.rst (171 lines changed):
@@ -3,25 +3,168 @@
 The REST API
 ************
 
-.. warning::
-
-    This section is not updated to paperless-ng yet.
-
-Paperless makes use of the `Django REST Framework`_ standard API interface
-because of its inherent awesomeness. Conveniently, the system is also
-self-documenting, so to learn more about the access points, schema, what's
-accepted and what isn't, you need only visit ``/api`` on your local Paperless
-installation.
+Paperless makes use of the `Django REST Framework`_ standard API interface.
+It provides a browsable API for most of its endpoints, which you can inspect
+at ``http://<paperless-host>:<port>/api/``. This also documents most of the
+available filters and ordering fields.
 
 .. _Django REST Framework: http://django-rest-framework.org/
 
+The API provides 5 main endpoints:
+
+* ``/api/correspondents/``: Full CRUD support.
+* ``/api/document_types/``: Full CRUD support.
+* ``/api/documents/``: Full CRUD support, except POSTing new documents. See below.
+* ``/api/logs/``: Read-Only.
+* ``/api/tags/``: Full CRUD support.
+
+All of these endpoints except for the logging endpoint
+allow you to fetch, edit and delete individual objects
+by appending their primary key to the path, for example ``/api/documents/454/``.
+
+In addition to that, the document endpoint offers these additional actions on
+individual documents:
+
+* ``/api/documents/<pk>/download/``: Download the original document.
+* ``/api/documents/<pk>/thumb/``: Download the PNG thumbnail of a document.
+* ``/api/documents/<pk>/preview/``: Display the original document inline,
+  without downloading it.
+
+.. hint::
+
+    Paperless used to provide this functionality at ``/fetch/<pk>/preview``,
+    ``/fetch/<pk>/thumb`` and ``/fetch/<pk>/doc``. Redirects to the new URLs
+    are in place. However, if you use these old URLs to access documents, you
+    should update your app or script to use the new URLs.
+
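For instance, fetching a single document's metadata from Python could look like this (the host, credentials and the use of HTTP Basic auth are assumptions; use whatever authentication your installation is configured for):

    # Fetch one document's metadata from the documented endpoint.
    import requests

    resp = requests.get(
        "http://localhost:8000/api/documents/454/",
        auth=("user", "password"),  # assumption: basic auth is enabled
    )
    resp.raise_for_status()
    print(resp.json()["title"])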
+Searching for documents
+#######################
+
+Paperless-ng offers API endpoints for full text search. These are as follows:
+
+``/api/search/``
+================
+
+Get search results based on a query.
+
+Query parameters:
+
+* ``query``: The query string. See
+  `here <https://whoosh.readthedocs.io/en/latest/querylang.html>`_
+  for details on the syntax.
+* ``page``: Specify the page you want to retrieve. Each page
+  contains 10 search results and the first page is ``page=1``, which
+  is the default if this is omitted.
+
+Result list object returned by the endpoint:
+
+.. code:: json
+
+    {
+        "count": 1,
+        "page": 1,
+        "page_count": 1,
+        "results": [
+
+        ]
+    }
+
+* ``count``: The approximate total number of results.
+* ``page``: The page returned to you. This might be different from
+  the page you requested, if you requested a page that is behind
+  the last page. In that case, the last page is returned.
+* ``page_count``: The total number of pages.
+* ``results``: A list of result objects on the current page.
+
+Result object:
+
+.. code:: json
+
+    {
+        "id": 1,
+        "highlights": [
+
+        ],
+        "score": 6.34234,
+        "rank": 23,
+        "document": {
+
+        }
+    }
+
+* ``id``: the primary key of the found document
+* ``highlights``: an object containing parseable highlights for the result.
+  See below.
+* ``score``: The score assigned to the document. A higher score indicates a
+  better match with the query. Search results are sorted descending by score.
+* ``rank``: the position of the document within the entire search results list.
+* ``document``: The full json of the document, as returned by
+  ``/api/documents/<id>/``.
+
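Put together, paging through search results from Python might look like this (a sketch; host and credentials are placeholders, and basic auth is again an assumption):

    # Page through full text search results using the documented parameters.
    import requests

    page = 1
    while True:
        resp = requests.get(
            "http://localhost:8000/api/search/",
            params={"query": "invoice 2018", "page": page},
            auth=("user", "password"),  # assumption: basic auth
        )
        data = resp.json()
        for result in data["results"]:
            # rank, score and id are the documented result object fields.
            print(result["rank"], result["score"], result["id"])
        if page >= data["page_count"]:
            break
        page += 1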
+Highlights object:
+
+Highlights are provided as a list of fragments. A fragment is a longer section of
+text from the original document.
+Each fragment contains a list of strings, and some of them are marked as a highlight.
+
+.. code:: json
+
+    "highlights": [
+        [
+            {"text": "This is a sample text with a "},
+            {"text": "highlighted", "term": 0},
+            {"text": " word."}
+        ],
+        [
+            {"text": "Another", "term": 1},
+            {"text": " fragment with a highlight."}
+        ]
+    ]
+
+When ``term`` is present within a string, the word within ``text`` should be highlighted.
+The term index groups multiple matches together and words with the same index
+should get identical highlighting.
+A client may use this example to produce the following output:
+
+... This is a sample text with a **highlighted** word. ... **Another** fragment with a highlight. ...
+
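A client-side helper that turns this structure into the output shown above could be as simple as the following sketch (the surrounding "..." padding is cosmetic):

    # Render a highlights object to the markdown-style output shown above.
    def render_highlights(highlights):
        fragments = []
        for fragment in highlights:
            parts = []
            for chunk in fragment:
                if "term" in chunk:
                    # Strings carrying a term index are the highlighted words.
                    parts.append("**{}**".format(chunk["text"]))
                else:
                    parts.append(chunk["text"])
            fragments.append("".join(parts))
        return " ... ".join(fragments)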
+``/api/search/autocomplete/``
+=============================
+
+Get auto completions for a partial search term.
+
+Query parameters:
+
+* ``term``: The incomplete term.
+* ``limit``: Amount of results. Defaults to 10.
+
+Results returned by the endpoint are ordered by importance of the term in the
+document index. The first result is the term that has the highest Tf/Idf score
+in the index.
+
+.. code:: json
+
+    [
+        "term1",
+        "term3",
+        "term6",
+        "term4"
+    ]
+
+
 .. _api-file_uploads:
 
-POSTing Documents
-=================
+POSTing documents
+#################
 
-File uploads in an API are hard and so far as I've been able to tell, there's
-no standard way of accepting them, so rather than crowbar file uploads into the
-REST API and endure that headache, I've left that process to a simple HTTP
-POST.
+The API provides a special endpoint for file uploads:
+
+``/api/documents/post_document/``
+
+POST a multipart form to this endpoint, where the form field ``document`` contains
+the document that you want to upload to paperless. The filename is sanitized and
+then used to store the document in the consumption folder, where the consumer will
+detect the document and process it as any other document.
+
+The endpoint will immediately return "OK." if the document was stored in the
+consumption directory.
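An upload against this endpoint, using the documented ``document`` form field, might look like this from Python (host, credentials and basic auth are assumptions):

    # Upload a document via the new multipart endpoint.
    import requests

    with open("/path/to/scan.pdf", "rb") as f:
        resp = requests.post(
            "http://localhost:8000/api/documents/post_document/",
            files={"document": f},  # the documented multipart form field
            auth=("user", "password"),  # assumption: basic auth
        )
    print(resp.text)  # "OK." once the file is stored in the consumption directory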
@@ -8,10 +8,8 @@ Changelog
 paperless-ng 0.9.0
 ##################
 
-* **Deprecated:** GnuPG. Don't use it. If you're still using it, be aware that it
-  offers no protection at all, since the passphrase is stored alongside with the
-  encrypted documents itself. This feature will most likely be removed in future
-  versions.
+* **Deprecated:** GnuPG. :ref:`See this note on the state of GnuPG in paperless-ng. <utilities-encyption>`
+  This feature will most likely be removed in future versions.
 
 * **Added:** New frontend. Features:
 
@@ -38,6 +36,25 @@ paperless-ng 0.9.0
   multi user solution, however, it allows more than one user to access the website
   and set some basic permissions / renew passwords.
 
+* **Modified [breaking]:** All new mail consumer with customizable filters, actions and
+  multiple account support. Replaces the old mail consumer. The new mail consumer
+  needs different configuration but can be configured to act exactly like the old
+  consumer.
+
+* **Modified:** Changes to the consumer:
+
+  * Now uses the excellent watchdog library that should make sure files are
+    discovered no matter what the platform is.
+  * The consumer now uses a task scheduler to run consumption processes in parallel.
+    This means that consuming many documents should be much faster on systems with
+    many cores.
+  * Concurrency is controlled with the new settings ``PAPERLESS_TASK_WORKERS``
+    and ``PAPERLESS_THREADS_PER_WORKER``. See TODO for details on concurrency.
+  * The consumer no longer blocks the database for extended periods of time.
+  * An issue with tesseract running multiple threads per page and slowing down
+    the consumer was fixed.
+
 * **Modified [breaking]:** REST Api changes:
 
   * New filters added, other filters removed (case sensitive filters, slug filters)
@@ -64,8 +81,8 @@ paperless-ng 0.9.0
 * Rework of the code of the tesseract parser. This is now a lot cleaner.
 * Rework of the filename handling code. It was a mess.
 * Fixed some issues with the document exporter not exporting all documents when encountering duplicate filenames.
-* Consumer rework: now uses the excellent watchdog library, lots of code removed.
-* Added a task scheduler that takes care of checking mail, training the classifier and maintaining the document search index.
+* Added a task scheduler that takes care of checking mail, training the classifier, maintaining the document search index
+  and consuming documents.
 * Updated dependencies. Now uses Pipenv all around.
 * Updated Dockerfile and docker-compose. Now uses ``supervisord`` to run everything paperless-related in a single container.
 
@@ -77,6 +94,8 @@ paperless-ng 0.9.0
 * ``PAPERLESS_DEBUG`` defaults to ``false``.
 * The presence of ``PAPERLESS_DBHOST`` now determines whether to use PostgreSQL or
   sqlite.
+* ``PAPERLESS_OCR_THREADS`` is gone and replaced with ``PAPERLESS_TASK_WORKERS`` and
+  ``PAPERLESS_THREADS_PER_WORKER``. Refer to the config example for details.
+
 * Many more small changes here and there. The usual stuff.
 
@@ -20,7 +20,3 @@ places.
 
 Copy ``paperless.conf.example`` to any of these locations and adjust it to your
 needs.
 
-.. warning::
-
-    TBD: explain config options.
@@ -36,6 +36,10 @@ The old admin is still there and accessible!
 
 .. image:: _static/paperless-9-admin.png
 
+Fancy mail filters!
+
+.. image:: _static/paperless-11-mail-filters.png
+
 Mobile support in the future? This doesn't really work yet.
 
 .. image:: _static/paperless-10-mobile.png
@@ -23,6 +23,77 @@ There are multiple options available.
   that need to be compiled, and that's already done for you in the release.
 
 
+Overview of Paperless-ng
+########################
+
+Compared to paperless, paperless-ng works a little differently under the hood and has
+more moving parts that work together. While this increases the complexity of
+the system, it also brings many benefits.
+
+Paperless consists of the following components:
+
+* **The webserver:** This is pretty much the same as in paperless. It serves
+  the administration pages, the API, and the new frontend. This is the main
+  tool you'll be using to interact with paperless. You may start the webserver
+  with
+
+  .. code:: shell-session
+
+    $ cd /path/to/paperless/src/
+    $ pipenv run gunicorn -c /usr/src/paperless/gunicorn.conf.py -b 0.0.0.0:8000 paperless.wsgi
+
+  or by any other means such as Apache ``mod_wsgi``.
+
+* **The consumer:** This is what watches your consumption folder for documents.
+  However, the consumer itself does not really consume your documents anymore.
+  It rather notifies a task processor that a new file is ready for consumption.
+  I suppose it should be named differently.
+  This also used to check your emails, but that's now gone elsewhere as well.
+
+  Start the consumer with the management command ``document_consumer``:
+
+  .. code:: shell-session
+
+    $ cd /path/to/paperless/src/
+    $ pipenv run python3 manage.py document_consumer
+
+* **The task processor:** Paperless relies on `Django Q <https://django-q.readthedocs.io/en/latest/>`_
+  for doing much of the heavy lifting. This is a task queue that accepts tasks from
+  multiple sources and processes tasks in parallel. It also comes with a scheduler that executes
+  certain commands periodically.
+
+  This task processor is responsible for:
+
+  * Consuming documents. When the consumer finds new documents, it notifies the task processor to
+    start a consumption task.
+  * Consuming emails. It periodically checks your configured accounts for new mails and
+    produces consumption tasks for any documents it finds.
+  * The task processor also performs the consumption of any documents you upload through
+    the web interface.
+  * Maintaining the search index and the automatic matching algorithm. These are things that paperless
+    needs to do from time to time in order to operate properly.
+
+  This allows paperless to process multiple documents from your consumption folder in parallel! On
+  a modern multicore system, consumption with full ocr is blazing fast.
+
+  The task processor comes with a built-in admin interface that you can use to see whenever any of the
+  tasks fail and inspect the errors.
+
+  You may start the task processor by executing:
+
+  .. code:: shell-session
+
+    $ cd /path/to/paperless/src/
+    $ pipenv run python3 manage.py qcluster
+
+* A `redis <https://redis.io/>`_ message broker: This is a really lightweight service that is responsible
+  for getting the tasks from the webserver and consumer to the task scheduler. These run in different
+  processes (maybe even on different machines!), and therefore, this is necessary.
+
+* A database server. Paperless supports PostgreSQL and sqlite for storing its data. However, with the
+  added concurrency, it is strongly advised to use PostgreSQL, as sqlite has its limits in that regard.
+
+
 Installation
 ############
 
@@ -31,10 +102,12 @@ You can go multiple routes with setting up and running Paperless:
 * The `docker route`_
 * The `bare metal route`_
 
-The `docker route`_ is quick & easy. This is the recommended route.
+The `docker route`_ is quick & easy. This is the recommended route. This configures all the stuff
+from above automatically so that it just works and uses sensible defaults for all configuration options.
 
 The `bare metal route`_ is more complicated to set up but makes it easier
-should you want to contribute some code back.
+should you want to contribute some code back. You need to configure and
+run the above mentioned components yourself.
 
 Docker Route
 ============
@@ -2,9 +2,38 @@
 Troubleshooting
 ***************
 
-.. warning::
-
-    This section is not updated to paperless-ng yet.
+No files are added by the consumer
+##################################
+
+Check for the following issues:
+
+* Ensure that the directory you're putting your documents in is the folder
+  paperless is watching. With docker, this setting is performed in the
+  ``docker-compose.yml`` file. Without docker, look at the ``CONSUMPTION_DIR``
+  setting. Don't adjust this setting if you're using docker.
+* Ensure that redis is up and running. Paperless does its task processing
+  asynchronously, and for documents to arrive at the task processor, it needs
+  redis to run.
+* Ensure that the task processor is running. Docker does this automatically.
+  Manually invoke the task processor by executing
+
+  .. code:: shell-session
+
+    $ python3 manage.py qcluster
+
+* Look at the output of paperless and inspect it for any errors.
+* Go to the admin interface, and check if there are failed tasks. If so, the
+  tasks will contain an error message.
+
+
+Consumer fails to pick up any new files
+#######################################
+
+If you notice that the consumer only picks up files in the consumption
+directory at startup, but won't find any other files added later, check out
+the configuration file and enable filesystem polling with the setting
+``PAPERLESS_CONSUMER_POLLING``.
 
 Consumer warns ``OCR for XX failed``
 ####################################
@@ -27,7 +27,7 @@ Each document has a couple of fields that you can assign to them:
   a document either originates from, or is sent to.
 * A *tag* is a label that you can assign to documents. Think of labels as more
   powerful folders: Multiple documents can be grouped together with a single
   tag, however, a single document can also have multiple tags. This is not
   possible with folders. The reason folders are not implemented in paperless
   is simply that tags are much more versatile than folders.
 * A *document type* is used to demarcate the type of a document such as letter,
@@ -86,49 +86,63 @@ files from the scanner. Typically, you're looking at an FTP server like
 IMAP (Email)
 ============
 
-Another handy way to get documents into your database is to email them to
-yourself. The typical use-case would be to be out for lunch and want to send a
-copy of the receipt back to your system at home. Paperless can be taught to
-pull emails down from an arbitrary account and dump them into the consumption
-directory where the consumer will follow the
-usual pattern on consuming the document.
-
-.. hint::
-
-    It's disabled by default. By setting the values below it will be enabled.
-    It's been tested in a limited environment, so it may not work for you (please
-    submit a pull request if you can!)
-
-.. danger::
-
-    It's designed to **delete mail from the server once consumed**. So don't go
-    pointing this to your personal email account and wonder where all your stuff
-    went.
-
-.. hint::
-
-    Currently, only one photo (attachment) per email will work.
-
-So, with all that in mind, here's what you do to get it running:
-
-1. Setup a new email account somewhere, or if you're feeling daring, create a
-   folder in an existing email box and note the path to that folder.
-2. In ``/etc/paperless.conf`` set all of the appropriate values in
-   ``PATHS AND FOLDERS`` and ``SECURITY``.
-   If you decided to use a subfolder of an existing account, then make sure you
-   set ``PAPERLESS_CONSUME_MAIL_INBOX`` accordingly here. You also have to set
-   the ``PAPERLESS_EMAIL_SECRET`` to something you can remember 'cause you'll
-   have to include that in every email you send.
-3. Restart paperless. Paperless will check
-   the configured email account at startup and from then on every 10 minutes
-   for something new and pulls down whatever it finds.
-4. Send yourself an email! Note that the subject is treated as the file name,
-   so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll
-   get what you expect. Also, you must include the aforementioned secret
-   string in every email so the fetcher knows that it's safe to import.
-   Note that Paperless only allows the email title to consist of safe characters
-   to be imported. These consist of alpha-numeric characters and ``-_ ,.'``.
+You can tell paperless-ng to consume documents from your email accounts.
+This is a very flexible and powerful feature, if you regularly receive documents
+via mail that you need to archive. The mail consumer can be configured by using the
+admin interface in the following manner:
+
+1. Define e-mail accounts.
+2. Define mail rules for your account.
+
+These rules perform the following:
+
+1. Connect to the mail server.
+2. Fetch all matching mails (as defined by folder, maximum age and the filters)
+3. Check if there are any consumable attachments.
+4. If so, instruct paperless to consume the attachments and optionally
+   use the metadata provided in the rule for the new document.
+5. If documents were consumed from a mail, the rule action is performed
+   on that mail.
+
+Paperless will completely ignore mails that do not match your filters. It will also
+only perform the action on mails that it has consumed documents from.
+
+The actions all ensure that the same mail is not consumed twice by different means.
+These are as follows:
+
+* **Delete:** Immediately deletes mail that paperless has consumed documents from.
+  Use with caution.
+* **Mark as read:** Mark consumed mail as read. Paperless will not consume documents
+  from already read mails. If you read a mail before paperless sees it, it will be
+  ignored.
+* **Flag:** Sets the 'important' flag on mails with consumed documents. Paperless
+  will not consume flagged mails.
+* **Move to folder:** Moves consumed mails out of the way so that paperless won't
+  consume them again.
+
+.. caution::
+
+    The mail consumer will perform these actions on all mails it has consumed
+    documents from. Keep in mind that the actual consumption process may fail
+    for some reason, leaving you with missing documents in paperless.
+
+.. note::
+
+    With the correct set of rules, you can completely automate your email documents.
+    Create rules for every correspondent you receive digital documents from and
+    paperless will read them automatically. The default action "mark as read" is
+    pretty tame and will not cause any damage or data loss whatsoever.
+
+.. note::
+
+    Paperless will process the rules in the order defined in the admin page.
+
+    You can define catch-all rules and have them executed last to consume
+    any documents not matched by previous rules. Such a rule may assign an "Unknown
+    mail document" tag to consumed documents so you can inspect them further.
+
+Paperless is set up to check your mails every 10 minutes. This can be configured on the
+'Scheduled tasks' page in the admin.
 
 
 REST API
@@ -136,6 +150,7 @@ REST API
 
 You can also submit a document using the REST API, see :ref:`api-file_uploads` for details.
 
+
 .. _usage-recommended_workflow:
 
 The recommended workflow
@@ -147,6 +162,10 @@ is as follows. This workflow also takes into account that some documents
 have to be kept in physical form, but still ensures that you get all the
 advantages for these documents as well.
 
+The following diagram shows how easy it is to manage your documents.
+
+.. image:: _static/recommended_workflow.png
+
 Preparations in paperless
 =========================
 
@@ -156,7 +175,7 @@ Preparations in paperless
 Processing of the physical documents
 ====================================
 
 Keep a physical inbox. Whenever you receive a document that you need to
 archive, put it into your inbox. Regularly, do the following for all documents
 in your inbox:
 
@@ -59,22 +59,6 @@ PAPERLESS_CONSUMPTION_DIR="../consume"
 #PAPERLESS_STATIC_URL="/static/"
 
 
-# These values are required if you want paperless to check a particular email
-# box every 10 minutes and attempt to consume documents from there. If you
-# don't define a HOST, mail checking will just be disabled.
-#PAPERLESS_CONSUME_MAIL_HOST=""
-#PAPERLESS_CONSUME_MAIL_PORT=""
-#PAPERLESS_CONSUME_MAIL_USER=""
-#PAPERLESS_CONSUME_MAIL_PASS=""
-
-# Override the default IMAP inbox here. If not set Paperless defaults to
-# "INBOX".
-#PAPERLESS_CONSUME_MAIL_INBOX="INBOX"
-
-# Any email sent to the target account that does not contain this text will be
-# ignored.
-PAPERLESS_EMAIL_SECRET=""
-
 # Specify a filename format for the document (directories are supported)
 # Use the following placeholders:
 # * {correspondent}
@@ -143,6 +127,35 @@ PAPERLESS_EMAIL_SECRET=""
 #### Software Tweaks ####
 ###############################################################################
 
+# Paperless does multiple things in the background: Maintain the search index,
+# maintain the automatic matching algorithm, check emails, consume documents,
+# etc. This variable specifies how many things it will do in parallel.
+#PAPERLESS_TASK_WORKERS=1
+
+# Furthermore, paperless uses multiple threads when consuming documents to
+# speed up OCR. This variable specifies how many pages paperless will process
+# in parallel on a single document.
+#PAPERLESS_THREADS_PER_WORKER=1
+
+# Ensure that the product
+#   PAPERLESS_TASK_WORKERS * PAPERLESS_THREADS_PER_WORKER
+# does not exceed your CPU core count or else paperless will be extremely slow.
+# If you want paperless to process many documents in parallel, choose a high
+# worker count. If you want paperless to process very large documents faster,
+# use a higher thread per worker count.
+# The default is a balance between the two, according to your CPU core count,
+# with a slight favor towards threads per worker, and using as many cores as
+# possible.
+# If you only specify PAPERLESS_TASK_WORKERS, paperless will adjust
+# PAPERLESS_THREADS_PER_WORKER automatically.
+
+# If paperless won't find documents added to your consume folder, it might
+# not be able to automatically detect filesystem changes. In that case,
+# specify a polling interval in seconds below, which will then cause paperless
+# to periodically check your consumption directory for changes.
+#PAPERLESS_CONSUMER_POLLING=10
+
+
 # When the consumer detects a duplicate document, it will not touch the
 # original document. This default behavior can be changed here.
 #PAPERLESS_CONSUMER_DELETE_DUPLICATES="false"
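To illustrate the constraint these comments describe, here is a rough sketch of picking a worker/thread split on a given machine. It only loosely mirrors the described default (the actual logic lives in paperless' settings and is not part of this diff):

    # Rough sketch: keep workers * threads_per_worker within the core count,
    # slightly favoring threads per worker, as the comments above describe.
    import math
    import multiprocessing

    cores = multiprocessing.cpu_count()
    workers = max(1, math.floor(math.sqrt(cores)))
    threads_per_worker = max(1, cores // workers)
    assert workers * threads_per_worker <= cores
    print(workers, threads_per_worker)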
@@ -186,12 +199,6 @@ PAPERLESS_EMAIL_SECRET=""
 #
 
 
-# By default, Paperless will attempt to use all available CPU cores to process
-# a document, but if you would like to limit that, you can set this value to
-# an integer:
-#PAPERLESS_OCR_THREADS=1
-
-
 # Customize the default language that tesseract will attempt to use when
 # parsing documents. The default language is used whenever
 # - No language could be detected on a document
@@ -2,6 +2,15 @@
 
 set -e
 
+
+VERSION=$1
+
+if [ -z "$VERSION" ]
+then
+    echo "Need a version string."
+    exit 1
+fi
+
 # source root directory of paperless
 PAPERLESS_ROOT=$(git rev-parse --show-toplevel)
 
@@ -42,6 +51,7 @@ mkdir "$PAPERLESS_DIST_APP/docker"
 # the application itself
 
 cp "$PAPERLESS_ROOT/.env" \
+    "$PAPERLESS_ROOT/.dockerignore" \
     "$PAPERLESS_ROOT/CONTRIBUTING.md" \
     "$PAPERLESS_ROOT/LICENSE" \
     "$PAPERLESS_ROOT/Pipfile" \
@@ -80,10 +90,12 @@ cp "$PAPERLESS_ROOT/docker/supervisord.conf" "$PAPERLESS_DIST_APP/docker/"
 
 cd "$PAPERLESS_DIST_APP"
 
-docker-compose build
+docker build . -t "jonaswinkler/paperless-ng:$VERSION"
+
+docker push "jonaswinkler/paperless-ng:$VERSION"
 
 # works. package the app!
 
 cd "$PAPERLESS_DIST"
 
-tar -cJf paperless-ng.tar.xz paperless-ng/
+tar -cJf "paperless-ng-$VERSION.tar.xz" paperless-ng/
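With this guard in place, the release script must now be invoked with a version argument, e.g. "./release.sh 0.9.0" (the script name here is illustrative; it is not shown in this diff). The same version string then tags the pushed Docker image and names the tarball, so the two artifacts of one release can no longer drift apart.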
@@ -12,7 +12,7 @@ from django.utils import timezone
 from paperless.db import GnuPG
 from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
 from .file_handling import generate_filename, create_source_path_directory
-from .models import Document, FileInfo
+from .models import Document, FileInfo, Correspondent, DocumentType, Tag
 from .parsers import ParseError, get_parser_class
 from .signals import (
     document_consumption_finished,
@@ -25,139 +25,204 @@ class ConsumerError(Exception):
 
 
 class Consumer:
-    """
-    Loop over every file found in CONSUMPTION_DIR and:
-    1. Convert it to a greyscale pnm
-    2. Use tesseract on the pnm
-    3. Store the document in the MEDIA_ROOT with optional encryption
-    4. Store the OCR'd text in the database
-    5. Delete the document and image(s)
-    """
 
-    def __init__(self, consume=settings.CONSUMPTION_DIR,
-                 scratch=settings.SCRATCH_DIR):
+    def __init__(self):
 
         self.logger = logging.getLogger(__name__)
         self.logging_group = None
+        self.path = None
+        self.filename = None
+        self.override_title = None
+        self.override_correspondent_id = None
+        self.override_tag_ids = None
+        self.override_document_type_id = None
 
-        self.consume = consume
-        self.scratch = scratch
-
-        self.classifier = DocumentClassifier()
-
-        os.makedirs(self.scratch, exist_ok=True)
-
-        self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
-        if settings.PASSPHRASE:
-            self.storage_type = Document.STORAGE_TYPE_GPG
-
-        if not self.consume:
+    def pre_check_file_exists(self):
+        if not os.path.isfile(self.path):
+            raise ConsumerError("Cannot consume {}: It is not a file".format(
+                self.path))
+
+    def pre_check_consumption_dir(self):
+        if not settings.CONSUMPTION_DIR:
             raise ConsumerError(
                 "The CONSUMPTION_DIR settings variable does not appear to be "
-                "set."
-            )
+                "set.")
 
-        if not os.path.exists(self.consume):
+        if not os.path.isdir(settings.CONSUMPTION_DIR):
             raise ConsumerError(
-                "Consumption directory {} does not exist".format(self.consume))
+                "Consumption directory {} does not exist".format(
+                    settings.CONSUMPTION_DIR))
+
+    def pre_check_regex(self):
+        if not re.match(FileInfo.REGEXES["title"], self.filename):
+            raise ConsumerError(
+                "Filename {} does not seem to be safe to "
+                "consume".format(self.filename))
+
+    def pre_check_duplicate(self):
+        with open(self.path, "rb") as f:
+            checksum = hashlib.md5(f.read()).hexdigest()
+        if Document.objects.filter(checksum=checksum).exists():
+            if settings.CONSUMER_DELETE_DUPLICATES:
+                os.unlink(self.path)
+            raise ConsumerError(
+                "Not consuming {}: It is a duplicate.".format(self.filename)
+            )
+
+    def pre_check_directories(self):
+        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
+        os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
+        os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
 
     def log(self, level, message):
         getattr(self.logger, level)(message, extra={
             "group": self.logging_group
         })
 
-    @transaction.atomic
-    def try_consume_file(self, file):
+    def try_consume_file(self,
+                         path,
+                         override_filename=None,
+                         override_title=None,
+                         override_correspondent_id=None,
+                         override_document_type_id=None,
+                         override_tag_ids=None):
         """
-        Return True if file was consumed
+        Return the document object if it was successfully created.
         """
 
+        self.path = path
+        self.filename = override_filename or os.path.basename(path)
+        self.override_title = override_title
+        self.override_correspondent_id = override_correspondent_id
+        self.override_document_type_id = override_document_type_id
+        self.override_tag_ids = override_tag_ids
+
+        # this is for grouping logging entries for this particular file
+        # together.
         self.logging_group = uuid.uuid4()
 
-        if not re.match(FileInfo.REGEXES["title"], file):
-            return False
-
-        doc = file
-
-        if self._is_duplicate(doc):
-            self.log(
-                "warning",
-                "Skipping {} as it appears to be a duplicate".format(doc)
-            )
-            if settings.CONSUMER_DELETE_DUPLICATES:
-                self._cleanup_doc(doc)
-            return False
-
-        self.log("info", "Consuming {}".format(doc))
-
-        parser_class = get_parser_class(doc)
+        # Make sure that preconditions for consuming the file are met.
+        self.pre_check_file_exists()
+        self.pre_check_consumption_dir()
+        self.pre_check_directories()
+        self.pre_check_regex()
+        self.pre_check_duplicate()
+
+        self.log("info", "Consuming {}".format(self.filename))
+
+        # Determine the parser class.
+        parser_class = get_parser_class(self.filename)
         if not parser_class:
-            self.log(
-                "error", "No parsers could be found for {}".format(doc))
-            return False
+            raise ConsumerError("No parsers available for {}".format(self.filename))
         else:
-            self.log("info", "Parser: {}".format(parser_class.__name__))
+            self.log("debug", "Parser: {}".format(parser_class.__name__))
 
+        # Notify all listeners that we're going to do some work.
         document_consumption_started.send(
             sender=self.__class__,
-            filename=doc,
+            filename=self.path,
             logging_group=self.logging_group
         )
 
-        document_parser = parser_class(doc, self.logging_group)
+        # This doesn't parse the document yet, but gives us a parser.
+        document_parser = parser_class(self.path, self.logging_group)
 
+        # However, this already created working directories which we have to
+        # clean up.
+
+        # Parse the document. This may take some time.
         try:
-            self.log("info", "Generating thumbnail for {}...".format(doc))
+            self.log("debug", "Generating thumbnail for {}...".format(self.filename))
             thumbnail = document_parser.get_optimised_thumbnail()
+            self.log("debug", "Parsing {}...".format(self.filename))
             text = document_parser.get_text()
             date = document_parser.get_date()
-            document = self._store(
-                text,
-                doc,
-                thumbnail,
-                date
-            )
         except ParseError as e:
-            self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e))
             document_parser.cleanup()
-            return False
-        else:
-            document_parser.cleanup()
-            self._cleanup_doc(doc)
-
-            self.log(
-                "info",
-                "Document {} consumption finished".format(document)
-            )
-
-            try:
-                self.classifier.reload()
-                classifier = self.classifier
-            except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
-                logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
-                classifier = None
-
-            document_consumption_finished.send(
-                sender=self.__class__,
-                document=document,
-                logging_group=self.logging_group,
-                classifier=classifier
-            )
-            return True
-
-    def _store(self, text, doc, thumbnail, date):
-
-        file_info = FileInfo.from_path(doc)
-
-        stats = os.stat(doc)
+            raise ConsumerError(e)
+
+        # Prepare the document classifier.
+
+        # TODO: I don't really like to do this here, but this way we avoid
+        # reloading the classifier multiple times, since there are multiple
+        # post-consume hooks that all require the classifier.
+        try:
+            classifier = DocumentClassifier()
+            classifier.reload()
+        except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
+            logging.getLogger(__name__).warning(
+                "Cannot classify documents: {}.".format(e))
+            classifier = None
+
+        # now that everything is done, we can start to store the document
+        # in the system. This will be a transaction and reasonably fast.
+        try:
+            with transaction.atomic():
+
+                # store the document.
+                document = self._store(
+                    text=text,
+                    date=date
+                )
+
+                # If we get here, it was successful. Proceed with post-consume
+                # hooks. If they fail, nothing will get changed.
+                document_consumption_finished.send(
+                    sender=self.__class__,
+                    document=document,
+                    logging_group=self.logging_group,
+                    classifier=classifier
+                )
+
+                # After everything is in the database, copy the files into
+                # place. If this fails, we'll also rollback the transaction.
+                create_source_path_directory(document.source_path)
+                self._write(document, self.path, document.source_path)
+                self._write(document, thumbnail, document.thumbnail_path)
+
+                # Delete the file only if it was successfully consumed
+                self.log("debug", "Deleting file {}".format(self.path))
+                os.unlink(self.path)
+        except Exception as e:
+            raise ConsumerError(e)
+        finally:
+            document_parser.cleanup()
+
+        self.log(
+            "info",
+            "Document {} consumption finished".format(document)
+        )
+
+        return document
+
+    def _store(self, text, date):
+
+        # If someone gave us the original filename, use it instead of doc.
+        file_info = FileInfo.from_path(self.filename)
+
+        stats = os.stat(self.path)
 
         self.log("debug", "Saving record to database")
 
         created = file_info.created or date or timezone.make_aware(
             datetime.datetime.fromtimestamp(stats.st_mtime))
 
-        with open(doc, "rb") as f:
+        if settings.PASSPHRASE:
+            storage_type = Document.STORAGE_TYPE_GPG
+        else:
+            storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+
+        with open(self.path, "rb") as f:
             document = Document.objects.create(
                 correspondent=file_info.correspondent,
                 title=file_info.title,
@@ -166,7 +231,7 @@ class Consumer:
                 checksum=hashlib.md5(f.read()).hexdigest(),
                 created=created,
                 modified=created,
-                storage_type=self.storage_type
+                storage_type=storage_type
             )
 
         relevant_tags = set(file_info.tags)
@@ -175,19 +240,30 @@ class Consumer:
             self.log("debug", "Tagging with {}".format(tag_names))
             document.tags.add(*relevant_tags)
 
+        self.apply_overrides(document)
+
         document.filename = generate_filename(document)
 
-        create_source_path_directory(document.source_path)
-
-        self._write(document, doc, document.source_path)
-        self._write(document, thumbnail, document.thumbnail_path)
-
         # We need to save the document twice, since we need the PK of the
         # document in order to create its filename above.
         document.save()
 
         return document
 
+    def apply_overrides(self, document):
+        if self.override_title:
+            document.title = self.override_title
+
+        if self.override_correspondent_id:
+            document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
+
+        if self.override_document_type_id:
+            document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
+
+        if self.override_tag_ids:
+            for tag_id in self.override_tag_ids:
+                document.tags.add(Tag.objects.get(pk=tag_id))
+
     def _write(self, document, source, target):
         with open(source, "rb") as read_file:
             with open(target, "wb") as write_file:
@@ -196,13 +272,3 @@ class Consumer:
                 return
             self.log("debug", "Encrypting")
             write_file.write(GnuPG.encrypted(read_file))
-
-    def _cleanup_doc(self, doc):
-        self.log("debug", "Deleting document {}".format(doc))
-        os.unlink(doc)
-
-    @staticmethod
-    def _is_duplicate(doc):
-        with open(doc, "rb") as f:
-            checksum = hashlib.md5(f.read()).hexdigest()
-        return Document.objects.filter(checksum=checksum).exists()
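Taken together, a caller now drives the consumer roughly like this (a sketch based on the new signature; the module path documents.consumer, the file path and the id values are assumptions for illustration):

    # Sketch: consuming a file with metadata overrides via the new interface.
    from documents.consumer import Consumer, ConsumerError

    consumer = Consumer()
    try:
        document = consumer.try_consume_file(
            "/path/to/consume/scan.pdf",
            override_title="Electric bill",   # placeholder values
            override_correspondent_id=3,
            override_tag_ids=[1, 2],
        )
        print("Created document {}".format(document.pk))
    except ConsumerError as e:
        # All failure modes now surface as ConsumerError instead of False.
        print("Consumption failed: {}".format(e))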
@@ -1,9 +1,11 @@
 import os
+import tempfile
 from datetime import datetime
 from time import mktime
 
 from django import forms
 from django.conf import settings
+from django_q.tasks import async_task
 from pathvalidate import validate_filename, ValidationError
 
 
@@ -18,15 +20,6 @@ class UploadForm(forms.Form):
             raise forms.ValidationError("That filename is suspicious.")
         return self.cleaned_data.get("document")
 
-    def get_filename(self, i=None):
-        return os.path.join(
-            settings.CONSUMPTION_DIR,
-            "{}_{}".format(
-                str(i),
-                self.cleaned_data.get("document").name
-            ) if i else self.cleaned_data.get("document").name
-        )
-
     def save(self):
         """
         Since the consumer already does a lot of work, it's easier just to save
@@ -35,15 +28,13 @@ class UploadForm(forms.Form):
         """
 
         document = self.cleaned_data.get("document").read()
+        original_filename = self.cleaned_data.get("document").name
 
         t = int(mktime(datetime.now().timetuple()))
 
-        file_name = self.get_filename()
-        i = 0
-        while os.path.exists(file_name):
-            i += 1
-            file_name = self.get_filename(i)
-
-        with open(file_name, "wb") as f:
+        with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f:
             f.write(document)
-            os.utime(file_name, times=(t, t))
+            os.utime(f.name, times=(t, t))
+
+            async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))
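The documents.tasks.consume_file task dispatched here is not shown in this diff; presumably it is a thin wrapper that hands the file to the consumer, along the lines of this hypothetical sketch:

    # Hypothetical sketch of the task the form dispatches to (not part of this diff).
    from documents.consumer import Consumer

    def consume_file(path, override_filename=None, **overrides):
        # Runs inside a django-q worker; a raised ConsumerError makes the
        # task show up as failed in the admin interface.
        document = Consumer().try_consume_file(
            path, override_filename=override_filename, **overrides)
        return "Success. New document id {}".format(document.pk)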
@@ -1,249 +0,0 @@
-import datetime
-import imaplib
-import logging
-import os
-import re
-import time
-import uuid
-from base64 import b64decode
-from email import policy
-from email.parser import BytesParser
-
-from dateutil import parser
-from django.conf import settings
-
-from .models import Correspondent
-
-
-class MailFetcherError(Exception):
-    pass
-
-
-class InvalidMessageError(MailFetcherError):
-    pass
-
-
-class Loggable(object):
-
-    def __init__(self, group=None):
-        self.logger = logging.getLogger(__name__)
-        self.logging_group = group or uuid.uuid4()
-
-    def log(self, level, message):
-        getattr(self.logger, level)(message, extra={
-            "group": self.logging_group
-        })
-
-
-class Message(Loggable):
-    """
-    A crude, but simple email message class. We assume that there's a subject
-    and n attachments, and that we don't care about the message body.
-    """
-
-    SECRET = os.getenv("PAPERLESS_EMAIL_SECRET")
-
-    def __init__(self, data, group=None):
-        """
-        Cribbed heavily from
-        https://www.ianlewis.org/en/parsing-email-attachments-python
-        """
-
-        Loggable.__init__(self, group=group)
-
-        self.subject = None
-        self.time = None
-        self.attachment = None
-
-        message = BytesParser(policy=policy.default).parsebytes(data)
-        self.subject = str(message["Subject"]).replace("\r\n", "")
-        self.body = str(message.get_body())
-
-        self.check_subject()
-        self.check_body()
-
-        self._set_time(message)
-
-        self.log("info", 'Importing email: "{}"'.format(self.subject))
-
-        attachments = []
-        for part in message.walk():
-
-            content_disposition = part.get("Content-Disposition")
-            if not content_disposition:
-                continue
-
-            dispositions = content_disposition.strip().split(";")
-            if len(dispositions) < 2:
-                continue
-
-            if not dispositions[0].lower() == "attachment" and \
-               "filename" not in dispositions[1].lower():
-                continue
-
-            file_data = part.get_payload()
-
-            attachments.append(Attachment(
-                b64decode(file_data), content_type=part.get_content_type()))
-
-        if len(attachments) == 0:
-            raise InvalidMessageError(
-                "There don't appear to be any attachments to this message")
-
-        if len(attachments) > 1:
-            raise InvalidMessageError(
-                "There's more than one attachment to this message. It cannot "
-                "be indexed automatically."
-            )
-
-        self.attachment = attachments[0]
-
-    def __bool__(self):
-        return bool(self.attachment)
-
-    def check_subject(self):
-        if self.subject is None:
-            raise InvalidMessageError("Message does not have a subject")
-        if not Correspondent.SAFE_REGEX.match(self.subject):
-            raise InvalidMessageError("Message subject is unsafe: {}".format(
-                self.subject))
-
-    def check_body(self):
-        if self.SECRET not in self.body:
-            raise InvalidMessageError("The secret wasn't in the body")
-
-    def _set_time(self, message):
-        self.time = datetime.datetime.now()
-        message_time = message.get("Date")
-        if message_time:
-            try:
-                self.time = parser.parse(message_time)
-            except (ValueError, AttributeError):
-                pass  # We assume that "now" is ok
-
-    @property
-    def file_name(self):
-        return "{}.{}".format(self.subject, self.attachment.suffix)
-
-
-class Attachment(object):
-
-    SAFE_SUFFIX_REGEX = re.compile(
-        r"^(application/(pdf))|(image/(png|jpeg|gif|tiff))$")
-
-    def __init__(self, data, content_type):
-
-        self.content_type = content_type
-        self.data = data
-        self.suffix = None
-
-        m = self.SAFE_SUFFIX_REGEX.match(self.content_type)
-        if not m:
-            raise MailFetcherError(
-                "Not-awesome file type: {}".format(self.content_type))
-        self.suffix = m.group(2) or m.group(4)
-
-    def read(self):
-        return self.data
-
-
-class MailFetcher(Loggable):
-
-    def __init__(self, consume=settings.CONSUMPTION_DIR):
-
-        Loggable.__init__(self)
-
-        self._connection = None
-        self._host = os.getenv("PAPERLESS_CONSUME_MAIL_HOST")
-        self._port = os.getenv("PAPERLESS_CONSUME_MAIL_PORT")
-        self._username = os.getenv("PAPERLESS_CONSUME_MAIL_USER")
-        self._password = os.getenv("PAPERLESS_CONSUME_MAIL_PASS")
-        self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")
-
-        self._enabled = bool(self._host)
-        if self._enabled and Message.SECRET is None:
-            raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined")
-
-        self.last_checked = time.time()
-        self.consume = consume
-
-    def pull(self):
-        """
-        Fetch all available mail at the target address and store it locally in
-        the consumption directory so that the file consumer can pick it up and
-        do its thing.
-        """
-
-        if self._enabled:
-
-            # Reset the grouping id for each fetch
-            self.logging_group = uuid.uuid4()
-
-            self.log("debug", "Checking mail")
-
-            for message in self._get_messages():
-
-                self.log("info", 'Storing email: "{}"'.format(message.subject))
-
-                t = int(time.mktime(message.time.timetuple()))
-                file_name = os.path.join(self.consume, message.file_name)
-                with open(file_name, "wb") as f:
-                    f.write(message.attachment.data)
-                    os.utime(file_name, times=(t, t))
-
-        self.last_checked = time.time()
-
-    def _get_messages(self):
-
-        r = []
-        try:
-
-            self._connect()
-            self._login()
-
-            for message in self._fetch():
-                if message:
-                    r.append(message)
-
-            self._connection.expunge()
-            self._connection.close()
-            self._connection.logout()
-
-        except MailFetcherError as e:
-            self.log("error", str(e))
-
-        return r
-
-    def _connect(self):
-        try:
-            self._connection = imaplib.IMAP4_SSL(self._host, self._port)
-        except OSError as e:
-            msg = "Problem connecting to {}: {}".format(self._host, e.strerror)
-            raise MailFetcherError(msg)
-
-    def _login(self):
-
-        login = self._connection.login(self._username, self._password)
-        if not login[0] == "OK":
-            raise MailFetcherError("Can't log into mail: {}".format(login[1]))
-
-        inbox = self._connection.select(self._inbox)
-        if not inbox[0] == "OK":
-            raise MailFetcherError("Can't find the inbox: {}".format(inbox[1]))
-
-    def _fetch(self):
-
-        for num in self._connection.search(None, "ALL")[1][0].split():
-
-            __, data = self._connection.fetch(num, "(RFC822)")
-
-            message = None
-            try:
-                message = Message(data[0][1], self.logging_group)
-            except InvalidMessageError as e:
-                self.log("error", str(e))
-            else:
-                self._connection.store(num, "+FLAGS", "\\Deleted")
-
-            if message:
-                yield message
@@ -3,10 +3,10 @@ import os
 
 from django.conf import settings
 from django.core.management.base import BaseCommand
+from django_q.tasks import async_task
 from watchdog.events import FileSystemEventHandler
 from watchdog.observers import Observer
+from watchdog.observers.polling import PollingObserver
-from documents.consumer import Consumer
 
 try:
     from inotify_simple import INotify, flags
@@ -16,13 +16,10 @@ except ImportError:
 
 class Handler(FileSystemEventHandler):
 
-    def __init__(self, consumer):
-        self.consumer = consumer
-
     def _consume(self, file):
         if os.path.isfile(file):
             try:
-                self.consumer.try_consume_file(file)
+                async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
             except Exception as e:
                 # Catch all so that the consumer won't crash.
                 logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
@@ -37,7 +34,7 @@ class Handler(FileSystemEventHandler):
 class Command(BaseCommand):
     """
     On every iteration of an infinite loop, consume what we can from the
-    consumption directory, and fetch any mail available.
+    consumption directory.
     """
 
     def __init__(self, *args, **kwargs):
@@ -45,12 +42,6 @@ class Command(BaseCommand):
         self.verbosity = 0
         self.logger = logging.getLogger(__name__)
 
-        self.file_consumer = None
-        self.mail_fetcher = None
-        self.first_iteration = True
-
-        self.consumer = Consumer()
-
         BaseCommand.__init__(self, *args, **kwargs)
 
     def add_arguments(self, parser):
@@ -66,9 +57,6 @@ class Command(BaseCommand):
         self.verbosity = options["verbosity"]
         directory = options["directory"]
 
-        for d in (settings.ORIGINALS_DIR, settings.THUMBNAIL_DIR):
-            os.makedirs(d, exist_ok=True)
-
         logging.getLogger(__name__).info(
             "Starting document consumer at {}".format(
                 directory
@@ -78,11 +66,16 @@ class Command(BaseCommand):
         # Consume all files as this is not done initially by the watchdog
         for entry in os.scandir(directory):
             if entry.is_file():
-                self.consumer.try_consume_file(entry.path)
+                async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
 
         # Start the watchdog. Woof!
-        observer = Observer()
-        event_handler = Handler(self.consumer)
+        if settings.CONSUMER_POLLING > 0:
+            logging.getLogger(__name__).info('Using polling instead of file'
+                                             'system notifications.')
+            observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
+        else:
+            observer = Observer()
+        event_handler = Handler()
         observer.schedule(event_handler, directory, recursive=True)
         observer.start()
         try:
@@ -9,13 +9,11 @@ from django_q.tasks import schedule
 def add_schedules(apps, schema_editor):
     schedule('documents.tasks.train_classifier', name="Train the classifier", schedule_type=Schedule.HOURLY)
     schedule('documents.tasks.index_optimize', name="Optimize the index", schedule_type=Schedule.DAILY)
-    schedule('documents.tasks.consume_mail', name="Check E-Mail", schedule_type=Schedule.MINUTES, minutes=10)
 
 
 def remove_schedules(apps, schema_editor):
     Schedule.objects.filter(func='documents.tasks.train_classifier').delete()
     Schedule.objects.filter(func='documents.tasks.index_optimize').delete()
-    Schedule.objects.filter(func='documents.tasks.consume_mail').delete()
 
 
 class Migration(migrations.Migration):
@@ -113,6 +113,7 @@ class DocumentType(MatchingModel):
 
 class Document(models.Model):
 
+    # TODO: why do we need an explicit list
     TYPE_PDF = "pdf"
     TYPE_PNG = "png"
    TYPE_JPG = "jpg"
@@ -291,7 +292,7 @@ class FileInfo:
             non_separated_word=r"([\w,. ]|([^\s]-))"
         )
     )
+    # TODO: what is this used for
     formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
     REGEXES = OrderedDict([
         ("created-correspondent-title-tags", re.compile(
@@ -41,15 +41,16 @@ def get_parser_class(doc):
     Determine the appropriate parser class based on the file
     """
 
-    parsers = []
-    for response in document_consumer_declaration.send(None):
-        parsers.append(response[1])
-
     options = []
-    for parser in parsers:
-        result = parser(doc)
-        if result:
-            options.append(result)
+    # His last order was: COME! And they came. All of them. Even the parsers.
+    for response in document_consumer_declaration.send(None):
+        parser_declaration = response[1]
+        parser_test = parser_declaration["test"]
+
+        if parser_test(doc):
+            options.append(parser_declaration)
 
     if not options:
         return None
@@ -6,14 +6,10 @@ from whoosh.writing import AsyncWriter
 from documents import index
 from documents.classifier import DocumentClassifier, \
     IncompatibleClassifierVersionError
-from documents.mail import MailFetcher
+from documents.consumer import Consumer, ConsumerError
 from documents.models import Document
 
 
-def consume_mail():
-    MailFetcher().pull()
-
-
 def index_optimize():
     index.open_index().optimize()
 
@@ -54,3 +50,27 @@ def train_classifier():
         logging.getLogger(__name__).error(
             "Classifier error: " + str(e)
         )
+
+
+def consume_file(path,
+                 override_filename=None,
+                 override_title=None,
+                 override_correspondent_id=None,
+                 override_document_type_id=None,
+                 override_tag_ids=None):
+
+    document = Consumer().try_consume_file(
+        path,
+        override_filename=override_filename,
+        override_title=override_title,
+        override_correspondent_id=override_correspondent_id,
+        override_document_type_id=override_document_type_id,
+        override_tag_ids=override_tag_ids)
+
+    if document:
+        return "Success. New document id {} created".format(
+            document.pk
+        )
+    else:
+        raise ConsumerError("Unknown error: Returned document was null, but "
+                            "no error message was given.")
File diff suppressed because it is too large
@@ -1,208 +0,0 @@
-Return-Path: <sender@example.com>
-X-Original-To: sender@mailbox4.mailhost.com
-Delivered-To: sender@mailbox4.mailhost.com
-Received: from mx8.mailhost.com (mail8.mailhost.com [75.126.24.68])
-    by mailbox4.mailhost.com (Postfix) with ESMTP id B62BD5498001
-    for <sender@mailbox4.mailhost.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
-Received: from localhost (localhost.localdomain [127.0.0.1])
-    by mx8.mailhost.com (Postfix) with ESMTP id B41796F190D
-    for <sender@mailbox4.mailhost.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
-X-Spam-Flag: NO
-X-Spam-Score: 0
-X-Spam-Level:
-X-Spam-Status: No, score=0 tagged_above=-999 required=3
-    tests=[RCVD_IN_DNSWL_NONE=-0.0001]
-Received: from mx8.mailhost.com ([127.0.0.1])
-    by localhost (mail8.mailhost.com [127.0.0.1]) (amavisd-new, port 10024)
-    with ESMTP id 3cj6d28FXsS3 for <sender@mailbox4.mailhost.com>;
-    Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
-Received: from smtp.mailhost.com (smtp.mailhost.com [74.55.86.74])
-    by mx8.mailhost.com (Postfix) with ESMTP id 527D76F1529
-    for <paperless@example.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
-Received: from [10.114.0.19] (nl3x.mullvad.net [46.166.136.162])
-    by smtp.mailhost.com (Postfix) with ESMTP id 9C52420C6FDA
-    for <paperless@example.com>; Thu, 4 Feb 2016 22:01:16 +0000 (UTC)
-To: paperless@example.com
-From: Daniel Quinn <sender@example.com>
-Subject: Test 0
-Message-ID: <56B3CA2A.6030806@example.com>
-Date: Thu, 4 Feb 2016 22:01:14 +0000
-User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101
-    Thunderbird/38.5.0
-MIME-Version: 1.0
-Content-Type: multipart/mixed;
-    boundary="------------090701020702030809070008"
-
-This is a multi-part message in MIME format.
---------------090701020702030809070008
-Content-Type: text/plain; charset=utf-8
-Content-Transfer-Encoding: 7bit
-
-The secret word is "paperless" :-)
-
---------------090701020702030809070008
-Content-Type: application/pdf;
-    name="test0.pdf"
-Content-Transfer-Encoding: base64
-Content-Disposition: attachment;
-    filename="test0.pdf"
-
-JVBERi0xLjQKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0
-ZURlY29kZT4+CnN0cmVhbQp4nFWLQQvCMAyF7/kVOQutSdeuHZSA0+3gbVDwIN6c3gR38e/b
-bF4kkPfyvReyjB94IyFVF7pgG0ze4TLDZYevLamzPKEvEFqbMEZfq+WO+5GRHZbHNROLy+So
-UfFi6g7/RyusEpUl9VsQxQTlHR2oV3wUEzOdhOnXG1aw/o1yK2cYCkww4RdbUCevCmVuZHN0
-cmVhbQplbmRvYmoKCjMgMCBvYmoKMTM5CmVuZG9iagoKNSAwIG9iago8PC9MZW5ndGggNiAw
-IFIvRmlsdGVyL0ZsYXRlRGVjb2RlL0xlbmd0aDEgMTA4MjQ+PgpzdHJlYW0KeJzlOWt0G9WZ
-95uRbNmWLckPWY4SaRTFedmybI8T4rw8sS3ZiZ1YfqWSCbFkS7YEtiQkJSE8GlNeOQ5pUmh5
-Zkt2l+XQNl3GhLaBpcWw0D19UGALLRRS0gM9nD0lxVBK9wCx97tXI0UJAc727L8d+c587/u9
-7p0rOZXYEyJaMkV4Io1OBuLOqmqBEPJLQqB0dG9K2NRTsQHhM4Rw/zkWH5+870e7PiRE9Rgh
-+Y+NT+wf+/b3e4YI0YYJKX41HAoEfxj6vUjIIgltrA0jYef8/nzEr0F8WXgydY2bP7QO8WOI
-SxOx0cDxxbUmxN9AfOlk4Jr4apWLI8SMKBGigcmQpYXrRBx9KtobjyVTQbJsgZDl91B+PBGK
-d9838hzipwjhjyIN8EMvLYJ5FOd4lTovX1NQWKQtLtGR/3eX+jCpIJ3qTURH4ux+wcWfIFXk
-XkIW3qXY+ft898LH/5deaNKPe8hD5DFymLxGrlAYbuIhEbIHKbnX0+QlpNLLQ4bId8n055g9
-QU4hPy3nJ0doJJe8PORucpL8xwWzeMgkuQ59+QF5DRrIz7BVYuQD0JAbyXNo9QOkbb+UKa4E
-b2MMHMuhvk7u5w6RbdzbiNxLOZyT05NnyTHYjZZTGOfhbMQbP2P0NnID3vtJmOxFmF3qTZ/+
-jhQs/AWjuoFsI18jW8hEjsaT8ABfiPUbIA9gTp9mNGeGmd/JX8n9kOPO3YnIN8g4jgBg7Nxh
-fsvnZOh/ffGDpBhW8dWk4FJcrono5j/mGhc+5JeRQjK4MJehLXQt/IUPzEdVw6rF6k2qX3zR
-HHnfUE2iNln44/x180H1DvVDWK2HcePouHzI5x0c6O/r9fTs2N7dtW1rZ4fb1d7WukVq2bxp
-44b1zesuW7umod5Z56hduWJ59TL7UpvVVG7Q60qKiwoLNPl5ahXPAakVZPC7ZL5aMLgDdpc9
-0OmoFVymcLuj1mV3+2UhIMj4UC23d3Yykj0gC35BXo6PQA7ZL0soOXaRpJSWlLKSoBc2ko10
-CbsgP99uF07BUK8X4cPtdp8gn2XwdgarljOkGBGbDTWYV9RbwSW794anXX70EWaKCtvsbaFC
-Ry2ZKSxCsAgheaU9PgMrNwMDuJWu9TMc0RTTaTFSVyAoe3q9rnazzeZz1G6VS+ztjEXamEk5
-r03OZyaFCHWdHBJmamenbz+lJyP+Gm3QHgzs8sp8AHWnedf09G2yoUZeZW+XV137tgkjD8m1
-9naXXEOtdvVl5+k6PyXI6mq9XZj+K8Fw7GffvZASUCh51fq/EgrKXJsMfV4bvcxuzPX0tNsu
-uKf904FTC1MjdkFvn57RaqfjLkw38XjRxKmFJw6ZZfftPlnvD8N6nxK6u69LLuu93Ctz1W4h
-HEAK/rXYbevMNkNWxvN5bIJpweRghm02moZDpyQygog81etN4wIZMT9KJGeNT+b8lDOb4VQM
-Us5UhpNV99uxtl393mlZVb01aHdhxg8F5KkR7K4raWHsernkI7PNPl1qEJqdPiYroFdbgxFB
-Vi/HJKFWrgL2DVWZ1jOk5KP046wZJ1huKBWa7WiG2nHZXX7lb2/YhAYETHRnTboRBryy1I6A
-FFAq5pqpd6JGwI8Fi7SzYspOe1wut7dmq0vdckX6vUxFUZPL22TiH1W0ZKeLrSvBNe1vT7tA
-bdl7vY8TceHMTJNgPimSJuJrp8LGNuyy5a5pb3BMtvrNQVx3Y4LXbJMlH1bYZ/eGfLTtMEOr
-zphZc/hYrwx4u/rtXb1D3nWKI2kGNaeqdl1kxu41p81gA8qaao3g5cy8DwX1SBDcCNhbN+Jd
-zq/W4NBjwhmVNm7rRsELZpKRRjfkVYIr1K7IUfwCo2raTm2dGWt5FEU7bZ1mm8+Wvhy1HLIF
-ZWLU0NCkdmZYuE0hQ4P92dbJSDSXJtr0gtcesvvsYUGWPF4aG00Py7KSDJZzpVYDF2A5ycI0
-ERuyMwhNpuyuMecmV+5geBbtvIi9NcMWpjX2rv5patyuGCTo+VaZ0BaW1hnMbC+gC9qOe6+g
-xyXNFvT0jCTRxRxeT43Ytwan7f3ejUwa95MbzNfSuUpJF3QNtDpqcWtrnbHDwd4ZCQ72D3kf
-1+O58OCA91EOuDZ/q29mGfK8jwv40mBUjlIpkSICRailPkQ0TN78uETIFOOqGIHho6eAMJom
-QwMyeopL0/TpiZaziSTCIUeV5kgZaRXSNGnaFKOxa4bQlEmFakkjFUharpgzzwAlPYqUJ/Ac
-WwDkpBaKwTyDWn2MfAqmZgokc1piCiWktIcHB89PPTjkPanFt7OZ3XGiVnphu5jCWGx8rbiE
-IG2U633hab+PLjZixNLgH8hg34xlsm9GR/K0cqE91CoX2VspvYXSW9L0PErPxxYFI6D6FNbe
-IwPtgMu9NlySwqKfmaf1Z2mlfLipTOv/6MCMVeP3hqfxDFoOG6XTpVwRp+ErjFqigQJeoykw
-8AW831fAl3KEG/aR0hYj6IxwxghPGeGIEQ4YYdgISBQY/ao5I7xghOOMFzdCjxGsjJGmy0Z4
-gLFiTE0yQj0TIEZ4k3GnGL2eUTYssHnSakcYo4fx5hhdzsyRVhCYzhwzNMummWJcdM2ZmeOK
-7HV15koo1+6L6J/hUB5pqTEQ0cTuBtHkHN59hWgohcpmg9hQb1tzmcG+VAd2g81gX1EHNWCo
-rIANr4jnrjC3qY61my0/v6bhlTVm1d3lL8GG+edeyi/65CrzGnqgAlKOJ7c/4neCJeQJaT8p
-L68qLikpqCqwWJcs8viWkHJEKqs8Pm1lRRnHqdWGPp9af9wKZ6wwawW9FYgVmhE5aoW4FfxW
-8FhBskK9FQQrWBkbWVMZLrJeZJqyFY7n0HOTk0hckAAldoy6RaSAyNJQCs0Ye/rTUA/l+ZtB
-bDRWYOA0G032pfkKuGKNDdz5nT9qufb6xPxVNzy0+6YD88F9t0Mj/1G4btXGr9927q4qh6OK
-231iybkyCqk5kwMXTg2eT0vV3aQIvy39gzRGtNo8g6HSyBf0+wgPep6vkCpKPb4KndagM3h8
-uorySlBVQvOHlXC0Erh4JfgrwVMJUiXMVoJcCccZKlSCvhJIJcwxCormSl7YIzQFwywL2fKT
-RSb9r7D4LAEGUQk+z750+ZqmtZgA/nzQ10mOWkmqdUiF/zhfdfwWqFG9mcalT9bTOHmhiq7B
-gYV3uV/zz5GVxCc12fLLFxVjS6xaXWzjKystHp+5Us8XeXz5vHFqNcRXg381eFaDsBoeWQ3D
-q6FnNWT8JVgewmpUSrA26QKhg1kPV6wRK41i45omJ9RxzN3KCvuK5faleRXlxkoLz/165vvu
-79Q7GrqueeZeX2hX43eOjt/vXL0m0Tu4fcedQy120Nx+dEnpOze1P3Rt0xJb+6j7+iPW5yed
-nvbmHYsa69p20q8ZpHPhXf5q/mlixt1lUmoxaKqrVYJWW6Xi8di/tHBpr89UYTAsxooZrAZO
-yxsMRFNozFdhjBWkwuMj+qkVMLwCpBWAwBVYBEw+MbEhljY708knzawn0yvQoESp9N8KDNbQ
-tBlaYE3TcrYu16yF/BKoKBcb114GL933jT3z82WJmfe3Hr/ncMe2YP/Sdf8E5KZbh4+0jzby
-T3/1a+duqXLsToBp93VbeNWdgV3OPc/b5y0q9e6obDWxNYs1c6huJEbSIa0oLCnJL+P5SpNK
-W6T1+Aryi3S4pg19PmJ8wASyCVpM4DTRMiUybSSKivfNpc2NjbSH1NhABvuaFhArxAq7oRzr
-dFlFCcAO//B1N4RafvvbDfXr++03lyfGuTsdK155ZeDcgS2t+i0mK8u5B3Puxh6qIIvJYWmo
-CkC3SFOhq1hiqSKY6CprFSa6qkpbWmr0+Er1WnWvT2uctYBsgeMWOGqBKQvELeC3gMcCxAKb
-8SFZoN4CggX0FphjciiU2R2yO+MVSnFoRUzOzMJINx5bGxXlFqBpx2CwBQ3YdYKhArDlbE3L
-QbXpwPjab9bX/8vO13/xq6cgMn93OAZ37ILXSqfv9ZQWrbPWvQvqjz6YH+uDYw8/ePJeGus2
-jPUd3C/LcMecknrKVUWkqkqv0lusZXqPrwz3A4yY5GOD5eurUIGr7PVxRtwGO3J3RsI2wSlG
-SQN+RldWvxLk+Z0v04HnNz4WXnWeXTA0leJKWr4JcNHT9gNWPMNyu8D9+uq75w/87uWJWN63
-oT01/9/z1qmbrx7yJeY/dQ/BH/4GUGm75UOT4+PHqxzw/E/+bQX3joHVcwfG+CjWsxA77Anp
-RoO6iKhJpUlT4vFp9Fy5BwMSTEBMcMYEHhPUm0BvgjmGvmiCWdZ1x01w1ARTJoibwG8CyQRp
-lQ0PMJKHkeoZVc8YufrHmWZaDe9XfO6bMbtdZpdpNkFYfL0tsy/mNyn7DPYC/+h858uvvvrG
-b3732FdvvWnPvhtvnoLX5w3z7//507/95dVnnjjz1o+fTb8baR52YB6MxC9txCwY1UbMgg7f
-hhq9sZwv7/XxRvR8c24kcyyGdABIf8QEw3TxZd3fnd3MxVxfq7E/BQPbFA10UxTSa5Df0XBi
-aP6y/3rttuOX1fSn5j/85+/dMdG8bBW8/6dz1vmPH3LOh1/+gY36akZfT/Mn0NdvScOktFil
-KigtqDSpy4xl2IpGnQqPpX2+Yr1RW4D+Vxxn2Z7NJL/5TE49CCtgtm5yJpw0RTBBbtpzX9NE
-eUUrj5yXNH0H0K5UenQFXY1VtGOh+fj1E18Hcd/8nzUdT7TMXQMW0J6wcu9UOT69r8rRvaIZ
-yrkxfFPRGPGdnFeF9WiAR6UFgzZv8WIbWbnS4bBpebGxoc7ja9CttC02aB01Do/PqqupqMrL
-Kygo7/MV6FfgMYev7vPx+r0i7BRhrQjLRDCKkCfCRyK8LcLLIvxUhAdFuEuEERHAI0K7CPVM
-rlwElQjhuYzgYyKkRJBEaGJs5H0owusizIogMxs3ixAUFRNpGX1G7EURnhXheyIcZWJXibBB
-BCEzx7r0BMdF8IswkJmjnGm+zTS/KcIUTi/V5PDNTPdt5gAnM4E4mx5n1YmgUdbL8BcfMy88
-heYcxM6r5wjlbE6Z45lyPsuc0CqzJzTWAOyEVknvVZA9ppVw+edPbcsvOrZ1PSy59izZ/kL7
-3P75wduPL3K5WioMh+dbDw0Oem86PL9z3z4o4/0165uaa1rn/6Qc5LwnNIXFqrVbMmi/b8m5
-quyBh/WRE5vhD9hHi8msdAMpKzMVabX5pvwllsV40l2sK0PEaPL4Co0VpbRt9LRtHrTA2xZ4
-1gL4QlFZoBmRb1ogZYGgBQYs0G6BJgsss4CZsfHNxuW+1/Bt9qIFsq+8LD03o8N/18n3wnPv
-RRls3/6v69Pn3t7BITz4Xnn11aDl/bXN2WOvt39YOfcq58HbFt6C/eQVPPeapCKSl6ct5gvu
-v5wvIy3KmRP3qpwDJ+x3NTW53KLo3tXQ2dkgut3s/y30Pzblq28Z1m38K2dN/9b/yzuXdJ7/
-JXfhrbwqNf0FXJMloV6+bd5FvpJLueDS5zXjN8a3SLWKkHKumdTwS8gAR397Pkw6ES/Hpwd5
-23DsQHgHPs2oU4NPJ0eUX9KfgR3wDLcaP8e4t/kh/pcqj+ohtSlvY97P895VZtWTRhoDi0SP
-/bILgX/nf0p4xrVANOvbzqyfgJI7FZgj+WRMgXk8i04qsAplDiqwmpSQexQ4j+jIQwqcT64l
-P1BgDX43dipwASmBNgUuhCj0KnARWcw9lf0vVx33ugIXkzV8gQKXkEX8Zuq9iv46f4L3KjAQ
-QaVSYI6UqJYpME/WqhoVWIUyYQVWk8WqgwqcRyyqBxU4n3yoekaBNWSl+ocKXEAWq3+vwIXc
-G+qPFbiIrNP8RoG1ZFdBiQIXkysLrlTgEtJU8HJ7ZDySilwbCgrBQCogjMbi+xOR8XBKWDm6
-Smisb6gXOmKx8YmQ0BZLxGOJQCoSi9YVtl0s1ij0oYnOQKpW2BodreuOjITSskJ/KBEZ6wuN
-75kIJLYkR0PRYCghOISLJS7Gd4YSSYo01tXX1zWc514sHEkKASGVCARDk4HEVUJs7EJHhERo
-PJJMhRJIjESFwbr+OsETSIWiKSEQDQoDWcWesbHIaIgRR0OJVACFY6kwunrlnkQkGYyM0tmS
-ddkIctLRnwrtDQnbA6lUKBmLtgaSOBd6NhCJxpK1wr5wZDQs7AskhWAoGRmPInNkv3ChjoDc
-AMYSjcb2osm9oVr0eywRSoYj0XEhSUNWtIVUOJCiQU+GUonIaGBiYj/WbDKOWiNYpH2RVBgn
-ngwlhR2hfUJfbDIQ/W5d2hXMzRgmVYhMxhOxvcxHR3I0EQpFcbJAMDASmYik0Fo4kAiMYsYw
-bZHRJMsIJkKIB6IO155ELB5CT7/S0X1eEB1MZzMZm9iLM1PpaCgUpDOi23tDE6iEE0/EYlfR
-eMZiCXQ0mAo7cjwfi0VTqBoTAsEgBo7Zio3umaR1wjSnMs4FRhMx5MUnAim0MpmsC6dS8fVO
-5759++oCSmlGsTJ1aNn5RbzU/nhIqUeCWpmc6MbyR2np9rD60iD6t3YLPXHMjxudExSBWiHT
-mg11DcoUmMZIPJWsS0Ym6mKJcWePu5u0kwgZx5HCcS0JkSARcAQQDyA0SmIkTvaTBJMKI1Ug
-K5G6Cp+NpJ404BBIB0rFkD+B+gJpQziBWvQeYHZjJErq8FtE25daa0SoT/Gik2nXIrQV9UfR
-QjfqjSA3165A+hklgvss1Rwne9CPAFK2kCRqhVAmyCQE4sDxZTa+jL+TQckspxH9qsdPHXp/
-Kd0vsxxBWwLLdYpxqK+TzP+rkBZDvS/KiIByIVa/JHJCDAsyq9T2IEr0MykP06S5SLHZokxq
-4BIz9uCMY6g/ymqZkRxltmlPpC3HEA4rWb0SM55gHgSZXia2JM782Rpcujv6mXd72ZzbGZ3i
-ScZrRTypxJXO2QDzIoZUmot96AmdN8zgAMtnkGnTLosqmiPYd8IXziMougGlLlE2x17FS6pT
-q+R7jN2TbN4oziEw/9JVvnBugeUpwLKervQkclNMdhTpE/jZr6yzScxKeq4RZSXtY+syrEQ8
-yewKZAc+97GuiLG6RW1LWY3PZyXdN2NKpwpMN45wjEWRyaOD1YZGEmKeUijA1v4IakywudO+
-hVl3BFhtQ0qtUyyCTL6CSqTU6zijOIiL9QVd8SElp1/BnaL7khbTGcztTVqTCeZvMsd2lHkb
-zMaYzjaVmlBmSkc8wXakq7L1GWP9ls5okFlzfE7Ox1huUsqsMeZRED/piqd7K4a6e1g90usp
-3c2pz2QuwPIbU/TibF9KKb5MsvURZh0YJ+vxbOlE7+injvVh7qoZVdZMneKz8+/Wo37FWQZz
-10ci68sk+titrP5odtXtyVm/mUr04x7UzfaLuNI/biVzwkUW6Kq5eNdsYPvlhVGkuzGCeIr5
-k2S5rGMxjCO/B2foZufo9DcHG/p0iWumwLNlBEIEIAzjpIxYwU92wDAZhC1kE0j4lJDXis82
-xOmzDjaRKZTbhPTNiG9E+gbcPK14b8HRg+MIDhWOtEQ9Sjjx6VRwB+K1qPEC3oENSm1BKn1u
-Q7wTnx3K0410Fz5dCr4VcXwSP+TjQbyF3Z8ClXQSzpyDF86BcA4OfAKeT2Dqg6MfcO/PrbI+
-MvfUHNfz3vB7j7zH178HuvdAQ87qz3rO+s/Gzx4/m1eoexe05E9geOvMOuubm04P/n7TG4Pk
-NEZ2uv605/TUafm0+jTwg2/wRqt+Vpitn43PTs2+OHtmdm5WM/WToz/hfvyk06p70vokZz3Z
-c/LASd7/MOgetj7Mee73388dPQa6Y9ZjzmP8fffWWe/tsFjvvmuF9cxdc3dxpxZmT95VbHA/
-CT3QTTZhDnec5Besj2ypgO0Ylg7vVhxOHD04YjiO4MDvPShuxeGEbmkdP/wtKLrDfEfNHdfd
-cegOdfzWqVuP3spP3XL0Fu6RvU/t5ZKeVdZYtMYa7VhtrRJNg/kiP5iH0+Ds0taR6pVu/7Bk
-HUahy4fqrUMdq6xlYumgGgNWoaCOt/ItfA8f44/wT/H5mj6PxdqL44xnzsNJngKtW9dj7XH2
-8KcWzkihLhta2xbfNrWN3+peZe3sWGfVdVg7nB0vdLzZ8V5H3nAHPIB/7kfcT7l5yb3K6Zbc
-Fpt7cad50ChWDBpAN6gXdYMcYKFFMujULeg4nW5Yd0DH60gL4aaMoIZTcHRmoL+mputU/kJf
-l6zxXC7DQbm6n96l3iE576BMBocu984AfN13y+HDpHVJl9zY75X9S3xdchABiQJTCOiXzBhJ
-qy+ZTNWwC2pqEN6Dd1KzpwaJu5NpKsnySU0SkrhHJZkS1FCBNA54r6E8JFA9QO3dSUJvlFmT
-VqLaScUcU07fGGDa/T/LhW2oCmVuZHN0cmVhbQplbmRvYmoKCjYgMCBvYmoKNjI5MQplbmRv
-YmoKCjcgMCBvYmoKPDwvVHlwZS9Gb250RGVzY3JpcHRvci9Gb250TmFtZS9CQUFBQUErTGli
-ZXJhdGlvblNlcmlmCi9GbGFncyA0Ci9Gb250QkJveFstNTQzIC0zMDMgMTI3NyA5ODFdL0l0
-YWxpY0FuZ2xlIDAKL0FzY2VudCA4OTEKL0Rlc2NlbnQgLTIxNgovQ2FwSGVpZ2h0IDk4MQov
-U3RlbVYgODAKL0ZvbnRGaWxlMiA1IDAgUgo+PgplbmRvYmoKCjggMCBvYmoKPDwvTGVuZ3Ro
-IDI5Mi9GaWx0ZXIvRmxhdGVEZWNvZGU+PgpzdHJlYW0KeJxdkctuwyAQRfd8Bct0EfmROA/J
-spQmseRFH6rbD3BgnCLVGGGy8N+XmUlbqQvQmZl7BxiSY3NqrAnJqx9VC0H2xmoP03jzCuQF
-rsaKLJfaqHCPaFdD50QSve08BRga249lKZK3WJuCn+XioMcLPIjkxWvwxl7l4uPYxri9OfcF
-A9ggU1FVUkMf+zx17rkbICHXstGxbMK8jJY/wfvsQOYUZ3wVNWqYXKfAd/YKokzTSpZ1XQmw
-+l8tK9hy6dVn56M0i9I0LdZV5Jx4s0NeMe+R18TbFXJBnKfIG9ZkyFvWUJ8d5wvkPTPlD8w1
-8iMz9Tyyl/Qnzp+Qz8xn5JrPPdOj7rfH5+H8f8Ym1c37ODL6JJoVTslY+P1HNzp00foG7l+O
-gwplbmRzdHJlYW0KZW5kb2JqCgo5IDAgb2JqCjw8L1R5cGUvRm9udC9TdWJ0eXBlL1RydWVU
-eXBlL0Jhc2VGb250L0JBQUFBQStMaWJlcmF0aW9uU2VyaWYKL0ZpcnN0Q2hhciAwCi9MYXN0
-Q2hhciAxNQovV2lkdGhzWzc3NyA2MTAgNTAwIDI3NyAzODkgMjUwIDQ0MyAyNzcgNDQzIDUw
-MCA1MDAgNDQzIDUwMCA3NzcgNTAwIDI1MApdCi9Gb250RGVzY3JpcHRvciA3IDAgUgovVG9V
-bmljb2RlIDggMCBSCj4+CmVuZG9iagoKMTAgMCBvYmoKPDwvRjEgOSAwIFIKPj4KZW5kb2Jq
-CgoxMSAwIG9iago8PC9Gb250IDEwIDAgUgovUHJvY1NldFsvUERGL1RleHRdCj4+CmVuZG9i
-agoKMSAwIG9iago8PC9UeXBlL1BhZ2UvUGFyZW50IDQgMCBSL1Jlc291cmNlcyAxMSAwIFIv
-TWVkaWFCb3hbMCAwIDU5NSA4NDJdL0dyb3VwPDwvUy9UcmFuc3BhcmVuY3kvQ1MvRGV2aWNl
-UkdCL0kgdHJ1ZT4+L0NvbnRlbnRzIDIgMCBSPj4KZW5kb2JqCgo0IDAgb2JqCjw8L1R5cGUv
-UGFnZXMKL1Jlc291cmNlcyAxMSAwIFIKL01lZGlhQm94WyAwIDAgNTk1IDg0MiBdCi9LaWRz
-WyAxIDAgUiBdCi9Db3VudCAxPj4KZW5kb2JqCgoxMiAwIG9iago8PC9UeXBlL0NhdGFsb2cv
-UGFnZXMgNCAwIFIKL09wZW5BY3Rpb25bMSAwIFIgL1hZWiBudWxsIG51bGwgMF0KL0xhbmco
-ZW4tR0IpCj4+CmVuZG9iagoKMTMgMCBvYmoKPDwvQ3JlYXRvcjxGRUZGMDA1NzAwNzIwMDY5
-MDA3NDAwNjUwMDcyPgovUHJvZHVjZXI8RkVGRjAwNEMwMDY5MDA2MjAwNzIwMDY1MDA0RjAw
-NjYwMDY2MDA2OTAwNjMwMDY1MDAyMDAwMzUwMDJFMDAzMD4KL0NyZWF0aW9uRGF0ZShEOjIw
-MTYwMjA0MjIwMDAyWicpPj4KZW5kb2JqCgp4cmVmCjAgMTQKMDAwMDAwMDAwMCA2NTUzNSBm
-IAowMDAwMDA3NTA5IDAwMDAwIG4gCjAwMDAwMDAwMTkgMDAwMDAgbiAKMDAwMDAwMDIyOSAw
-MDAwMCBuIAowMDAwMDA3NjUyIDAwMDAwIG4gCjAwMDAwMDAyNDkgMDAwMDAgbiAKMDAwMDAw
-NjYyNSAwMDAwMCBuIAowMDAwMDA2NjQ2IDAwMDAwIG4gCjAwMDAwMDY4NDEgMDAwMDAgbiAK
-MDAwMDAwNzIwMiAwMDAwMCBuIAowMDAwMDA3NDIyIDAwMDAwIG4gCjAwMDAwMDc0NTQgMDAw
-MDAgbiAKMDAwMDAwNzc1MSAwMDAwMCBuIAowMDAwMDA3ODQ4IDAwMDAwIG4gCnRyYWlsZXIK
-PDwvU2l6ZSAxNC9Sb290IDEyIDAgUgovSW5mbyAxMyAwIFIKL0lEIFsgPDRFN0ZCMEZCMjA4
-ODBCNURBQkIzQTNEOTQxNDlBRTQ3Pgo8NEU3RkIwRkIyMDg4MEI1REFCQjNBM0Q5NDE0OUFF
-NDc+IF0KL0RvY0NoZWNrc3VtIC8yQTY0RDMzNzRFQTVEODMwNTRDNEI2RDFEMUY4QzU1RQo+
-PgpzdGFydHhyZWYKODAxOAolJUVPRgo=
---------------090701020702030809070008--
218
src/documents/tests/test_api.py
Normal file
@@ -0,0 +1,218 @@
+import os
+import shutil
+import tempfile
+from unittest import mock
+from unittest.mock import MagicMock
+
+from django.contrib.auth.models import User
+from django.test import override_settings
+from rest_framework.test import APITestCase, APIClient
+
+from documents.models import Document, Correspondent, DocumentType, Tag
+
+
+class DocumentApiTest(APITestCase):
+
+    def setUp(self):
+        self.scratch_dir = tempfile.mkdtemp()
+        self.media_dir = tempfile.mkdtemp()
+        self.originals_dir = os.path.join(self.media_dir, "documents", "originals")
+        self.thumbnail_dir = os.path.join(self.media_dir, "documents", "thumbnails")
+
+        os.makedirs(self.originals_dir, exist_ok=True)
+        os.makedirs(self.thumbnail_dir, exist_ok=True)
+
+        override_settings(
+            SCRATCH_DIR=self.scratch_dir,
+            MEDIA_ROOT=self.media_dir,
+            ORIGINALS_DIR=self.originals_dir,
+            THUMBNAIL_DIR=self.thumbnail_dir
+        ).enable()
+
+        user = User.objects.create_superuser(username="temp_admin")
+        self.client.force_login(user=user)
+
+    def tearDown(self):
+        shutil.rmtree(self.scratch_dir, ignore_errors=True)
+        shutil.rmtree(self.media_dir, ignore_errors=True)
+
+    def testDocuments(self):
+
+        response = self.client.get("/api/documents/").data
+
+        self.assertEqual(response['count'], 0)
+
+        c = Correspondent.objects.create(name="c", pk=41)
+        dt = DocumentType.objects.create(name="dt", pk=63)
+        tag = Tag.objects.create(name="t", pk=85)
+
+        doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123")
+
+        doc.tags.add(tag)
+
+        response = self.client.get("/api/documents/", format='json')
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.data['count'], 1)
+
+        returned_doc = response.data['results'][0]
+        self.assertEqual(returned_doc['id'], doc.id)
+        self.assertEqual(returned_doc['title'], doc.title)
+        self.assertEqual(returned_doc['correspondent']['name'], c.name)
+        self.assertEqual(returned_doc['document_type']['name'], dt.name)
+        self.assertEqual(returned_doc['correspondent']['id'], c.id)
+        self.assertEqual(returned_doc['document_type']['id'], dt.id)
+        self.assertEqual(returned_doc['correspondent']['id'], returned_doc['correspondent_id'])
+        self.assertEqual(returned_doc['document_type']['id'], returned_doc['document_type_id'])
+        self.assertEqual(len(returned_doc['tags']), 1)
+        self.assertEqual(returned_doc['tags'][0]['name'], tag.name)
+        self.assertEqual(returned_doc['tags'][0]['id'], tag.id)
+        self.assertListEqual(returned_doc['tags_id'], [tag.id])
+
+        c2 = Correspondent.objects.create(name="c2")
+
+        returned_doc['correspondent_id'] = c2.pk
+        returned_doc['title'] = "the new title"
+
+        response = self.client.put('/api/documents/{}/'.format(doc.pk), returned_doc, format='json')
+
+        self.assertEqual(response.status_code, 200)
+
+        doc_after_save = Document.objects.get(id=doc.id)
+
+        self.assertEqual(doc_after_save.correspondent, c2)
+        self.assertEqual(doc_after_save.title, "the new title")
+
+        self.client.delete("/api/documents/{}/".format(doc_after_save.pk))
+
+        self.assertEqual(len(Document.objects.all()), 0)
+
+    def test_document_actions(self):
+
+        _, filename = tempfile.mkstemp(dir=self.originals_dir)
+
+        content = b"This is a test"
+        content_thumbnail = b"thumbnail content"
+
+        with open(filename, "wb") as f:
+            f.write(content)
+
+        doc = Document.objects.create(title="none", filename=os.path.basename(filename), file_type="pdf")
+
+        with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
+            f.write(content_thumbnail)
+
+        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
+
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.content, content)
+
+        response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))
+
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.content, content)
+
+        response = self.client.get('/api/documents/{}/thumb/'.format(doc.pk))
+
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.content, content_thumbnail)
+
+    def test_document_actions_not_existing_file(self):
+
+        doc = Document.objects.create(title="none", filename=os.path.basename("asd"), file_type="pdf")
+
+        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
+        self.assertEqual(response.status_code, 404)
+
+        response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))
+        self.assertEqual(response.status_code, 404)
+
+        response = self.client.get('/api/documents/{}/thumb/'.format(doc.pk))
+        self.assertEqual(response.status_code, 404)
+
+    def test_document_filters(self):
+
+        doc1 = Document.objects.create(title="none1", checksum="A")
+        doc2 = Document.objects.create(title="none2", checksum="B")
+        doc3 = Document.objects.create(title="none3", checksum="C")
+
+        tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
+        tag_2 = Tag.objects.create(name="t2")
+        tag_3 = Tag.objects.create(name="t3")
+
+        doc1.tags.add(tag_inbox)
+        doc2.tags.add(tag_2)
+        doc3.tags.add(tag_2)
+        doc3.tags.add(tag_3)
+
+        response = self.client.get("/api/documents/?is_in_inbox=true")
+        self.assertEqual(response.status_code, 200)
+        results = response.data['results']
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['id'], doc1.id)
+
+        response = self.client.get("/api/documents/?is_in_inbox=false")
+        self.assertEqual(response.status_code, 200)
+        results = response.data['results']
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['id'], doc2.id)
+        self.assertEqual(results[1]['id'], doc3.id)
+
+        response = self.client.get("/api/documents/?tags__id__in={},{}".format(tag_inbox.id, tag_3.id))
+        self.assertEqual(response.status_code, 200)
+        results = response.data['results']
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['id'], doc1.id)
+        self.assertEqual(results[1]['id'], doc3.id)
+
+        response = self.client.get("/api/documents/?tags__id__all={},{}".format(tag_2.id, tag_3.id))
+        self.assertEqual(response.status_code, 200)
+        results = response.data['results']
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['id'], doc3.id)
+
+        response = self.client.get("/api/documents/?tags__id__all={},{}".format(tag_inbox.id, tag_3.id))
+        self.assertEqual(response.status_code, 200)
+        results = response.data['results']
+        self.assertEqual(len(results), 0)
+
+        response = self.client.get("/api/documents/?tags__id__all={}a{}".format(tag_inbox.id, tag_3.id))
+        self.assertEqual(response.status_code, 200)
+        results = response.data['results']
+        self.assertEqual(len(results), 3)
+
+    @mock.patch("documents.index.autocomplete")
+    def test_search_autocomplete(self, m):
+        m.side_effect = lambda ix, term, limit: [term for _ in range(limit)]
+
+        response = self.client.get("/api/search/autocomplete/?term=test")
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(len(response.data), 10)
+
+        response = self.client.get("/api/search/autocomplete/?term=test&limit=20")
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(len(response.data), 20)
+
+        response = self.client.get("/api/search/autocomplete/?term=test&limit=-1")
+        self.assertEqual(response.status_code, 400)
+
+        response = self.client.get("/api/search/autocomplete/")
+        self.assertEqual(response.status_code, 400)
+
+        response = self.client.get("/api/search/autocomplete/?term=")
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(len(response.data), 10)
+
+    def test_statistics(self):
+
+        doc1 = Document.objects.create(title="none1", checksum="A")
+        doc2 = Document.objects.create(title="none2", checksum="B")
+        doc3 = Document.objects.create(title="none3", checksum="C")
+
+        tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
+
+        doc1.tags.add(tag_inbox)
+
+        response = self.client.get("/api/statistics/")
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.data['documents_total'], 3)
+        self.assertEqual(response.data['documents_inbox'], 1)
85
src/documents/tests/test_classifier.py
Normal file
@@ -0,0 +1,85 @@
+import tempfile
+
+from django.test import TestCase, override_settings
+
+from documents.classifier import DocumentClassifier
+from documents.models import Correspondent, Document, Tag, DocumentType
+
+
+class TestClassifier(TestCase):
+
+    def setUp(self):
+
+        self.classifier = DocumentClassifier()
+
+    def generate_test_data(self):
+        self.c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
+        self.c2 = Correspondent.objects.create(name="c2")
+        self.t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
+        self.t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_ANY, pk=34, is_inbox_tag=True)
+        self.t3 = Tag.objects.create(name="t3", matching_algorithm=Tag.MATCH_AUTO, pk=45)
+        self.dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)
+
+        self.doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=self.c1, checksum="A", document_type=self.dt)
+        self.doc2 = Document.objects.create(title="doc1", content="this is another document, but from c2", correspondent=self.c2, checksum="B")
+        self.doc_inbox = Document.objects.create(title="doc235", content="aa", checksum="C")
+
+        self.doc1.tags.add(self.t1)
+        self.doc2.tags.add(self.t1)
+        self.doc2.tags.add(self.t3)
+        self.doc_inbox.tags.add(self.t2)
+
+    def testNoTrainingData(self):
+        try:
+            self.classifier.train()
+        except ValueError as e:
+            self.assertEqual(str(e), "No training data available.")
+        else:
+            self.fail("Should raise exception")
+
+    def testEmpty(self):
+        Document.objects.create(title="WOW", checksum="3457", content="ASD")
+        self.classifier.train()
+        self.assertIsNone(self.classifier.document_type_classifier)
+        self.assertIsNone(self.classifier.tags_classifier)
+        self.assertIsNone(self.classifier.correspondent_classifier)
+
+        self.assertListEqual(self.classifier.predict_tags(""), [])
+        self.assertIsNone(self.classifier.predict_document_type(""))
+        self.assertIsNone(self.classifier.predict_correspondent(""))
+
+    def testTrain(self):
+        self.generate_test_data()
+        self.classifier.train()
+        self.assertListEqual(list(self.classifier.correspondent_classifier.classes_), [-1, self.c1.pk])
+        self.assertListEqual(list(self.classifier.tags_binarizer.classes_), [self.t1.pk, self.t3.pk])
+
+    def testPredict(self):
+        self.generate_test_data()
+        self.classifier.train()
+        self.assertEqual(self.classifier.predict_correspondent(self.doc1.content), self.c1.pk)
+        self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None)
+        self.assertTupleEqual(self.classifier.predict_tags(self.doc1.content), (self.t1.pk,))
+        self.assertTupleEqual(self.classifier.predict_tags(self.doc2.content), (self.t1.pk, self.t3.pk))
+        self.assertEqual(self.classifier.predict_document_type(self.doc1.content), self.dt.pk)
+        self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None)
+
+    def testDatasetHashing(self):
+
+        self.generate_test_data()
+
+        self.assertTrue(self.classifier.train())
+        self.assertFalse(self.classifier.train())
+
+    @override_settings(DATA_DIR=tempfile.mkdtemp())
+    def testSaveClassifier(self):
+
+        self.generate_test_data()
+
+        self.classifier.train()
+
+        self.classifier.save_classifier()
+
+        newClassifier = DocumentClassifier()
+        newClassifier.reload()
+        self.assertFalse(newClassifier.train())
@@ -1,8 +1,17 @@
+import os
 import re
+import shutil
+import tempfile
+from unittest import mock
+from unittest.mock import MagicMock
 
-from django.test import TestCase
+from django.conf import settings
+from django.db import DatabaseError
+from django.test import TestCase, override_settings
 
-from ..models import FileInfo, Tag
+from ..consumer import Consumer, ConsumerError
+from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
+from ..parsers import DocumentParser, ParseError
 
 
 class TestAttributes(TestCase):
@@ -394,3 +403,254 @@ class TestFieldPermutations(TestCase):
         self.assertEqual(info.created.year, 2019)
         self.assertEqual(info.created.month, 9)
         self.assertEqual(info.created.day, 8)
+
+
+class DummyParser(DocumentParser):
+
+    def get_thumbnail(self):
+        # not important during tests
+        raise NotImplementedError()
+
+    def __init__(self, path, logging_group, scratch_dir):
+        super(DummyParser, self).__init__(path, logging_group)
+        _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
+
+    def get_optimised_thumbnail(self):
+        return self.fake_thumb
+
+    def get_text(self):
+        return "The Text"
+
+
+class FaultyParser(DocumentParser):
+
+    def get_thumbnail(self):
+        # not important during tests
+        raise NotImplementedError()
+
+    def __init__(self, path, logging_group, scratch_dir):
+        super(FaultyParser, self).__init__(path, logging_group)
+        _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
+
+    def get_optimised_thumbnail(self):
+        return self.fake_thumb
+
+    def get_text(self):
+        raise ParseError("Does not compute.")
+
+
+class TestConsumer(TestCase):
+
+    def make_dummy_parser(self, path, logging_group):
+        return DummyParser(path, logging_group, self.scratch_dir)
+
+    def make_faulty_parser(self, path, logging_group):
+        return FaultyParser(path, logging_group, self.scratch_dir)
+
+    def setUp(self):
+        self.scratch_dir = tempfile.mkdtemp()
+        self.media_dir = tempfile.mkdtemp()
+        self.consumption_dir = tempfile.mkdtemp()
+
+        override_settings(
+            SCRATCH_DIR=self.scratch_dir,
+            MEDIA_ROOT=self.media_dir,
+            ORIGINALS_DIR=os.path.join(self.media_dir, "documents", "originals"),
+            THUMBNAIL_DIR=os.path.join(self.media_dir, "documents", "thumbnails"),
+            CONSUMPTION_DIR=self.consumption_dir
+        ).enable()
+
+        patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
+        m = patcher.start()
+        m.return_value = [(None, {
+            "parser": self.make_dummy_parser,
+            "test": lambda _: True,
+            "weight": 0
+        })]
+
+        self.addCleanup(patcher.stop)
+
+        self.consumer = Consumer()
+
+    def tearDown(self):
+        shutil.rmtree(self.scratch_dir, ignore_errors=True)
+        shutil.rmtree(self.media_dir, ignore_errors=True)
+        shutil.rmtree(self.consumption_dir, ignore_errors=True)
+
+    def get_test_file(self):
+        fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir)
+        return f
+
+    def testNormalOperation(self):
+
+        filename = self.get_test_file()
+        document = self.consumer.try_consume_file(filename)
+
+        self.assertEqual(document.content, "The Text")
+        self.assertEqual(document.title, os.path.splitext(os.path.basename(filename))[0])
+        self.assertIsNone(document.correspondent)
+        self.assertIsNone(document.document_type)
+        self.assertEqual(document.filename, "0000001.pdf")
+
+        self.assertTrue(os.path.isfile(
+            document.source_path
+        ))
+
+        self.assertTrue(os.path.isfile(
+            document.thumbnail_path
+        ))
+
+        self.assertFalse(os.path.isfile(filename))
+
+    def testOverrideFilename(self):
+        filename = self.get_test_file()
+        overrideFilename = "My Bank - Statement for November.pdf"
+
+        document = self.consumer.try_consume_file(filename, override_filename=overrideFilename)
+
+        self.assertEqual(document.correspondent.name, "My Bank")
+        self.assertEqual(document.title, "Statement for November")
+
+    def testOverrideTitle(self):
+
+        document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title")
+        self.assertEqual(document.title, "Override Title")
+
+    def testOverrideCorrespondent(self):
+        c = Correspondent.objects.create(name="test")
+
+        document = self.consumer.try_consume_file(self.get_test_file(), override_correspondent_id=c.pk)
+        self.assertEqual(document.correspondent.id, c.id)
+
+    def testOverrideDocumentType(self):
+        dt = DocumentType.objects.create(name="test")
+
+        document = self.consumer.try_consume_file(self.get_test_file(), override_document_type_id=dt.pk)
+        self.assertEqual(document.document_type.id, dt.id)
+
+    def testOverrideTags(self):
+        t1 = Tag.objects.create(name="t1")
+        t2 = Tag.objects.create(name="t2")
+        t3 = Tag.objects.create(name="t3")
+        document = self.consumer.try_consume_file(self.get_test_file(), override_tag_ids=[t1.id, t3.id])
+
+        self.assertIn(t1, document.tags.all())
+        self.assertNotIn(t2, document.tags.all())
+        self.assertIn(t3, document.tags.all())
+
+    def testNotAFile(self):
+        try:
+            self.consumer.try_consume_file("non-existing-file")
+        except ConsumerError as e:
+            self.assertTrue(str(e).endswith('It is not a file'))
+            return
+
+        self.fail("Should throw exception")
+
+    @override_settings(CONSUMPTION_DIR=None)
+    def testConsumptionDirUnset(self):
+        try:
+            self.consumer.try_consume_file(self.get_test_file())
+        except ConsumerError as e:
+            self.assertEqual(str(e), "The CONSUMPTION_DIR settings variable does not appear to be set.")
+            return
+
+        self.fail("Should throw exception")
+
+    @override_settings(CONSUMPTION_DIR="asd")
+    def testNoConsumptionDir(self):
+        try:
+            self.consumer.try_consume_file(self.get_test_file())
+        except ConsumerError as e:
+            self.assertEqual(str(e), "Consumption directory asd does not exist")
+            return
+
+        self.fail("Should throw exception")
+
+    def testDuplicates(self):
+        self.consumer.try_consume_file(self.get_test_file())
+
+        try:
+            self.consumer.try_consume_file(self.get_test_file())
+        except ConsumerError as e:
+            self.assertTrue(str(e).endswith("It is a duplicate."))
+            return
+
+        self.fail("Should throw exception")
+
+    @mock.patch("documents.parsers.document_consumer_declaration.send")
+    def testNoParsers(self, m):
+        m.return_value = []
+
+        try:
+            self.consumer.try_consume_file(self.get_test_file())
+        except ConsumerError as e:
+            self.assertTrue(str(e).startswith("No parsers abvailable"))
+            return
+
+        self.fail("Should throw exception")
+
+    @mock.patch("documents.parsers.document_consumer_declaration.send")
+    def testFaultyParser(self, m):
+        m.return_value = [(None, {
+            "parser": self.make_faulty_parser,
+            "test": lambda _: True,
+            "weight": 0
+        })]
+
+        try:
+            self.consumer.try_consume_file(self.get_test_file())
+        except ConsumerError as e:
+            self.assertEqual(str(e), "Does not compute.")
+            return
+
+        self.fail("Should throw exception.")
+
+    @mock.patch("documents.consumer.Consumer._write")
+    def testPostSaveError(self, m):
+        filename = self.get_test_file()
+        m.side_effect = OSError("NO.")
+        try:
+            self.consumer.try_consume_file(filename)
+        except ConsumerError as e:
+            self.assertEqual(str(e), "NO.")
+        else:
+            self.fail("Should raise exception")
+
+        # file not deleted
+        self.assertTrue(os.path.isfile(filename))
+
+        # Database empty
+        self.assertEqual(len(Document.objects.all()), 0)
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
+    def testFilenameHandling(self):
+        filename = self.get_test_file()
+
+        document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")
+
+        print(document.source_path)
+        print("===")
+
+        self.assertEqual(document.title, "new docs")
+        self.assertEqual(document.correspondent.name, "Bank")
+        self.assertEqual(document.filename, "bank/new-docs-0000001.pdf")
+
+    @mock.patch("documents.consumer.DocumentClassifier")
+    def testClassifyDocument(self, m):
+        correspondent = Correspondent.objects.create(name="test")
+        dtype = DocumentType.objects.create(name="test")
+        t1 = Tag.objects.create(name="t1")
+        t2 = Tag.objects.create(name="t2")
+
+        m.return_value = MagicMock()
+        m.return_value.predict_correspondent.return_value = correspondent.pk
+        m.return_value.predict_document_type.return_value = dtype.pk
+        m.return_value.predict_tags.return_value = [t1.pk]
+
+        document = self.consumer.try_consume_file(self.get_test_file())
+
+        self.assertEqual(document.correspondent, correspondent)
+        self.assertEqual(document.document_type, dtype)
+        self.assertIn(t1, document.tags.all())
+        self.assertNotIn(t2, document.tags.all())
@@ -1,90 +0,0 @@
import base64
import os
from hashlib import md5
from unittest import mock

import magic
from django.conf import settings
from django.test import TestCase

from ..mail import Message, Attachment


class TestMessage(TestCase):

    def __init__(self, *args, **kwargs):
        TestCase.__init__(self, *args, **kwargs)
        self.sample = os.path.join(
            settings.BASE_DIR,
            "documents",
            "tests",
            "samples",
            "mail.txt"
        )

    def test_init(self):
        with open(self.sample, "rb") as f:
            with mock.patch("logging.StreamHandler.emit") as __:
                message = Message(f.read())

        self.assertTrue(message)
        self.assertEqual(message.subject, "Test 0")

        data = message.attachment.read()

        self.assertEqual(
            md5(data).hexdigest(), "7c89655f9e9eb7dd8cde8568e8115d59")

        self.assertEqual(
            message.attachment.content_type, "application/pdf")
        with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
            self.assertEqual(m.id_buffer(data), "application/pdf")


class TestInlineMessage(TestCase):

    def __init__(self, *args, **kwargs):
        TestCase.__init__(self, *args, **kwargs)
        self.sample = os.path.join(
            settings.BASE_DIR,
            "documents",
            "tests",
            "samples",
            "inline_mail.txt"
        )

    def test_init(self):
        with open(self.sample, "rb") as f:
            with mock.patch("logging.StreamHandler.emit") as __:
                message = Message(f.read())

        self.assertTrue(message)
        self.assertEqual(message.subject, "Paperless Inline Image")

        data = message.attachment.read()

        self.assertEqual(
            md5(data).hexdigest(), "30c00a7b42913e65f7fdb0be40b9eef3")

        self.assertEqual(
            message.attachment.content_type, "image/png")
        with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
            self.assertEqual(m.id_buffer(data), "image/png")


class TestAttachment(TestCase):

    def test_init(self):
        data = base64.encodebytes(b"0")
        self.assertEqual(Attachment(data, "application/pdf").suffix, "pdf")
        self.assertEqual(Attachment(data, "image/png").suffix, "png")
        self.assertEqual(Attachment(data, "image/jpeg").suffix, "jpeg")
        self.assertEqual(Attachment(data, "image/gif").suffix, "gif")
        self.assertEqual(Attachment(data, "image/tiff").suffix, "tiff")
        self.assertEqual(Attachment(data, "image/png").read(), data)
@@ -14,7 +14,7 @@ class TestParserDiscovery(TestCase):
             pass

         m.return_value = (
-            (None, lambda _: {"weight": 0, "parser": DummyParser}),
+            (None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
         )

         self.assertEqual(
@@ -32,8 +32,8 @@ class TestParserDiscovery(TestCase):
             pass

         m.return_value = (
-            (None, lambda _: {"weight": 0, "parser": DummyParser1}),
-            (None, lambda _: {"weight": 1, "parser": DummyParser2}),
+            (None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
+            (None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
         )

         self.assertEqual(
@@ -43,7 +43,7 @@ class TestParserDiscovery(TestCase):

     @mock.patch("documents.parsers.document_consumer_declaration.send")
     def test__get_parser_class_0_parsers(self, m, *args):
-        m.return_value = ((None, lambda _: None),)
+        m.return_value = []
         with TemporaryDirectory() as tmpdir:
             self.assertIsNone(
                 get_parser_class("doc.pdf")
@@ -52,7 +52,7 @@ class CorrespondentViewSet(ModelViewSet):
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
     filter_backends = (DjangoFilterBackend, OrderingFilter)
-    filter_class = CorrespondentFilterSet
+    filterset_class = CorrespondentFilterSet
     ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")


@@ -63,7 +63,7 @@ class TagViewSet(ModelViewSet):
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
     filter_backends = (DjangoFilterBackend, OrderingFilter)
-    filter_class = TagFilterSet
+    filterset_class = TagFilterSet
     ordering_fields = ("name", "matching_algorithm", "match", "document_count")


@@ -74,7 +74,7 @@ class DocumentTypeViewSet(ModelViewSet):
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
     filter_backends = (DjangoFilterBackend, OrderingFilter)
-    filter_class = DocumentTypeFilterSet
+    filterset_class = DocumentTypeFilterSet
     ordering_fields = ("name", "matching_algorithm", "match", "document_count")


@@ -89,7 +89,7 @@ class DocumentViewSet(RetrieveModelMixin,
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
     filter_backends = (DjangoFilterBackend, SearchFilter, OrderingFilter)
-    filter_class = DocumentFilterSet
+    filterset_class = DocumentFilterSet
     search_fields = ("title", "correspondent__name", "content")
     ordering_fields = (
         "id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
@@ -170,7 +170,7 @@ class LogViewSet(ReadOnlyModelViewSet):
     pagination_class = StandardPagination
     permission_classes = (IsAuthenticated,)
     filter_backends = (DjangoFilterBackend, OrderingFilter)
-    filter_class = LogFilterSet
+    filterset_class = LogFilterSet
     ordering_fields = ("created",)


@@ -223,17 +223,16 @@ class SearchAutoCompleteView(APIView):
         if 'term' in request.query_params:
             term = request.query_params['term']
         else:
-            term = None
+            return HttpResponseBadRequest("Term required")

         if 'limit' in request.query_params:
             limit = int(request.query_params['limit'])
+            if limit <= 0:
+                return HttpResponseBadRequest("Invalid limit")
         else:
             limit = 10

-        if term is not None:
-            return Response(index.autocomplete(self.ix, term, limit))
-        else:
-            return Response([])
+        return Response(index.autocomplete(self.ix, term, limit))


 class StatisticsView(APIView):
@@ -1,4 +1,5 @@
 import json
+import math
 import multiprocessing
 import os
 import re
@@ -79,6 +80,7 @@ INSTALLED_APPS = [
     "documents.apps.DocumentsConfig",
     "paperless_tesseract.apps.PaperlessTesseractConfig",
     "paperless_text.apps.PaperlessTextConfig",
+    "paperless_mail.apps.PaperlessMailConfig",

     "django.contrib.admin",

@@ -262,24 +264,58 @@ LOGGING = {
 # Task queue                                                                  #
 ###############################################################################

+# Sensible defaults for multitasking:
+# use a fair balance between worker processes and threads per worker so that
+# both consuming many documents in parallel and consuming large documents is
+# reasonably fast.
+# Favors threads per worker on smaller systems and never exceeds cpu_count()
+# in total.
+
+def default_task_workers():
+    try:
+        return max(
+            math.floor(math.sqrt(multiprocessing.cpu_count())),
+            1
+        )
+    except NotImplementedError:
+        return 1
+
+
+TASK_WORKERS = int(os.getenv("PAPERLESS_TASK_WORKERS", default_task_workers()))
+
 Q_CLUSTER = {
     'name': 'paperless',
     'catch_up': False,
+    'workers': TASK_WORKERS,
     'redis': os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
 }

+
+def default_threads_per_worker():
+    try:
+        return max(
+            math.floor(multiprocessing.cpu_count() / TASK_WORKERS),
+            1
+        )
+    except NotImplementedError:
+        return 1
+
+
+THREADS_PER_WORKER = os.getenv("PAPERLESS_THREADS_PER_WORKER", default_threads_per_worker())
+
 ###############################################################################
 # Paperless Specific Settings                                                 #
 ###############################################################################

+CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))
+
 CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")

 # The default language that tesseract will attempt to use when parsing
 # documents. It should be a 3-letter language code consistent with ISO 639.
 OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")

-# The amount of threads to use for OCR
-OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", multiprocessing.cpu_count()))
-
 # OCR all documents?
 OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
@@ -324,5 +360,6 @@ FILENAME_PARSE_TRANSFORMS = []
 for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
     FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))

+# TODO: this should not have a prefix.
 # Specify the filename format for out files
 PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")
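As a quick sanity check of the arithmetic above (an editor's sketch, not part of the commit): on a hypothetical 16-core machine the defaults resolve to 4 workers with 4 threads each, so the product never exceeds cpu_count().

    # Editor's sketch, assuming cpu_count() == 16 for illustration.
    import math

    cpu_count = 16
    workers = max(math.floor(math.sqrt(cpu_count)), 1)            # -> 4
    threads_per_worker = max(math.floor(cpu_count / workers), 1)  # -> 4
    assert workers * threads_per_worker <= cpu_count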
@@ -1,7 +1,7 @@
-from django.conf.urls import include, url
+from django.conf.urls import include
 from django.contrib import admin
 from django.contrib.auth.decorators import login_required
-from django.urls import path
+from django.urls import path, re_path
 from django.views.decorators.csrf import csrf_exempt
 from django.views.generic import RedirectView
 from rest_framework.routers import DefaultRouter
@@ -30,32 +30,32 @@ api_router.register(r"tags", TagViewSet)
 urlpatterns = [

     # API
-    url(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
-    url(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
-    url(r"^api/search/", SearchView.as_view(), name="search"),
-    url(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
-    url(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
+    re_path(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
+    re_path(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
+    re_path(r"^api/search/", SearchView.as_view(), name="search"),
+    re_path(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
+    re_path(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),

     # Favicon
-    url(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
+    re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),

     # The Django admin
-    url(r"admin/", admin.site.urls),
+    re_path(r"admin/", admin.site.urls),

     # These redirects are here to support clients that use the old FetchView.
-    url(
+    re_path(
         r"^fetch/doc/(?P<pk>\d+)$",
         RedirectView.as_view(url='/api/documents/%(pk)s/download/'),
     ),
-    url(
+    re_path(
         r"^fetch/thumb/(?P<pk>\d+)$",
         RedirectView.as_view(url='/api/documents/%(pk)s/thumb/'),
     ),
-    url(
+    re_path(
         r"^fetch/preview/(?P<pk>\d+)$",
         RedirectView.as_view(url='/api/documents/%(pk)s/preview/'),
     ),
-    url(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
+    re_path(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),

     # Frontend assets TODO: this is pretty bad.
     path('assets/<path:path>', RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
@@ -63,7 +63,7 @@ urlpatterns = [
     path('accounts/', include('django.contrib.auth.urls')),

     # Root of the Frontend
-    url(r".*", login_required(IndexView.as_view())),
+    re_path(r".*", login_required(IndexView.as_view())),

 ]
0 src/paperless_mail/__init__.py Normal file
27 src/paperless_mail/admin.py Normal file
@@ -0,0 +1,27 @@
from django.contrib import admin
from django import forms

from paperless_mail.models import MailAccount, MailRule


class MailAccountForm(forms.ModelForm):

    password = forms.CharField(widget=forms.PasswordInput)

    class Meta:
        fields = '__all__'
        model = MailAccount


class MailAccountAdmin(admin.ModelAdmin):

    list_display = ("name", "imap_server", "username")


class MailRuleAdmin(admin.ModelAdmin):

    list_display = ("name", "account", "folder", "action")


admin.site.register(MailAccount, MailAccountAdmin)
admin.site.register(MailRule, MailRuleAdmin)
7 src/paperless_mail/apps.py Normal file
@@ -0,0 +1,7 @@
from django.apps import AppConfig


class PaperlessMailConfig(AppConfig):

    name = 'paperless_mail'

    verbose_name = 'Paperless Mail'
227 src/paperless_mail/mail.py Normal file
@@ -0,0 +1,227 @@
import os
import tempfile
from datetime import timedelta, date

from django.conf import settings
from django.utils.text import slugify
from django_q.tasks import async_task
from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
    MailboxFolderSelectError

from documents.models import Correspondent
from paperless_mail.models import MailAccount, MailRule


class MailError(Exception):
    pass


class BaseMailAction:

    def get_criteria(self):
        return {}

    def post_consume(self, M, message_uids, parameter):
        pass


class DeleteMailAction(BaseMailAction):

    def post_consume(self, M, message_uids, parameter):
        M.delete(message_uids)


class MarkReadMailAction(BaseMailAction):

    def get_criteria(self):
        return {'seen': False}

    def post_consume(self, M, message_uids, parameter):
        M.seen(message_uids, True)


class MoveMailAction(BaseMailAction):

    def post_consume(self, M, message_uids, parameter):
        M.move(message_uids, parameter)


class FlagMailAction(BaseMailAction):

    def get_criteria(self):
        return {'flagged': False}

    def post_consume(self, M, message_uids, parameter):
        M.flag(message_uids, [MailMessageFlags.FLAGGED], True)


def get_rule_action(rule):
    if rule.action == MailRule.ACTION_FLAG:
        return FlagMailAction()
    elif rule.action == MailRule.ACTION_DELETE:
        return DeleteMailAction()
    elif rule.action == MailRule.ACTION_MOVE:
        return MoveMailAction()
    elif rule.action == MailRule.ACTION_MARK_READ:
        return MarkReadMailAction()
    else:
        raise ValueError("Unknown action.")


def make_criterias(rule):
    maximum_age = date.today() - timedelta(days=rule.maximum_age)
    criterias = {
        "date_gte": maximum_age
    }
    if rule.filter_from:
        criterias["from_"] = rule.filter_from
    if rule.filter_subject:
        criterias["subject"] = rule.filter_subject
    if rule.filter_body:
        criterias["body"] = rule.filter_body

    return {**criterias, **get_rule_action(rule).get_criteria()}


def handle_mail_account(account):

    if account.imap_security == MailAccount.IMAP_SECURITY_NONE:
        mailbox = MailBoxUnencrypted(account.imap_server, account.imap_port)
    elif account.imap_security == MailAccount.IMAP_SECURITY_STARTTLS:
        mailbox = MailBox(account.imap_server, account.imap_port, starttls=True)
    elif account.imap_security == MailAccount.IMAP_SECURITY_SSL:
        mailbox = MailBox(account.imap_server, account.imap_port)
    else:
        raise ValueError("Unknown IMAP security")

    total_processed_files = 0

    with mailbox as M:

        try:
            M.login(account.username, account.password)
        except Exception:
            raise MailError(
                f"Error while authenticating account {account.name}")

        for rule in account.rules.all():

            try:
                M.folder.set(rule.folder)
            except MailboxFolderSelectError:
                raise MailError(
                    f"Rule {rule.name}: Folder {rule.folder} does not exist "
                    f"in account {account.name}")

            criterias = make_criterias(rule)

            try:
                messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
            except Exception:
                raise MailError(
                    f"Rule {rule.name}: Error while fetching folder "
                    f"{rule.folder} of account {account.name}")

            post_consume_messages = []

            for message in messages:
                try:
                    processed_files = handle_message(message, rule)
                except Exception:
                    raise MailError(
                        f"Rule {rule.name}: Error while processing mail "
                        f"{message.uid} of account {account.name}")
                if processed_files > 0:
                    post_consume_messages.append(message.uid)

                total_processed_files += processed_files

            try:
                get_rule_action(rule).post_consume(
                    M,
                    post_consume_messages,
                    rule.action_parameter)

            except Exception:
                raise MailError(
                    f"Rule {rule.name}: Error while processing post-consume "
                    f"actions for account {account.name}")

    return total_processed_files


def get_title(message, att, rule):
    if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
        title = message.subject
    elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME:
        title = os.path.splitext(os.path.basename(att.filename))[0]
    else:
        raise ValueError("Unknown title selector.")

    return title


def get_correspondent(message, rule):
    if rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NOTHING:
        correspondent = None
    elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_EMAIL:
        correspondent_name = message.from_
        correspondent = Correspondent.objects.get_or_create(
            name=correspondent_name, defaults={
                "slug": slugify(correspondent_name)
            })[0]
    elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NAME:
        if message.from_values and \
                'name' in message.from_values \
                and message.from_values['name']:
            correspondent_name = message.from_values['name']
        else:
            correspondent_name = message.from_

        correspondent = Correspondent.objects.get_or_create(
            name=correspondent_name, defaults={
                "slug": slugify(correspondent_name)
            })[0]
    elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_CUSTOM:
        correspondent = rule.assign_correspondent
    else:
        raise ValueError("Unknown correspondent selector")

    return correspondent


def handle_message(message, rule):
    if not message.attachments:
        return 0

    correspondent = get_correspondent(message, rule)
    tag = rule.assign_tag
    doc_type = rule.assign_document_type

    processed_attachments = 0

    for att in message.attachments:

        title = get_title(message, att, rule)

        # TODO: check with parsers what files types are supported
        if att.content_type == 'application/pdf':

            os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
            _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
            with open(temp_filename, 'wb') as f:
                f.write(att.payload)

            async_task(
                "documents.tasks.consume_file",
                path=temp_filename,
                override_filename=att.filename,
                override_title=title,
                override_correspondent_id=correspondent.id if correspondent else None,
                override_document_type_id=doc_type.id if doc_type else None,
                override_tag_ids=[tag.id] if tag else None,
                task_name=f"Mail: {att.filename}"
            )

            processed_attachments += 1

    return processed_attachments
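To make the filtering above concrete, here is an editor's sketch (the rule values are hypothetical, not from the commit) of what make_criterias() would return for a rule with maximum_age=30, filter_subject="Invoice" and the default mark-read action; imap_tools' AND(**criterias) then renders those keys as an IMAP search expression.

    # Editor's sketch; hypothetical rule values.
    # make_criterias(rule) would return roughly:
    #     {
    #         'date_gte': date.today() - timedelta(days=30),
    #         'subject': 'Invoice',
    #         'seen': False,  # contributed by MarkReadMailAction.get_criteria()
    #     }
    # and AND(**criterias) turns this into an IMAP search string along the
    # lines of '(SINCE 17-Oct-2020 SUBJECT "Invoice" UNSEEN)'.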
0 src/paperless_mail/management/__init__.py Normal file
0 src/paperless_mail/management/commands/__init__.py Normal file
13 src/paperless_mail/management/commands/mail_fetcher.py Normal file
@@ -0,0 +1,13 @@
from django.core.management.base import BaseCommand

from paperless_mail import mail, tasks


class Command(BaseCommand):

    help = """
    """.replace("    ", "")

    def handle(self, *args, **options):

        tasks.process_mail_accounts()
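The command takes no arguments; besides the scheduler, a one-off fetch can also be triggered programmatically through Django's standard call_command API (an editor's sketch, not part of the commit):

    # Editor's sketch: run a single mail fetch, e.g. from a Django shell.
    from django.core.management import call_command

    call_command("mail_fetcher")  # invokes tasks.process_mail_accounts() once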
48 src/paperless_mail/migrations/0001_initial.py Normal file
@@ -0,0 +1,48 @@
# Generated by Django 3.1.3 on 2020-11-15 22:54

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

    initial = True

    dependencies = [
        ('documents', '1002_auto_20201111_1105'),
    ]

    operations = [
        migrations.CreateModel(
            name='MailAccount',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=256, unique=True)),
                ('imap_server', models.CharField(max_length=256)),
                ('imap_port', models.IntegerField(blank=True, null=True)),
                ('imap_security', models.PositiveIntegerField(choices=[(1, 'No encryption'), (2, 'Use SSL'), (3, 'Use STARTTLS')], default=2)),
                ('username', models.CharField(max_length=256)),
                ('password', models.CharField(max_length=256)),
            ],
        ),
        migrations.CreateModel(
            name='MailRule',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=256)),
                ('folder', models.CharField(default='INBOX', max_length=256)),
                ('filter_from', models.CharField(blank=True, max_length=256, null=True)),
                ('filter_subject', models.CharField(blank=True, max_length=256, null=True)),
                ('filter_body', models.CharField(blank=True, max_length=256, null=True)),
                ('maximum_age', models.PositiveIntegerField(default=30)),
                ('action', models.PositiveIntegerField(choices=[(1, 'Delete'), (2, 'Move to specified folder'), (3, "Mark as read, don't process read mails"), (4, "Flag the mail, don't process flagged mails")], default=3, help_text='The action applied to the mail. This action is only performed when documents were consumed from the mail. Mails without attachments will remain entirely untouched.')),
                ('action_parameter', models.CharField(blank=True, help_text='Additional parameter for the action selected above, i.e., the target folder of the move to folder action.', max_length=256, null=True)),
                ('assign_title_from', models.PositiveIntegerField(choices=[(1, 'Use subject as title'), (2, 'Use attachment filename as title')], default=1)),
                ('assign_correspondent_from', models.PositiveIntegerField(choices=[(1, 'Do not assign a correspondent'), (2, 'Use mail address'), (3, 'Use name (or mail address if not available)'), (4, 'Use correspondent selected below')], default=1)),
                ('account', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='rules', to='paperless_mail.mailaccount')),
                ('assign_correspondent', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.correspondent')),
                ('assign_document_type', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.documenttype')),
                ('assign_tag', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.tag')),
            ],
        ),
    ]
32 src/paperless_mail/migrations/0002_auto_20201117_1334.py Normal file
@@ -0,0 +1,32 @@
# Generated by Django 3.1.3 on 2020-11-17 13:34

from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule


def add_schedules(apps, schema_editor):
    schedule('paperless_mail.tasks.process_mail_accounts',
             name="Check all e-mail accounts",
             schedule_type=Schedule.MINUTES,
             minutes=10)


def remove_schedules(apps, schema_editor):
    Schedule.objects.filter(
        func='paperless_mail.tasks.process_mail_accounts').delete()


class Migration(migrations.Migration):

    dependencies = [
        ('paperless_mail', '0001_initial'),
        ('django_q', '0013_task_attempt_count'),
    ]

    operations = [
        RunPython(add_schedules, remove_schedules)
    ]
0 src/paperless_mail/migrations/__init__.py Normal file
137 src/paperless_mail/models.py Normal file
@@ -0,0 +1,137 @@
from django.db import models

import documents.models as document_models


class MailAccount(models.Model):

    IMAP_SECURITY_NONE = 1
    IMAP_SECURITY_SSL = 2
    IMAP_SECURITY_STARTTLS = 3

    IMAP_SECURITY_OPTIONS = (
        (IMAP_SECURITY_NONE, "No encryption"),
        (IMAP_SECURITY_SSL, "Use SSL"),
        (IMAP_SECURITY_STARTTLS, "Use STARTTLS"),
    )

    name = models.CharField(max_length=256, unique=True)

    imap_server = models.CharField(max_length=256)

    imap_port = models.IntegerField(blank=True, null=True)

    imap_security = models.PositiveIntegerField(
        choices=IMAP_SECURITY_OPTIONS,
        default=IMAP_SECURITY_SSL
    )

    username = models.CharField(max_length=256)

    password = models.CharField(max_length=256)

    def __str__(self):
        return self.name


class MailRule(models.Model):

    ACTION_DELETE = 1
    ACTION_MOVE = 2
    ACTION_MARK_READ = 3
    ACTION_FLAG = 4

    ACTIONS = (
        (ACTION_DELETE, "Delete"),
        (ACTION_MOVE, "Move to specified folder"),
        (ACTION_MARK_READ, "Mark as read, don't process read mails"),
        (ACTION_FLAG, "Flag the mail, don't process flagged mails")
    )

    TITLE_FROM_SUBJECT = 1
    TITLE_FROM_FILENAME = 2

    TITLE_SELECTOR = (
        (TITLE_FROM_SUBJECT, "Use subject as title"),
        (TITLE_FROM_FILENAME, "Use attachment filename as title")
    )

    CORRESPONDENT_FROM_NOTHING = 1
    CORRESPONDENT_FROM_EMAIL = 2
    CORRESPONDENT_FROM_NAME = 3
    CORRESPONDENT_FROM_CUSTOM = 4

    CORRESPONDENT_SELECTOR = (
        (CORRESPONDENT_FROM_NOTHING, "Do not assign a correspondent"),
        (CORRESPONDENT_FROM_EMAIL, "Use mail address"),
        (CORRESPONDENT_FROM_NAME, "Use name (or mail address if not available)"),
        (CORRESPONDENT_FROM_CUSTOM, "Use correspondent selected below")
    )

    name = models.CharField(max_length=256)

    account = models.ForeignKey(
        MailAccount,
        related_name="rules",
        on_delete=models.CASCADE
    )

    folder = models.CharField(default='INBOX', max_length=256)

    filter_from = models.CharField(max_length=256, null=True, blank=True)
    filter_subject = models.CharField(max_length=256, null=True, blank=True)
    filter_body = models.CharField(max_length=256, null=True, blank=True)

    maximum_age = models.PositiveIntegerField(default=30)

    action = models.PositiveIntegerField(
        choices=ACTIONS,
        default=ACTION_MARK_READ,
        help_text="The action applied to the mail. This action is only "
                  "performed when documents were consumed from the mail. "
                  "Mails without attachments will remain entirely "
                  "untouched."
    )

    action_parameter = models.CharField(
        max_length=256, blank=True, null=True,
        help_text="Additional parameter for the action selected above, i.e., "
                  "the target folder of the move to folder action."
    )

    assign_title_from = models.PositiveIntegerField(
        choices=TITLE_SELECTOR,
        default=TITLE_FROM_SUBJECT
    )

    assign_tag = models.ForeignKey(
        document_models.Tag,
        null=True,
        blank=True,
        on_delete=models.SET_NULL
    )

    assign_document_type = models.ForeignKey(
        document_models.DocumentType,
        null=True,
        blank=True,
        on_delete=models.SET_NULL
    )

    assign_correspondent_from = models.PositiveIntegerField(
        choices=CORRESPONDENT_SELECTOR,
        default=CORRESPONDENT_FROM_NOTHING
    )

    assign_correspondent = models.ForeignKey(
        document_models.Correspondent,
        null=True,
        blank=True,
        on_delete=models.SET_NULL
    )

    def __str__(self):
        return self.name
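For illustration, an editor's sketch of how these two models are wired together through the ORM (server, credentials and names are made up):

    # Editor's sketch; hypothetical values throughout.
    from paperless_mail.models import MailAccount, MailRule

    account = MailAccount.objects.create(
        name="example",
        imap_server="imap.example.com",
        imap_security=MailAccount.IMAP_SECURITY_SSL,
        username="user",
        password="secret",
    )
    MailRule.objects.create(
        name="invoices",
        account=account,
        folder="INBOX",
        filter_subject="Invoice",
        action=MailRule.ACTION_MARK_READ,
        assign_title_from=MailRule.TITLE_FROM_SUBJECT,
    )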
23 src/paperless_mail/tasks.py Normal file
@@ -0,0 +1,23 @@
import logging

from paperless_mail import mail
from paperless_mail.models import MailAccount


def process_mail_accounts():
    total_new_documents = 0
    for account in MailAccount.objects.all():
        total_new_documents += mail.handle_mail_account(account)

    if total_new_documents > 0:
        return f"Added {total_new_documents} document(s)."
    else:
        return "No new documents were added."


def process_mail_account(name):
    # Django managers have no .find(); filter().first() returns one
    # matching account or None.
    account = MailAccount.objects.filter(name=name).first()
    if account:
        mail.handle_mail_account(account)
    else:
        logging.error("Unknown mail account: {}".format(name))
0 src/paperless_mail/tests/__init__.py Normal file
352 src/paperless_mail/tests/test_mail.py Normal file
@@ -0,0 +1,352 @@
import uuid
from collections import namedtuple
from typing import ContextManager
from unittest import mock

from django.test import TestCase
from imap_tools import MailMessageFlags, MailboxFolderSelectError

from documents.models import Correspondent
from paperless_mail.mail import get_correspondent, get_title, handle_message, handle_mail_account, MailError
from paperless_mail.models import MailRule, MailAccount


class BogusFolderManager:

    current_folder = "INBOX"

    def set(self, new_folder):
        if new_folder not in ["INBOX", "spam"]:
            raise MailboxFolderSelectError(None, "uhm")
        self.current_folder = new_folder


class BogusMailBox(ContextManager):

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __init__(self):
        self.messages = []
        self.messages_spam = []

    def login(self, username, password):
        if not (username == 'admin' and password == 'secret'):
            raise Exception()

    folder = BogusFolderManager()

    def fetch(self, criteria, mark_seen):
        msg = self.messages

        criteria = str(criteria).strip('()').split(" ")

        if 'UNSEEN' in criteria:
            msg = filter(lambda m: not m.seen, msg)

        if 'SUBJECT' in criteria:
            subject = criteria[criteria.index('SUBJECT') + 1].strip('"')
            msg = filter(lambda m: subject in m.subject, msg)

        if 'BODY' in criteria:
            body = criteria[criteria.index('BODY') + 1].strip('"')
            msg = filter(lambda m: body in m.body, msg)

        if 'FROM' in criteria:
            from_ = criteria[criteria.index('FROM') + 1].strip('"')
            msg = filter(lambda m: from_ in m.from_, msg)

        if 'UNFLAGGED' in criteria:
            msg = filter(lambda m: not m.flagged, msg)

        return list(msg)

    def seen(self, uid_list, seen_val):
        for message in self.messages:
            if message.uid in uid_list:
                message.seen = seen_val

    def delete(self, uid_list):
        self.messages = list(filter(lambda m: m.uid not in uid_list, self.messages))

    def flag(self, uid_list, flag_set, value):
        for message in self.messages:
            if message.uid in uid_list:
                for flag in flag_set:
                    if flag == MailMessageFlags.FLAGGED:
                        message.flagged = value

    def move(self, uid_list, folder):
        if folder == "spam":
            self.messages_spam.append(
                filter(lambda m: m.uid in uid_list, self.messages)
            )
            self.messages = list(
                filter(lambda m: m.uid not in uid_list, self.messages)
            )
        else:
            raise Exception()


def create_message(num_attachments=1, body="", subject="the subject", from_="noone@mail.com", seen=False, flagged=False):
    message = namedtuple('MailMessage', [])

    message.uid = uuid.uuid4()
    message.subject = subject
    message.attachments = []
    message.from_ = from_
    message.body = body
    for i in range(num_attachments):
        attachment = namedtuple('Attachment', [])
        attachment.filename = 'some_file.pdf'
        attachment.content_type = 'application/pdf'
        attachment.payload = b'content of the attachment'
        message.attachments.append(attachment)

    message.seen = seen
    message.flagged = flagged

    return message


class TestMail(TestCase):

    def setUp(self):
        patcher = mock.patch('paperless_mail.mail.MailBox')
        m = patcher.start()
        self.bogus_mailbox = BogusMailBox()
        m.return_value = self.bogus_mailbox
        self.addCleanup(patcher.stop)

        patcher = mock.patch('paperless_mail.mail.async_task')
        self.async_task = patcher.start()
        self.addCleanup(patcher.stop)

        self.reset_bogus_mailbox()

    def reset_bogus_mailbox(self):
        self.bogus_mailbox.messages = []
        self.bogus_mailbox.messages_spam = []
        self.bogus_mailbox.messages.append(create_message(subject="Invoice 1", from_="amazon@amazon.de", body="cables", seen=True, flagged=False))
        self.bogus_mailbox.messages.append(create_message(subject="Invoice 2", body="from my favorite electronic store", seen=False, flagged=True))
        self.bogus_mailbox.messages.append(create_message(subject="Claim your $10M price now!", from_="amazon@amazon-some-indian-site.org", seen=False))

    def test_get_correspondent(self):
        message = namedtuple('MailMessage', [])
        message.from_ = "someone@somewhere.com"
        message.from_values = {'name': "Someone!", 'email': "someone@somewhere.com"}

        message2 = namedtuple('MailMessage', [])
        message2.from_ = "me@localhost.com"
        message2.from_values = {'name': "", 'email': "fake@localhost.com"}

        me_localhost = Correspondent.objects.create(name=message2.from_)
        someone_else = Correspondent.objects.create(name="someone else")

        rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING)
        self.assertIsNone(get_correspondent(message, rule))

        rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)
        c = get_correspondent(message, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "someone@somewhere.com")
        c = get_correspondent(message2, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "me@localhost.com")
        self.assertEqual(c.id, me_localhost.id)

        rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME)
        c = get_correspondent(message, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "Someone!")
        c = get_correspondent(message2, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.id, me_localhost.id)

        rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else)
        c = get_correspondent(message, rule)
        self.assertEqual(c, someone_else)

    def test_get_title(self):
        message = namedtuple('MailMessage', [])
        message.subject = "the message title"
        att = namedtuple('Attachment', [])
        att.filename = "this_is_the_file.pdf"
        rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME)
        self.assertEqual(get_title(message, att, rule), "this_is_the_file")
        rule = MailRule(assign_title_from=MailRule.TITLE_FROM_SUBJECT)
        self.assertEqual(get_title(message, att, rule), "the message title")

    def test_handle_message(self):
        message = namedtuple('MailMessage', [])
        message.subject = "the message title"

        att = namedtuple('Attachment', [])
        att.filename = "test1.pdf"
        att.content_type = 'application/pdf'
        att.payload = b"attachment contents"

        att2 = namedtuple('Attachment', [])
        att2.filename = "test2.pdf"
        att2.content_type = 'application/pdf'
        att2.payload = b"attachment contents"

        att3 = namedtuple('Attachment', [])
        att3.filename = "test3.pdf"
        att3.content_type = 'application/invalid'
        att3.payload = b"attachment contents"

        message.attachments = [att, att2, att3]

        rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME)

        result = handle_message(message, rule)

        self.assertEqual(result, 2)

        self.assertEqual(len(self.async_task.call_args_list), 2)

        args1, kwargs1 = self.async_task.call_args_list[0]
        args2, kwargs2 = self.async_task.call_args_list[1]

        self.assertEqual(kwargs1['override_title'], "test1")
        self.assertEqual(kwargs1['override_filename'], "test1.pdf")

        self.assertEqual(kwargs2['override_title'], "test2")
        self.assertEqual(kwargs2['override_filename'], "test2.pdf")

    @mock.patch("paperless_mail.mail.async_task")
    def test_handle_empty_message(self, m):
        message = namedtuple('MailMessage', [])

        message.attachments = []
        rule = MailRule()

        result = handle_message(message, rule)

        self.assertFalse(m.called)
        self.assertEqual(result, 0)

    def test_handle_mail_account_mark_read(self):

        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MARK_READ)

        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 2)
        handle_mail_account(account)
        self.assertEqual(self.async_task.call_count, 2)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 0)

    def test_handle_mail_account_delete(self):

        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_DELETE, filter_subject="Invoice")

        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        handle_mail_account(account)
        self.assertEqual(self.async_task.call_count, 2)
        self.assertEqual(len(self.bogus_mailbox.messages), 1)

    def test_handle_mail_account_flag(self):
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_FLAG, filter_subject="Invoice")

        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2)
        handle_mail_account(account)
        self.assertEqual(self.async_task.call_count, 1)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 1)

    def test_handle_mail_account_move(self):
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="spam", filter_subject="Claim")

        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.assertEqual(len(self.bogus_mailbox.messages_spam), 0)
        handle_mail_account(account)
        self.assertEqual(self.async_task.call_count, 1)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)

    def test_errors(self):
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong")

        try:
            handle_mail_account(account)
        except MailError as e:
            self.assertTrue(str(e).startswith("Error while authenticating account"))
        else:
            self.fail("Should raise exception")

        account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
        rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh")

        try:
            handle_mail_account(account)
        except MailError as e:
            self.assertTrue("uuuh does not exist" in str(e))
        else:
            self.fail("Should raise exception")

        account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim")

        try:
            handle_mail_account(account)
        except MailError as e:
            self.assertTrue("Error while processing post-consume actions" in str(e))
        else:
            self.fail("Should raise exception")

    def test_filters(self):

        account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")
        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_DELETE, filter_subject="Claim")

        self.assertEqual(self.async_task.call_count, 0)

        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(self.async_task.call_count, 1)

        self.reset_bogus_mailbox()

        rule.filter_subject = None
        rule.filter_body = "electronic"
        rule.save()
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(self.async_task.call_count, 2)

        self.reset_bogus_mailbox()

        rule.filter_from = "amazon"
        rule.filter_body = None
        rule.save()
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 1)
        self.assertEqual(self.async_task.call_count, 4)

        self.reset_bogus_mailbox()

        rule.filter_from = "amazon"
        rule.filter_body = "cables"
        rule.filter_subject = "Invoice"
        rule.save()
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(self.async_task.call_count, 5)
3 src/paperless_mail/views.py Normal file
@@ -0,0 +1,3 @@
from django.shortcuts import render

# Create your views here.
@@ -1,5 +1,7 @@
 from django.apps import AppConfig

+from paperless_tesseract.signals import tesseract_consumer_declaration
+

 class PaperlessTesseractConfig(AppConfig):
@@ -9,8 +11,6 @@ class PaperlessTesseractConfig(AppConfig):

         from documents.signals import document_consumer_declaration

-        from .signals import ConsumerDeclaration
-
-        document_consumer_declaration.connect(ConsumerDeclaration.handle)
+        document_consumer_declaration.connect(tesseract_consumer_declaration)

         AppConfig.ready(self)
src/paperless_tesseract/parsers.py
@@ -2,7 +2,7 @@ import itertools
 import os
 import re
 import subprocess
-from multiprocessing.pool import Pool
+from multiprocessing.pool import ThreadPool
 
 import langdetect
 import pdftotext
@@ -151,7 +151,7 @@ class RasterisedDocumentParser(DocumentParser):
         self.log("info", "Running unpaper on {} pages...".format(len(pnms)))
 
         # Run unpaper in parallel on converted images
-        with Pool(processes=settings.OCR_THREADS) as pool:
+        with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
             pnms = pool.map(run_unpaper, pnms)
 
         return sorted(filter(lambda __: os.path.isfile(__), pnms))
@@ -166,7 +166,7 @@ class RasterisedDocumentParser(DocumentParser):
 
     def _ocr(self, imgs, lang):
         self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
-        with Pool(processes=settings.OCR_THREADS) as pool:
+        with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
             r = pool.map(image_to_string, itertools.product(imgs, [lang]))
             return r
 
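The switch from Pool to ThreadPool is reasonable here because each worker mostly blocks on an external unpaper or tesseract process, during which the GIL is released, so threads parallelize this work without the cost of forking. A self-contained sketch of the pattern; the file names and thread count are made up:

import subprocess
from multiprocessing.pool import ThreadPool

def run_unpaper(pnm):
    # Each call blocks on an external process, so threads are enough.
    out = pnm + ".unpapered.pnm"
    subprocess.run(["unpaper", pnm, out], check=False)
    return out

pages = ["page0001.pnm", "page0002.pnm", "page0003.pnm"]
with ThreadPool(processes=4) as pool:
    cleaned = pool.map(run_unpaper, pages)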
src/paperless_tesseract/signals.py
@@ -3,21 +3,16 @@ import re
 from .parsers import RasterisedDocumentParser
 
 
-class ConsumerDeclaration:
-
-    MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
-
-    @classmethod
-    def handle(cls, sender, **kwargs):
-        return cls.test
-
-    @classmethod
-    def test(cls, doc):
-
-        if cls.MATCHING_FILES.match(doc.lower()):
-            return {
-                "parser": RasterisedDocumentParser,
-                "weight": 0
-            }
-
-        return None
+def tesseract_consumer_declaration(sender, **kwargs):
+    return {
+        "parser": RasterisedDocumentParser,
+        "weight": 0,
+        "test": tesseract_consumer_test
+    }
+
+
+MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
+
+
+def tesseract_consumer_test(doc):
+    return MATCHING_FILES.match(doc.lower())
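With the declaration now a plain dict of parser, weight and test, the documents app can rank candidate parsers per file. A hedged sketch of what that selection could look like; get_parser_class is illustrative, not the actual documents-app code, and it assumes the highest weight takes precedence:

def get_parser_class(path, responses):
    # responses as returned by document_consumer_declaration.send():
    # a list of (receiver, declaration) tuples.
    candidates = [d for _, d in responses if d["test"](path)]
    if not candidates:
        return None
    # Assuming the highest weight wins, text (weight 10) would beat
    # tesseract (weight 0) if both ever claimed the same file.
    return max(candidates, key=lambda d: d["weight"])["parser"]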
src/paperless_tesseract/tests/test_signals.py
@@ -1,6 +1,6 @@
 from django.test import TestCase
 
-from ..signals import ConsumerDeclaration
+from paperless_tesseract.signals import tesseract_consumer_test
 
 
 class SignalsTestCase(TestCase):
@@ -20,7 +20,7 @@ class SignalsTestCase(TestCase):
         for prefix in prefixes:
             for suffix in suffixes:
                 name = "{}.{}".format(prefix, suffix)
-                self.assertTrue(ConsumerDeclaration.test(name))
+                self.assertTrue(tesseract_consumer_test(name))
 
     def test_test_handles_various_file_names_false(self):
 
@@ -30,7 +30,7 @@ class SignalsTestCase(TestCase):
         for prefix in prefixes:
             for suffix in suffixes:
                 name = "{}.{}".format(prefix, suffix)
-                self.assertFalse(ConsumerDeclaration.test(name))
+                self.assertFalse(tesseract_consumer_test(name))
 
-        self.assertFalse(ConsumerDeclaration.test(""))
-        self.assertFalse(ConsumerDeclaration.test("doc"))
+        self.assertFalse(tesseract_consumer_test(""))
+        self.assertFalse(tesseract_consumer_test("doc"))
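Illustrative only: the prefix and suffix lists live earlier in the test file and are not part of this hunk, so the values below are stand-ins that should satisfy the tesseract matcher above:

from paperless_tesseract.signals import tesseract_consumer_test

prefixes = ["doc", "My_Document", "with spaces"]  # assumed examples
suffixes = ["pdf", "jpg", "tiff", "pnm"]          # all match the regex

for prefix in prefixes:
    for suffix in suffixes:
        name = "{}.{}".format(prefix, suffix)
        assert tesseract_consumer_test(name)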
src/paperless_text/apps.py
@@ -1,5 +1,7 @@
 from django.apps import AppConfig
 
+from paperless_text.signals import text_consumer_declaration
+
 
 class PaperlessTextConfig(AppConfig):
 
@@ -9,8 +11,6 @@ class PaperlessTextConfig(AppConfig):
         from documents.signals import document_consumer_declaration
 
-        from .signals import ConsumerDeclaration
-
-        document_consumer_declaration.connect(ConsumerDeclaration.handle)
+        document_consumer_declaration.connect(text_consumer_declaration)
 
         AppConfig.ready(self)
 
src/paperless_text/signals.py
@@ -3,21 +3,16 @@ import re
 from .parsers import TextDocumentParser
 
 
-class ConsumerDeclaration:
-
-    MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
-
-    @classmethod
-    def handle(cls, sender, **kwargs):
-        return cls.test
-
-    @classmethod
-    def test(cls, doc):
-
-        if cls.MATCHING_FILES.match(doc.lower()):
-            return {
-                "parser": TextDocumentParser,
-                "weight": 10
-            }
-
-        return None
+def text_consumer_declaration(sender, **kwargs):
+    return {
+        "parser": TextDocumentParser,
+        "weight": 10,
+        "test": text_consumer_test
+    }
+
+
+MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
+
+
+def text_consumer_test(doc):
+    return MATCHING_FILES.match(doc.lower())
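The text parser's matcher can be sanity-checked directly; these example file names are made up but follow from the regex above:

import re

MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")

assert MATCHING_FILES.match("notes.md")
assert MATCHING_FILES.match("export.csv")
assert MATCHING_FILES.match("readme.txt")
assert MATCHING_FILES.match("readme.text")   # te?xt also covers "text"
assert not MATCHING_FILES.match("scan.pdf")  # handled by the tesseract parser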
src/setup.cfg
@@ -6,7 +6,6 @@ ignore = E501
 DJANGO_SETTINGS_MODULE=paperless.settings
 addopts = --pythonwarnings=all
 env =
-    PAPERLESS_PASSPHRASE=THISISNOTASECRET
     PAPERLESS_SECRET=paperless
     PAPERLESS_EMAIL_SECRET=paperless
 
@@ -15,4 +14,4 @@ env =
 source =
     ./
 omit =
-    */tests
+    */tests/*
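The omit change matters because coverage.py matches these globs against full file paths: "*/tests" only matches a path that ends at the directory name itself, while "*/tests/*" excludes every file inside any tests directory. A quick illustration with fnmatch-style globbing; the example path is made up:

from fnmatch import fnmatch

path = "documents/tests/test_consumer.py"
print(fnmatch(path, "*/tests"))    # False: files under tests/ were not excluded
print(fnmatch(path, "*/tests/*"))  # True: files inside tests/ are now excluded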