mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge branch 'dev'
This commit is contained in:
commit
8395bdfdf6
@ -1,3 +1,4 @@
|
||||
/src-ui/.vscode
|
||||
/src-ui/node_modules
|
||||
/src-ui/dist
|
||||
.git
|
||||
@ -5,3 +6,7 @@
|
||||
/consume
|
||||
/media
|
||||
/data
|
||||
/docs
|
||||
.pytest_cache
|
||||
/dist
|
||||
/scripts
|
||||
|
@ -5,23 +5,18 @@ python:
|
||||
- "3.7"
|
||||
- "3.8"
|
||||
|
||||
services:
|
||||
- docker
|
||||
|
||||
before_install:
|
||||
- sudo apt-get update -qq
|
||||
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr
|
||||
|
||||
install:
|
||||
- pip install --upgrade pipenv
|
||||
- pipenv install --dev
|
||||
- pipenv install --system --dev
|
||||
|
||||
script:
|
||||
- cd src/
|
||||
- pipenv run pytest --cov
|
||||
- pipenv run pycodestyle
|
||||
- cd ..
|
||||
- docker build --tag=jonaswinkler/paperless-ng .
|
||||
|
||||
after_success:
|
||||
- pipenv run coveralls
|
||||
|
1
Pipfile
1
Pipfile
@ -29,6 +29,7 @@ watchdog = "*"
|
||||
pathvalidate = "*"
|
||||
django-q = "*"
|
||||
redis = "*"
|
||||
imap-tools = "*"
|
||||
|
||||
[dev-packages]
|
||||
coveralls = "*"
|
||||
|
10
Pipfile.lock
generated
10
Pipfile.lock
generated
@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "c0dfeedbac2e9b705267336349e6f72ba650ff9184affae06046db32299e2c87"
|
||||
"sha256": "d6416e6844126b09200b9839a3abdcf3c24ef5cf70052b8f134d8bc804552c17"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {},
|
||||
@ -123,6 +123,14 @@
|
||||
"index": "pypi",
|
||||
"version": "==20.0.4"
|
||||
},
|
||||
"imap-tools": {
|
||||
"hashes": [
|
||||
"sha256:070929b8ec429c0aad94588a37a2962eed656a119ab61dcf91489f20fe983f5d",
|
||||
"sha256:6232cd43748741496446871e889eb137351fc7a7e7f4c7888cd8c0fa28e20cda"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.31.0"
|
||||
},
|
||||
"joblib": {
|
||||
"hashes": [
|
||||
"sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72",
|
||||
|
BIN
docs/_static/paperless-11-mail-filters.png
vendored
Normal file
BIN
docs/_static/paperless-11-mail-filters.png
vendored
Normal file
Binary file not shown.
After Width: | Height: | Size: 70 KiB |
BIN
docs/_static/recommended_workflow.png
vendored
Normal file
BIN
docs/_static/recommended_workflow.png
vendored
Normal file
Binary file not shown.
After Width: | Height: | Size: 67 KiB |
@ -294,10 +294,14 @@ Documents can be stored in Paperless using GnuPG encryption.
|
||||
|
||||
.. danger::
|
||||
|
||||
Decryption is deprecated since paperless-ng 1.0 and doesn't really provide any
|
||||
Decryption is deprecated since paperless-ng 0.9 and doesn't really provide any
|
||||
additional security, since you have to store the passphrase in a configuration
|
||||
file on the same system as the encrypted documents for paperless to work. Also,
|
||||
paperless provides transparent access to your encrypted documents.
|
||||
file on the same system as the encrypted documents for paperless to work.
|
||||
Furthermore, the entire text content of the documents is stored plain in the
|
||||
database, even if your documents are encrypted. Filenames are not encrypted
|
||||
either.
|
||||
|
||||
Also, the web server provides transparent access to your encrypted documents.
|
||||
|
||||
Consider running paperless on an encrypted filesystem instead, which will then
|
||||
at least provide security against physical hardware theft.
|
||||
|
171
docs/api.rst
171
docs/api.rst
@ -3,25 +3,168 @@
|
||||
The REST API
|
||||
************
|
||||
|
||||
.. warning::
|
||||
|
||||
This section is not updated to paperless-ng yet.
|
||||
|
||||
Paperless makes use of the `Django REST Framework`_ standard API interface
|
||||
because of its inherent awesomeness. Conveniently, the system is also
|
||||
self-documenting, so to learn more about the access points, schema, what's
|
||||
accepted and what isn't, you need only visit ``/api`` on your local Paperless
|
||||
installation.
|
||||
Paperless makes use of the `Django REST Framework`_ standard API interface.
|
||||
It provides a browsable API for most of its endpoints, which you can inspect
|
||||
at ``http://<paperless-host>:<port>/api/``. This also documents most of the
|
||||
available filters and ordering fields.
|
||||
|
||||
.. _Django REST Framework: http://django-rest-framework.org/
|
||||
|
||||
The API provides 5 main endpoints:
|
||||
|
||||
* ``/api/correspondents/``: Full CRUD support.
|
||||
* ``/api/document_types/``: Full CRUD support.
|
||||
* ``/api/documents/``: Full CRUD support, except POSTing new documents. See below.
|
||||
* ``/api/logs/``: Read-Only.
|
||||
* ``/api/tags/``: Full CRUD support.
|
||||
|
||||
All of these endpoints except for the logging endpoint
|
||||
allow you to fetch, edit and delete individual objects
|
||||
by appending their primary key to the path, for example ``/api/documents/454/``.
|
||||
|
||||
In addition to that, the document endpoint offers these additional actions on
|
||||
individual documents:
|
||||
|
||||
* ``/api/documents/<pk>/download/``: Download the original document.
|
||||
* ``/api/documents/<pk>/thumb/``: Download the PNG thumbnail of a document.
|
||||
* ``/api/documents/<pk>/preview/``: Display the original document inline,
|
||||
without downloading it.
|
||||
|
||||
.. hint::
|
||||
|
||||
Paperless used to provide this functionality at ``/fetch/<pk>/preview``,
|
||||
``/fetch/<pk>/thumb`` and ``/fetch/<pk>/doc``. Redirects to the new URLs
|
||||
are in place. However, if you use these old URLs to access documents, you
|
||||
should update your app or script to use the new URLs.
|
||||
|
||||
Searching for documents
|
||||
#######################
|
||||
|
||||
Paperless-ng offers API endpoints for full text search. These are as follows:
|
||||
|
||||
``/api/search/``
|
||||
================
|
||||
|
||||
Get search results based on a query.
|
||||
|
||||
Query parameters:
|
||||
|
||||
* ``query``: The query string. See
|
||||
`here <https://whoosh.readthedocs.io/en/latest/querylang.html>`_
|
||||
for details on the syntax.
|
||||
* ``page``: Specify the page you want to retrieve. Each page
|
||||
contains 10 search results and the first page is ``page=1``, which
|
||||
is the default if this is omitted.
|
||||
|
||||
Result list object returned by the endpoint:
|
||||
|
||||
.. code:: json
|
||||
|
||||
{
|
||||
"count": 1,
|
||||
"page": 1,
|
||||
"page_count": 1,
|
||||
"results": [
|
||||
|
||||
]
|
||||
}
|
||||
|
||||
* ``count``: The approximate total number of results.
|
||||
* ``page``: The page returned to you. This might be different from
|
||||
the page you requested, if you requested a page that is behind
|
||||
the last page. In that case, the last page is returned.
|
||||
* ``page_count``: The total number of pages.
|
||||
* ``results``: A list of result objects on the current page.
|
||||
|
||||
Result object:
|
||||
|
||||
.. code:: json
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"highlights": [
|
||||
|
||||
],
|
||||
"score": 6.34234,
|
||||
"rank": 23,
|
||||
"document": {
|
||||
|
||||
}
}
|
||||
|
||||
* ``id``: the primary key of the found document
|
||||
* ``highlights``: an object containing parseable highlights for the result.
|
||||
See below.
|
||||
* ``score``: The score assigned to the document. A higher score indicates a
|
||||
better match with the query. Search results are sorted descending by score.
|
||||
* ``rank``: the position of the document within the entire search results list.
|
||||
* ``document``: The full json of the document, as returned by
|
||||
``/api/documents/<id>/``.
|
||||
|
||||
Highlights object:
|
||||
|
||||
Highlights are provided as a list of fragments. A fragment is a longer section of
|
||||
text from the original document.
|
||||
Each fragment contains a list of strings, and some of them are marked as a highlight.
|
||||
|
||||
.. code:: json
|
||||
|
||||
"highlights": [
|
||||
[
|
||||
{"text": "This is a sample text with a "},
|
||||
{"text": "highlighted", "term": 0},
|
||||
{"text": " word."}
|
||||
],
|
||||
[
|
||||
{"text": "Another", "term": 1},
|
||||
{"text": " fragment with a highlight."}
|
||||
]
|
||||
]
|
||||
|
||||
When ``term`` is present within a string, the word within ``text`` should be highlighted.
|
||||
The term index groups multiple matches together and words with the same index
|
||||
should get identical highlighting.
|
||||
A client may use this example to produce the following output:
|
||||
|
||||
... This is a sample text with a **highlighted** word. ... **Another** fragment with a highlight. ...
|
||||
|
||||
``/api/search/autocomplete/``
|
||||
=============================
|
||||
|
||||
Get auto completions for a partial search term.
|
||||
|
||||
Query parameters:
|
||||
|
||||
* ``term``: The incomplete term.
|
||||
* ``limit``: Amount of results. Defaults to 10.
|
||||
|
||||
Results returned by the endpoint are ordered by importance of the term in the
|
||||
document index. The first result is the term that has the highest Tf/Idf score
|
||||
in the index.
|
||||
|
||||
.. code:: json
|
||||
|
||||
[
|
||||
"term1",
|
||||
"term3",
|
||||
"term6",
|
||||
"term4"
|
||||
]
|
||||
|
||||
|
||||
.. _api-file_uploads:
|
||||
|
||||
POSTing Documents
|
||||
=================
|
||||
POSTing documents
|
||||
#################
|
||||
|
||||
File uploads in an API are hard and so far as I've been able to tell, there's
|
||||
no standard way of accepting them, so rather than crowbar file uploads into the
|
||||
REST API and endure that headache, I've left that process to a simple HTTP
|
||||
POST.
|
||||
The API provides a special endpoint for file uploads:
|
||||
|
||||
``/api/documents/post_document/``
|
||||
|
||||
POST a multipart form to this endpoint, where the form field ``document`` contains
|
||||
the document that you want to upload to paperless. The filename is sanitized and
|
||||
then used to store the document in the consumption folder, where the consumer will
|
||||
detect the document and process it as any other document.
|
||||
|
||||
The endpoint will immediately return "OK." if the document was stored in the
|
||||
consumption directory.
|
||||
|
@ -8,10 +8,8 @@ Changelog
|
||||
paperless-ng 0.9.0
|
||||
##################
|
||||
|
||||
* **Deprecated:** GnuPG. Don't use it. If you're still using it, be aware that it
|
||||
offers no protection at all, since the passphrase is stored alongside with the
|
||||
encrypted documents itself. This features will most likely be removed in future
|
||||
versions.
|
||||
* **Deprecated:** GnuPG. :ref:`See this note on the state of GnuPG in paperless-ng. <utilities-encyption>`
|
||||
This feature will most likely be removed in future versions.
|
||||
|
||||
* **Added:** New frontend. Features:
|
||||
|
||||
@ -38,6 +36,25 @@ paperless-ng 0.9.0
|
||||
multi user solution, however, it allows more than one user to access the website
|
||||
and set some basic permissions / renew passwords.
|
||||
|
||||
* **Modified [breaking]:** All new mail consumer with customizable filters, actions and
|
||||
multiple account support. Replaces the old mail consumer. The new mail consumer
|
||||
needs different configuration but can be configured to act exactly like the old
|
||||
consumer.
|
||||
|
||||
|
||||
* **Modified:** Changes to the consumer:
|
||||
|
||||
* Now uses the excellent watchdog library that should make sure files are
|
||||
discovered no matter what the platform is.
|
||||
* The consumer now uses a task scheduler to run consumption processes in parallel.
|
||||
This means that consuming many documents should be much faster on systems with
|
||||
many cores.
|
||||
* Concurrency is controlled with the new settings ``PAPERLESS_TASK_WORKERS``
|
||||
and ``PAPERLESS_THREADS_PER_WORKER``. See TODO for details on concurrency.
|
||||
* The consumer no longer blocks the database for extended periods of time.
|
||||
* An issue with tesseract running multiple threads per page and slowing down
|
||||
the consumer was fixed.
|
||||
|
||||
* **Modified [breaking]:** REST Api changes:
|
||||
|
||||
* New filters added, other filters removed (case sensitive filters, slug filters)
|
||||
@ -64,8 +81,8 @@ paperless-ng 0.9.0
|
||||
* Rework of the code of the tesseract parser. This is now a lot cleaner.
|
||||
* Rework of the filename handling code. It was a mess.
|
||||
* Fixed some issues with the document exporter not exporting all documents when encountering duplicate filenames.
|
||||
* Consumer rework: now uses the excellent watchdog library, lots of code removed.
|
||||
* Added a task scheduler that takes care of checking mail, training the classifier and maintaining the document search index.
|
||||
* Added a task scheduler that takes care of checking mail, training the classifier, maintaining the document search index
|
||||
and consuming documents.
|
||||
* Updated dependencies. Now uses Pipenv all around.
|
||||
* Updated Dockerfile and docker-compose. Now uses ``supervisord`` to run everything paperless-related in a single container.
|
||||
|
||||
@ -77,6 +94,8 @@ paperless-ng 0.9.0
|
||||
* ``PAPERLESS_DEBUG`` defaults to ``false``.
|
||||
* The presence of ``PAPERLESS_DBHOST`` now determines whether to use PostgreSQL or
|
||||
sqlite.
|
||||
* ``PAPERLESS_OCR_THREADS`` is gone and replaced with ``PAPERLESS_TASK_WORKERS`` and
|
||||
``PAPERLESS_THREADS_PER_WORKER``. Refer to the config example for details.
|
||||
|
||||
* Many more small changes here and there. The usual stuff.
|
||||
|
||||
|
@ -20,7 +20,3 @@ places.
|
||||
|
||||
Copy ``paperless.conf.example`` to any of these locations and adjust it to your
|
||||
needs.
|
||||
|
||||
.. warning::
|
||||
|
||||
TBD: explain config options.
|
@ -36,6 +36,10 @@ The old admin is still there and accessible!
|
||||
|
||||
.. image:: _static/paperless-9-admin.png
|
||||
|
||||
Fancy mail filters!
|
||||
|
||||
.. image:: _static/paperless-11-mail-filters.png
|
||||
|
||||
Mobile support in the future? This doesn't really work yet.
|
||||
|
||||
.. image:: _static/paperless-10-mobile.png
|
||||
|
@ -23,6 +23,77 @@ There are multiple options available.
|
||||
that need to be compiled, and that's already done for you in the release.
|
||||
|
||||
|
||||
Overview of Paperless-ng
|
||||
########################
|
||||
|
||||
Compared to paperless, paperless-ng works a little differently under the hood and has
|
||||
more moving parts that work together. While this increases the complexity of
|
||||
the system, it also brings many benefits.
|
||||
|
||||
Paperless consists of the following components:
|
||||
|
||||
* **The webserver:** This is pretty much the same as in paperless. It serves
|
||||
the administration pages, the API, and the new frontend. This is the main
|
||||
tool you'll be using to interact with paperless. You may start the webserver
|
||||
with
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ cd /path/to/paperless/src/
|
||||
$ pipenv run gunicorn -c /usr/src/paperless/gunicorn.conf.py -b 0.0.0.0:8000 paperless.wsgi
|
||||
|
||||
or by any other means such as Apache ``mod_wsgi``.
|
||||
|
||||
* **The consumer:** This is what watches your consumption folder for documents.
|
||||
However, the consumer itself does not really consume your documents anymore.
|
||||
It rather notifies a task processor that a new file is ready for consumption.
|
||||
I suppose it should be named differently.
|
||||
This also used to check your emails, but that's now gone elsewhere as well.
|
||||
|
||||
Start the consumer with the management command ``document_consumer``:
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ cd /path/to/paperless/src/
|
||||
$ pipenv run python3 manage.py document_consumer
|
||||
|
||||
* **The task processor:** Paperless relies on `Django Q <https://django-q.readthedocs.io/en/latest/>`_
|
||||
for doing much of the heavy lifting. This is a task queue that accepts tasks from
|
||||
multiple sources and processes tasks in parallel. It also comes with a scheduler that executes
|
||||
certain commands periodically.
|
||||
|
||||
This task processor is responsible for:
|
||||
|
||||
* Consuming documents. When the consumer finds new documents, it notifies the task processor to
|
||||
start a consumption task.
|
||||
* Consuming emails. It periodically checks your configured accounts for new mails and
|
||||
produces consumption tasks for any documents it finds.
|
||||
* The task processor also performs the consumption of any documents you upload through
|
||||
the web interface.
|
||||
* Maintain the search index and the automatic matching algorithm. These are things that paperless
|
||||
needs to do from time to time in order to operate properly.
|
||||
|
||||
This allows paperless to process multiple documents from your consumption folder in parallel! On
|
||||
a modern multicore system, consumption with full ocr is blazing fast.
|
||||
|
||||
The task processor comes with a built-in admin interface that you can use to see whenever any of the
|
||||
tasks fail and inspect the errors.
|
||||
|
||||
You may start the task processor by executing:
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ cd /path/to/paperless/src/
|
||||
$ pipenv run python3 manage.py qcluster
|
||||
|
||||
* A `redis <https://redis.io/>`_ message broker: This is a really lightweight service that is responsible
|
||||
for getting the tasks from the webserver and consumer to the task scheduler. These run in different
|
||||
processes (maybe even on different machines!), and therefore, this is necessary.
|
||||
|
||||
* A database server. Paperless supports PostgreSQL and sqlite for storing its data. However, with the
|
||||
added concurrency, it is strongly advised to use PostgreSQL, as sqlite has its limits in that regard.
|
||||
|
||||
|
||||
Installation
|
||||
############
|
||||
|
||||
@ -31,10 +102,12 @@ You can go multiple routes with setting up and running Paperless:
|
||||
* The `docker route`_
|
||||
* The `bare metal route`_
|
||||
|
||||
The `docker route`_ is quick & easy. This is the recommended route.
|
||||
The `docker route`_ is quick & easy. This is the recommended route. This configures all the stuff
|
||||
from above automatically so that it just works and uses sensible defaults for all configuration options.
|
||||
|
||||
The `bare metal route`_ is more complicated to set up but makes it easier
|
||||
should you want to contribute some code back.
|
||||
should you want to contribute some code back. You need to configure and
|
||||
run the above mentioned components yourself.
|
||||
|
||||
Docker Route
|
||||
============
|
||||
|
@ -2,9 +2,38 @@
|
||||
Troubleshooting
|
||||
***************
|
||||
|
||||
.. warning::
|
||||
No files are added by the consumer
|
||||
##################################
|
||||
|
||||
Check for the following issues:
|
||||
|
||||
* Ensure that the directory you're putting your documents in is the folder
|
||||
paperless is watching. With docker, this setting is performed in the
|
||||
``docker-compose.yml`` file. Without docker, look at the ``CONSUMPTION_DIR``
|
||||
setting. Don't adjust this setting if you're using docker.
|
||||
* Ensure that redis is up and running. Paperless does its task processing
|
||||
asynchronously, and for documents to arrive at the task processor, it needs
|
||||
redis to run.
|
||||
* Ensure that the task processor is running. Docker does this automatically.
|
||||
Manually invoke the task processor by executing
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ python3 manage.py qcluster
|
||||
|
||||
* Look at the output of paperless and inspect it for any errors.
|
||||
* Go to the admin interface, and check if there are failed tasks. If so, the
|
||||
tasks will contain an error message.
|
||||
|
||||
|
||||
Consumer fails to pickup any new files
|
||||
######################################
|
||||
|
||||
If you notice that the consumer will only pick up files in the consumption
|
||||
directory at startup, but won't find any other files added later, check out
|
||||
the configuration file and enable filesystem polling with the setting
|
||||
``PAPERLESS_CONSUMER_POLLING``.
|
||||
|
||||
This section is not updated to paperless-ng yet.
|
||||
|
||||
Consumer warns ``OCR for XX failed``
|
||||
####################################
|
||||
|
@ -27,7 +27,7 @@ Each document has a couple of fields that you can assign to them:
|
||||
a document either originates from, or is sent to.
|
||||
* A *tag* is a label that you can assign to documents. Think of labels as more
|
||||
powerful folders: Multiple documents can be grouped together with a single
|
||||
tag, however, a single document can also have multiple tags. This is not
|
||||
tag, however, a single document can also have multiple tags. This is not
|
||||
possible with folders. The reason folders are not implemented in paperless
|
||||
is simply that tags are much more versatile than folders.
|
||||
* A *document type* is used to demarcate the type of a document such as letter,
|
||||
@ -86,49 +86,63 @@ files from the scanner. Typically, you're looking at an FTP server like
|
||||
IMAP (Email)
|
||||
============
|
||||
|
||||
Another handy way to get documents into your database is to email them to
|
||||
yourself. The typical use-case would be to be out for lunch and want to send a
|
||||
copy of the receipt back to your system at home. Paperless can be taught to
|
||||
pull emails down from an arbitrary account and dump them into the consumption
|
||||
directory where the consumer will follow the
|
||||
usual pattern on consuming the document.
|
||||
You can tell paperless-ng to consume documents from your email accounts.
|
||||
This is a very flexible and powerful feature, if you regularly receive documents
|
||||
via mail that you need to archive. The mail consumer can be configured by using the
|
||||
admin interface in the following manner:
|
||||
|
||||
.. hint::
|
||||
1. Define e-mail accounts.
|
||||
2. Define mail rules for your account.
|
||||
|
||||
It's disabled by default. By setting the values below it will be enabled.
|
||||
|
||||
It's been tested in a limited environment, so it may not work for you (please
|
||||
submit a pull request if you can!)
|
||||
These rules perform the following:
|
||||
|
||||
.. danger::
|
||||
1. Connect to the mail server.
|
||||
2. Fetch all matching mails (as defined by folder, maximum age and the filters)
|
||||
3. Check if there are any consumable attachments.
|
||||
4. If so, instruct paperless to consume the attachments and optionally
|
||||
use the metadata provided in the rule for the new document.
|
||||
5. If documents were consumed from a mail, the rule action is performed
|
||||
on that mail.
|
||||
|
||||
It's designed to **delete mail from the server once consumed**. So don't go
|
||||
pointing this to your personal email account and wonder where all your stuff
|
||||
went.
|
||||
Paperless will completely ignore mails that do not match your filters. It will also
|
||||
only perform the action on mails that it has consumed documents from.
|
||||
|
||||
.. hint::
|
||||
The actions all ensure that the same mail is not consumed twice by different means.
|
||||
These are as follows:
|
||||
|
||||
Currently, only one photo (attachment) per email will work.
|
||||
* **Delete:** Immediately deletes mail that paperless has consumed documents from.
|
||||
Use with caution.
|
||||
* **Mark as read:** Mark consumed mail as read. Paperless will not consume documents
|
||||
from already read mails. If you read a mail before paperless sees it, it will be
|
||||
ignored.
|
||||
* **Flag:** Sets the 'important' flag on mails with consumed documents. Paperless
|
||||
will not consume flagged mails.
|
||||
* **Move to folder:** Moves consumed mails out of the way so that paperless won't
|
||||
consume them again.
|
||||
|
||||
So, with all that in mind, here's what you do to get it running:
|
||||
.. caution::
|
||||
|
||||
1. Setup a new email account somewhere, or if you're feeling daring, create a
|
||||
folder in an existing email box and note the path to that folder.
|
||||
2. In ``/etc/paperless.conf`` set all of the appropriate values in
|
||||
``PATHS AND FOLDERS`` and ``SECURITY``.
|
||||
If you decided to use a subfolder of an existing account, then make sure you
|
||||
set ``PAPERLESS_CONSUME_MAIL_INBOX`` accordingly here. You also have to set
|
||||
the ``PAPERLESS_EMAIL_SECRET`` to something you can remember 'cause you'll
|
||||
have to include that in every email you send.
|
||||
3. Restart paperless. Paperless will check
|
||||
the configured email account at startup and from then on every 10 minutes
|
||||
for something new and pulls down whatever it finds.
|
||||
4. Send yourself an email! Note that the subject is treated as the file name,
|
||||
so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll
|
||||
get what you expect. Also, you must include the aforementioned secret
|
||||
string in every email so the fetcher knows that it's safe to import.
|
||||
Note that Paperless only allows the email title to consist of safe characters
|
||||
to be imported. These consist of alpha-numeric characters and ``-_ ,.'``.
|
||||
The mail consumer will perform these actions on all mails it has consumed
|
||||
documents from. Keep in mind that the actual consumption process may fail
|
||||
for some reason, leaving you with missing documents in paperless.
|
||||
|
||||
.. note::
|
||||
|
||||
With the correct set of rules, you can completely automate your email documents.
|
||||
Create rules for every correspondent you receive digital documents from and
|
||||
paperless will read them automatically. The default action "mark as read" is
|
||||
pretty tame and will not cause any damage or data loss whatsoever.
|
||||
|
||||
.. note::
|
||||
|
||||
Paperless will process the rules in the order defined in the admin page.
|
||||
|
||||
You can define catch-all rules and have them executed last to consume
|
||||
any documents not matched by previous rules. Such a rule may assign an "Unknown
|
||||
mail document" tag to consumed documents so you can inspect them further.
|
||||
|
||||
Paperless is set up to check your mails every 10 minutes. This can be configured on the
|
||||
'Scheduled tasks' page in the admin.
|
||||
|
||||
|
||||
REST API
|
||||
@ -136,6 +150,7 @@ REST API
|
||||
|
||||
You can also submit a document using the REST API, see :ref:`api-file_uploads` for details.
|
||||
|
||||
|
||||
.. _usage-recommended_workflow:
|
||||
|
||||
The recommended workflow
|
||||
@ -147,6 +162,10 @@ is as follows. This workflow also takes into account that some documents
|
||||
have to be kept in physical form, but still ensures that you get all the
|
||||
advantages for these documents as well.
|
||||
|
||||
The following diagram shows how easy it is to manage your documents.
|
||||
|
||||
.. image:: _static/recommended_workflow.png
|
||||
|
||||
Preparations in paperless
|
||||
=========================
|
||||
|
||||
@ -156,7 +175,7 @@ Preparations in paperless
|
||||
Processing of the physical documents
|
||||
====================================
|
||||
|
||||
Keep a physical inbox. Whenever you receive a document that you need to
|
||||
Keep a physical inbox. Whenever you receive a document that you need to
|
||||
archive, put it into your inbox. Regularly, do the following for all documents
|
||||
in your inbox:
|
||||
|
||||
|
@ -59,22 +59,6 @@ PAPERLESS_CONSUMPTION_DIR="../consume"
|
||||
#PAPERLESS_STATIC_URL="/static/"
|
||||
|
||||
|
||||
# These values are required if you want paperless to check a particular email
|
||||
# box every 10 minutes and attempt to consume documents from there. If you
|
||||
# don't define a HOST, mail checking will just be disabled.
|
||||
#PAPERLESS_CONSUME_MAIL_HOST=""
|
||||
#PAPERLESS_CONSUME_MAIL_PORT=""
|
||||
#PAPERLESS_CONSUME_MAIL_USER=""
|
||||
#PAPERLESS_CONSUME_MAIL_PASS=""
|
||||
|
||||
# Override the default IMAP inbox here. If not set Paperless defaults to
|
||||
# "INBOX".
|
||||
#PAPERLESS_CONSUME_MAIL_INBOX="INBOX"
|
||||
|
||||
# Any email sent to the target account that does not contain this text will be
|
||||
# ignored.
|
||||
PAPERLESS_EMAIL_SECRET=""
|
||||
|
||||
# Specify a filename format for the document (directories are supported)
|
||||
# Use the following placeholders:
|
||||
# * {correspondent}
|
||||
@ -143,6 +127,35 @@ PAPERLESS_EMAIL_SECRET=""
|
||||
#### Software Tweaks ####
|
||||
###############################################################################
|
||||
|
||||
# Paperless does multiple things in the background: Maintain the search index,
|
||||
# maintain the automatic matching algorithm, check emails, consume documents,
|
||||
# etc. This variable specifies how many things it will do in parallel.
|
||||
#PAPERLESS_TASK_WORKERS=1
|
||||
|
||||
# Furthermore, paperless uses multiple threads when consuming documents to
|
||||
# speed up OCR. This variable specifies how many pages paperless will process
|
||||
# in parallel on a single document.
|
||||
#PAPERLESS_THREADS_PER_WORKER=1
|
||||
|
||||
# Ensure that the product
|
||||
# PAPERLESS_TASK_WORKERS * PAPERLESS_THREADS_PER_WORKER
|
||||
# does not exceed your CPU core count or else paperless will be extremely slow.
|
||||
# If you want paperless to process many documents in parallel, choose a high
|
||||
# worker count. If you want paperless to process very large documents faster,
|
||||
# use a higher thread per worker count.
|
||||
# The default is a balance between the two, according to your CPU core count,
|
||||
# with a slight favor towards threads per worker, and using as many cores as
|
||||
# possible.
|
||||
# If you only specify PAPERLESS_TASK_WORKERS, paperless will adjust
|
||||
# PAPERLESS_THREADS_PER_WORKER automatically.
|
||||
|
||||
# If paperless won't find documents added to your consume folder, it might
|
||||
# not be able to automatically detect filesystem changes. In that case,
|
||||
# specify a polling interval in seconds below, which will then cause paperless
|
||||
# to periodically check your consumption directory for changes.
|
||||
#PAPERLESS_CONSUMER_POLLING=10
|
||||
|
||||
|
||||
# When the consumer detects a duplicate document, it will not touch the
|
||||
# original document. This default behavior can be changed here.
|
||||
#PAPERLESS_CONSUMER_DELETE_DUPLICATES="false"
|
||||
@ -186,12 +199,6 @@ PAPERLESS_EMAIL_SECRET=""
|
||||
#
|
||||
|
||||
|
||||
# By default, Paperless will attempt to use all available CPU cores to process
|
||||
# a document, but if you would like to limit that, you can set this value to
|
||||
# an integer:
|
||||
#PAPERLESS_OCR_THREADS=1
|
||||
|
||||
|
||||
# Customize the default language that tesseract will attempt to use when
|
||||
# parsing documents. The default language is used whenever
|
||||
# - No language could be detected on a document
|
||||
|
@ -2,6 +2,15 @@
|
||||
|
||||
set -e
|
||||
|
||||
|
||||
VERSION=$1
|
||||
|
||||
if [ -z "$VERSION" ]
|
||||
then
|
||||
echo "Need a version string."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# source root directory of paperless
|
||||
PAPERLESS_ROOT=$(git rev-parse --show-toplevel)
|
||||
|
||||
@ -42,6 +51,7 @@ mkdir "$PAPERLESS_DIST_APP/docker"
|
||||
# the application itself
|
||||
|
||||
cp "$PAPERLESS_ROOT/.env" \
|
||||
"$PAPERLESS_ROOT/.dockerignore" \
|
||||
"$PAPERLESS_ROOT/CONTRIBUTING.md" \
|
||||
"$PAPERLESS_ROOT/LICENSE" \
|
||||
"$PAPERLESS_ROOT/Pipfile" \
|
||||
@ -80,10 +90,12 @@ cp "$PAPERLESS_ROOT/docker/supervisord.conf" "$PAPERLESS_DIST_APP/docker/"
|
||||
|
||||
cd "$PAPERLESS_DIST_APP"
|
||||
|
||||
docker-compose build
|
||||
docker build . -t "jonaswinkler/paperless-ng:$VERSION"
|
||||
|
||||
docker push "jonaswinkler/paperless-ng:$VERSION"
|
||||
|
||||
# works. package the app!
|
||||
|
||||
cd "$PAPERLESS_DIST"
|
||||
|
||||
tar -cJf paperless-ng.tar.xz paperless-ng/
|
||||
tar -cJf "paperless-ng-$VERSION.tar.xz" paperless-ng/
|
||||
|
@ -12,7 +12,7 @@ from django.utils import timezone
|
||||
from paperless.db import GnuPG
|
||||
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
|
||||
from .file_handling import generate_filename, create_source_path_directory
|
||||
from .models import Document, FileInfo
|
||||
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
|
||||
from .parsers import ParseError, get_parser_class
|
||||
from .signals import (
|
||||
document_consumption_finished,
|
||||
@ -25,139 +25,204 @@ class ConsumerError(Exception):
|
||||
|
||||
|
||||
class Consumer:
|
||||
"""
|
||||
Loop over every file found in CONSUMPTION_DIR and:
|
||||
1. Convert it to a greyscale pnm
|
||||
2. Use tesseract on the pnm
|
||||
3. Store the document in the MEDIA_ROOT with optional encryption
|
||||
4. Store the OCR'd text in the database
|
||||
5. Delete the document and image(s)
|
||||
"""
|
||||
|
||||
def __init__(self, consume=settings.CONSUMPTION_DIR,
|
||||
scratch=settings.SCRATCH_DIR):
|
||||
def __init__(self):
|
||||
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.logging_group = None
|
||||
self.path = None
|
||||
self.filename = None
|
||||
self.override_title = None
|
||||
self.override_correspondent_id = None
|
||||
self.override_tag_ids = None
|
||||
self.override_document_type_id = None
|
||||
|
||||
self.consume = consume
|
||||
self.scratch = scratch
|
||||
def pre_check_file_exists(self):
    """Abort consumption unless ``self.path`` refers to a regular file.

    Raises:
        ConsumerError: if ``self.path`` is missing or is not a file.
    """
    if os.path.isfile(self.path):
        return

    complaint = "Cannot consume {}: It is not a file".format(self.path)
    raise ConsumerError(complaint)
|
||||
|
||||
self.classifier = DocumentClassifier()
|
||||
|
||||
os.makedirs(self.scratch, exist_ok=True)
|
||||
|
||||
self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
if settings.PASSPHRASE:
|
||||
self.storage_type = Document.STORAGE_TYPE_GPG
|
||||
|
||||
if not self.consume:
|
||||
def pre_check_consumption_dir(self):
|
||||
if not settings.CONSUMPTION_DIR:
|
||||
raise ConsumerError(
|
||||
"The CONSUMPTION_DIR settings variable does not appear to be "
|
||||
"set."
|
||||
"set.")
|
||||
|
||||
if not os.path.isdir(settings.CONSUMPTION_DIR):
|
||||
raise ConsumerError(
|
||||
"Consumption directory {} does not exist".format(
|
||||
settings.CONSUMPTION_DIR))
|
||||
|
||||
def pre_check_regex(self):
    """Reject file names that do not match the ``title`` pattern of FileInfo.

    Raises:
        ConsumerError: if ``self.filename`` does not match
            ``FileInfo.REGEXES["title"]``.
    """
    title_pattern = FileInfo.REGEXES["title"]
    if re.match(title_pattern, self.filename):
        return

    raise ConsumerError(
        "Filename {} does not seem to be safe to "
        "consume".format(self.filename))
|
||||
|
||||
def pre_check_duplicate(self):
    """Refuse files whose MD5 checksum matches an already stored document.

    If ``settings.CONSUMER_DELETE_DUPLICATES`` is truthy, the duplicate file
    at ``self.path`` is removed before the error is raised.

    Raises:
        ConsumerError: if a Document with the same checksum exists.
    """
    with open(self.path, "rb") as source:
        digest = hashlib.md5(source.read()).hexdigest()

    already_known = Document.objects.filter(checksum=digest).exists()
    if not already_known:
        return

    if settings.CONSUMER_DELETE_DUPLICATES:
        os.unlink(self.path)

    raise ConsumerError(
        "Not consuming {}: It is a duplicate.".format(self.filename)
    )
|
||||
|
||||
if not os.path.exists(self.consume):
|
||||
raise ConsumerError(
|
||||
"Consumption directory {} does not exist".format(self.consume))
|
||||
def pre_check_directories(self):
    """Create the scratch, thumbnail and originals directories if missing."""
    required = (
        settings.SCRATCH_DIR,
        settings.THUMBNAIL_DIR,
        settings.ORIGINALS_DIR,
    )
    for directory in required:
        os.makedirs(directory, exist_ok=True)
|
||||
|
||||
def log(self, level, message):
    """Emit ``message`` on ``self.logger`` using the method named ``level``.

    The current ``self.logging_group`` is attached to the record as the
    ``group`` extra so entries for one file can be grouped together.
    """
    emit = getattr(self.logger, level)
    emit(message, extra={"group": self.logging_group})
|
||||
|
||||
@transaction.atomic
|
||||
def try_consume_file(self, file):
|
||||
def try_consume_file(self,
|
||||
path,
|
||||
override_filename=None,
|
||||
override_title=None,
|
||||
override_correspondent_id=None,
|
||||
override_document_type_id=None,
|
||||
override_tag_ids=None):
|
||||
"""
|
||||
Return True if file was consumed
|
||||
Return the document object if it was successfully created.
|
||||
"""
|
||||
|
||||
self.path = path
|
||||
self.filename = override_filename or os.path.basename(path)
|
||||
self.override_title = override_title
|
||||
self.override_correspondent_id = override_correspondent_id
|
||||
self.override_document_type_id = override_document_type_id
|
||||
self.override_tag_ids = override_tag_ids
|
||||
|
||||
# this is for grouping logging entries for this particular file
|
||||
# together.
|
||||
|
||||
self.logging_group = uuid.uuid4()
|
||||
|
||||
if not re.match(FileInfo.REGEXES["title"], file):
|
||||
return False
|
||||
# Make sure that preconditions for consuming the file are met.
|
||||
|
||||
doc = file
|
||||
self.pre_check_file_exists()
|
||||
self.pre_check_consumption_dir()
|
||||
self.pre_check_directories()
|
||||
self.pre_check_regex()
|
||||
self.pre_check_duplicate()
|
||||
|
||||
if self._is_duplicate(doc):
|
||||
self.log(
|
||||
"warning",
|
||||
"Skipping {} as it appears to be a duplicate".format(doc)
|
||||
)
|
||||
if settings.CONSUMER_DELETE_DUPLICATES:
|
||||
self._cleanup_doc(doc)
|
||||
return False
|
||||
self.log("info", "Consuming {}".format(self.filename))
|
||||
|
||||
self.log("info", "Consuming {}".format(doc))
|
||||
# Determine the parser class.
|
||||
|
||||
parser_class = get_parser_class(doc)
|
||||
parser_class = get_parser_class(self.filename)
|
||||
if not parser_class:
|
||||
self.log(
|
||||
"error", "No parsers could be found for {}".format(doc))
|
||||
return False
|
||||
raise ConsumerError("No parsers abvailable for {}".format(self.filename))
|
||||
else:
|
||||
self.log("info", "Parser: {}".format(parser_class.__name__))
|
||||
self.log("debug", "Parser: {}".format(parser_class.__name__))
|
||||
|
||||
# Notify all listeners that we're going to do some work.
|
||||
|
||||
document_consumption_started.send(
|
||||
sender=self.__class__,
|
||||
filename=doc,
|
||||
filename=self.path,
|
||||
logging_group=self.logging_group
|
||||
)
|
||||
|
||||
document_parser = parser_class(doc, self.logging_group)
|
||||
# This doesn't parse the document yet, but gives us a parser.
|
||||
|
||||
document_parser = parser_class(self.path, self.logging_group)
|
||||
|
||||
# However, this already created working directories which we have to
|
||||
# clean up.
|
||||
|
||||
# Parse the document. This may take some time.
|
||||
|
||||
try:
|
||||
self.log("info", "Generating thumbnail for {}...".format(doc))
|
||||
self.log("debug", "Generating thumbnail for {}...".format(self.filename))
|
||||
thumbnail = document_parser.get_optimised_thumbnail()
|
||||
self.log("debug", "Parsing {}...".format(self.filename))
|
||||
text = document_parser.get_text()
|
||||
date = document_parser.get_date()
|
||||
document = self._store(
|
||||
text,
|
||||
doc,
|
||||
thumbnail,
|
||||
date
|
||||
)
|
||||
except ParseError as e:
|
||||
self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e))
|
||||
document_parser.cleanup()
|
||||
return False
|
||||
else:
|
||||
document_parser.cleanup()
|
||||
self._cleanup_doc(doc)
|
||||
raise ConsumerError(e)
|
||||
|
||||
self.log(
|
||||
"info",
|
||||
"Document {} consumption finished".format(document)
|
||||
)
|
||||
# Prepare the document classifier.
|
||||
|
||||
# TODO: I don't really like to do this here, but this way we avoid
|
||||
# reloading the classifier multiple times, since there are multiple
|
||||
# post-consume hooks that all require the classifier.
|
||||
|
||||
try:
|
||||
classifier = DocumentClassifier()
|
||||
classifier.reload()
|
||||
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
|
||||
logging.getLogger(__name__).warning(
|
||||
"Cannot classify documents: {}.".format(e))
|
||||
classifier = None
|
||||
|
||||
try:
|
||||
self.classifier.reload()
|
||||
classifier = self.classifier
|
||||
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
|
||||
logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
|
||||
# now that everything is done, we can start to store the document
|
||||
# in the system. This will be a transaction and reasonably fast.
|
||||
try:
|
||||
with transaction.atomic():
|
||||
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier
|
||||
)
|
||||
return True
|
||||
# store the document.
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date
|
||||
)
|
||||
|
||||
def _store(self, text, doc, thumbnail, date):
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
# hooks. If they fail, nothing will get changed.
|
||||
|
||||
file_info = FileInfo.from_path(doc)
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=document,
|
||||
logging_group=self.logging_group,
|
||||
classifier=classifier
|
||||
)
|
||||
|
||||
stats = os.stat(doc)
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
|
||||
create_source_path_directory(document.source_path)
|
||||
self._write(document, self.path, document.source_path)
|
||||
self._write(document, thumbnail, document.thumbnail_path)
|
||||
|
||||
# Delete the file only if it was successfully consumed
|
||||
self.log("debug", "Deleting file {}".format(self.path))
|
||||
os.unlink(self.path)
|
||||
except Exception as e:
|
||||
raise ConsumerError(e)
|
||||
finally:
|
||||
document_parser.cleanup()
|
||||
|
||||
self.log(
|
||||
"info",
|
||||
"Document {} consumption finished".format(document)
|
||||
)
|
||||
|
||||
return document
|
||||
|
||||
def _store(self, text, date):
|
||||
|
||||
# If someone gave us the original filename, use it instead of doc.
|
||||
|
||||
file_info = FileInfo.from_path(self.filename)
|
||||
|
||||
stats = os.stat(self.path)
|
||||
|
||||
self.log("debug", "Saving record to database")
|
||||
|
||||
created = file_info.created or date or timezone.make_aware(
|
||||
datetime.datetime.fromtimestamp(stats.st_mtime))
|
||||
|
||||
with open(doc, "rb") as f:
|
||||
if settings.PASSPHRASE:
|
||||
storage_type = Document.STORAGE_TYPE_GPG
|
||||
else:
|
||||
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
|
||||
with open(self.path, "rb") as f:
|
||||
document = Document.objects.create(
|
||||
correspondent=file_info.correspondent,
|
||||
title=file_info.title,
|
||||
@ -166,7 +231,7 @@ class Consumer:
|
||||
checksum=hashlib.md5(f.read()).hexdigest(),
|
||||
created=created,
|
||||
modified=created,
|
||||
storage_type=self.storage_type
|
||||
storage_type=storage_type
|
||||
)
|
||||
|
||||
relevant_tags = set(file_info.tags)
|
||||
@ -175,19 +240,30 @@ class Consumer:
|
||||
self.log("debug", "Tagging with {}".format(tag_names))
|
||||
document.tags.add(*relevant_tags)
|
||||
|
||||
self.apply_overrides(document)
|
||||
|
||||
document.filename = generate_filename(document)
|
||||
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
self._write(document, doc, document.source_path)
|
||||
self._write(document, thumbnail, document.thumbnail_path)
|
||||
|
||||
# We need to save the document twice, since we need the PK of the
|
||||
# document in order to create its filename above.
|
||||
document.save()
|
||||
|
||||
return document
|
||||
|
||||
def apply_overrides(self, document):
    """Apply the caller-supplied overrides stored on ``self`` to ``document``.

    Each override is only applied when it is truthy. Correspondent, document
    type and tags are looked up by primary key.
    """
    title = self.override_title
    if title:
        document.title = title

    correspondent_id = self.override_correspondent_id
    if correspondent_id:
        document.correspondent = Correspondent.objects.get(
            pk=correspondent_id)

    document_type_id = self.override_document_type_id
    if document_type_id:
        document.document_type = DocumentType.objects.get(
            pk=document_type_id)

    # An unset (None) or empty list of tag ids means "no extra tags".
    for tag_id in self.override_tag_ids or ():
        document.tags.add(Tag.objects.get(pk=tag_id))
|
||||
|
||||
def _write(self, document, source, target):
|
||||
with open(source, "rb") as read_file:
|
||||
with open(target, "wb") as write_file:
|
||||
@ -196,13 +272,3 @@ class Consumer:
|
||||
return
|
||||
self.log("debug", "Encrypting")
|
||||
write_file.write(GnuPG.encrypted(read_file))
|
||||
|
||||
def _cleanup_doc(self, doc):
    """Log a debug notice and delete the file at path ``doc``."""
    notice = "Deleting document {}".format(doc)
    self.log("debug", notice)
    os.unlink(doc)
|
||||
|
||||
@staticmethod
def _is_duplicate(doc):
    """Return True if a Document with the MD5 checksum of file ``doc`` exists."""
    with open(doc, "rb") as source:
        digest = hashlib.md5(source.read()).hexdigest()

    matches = Document.objects.filter(checksum=digest)
    return matches.exists()
|
||||
|
@ -1,9 +1,11 @@
|
||||
import os
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from time import mktime
|
||||
|
||||
from django import forms
|
||||
from django.conf import settings
|
||||
from django_q.tasks import async_task
|
||||
from pathvalidate import validate_filename, ValidationError
|
||||
|
||||
|
||||
@ -18,15 +20,6 @@ class UploadForm(forms.Form):
|
||||
raise forms.ValidationError("That filename is suspicious.")
|
||||
return self.cleaned_data.get("document")
|
||||
|
||||
def get_filename(self, i=None):
    """Build the target path for the uploaded document in CONSUMPTION_DIR.

    When ``i`` is truthy the file name is prefixed with ``"<i>_"``; otherwise
    the uploaded file's own name is used unchanged.
    """
    name = self.cleaned_data.get("document").name
    if i:
        name = "{}_{}".format(str(i), name)
    return os.path.join(settings.CONSUMPTION_DIR, name)
|
||||
|
||||
def save(self):
|
||||
"""
|
||||
Since the consumer already does a lot of work, it's easier just to save
|
||||
@ -35,15 +28,13 @@ class UploadForm(forms.Form):
|
||||
"""
|
||||
|
||||
document = self.cleaned_data.get("document").read()
|
||||
original_filename = self.cleaned_data.get("document").name
|
||||
|
||||
t = int(mktime(datetime.now().timetuple()))
|
||||
|
||||
file_name = self.get_filename()
|
||||
i = 0
|
||||
while os.path.exists(file_name):
|
||||
i += 1
|
||||
file_name = self.get_filename(i)
|
||||
with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f:
|
||||
|
||||
with open(file_name, "wb") as f:
|
||||
f.write(document)
|
||||
os.utime(file_name, times=(t, t))
|
||||
os.utime(f.name, times=(t, t))
|
||||
|
||||
async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))
|
||||
|
@ -1,249 +0,0 @@
|
||||
import datetime
|
||||
import imaplib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
from base64 import b64decode
|
||||
from email import policy
|
||||
from email.parser import BytesParser
|
||||
|
||||
from dateutil import parser
|
||||
from django.conf import settings
|
||||
|
||||
from .models import Correspondent
|
||||
|
||||
|
||||
class MailFetcherError(Exception):
    """Base error for failures while fetching or handling mail."""
|
||||
|
||||
|
||||
class InvalidMessageError(MailFetcherError):
    """Raised when a fetched message fails validation."""
|
||||
|
||||
|
||||
class Loggable:
    """Mixin that provides a module logger and a per-instance logging group."""

    def __init__(self, group=None):
        """Store the given logging group, or create a fresh UUID if falsy."""
        self.logger = logging.getLogger(__name__)
        self.logging_group = group if group else uuid.uuid4()

    def log(self, level, message):
        """Emit ``message`` via the logger method named ``level``.

        The instance's logging group is attached as the ``group`` extra.
        """
        emit = getattr(self.logger, level)
        emit(message, extra={"group": self.logging_group})
|
||||
|
||||
|
||||
class Message(Loggable):
    """
    A crude, but simple email message class. We assume that there's a subject
    and n attachments, and that we don't care about the message body.
    """

    # Shared secret that must appear in the message body; read once at import.
    SECRET = os.getenv("PAPERLESS_EMAIL_SECRET")

    def __init__(self, data, group=None):
        """
        Parse the raw message bytes ``data`` and extract its one attachment.

        Cribbed heavily from
        https://www.ianlewis.org/en/parsing-email-attachments-python

        Raises:
            InvalidMessageError: if the subject is missing or unsafe, the
                secret is not in the body, or the message does not carry
                exactly one attachment.
        """

        Loggable.__init__(self, group=group)

        self.subject = None
        self.time = None
        self.attachment = None

        message = BytesParser(policy=policy.default).parsebytes(data)

        # Keep a missing Subject header as None. Calling str() on it would
        # produce the literal "None" and defeat the check in check_subject().
        raw_subject = message["Subject"]
        if raw_subject is not None:
            self.subject = str(raw_subject).replace("\r\n", "")

        body_part = message.get_body()
        self.body = "" if body_part is None else str(body_part)

        self.check_subject()
        self.check_body()

        self._set_time(message)

        self.log("info", 'Importing email: "{}"'.format(self.subject))

        attachments = []
        for part in message.walk():

            content_disposition = part.get("Content-Disposition")
            if not content_disposition:
                continue

            dispositions = content_disposition.strip().split(";")
            if len(dispositions) < 2:
                continue

            # NOTE(review): a part is skipped only when it is neither an
            # "attachment" nor carries a filename parameter -- confirm that
            # accepting e.g. inline parts with a filename is intended.
            if not dispositions[0].lower() == "attachment" and \
                    "filename" not in dispositions[1].lower():
                continue

            # decode=True honours the part's Content-Transfer-Encoding
            # instead of assuming every attachment is base64 encoded.
            file_data = part.get_payload(decode=True)

            attachments.append(Attachment(
                file_data, content_type=part.get_content_type()))

        if not attachments:
            raise InvalidMessageError(
                "There don't appear to be any attachments to this message")

        if len(attachments) > 1:
            raise InvalidMessageError(
                "There's more than one attachment to this message. It cannot "
                "be indexed automatically."
            )

        self.attachment = attachments[0]

    def __bool__(self):
        """A message is truthy once it holds an attachment."""
        return bool(self.attachment)

    def check_subject(self):
        """Raise InvalidMessageError if the subject is missing or unsafe."""
        if self.subject is None:
            raise InvalidMessageError("Message does not have a subject")
        if not Correspondent.SAFE_REGEX.match(self.subject):
            raise InvalidMessageError("Message subject is unsafe: {}".format(
                self.subject))

    def check_body(self):
        """Raise InvalidMessageError unless the secret occurs in the body."""
        # An unset secret can never be matched; report it as an invalid
        # message rather than failing with a TypeError on ``None in str``.
        if self.SECRET is None or self.SECRET not in self.body:
            raise InvalidMessageError("The secret wasn't in the body")

    def _set_time(self, message):
        """Set ``self.time`` from the Date header, falling back to now."""
        self.time = datetime.datetime.now()
        message_time = message.get("Date")
        if message_time:
            try:
                self.time = parser.parse(message_time)
            except (ValueError, AttributeError):
                pass  # We assume that "now" is ok

    @property
    def file_name(self):
        """File name built from the subject and the attachment's suffix."""
        return "{}.{}".format(self.subject, self.attachment.suffix)
|
||||
|
||||
|
||||
class Attachment(object):
    """A single mail attachment with a whitelisted content type."""

    # The outer non-capturing group makes ``^`` and ``$`` apply to BOTH
    # alternatives. Without it the pattern reads ``^(application/pdf)`` OR
    # ``(image/...)$`` and accepts e.g. "application/pdf-anything".
    # Capturing group numbers (2 and 4) are unchanged.
    SAFE_SUFFIX_REGEX = re.compile(
        r"^(?:(application/(pdf))|(image/(png|jpeg|gif|tiff)))$")

    def __init__(self, data, content_type):
        """
        Store the attachment payload and derive its file suffix.

        Args:
            data: the decoded attachment payload.
            content_type: MIME type of the attachment, e.g. "image/png".

        Raises:
            MailFetcherError: if ``content_type`` is not an accepted type.
        """

        self.content_type = content_type
        self.data = data
        self.suffix = None

        m = self.SAFE_SUFFIX_REGEX.match(self.content_type)
        if not m:
            raise MailFetcherError(
                "Not-awesome file type: {}".format(self.content_type))
        # Group 2 is set for PDFs, group 4 for the image types.
        self.suffix = m.group(2) or m.group(4)

    def read(self):
        """Return the attachment payload."""
        return self.data
|
||||
|
||||
|
||||
class MailFetcher(Loggable):
    """Fetch mail over IMAP (SSL) and drop attachments into a directory.

    Connection settings are read from the PAPERLESS_CONSUME_MAIL_*
    environment variables. The fetcher is disabled when no host is set.
    """

    def __init__(self, consume=settings.CONSUMPTION_DIR):
        """
        Args:
            consume: directory that fetched attachments are written to.

        Raises:
            MailFetcherError: if a mail host is configured but
                PAPERLESS_EMAIL_SECRET is not defined.
        """

        Loggable.__init__(self)

        self._connection = None
        self._host = os.getenv("PAPERLESS_CONSUME_MAIL_HOST")
        self._port = os.getenv("PAPERLESS_CONSUME_MAIL_PORT")
        self._username = os.getenv("PAPERLESS_CONSUME_MAIL_USER")
        self._password = os.getenv("PAPERLESS_CONSUME_MAIL_PASS")
        self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")

        self._enabled = bool(self._host)
        if self._enabled and Message.SECRET is None:
            raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined")

        self.last_checked = time.time()
        self.consume = consume

    def pull(self):
        """
        Fetch all available mail at the target address and store it locally in
        the consumption directory so that the file consumer can pick it up and
        do its thing.
        """

        if self._enabled:

            # Reset the grouping id for each fetch
            self.logging_group = uuid.uuid4()

            self.log("debug", "Checking mail")

            for message in self._get_messages():

                self.log("info", 'Storing email: "{}"'.format(message.subject))

                t = int(time.mktime(message.time.timetuple()))
                file_name = os.path.join(self.consume, message.file_name)
                with open(file_name, "wb") as f:
                    f.write(message.attachment.data)
                # Set the timestamps only after the file is closed; the
                # flush on close would otherwise reset the mtime.
                os.utime(file_name, times=(t, t))

        self.last_checked = time.time()

    def _get_messages(self):
        """Return the list of valid messages currently in the inbox.

        Mail errors are logged and the messages collected up to that point
        are returned.
        """

        r = []
        try:

            self._connect()
            try:
                self._login()

                for message in self._fetch():
                    if message:
                        r.append(message)

                self._connection.expunge()
                self._connection.close()
            finally:
                # Always end the session, including when login or fetching
                # failed, so the connection is not leaked.
                self._logout()

        except MailFetcherError as e:
            self.log("error", str(e))

        return r

    def _connect(self):
        """Open the IMAP SSL connection.

        Raises:
            MailFetcherError: if the port is invalid or connecting fails.
        """
        try:
            # Fall back to the standard IMAPS port when none is configured.
            port = int(self._port) if self._port else imaplib.IMAP4_SSL_PORT
            self._connection = imaplib.IMAP4_SSL(self._host, port)
        except ValueError as e:
            raise MailFetcherError(
                "Invalid mail port: {!r}".format(self._port)) from e
        except (OSError, imaplib.IMAP4.error) as e:
            # strerror may be None or absent; fall back to the exception.
            reason = getattr(e, "strerror", None) or e
            msg = "Problem connecting to {}: {}".format(self._host, reason)
            raise MailFetcherError(msg) from e

    def _login(self):
        """Authenticate and select the configured inbox.

        Raises:
            MailFetcherError: if login fails or the inbox cannot be selected.
        """

        try:
            login = self._connection.login(self._username, self._password)
        except imaplib.IMAP4.error as e:
            # imaplib raises on rejected credentials instead of returning a
            # non-OK status, so translate it to the error the caller handles.
            raise MailFetcherError(
                "Can't log into mail: {}".format(e)) from e
        if not login[0] == "OK":
            raise MailFetcherError("Can't log into mail: {}".format(login[1]))

        inbox = self._connection.select(self._inbox)
        if not inbox[0] == "OK":
            raise MailFetcherError("Can't find the inbox: {}".format(inbox[1]))

    def _logout(self):
        """End the IMAP session, logging (not raising) any failure."""
        try:
            self._connection.logout()
        except (OSError, imaplib.IMAP4.error) as e:
            self.log("warning", "Problem logging out of mail: {}".format(e))

    def _fetch(self):
        """Yield a Message for every valid mail in the selected inbox.

        Valid messages are flagged as deleted on the server; invalid ones
        are logged and left in place.
        """

        for num in self._connection.search(None, "ALL")[1][0].split():

            __, data = self._connection.fetch(num, "(RFC822)")

            message = None
            try:
                message = Message(data[0][1], self.logging_group)
            except InvalidMessageError as e:
                self.log("error", str(e))
            else:
                self._connection.store(num, "+FLAGS", "\\Deleted")

            if message:
                yield message
|
@ -3,10 +3,10 @@ import os
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand
|
||||
from django_q.tasks import async_task
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
from watchdog.observers import Observer
|
||||
|
||||
from documents.consumer import Consumer
|
||||
from watchdog.observers.polling import PollingObserver
|
||||
|
||||
try:
|
||||
from inotify_simple import INotify, flags
|
||||
@ -16,13 +16,10 @@ except ImportError:
|
||||
|
||||
class Handler(FileSystemEventHandler):
|
||||
|
||||
def __init__(self, consumer):
|
||||
self.consumer = consumer
|
||||
|
||||
def _consume(self, file):
|
||||
if os.path.isfile(file):
|
||||
try:
|
||||
self.consumer.try_consume_file(file)
|
||||
async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
|
||||
except Exception as e:
|
||||
# Catch all so that the consumer won't crash.
|
||||
logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
|
||||
@ -37,7 +34,7 @@ class Handler(FileSystemEventHandler):
|
||||
class Command(BaseCommand):
|
||||
"""
|
||||
On every iteration of an infinite loop, consume what we can from the
|
||||
consumption directory, and fetch any mail available.
|
||||
consumption directory.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
@ -45,12 +42,6 @@ class Command(BaseCommand):
|
||||
self.verbosity = 0
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
self.file_consumer = None
|
||||
self.mail_fetcher = None
|
||||
self.first_iteration = True
|
||||
|
||||
self.consumer = Consumer()
|
||||
|
||||
BaseCommand.__init__(self, *args, **kwargs)
|
||||
|
||||
def add_arguments(self, parser):
|
||||
@ -66,9 +57,6 @@ class Command(BaseCommand):
|
||||
self.verbosity = options["verbosity"]
|
||||
directory = options["directory"]
|
||||
|
||||
for d in (settings.ORIGINALS_DIR, settings.THUMBNAIL_DIR):
|
||||
os.makedirs(d, exist_ok=True)
|
||||
|
||||
logging.getLogger(__name__).info(
|
||||
"Starting document consumer at {}".format(
|
||||
directory
|
||||
@ -78,11 +66,16 @@ class Command(BaseCommand):
|
||||
# Consume all files as this is not done initially by the watchdog
|
||||
for entry in os.scandir(directory):
|
||||
if entry.is_file():
|
||||
self.consumer.try_consume_file(entry.path)
|
||||
async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
|
||||
|
||||
# Start the watchdog. Woof!
|
||||
observer = Observer()
|
||||
event_handler = Handler(self.consumer)
|
||||
if settings.CONSUMER_POLLING > 0:
|
||||
logging.getLogger(__name__).info('Using polling instead of file'
|
||||
'system notifications.')
|
||||
observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
|
||||
else:
|
||||
observer = Observer()
|
||||
event_handler = Handler()
|
||||
observer.schedule(event_handler, directory, recursive=True)
|
||||
observer.start()
|
||||
try:
|
||||
|
@ -9,13 +9,11 @@ from django_q.tasks import schedule
|
||||
def add_schedules(apps, schema_editor):
    """Register the periodic django-q tasks used by the documents app."""
    jobs = (
        ('documents.tasks.train_classifier',
         {"name": "Train the classifier", "schedule_type": Schedule.HOURLY}),
        ('documents.tasks.index_optimize',
         {"name": "Optimize the index", "schedule_type": Schedule.DAILY}),
        ('documents.tasks.consume_mail',
         {"name": "Check E-Mail", "schedule_type": Schedule.MINUTES,
          "minutes": 10}),
    )
    for func, options in jobs:
        schedule(func, **options)
|
||||
|
||||
|
||||
def remove_schedules(apps, schema_editor):
    """Delete the Schedule rows for the tasks registered by add_schedules."""
    scheduled_funcs = (
        'documents.tasks.train_classifier',
        'documents.tasks.index_optimize',
        'documents.tasks.consume_mail',
    )
    for func in scheduled_funcs:
        Schedule.objects.filter(func=func).delete()
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
@ -113,6 +113,7 @@ class DocumentType(MatchingModel):
|
||||
|
||||
class Document(models.Model):
|
||||
|
||||
# TODO: why do we need an explicit list
|
||||
TYPE_PDF = "pdf"
|
||||
TYPE_PNG = "png"
|
||||
TYPE_JPG = "jpg"
|
||||
@ -291,7 +292,7 @@ class FileInfo:
|
||||
non_separated_word=r"([\w,. ]|([^\s]-))"
|
||||
)
|
||||
)
|
||||
|
||||
# TODO: what is this used for
|
||||
formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
|
||||
REGEXES = OrderedDict([
|
||||
("created-correspondent-title-tags", re.compile(
|
||||
|
@ -41,15 +41,16 @@ def get_parser_class(doc):
|
||||
Determine the appropriate parser class based on the file
|
||||
"""
|
||||
|
||||
parsers = []
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parsers.append(response[1])
|
||||
|
||||
options = []
|
||||
for parser in parsers:
|
||||
result = parser(doc)
|
||||
if result:
|
||||
options.append(result)
|
||||
|
||||
# His last command was: COME! And they came. All of them. Even the parsers.
|
||||
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
parser_test = parser_declaration["test"]
|
||||
|
||||
if parser_test(doc):
|
||||
options.append(parser_declaration)
|
||||
|
||||
if not options:
|
||||
return None
|
||||
|
@ -6,14 +6,10 @@ from whoosh.writing import AsyncWriter
|
||||
from documents import index
|
||||
from documents.classifier import DocumentClassifier, \
|
||||
IncompatibleClassifierVersionError
|
||||
from documents.mail import MailFetcher
|
||||
from documents.consumer import Consumer, ConsumerError
|
||||
from documents.models import Document
|
||||
|
||||
|
||||
def consume_mail():
    """Run a single MailFetcher.pull() cycle."""
    fetcher = MailFetcher()
    fetcher.pull()
|
||||
|
||||
|
||||
def index_optimize():
    """Open the document index and run its optimize step."""
    search_index = index.open_index()
    search_index.optimize()
|
||||
|
||||
@ -54,3 +50,27 @@ def train_classifier():
|
||||
logging.getLogger(__name__).error(
|
||||
"Classifier error: " + str(e)
|
||||
)
|
||||
|
||||
|
||||
def consume_file(path,
                 override_filename=None,
                 override_title=None,
                 override_correspondent_id=None,
                 override_document_type_id=None,
                 override_tag_ids=None):
    """Consume the file at ``path`` with a fresh Consumer.

    All ``override_*`` arguments are passed through unchanged to
    ``Consumer.try_consume_file``.

    Returns:
        A success message containing the primary key of the new document.

    Raises:
        ConsumerError: if the consumer returned no document.
    """
    overrides = {
        "override_filename": override_filename,
        "override_title": override_title,
        "override_correspondent_id": override_correspondent_id,
        "override_document_type_id": override_document_type_id,
        "override_tag_ids": override_tag_ids,
    }
    document = Consumer().try_consume_file(path, **overrides)

    if not document:
        raise ConsumerError("Unknown error: Returned document was null, but "
                            "no error message was given.")

    return "Success. New document id {} created".format(
        document.pk
    )
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,208 +0,0 @@
|
||||
Return-Path: <sender@example.com>
|
||||
X-Original-To: sender@mailbox4.mailhost.com
|
||||
Delivered-To: sender@mailbox4.mailhost.com
|
||||
Received: from mx8.mailhost.com (mail8.mailhost.com [75.126.24.68])
|
||||
by mailbox4.mailhost.com (Postfix) with ESMTP id B62BD5498001
|
||||
for <sender@mailbox4.mailhost.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
|
||||
Received: from localhost (localhost.localdomain [127.0.0.1])
|
||||
by mx8.mailhost.com (Postfix) with ESMTP id B41796F190D
|
||||
for <sender@mailbox4.mailhost.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
|
||||
X-Spam-Flag: NO
|
||||
X-Spam-Score: 0
|
||||
X-Spam-Level:
|
||||
X-Spam-Status: No, score=0 tagged_above=-999 required=3
|
||||
tests=[RCVD_IN_DNSWL_NONE=-0.0001]
|
||||
Received: from mx8.mailhost.com ([127.0.0.1])
|
||||
by localhost (mail8.mailhost.com [127.0.0.1]) (amavisd-new, port 10024)
|
||||
with ESMTP id 3cj6d28FXsS3 for <sender@mailbox4.mailhost.com>;
|
||||
Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
|
||||
Received: from smtp.mailhost.com (smtp.mailhost.com [74.55.86.74])
|
||||
by mx8.mailhost.com (Postfix) with ESMTP id 527D76F1529
|
||||
for <paperless@example.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
|
||||
Received: from [10.114.0.19] (nl3x.mullvad.net [46.166.136.162])
|
||||
by smtp.mailhost.com (Postfix) with ESMTP id 9C52420C6FDA
|
||||
for <paperless@example.com>; Thu, 4 Feb 2016 22:01:16 +0000 (UTC)
|
||||
To: paperless@example.com
|
||||
From: Daniel Quinn <sender@example.com>
|
||||
Subject: Test 0
|
||||
Message-ID: <56B3CA2A.6030806@example.com>
|
||||
Date: Thu, 4 Feb 2016 22:01:14 +0000
|
||||
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101
|
||||
Thunderbird/38.5.0
|
||||
MIME-Version: 1.0
|
||||
Content-Type: multipart/mixed;
|
||||
boundary="------------090701020702030809070008"
|
||||
|
||||
This is a multi-part message in MIME format.
|
||||
--------------090701020702030809070008
|
||||
Content-Type: text/plain; charset=utf-8
|
||||
Content-Transfer-Encoding: 7bit
|
||||
|
||||
The secret word is "paperless" :-)
|
||||
|
||||
--------------090701020702030809070008
|
||||
Content-Type: application/pdf;
|
||||
name="test0.pdf"
|
||||
Content-Transfer-Encoding: base64
|
||||
Content-Disposition: attachment;
|
||||
filename="test0.pdf"
|
||||
|
||||
JVBERi0xLjQKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0
|
||||
ZURlY29kZT4+CnN0cmVhbQp4nFWLQQvCMAyF7/kVOQutSdeuHZSA0+3gbVDwIN6c3gR38e/b
|
||||
bF4kkPfyvReyjB94IyFVF7pgG0ze4TLDZYevLamzPKEvEFqbMEZfq+WO+5GRHZbHNROLy+So
|
||||
UfFi6g7/RyusEpUl9VsQxQTlHR2oV3wUEzOdhOnXG1aw/o1yK2cYCkww4RdbUCevCmVuZHN0
|
||||
cmVhbQplbmRvYmoKCjMgMCBvYmoKMTM5CmVuZG9iagoKNSAwIG9iago8PC9MZW5ndGggNiAw
|
||||
IFIvRmlsdGVyL0ZsYXRlRGVjb2RlL0xlbmd0aDEgMTA4MjQ+PgpzdHJlYW0KeJzlOWt0G9WZ
|
||||
95uRbNmWLckPWY4SaRTFedmybI8T4rw8sS3ZiZ1YfqWSCbFkS7YEtiQkJSE8GlNeOQ5pUmh5
|
||||
Zkt2l+XQNl3GhLaBpcWw0D19UGALLRRS0gM9nD0lxVBK9wCx97tXI0UJAc727L8d+c587/u9
|
||||
7p0rOZXYEyJaMkV4Io1OBuLOqmqBEPJLQqB0dG9K2NRTsQHhM4Rw/zkWH5+870e7PiRE9Rgh
|
||||
+Y+NT+wf+/b3e4YI0YYJKX41HAoEfxj6vUjIIgltrA0jYef8/nzEr0F8WXgydY2bP7QO8WOI
|
||||
SxOx0cDxxbUmxN9AfOlk4Jr4apWLI8SMKBGigcmQpYXrRBx9KtobjyVTQbJsgZDl91B+PBGK
|
||||
d9838hzipwjhjyIN8EMvLYJ5FOd4lTovX1NQWKQtLtGR/3eX+jCpIJ3qTURH4ux+wcWfIFXk
|
||||
XkIW3qXY+ft898LH/5deaNKPe8hD5DFymLxGrlAYbuIhEbIHKbnX0+QlpNLLQ4bId8n055g9
|
||||
QU4hPy3nJ0doJJe8PORucpL8xwWzeMgkuQ59+QF5DRrIz7BVYuQD0JAbyXNo9QOkbb+UKa4E
|
||||
b2MMHMuhvk7u5w6RbdzbiNxLOZyT05NnyTHYjZZTGOfhbMQbP2P0NnID3vtJmOxFmF3qTZ/+
|
||||
jhQs/AWjuoFsI18jW8hEjsaT8ABfiPUbIA9gTp9mNGeGmd/JX8n9kOPO3YnIN8g4jgBg7Nxh
|
||||
fsvnZOh/ffGDpBhW8dWk4FJcrono5j/mGhc+5JeRQjK4MJehLXQt/IUPzEdVw6rF6k2qX3zR
|
||||
HHnfUE2iNln44/x180H1DvVDWK2HcePouHzI5x0c6O/r9fTs2N7dtW1rZ4fb1d7WukVq2bxp
|
||||
44b1zesuW7umod5Z56hduWJ59TL7UpvVVG7Q60qKiwoLNPl5ahXPAakVZPC7ZL5aMLgDdpc9
|
||||
0OmoFVymcLuj1mV3+2UhIMj4UC23d3Yykj0gC35BXo6PQA7ZL0soOXaRpJSWlLKSoBc2ko10
|
||||
CrsgP99uF07BUK8X4cPtdp8gn2XwdgarljOkGBGbDTWYV9RbwSW794anXX70EWaKCtvsbaFC
|
||||
Ry2ZKSxCsAgheaU9PgMrNwMDuJWu9TMc0RTTaTFSVyAoe3q9rnazzeZz1G6VS+ztjEXamEk5
|
||||
r03OZyaFCHWdHBJmamenbz+lJyP+Gm3QHgzs8sp8AHWnedf09G2yoUZeZW+XV137tgkjD8m1
|
||||
9naXXEOtdvVl5+k6PyXI6mq9XZj+K8Fw7GffvZASUCh51fq/EgrKXJsMfV4bvcxuzPX0tNsu
|
||||
uKf904FTC1MjdkFvn57RaqfjLkw38XjRxKmFJw6ZZfftPlnvD8N6nxK6u69LLuu93Ctz1W4h
|
||||
HEAK/rXYbevMNkNWxvN5bIJpweRghm02moZDpyQygog81etN4wIZMT9KJGeNT+b8lDOb4VQM
|
||||
Us5UhpNV99uxtl393mlZVb01aHdhxg8F5KkR7K4raWHsernkI7PNPl1qEJqdPiYroFdbgxFB
|
||||
Vi/HJKFWrgL2DVWZ1jOk5KP046wZJ1huKBWa7WiG2nHZXX7lb2/YhAYETHRnTboRBryy1I6A
|
||||
FFAq5pqpd6JGwI8Fi7SzYspOe1wut7dmq0vdckX6vUxFUZPL22TiH1W0ZKeLrSvBNe1vT7tA
|
||||
bdl7vY8TceHMTJNgPimSJuJrp8LGNuyy5a5pb3BMtvrNQVx3Y4LXbJMlH1bYZ/eGfLTtMEOr
|
||||
zphZc/hYrwx4u/rtXb1D3nWKI2kGNaeqdl1kxu41p81gA8qaao3g5cy8DwX1SBDcCNhbN+Jd
|
||||
zq/W4NBjwhmVNm7rRsELZpKRRjfkVYIr1K7IUfwCo2raTm2dGWt5FEU7bZ1mm8+Wvhy1HLIF
|
||||
ZWLU0NCkdmZYuE0hQ4P92dbJSDSXJtr0gtcesvvsYUGWPF4aG00Py7KSDJZzpVYDF2A5ycI0
|
||||
ERuyMwhNpuyuMecmV+5geBbtvIi9NcMWpjX2rv5patyuGCTo+VaZ0BaW1hnMbC+gC9qOe6+g
|
||||
xyXNFvT0jCTRxRxeT43Ytwan7f3ejUwa95MbzNfSuUpJF3QNtDpqcWtrnbHDwd4ZCQ72D3kf
|
||||
1+O58OCA91EOuDZ/q29mGfK8jwv40mBUjlIpkSICRailPkQ0TN78uETIFOOqGIHho6eAMJom
|
||||
QwMyeopL0/TpiZaziSTCIUeV5kgZaRXSNGnaFKOxa4bQlEmFakkjFUharpgzzwAlPYqUJ/Ac
|
||||
WwDkpBaKwTyDWn2MfAqmZgokc1piCiWktIcHB89PPTjkPanFt7OZ3XGiVnphu5jCWGx8rbiE
|
||||
IG2U633hab+PLjZixNLgH8hg34xlsm9GR/K0cqE91CoX2VspvYXSW9L0PErPxxYFI6D6FNbe
|
||||
IwPtgMu9NlySwqKfmaf1Z2mlfLipTOv/6MCMVeP3hqfxDFoOG6XTpVwRp+ErjFqigQJeoykw
|
||||
8AW831fAl3KEG/aR0hYj6IxwxghPGeGIEQ4YYdgISBQY/ao5I7xghOOMFzdCjxGsjJGmy0Z4
|
||||
gLFiTE0yQj0TIEZ4k3GnGL2eUTYssHnSakcYo4fx5hhdzsyRVhCYzhwzNMummWJcdM2ZmeOK
|
||||
7HV15koo1+6L6J/hUB5pqTEQ0cTuBtHkHN59hWgohcpmg9hQb1tzmcG+VAd2g81gX1EHNWCo
|
||||
rIANr4jnrjC3qY61my0/v6bhlTVm1d3lL8GG+edeyi/65CrzGnqgAlKOJ7c/4neCJeQJaT8p
|
||||
L68qLikpqCqwWJcs8viWkHJEKqs8Pm1lRRnHqdWGPp9af9wKZ6wwawW9FYgVmhE5aoW4FfxW
|
||||
8FhBskK9FQQrWBkbWVMZLrJeZJqyFY7n0HOTk0hckAAldoy6RaSAyNJQCs0Ye/rTUA/l+ZtB
|
||||
bDRWYOA0G032pfkKuGKNDdz5nT9qufb6xPxVNzy0+6YD88F9t0Mj/1G4btXGr9927q4qh6OK
|
||||
231iybkyCqk5kwMXTg2eT0vV3aQIvy39gzRGtNo8g6HSyBf0+wgPep6vkCpKPb4KndagM3h8
|
||||
uorySlBVQvOHlXC0Erh4JfgrwVMJUiXMVoJcCccZKlSCvhJIJcwxCormSl7YIzQFwywL2fKT
|
||||
RSb9r7D4LAEGUQk+z750+ZqmtZgA/nzQ10mOWkmqdUiF/zhfdfwWqFG9mcalT9bTOHmhiq7B
|
||||
gYV3uV/zz5GVxCc12fLLFxVjS6xaXWzjKystHp+5Us8XeXz5vHFqNcRXg381eFaDsBoeWQ3D
|
||||
q6FnNWT8JVgewmpUSrA26QKhg1kPV6wRK41i45omJ9RxzN3KCvuK5faleRXlxkoLz/165vvu
|
||||
79Q7GrqueeZeX2hX43eOjt/vXL0m0Tu4fcedQy120Nx+dEnpOze1P3Rt0xJb+6j7+iPW5yed
|
||||
nvbmHYsa69p20q8ZpHPhXf5q/mlixt1lUmoxaKqrVYJWW6Xi8di/tHBpr89UYTAsxooZrAZO
|
||||
yxsMRFNozFdhjBWkwuMj+qkVMLwCpBWAwBVYBEw+MbEhljY708knzawn0yvQoESp9N8KDNbQ
|
||||
tBlaYE3TcrYu16yF/BKoKBcb114GL933jT3z82WJmfe3Hr/ncMe2YP/Sdf8E5KZbh4+0jzby
|
||||
T3/1a+duqXLsToBp93VbeNWdgV3OPc/b5y0q9e6obDWxNYs1c6huJEbSIa0oLCnJL+P5SpNK
|
||||
W6T1+Aryi3S4pg29PmJ8wASyCVpM4DTRMiUybSSKivfNpc2NjbSH1NhABvuaFhArxAq7oRzr
|
||||
dFlFCcAO//B1N4RafvvbDfXr++03lyfGuTsdK155ZeDcgS2t+i0mK8u5B3Puxh6qIIvJYWmo
|
||||
CkC3SFOhq1hiqSKY6CprFSa6qkpbWmr0+Er1WnWvT2uctYBsgeMWOGqBKQvELeC3gMcCxAKb
|
||||
8SFZoN4CggX0FphjciiU2R2yO+MVSnFoRUzOzMJINx5bGxXlFqBpx2CwBQ3YdYKhArDlbE3L
|
||||
QbXpwPjab9bX/8vO13/xq6cgMn93OAZ37ILXSqfv9ZQWrbPWvQvqjz6YH+uDYw8/ePJeGus2
|
||||
jPUd3C/LcMecknrKVUWkqkqv0lusZXqPrwz3A4yY5GOD5eurUIGr7PVxRtwGO3J3RsI2wSlG
|
||||
SQN+RldWvxLk+Z0v04HnNz4WXnWeXTA0leJKWr4JcNHT9gNWPMNyu8D9+uq75w/87uWJWN63
|
||||
oT01/9/z1qmbrx7yJeY/dQ/BH/4GUGm75UOT4+PHqxzw/E/+bQX3joHVcwfG+CjWsxA77Anp
|
||||
RoO6iKhJpUlT4vFp9Fy5BwMSTEBMcMYEHhPUm0BvgjmGvmiCWdZ1x01w1ARTJoibwG8CyQRp
|
||||
lQ0PMJKHkeoZVc8YufrHmWZaDe9XfO6bMbtdZpdpNkFYfL0tsy/mNyn7DPYC/+h858uvvvrG
|
||||
b3732FdvvWnPvhtvnoLX5w3z7//507/95dVnnjjz1o+fTb8baR52YB6MxC9txCwY1UbMgg7f
|
||||
hhq9sZwv7/XxRvR8c24kcyyGdABIf8QEw3TxZd3fnd3MxVxfq7E/BQPbFA10UxTSa5Df0XBi
|
||||
aP6y/3rttuOX1fSn5j/85+/dMdG8bBW8/6dz1vmPH3LOh1/+gY36akZfT/Mn0NdvScOktFil
|
||||
KigtqDSpy4xl2IpGnQqPpX2+Yr1RW4D+Vxxn2Z7NJL/5TE49CCtgtm5yJpw0RTBBbtpzX9NE
|
||||
eUUrj5yXNH0H0K5UenQFXY1VtGOh+fj1E18Hcd/8nzUdT7TMXQMW0J6wcu9UOT69r8rRvaIZ
|
||||
yrkxfFPRGPGdnFeF9WiAR6UFgzZv8WIbWbnS4bBpebGxoc7ja9CttC02aB01Do/PqqupqMrL
|
||||
Kygo7/MV6FfgMYev7vPx+r0i7BRhrQjLRDCKkCfCRyK8LcLLIvxUhAdFuEuEERHAI0K7CPVM
|
||||
rlwElQjhuYzgYyKkRJBEaGJs5H0owusizIogMxs3ixAUFRNpGX1G7EURnhXheyIcZWJXibBB
|
||||
BCEzx7r0BMdF8IswkJmjnGm+zTS/KcIUTi/V5PDNTPdt5gAnM4E4mx5n1YmgUdbL8BcfMy88
|
||||
heYcxM6r5wjlbE6Z45lyPsuc0CqzJzTWAOyEVknvVZA9ppVw+edPbcsvOrZ1PSy59izZ/kL7
|
||||
3P75wduPL3K5WioMh+dbDw0Oem86PL9z3z4o4/0165uaa1rn/6Qc5LwnNIXFqrVbMmi/b8m5
|
||||
quyBh/WRE5vhD9hHi8msdAMpKzMVabX5pvwllsV40l2sK0PEaPL4Co0VpbRt9LRtHrTA2xZ4
|
||||
1gL4QlFZoBmRb1ogZYGgBQYs0G6BJgsss4CZsfHNxuW+1/Bt9qIFsq+8LD03o8N/18n3wnPv
|
||||
RRls3/6v69Pn3t7BITz4Xnn11aDl/bXN2WOvt39YOfcq58HbFt6C/eQVPPeapCKSl6ct5gvu
|
||||
v5wvIy3KmRP3qpwDJ+x3NTW53KLo3tXQ2dkgut3s/y30Pzblq28Z1m38K2dN/9b/yzuXdJ7/
|
||||
JXfhrbwqNf0FXJMloV6+bd5FvpJLueDS5zXjN8a3SLWKkHKumdTwS8gAR397Pkw6ES/Hpwd5
|
||||
23DsQHgHPs2oU4NPJ0eUX9KfgR3wDLcaP8e4t/kh/pcqj+ohtSlvY97P895VZtWTRhoDi0SP
|
||||
/bILgX/nf0p4xrVANOvbzqyfgJI7FZgj+WRMgXk8i04qsAplDiqwmpSQexQ4j+jIQwqcT64l
|
||||
P1BgDX43dipwASmBNgUuhCj0KnARWcw9lf0vVx33ugIXkzV8gQKXkEX8Zuq9iv46f4L3KjAQ
|
||||
QaVSYI6UqJYpME/WqhoVWIUyYQVWk8WqgwqcRyyqBxU4n3yoekaBNWSl+ocKXEAWq3+vwIXc
|
||||
G+qPFbiIrNP8RoG1ZFdBiQIXkysLrlTgEtJU8HJ7ZDySilwbCgrBQCogjMbi+xOR8XBKWDm6
|
||||
Smisb6gXOmKx8YmQ0BZLxGOJQCoSi9YVtl0s1ij0oYnOQKpW2BodreuOjITSskJ/KBEZ6wuN
|
||||
75kIJLYkR0PRYCghOISLJS7Gd4YSSYo01tXX1zWc514sHEkKASGVCARDk4HEVUJs7EJHhERo
|
||||
PJJMhRJIjESFwbr+OsETSIWiKSEQDQoDWcWesbHIaIgRR0OJVACFY6kwunrlnkQkGYyM0tmS
|
||||
ddkIctLRnwrtDQnbA6lUKBmLtgaSOBd6NhCJxpK1wr5wZDQs7AskhWAoGRmPInNkv3ChjoDc
|
||||
AMYSjcb2osm9oVr0eywRSoYj0XEhSUNWtIVUOJCiQU+GUonIaGBiYj/WbDKOWiNYpH2RVBgn
|
||||
ngwlhR2hfUJfbDIQ/W5d2hXMzRgmVYhMxhOxvcxHR3I0EQpFcbJAMDASmYik0Fo4kAiMYsYw
|
||||
bZHRJMsIJkKIB6IO155ELB5CT7/S0X1eEB1MZzMZm9iLM1PpaCgUpDOi23tDE6iEE0/EYlfR
|
||||
eMZiCXQ0mAo7cjwfi0VTqBoTAsEgBo7Zio3umaR1wjSnMs4FRhMx5MUnAim0MpmsC6dS8fVO
|
||||
5759++oCSmlGsTJ1aNn5RbzU/nhIqUeCWpmc6MbyR2np9rD60iD6t3YLPXHMjxudExSBWiHT
|
||||
mg11DcoUmMZIPJWsS0Ym6mKJcWePu5u0kwgZx5HCcS0JkSARcAQQDyA0SmIkTvaTBJMKI1Ug
|
||||
K5G6Cp+NpJ404BBIB0rFkD+B+gJpQziBWvQeYHZjJErq8FtE25daa0SoT/Gik2nXIrQV9UfR
|
||||
QjfqjSA3165A+hklgvss1Rwne9CPAFK2kCRqhVAmyCQE4sDxZTa+jL+TQckspxH9qsdPHXp/
|
||||
Kd0vsxxBWwLLdYpxqK+TzP+rkBZDvS/KiIByIVa/JHJCDAsyq9T2IEr0MykP06S5SLHZokxq
|
||||
4BIz9uCMY6g/ymqZkRxltmlPpC3HEA4rWb0SM55gHgSZXia2JM782Rpcujv6mXd72ZzbGZ3i
|
||||
ScZrRTypxJXO2QDzIoZUmot96AmdN8zgAMtnkGnTLosqmiPYd8IXziMougGlLlE2x17FS6pT
|
||||
q+R7jN2TbN4oziEw/9JVvnBugeUpwLKervQkclNMdhTpE/jZr6yzScxKeq4RZSXtY+syrEQ8
|
||||
yewKZAc+97GuiLG6RW1LWY3PZyXdN2NKpwpMN45wjEWRyaOD1YZGEmKeUijA1v4IakywudO+
|
||||
hVl3BFhtQ0qtUyyCTL6CSqTU6zijOIiL9QVd8SElp1/BnaL7khbTGcztTVqTCeZvMsd2lHkb
|
||||
zMaYzjaVmlBmSkc8wXakq7L1GWP9ls5okFlzfE7Ox1huUsqsMeZRED/piqd7K4a6e1g90usp
|
||||
3c2pz2QuwPIbU/TibF9KKb5MsvURZh0YJ+vxbOlE7+injvVh7qoZVdZMneKz8+/Wo37FWQZz
|
||||
10ci68sk+titrP5odtXtyVm/mUr04x7UzfaLuNI/biVzwkUW6Kq5eNdsYPvlhVGkuzGCeIr5
|
||||
k2S5rGMxjCO/B2foZufo9DcHG/p0iWumwLNlBEIEIAzjpIxYwU92wDAZhC1kE0j4lJDXis82
|
||||
xOmzDjaRKZTbhPTNiG9E+gbcPK14b8HRg+MIDhWOtEQ9Sjjx6VRwB+K1qPEC3oENSm1BKn1u
|
||||
Q7wTnx3K0410Fz5dCr4VcXwSP+TjQbyF3Z8ClXQSzpyDF86BcA4OfAKeT2Dqg6MfcO/PrbI+
|
||||
MvfUHNfz3vB7j7zH178HuvdAQ87qz3rO+s/Gzx4/m1eoexe05E9geOvMOuubm04P/n7TG4Pk
|
||||
NEZ2uv605/TUafm0+jTwg2/wRqt+Vpitn43PTs2+OHtmdm5WM/WToz/hfvyk06p70vokZz3Z
|
||||
c/LASd7/MOgetj7Mee73388dPQa6Y9ZjzmP8fffWWe/tsFjvvmuF9cxdc3dxpxZmT95VbHA/
|
||||
CT3QTTZhDnec5Besj2ypgO0Ylg7vVhxOHD04YjiO4MDvPShuxeGEbmkdP/wtKLrDfEfNHdfd
|
||||
cegOdfzWqVuP3spP3XL0Fu6RvU/t5ZKeVdZYtMYa7VhtrRJNg/kiP5iH0+Ds0taR6pVu/7Bk
|
||||
HUahy4fqrUMdq6xlYumgGgNWoaCOt/ItfA8f44/wT/H5mj6PxdqL44xnzsNJngKtW9dj7XH2
|
||||
8KcWzkihLhta2xbfNrWN3+peZe3sWGfVdVg7nB0vdLzZ8V5H3nAHPIB/7kfcT7l5yb3K6Zbc
|
||||
Fpt7cad50ChWDBpAN6gXdYMcYKFFMujULeg4nW5Yd0DH60gL4aaMoIZTcHRmoL+mputU/kJf
|
||||
l6zxXC7DQbm6n96l3iE576BMBocu984AfN13y+HDpHVJl9zY75X9S3xdchABiQJTCOiXzBhJ
|
||||
qy+ZTNWwC2pqEN6Dd1KzpwaJu5NpKsnySU0SkrhHJZkS1FCBNA54r6E8JFA9QO3dSUJvlFmT
|
||||
VqLaScUcU07fGGDa/T/LhW2oCmVuZHN0cmVhbQplbmRvYmoKCjYgMCBvYmoKNjI5MQplbmRv
|
||||
YmoKCjcgMCBvYmoKPDwvVHlwZS9Gb250RGVzY3JpcHRvci9Gb250TmFtZS9CQUFBQUErTGli
|
||||
ZXJhdGlvblNlcmlmCi9GbGFncyA0Ci9Gb250QkJveFstNTQzIC0zMDMgMTI3NyA5ODFdL0l0
|
||||
YWxpY0FuZ2xlIDAKL0FzY2VudCA4OTEKL0Rlc2NlbnQgLTIxNgovQ2FwSGVpZ2h0IDk4MQov
|
||||
U3RlbVYgODAKL0ZvbnRGaWxlMiA1IDAgUgo+PgplbmRvYmoKCjggMCBvYmoKPDwvTGVuZ3Ro
|
||||
IDI5Mi9GaWx0ZXIvRmxhdGVEZWNvZGU+PgpzdHJlYW0KeJxdkctuwyAQRfd8Bct0EfmROA/J
|
||||
spQmseRFH6rbD3BgnCLVGGGy8N+XmUlbqQvQmZl7BxiSY3NqrAnJqx9VC0H2xmoP03jzCuQF
|
||||
rsaKLJfaqHCPaFdD50QSve08BRga249lKZK3WJuCn+XioMcLPIjkxWvwxl7l4uPYxri9OfcF
|
||||
A9ggU1FVUkMf+zx17rkbICHXstGxbMK8jJY/wfvsQOYUZ3wVNWqYXKfAd/YKokzTSpZ1XQmw
|
||||
+l8tK9hy6dVn56M0i9I0LdZV5Jx4s0NeMe+R18TbFXJBnKfIG9ZkyFvWUJ8d5wvkPTPlD8w1
|
||||
8iMz9Tyyl/Qnzp+Qz8xn5JrPPdOj7rfH5+H8f8Ym1c37ODL6JJoVTslY+P1HNzp00foG7l+O
|
||||
gwplbmRzdHJlYW0KZW5kb2JqCgo5IDAgb2JqCjw8L1R5cGUvRm9udC9TdWJ0eXBlL1RydWVU
|
||||
eXBlL0Jhc2VGb250L0JBQUFBQStMaWJlcmF0aW9uU2VyaWYKL0ZpcnN0Q2hhciAwCi9MYXN0
|
||||
Q2hhciAxNQovV2lkdGhzWzc3NyA2MTAgNTAwIDI3NyAzODkgMjUwIDQ0MyAyNzcgNDQzIDUw
|
||||
MCA1MDAgNDQzIDUwMCA3NzcgNTAwIDI1MApdCi9Gb250RGVzY3JpcHRvciA3IDAgUgovVG9V
|
||||
bmljb2RlIDggMCBSCj4+CmVuZG9iagoKMTAgMCBvYmoKPDwvRjEgOSAwIFIKPj4KZW5kb2Jq
|
||||
CgoxMSAwIG9iago8PC9Gb250IDEwIDAgUgovUHJvY1NldFsvUERGL1RleHRdCj4+CmVuZG9i
|
||||
agoKMSAwIG9iago8PC9UeXBlL1BhZ2UvUGFyZW50IDQgMCBSL1Jlc291cmNlcyAxMSAwIFIv
|
||||
TWVkaWFCb3hbMCAwIDU5NSA4NDJdL0dyb3VwPDwvUy9UcmFuc3BhcmVuY3kvQ1MvRGV2aWNl
|
||||
UkdCL0kgdHJ1ZT4+L0NvbnRlbnRzIDIgMCBSPj4KZW5kb2JqCgo0IDAgb2JqCjw8L1R5cGUv
|
||||
UGFnZXMKL1Jlc291cmNlcyAxMSAwIFIKL01lZGlhQm94WyAwIDAgNTk1IDg0MiBdCi9LaWRz
|
||||
WyAxIDAgUiBdCi9Db3VudCAxPj4KZW5kb2JqCgoxMiAwIG9iago8PC9UeXBlL0NhdGFsb2cv
|
||||
UGFnZXMgNCAwIFIKL09wZW5BY3Rpb25bMSAwIFIgL1hZWiBudWxsIG51bGwgMF0KL0xhbmco
|
||||
ZW4tR0IpCj4+CmVuZG9iagoKMTMgMCBvYmoKPDwvQ3JlYXRvcjxGRUZGMDA1NzAwNzIwMDY5
|
||||
MDA3NDAwNjUwMDcyPgovUHJvZHVjZXI8RkVGRjAwNEMwMDY5MDA2MjAwNzIwMDY1MDA0RjAw
|
||||
NjYwMDY2MDA2OTAwNjMwMDY1MDAyMDAwMzUwMDJFMDAzMD4KL0NyZWF0aW9uRGF0ZShEOjIw
|
||||
MTYwMjA0MjIwMDAyWicpPj4KZW5kb2JqCgp4cmVmCjAgMTQKMDAwMDAwMDAwMCA2NTUzNSBm
|
||||
IAowMDAwMDA3NTA5IDAwMDAwIG4gCjAwMDAwMDAwMTkgMDAwMDAgbiAKMDAwMDAwMDIyOSAw
|
||||
MDAwMCBuIAowMDAwMDA3NjUyIDAwMDAwIG4gCjAwMDAwMDAyNDkgMDAwMDAgbiAKMDAwMDAw
|
||||
NjYyNSAwMDAwMCBuIAowMDAwMDA2NjQ2IDAwMDAwIG4gCjAwMDAwMDY4NDEgMDAwMDAgbiAK
|
||||
MDAwMDAwNzIwMiAwMDAwMCBuIAowMDAwMDA3NDIyIDAwMDAwIG4gCjAwMDAwMDc0NTQgMDAw
|
||||
MDAgbiAKMDAwMDAwNzc1MSAwMDAwMCBuIAowMDAwMDA3ODQ4IDAwMDAwIG4gCnRyYWlsZXIK
|
||||
PDwvU2l6ZSAxNC9Sb290IDEyIDAgUgovSW5mbyAxMyAwIFIKL0lEIFsgPDRFN0ZCMEZCMjA4
|
||||
ODBCNURBQkIzQTNEOTQxNDlBRTQ3Pgo8NEU3RkIwRkIyMDg4MEI1REFCQjNBM0Q5NDE0OUFF
|
||||
NDc+IF0KL0RvY0NoZWNrc3VtIC8yQTY0RDMzNzRFQTVEODMwNTRDNEI2RDFEMUY4QzU1RQo+
|
||||
PgpzdGFydHhyZWYKODAxOAolJUVPRgo=
|
||||
--------------090701020702030809070008--
|
218
src/documents/tests/test_api.py
Normal file
218
src/documents/tests/test_api.py
Normal file
@ -0,0 +1,218 @@
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from unittest import mock
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
from django.test import override_settings
|
||||
from rest_framework.test import APITestCase, APIClient
|
||||
|
||||
from documents.models import Document, Correspondent, DocumentType, Tag
|
||||
|
||||
|
||||
class DocumentApiTest(APITestCase):
|
||||
|
||||
def setUp(self):
    """Prepare temporary directories, settings and a logged-in superuser.

    Creates temporary scratch and media directories (with the originals
    and thumbnails sub-directories), overrides the matching settings for
    the duration of the test, and force-logs a superuser into the client.
    """
    self.scratch_dir = tempfile.mkdtemp()
    self.media_dir = tempfile.mkdtemp()
    self.originals_dir = os.path.join(
        self.media_dir, "documents", "originals")
    self.thumbnail_dir = os.path.join(
        self.media_dir, "documents", "thumbnails")

    os.makedirs(self.originals_dir, exist_ok=True)
    os.makedirs(self.thumbnail_dir, exist_ok=True)

    settings_override = override_settings(
        SCRATCH_DIR=self.scratch_dir,
        MEDIA_ROOT=self.media_dir,
        ORIGINALS_DIR=self.originals_dir,
        THUMBNAIL_DIR=self.thumbnail_dir
    )
    settings_override.enable()
    # Undo the override after the test; otherwise the settings would keep
    # pointing at directories that have already been deleted.
    self.addCleanup(settings_override.disable)

    user = User.objects.create_superuser(username="temp_admin")
    self.client.force_login(user=user)
|
||||
|
||||
def tearDown(self):
    """Remove the temporary scratch and media directories, ignoring errors."""
    for directory in (self.scratch_dir, self.media_dir):
        shutil.rmtree(directory, ignore_errors=True)
|
||||
|
||||
def testDocuments(self):
    """List, update (PUT) and delete a single document through the API."""
    # The listing is empty before anything has been created.
    empty_listing = self.client.get("/api/documents/").data
    self.assertEqual(empty_listing['count'], 0)

    correspondent = Correspondent.objects.create(name="c", pk=41)
    document_type = DocumentType.objects.create(name="dt", pk=63)
    tag = Tag.objects.create(name="t", pk=85)

    doc = Document.objects.create(
        title="WOW",
        content="the content",
        correspondent=correspondent,
        document_type=document_type,
        checksum="123",
    )
    doc.tags.add(tag)

    listing = self.client.get("/api/documents/", format='json')
    self.assertEqual(listing.status_code, 200)
    self.assertEqual(listing.data['count'], 1)

    payload = listing.data['results'][0]
    self.assertEqual(payload['id'], doc.id)
    self.assertEqual(payload['title'], doc.title)
    self.assertEqual(payload['correspondent']['name'], correspondent.name)
    self.assertEqual(payload['document_type']['name'], document_type.name)
    self.assertEqual(payload['correspondent']['id'], correspondent.id)
    self.assertEqual(payload['document_type']['id'], document_type.id)
    # The nested objects and the flat *_id fields must agree.
    self.assertEqual(
        payload['correspondent']['id'], payload['correspondent_id'])
    self.assertEqual(
        payload['document_type']['id'], payload['document_type_id'])
    self.assertEqual(len(payload['tags']), 1)
    self.assertEqual(payload['tags'][0]['name'], tag.name)
    self.assertEqual(payload['tags'][0]['id'], tag.id)
    self.assertListEqual(payload['tags_id'], [tag.id])

    # Send the modified payload back to change correspondent and title.
    replacement = Correspondent.objects.create(name="c2")
    payload['correspondent_id'] = replacement.pk
    payload['title'] = "the new title"

    update = self.client.put(
        '/api/documents/{}/'.format(doc.pk), payload, format='json')
    self.assertEqual(update.status_code, 200)

    saved = Document.objects.get(id=doc.id)
    self.assertEqual(saved.correspondent, replacement)
    self.assertEqual(saved.title, "the new title")

    self.client.delete("/api/documents/{}/".format(saved.pk))
    self.assertEqual(Document.objects.count(), 0)
|
||||
|
||||
def test_document_actions(self):
    """Download, preview and thumb endpoints return the stored bytes."""
    content = b"This is a test"
    content_thumbnail = b"thumbnail content"

    # mkstemp returns an already-open descriptor; wrapping it in a file
    # object makes sure it is closed instead of leaked.
    fd, filename = tempfile.mkstemp(dir=self.originals_dir)
    with os.fdopen(fd, "wb") as f:
        f.write(content)

    doc = Document.objects.create(
        title="none",
        filename=os.path.basename(filename),
        file_type="pdf",
    )

    thumbnail_path = os.path.join(
        self.thumbnail_dir, "{:07d}.png".format(doc.pk))
    with open(thumbnail_path, "wb") as f:
        f.write(content_thumbnail)

    response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.content, content)

    response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))
    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.content, content)

    response = self.client.get('/api/documents/{}/thumb/'.format(doc.pk))
    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.content, content_thumbnail)
|
||||
|
||||
def test_document_actions_not_existing_file(self):
    """File endpoints answer 404 when the document's file is missing."""
    doc = Document.objects.create(
        title="none",
        filename=os.path.basename("asd"),
        file_type="pdf",
    )

    endpoints = (
        '/api/documents/{}/download/',
        '/api/documents/{}/preview/',
        '/api/documents/{}/thumb/',
    )
    for endpoint in endpoints:
        response = self.client.get(endpoint.format(doc.pk))
        self.assertEqual(response.status_code, 404)
|
||||
|
||||
def test_document_filters(self):
    """Exercise the inbox and tag-id filters of the document listing."""

    def fetch_results(url):
        # Every query in this test must succeed with HTTP 200.
        response = self.client.get(url)
        self.assertEqual(response.status_code, 200)
        return response.data['results']

    doc1 = Document.objects.create(title="none1", checksum="A")
    doc2 = Document.objects.create(title="none2", checksum="B")
    doc3 = Document.objects.create(title="none3", checksum="C")

    tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
    tag_2 = Tag.objects.create(name="t2")
    tag_3 = Tag.objects.create(name="t3")

    doc1.tags.add(tag_inbox)
    doc2.tags.add(tag_2)
    doc3.tags.add(tag_2)
    doc3.tags.add(tag_3)

    # Only doc1 carries the inbox tag.
    results = fetch_results("/api/documents/?is_in_inbox=true")
    self.assertEqual(len(results), 1)
    self.assertEqual(results[0]['id'], doc1.id)

    results = fetch_results("/api/documents/?is_in_inbox=false")
    self.assertEqual(len(results), 2)
    self.assertEqual(results[0]['id'], doc2.id)
    self.assertEqual(results[1]['id'], doc3.id)

    # "in": documents having at least one of the given tags.
    results = fetch_results(
        "/api/documents/?tags__id__in={},{}".format(tag_inbox.id, tag_3.id))
    self.assertEqual(len(results), 2)
    self.assertEqual(results[0]['id'], doc1.id)
    self.assertEqual(results[1]['id'], doc3.id)

    # "all": documents having every one of the given tags.
    results = fetch_results(
        "/api/documents/?tags__id__all={},{}".format(tag_2.id, tag_3.id))
    self.assertEqual(len(results), 1)
    self.assertEqual(results[0]['id'], doc3.id)

    results = fetch_results(
        "/api/documents/?tags__id__all={},{}".format(tag_inbox.id, tag_3.id))
    self.assertEqual(len(results), 0)

    # A malformed id list is expected to return all three documents.
    results = fetch_results(
        "/api/documents/?tags__id__all={}a{}".format(tag_inbox.id, tag_3.id))
    self.assertEqual(len(results), 3)
|
||||
|
||||
@mock.patch("documents.index.autocomplete")
def test_search_autocomplete(self, m):
    """Check status codes and result counts of the autocomplete endpoint."""

    def fake_autocomplete(ix, term, limit):
        # Echo the term `limit` times so the response length shows which
        # limit was passed to the index.
        return [term] * limit

    m.side_effect = fake_autocomplete

    # (url, expected status, expected number of results or None)
    cases = (
        ("/api/search/autocomplete/?term=test", 200, 10),
        ("/api/search/autocomplete/?term=test&limit=20", 200, 20),
        ("/api/search/autocomplete/?term=test&limit=-1", 400, None),
        ("/api/search/autocomplete/", 400, None),
        ("/api/search/autocomplete/?term=", 200, 10),
    )
    for url, expected_status, expected_count in cases:
        response = self.client.get(url)
        self.assertEqual(response.status_code, expected_status)
        if expected_count is not None:
            self.assertEqual(len(response.data), expected_count)
|
||||
|
||||
def test_statistics(self):
    """Statistics report three documents in total and one in the inbox."""
    documents = [
        Document.objects.create(title=title, checksum=checksum)
        for title, checksum in (("none1", "A"), ("none2", "B"), ("none3", "C"))
    ]

    tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
    # Only the first document is placed in the inbox.
    documents[0].tags.add(tag_inbox)

    response = self.client.get("/api/statistics/")
    self.assertEqual(response.status_code, 200)
    stats = response.data
    self.assertEqual(stats['documents_total'], 3)
    self.assertEqual(stats['documents_inbox'], 1)
|
85
src/documents/tests/test_classifier.py
Normal file
85
src/documents/tests/test_classifier.py
Normal file
@ -0,0 +1,85 @@
|
||||
import tempfile
|
||||
|
||||
from django.test import TestCase, override_settings
|
||||
|
||||
from documents.classifier import DocumentClassifier
|
||||
from documents.models import Correspondent, Document, Tag, DocumentType
|
||||
|
||||
|
||||
class TestClassifier(TestCase):
|
||||
|
||||
def setUp(self):
    """Give every test its own untrained ``DocumentClassifier``."""
    classifier = DocumentClassifier()
    self.classifier = classifier
|
||||
|
||||
def generate_test_data(self):
    """Create the correspondents, tags, type and documents used for training.

    Stores the created objects on ``self`` (``c1``, ``c2``, ``t1``-``t3``,
    ``dt``, ``doc1``, ``doc2``, ``doc_inbox``) for use in assertions.
    """
    self.c1 = Correspondent.objects.create(
        name="c1",
        matching_algorithm=Correspondent.MATCH_AUTO,
    )
    self.c2 = Correspondent.objects.create(name="c2")
    self.t1 = Tag.objects.create(
        name="t1",
        matching_algorithm=Tag.MATCH_AUTO,
        pk=12,
    )
    self.t2 = Tag.objects.create(
        name="t2",
        matching_algorithm=Tag.MATCH_ANY,
        pk=34,
        is_inbox_tag=True,
    )
    self.t3 = Tag.objects.create(
        name="t3",
        matching_algorithm=Tag.MATCH_AUTO,
        pk=45,
    )
    self.dt = DocumentType.objects.create(
        name="dt",
        matching_algorithm=DocumentType.MATCH_AUTO,
    )

    self.doc1 = Document.objects.create(
        title="doc1",
        content="this is a document from c1",
        correspondent=self.c1,
        checksum="A",
        document_type=self.dt,
    )
    self.doc2 = Document.objects.create(
        title="doc1",
        content="this is another document, but from c2",
        correspondent=self.c2,
        checksum="B",
    )
    self.doc_inbox = Document.objects.create(
        title="doc235",
        content="aa",
        checksum="C",
    )

    self.doc1.tags.add(self.t1)
    self.doc2.tags.add(self.t1, self.t3)
    self.doc_inbox.tags.add(self.t2)
|
||||
|
||||
def testNoTrainingData(self):
    """Training without any documents raises a descriptive ValueError."""
    # assertRaises fails the test by itself when no ValueError is raised,
    # replacing the manual try/except/else bookkeeping.
    with self.assertRaises(ValueError) as raised:
        self.classifier.train()

    self.assertEqual(str(raised.exception), "No training data available.")
|
||||
|
||||
def testEmpty(self):
    """With one unlabelled document, no sub-classifier is built."""
    Document.objects.create(title="WOW", checksum="3457", content="ASD")
    self.classifier.train()

    for attribute in ("document_type_classifier",
                      "tags_classifier",
                      "correspondent_classifier"):
        self.assertIsNone(getattr(self.classifier, attribute))

    # Predictions fall back to "nothing" when no classifier exists.
    predicted_tags = self.classifier.predict_tags("")
    self.assertListEqual(predicted_tags, [])
    predicted_type = self.classifier.predict_document_type("")
    self.assertIsNone(predicted_type)
    predicted_correspondent = self.classifier.predict_correspondent("")
    self.assertIsNone(predicted_correspondent)
|
||||
|
||||
def testTrain(self):
    """Training records the expected correspondent and tag classes."""
    self.generate_test_data()
    self.classifier.train()

    correspondent_classes = list(
        self.classifier.correspondent_classifier.classes_)
    self.assertListEqual(correspondent_classes, [-1, self.c1.pk])

    tag_classes = list(self.classifier.tags_binarizer.classes_)
    self.assertListEqual(tag_classes, [self.t1.pk, self.t3.pk])
|
||||
|
||||
def testPredict(self):
    """Predictions for the training documents match their labels."""
    self.generate_test_data()
    self.classifier.train()

    classifier = self.classifier
    first_text = self.doc1.content
    second_text = self.doc2.content

    self.assertEqual(
        classifier.predict_correspondent(first_text), self.c1.pk)
    self.assertEqual(
        classifier.predict_correspondent(second_text), None)
    self.assertTupleEqual(
        classifier.predict_tags(first_text), (self.t1.pk,))
    self.assertTupleEqual(
        classifier.predict_tags(second_text), (self.t1.pk, self.t3.pk))
    self.assertEqual(
        classifier.predict_document_type(first_text), self.dt.pk)
    self.assertEqual(
        classifier.predict_document_type(second_text), None)
|
||||
|
||||
def testDatasetHashing(self):
    """``train()`` is truthy on first run and falsy on unchanged data."""
    self.generate_test_data()

    first_run = self.classifier.train()
    self.assertTrue(first_run)

    second_run = self.classifier.train()
    self.assertFalse(second_run)
|
||||
|
||||
def testSaveClassifier(self):
    """A saved classifier, once reloaded, sees no need to retrain."""
    # Create the data directory inside the test rather than in a decorator
    # argument: the decorator form ran mkdtemp() at import time and never
    # removed the directory.
    with tempfile.TemporaryDirectory() as data_dir:
        with override_settings(DATA_DIR=data_dir):
            self.generate_test_data()

            self.classifier.train()
            self.classifier.save_classifier()

            new_classifier = DocumentClassifier()
            new_classifier.reload()
            self.assertFalse(new_classifier.train())
|
@ -1,8 +1,17 @@
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from unittest import mock
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from django.test import TestCase
|
||||
from django.conf import settings
|
||||
from django.db import DatabaseError
|
||||
from django.test import TestCase, override_settings
|
||||
|
||||
from ..models import FileInfo, Tag
|
||||
from ..consumer import Consumer, ConsumerError
|
||||
from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
|
||||
from ..parsers import DocumentParser, ParseError
|
||||
|
||||
|
||||
class TestAttributes(TestCase):
|
||||
@ -394,3 +403,254 @@ class TestFieldPermutations(TestCase):
|
||||
self.assertEqual(info.created.year, 2019)
|
||||
self.assertEqual(info.created.month, 9)
|
||||
self.assertEqual(info.created.day, 8)
|
||||
|
||||
|
||||
class DummyParser(DocumentParser):
    """Test parser that returns fixed text and a placeholder thumbnail."""

    def __init__(self, path, logging_group, scratch_dir):
        """Initialise the base parser and create an empty ``.png`` file.

        The placeholder file is created inside ``scratch_dir`` and its
        path is later returned by ``get_optimised_thumbnail``.
        """
        super(DummyParser, self).__init__(path, logging_group)
        fd, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
        # Only the path is needed; close the descriptor so it is not leaked.
        os.close(fd)

    def get_thumbnail(self):
        # not important during tests
        raise NotImplementedError()

    def get_optimised_thumbnail(self):
        """Return the path of the placeholder thumbnail file."""
        return self.fake_thumb

    def get_text(self):
        """Return the fixed document text."""
        return "The Text"
|
||||
|
||||
|
||||
class FaultyParser(DocumentParser):
    """Test parser whose text extraction always fails with ``ParseError``."""

    def __init__(self, path, logging_group, scratch_dir):
        """Initialise the base parser and create an empty ``.png`` file.

        The placeholder file is created inside ``scratch_dir`` and its
        path is later returned by ``get_optimised_thumbnail``.
        """
        super(FaultyParser, self).__init__(path, logging_group)
        fd, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
        # Only the path is needed; close the descriptor so it is not leaked.
        os.close(fd)

    def get_thumbnail(self):
        # not important during tests
        raise NotImplementedError()

    def get_optimised_thumbnail(self):
        """Return the path of the placeholder thumbnail file."""
        return self.fake_thumb

    def get_text(self):
        """Always raise ``ParseError`` to simulate a failing parser."""
        raise ParseError("Does not compute.")
|
||||
|
||||
|
||||
class TestConsumer(TestCase):
|
||||
|
||||
def make_dummy_parser(self, path, logging_group):
    """Build a ``DummyParser`` that writes into this test's scratch dir."""
    parser = DummyParser(path, logging_group, scratch_dir=self.scratch_dir)
    return parser
|
||||
|
||||
def make_faulty_parser(self, path, logging_group):
    """Build a ``FaultyParser`` that writes into this test's scratch dir."""
    parser = FaultyParser(path, logging_group, scratch_dir=self.scratch_dir)
    return parser
|
||||
|
||||
def setUp(self):
    """Prepare temporary directories, settings, a fake parser and a Consumer.

    Creates scratch, media and consumption directories, overrides the
    matching settings for the duration of the test, patches the parser
    declaration signal so that only the dummy parser is offered, and
    stores a fresh ``Consumer`` on ``self.consumer``.
    """
    self.scratch_dir = tempfile.mkdtemp()
    self.media_dir = tempfile.mkdtemp()
    self.consumption_dir = tempfile.mkdtemp()

    settings_override = override_settings(
        SCRATCH_DIR=self.scratch_dir,
        MEDIA_ROOT=self.media_dir,
        ORIGINALS_DIR=os.path.join(self.media_dir, "documents", "originals"),
        THUMBNAIL_DIR=os.path.join(self.media_dir, "documents", "thumbnails"),
        CONSUMPTION_DIR=self.consumption_dir
    )
    settings_override.enable()
    # Undo the override after the test; otherwise the settings would keep
    # pointing at directories that have already been deleted.
    self.addCleanup(settings_override.disable)

    patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
    m = patcher.start()
    m.return_value = [(None, {
        "parser": self.make_dummy_parser,
        "test": lambda _: True,
        "weight": 0
    })]
    self.addCleanup(patcher.stop)

    self.consumer = Consumer()
|
||||
|
||||
def tearDown(self):
    """Remove the scratch, media and consumption directories, ignoring errors."""
    temporary_dirs = (self.scratch_dir, self.media_dir, self.consumption_dir)
    for directory in temporary_dirs:
        shutil.rmtree(directory, ignore_errors=True)
|
||||
|
||||
def get_test_file(self):
    """Create an empty ``.pdf`` file in the scratch dir and return its path."""
    fd, path = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir)
    # mkstemp hands back an open descriptor; close it so that each call
    # does not leak one.
    os.close(fd)
    return path
|
||||
|
||||
def testNormalOperation(self):
    """Consuming a file yields a stored document and removes the input."""
    source_file = self.get_test_file()
    document = self.consumer.try_consume_file(source_file)

    # Without overrides the title is the file name minus its extension.
    expected_title = os.path.splitext(os.path.basename(source_file))[0]

    self.assertEqual(document.content, "The Text")
    self.assertEqual(document.title, expected_title)
    self.assertIsNone(document.correspondent)
    self.assertIsNone(document.document_type)
    self.assertEqual(document.filename, "0000001.pdf")

    self.assertTrue(os.path.isfile(document.source_path))
    self.assertTrue(os.path.isfile(document.thumbnail_path))

    # The original input file must no longer exist.
    self.assertFalse(os.path.isfile(source_file))
|
||||
|
||||
def testOverrideFilename(self):
    """The override filename, not the real one, drives correspondent and title."""
    document = self.consumer.try_consume_file(
        self.get_test_file(),
        override_filename="My Bank - Statement for November.pdf")

    self.assertEqual(document.correspondent.name, "My Bank")
    self.assertEqual(document.title, "Statement for November")
|
||||
|
||||
def testOverrideTitle(self):
    """An explicit title override ends up as the document title."""
    wanted_title = "Override Title"
    source_file = self.get_test_file()

    document = self.consumer.try_consume_file(
        source_file, override_title=wanted_title)

    self.assertEqual(document.title, "Override Title")
|
||||
|
||||
def testOverrideCorrespondent(self):
    """A correspondent id override is applied to the consumed document."""
    correspondent = Correspondent.objects.create(name="test")
    source_file = self.get_test_file()

    document = self.consumer.try_consume_file(
        source_file, override_correspondent_id=correspondent.pk)

    self.assertEqual(document.correspondent.id, correspondent.id)
|
||||
|
||||
def testOverrideDocumentType(self):
    """A document type id override is applied to the consumed document."""
    document_type = DocumentType.objects.create(name="test")
    source_file = self.get_test_file()

    document = self.consumer.try_consume_file(
        source_file, override_document_type_id=document_type.pk)

    self.assertEqual(document.document_type.id, document_type.id)
|
||||
|
||||
def testOverrideTags(self):
    """Only the tags named in the override are attached to the document."""
    first, second, third = (
        Tag.objects.create(name=tag_name) for tag_name in ("t1", "t2", "t3")
    )

    document = self.consumer.try_consume_file(
        self.get_test_file(), override_tag_ids=[first.id, third.id])

    self.assertIn(first, document.tags.all())
    self.assertNotIn(second, document.tags.all())
    self.assertIn(third, document.tags.all())
|
||||
|
||||
def testNotAFile(self):
    """Consuming a path that is not a file raises a ``ConsumerError``."""
    try:
        self.consumer.try_consume_file("non-existing-file")
    except ConsumerError as error:
        self.assertTrue(str(error).endswith('It is not a file'))
    else:
        self.fail("Should throw exception")
|
||||
|
||||
@override_settings(CONSUMPTION_DIR=None)
def testConsumptionDirUnset(self):
    """With ``CONSUMPTION_DIR`` set to None, consumption fails with a clear error."""
    source_file = self.get_test_file()
    try:
        self.consumer.try_consume_file(source_file)
    except ConsumerError as error:
        self.assertEqual(str(error), "The CONSUMPTION_DIR settings variable does not appear to be set.")
    else:
        self.fail("Should throw exception")
|
||||
|
||||
@override_settings(CONSUMPTION_DIR="asd")
def testNoConsumptionDir(self):
    """A ``CONSUMPTION_DIR`` pointing to a missing directory is reported."""
    source_file = self.get_test_file()
    try:
        self.consumer.try_consume_file(source_file)
    except ConsumerError as error:
        self.assertEqual(str(error), "Consumption directory asd does not exist")
    else:
        self.fail("Should throw exception")
|
||||
|
||||
def testDuplicates(self):
    """Consuming a second file with identical (empty) content is rejected."""
    # First consumption is expected to succeed.
    self.consumer.try_consume_file(self.get_test_file())

    duplicate = self.get_test_file()
    try:
        self.consumer.try_consume_file(duplicate)
    except ConsumerError as error:
        self.assertTrue(str(error).endswith("It is a duplicate."))
    else:
        self.fail("Should throw exception")
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
def testNoParsers(self, m):
    """When parser discovery returns nothing, consumption raises an error."""
    m.return_value = []
    source_file = self.get_test_file()

    try:
        self.consumer.try_consume_file(source_file)
    except ConsumerError as error:
        self.assertTrue(str(error).startswith("No parsers abvailable"))
    else:
        self.fail("Should throw exception")
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
def testFaultyParser(self, m):
    """An error raised inside the parser surfaces as a ``ConsumerError``."""
    faulty_declaration = {
        "parser": self.make_faulty_parser,
        "test": lambda _: True,
        "weight": 0
    }
    m.return_value = [(None, faulty_declaration)]
    source_file = self.get_test_file()

    try:
        self.consumer.try_consume_file(source_file)
    except ConsumerError as error:
        self.assertEqual(str(error), "Does not compute.")
    else:
        self.fail("Should throw exception.")
|
||||
|
||||
@mock.patch("documents.consumer.Consumer._write")
def testPostSaveError(self, m):
    """A failure while writing files leaves the input file and an empty database."""
    m.side_effect = OSError("NO.")
    filename = self.get_test_file()

    caught = None
    try:
        self.consumer.try_consume_file(filename)
    except ConsumerError as error:
        caught = error

    if caught is None:
        self.fail("Should raise exception")
    self.assertEqual(str(caught), "NO.")

    # file not deleted
    self.assertTrue(os.path.isfile(filename))

    # Database empty
    self.assertEqual(len(Document.objects.all()), 0)
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
def testFilenameHandling(self):
    """The configured filename format is applied to the stored document.

    The title comes from ``override_title`` while the correspondent is still
    taken from ``override_filename``.
    """
    filename = self.get_test_file()

    document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")

    # Leftover debug ``print`` calls were removed; the assertions below already
    # report the offending values on failure.
    self.assertEqual(document.title, "new docs")
    self.assertEqual(document.correspondent.name, "Bank")
    self.assertEqual(document.filename, "bank/new-docs-0000001.pdf")
|
||||
|
||||
@mock.patch("documents.consumer.DocumentClassifier")
def testClassifyDocument(self, m):
    """Predictions of the (mocked) classifier are assigned to the document."""
    correspondent = Correspondent.objects.create(name="test")
    dtype = DocumentType.objects.create(name="test")
    predicted_tag = Tag.objects.create(name="t1")
    other_tag = Tag.objects.create(name="t2")

    # Instantiating the patched classifier class yields this mock.
    classifier = MagicMock()
    m.return_value = classifier
    classifier.predict_correspondent.return_value = correspondent.pk
    classifier.predict_document_type.return_value = dtype.pk
    classifier.predict_tags.return_value = [predicted_tag.pk]

    document = self.consumer.try_consume_file(self.get_test_file())

    self.assertEqual(document.correspondent, correspondent)
    self.assertEqual(document.document_type, dtype)
    self.assertIn(predicted_tag, document.tags.all())
    self.assertNotIn(other_tag, document.tags.all())
|
||||
|
@ -1,90 +0,0 @@
|
||||
import base64
|
||||
import os
|
||||
from hashlib import md5
|
||||
from unittest import mock
|
||||
|
||||
import magic
|
||||
from django.conf import settings
|
||||
from django.test import TestCase
|
||||
|
||||
from ..mail import Message, Attachment
|
||||
|
||||
|
||||
class TestMessage(TestCase):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
|
||||
TestCase.__init__(self, *args, **kwargs)
|
||||
self.sample = os.path.join(
|
||||
settings.BASE_DIR,
|
||||
"documents",
|
||||
"tests",
|
||||
"samples",
|
||||
"mail.txt"
|
||||
)
|
||||
|
||||
def test_init(self):
|
||||
|
||||
with open(self.sample, "rb") as f:
|
||||
|
||||
with mock.patch("logging.StreamHandler.emit") as __:
|
||||
message = Message(f.read())
|
||||
|
||||
self.assertTrue(message)
|
||||
self.assertEqual(message.subject, "Test 0")
|
||||
|
||||
data = message.attachment.read()
|
||||
|
||||
self.assertEqual(
|
||||
md5(data).hexdigest(), "7c89655f9e9eb7dd8cde8568e8115d59")
|
||||
|
||||
self.assertEqual(
|
||||
message.attachment.content_type, "application/pdf")
|
||||
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
|
||||
self.assertEqual(m.id_buffer(data), "application/pdf")
|
||||
|
||||
|
||||
class TestInlineMessage(TestCase):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
|
||||
TestCase.__init__(self, *args, **kwargs)
|
||||
self.sample = os.path.join(
|
||||
settings.BASE_DIR,
|
||||
"documents",
|
||||
"tests",
|
||||
"samples",
|
||||
"inline_mail.txt"
|
||||
)
|
||||
|
||||
def test_init(self):
|
||||
|
||||
with open(self.sample, "rb") as f:
|
||||
|
||||
with mock.patch("logging.StreamHandler.emit") as __:
|
||||
message = Message(f.read())
|
||||
|
||||
self.assertTrue(message)
|
||||
self.assertEqual(message.subject, "Paperless Inline Image")
|
||||
|
||||
data = message.attachment.read()
|
||||
|
||||
self.assertEqual(
|
||||
md5(data).hexdigest(), "30c00a7b42913e65f7fdb0be40b9eef3")
|
||||
|
||||
self.assertEqual(
|
||||
message.attachment.content_type, "image/png")
|
||||
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
|
||||
self.assertEqual(m.id_buffer(data), "image/png")
|
||||
|
||||
|
||||
class TestAttachment(TestCase):
|
||||
|
||||
def test_init(self):
|
||||
data = base64.encodebytes(b"0")
|
||||
self.assertEqual(Attachment(data, "application/pdf").suffix, "pdf")
|
||||
self.assertEqual(Attachment(data, "image/png").suffix, "png")
|
||||
self.assertEqual(Attachment(data, "image/jpeg").suffix, "jpeg")
|
||||
self.assertEqual(Attachment(data, "image/gif").suffix, "gif")
|
||||
self.assertEqual(Attachment(data, "image/tiff").suffix, "tiff")
|
||||
self.assertEqual(Attachment(data, "image/png").read(), data)
|
@ -14,7 +14,7 @@ class TestParserDiscovery(TestCase):
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(None, lambda _: {"weight": 0, "parser": DummyParser}),
|
||||
(None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
@ -32,8 +32,8 @@ class TestParserDiscovery(TestCase):
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(None, lambda _: {"weight": 0, "parser": DummyParser1}),
|
||||
(None, lambda _: {"weight": 1, "parser": DummyParser2}),
|
||||
(None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
|
||||
(None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
@ -43,7 +43,7 @@ class TestParserDiscovery(TestCase):
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test__get_parser_class_0_parsers(self, m, *args):
|
||||
m.return_value = ((None, lambda _: None),)
|
||||
m.return_value = []
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
self.assertIsNone(
|
||||
get_parser_class("doc.pdf")
|
||||
|
@ -52,7 +52,7 @@ class CorrespondentViewSet(ModelViewSet):
|
||||
pagination_class = StandardPagination
|
||||
permission_classes = (IsAuthenticated,)
|
||||
filter_backends = (DjangoFilterBackend, OrderingFilter)
|
||||
filter_class = CorrespondentFilterSet
|
||||
filterset_class = CorrespondentFilterSet
|
||||
ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")
|
||||
|
||||
|
||||
@ -63,7 +63,7 @@ class TagViewSet(ModelViewSet):
|
||||
pagination_class = StandardPagination
|
||||
permission_classes = (IsAuthenticated,)
|
||||
filter_backends = (DjangoFilterBackend, OrderingFilter)
|
||||
filter_class = TagFilterSet
|
||||
filterset_class = TagFilterSet
|
||||
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
|
||||
|
||||
|
||||
@ -74,7 +74,7 @@ class DocumentTypeViewSet(ModelViewSet):
|
||||
pagination_class = StandardPagination
|
||||
permission_classes = (IsAuthenticated,)
|
||||
filter_backends = (DjangoFilterBackend, OrderingFilter)
|
||||
filter_class = DocumentTypeFilterSet
|
||||
filterset_class = DocumentTypeFilterSet
|
||||
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
|
||||
|
||||
|
||||
@ -89,7 +89,7 @@ class DocumentViewSet(RetrieveModelMixin,
|
||||
pagination_class = StandardPagination
|
||||
permission_classes = (IsAuthenticated,)
|
||||
filter_backends = (DjangoFilterBackend, SearchFilter, OrderingFilter)
|
||||
filter_class = DocumentFilterSet
|
||||
filterset_class = DocumentFilterSet
|
||||
search_fields = ("title", "correspondent__name", "content")
|
||||
ordering_fields = (
|
||||
"id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
|
||||
@ -170,7 +170,7 @@ class LogViewSet(ReadOnlyModelViewSet):
|
||||
pagination_class = StandardPagination
|
||||
permission_classes = (IsAuthenticated,)
|
||||
filter_backends = (DjangoFilterBackend, OrderingFilter)
|
||||
filter_class = LogFilterSet
|
||||
filterset_class = LogFilterSet
|
||||
ordering_fields = ("created",)
|
||||
|
||||
|
||||
@ -223,17 +223,16 @@ class SearchAutoCompleteView(APIView):
|
||||
if 'term' in request.query_params:
|
||||
term = request.query_params['term']
|
||||
else:
|
||||
term = None
|
||||
return HttpResponseBadRequest("Term required")
|
||||
|
||||
if 'limit' in request.query_params:
|
||||
limit = int(request.query_params['limit'])
|
||||
if limit <= 0:
|
||||
return HttpResponseBadRequest("Invalid limit")
|
||||
else:
|
||||
limit = 10
|
||||
|
||||
if term is not None:
|
||||
return Response(index.autocomplete(self.ix, term, limit))
|
||||
else:
|
||||
return Response([])
|
||||
return Response(index.autocomplete(self.ix, term, limit))
|
||||
|
||||
|
||||
class StatisticsView(APIView):
|
||||
|
@ -1,4 +1,5 @@
|
||||
import json
|
||||
import math
|
||||
import multiprocessing
|
||||
import os
|
||||
import re
|
||||
@ -79,6 +80,7 @@ INSTALLED_APPS = [
|
||||
"documents.apps.DocumentsConfig",
|
||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
"paperless_text.apps.PaperlessTextConfig",
|
||||
"paperless_mail.apps.PaperlessMailConfig",
|
||||
|
||||
"django.contrib.admin",
|
||||
|
||||
@ -262,24 +264,58 @@ LOGGING = {
|
||||
# Task queue #
|
||||
###############################################################################
|
||||
|
||||
|
||||
# Sensible defaults for multitasking:
|
||||
# use a fair balance between worker processes and threads per worker so that
|
||||
# both consuming many documents in parallel and consuming large documents is
|
||||
# reasonably fast.
|
||||
# Favors threads per worker on smaller systems and never exceeds cpu_count()
|
||||
# in total.
|
||||
|
||||
def default_task_workers():
    """Return the default worker count: floor(sqrt(cpu count)), at least 1.

    Falls back to 1 when the CPU count cannot be determined.
    """
    try:
        cpus = multiprocessing.cpu_count()
    except NotImplementedError:
        return 1
    return max(1, math.floor(math.sqrt(cpus)))
|
||||
|
||||
|
||||
TASK_WORKERS = int(os.getenv("PAPERLESS_TASK_WORKERS", default_task_workers()))
|
||||
|
||||
Q_CLUSTER = {
|
||||
'name': 'paperless',
|
||||
'catch_up': False,
|
||||
'workers': TASK_WORKERS,
|
||||
'redis': os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
|
||||
}
|
||||
|
||||
|
||||
def default_threads_per_worker():
    """Return the default thread count per worker: floor(cpus / TASK_WORKERS), at least 1.

    Falls back to 1 when the CPU count cannot be determined.
    """
    try:
        cpus = multiprocessing.cpu_count()
    except NotImplementedError:
        return 1
    return max(1, math.floor(cpus / TASK_WORKERS))
|
||||
|
||||
|
||||
# Cast to int like TASK_WORKERS above: os.getenv returns a string whenever the
# variable is set, while the computed default is already an integer.
THREADS_PER_WORKER = int(os.getenv("PAPERLESS_THREADS_PER_WORKER", default_threads_per_worker()))
|
||||
|
||||
###############################################################################
|
||||
# Paperless Specific Settings #
|
||||
###############################################################################
|
||||
|
||||
CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))
|
||||
|
||||
CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
|
||||
|
||||
# The default language that tesseract will attempt to use when parsing
|
||||
# documents. It should be a 3-letter language code consistent with ISO 639.
|
||||
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
||||
|
||||
# The amount of threads to use for OCR
|
||||
OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", multiprocessing.cpu_count()))
|
||||
|
||||
# OCR all documents?
|
||||
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
|
||||
@ -324,5 +360,6 @@ FILENAME_PARSE_TRANSFORMS = []
|
||||
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
|
||||
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
|
||||
|
||||
# TODO: this should not have a prefix.
|
||||
# Specify the filename format for out files
|
||||
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")
|
||||
|
@ -1,7 +1,7 @@
|
||||
from django.conf.urls import include, url
|
||||
from django.conf.urls import include
|
||||
from django.contrib import admin
|
||||
from django.contrib.auth.decorators import login_required
|
||||
from django.urls import path
|
||||
from django.urls import path, re_path
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
from django.views.generic import RedirectView
|
||||
from rest_framework.routers import DefaultRouter
|
||||
@ -30,32 +30,32 @@ api_router.register(r"tags", TagViewSet)
|
||||
urlpatterns = [
|
||||
|
||||
# API
|
||||
url(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
|
||||
url(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
|
||||
url(r"^api/search/", SearchView.as_view(), name="search"),
|
||||
url(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
|
||||
url(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
|
||||
re_path(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
|
||||
re_path(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
|
||||
re_path(r"^api/search/", SearchView.as_view(), name="search"),
|
||||
re_path(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
|
||||
re_path(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
|
||||
|
||||
# Favicon
|
||||
url(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
|
||||
re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
|
||||
|
||||
# The Django admin
|
||||
url(r"admin/", admin.site.urls),
|
||||
re_path(r"admin/", admin.site.urls),
|
||||
|
||||
# These redirects are here to support clients that use the old FetchView.
|
||||
url(
|
||||
re_path(
|
||||
r"^fetch/doc/(?P<pk>\d+)$",
|
||||
RedirectView.as_view(url='/api/documents/%(pk)s/download/'),
|
||||
),
|
||||
url(
|
||||
re_path(
|
||||
r"^fetch/thumb/(?P<pk>\d+)$",
|
||||
RedirectView.as_view(url='/api/documents/%(pk)s/thumb/'),
|
||||
),
|
||||
url(
|
||||
re_path(
|
||||
r"^fetch/preview/(?P<pk>\d+)$",
|
||||
RedirectView.as_view(url='/api/documents/%(pk)s/preview/'),
|
||||
),
|
||||
url(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
|
||||
re_path(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
|
||||
|
||||
# Frontend assets TODO: this is pretty bad.
|
||||
path('assets/<path:path>', RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
|
||||
@ -63,7 +63,7 @@ urlpatterns = [
|
||||
path('accounts/', include('django.contrib.auth.urls')),
|
||||
|
||||
# Root of the Frontend
|
||||
url(r".*", login_required(IndexView.as_view())),
|
||||
re_path(r".*", login_required(IndexView.as_view())),
|
||||
|
||||
]
|
||||
|
||||
|
0
src/paperless_mail/__init__.py
Normal file
0
src/paperless_mail/__init__.py
Normal file
27
src/paperless_mail/admin.py
Normal file
27
src/paperless_mail/admin.py
Normal file
@ -0,0 +1,27 @@
|
||||
from django.contrib import admin
|
||||
from django import forms
|
||||
|
||||
from paperless_mail.models import MailAccount, MailRule
|
||||
|
||||
|
||||
class MailAccountForm(forms.ModelForm):
    """Model form for ``MailAccount`` that renders the password with a password widget."""

    # Overrides the automatically generated field for ``password``.
    password = forms.CharField(widget=forms.PasswordInput)

    class Meta:
        model = MailAccount
        fields = '__all__'
|
||||
|
||||
|
||||
class MailAccountAdmin(admin.ModelAdmin):
    """Admin configuration for mail accounts."""

    # Wire in the custom form; without this the form defined above is never
    # used and the admin falls back to its automatically generated form.
    # NOTE(review): Django's PasswordInput does not re-render the stored value
    # by default, so the password has to be re-entered when editing an
    # account - confirm this is acceptable.
    form = MailAccountForm

    list_display = ("name", "imap_server", "username")
|
||||
|
||||
|
||||
class MailRuleAdmin(admin.ModelAdmin):
    """Admin configuration for mail rules."""

    # Columns shown in the change list.
    list_display = (
        "name",
        "account",
        "folder",
        "action",
    )
|
||||
|
||||
|
||||
admin.site.register(MailAccount, MailAccountAdmin)
|
||||
admin.site.register(MailRule, MailRuleAdmin)
|
7
src/paperless_mail/apps.py
Normal file
7
src/paperless_mail/apps.py
Normal file
@ -0,0 +1,7 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class PaperlessMailConfig(AppConfig):
    """Django application configuration for the ``paperless_mail`` app."""

    # Dotted path of the application package.
    name = 'paperless_mail'

    # Human readable application name.
    verbose_name = 'Paperless Mail'
|
227
src/paperless_mail/mail.py
Normal file
227
src/paperless_mail/mail.py
Normal file
@ -0,0 +1,227 @@
|
||||
import os
|
||||
import tempfile
|
||||
from datetime import timedelta, date
|
||||
|
||||
from django.conf import settings
|
||||
from django.utils.text import slugify
|
||||
from django_q.tasks import async_task
|
||||
from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
|
||||
MailboxFolderSelectError
|
||||
|
||||
from documents.models import Correspondent
|
||||
from paperless_mail.models import MailAccount, MailRule
|
||||
|
||||
|
||||
class MailError(Exception):
    """Raised when accessing a mail account or processing its mails fails."""
|
||||
|
||||
|
||||
class BaseMailAction:
    """Base class for the action applied to mails after consumption; does nothing."""

    def get_criteria(self):
        """Return additional search criteria for this action (none by default)."""
        return dict()

    def post_consume(self, M, message_uids, parameter):
        """Hook called with the mailbox, the mail uids and the action parameter."""
        return None
|
||||
|
||||
|
||||
class DeleteMailAction(BaseMailAction):
    """Action that deletes the given mails from the mailbox."""

    def post_consume(self, M, message_uids, parameter):
        """Delete ``message_uids`` via the mailbox ``M``; ``parameter`` is unused."""
        M.delete(message_uids)
|
||||
|
||||
|
||||
class MarkReadMailAction(BaseMailAction):
    """Action that marks mails as seen and restricts the search to unseen mails."""

    def get_criteria(self):
        """Restrict the mail search to mails that are not yet seen."""
        return dict(seen=False)

    def post_consume(self, M, message_uids, parameter):
        """Set the seen state of ``message_uids`` to True; ``parameter`` is unused."""
        M.seen(message_uids, True)
|
||||
|
||||
|
||||
class MoveMailAction(BaseMailAction):
    """Action that moves the given mails to another folder."""

    def post_consume(self, M, message_uids, parameter):
        """Move ``message_uids`` to the folder named by ``parameter``."""
        destination = parameter
        M.move(message_uids, destination)
|
||||
|
||||
|
||||
class FlagMailAction(BaseMailAction):
    """Action that flags mails and restricts the search to unflagged mails."""

    def get_criteria(self):
        """Restrict the mail search to mails that are not flagged."""
        return dict(flagged=False)

    def post_consume(self, M, message_uids, parameter):
        """Set the FLAGGED flag on ``message_uids``; ``parameter`` is unused."""
        flags = [MailMessageFlags.FLAGGED]
        M.flag(message_uids, flags, True)
|
||||
|
||||
|
||||
def get_rule_action(rule):
    """Return a new action object matching ``rule.action``.

    Raises:
        ValueError: if ``rule.action`` is none of the known action constants.
    """
    action_classes = {
        MailRule.ACTION_FLAG: FlagMailAction,
        MailRule.ACTION_DELETE: DeleteMailAction,
        MailRule.ACTION_MOVE: MoveMailAction,
        MailRule.ACTION_MARK_READ: MarkReadMailAction,
    }
    action_class = action_classes.get(rule.action)
    if action_class is None:
        raise ValueError("Unknown action.")
    return action_class()
|
||||
|
||||
|
||||
def make_criterias(rule):
    """Build the keyword criteria used to search the mails a rule applies to.

    Always contains ``date_gte`` (today minus ``rule.maximum_age`` days). The
    from/subject/body filters are added only when set on the rule, followed by
    whatever the rule's action contributes via ``get_criteria()``.
    """
    oldest_allowed = date.today() - timedelta(days=rule.maximum_age)
    criterias = {"date_gte": oldest_allowed}

    optional_filters = (
        ("from_", rule.filter_from),
        ("subject", rule.filter_subject),
        ("body", rule.filter_body),
    )
    for key, value in optional_filters:
        if value:
            criterias[key] = value

    # Criteria of the action are applied last, so they win on key clashes.
    criterias.update(get_rule_action(rule).get_criteria())
    return criterias
|
||||
|
||||
|
||||
def handle_mail_account(account):
    """Process all rules of a mail account and return the number of queued files.

    Connects to the IMAP server according to ``account.imap_security``, logs
    in, and then for every rule of the account: selects the rule's folder,
    searches it with the rule's criteria, hands every message to
    ``handle_message`` and finally applies the rule's action to all messages
    from which at least one file was processed.

    Raises:
        ValueError: if ``account.imap_security`` is not a known constant.
        MailError: if login, folder selection, fetching, message processing
            or the post-consume action fails. The original exception is
            attached as ``__cause__``.
    """
    if account.imap_security == MailAccount.IMAP_SECURITY_NONE:
        mailbox = MailBoxUnencrypted(account.imap_server, account.imap_port)
    elif account.imap_security == MailAccount.IMAP_SECURITY_STARTTLS:
        mailbox = MailBox(account.imap_server, account.imap_port, starttls=True)
    elif account.imap_security == MailAccount.IMAP_SECURITY_SSL:
        mailbox = MailBox(account.imap_server, account.imap_port)
    else:
        raise ValueError("Unknown IMAP security")

    total_processed_files = 0

    with mailbox as M:

        try:
            M.login(account.username, account.password)
        except Exception as e:
            raise MailError(
                f"Error while authenticating account {account.name}") from e

        for rule in account.rules.all():

            try:
                M.folder.set(rule.folder)
            except MailboxFolderSelectError as e:
                raise MailError(
                    f"Rule {rule.name}: Folder {rule.folder} does not exist "
                    f"in account {account.name}") from e

            criterias = make_criterias(rule)

            # NOTE(review): if ``fetch`` is lazy (a generator), errors would
            # surface in the loop below rather than here - confirm against
            # the imap_tools version in use.
            try:
                messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
            except Exception as e:
                raise MailError(
                    f"Rule {rule.name}: Error while fetching folder "
                    f"{rule.folder} of account {account.name}") from e

            # uids of the messages the rule's action is applied to afterwards
            post_consume_messages = []

            for message in messages:
                try:
                    processed_files = handle_message(message, rule)
                except Exception as e:
                    raise MailError(
                        f"Rule {rule.name}: Error while processing mail "
                        f"{message.uid} of account {account.name}") from e

                # Mails that yielded no files are left untouched.
                if processed_files > 0:
                    post_consume_messages.append(message.uid)

                total_processed_files += processed_files

            try:
                get_rule_action(rule).post_consume(
                    M,
                    post_consume_messages,
                    rule.action_parameter)
            except Exception as e:
                raise MailError(
                    f"Rule {rule.name}: Error while processing post-consume "
                    f"actions for account {account.name}") from e

    return total_processed_files
|
||||
|
||||
|
||||
def get_title(message, att, rule):
    """Return the document title selected by ``rule.assign_title_from``.

    Either the message subject or the attachment's file name without
    directory and extension.

    Raises:
        ValueError: if the title selector of the rule is unknown.
    """
    if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
        return message.subject

    if rule.assign_title_from == MailRule.TITLE_FROM_FILENAME:
        stem, _extension = os.path.splitext(os.path.basename(att.filename))
        return stem

    raise ValueError("Unknown title selector.")
|
||||
|
||||
|
||||
def get_correspondent(message, rule):
    """Return the correspondent selected by ``rule.assign_correspondent_from``.

    Depending on the selector this is ``None``, the correspondent configured
    on the rule, or a correspondent looked up (and created if missing) by the
    sender's mail address or display name.

    Raises:
        ValueError: if the correspondent selector of the rule is unknown.
    """
    selector = rule.assign_correspondent_from

    if selector == MailRule.CORRESPONDENT_FROM_NOTHING:
        return None

    if selector == MailRule.CORRESPONDENT_FROM_CUSTOM:
        return rule.assign_correspondent

    if selector == MailRule.CORRESPONDENT_FROM_EMAIL:
        correspondent_name = message.from_
    elif selector == MailRule.CORRESPONDENT_FROM_NAME:
        # Prefer the display name, fall back to the mail address.
        if message.from_values and \
                'name' in message.from_values \
                and message.from_values['name']:
            correspondent_name = message.from_values['name']
        else:
            correspondent_name = message.from_
    else:
        raise ValueError("Unknown correspondent selector")

    # Shared by both name-based selectors: reuse an existing correspondent or
    # create one, deriving the slug only for newly created entries.
    correspondent, _created = Correspondent.objects.get_or_create(
        name=correspondent_name, defaults={
            "slug": slugify(correspondent_name)
        })
    return correspondent
|
||||
|
||||
|
||||
def handle_message(message, rule):
    """Queue all PDF attachments of a message for consumption.

    Every attachment with content type ``application/pdf`` is written to a
    temporary file inside ``settings.SCRATCH_DIR`` and handed to the
    ``documents.tasks.consume_file`` task together with the title,
    correspondent, document type and tag derived from the rule.

    Returns:
        The number of attachments that were queued (0 if the message has no
        attachments).
    """
    if not message.attachments:
        return 0

    correspondent = get_correspondent(message, rule)
    tag = rule.assign_tag
    doc_type = rule.assign_document_type

    processed_attachments = 0

    for att in message.attachments:

        # TODO: check with parsers what files types are supported
        if att.content_type != 'application/pdf':
            continue

        # Only derive a title for attachments that are actually consumed.
        title = get_title(message, att, rule)

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
        fd, temp_filename = tempfile.mkstemp(
            prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
        # Write through the descriptor returned by mkstemp so it is closed
        # when the with-block exits, instead of leaking it and opening the
        # file a second time.
        with os.fdopen(fd, 'wb') as f:
            f.write(att.payload)

        async_task(
            "documents.tasks.consume_file",
            path=temp_filename,
            override_filename=att.filename,
            override_title=title,
            override_correspondent_id=correspondent.id if correspondent else None,
            override_document_type_id=doc_type.id if doc_type else None,
            override_tag_ids=[tag.id] if tag else None,
            task_name=f"Mail: {att.filename}"
        )

        processed_attachments += 1

    return processed_attachments
|
0
src/paperless_mail/management/__init__.py
Normal file
0
src/paperless_mail/management/__init__.py
Normal file
0
src/paperless_mail/management/commands/__init__.py
Normal file
0
src/paperless_mail/management/commands/__init__.py
Normal file
13
src/paperless_mail/management/commands/mail_fetcher.py
Normal file
13
src/paperless_mail/management/commands/mail_fetcher.py
Normal file
@ -0,0 +1,13 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from paperless_mail import mail, tasks
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Management command that runs the mail account processing task once."""

    # The original help text was effectively blank; describe what the
    # command does so it shows up in ``manage.py help``.
    help = "Runs paperless_mail.tasks.process_mail_accounts once."

    def handle(self, *args, **options):
        """Run the task synchronously; arguments and options are ignored."""
        tasks.process_mail_accounts()
|
48
src/paperless_mail/migrations/0001_initial.py
Normal file
48
src/paperless_mail/migrations/0001_initial.py
Normal file
@ -0,0 +1,48 @@
|
||||
# Generated by Django 3.1.3 on 2020-11-15 22:54
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Initial schema of the paperless_mail app: ``MailAccount`` and ``MailRule``."""

    initial = True

    dependencies = [
        ('documents', '1002_auto_20201111_1105'),
    ]

    operations = [
        migrations.CreateModel(
            name='MailAccount',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=256, unique=True)),
                ('imap_server', models.CharField(max_length=256)),
                ('imap_port', models.IntegerField(blank=True, null=True)),
                ('imap_security', models.PositiveIntegerField(
                    choices=[(1, 'No encryption'), (2, 'Use SSL'), (3, 'Use STARTTLS')],
                    default=2)),
                ('username', models.CharField(max_length=256)),
                ('password', models.CharField(max_length=256)),
            ],
        ),
        migrations.CreateModel(
            name='MailRule',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=256)),
                ('folder', models.CharField(default='INBOX', max_length=256)),
                ('filter_from', models.CharField(blank=True, max_length=256, null=True)),
                ('filter_subject', models.CharField(blank=True, max_length=256, null=True)),
                ('filter_body', models.CharField(blank=True, max_length=256, null=True)),
                ('maximum_age', models.PositiveIntegerField(default=30)),
                ('action', models.PositiveIntegerField(
                    choices=[(1, 'Delete'), (2, 'Move to specified folder'), (3, "Mark as read, don't process read mails"), (4, "Flag the mail, don't process flagged mails")],
                    default=3,
                    help_text='The action applied to the mail. This action is only performed when documents were consumed from the mail. Mails without attachments will remain entirely untouched.')),
                ('action_parameter', models.CharField(
                    blank=True,
                    help_text='Additional parameter for the action selected above, i.e., the target folder of the move to folder action.',
                    max_length=256,
                    null=True)),
                ('assign_title_from', models.PositiveIntegerField(
                    choices=[(1, 'Use subject as title'), (2, 'Use attachment filename as title')],
                    default=1)),
                ('assign_correspondent_from', models.PositiveIntegerField(
                    choices=[(1, 'Do not assign a correspondent'), (2, 'Use mail address'), (3, 'Use name (or mail address if not available)'), (4, 'Use correspondent selected below')],
                    default=1)),
                ('account', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='rules', to='paperless_mail.mailaccount')),
                ('assign_correspondent', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.correspondent')),
                ('assign_document_type', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.documenttype')),
                ('assign_tag', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.tag')),
            ],
        ),
    ]
|
32
src/paperless_mail/migrations/0002_auto_20201117_1334.py
Normal file
32
src/paperless_mail/migrations/0002_auto_20201117_1334.py
Normal file
@ -0,0 +1,32 @@
|
||||
# Generated by Django 3.1.3 on 2020-11-17 13:34
|
||||
|
||||
from django.db import migrations
|
||||
from django.db.migrations import RunPython
|
||||
from django_q.models import Schedule
|
||||
from django_q.tasks import schedule
|
||||
|
||||
|
||||
def add_schedules(apps, schema_editor):
    """Forward step: schedule the mail processing task to run every 10 minutes."""
    schedule(
        'paperless_mail.tasks.process_mail_accounts',
        name="Check all e-mail accounts",
        schedule_type=Schedule.MINUTES,
        minutes=10,
    )
|
||||
|
||||
|
||||
def remove_schedules(apps, schema_editor):
    """Reverse step: delete every schedule that runs the mail processing task."""
    mail_schedules = Schedule.objects.filter(
        func='paperless_mail.tasks.process_mail_accounts')
    mail_schedules.delete()
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Data migration that registers the periodic mail check schedule."""

    dependencies = [
        ('paperless_mail', '0001_initial'),
        ('django_q', '0013_task_attempt_count'),
    ]

    operations = [
        # Forward: add_schedules, backward: remove_schedules.
        RunPython(add_schedules, remove_schedules),
    ]
|
||||
|
||||
|
0
src/paperless_mail/migrations/__init__.py
Normal file
0
src/paperless_mail/migrations/__init__.py
Normal file
137
src/paperless_mail/models.py
Normal file
137
src/paperless_mail/models.py
Normal file
@ -0,0 +1,137 @@
|
||||
from django.db import models
|
||||
|
||||
# Create your models here.
|
||||
from django.db import models
|
||||
|
||||
import documents.models as document_models
|
||||
|
||||
|
||||
class MailAccount(models.Model):
    """Connection settings and credentials for a single IMAP mail account.

    The rules that apply to an account are available through the ``rules``
    reverse relation (see ``MailRule.account``).
    """

    # Integer codes stored in ``imap_security``.
    IMAP_SECURITY_NONE = 1
    IMAP_SECURITY_SSL = 2
    IMAP_SECURITY_STARTTLS = 3

    # (stored value, human readable label) pairs used as field choices.
    IMAP_SECURITY_OPTIONS = (
        (IMAP_SECURITY_NONE, "No encryption"),
        (IMAP_SECURITY_SSL, "Use SSL"),
        (IMAP_SECURITY_STARTTLS, "Use STARTTLS"),
    )

    # Display name of the account; must be unique. Also what __str__ returns.
    name = models.CharField(max_length=256, unique=True)

    imap_server = models.CharField(max_length=256)

    # Optional port.
    # NOTE(review): presumably the mail handler falls back to a default port
    # for the selected security mode when this is empty - confirm.
    imap_port = models.IntegerField(blank=True, null=True)

    # Transport security; defaults to SSL.
    imap_security = models.PositiveIntegerField(
        choices=IMAP_SECURITY_OPTIONS,
        default=IMAP_SECURITY_SSL
    )

    username = models.CharField(max_length=256)

    # Stored as a plain CharField, i.e. not hashed or encrypted by the model.
    password = models.CharField(max_length=256)

    def __str__(self):
        """Return the account's display name."""
        return self.name
|
||||
|
||||
|
||||
class MailRule(models.Model):
    """A rule describing which mails of an account to consume and how.

    A rule belongs to exactly one ``MailAccount``. It names the folder to
    look in, optional sender/subject/body filters, the action to apply to a
    mail, and the metadata (title, tag, document type, correspondent) to
    assign to the resulting documents.
    """

    # Integer codes stored in ``action``.
    ACTION_DELETE = 1
    ACTION_MOVE = 2
    ACTION_MARK_READ = 3
    ACTION_FLAG = 4

    # (stored value, human readable label) pairs used as field choices.
    ACTIONS = (
        (ACTION_DELETE, "Delete"),
        (ACTION_MOVE, "Move to specified folder"),
        (ACTION_MARK_READ, "Mark as read, don't process read mails"),
        (ACTION_FLAG, "Flag the mail, don't process flagged mails")
    )

    # Integer codes stored in ``assign_title_from``.
    TITLE_FROM_SUBJECT = 1
    TITLE_FROM_FILENAME = 2

    TITLE_SELECTOR = (
        (TITLE_FROM_SUBJECT, "Use subject as title"),
        (TITLE_FROM_FILENAME, "Use attachment filename as title")
    )

    # Integer codes stored in ``assign_correspondent_from``.
    CORRESPONDENT_FROM_NOTHING = 1
    CORRESPONDENT_FROM_EMAIL = 2
    CORRESPONDENT_FROM_NAME = 3
    CORRESPONDENT_FROM_CUSTOM = 4

    CORRESPONDENT_SELECTOR = (
        (CORRESPONDENT_FROM_NOTHING, "Do not assign a correspondent"),
        (CORRESPONDENT_FROM_EMAIL, "Use mail address"),
        (CORRESPONDENT_FROM_NAME, "Use name (or mail address if not available)"),
        (CORRESPONDENT_FROM_CUSTOM, "Use correspondent selected below")
    )

    # Display name of the rule (not unique); also what __str__ returns.
    name = models.CharField(max_length=256)

    # Owning account; deleting the account deletes its rules (CASCADE).
    account = models.ForeignKey(
        MailAccount,
        related_name="rules",
        on_delete=models.CASCADE
    )

    # Mail folder the rule operates on.
    folder = models.CharField(default='INBOX', max_length=256)

    # Optional filters; each may be left empty (NULL).
    filter_from = models.CharField(max_length=256, null=True, blank=True)
    filter_subject = models.CharField(max_length=256, null=True, blank=True)
    filter_body = models.CharField(max_length=256, null=True, blank=True)

    # NOTE(review): unit is not visible here - presumably days; confirm
    # against the mail handler.
    maximum_age = models.PositiveIntegerField(default=30)

    action = models.PositiveIntegerField(
        choices=ACTIONS,
        default=ACTION_MARK_READ,
        help_text="The action applied to the mail. This action is only "
                  "performed when documents were consumed from the mail. "
                  "Mails without attachments will remain entirely "
                  "untouched."
    )

    action_parameter = models.CharField(
        max_length=256, blank=True, null=True,
        help_text="Additional parameter for the action selected above, i.e., "
                  "the target folder of the move to folder action."
    )

    # Where the document title comes from; defaults to the mail subject.
    assign_title_from = models.PositiveIntegerField(
        choices=TITLE_SELECTOR,
        default=TITLE_FROM_SUBJECT
    )

    # Optional metadata references. SET_NULL keeps the rule when the
    # referenced tag / document type / correspondent is deleted.
    assign_tag = models.ForeignKey(
        document_models.Tag,
        null=True,
        blank=True,
        on_delete=models.SET_NULL
    )

    assign_document_type = models.ForeignKey(
        document_models.DocumentType,
        null=True,
        blank=True,
        on_delete=models.SET_NULL
    )

    # How the correspondent is chosen; by default none is assigned.
    assign_correspondent_from = models.PositiveIntegerField(
        choices=CORRESPONDENT_SELECTOR,
        default=CORRESPONDENT_FROM_NOTHING
    )

    # The correspondent the CORRESPONDENT_FROM_CUSTOM choice refers to
    # ("Use correspondent selected below").
    assign_correspondent = models.ForeignKey(
        document_models.Correspondent,
        null=True,
        blank=True,
        on_delete=models.SET_NULL
    )

    def __str__(self):
        """Return the rule's display name."""
        return self.name
|
23
src/paperless_mail/tasks.py
Normal file
23
src/paperless_mail/tasks.py
Normal file
@ -0,0 +1,23 @@
|
||||
import logging
|
||||
|
||||
from paperless_mail import mail
|
||||
from paperless_mail.models import MailAccount
|
||||
|
||||
|
||||
def process_mail_accounts():
    """Process every configured mail account and summarise the outcome.

    Each account is handed to ``mail.handle_mail_account``; the numbers it
    returns are added up.

    Returns:
        A short status message stating how many documents were added, or
        that none were.
    """
    added = sum(
        mail.handle_mail_account(acct) for acct in MailAccount.objects.all()
    )

    if added > 0:
        return f"Added {added} document(s)."
    return "No new documents were added."
|
||||
|
||||
|
||||
def process_mail_account(name):
    """Process the single mail account called ``name``.

    The account is looked up by its unique ``name``. If no such account
    exists an error is logged and nothing else happens.

    Args:
        name: The ``MailAccount.name`` of the account to process.
    """
    # Django managers have no ``find()``; the previous lookup raised
    # AttributeError on every call. ``get()`` signals a missing row with
    # DoesNotExist instead of returning a falsy value.
    try:
        account = MailAccount.objects.get(name=name)
    except MailAccount.DoesNotExist:
        logging.error("Unknown mail acccount: {}".format(name))
        return

    mail.handle_mail_account(account)
|
0
src/paperless_mail/tests/__init__.py
Normal file
0
src/paperless_mail/tests/__init__.py
Normal file
352
src/paperless_mail/tests/test_mail.py
Normal file
352
src/paperless_mail/tests/test_mail.py
Normal file
@ -0,0 +1,352 @@
|
||||
import uuid
|
||||
from collections import namedtuple
|
||||
from typing import ContextManager
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
from imap_tools import MailMessageFlags, MailboxFolderSelectError
|
||||
|
||||
from documents.models import Correspondent
|
||||
from paperless_mail.mail import get_correspondent, get_title, handle_message, handle_mail_account, MailError
|
||||
from paperless_mail.models import MailRule, MailAccount
|
||||
|
||||
|
||||
class BogusFolderManager:
    """Minimal fake of a mailbox folder manager that knows two folders."""

    # Folder selected until ``set`` is called on an instance.
    current_folder = "INBOX"

    def set(self, new_folder):
        """Select ``new_folder``.

        Only "INBOX" and "spam" exist; any other name raises
        ``MailboxFolderSelectError`` and leaves the selection unchanged.
        """
        if new_folder in ["INBOX", "spam"]:
            self.current_folder = new_folder
            return
        raise MailboxFolderSelectError(None, "uhm")
|
||||
|
||||
|
||||
class BogusMailBox(ContextManager):
    """In-memory fake mailbox used in place of the real ``MailBox`` in tests.

    Messages are plain objects exposing ``uid``, ``subject``, ``body``,
    ``from_``, ``seen`` and ``flagged``. ``messages`` holds the current
    folder's mails; ``messages_spam`` collects mails moved to "spam".
    """

    def __init__(self):
        self.messages = []
        self.messages_spam = []
        # One folder manager per mailbox, so a folder selected on one
        # instance cannot leak into another through shared class state.
        self.folder = BogusFolderManager()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def login(self, username, password):
        """Accept only the credentials admin/secret; raise otherwise."""
        if not (username == 'admin' and password == 'secret'):
            raise Exception()

    def fetch(self, criteria, mark_seen):
        """Return the messages matching a space-separated criteria string.

        ``criteria`` is converted with ``str()``, stripped of surrounding
        parentheses and split on single spaces. The keywords UNSEEN,
        UNFLAGGED, SUBJECT, BODY and FROM are recognised; the latter three
        use the following token (quotes stripped) as a substring to look
        for, so search values must not contain spaces. ``mark_seen`` is
        accepted but ignored.
        """
        msg = self.messages

        criteria = str(criteria).strip('()').split(" ")

        if 'UNSEEN' in criteria:
            msg = [m for m in msg if not m.seen]

        if 'SUBJECT' in criteria:
            subject = criteria[criteria.index('SUBJECT') + 1].strip('"')
            msg = [m for m in msg if subject in m.subject]

        if 'BODY' in criteria:
            body = criteria[criteria.index('BODY') + 1].strip('"')
            msg = [m for m in msg if body in m.body]

        if 'FROM' in criteria:
            from_ = criteria[criteria.index('FROM') + 1].strip('"')
            msg = [m for m in msg if from_ in m.from_]

        if 'UNFLAGGED' in criteria:
            msg = [m for m in msg if not m.flagged]

        # Always hand out a fresh list, never ``self.messages`` itself.
        return list(msg)

    def seen(self, uid_list, seen_val):
        """Set the ``seen`` state of every message whose uid is listed."""
        for message in self.messages:
            if message.uid in uid_list:
                message.seen = seen_val

    def delete(self, uid_list):
        """Remove every message whose uid is listed."""
        self.messages = [m for m in self.messages if m.uid not in uid_list]

    def flag(self, uid_list, flag_set, value):
        """Apply ``value`` to the FLAGGED state of the listed messages.

        Flags other than ``MailMessageFlags.FLAGGED`` are ignored.
        """
        for message in self.messages:
            if message.uid in uid_list:
                for flag in flag_set:
                    if flag == MailMessageFlags.FLAGGED:
                        message.flagged = value

    def move(self, uid_list, folder):
        """Move the listed messages to ``folder``.

        Only the "spam" folder exists as a target; any other folder raises
        ``Exception``.
        """
        if folder != "spam":
            raise Exception()

        # Collect the moved messages eagerly and add them one by one.
        # Appending a lazy ``filter`` object would add exactly one element
        # however many mails were moved, and it would later be evaluated
        # against the already-shrunk ``self.messages``.
        moved = [m for m in self.messages if m.uid in uid_list]
        self.messages_spam.extend(moved)
        self.messages = [m for m in self.messages if m.uid not in uid_list]
|
||||
|
||||
|
||||
def create_message(num_attachments=1, body="", subject="the suject", from_="noone@mail.com", seen=False, flagged=False):
    """Build a fake mail message for the tests.

    The returned object is a fresh, empty namedtuple *class* used as a
    simple attribute container. It carries a random ``uid``, the given
    ``subject``, ``body``, ``from_``, ``seen`` and ``flagged`` values and
    ``num_attachments`` identical PDF attachment stubs in ``attachments``.
    """

    def _pdf_attachment():
        # Every attachment gets its own container so they stay independent.
        stub = namedtuple('Attachment', [])
        stub.filename = 'some_file.pdf'
        stub.content_type = 'application/pdf'
        stub.payload = b'content of the attachment'
        return stub

    mail_stub = namedtuple('MailMessage', [])
    mail_stub.uid = uuid.uuid4()
    mail_stub.subject = subject
    mail_stub.from_ = from_
    mail_stub.body = body
    mail_stub.seen = seen
    mail_stub.flagged = flagged
    mail_stub.attachments = [_pdf_attachment() for _ in range(num_attachments)]

    return mail_stub
|
||||
|
||||
|
||||
class TestMail(TestCase):
    """Tests for ``paperless_mail.mail`` run against an in-memory mailbox.

    ``setUp`` replaces ``paperless_mail.mail.MailBox`` with a factory that
    returns a ``BogusMailBox`` and replaces ``paperless_mail.mail.async_task``
    with a mock, so no network access happens and no task is queued. The
    number of ``async_task`` calls is used as the number of consumed
    attachments.
    """

    def setUp(self):
        # Every MailBox(...) created by the code under test yields the
        # same fake mailbox instance.
        patcher = mock.patch('paperless_mail.mail.MailBox')
        m = patcher.start()
        self.bogus_mailbox = BogusMailBox()
        m.return_value = self.bogus_mailbox
        self.addCleanup(patcher.stop)

        # Record consumption requests instead of queueing them.
        patcher = mock.patch('paperless_mail.mail.async_task')
        self.async_task = patcher.start()
        self.addCleanup(patcher.stop)

        self.reset_bogus_mailbox()

    def reset_bogus_mailbox(self):
        """Restore the three fixture mails and empty the spam folder.

        Fixture overview (each mail has one PDF attachment):
          1. "Invoice 1" from amazon@amazon.de, body "cables", seen
          2. "Invoice 2" from the default sender, unseen, flagged
          3. "Claim your $10M price now!" from an amazon look-alike, unseen
        """
        self.bogus_mailbox.messages = []
        self.bogus_mailbox.messages_spam = []
        self.bogus_mailbox.messages.append(create_message(subject="Invoice 1", from_="amazon@amazon.de", body="cables", seen=True, flagged=False))
        self.bogus_mailbox.messages.append(create_message(subject="Invoice 2", body="from my favorite electronic store", seen=False, flagged=True))
        self.bogus_mailbox.messages.append(create_message(subject="Claim your $10M price now!", from_="amazon@amazon-some-indian-site.org", seen=False))

    def test_get_correspondent(self):
        """Check each ``assign_correspondent_from`` mode of a rule."""
        message = namedtuple('MailMessage', [])
        message.from_ = "someone@somewhere.com"
        message.from_values = {'name': "Someone!", 'email': "someone@somewhere.com"}

        # Second sender without a display name.
        message2 = namedtuple('MailMessage', [])
        message2.from_ = "me@localhost.com"
        message2.from_values = {'name': "", 'email': "fake@localhost.com"}

        # A correspondent named like message2's address already exists and
        # is expected to be reused rather than duplicated.
        me_localhost = Correspondent.objects.create(name=message2.from_)
        someone_else = Correspondent.objects.create(name="someone else")

        rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING)
        self.assertIsNone(get_correspondent(message, rule))

        rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)
        c = get_correspondent(message, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "someone@somewhere.com")
        c = get_correspondent(message2, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "me@localhost.com")
        self.assertEqual(c.id, me_localhost.id)

        rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME)
        c = get_correspondent(message, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "Someone!")
        # Empty name: expected to resolve to the existing address-named
        # correspondent.
        c = get_correspondent(message2, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.id, me_localhost.id)

        rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else)
        c = get_correspondent(message, rule)
        self.assertEqual(c, someone_else)

    def test_get_title(self):
        """Title comes from the file name (without extension) or the subject."""
        message = namedtuple('MailMessage', [])
        message.subject = "the message title"
        att = namedtuple('Attachment', [])
        att.filename = "this_is_the_file.pdf"
        rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME)
        self.assertEqual(get_title(message, att, rule), "this_is_the_file")
        rule = MailRule(assign_title_from=MailRule.TITLE_FROM_SUBJECT)
        self.assertEqual(get_title(message, att, rule), "the message title")

    def test_handle_message(self):
        """Only the two PDF attachments of a three-attachment mail are consumed."""
        message = namedtuple('MailMessage', [])
        message.subject = "the message title"

        att = namedtuple('Attachment', [])
        att.filename = "test1.pdf"
        att.content_type = 'application/pdf'
        att.payload = b"attachment contents"

        att2 = namedtuple('Attachment', [])
        att2.filename = "test2.pdf"
        att2.content_type = 'application/pdf'
        att2.payload = b"attachment contents"

        # Unsupported content type: expected to be skipped.
        att3 = namedtuple('Attachment', [])
        att3.filename = "test3.pdf"
        att3.content_type = 'application/invalid'
        att3.payload = b"attachment contents"

        message.attachments = [att, att2, att3]

        rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME)

        result = handle_message(message, rule)

        self.assertEqual(result, 2)

        self.assertEqual(len(self.async_task.call_args_list), 2)

        args1, kwargs1 = self.async_task.call_args_list[0]
        args2, kwargs2 = self.async_task.call_args_list[1]

        self.assertEqual(kwargs1['override_title'], "test1")
        self.assertEqual(kwargs1['override_filename'], "test1.pdf")

        self.assertEqual(kwargs2['override_title'], "test2")
        self.assertEqual(kwargs2['override_filename'], "test2.pdf")

    @mock.patch("paperless_mail.mail.async_task")
    def test_handle_empty_message(self, m):
        """A mail without attachments queues nothing and reports 0."""
        message = namedtuple('MailMessage', [])

        message.attachments = []
        rule = MailRule()

        result = handle_message(message, rule)

        self.assertFalse(m.called)
        self.assertEqual(result, 0)

    def test_handle_mail_account_mark_read(self):
        """The two unseen fixture mails are consumed and end up seen."""

        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MARK_READ)

        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 2)
        handle_mail_account(account)
        self.assertEqual(self.async_task.call_count, 2)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 0)

    def test_handle_mail_account_delete(self):
        """Both "Invoice" mails are consumed and removed; one mail remains."""

        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_DELETE, filter_subject="Invoice")

        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        handle_mail_account(account)
        self.assertEqual(self.async_task.call_count, 2)
        self.assertEqual(len(self.bogus_mailbox.messages), 1)

    def test_handle_mail_account_flag(self):
        """Only the unflagged "Invoice" mail is consumed and then flagged."""
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_FLAG, filter_subject="Invoice")

        self.assertEqual(self.async_task.call_count, 0)
        # Unflagged before: "Invoice 1" and the "Claim" mail.
        self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2)
        handle_mail_account(account)
        self.assertEqual(self.async_task.call_count, 1)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 1)

    def test_handle_mail_account_move(self):
        """The "Claim" mail is consumed and moved to the spam folder."""
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="spam", filter_subject="Claim")

        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.assertEqual(len(self.bogus_mailbox.messages_spam), 0)
        handle_mail_account(account)
        self.assertEqual(self.async_task.call_count, 1)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)

    def test_errors(self):
        """Login, folder selection and post-consume failures raise MailError."""
        # 1) Wrong password: the fake mailbox rejects the login.
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong")

        try:
            handle_mail_account(account)
        except MailError as e:
            self.assertTrue(str(e).startswith("Error while authenticating account"))
        else:
            self.fail("Should raise exception")

        # 2) The rule points at a folder the fake folder manager rejects.
        account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
        rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh")

        try:
            handle_mail_account(account)
        except MailError as e:
            self.assertTrue("uuuh does not exist" in str(e))
        else:
            self.fail("Should raise exception")

        # 3) The move target is not "spam", so the fake mailbox's move()
        #    raises while the action is applied.
        account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim")

        try:
            handle_mail_account(account)
        except MailError as e:
            self.assertTrue("Error while processing post-consume actions" in str(e))
        else:
            self.fail("Should raise exception")

    def test_filters(self):
        """Subject, body and sender filters select the expected mails.

        The same delete rule is re-configured four times. The mailbox is
        reset between phases, but the ``async_task`` mock is not, so the
        expected call counts are cumulative (1, 2, 4, 5).
        """

        account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")
        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_DELETE, filter_subject="Claim")

        self.assertEqual(self.async_task.call_count, 0)

        # Phase 1: subject "Claim" matches one mail.
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(self.async_task.call_count, 1)

        self.reset_bogus_mailbox()

        # Phase 2: body "electronic" matches "Invoice 2" only.
        rule.filter_subject = None
        rule.filter_body = "electronic"
        rule.save()
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(self.async_task.call_count, 2)

        self.reset_bogus_mailbox()

        # Phase 3: sender "amazon" matches "Invoice 1" and the "Claim" mail.
        rule.filter_from = "amazon"
        rule.filter_body = None
        rule.save()
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 1)
        self.assertEqual(self.async_task.call_count, 4)

        self.reset_bogus_mailbox()

        # Phase 4: all three filters combined match "Invoice 1" only.
        rule.filter_from = "amazon"
        rule.filter_body = "cables"
        rule.filter_subject = "Invoice"
        rule.save()
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(self.async_task.call_count, 5)
|
3
src/paperless_mail/views.py
Normal file
3
src/paperless_mail/views.py
Normal file
@ -0,0 +1,3 @@
|
||||
from django.shortcuts import render
|
||||
|
||||
# Create your views here.
|
@ -1,5 +1,7 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
from paperless_tesseract.signals import tesseract_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessTesseractConfig(AppConfig):
|
||||
|
||||
@ -9,8 +11,6 @@ class PaperlessTesseractConfig(AppConfig):
|
||||
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
from .signals import ConsumerDeclaration
|
||||
|
||||
document_consumer_declaration.connect(ConsumerDeclaration.handle)
|
||||
document_consumer_declaration.connect(tesseract_consumer_declaration)
|
||||
|
||||
AppConfig.ready(self)
|
||||
|
@ -2,7 +2,7 @@ import itertools
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
from multiprocessing.pool import Pool
|
||||
from multiprocessing.pool import ThreadPool
|
||||
|
||||
import langdetect
|
||||
import pdftotext
|
||||
@ -151,7 +151,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.log("info", "Running unpaper on {} pages...".format(len(pnms)))
|
||||
|
||||
# Run unpaper in parallel on converted images
|
||||
with Pool(processes=settings.OCR_THREADS) as pool:
|
||||
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
|
||||
pnms = pool.map(run_unpaper, pnms)
|
||||
|
||||
return sorted(filter(lambda __: os.path.isfile(__), pnms))
|
||||
@ -166,7 +166,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
|
||||
def _ocr(self, imgs, lang):
|
||||
self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
|
||||
with Pool(processes=settings.OCR_THREADS) as pool:
|
||||
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
|
||||
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
||||
return r
|
||||
|
||||
|
@ -3,21 +3,16 @@ import re
|
||||
from .parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
class ConsumerDeclaration:
|
||||
def tesseract_consumer_declaration(sender, **kwargs):
|
||||
return {
|
||||
"parser": RasterisedDocumentParser,
|
||||
"weight": 0,
|
||||
"test": tesseract_consumer_test
|
||||
}
|
||||
|
||||
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
|
||||
|
||||
@classmethod
|
||||
def handle(cls, sender, **kwargs):
|
||||
return cls.test
|
||||
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
|
||||
|
||||
@classmethod
|
||||
def test(cls, doc):
|
||||
|
||||
if cls.MATCHING_FILES.match(doc.lower()):
|
||||
return {
|
||||
"parser": RasterisedDocumentParser,
|
||||
"weight": 0
|
||||
}
|
||||
|
||||
return None
|
||||
def tesseract_consumer_test(doc):
|
||||
return MATCHING_FILES.match(doc.lower())
|
||||
|
@ -1,6 +1,6 @@
|
||||
from django.test import TestCase
|
||||
|
||||
from ..signals import ConsumerDeclaration
|
||||
from paperless_tesseract.signals import tesseract_consumer_test
|
||||
|
||||
|
||||
class SignalsTestCase(TestCase):
|
||||
@ -20,7 +20,7 @@ class SignalsTestCase(TestCase):
|
||||
for prefix in prefixes:
|
||||
for suffix in suffixes:
|
||||
name = "{}.{}".format(prefix, suffix)
|
||||
self.assertTrue(ConsumerDeclaration.test(name))
|
||||
self.assertTrue(tesseract_consumer_test(name))
|
||||
|
||||
def test_test_handles_various_file_names_false(self):
|
||||
|
||||
@ -30,7 +30,7 @@ class SignalsTestCase(TestCase):
|
||||
for prefix in prefixes:
|
||||
for suffix in suffixes:
|
||||
name = "{}.{}".format(prefix, suffix)
|
||||
self.assertFalse(ConsumerDeclaration.test(name))
|
||||
self.assertFalse(tesseract_consumer_test(name))
|
||||
|
||||
self.assertFalse(ConsumerDeclaration.test(""))
|
||||
self.assertFalse(ConsumerDeclaration.test("doc"))
|
||||
self.assertFalse(tesseract_consumer_test(""))
|
||||
self.assertFalse(tesseract_consumer_test("doc"))
|
||||
|
@ -1,5 +1,7 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
from paperless_text.signals import text_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessTextConfig(AppConfig):
|
||||
|
||||
@ -9,8 +11,6 @@ class PaperlessTextConfig(AppConfig):
|
||||
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
from .signals import ConsumerDeclaration
|
||||
|
||||
document_consumer_declaration.connect(ConsumerDeclaration.handle)
|
||||
document_consumer_declaration.connect(text_consumer_declaration)
|
||||
|
||||
AppConfig.ready(self)
|
||||
|
@ -3,21 +3,16 @@ import re
|
||||
from .parsers import TextDocumentParser
|
||||
|
||||
|
||||
class ConsumerDeclaration:
|
||||
def text_consumer_declaration(sender, **kwargs):
|
||||
return {
|
||||
"parser": TextDocumentParser,
|
||||
"weight": 10,
|
||||
"test": text_consumer_test
|
||||
}
|
||||
|
||||
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
|
||||
|
||||
@classmethod
|
||||
def handle(cls, sender, **kwargs):
|
||||
return cls.test
|
||||
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
|
||||
|
||||
@classmethod
|
||||
def test(cls, doc):
|
||||
|
||||
if cls.MATCHING_FILES.match(doc.lower()):
|
||||
return {
|
||||
"parser": TextDocumentParser,
|
||||
"weight": 10
|
||||
}
|
||||
|
||||
return None
|
||||
def text_consumer_test(doc):
|
||||
return MATCHING_FILES.match(doc.lower())
|
||||
|
@ -6,7 +6,6 @@ ignore = E501
|
||||
DJANGO_SETTINGS_MODULE=paperless.settings
|
||||
addopts = --pythonwarnings=all
|
||||
env =
|
||||
PAPERLESS_PASSPHRASE=THISISNOTASECRET
|
||||
PAPERLESS_SECRET=paperless
|
||||
PAPERLESS_EMAIL_SECRET=paperless
|
||||
|
||||
@ -15,4 +14,4 @@ env =
|
||||
source =
|
||||
./
|
||||
omit =
|
||||
*/tests
|
||||
*/tests/*
|
||||
|
Loading…
x
Reference in New Issue
Block a user