mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-10-22 03:16:15 -05:00
Merge branch 'dev'
This commit is contained in:
@@ -1,3 +1,4 @@
|
|||||||
|
/src-ui/.vscode
|
||||||
/src-ui/node_modules
|
/src-ui/node_modules
|
||||||
/src-ui/dist
|
/src-ui/dist
|
||||||
.git
|
.git
|
||||||
@@ -5,3 +6,7 @@
|
|||||||
/consume
|
/consume
|
||||||
/media
|
/media
|
||||||
/data
|
/data
|
||||||
|
/docs
|
||||||
|
.pytest_cache
|
||||||
|
/dist
|
||||||
|
/scripts
|
||||||
|
@@ -5,23 +5,18 @@ python:
|
|||||||
- "3.7"
|
- "3.7"
|
||||||
- "3.8"
|
- "3.8"
|
||||||
|
|
||||||
services:
|
|
||||||
- docker
|
|
||||||
|
|
||||||
before_install:
|
before_install:
|
||||||
- sudo apt-get update -qq
|
- sudo apt-get update -qq
|
||||||
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr
|
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr
|
||||||
|
|
||||||
install:
|
install:
|
||||||
- pip install --upgrade pipenv
|
- pip install --upgrade pipenv
|
||||||
- pipenv install --dev
|
- pipenv install --system --dev
|
||||||
|
|
||||||
script:
|
script:
|
||||||
- cd src/
|
- cd src/
|
||||||
- pipenv run pytest --cov
|
- pipenv run pytest --cov
|
||||||
- pipenv run pycodestyle
|
- pipenv run pycodestyle
|
||||||
- cd ..
|
|
||||||
- docker build --tag=jonaswinkler/paperless-ng .
|
|
||||||
|
|
||||||
after_success:
|
after_success:
|
||||||
- pipenv run coveralls
|
- pipenv run coveralls
|
||||||
|
1
Pipfile
1
Pipfile
@@ -29,6 +29,7 @@ watchdog = "*"
|
|||||||
pathvalidate = "*"
|
pathvalidate = "*"
|
||||||
django-q = "*"
|
django-q = "*"
|
||||||
redis = "*"
|
redis = "*"
|
||||||
|
imap-tools = "*"
|
||||||
|
|
||||||
[dev-packages]
|
[dev-packages]
|
||||||
coveralls = "*"
|
coveralls = "*"
|
||||||
|
10
Pipfile.lock
generated
10
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"_meta": {
|
"_meta": {
|
||||||
"hash": {
|
"hash": {
|
||||||
"sha256": "c0dfeedbac2e9b705267336349e6f72ba650ff9184affae06046db32299e2c87"
|
"sha256": "d6416e6844126b09200b9839a3abdcf3c24ef5cf70052b8f134d8bc804552c17"
|
||||||
},
|
},
|
||||||
"pipfile-spec": 6,
|
"pipfile-spec": 6,
|
||||||
"requires": {},
|
"requires": {},
|
||||||
@@ -123,6 +123,14 @@
|
|||||||
"index": "pypi",
|
"index": "pypi",
|
||||||
"version": "==20.0.4"
|
"version": "==20.0.4"
|
||||||
},
|
},
|
||||||
|
"imap-tools": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:070929b8ec429c0aad94588a37a2962eed656a119ab61dcf91489f20fe983f5d",
|
||||||
|
"sha256:6232cd43748741496446871e889eb137351fc7a7e7f4c7888cd8c0fa28e20cda"
|
||||||
|
],
|
||||||
|
"index": "pypi",
|
||||||
|
"version": "==0.31.0"
|
||||||
|
},
|
||||||
"joblib": {
|
"joblib": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72",
|
"sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72",
|
||||||
|
BIN
docs/_static/paperless-11-mail-filters.png
vendored
Normal file
BIN
docs/_static/paperless-11-mail-filters.png
vendored
Normal file
Binary file not shown.
After Width: | Height: | Size: 70 KiB |
BIN
docs/_static/recommended_workflow.png
vendored
Normal file
BIN
docs/_static/recommended_workflow.png
vendored
Normal file
Binary file not shown.
After Width: | Height: | Size: 67 KiB |
@@ -294,10 +294,14 @@ Documents can be stored in Paperless using GnuPG encryption.
|
|||||||
|
|
||||||
.. danger::
|
.. danger::
|
||||||
|
|
||||||
Decryption is depreceated since paperless-ng 1.0 and doesn't really provide any
|
Decryption is deprecated since paperless-ng 0.9 and doesn't really provide any
|
||||||
additional security, since you have to store the passphrase in a configuration
|
additional security, since you have to store the passphrase in a configuration
|
||||||
file on the same system as the encrypted documents for paperless to work. Also,
|
file on the same system as the encrypted documents for paperless to work.
|
||||||
paperless provides transparent access to your encrypted documents.
|
Furthermore, the entire text content of the documents is stored plain in the
|
||||||
|
database, even if your documents are encrypted. Filenames are not encrypted
|
||||||
|
either.
|
||||||
|
|
||||||
|
Also, the web server provides transparent access to your encrypted documents.
|
||||||
|
|
||||||
Consider running paperless on an encrypted filesystem instead, which will then
|
Consider running paperless on an encrypted filesystem instead, which will then
|
||||||
at least provide security against physical hardware theft.
|
at least provide security against physical hardware theft.
|
||||||
|
171
docs/api.rst
171
docs/api.rst
@@ -3,25 +3,168 @@
|
|||||||
The REST API
|
The REST API
|
||||||
************
|
************
|
||||||
|
|
||||||
.. warning::
|
|
||||||
|
|
||||||
This section is not updated to paperless-ng yet.
|
Paperless makes use of the `Django REST Framework`_ standard API interface.
|
||||||
|
It provides a browsable API for most of its endpoints, which you can inspect
|
||||||
Paperless makes use of the `Django REST Framework`_ standard API interface
|
at ``http://<paperless-host>:<port>/api/``. This also documents most of the
|
||||||
because of its inherent awesomeness. Conveniently, the system is also
|
available filters and ordering fields.
|
||||||
self-documenting, so to learn more about the access points, schema, what's
|
|
||||||
accepted and what isn't, you need only visit ``/api`` on your local Paperless
|
|
||||||
installation.
|
|
||||||
|
|
||||||
.. _Django REST Framework: http://django-rest-framework.org/
|
.. _Django REST Framework: http://django-rest-framework.org/
|
||||||
|
|
||||||
|
The API provides 5 main endpoints:
|
||||||
|
|
||||||
|
* ``/api/correspondents/``: Full CRUD support.
|
||||||
|
* ``/api/document_types/``: Full CRUD support.
|
||||||
|
* ``/api/documents/``: Full CRUD support, except POSTing new documents. See below.
|
||||||
|
* ``/api/logs/``: Read-Only.
|
||||||
|
* ``/api/tags/``: Full CRUD support.
|
||||||
|
|
||||||
|
All of these endpoints except for the logging endpoint
|
||||||
|
allow you to fetch, edit and delete individual objects
|
||||||
|
by appending their primary key to the path, for example ``/api/documents/454/``.
|
||||||
|
|
||||||
|
In addition to that, the document endpoint offers these additional actions on
|
||||||
|
individual documents:
|
||||||
|
|
||||||
|
* ``/api/documents/<pk>/download/``: Download the original document.
|
||||||
|
* ``/api/documents/<pk>/thumb/``: Download the PNG thumbnail of a document.
|
||||||
|
* ``/api/documents/<pk>/preview/``: Display the original document inline,
|
||||||
|
without downloading it.
|
||||||
|
|
||||||
|
.. hint::
|
||||||
|
|
||||||
|
Paperless used to provide this functionality at ``/fetch/<pk>/preview``,
|
||||||
|
``/fetch/<pk>/thumb`` and ``/fetch/<pk>/doc``. Redirects to the new URLs
|
||||||
|
are in place. However, if you use these old URLs to access documents, you
|
||||||
|
should update your app or script to use the new URLs.
|
||||||
|
|
||||||
|
Searching for documents
|
||||||
|
#######################
|
||||||
|
|
||||||
|
Paperless-ng offers API endpoints for full text search. These are as follows:
|
||||||
|
|
||||||
|
``/api/search/``
|
||||||
|
================
|
||||||
|
|
||||||
|
Get search results based on a query.
|
||||||
|
|
||||||
|
Query parameters:
|
||||||
|
|
||||||
|
* ``query``: The query string. See
|
||||||
|
`here <https://whoosh.readthedocs.io/en/latest/querylang.html>`_
|
||||||
|
for details on the syntax.
|
||||||
|
* ``page``: Specify the page you want to retrieve. Each page
|
||||||
|
contains 10 search results and the first page is ``page=1``, which
|
||||||
|
is the default if this is omitted.
|
||||||
|
|
||||||
|
Result list object returned by the endpoint:
|
||||||
|
|
||||||
|
.. code:: json
|
||||||
|
|
||||||
|
{
|
||||||
|
"count": 1,
|
||||||
|
"page": 1,
|
||||||
|
"page_count": 1,
|
||||||
|
"results": [
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
* ``count``: The approximate total number of results.
|
||||||
|
* ``page``: The page returned to you. This might be different from
|
||||||
|
the page you requested, if you requested a page that is beyond
|
||||||
|
the last page. In that case, the last page is returned.
|
||||||
|
* ``page_count``: The total number of pages.
|
||||||
|
* ``results``: A list of result objects on the current page.
|
||||||
|
|
||||||
|
Result object:
|
||||||
|
|
||||||
|
.. code:: json
|
||||||
|
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"highlights": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"score": 6.34234,
|
||||||
|
"rank": 23,
|
||||||
|
"document": {
|
||||||
|
|
||||||
|
}
}
|
||||||
|
|
||||||
|
* ``id``: the primary key of the found document
|
||||||
|
* ``highlights``: an object containing parseable highlights for the result.
|
||||||
|
See below.
|
||||||
|
* ``score``: The score assigned to the document. A higher score indicates a
|
||||||
|
better match with the query. Search results are sorted descending by score.
|
||||||
|
* ``rank``: the position of the document within the entire search results list.
|
||||||
|
* ``document``: The full json of the document, as returned by
|
||||||
|
``/api/documents/<id>/``.
|
||||||
|
|
||||||
|
Highlights object:
|
||||||
|
|
||||||
|
Highlights are provided as a list of fragments. A fragment is a longer section of
|
||||||
|
text from the original document.
|
||||||
|
Each fragment contains a list of strings, and some of them are marked as a highlight.
|
||||||
|
|
||||||
|
.. code:: json
|
||||||
|
|
||||||
|
"highlights": [
|
||||||
|
[
|
||||||
|
{"text": "This is a sample text with a "},
|
||||||
|
{"text": "highlighted", "term": 0},
|
||||||
|
{"text": " word."}
|
||||||
|
],
|
||||||
|
[
|
||||||
|
{"text": "Another", "term": 1},
|
||||||
|
{"text": " fragment with a highlight."}
|
||||||
|
]
|
||||||
|
]
|
||||||
|
|
||||||
|
When ``term`` is present within a string, the word within ``text`` should be highlighted.
|
||||||
|
The term index groups multiple matches together and words with the same index
|
||||||
|
should get identical highlighting.
|
||||||
|
A client may use this example to produce the following output:
|
||||||
|
|
||||||
|
... This is a sample text with a **highlighted** word. ... **Another** fragment with a highlight. ...
|
||||||
|
|
||||||
|
``/api/search/autocomplete/``
|
||||||
|
=============================
|
||||||
|
|
||||||
|
Get auto completions for a partial search term.
|
||||||
|
|
||||||
|
Query parameters:
|
||||||
|
|
||||||
|
* ``term``: The incomplete term.
|
||||||
|
* ``limit``: Amount of results. Defaults to 10.
|
||||||
|
|
||||||
|
Results returned by the endpoint are ordered by importance of the term in the
|
||||||
|
document index. The first result is the term that has the highest Tf/Idf score
|
||||||
|
in the index.
|
||||||
|
|
||||||
|
.. code:: json
|
||||||
|
|
||||||
|
[
|
||||||
|
"term1",
|
||||||
|
"term3",
|
||||||
|
"term6",
|
||||||
|
"term4"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
.. _api-file_uploads:
|
.. _api-file_uploads:
|
||||||
|
|
||||||
POSTing Documents
|
POSTing documents
|
||||||
=================
|
#################
|
||||||
|
|
||||||
File uploads in an API are hard and so far as I've been able to tell, there's
|
The API provides a special endpoint for file uploads:
|
||||||
no standard way of accepting them, so rather than crowbar file uploads into the
|
|
||||||
REST API and endure that headache, I've left that process to a simple HTTP
|
``/api/documents/post_document/``
|
||||||
POST.
|
|
||||||
|
POST a multipart form to this endpoint, where the form field ``document`` contains
|
||||||
|
the document that you want to upload to paperless. The filename is sanitized and
|
||||||
|
then used to store the document in the consumption folder, where the consumer will
|
||||||
|
detect the document and process it as any other document.
|
||||||
|
|
||||||
|
The endpoint will immediately return "OK." if the document was stored in the
|
||||||
|
consumption directory.
|
||||||
|
@@ -8,10 +8,8 @@ Changelog
|
|||||||
paperless-ng 0.9.0
|
paperless-ng 0.9.0
|
||||||
##################
|
##################
|
||||||
|
|
||||||
* **Deprecated:** GnuPG. Don't use it. If you're still using it, be aware that it
|
* **Deprecated:** GnuPG. :ref:`See this note on the state of GnuPG in paperless-ng. <utilities-encyption>`
|
||||||
offers no protection at all, since the passphrase is stored alongside with the
|
This feature will most likely be removed in future versions.
|
||||||
encrypted documents itself. This features will most likely be removed in future
|
|
||||||
versions.
|
|
||||||
|
|
||||||
* **Added:** New frontend. Features:
|
* **Added:** New frontend. Features:
|
||||||
|
|
||||||
@@ -38,6 +36,25 @@ paperless-ng 0.9.0
|
|||||||
multi user solution, however, it allows more than one user to access the website
|
multi user solution, however, it allows more than one user to access the website
|
||||||
and set some basic permissions / renew passwords.
|
and set some basic permissions / renew passwords.
|
||||||
|
|
||||||
|
* **Modified [breaking]:** All new mail consumer with customizable filters, actions and
|
||||||
|
multiple account support. Replaces the old mail consumer. The new mail consumer
|
||||||
|
needs different configuration but can be configured to act exactly like the old
|
||||||
|
consumer.
|
||||||
|
|
||||||
|
|
||||||
|
* **Modified:** Changes to the consumer:
|
||||||
|
|
||||||
|
* Now uses the excellent watchdog library that should make sure files are
|
||||||
|
discovered no matter what the platform is.
|
||||||
|
* The consumer now uses a task scheduler to run consumption processes in parallel.
|
||||||
|
This means that consuming many documents should be much faster on systems with
|
||||||
|
many cores.
|
||||||
|
* Concurrency is controlled with the new settings ``PAPERLESS_TASK_WORKERS``
|
||||||
|
and ``PAPERLESS_THREADS_PER_WORKER``. See TODO for details on concurrency.
|
||||||
|
* The consumer no longer blocks the database for extended periods of time.
|
||||||
|
* An issue with tesseract running multiple threads per page and slowing down
|
||||||
|
the consumer was fixed.
|
||||||
|
|
||||||
* **Modified [breaking]:** REST Api changes:
|
* **Modified [breaking]:** REST Api changes:
|
||||||
|
|
||||||
* New filters added, other filters removed (case sensitive filters, slug filters)
|
* New filters added, other filters removed (case sensitive filters, slug filters)
|
||||||
@@ -64,8 +81,8 @@ paperless-ng 0.9.0
|
|||||||
* Rework of the code of the tesseract parser. This is now a lot cleaner.
|
* Rework of the code of the tesseract parser. This is now a lot cleaner.
|
||||||
* Rework of the filename handling code. It was a mess.
|
* Rework of the filename handling code. It was a mess.
|
||||||
* Fixed some issues with the document exporter not exporting all documents when encountering duplicate filenames.
|
* Fixed some issues with the document exporter not exporting all documents when encountering duplicate filenames.
|
||||||
* Consumer rework: now uses the excellent watchdog library, lots of code removed.
|
* Added a task scheduler that takes care of checking mail, training the classifier, maintaining the document search index
|
||||||
* Added a task scheduler that takes care of checking mail, training the classifier and maintaining the document search index.
|
and consuming documents.
|
||||||
* Updated dependencies. Now uses Pipenv all around.
|
* Updated dependencies. Now uses Pipenv all around.
|
||||||
* Updated Dockerfile and docker-compose. Now uses ``supervisord`` to run everything paperless-related in a single container.
|
* Updated Dockerfile and docker-compose. Now uses ``supervisord`` to run everything paperless-related in a single container.
|
||||||
|
|
||||||
@@ -77,6 +94,8 @@ paperless-ng 0.9.0
|
|||||||
* ``PAPERLESS_DEBUG`` defaults to ``false``.
|
* ``PAPERLESS_DEBUG`` defaults to ``false``.
|
||||||
* The presence of ``PAPERLESS_DBHOST`` now determines whether to use PostgreSQL or
|
* The presence of ``PAPERLESS_DBHOST`` now determines whether to use PostgreSQL or
|
||||||
sqlite.
|
sqlite.
|
||||||
|
* ``PAPERLESS_OCR_THREADS`` is gone and replaced with ``PAPERLESS_TASK_WORKERS`` and
|
||||||
|
``PAPERLESS_THREADS_PER_WORKER``. Refer to the config example for details.
|
||||||
|
|
||||||
* Many more small changes here and there. The usual stuff.
|
* Many more small changes here and there. The usual stuff.
|
||||||
|
|
||||||
|
@@ -20,7 +20,3 @@ places.
|
|||||||
|
|
||||||
Copy ``paperless.conf.example`` to any of these locations and adjust it to your
|
Copy ``paperless.conf.example`` to any of these locations and adjust it to your
|
||||||
needs.
|
needs.
|
||||||
|
|
||||||
.. warning::
|
|
||||||
|
|
||||||
TBD: explain config options.
|
|
@@ -36,6 +36,10 @@ The old admin is still there and accessible!
|
|||||||
|
|
||||||
.. image:: _static/paperless-9-admin.png
|
.. image:: _static/paperless-9-admin.png
|
||||||
|
|
||||||
|
Fancy mail filters!
|
||||||
|
|
||||||
|
.. image:: _static/paperless-11-mail-filters.png
|
||||||
|
|
||||||
Mobile support in the future? This doesn't really work yet.
|
Mobile support in the future? This doesn't really work yet.
|
||||||
|
|
||||||
.. image:: _static/paperless-10-mobile.png
|
.. image:: _static/paperless-10-mobile.png
|
||||||
|
@@ -23,6 +23,77 @@ There are multiple options available.
|
|||||||
that need to be compiled, and that's already done for you in the release.
|
that need to be compiled, and that's already done for you in the release.
|
||||||
|
|
||||||
|
|
||||||
|
Overview of Paperless-ng
|
||||||
|
########################
|
||||||
|
|
||||||
|
Compared to paperless, paperless-ng works a little differently under the hood and has
|
||||||
|
more moving parts that work together. While this increases the complexity of
|
||||||
|
the system, it also brings many benefits.
|
||||||
|
|
||||||
|
Paperless consists of the following components:
|
||||||
|
|
||||||
|
* **The webserver:** This is pretty much the same as in paperless. It serves
|
||||||
|
the administration pages, the API, and the new frontend. This is the main
|
||||||
|
tool you'll be using to interact with paperless. You may start the webserver
|
||||||
|
with
|
||||||
|
|
||||||
|
.. code:: shell-session
|
||||||
|
|
||||||
|
$ cd /path/to/paperless/src/
|
||||||
|
$ pipenv run gunicorn -c /usr/src/paperless/gunicorn.conf.py -b 0.0.0.0:8000 paperless.wsgi
|
||||||
|
|
||||||
|
or by any other means such as Apache ``mod_wsgi``.
|
||||||
|
|
||||||
|
* **The consumer:** This is what watches your consumption folder for documents.
|
||||||
|
However, the consumer itself does not really consume your documents anymore.
|
||||||
|
It rather notifies a task processor that a new file is ready for consumption.
|
||||||
|
I suppose it should be named differently.
|
||||||
|
This also used to check your emails, but that's now gone elsewhere as well.
|
||||||
|
|
||||||
|
Start the consumer with the management command ``document_consumer``:
|
||||||
|
|
||||||
|
.. code:: shell-session
|
||||||
|
|
||||||
|
$ cd /path/to/paperless/src/
|
||||||
|
$ pipenv run python3 manage.py document_consumer
|
||||||
|
|
||||||
|
* **The task processor:** Paperless relies on `Django Q <https://django-q.readthedocs.io/en/latest/>`_
|
||||||
|
for doing much of the heavy lifting. This is a task queue that accepts tasks from
|
||||||
|
multiple sources and processes tasks in parallel. It also comes with a scheduler that executes
|
||||||
|
certain commands periodically.
|
||||||
|
|
||||||
|
This task processor is responsible for:
|
||||||
|
|
||||||
|
* Consuming documents. When the consumer finds new documents, it notifies the task processor to
|
||||||
|
start a consumption task.
|
||||||
|
* Consuming emails. It periodically checks your configured accounts for new mails and
|
||||||
|
produces consumption tasks for any documents it finds.
|
||||||
|
* The task processor also performs the consumption of any documents you upload through
|
||||||
|
the web interface.
|
||||||
|
* Maintain the search index and the automatic matching algorithm. These are things that paperless
|
||||||
|
needs to do from time to time in order to operate properly.
|
||||||
|
|
||||||
|
This allows paperless to process multiple documents from your consumption folder in parallel! On
|
||||||
|
a modern multicore system, consumption with full ocr is blazing fast.
|
||||||
|
|
||||||
|
The task processor comes with a built-in admin interface that you can use to see whenever any of the
|
||||||
|
tasks fail and inspect the errors.
|
||||||
|
|
||||||
|
You may start the task processor by executing:
|
||||||
|
|
||||||
|
.. code:: shell-session
|
||||||
|
|
||||||
|
$ cd /path/to/paperless/src/
|
||||||
|
$ pipenv run python3 manage.py qcluster
|
||||||
|
|
||||||
|
* A `redis <https://redis.io/>`_ message broker: This is a really lightweight service that is responsible
|
||||||
|
for getting the tasks from the webserver and consumer to the task scheduler. These run in different
|
||||||
|
processes (maybe even on different machines!), and therefore, this is necessary.
|
||||||
|
|
||||||
|
* A database server. Paperless supports PostgreSQL and sqlite for storing its data. However, with the
|
||||||
|
added concurrency, it is strongly advised to use PostgreSQL, as sqlite has its limits in that regard.
|
||||||
|
|
||||||
|
|
||||||
Installation
|
Installation
|
||||||
############
|
############
|
||||||
|
|
||||||
@@ -31,10 +102,12 @@ You can go multiple routes with setting up and running Paperless:
|
|||||||
* The `docker route`_
|
* The `docker route`_
|
||||||
* The `bare metal route`_
|
* The `bare metal route`_
|
||||||
|
|
||||||
The `docker route`_ is quick & easy. This is the recommended route.
|
The `docker route`_ is quick & easy. This is the recommended route. This configures all the stuff
|
||||||
|
from above automatically so that it just works and uses sensible defaults for all configuration options.
|
||||||
|
|
||||||
The `bare metal route`_ is more complicated to setup but makes it easier
|
The `bare metal route`_ is more complicated to set up but makes it easier
|
||||||
should you want to contribute some code back.
|
should you want to contribute some code back. You need to configure and
|
||||||
|
run the above mentioned components yourself.
|
||||||
|
|
||||||
Docker Route
|
Docker Route
|
||||||
============
|
============
|
||||||
|
@@ -2,9 +2,38 @@
|
|||||||
Troubleshooting
|
Troubleshooting
|
||||||
***************
|
***************
|
||||||
|
|
||||||
.. warning::
|
No files are added by the consumer
|
||||||
|
##################################
|
||||||
|
|
||||||
|
Check for the following issues:
|
||||||
|
|
||||||
|
* Ensure that the directory you're putting your documents in is the folder
|
||||||
|
paperless is watching. With docker, this setting is performed in the
|
||||||
|
``docker-compose.yml`` file. Without docker, look at the ``CONSUMPTION_DIR``
|
||||||
|
setting. Don't adjust this setting if you're using docker.
|
||||||
|
* Ensure that redis is up and running. Paperless does its task processing
|
||||||
|
asynchronously, and for documents to arrive at the task processor, it needs
|
||||||
|
redis to run.
|
||||||
|
* Ensure that the task processor is running. Docker does this automatically.
|
||||||
|
Manually invoke the task processor by executing
|
||||||
|
|
||||||
|
.. code:: shell-session
|
||||||
|
|
||||||
|
$ python3 manage.py qcluster
|
||||||
|
|
||||||
|
* Look at the output of paperless and inspect it for any errors.
|
||||||
|
* Go to the admin interface, and check if there are failed tasks. If so, the
|
||||||
|
tasks will contain an error message.
|
||||||
|
|
||||||
|
|
||||||
|
Consumer fails to pickup any new files
|
||||||
|
######################################
|
||||||
|
|
||||||
|
If you notice that the consumer will only pick up files in the consumption
|
||||||
|
directory at startup, but won't find any other files added later, check out
|
||||||
|
the configuration file and enable filesystem polling with the setting
|
||||||
|
``PAPERLESS_CONSUMER_POLLING``.
|
||||||
|
|
||||||
This section is not updated to paperless-ng yet.
|
|
||||||
|
|
||||||
Consumer warns ``OCR for XX failed``
|
Consumer warns ``OCR for XX failed``
|
||||||
####################################
|
####################################
|
||||||
|
@@ -86,49 +86,63 @@ files from the scanner. Typically, you're looking at an FTP server like
|
|||||||
IMAP (Email)
|
IMAP (Email)
|
||||||
============
|
============
|
||||||
|
|
||||||
Another handy way to get documents into your database is to email them to
|
You can tell paperless-ng to consume documents from your email accounts.
|
||||||
yourself. The typical use-case would be to be out for lunch and want to send a
|
This is a very flexible and powerful feature, if you regularly receive documents
|
||||||
copy of the receipt back to your system at home. Paperless can be taught to
|
via mail that you need to archive. The mail consumer can be configured by using the
|
||||||
pull emails down from an arbitrary account and dump them into the consumption
|
admin interface in the following manner:
|
||||||
directory where the consumer will follow the
|
|
||||||
usual pattern on consuming the document.
|
|
||||||
|
|
||||||
.. hint::
|
1. Define e-mail accounts.
|
||||||
|
2. Define mail rules for your account.
|
||||||
|
|
||||||
It's disabled by default. By setting the values below it will be enabled.
|
These rules perform the following:
|
||||||
|
|
||||||
It's been tested in a limited environment, so it may not work for you (please
|
1. Connect to the mail server.
|
||||||
submit a pull request if you can!)
|
2. Fetch all matching mails (as defined by folder, maximum age and the filters)
|
||||||
|
3. Check if there are any consumable attachments.
|
||||||
|
4. If so, instruct paperless to consume the attachments and optionally
|
||||||
|
use the metadata provided in the rule for the new document.
|
||||||
|
5. If documents were consumed from a mail, the rule action is performed
|
||||||
|
on that mail.
|
||||||
|
|
||||||
.. danger::
|
Paperless will completely ignore mails that do not match your filters. It will also
|
||||||
|
only perform the action on mails that it has consumed documents from.
|
||||||
|
|
||||||
It's designed to **delete mail from the server once consumed**. So don't go
|
The actions all ensure that the same mail is not consumed twice by different means.
|
||||||
pointing this to your personal email account and wonder where all your stuff
|
These are as follows:
|
||||||
went.
|
|
||||||
|
|
||||||
.. hint::
|
* **Delete:** Immediately deletes mail that paperless has consumed documents from.
|
||||||
|
Use with caution.
|
||||||
|
* **Mark as read:** Mark consumed mail as read. Paperless will not consume documents
|
||||||
|
from already read mails. If you read a mail before paperless sees it, it will be
|
||||||
|
ignored.
|
||||||
|
* **Flag:** Sets the 'important' flag on mails with consumed documents. Paperless
|
||||||
|
will not consume flagged mails.
|
||||||
|
* **Move to folder:** Moves consumed mails out of the way so that paperless won't
|
||||||
|
consume them again.
|
||||||
|
|
||||||
Currently, only one photo (attachment) per email will work.
|
.. caution::
|
||||||
|
|
||||||
So, with all that in mind, here's what you do to get it running:
|
The mail consumer will perform these actions on all mails it has consumed
|
||||||
|
documents from. Keep in mind that the actual consumption process may fail
|
||||||
|
for some reason, leaving you with missing documents in paperless.
|
||||||
|
|
||||||
1. Setup a new email account somewhere, or if you're feeling daring, create a
|
.. note::
|
||||||
folder in an existing email box and note the path to that folder.
|
|
||||||
2. In ``/etc/paperless.conf`` set all of the appropriate values in
|
With the correct set of rules, you can completely automate your email documents.
|
||||||
``PATHS AND FOLDERS`` and ``SECURITY``.
|
Create rules for every correspondent you receive digital documents from and
|
||||||
If you decided to use a subfolder of an existing account, then make sure you
|
paperless will read them automatically. The default action "mark as read" is
|
||||||
set ``PAPERLESS_CONSUME_MAIL_INBOX`` accordingly here. You also have to set
|
pretty tame and will not cause any damage or data loss whatsoever.
|
||||||
the ``PAPERLESS_EMAIL_SECRET`` to something you can remember 'cause you'll
|
|
||||||
have to include that in every email you send.
|
.. note::
|
||||||
3. Restart paperless. Paperless will check
|
|
||||||
the configured email account at startup and from then on every 10 minutes
|
Paperless will process the rules in the order defined in the admin page.
|
||||||
for something new and pulls down whatever it finds.
|
|
||||||
4. Send yourself an email! Note that the subject is treated as the file name,
|
You can define catch-all rules and have them executed last to consume
|
||||||
so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll
|
any documents not matched by previous rules. Such a rule may assign an "Unknown
|
||||||
get what you expect. Also, you must include the aforementioned secret
|
mail document" tag to consumed documents so you can inspect them further.
|
||||||
string in every email so the fetcher knows that it's safe to import.
|
|
||||||
Note that Paperless only allows the email title to consist of safe characters
|
Paperless is set up to check your mails every 10 minutes. This can be configured on the
|
||||||
to be imported. These consist of alpha-numeric characters and ``-_ ,.'``.
|
'Scheduled tasks' page in the admin.
|
||||||
|
|
||||||
|
|
||||||
REST API
|
REST API
|
||||||
@@ -136,6 +150,7 @@ REST API
|
|||||||
|
|
||||||
You can also submit a document using the REST API, see :ref:`api-file_uploads` for details.
|
You can also submit a document using the REST API, see :ref:`api-file_uploads` for details.
|
||||||
|
|
||||||
|
|
||||||
.. _usage-recommended_workflow:
|
.. _usage-recommended_workflow:
|
||||||
|
|
||||||
The recommended workflow
|
The recommended workflow
|
||||||
@@ -147,6 +162,10 @@ is as follows. This workflow also takes into account that some documents
|
|||||||
have to be kept in physical form, but still ensures that you get all the
|
have to be kept in physical form, but still ensures that you get all the
|
||||||
advantages for these documents as well.
|
advantages for these documents as well.
|
||||||
|
|
||||||
|
The following diagram shows how easy it is to manage your documents.
|
||||||
|
|
||||||
|
.. image:: _static/recommended_workflow.png
|
||||||
|
|
||||||
Preparations in paperless
|
Preparations in paperless
|
||||||
=========================
|
=========================
|
||||||
|
|
||||||
|
@@ -59,22 +59,6 @@ PAPERLESS_CONSUMPTION_DIR="../consume"
|
|||||||
#PAPERLESS_STATIC_URL="/static/"
|
#PAPERLESS_STATIC_URL="/static/"
|
||||||
|
|
||||||
|
|
||||||
# These values are required if you want paperless to check a particular email
|
|
||||||
# box every 10 minutes and attempt to consume documents from there. If you
|
|
||||||
# don't define a HOST, mail checking will just be disabled.
|
|
||||||
#PAPERLESS_CONSUME_MAIL_HOST=""
|
|
||||||
#PAPERLESS_CONSUME_MAIL_PORT=""
|
|
||||||
#PAPERLESS_CONSUME_MAIL_USER=""
|
|
||||||
#PAPERLESS_CONSUME_MAIL_PASS=""
|
|
||||||
|
|
||||||
# Override the default IMAP inbox here. If not set Paperless defaults to
|
|
||||||
# "INBOX".
|
|
||||||
#PAPERLESS_CONSUME_MAIL_INBOX="INBOX"
|
|
||||||
|
|
||||||
# Any email sent to the target account that does not contain this text will be
|
|
||||||
# ignored.
|
|
||||||
PAPERLESS_EMAIL_SECRET=""
|
|
||||||
|
|
||||||
# Specify a filename format for the document (directories are supported)
|
# Specify a filename format for the document (directories are supported)
|
||||||
# Use the following placeholders:
|
# Use the following placeholders:
|
||||||
# * {correspondent}
|
# * {correspondent}
|
||||||
@@ -143,6 +127,35 @@ PAPERLESS_EMAIL_SECRET=""
|
|||||||
#### Software Tweaks ####
|
#### Software Tweaks ####
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
|
||||||
|
# Paperless does multiple things in the background: Maintain the search index,
|
||||||
|
# maintain the automatic matching algorithm, check emails, consume documents,
|
||||||
|
# etc. This variable specifies how many things it will do in parallel.
|
||||||
|
#PAPERLESS_TASK_WORKERS=1
|
||||||
|
|
||||||
|
# Furthermore, paperless uses multiple threads when consuming documents to
|
||||||
|
# speed up OCR. This variable specifies how many pages paperless will process
|
||||||
|
# in parallel on a single document.
|
||||||
|
#PAPERLESS_THREADS_PER_WORKER=1
|
||||||
|
|
||||||
|
# Ensure that the product
|
||||||
|
# PAPERLESS_TASK_WORKERS * PAPERLESS_THREADS_PER_WORKER
|
||||||
|
# does not exceed your CPU core count or else paperless will be extremely slow.
|
||||||
|
# If you want paperless to process many documents in parallel, choose a high
|
||||||
|
# worker count. If you want paperless to process very large documents faster,
|
||||||
|
# use a higher thread per worker count.
|
||||||
|
# The default is a balance between the two, according to your CPU core count,
|
||||||
|
# with a slight favor towards threads per worker, and using as many cores as
|
||||||
|
# possible.
|
||||||
|
# If you only specify PAPERLESS_TASK_WORKERS, paperless will adjust
|
||||||
|
# PAPERLESS_THREADS_PER_WORKER automatically.
|
||||||
|
|
||||||
|
# If paperless does not find documents added to your consume folder, it might
|
||||||
|
# not be able to automatically detect filesystem changes. In that case,
|
||||||
|
# specify a polling interval in seconds below, which will then cause paperless
|
||||||
|
# to periodically check your consumption directory for changes.
|
||||||
|
#PAPERLESS_CONSUMER_POLLING=10
|
||||||
|
|
||||||
|
|
||||||
# When the consumer detects a duplicate document, it will not touch the
|
# When the consumer detects a duplicate document, it will not touch the
|
||||||
# original document. This default behavior can be changed here.
|
# original document. This default behavior can be changed here.
|
||||||
#PAPERLESS_CONSUMER_DELETE_DUPLICATES="false"
|
#PAPERLESS_CONSUMER_DELETE_DUPLICATES="false"
|
||||||
@@ -186,12 +199,6 @@ PAPERLESS_EMAIL_SECRET=""
|
|||||||
#
|
#
|
||||||
|
|
||||||
|
|
||||||
# By default, Paperless will attempt to use all available CPU cores to process
|
|
||||||
# a document, but if you would like to limit that, you can set this value to
|
|
||||||
# an integer:
|
|
||||||
#PAPERLESS_OCR_THREADS=1
|
|
||||||
|
|
||||||
|
|
||||||
# Customize the default language that tesseract will attempt to use when
|
# Customize the default language that tesseract will attempt to use when
|
||||||
# parsing documents. The default language is used whenever
|
# parsing documents. The default language is used whenever
|
||||||
# - No language could be detected on a document
|
# - No language could be detected on a document
|
||||||
|
@@ -2,6 +2,15 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
|
||||||
|
VERSION=$1
|
||||||
|
|
||||||
|
if [ -z "$VERSION" ]
|
||||||
|
then
|
||||||
|
echo "Need a version string."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
# source root directory of paperless
|
# source root directory of paperless
|
||||||
PAPERLESS_ROOT=$(git rev-parse --show-toplevel)
|
PAPERLESS_ROOT=$(git rev-parse --show-toplevel)
|
||||||
|
|
||||||
@@ -42,6 +51,7 @@ mkdir "$PAPERLESS_DIST_APP/docker"
|
|||||||
# the application itself
|
# the application itself
|
||||||
|
|
||||||
cp "$PAPERLESS_ROOT/.env" \
|
cp "$PAPERLESS_ROOT/.env" \
|
||||||
|
"$PAPERLESS_ROOT/.dockerignore" \
|
||||||
"$PAPERLESS_ROOT/CONTRIBUTING.md" \
|
"$PAPERLESS_ROOT/CONTRIBUTING.md" \
|
||||||
"$PAPERLESS_ROOT/LICENSE" \
|
"$PAPERLESS_ROOT/LICENSE" \
|
||||||
"$PAPERLESS_ROOT/Pipfile" \
|
"$PAPERLESS_ROOT/Pipfile" \
|
||||||
@@ -80,10 +90,12 @@ cp "$PAPERLESS_ROOT/docker/supervisord.conf" "$PAPERLESS_DIST_APP/docker/"
|
|||||||
|
|
||||||
cd "$PAPERLESS_DIST_APP"
|
cd "$PAPERLESS_DIST_APP"
|
||||||
|
|
||||||
docker-compose build
|
docker build . -t "jonaswinkler/paperless-ng:$VERSION"
|
||||||
|
|
||||||
|
docker push "jonaswinkler/paperless-ng:$VERSION"
|
||||||
|
|
||||||
# works. package the app!
|
# works. package the app!
|
||||||
|
|
||||||
cd "$PAPERLESS_DIST"
|
cd "$PAPERLESS_DIST"
|
||||||
|
|
||||||
tar -cJf paperless-ng.tar.xz paperless-ng/
|
tar -cJf "paperless-ng-$VERSION.tar.xz" paperless-ng/
|
||||||
|
@@ -12,7 +12,7 @@ from django.utils import timezone
|
|||||||
from paperless.db import GnuPG
|
from paperless.db import GnuPG
|
||||||
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
|
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
|
||||||
from .file_handling import generate_filename, create_source_path_directory
|
from .file_handling import generate_filename, create_source_path_directory
|
||||||
from .models import Document, FileInfo
|
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
|
||||||
from .parsers import ParseError, get_parser_class
|
from .parsers import ParseError, get_parser_class
|
||||||
from .signals import (
|
from .signals import (
|
||||||
document_consumption_finished,
|
document_consumption_finished,
|
||||||
@@ -25,118 +25,155 @@ class ConsumerError(Exception):
|
|||||||
|
|
||||||
|
|
||||||
class Consumer:
|
class Consumer:
|
||||||
"""
|
|
||||||
Loop over every file found in CONSUMPTION_DIR and:
|
|
||||||
1. Convert it to a greyscale pnm
|
|
||||||
2. Use tesseract on the pnm
|
|
||||||
3. Store the document in the MEDIA_ROOT with optional encryption
|
|
||||||
4. Store the OCR'd text in the database
|
|
||||||
5. Delete the document and image(s)
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, consume=settings.CONSUMPTION_DIR,
|
def __init__(self):
|
||||||
scratch=settings.SCRATCH_DIR):
|
|
||||||
|
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
self.logging_group = None
|
self.logging_group = None
|
||||||
|
self.path = None
|
||||||
|
self.filename = None
|
||||||
|
self.override_title = None
|
||||||
|
self.override_correspondent_id = None
|
||||||
|
self.override_tag_ids = None
|
||||||
|
self.override_document_type_id = None
|
||||||
|
|
||||||
self.consume = consume
|
def pre_check_file_exists(self):
|
||||||
self.scratch = scratch
|
if not os.path.isfile(self.path):
|
||||||
|
raise ConsumerError("Cannot consume {}: It is not a file".format(
|
||||||
|
self.path))
|
||||||
|
|
||||||
self.classifier = DocumentClassifier()
|
def pre_check_consumption_dir(self):
|
||||||
|
if not settings.CONSUMPTION_DIR:
|
||||||
os.makedirs(self.scratch, exist_ok=True)
|
|
||||||
|
|
||||||
self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
|
||||||
if settings.PASSPHRASE:
|
|
||||||
self.storage_type = Document.STORAGE_TYPE_GPG
|
|
||||||
|
|
||||||
if not self.consume:
|
|
||||||
raise ConsumerError(
|
raise ConsumerError(
|
||||||
"The CONSUMPTION_DIR settings variable does not appear to be "
|
"The CONSUMPTION_DIR settings variable does not appear to be "
|
||||||
"set."
|
"set.")
|
||||||
|
|
||||||
|
if not os.path.isdir(settings.CONSUMPTION_DIR):
|
||||||
|
raise ConsumerError(
|
||||||
|
"Consumption directory {} does not exist".format(
|
||||||
|
settings.CONSUMPTION_DIR))
|
||||||
|
|
||||||
|
def pre_check_regex(self):
|
||||||
|
if not re.match(FileInfo.REGEXES["title"], self.filename):
|
||||||
|
raise ConsumerError(
|
||||||
|
"Filename {} does not seem to be safe to "
|
||||||
|
"consume".format(self.filename))
|
||||||
|
|
||||||
|
def pre_check_duplicate(self):
|
||||||
|
with open(self.path, "rb") as f:
|
||||||
|
checksum = hashlib.md5(f.read()).hexdigest()
|
||||||
|
if Document.objects.filter(checksum=checksum).exists():
|
||||||
|
if settings.CONSUMER_DELETE_DUPLICATES:
|
||||||
|
os.unlink(self.path)
|
||||||
|
raise ConsumerError(
|
||||||
|
"Not consuming {}: It is a duplicate.".format(self.filename)
|
||||||
)
|
)
|
||||||
|
|
||||||
if not os.path.exists(self.consume):
|
def pre_check_directories(self):
|
||||||
raise ConsumerError(
|
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
||||||
"Consumption directory {} does not exist".format(self.consume))
|
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
|
||||||
|
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
|
||||||
|
|
||||||
def log(self, level, message):
|
def log(self, level, message):
|
||||||
getattr(self.logger, level)(message, extra={
|
getattr(self.logger, level)(message, extra={
|
||||||
"group": self.logging_group
|
"group": self.logging_group
|
||||||
})
|
})
|
||||||
|
|
||||||
@transaction.atomic
|
def try_consume_file(self,
|
||||||
def try_consume_file(self, file):
|
path,
|
||||||
|
override_filename=None,
|
||||||
|
override_title=None,
|
||||||
|
override_correspondent_id=None,
|
||||||
|
override_document_type_id=None,
|
||||||
|
override_tag_ids=None):
|
||||||
"""
|
"""
|
||||||
Return True if file was consumed
|
Return the document object if it was successfully created.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
self.path = path
|
||||||
|
self.filename = override_filename or os.path.basename(path)
|
||||||
|
self.override_title = override_title
|
||||||
|
self.override_correspondent_id = override_correspondent_id
|
||||||
|
self.override_document_type_id = override_document_type_id
|
||||||
|
self.override_tag_ids = override_tag_ids
|
||||||
|
|
||||||
|
# this is for grouping logging entries for this particular file
|
||||||
|
# together.
|
||||||
|
|
||||||
self.logging_group = uuid.uuid4()
|
self.logging_group = uuid.uuid4()
|
||||||
|
|
||||||
if not re.match(FileInfo.REGEXES["title"], file):
|
# Make sure that preconditions for consuming the file are met.
|
||||||
return False
|
|
||||||
|
|
||||||
doc = file
|
self.pre_check_file_exists()
|
||||||
|
self.pre_check_consumption_dir()
|
||||||
|
self.pre_check_directories()
|
||||||
|
self.pre_check_regex()
|
||||||
|
self.pre_check_duplicate()
|
||||||
|
|
||||||
if self._is_duplicate(doc):
|
self.log("info", "Consuming {}".format(self.filename))
|
||||||
self.log(
|
|
||||||
"warning",
|
|
||||||
"Skipping {} as it appears to be a duplicate".format(doc)
|
|
||||||
)
|
|
||||||
if settings.CONSUMER_DELETE_DUPLICATES:
|
|
||||||
self._cleanup_doc(doc)
|
|
||||||
return False
|
|
||||||
|
|
||||||
self.log("info", "Consuming {}".format(doc))
|
# Determine the parser class.
|
||||||
|
|
||||||
parser_class = get_parser_class(doc)
|
parser_class = get_parser_class(self.filename)
|
||||||
if not parser_class:
|
if not parser_class:
|
||||||
self.log(
|
raise ConsumerError("No parsers abvailable for {}".format(self.filename))
|
||||||
"error", "No parsers could be found for {}".format(doc))
|
|
||||||
return False
|
|
||||||
else:
|
else:
|
||||||
self.log("info", "Parser: {}".format(parser_class.__name__))
|
self.log("debug", "Parser: {}".format(parser_class.__name__))
|
||||||
|
|
||||||
|
# Notify all listeners that we're going to do some work.
|
||||||
|
|
||||||
document_consumption_started.send(
|
document_consumption_started.send(
|
||||||
sender=self.__class__,
|
sender=self.__class__,
|
||||||
filename=doc,
|
filename=self.path,
|
||||||
logging_group=self.logging_group
|
logging_group=self.logging_group
|
||||||
)
|
)
|
||||||
|
|
||||||
document_parser = parser_class(doc, self.logging_group)
|
# This doesn't parse the document yet, but gives us a parser.
|
||||||
|
|
||||||
|
document_parser = parser_class(self.path, self.logging_group)
|
||||||
|
|
||||||
|
# However, this already created working directories which we have to
|
||||||
|
# clean up.
|
||||||
|
|
||||||
|
# Parse the document. This may take some time.
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.log("info", "Generating thumbnail for {}...".format(doc))
|
self.log("debug", "Generating thumbnail for {}...".format(self.filename))
|
||||||
thumbnail = document_parser.get_optimised_thumbnail()
|
thumbnail = document_parser.get_optimised_thumbnail()
|
||||||
|
self.log("debug", "Parsing {}...".format(self.filename))
|
||||||
text = document_parser.get_text()
|
text = document_parser.get_text()
|
||||||
date = document_parser.get_date()
|
date = document_parser.get_date()
|
||||||
document = self._store(
|
|
||||||
text,
|
|
||||||
doc,
|
|
||||||
thumbnail,
|
|
||||||
date
|
|
||||||
)
|
|
||||||
except ParseError as e:
|
except ParseError as e:
|
||||||
self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e))
|
|
||||||
document_parser.cleanup()
|
document_parser.cleanup()
|
||||||
return False
|
raise ConsumerError(e)
|
||||||
else:
|
|
||||||
document_parser.cleanup()
|
|
||||||
self._cleanup_doc(doc)
|
|
||||||
|
|
||||||
self.log(
|
# Prepare the document classifier.
|
||||||
"info",
|
|
||||||
"Document {} consumption finished".format(document)
|
|
||||||
)
|
|
||||||
|
|
||||||
classifier = None
|
# TODO: I don't really like to do this here, but this way we avoid
|
||||||
|
# reloading the classifier multiple times, since there are multiple
|
||||||
|
# post-consume hooks that all require the classifier.
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.classifier.reload()
|
classifier = DocumentClassifier()
|
||||||
classifier = self.classifier
|
classifier.reload()
|
||||||
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
|
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
|
||||||
logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
|
logging.getLogger(__name__).warning(
|
||||||
|
"Cannot classify documents: {}.".format(e))
|
||||||
|
classifier = None
|
||||||
|
|
||||||
|
# now that everything is done, we can start to store the document
|
||||||
|
# in the system. This will be a transaction and reasonably fast.
|
||||||
|
try:
|
||||||
|
with transaction.atomic():
|
||||||
|
|
||||||
|
# store the document.
|
||||||
|
document = self._store(
|
||||||
|
text=text,
|
||||||
|
date=date
|
||||||
|
)
|
||||||
|
|
||||||
|
# If we get here, it was successful. Proceed with post-consume
|
||||||
|
# hooks. If they fail, nothing will get changed.
|
||||||
|
|
||||||
document_consumption_finished.send(
|
document_consumption_finished.send(
|
||||||
sender=self.__class__,
|
sender=self.__class__,
|
||||||
@@ -144,20 +181,48 @@ class Consumer:
|
|||||||
logging_group=self.logging_group,
|
logging_group=self.logging_group,
|
||||||
classifier=classifier
|
classifier=classifier
|
||||||
)
|
)
|
||||||
return True
|
|
||||||
|
|
||||||
def _store(self, text, doc, thumbnail, date):
|
# After everything is in the database, copy the files into
|
||||||
|
# place. If this fails, we'll also rollback the transaction.
|
||||||
|
|
||||||
file_info = FileInfo.from_path(doc)
|
create_source_path_directory(document.source_path)
|
||||||
|
self._write(document, self.path, document.source_path)
|
||||||
|
self._write(document, thumbnail, document.thumbnail_path)
|
||||||
|
|
||||||
stats = os.stat(doc)
|
# Delete the file only if it was successfully consumed
|
||||||
|
self.log("debug", "Deleting file {}".format(self.path))
|
||||||
|
os.unlink(self.path)
|
||||||
|
except Exception as e:
|
||||||
|
raise ConsumerError(e)
|
||||||
|
finally:
|
||||||
|
document_parser.cleanup()
|
||||||
|
|
||||||
|
self.log(
|
||||||
|
"info",
|
||||||
|
"Document {} consumption finished".format(document)
|
||||||
|
)
|
||||||
|
|
||||||
|
return document
|
||||||
|
|
||||||
|
def _store(self, text, date):
|
||||||
|
|
||||||
|
# If someone gave us the original filename, use it instead of doc.
|
||||||
|
|
||||||
|
file_info = FileInfo.from_path(self.filename)
|
||||||
|
|
||||||
|
stats = os.stat(self.path)
|
||||||
|
|
||||||
self.log("debug", "Saving record to database")
|
self.log("debug", "Saving record to database")
|
||||||
|
|
||||||
created = file_info.created or date or timezone.make_aware(
|
created = file_info.created or date or timezone.make_aware(
|
||||||
datetime.datetime.fromtimestamp(stats.st_mtime))
|
datetime.datetime.fromtimestamp(stats.st_mtime))
|
||||||
|
|
||||||
with open(doc, "rb") as f:
|
if settings.PASSPHRASE:
|
||||||
|
storage_type = Document.STORAGE_TYPE_GPG
|
||||||
|
else:
|
||||||
|
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||||
|
|
||||||
|
with open(self.path, "rb") as f:
|
||||||
document = Document.objects.create(
|
document = Document.objects.create(
|
||||||
correspondent=file_info.correspondent,
|
correspondent=file_info.correspondent,
|
||||||
title=file_info.title,
|
title=file_info.title,
|
||||||
@@ -166,7 +231,7 @@ class Consumer:
|
|||||||
checksum=hashlib.md5(f.read()).hexdigest(),
|
checksum=hashlib.md5(f.read()).hexdigest(),
|
||||||
created=created,
|
created=created,
|
||||||
modified=created,
|
modified=created,
|
||||||
storage_type=self.storage_type
|
storage_type=storage_type
|
||||||
)
|
)
|
||||||
|
|
||||||
relevant_tags = set(file_info.tags)
|
relevant_tags = set(file_info.tags)
|
||||||
@@ -175,19 +240,30 @@ class Consumer:
|
|||||||
self.log("debug", "Tagging with {}".format(tag_names))
|
self.log("debug", "Tagging with {}".format(tag_names))
|
||||||
document.tags.add(*relevant_tags)
|
document.tags.add(*relevant_tags)
|
||||||
|
|
||||||
|
self.apply_overrides(document)
|
||||||
|
|
||||||
document.filename = generate_filename(document)
|
document.filename = generate_filename(document)
|
||||||
|
|
||||||
create_source_path_directory(document.source_path)
|
|
||||||
|
|
||||||
self._write(document, doc, document.source_path)
|
|
||||||
self._write(document, thumbnail, document.thumbnail_path)
|
|
||||||
|
|
||||||
# We need to save the document twice, since we need the PK of the
|
# We need to save the document twice, since we need the PK of the
|
||||||
# document in order to create its filename above.
|
# document in order to create its filename above.
|
||||||
document.save()
|
document.save()
|
||||||
|
|
||||||
return document
|
return document
|
||||||
|
|
||||||
|
def apply_overrides(self, document):
|
||||||
|
if self.override_title:
|
||||||
|
document.title = self.override_title
|
||||||
|
|
||||||
|
if self.override_correspondent_id:
|
||||||
|
document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
|
||||||
|
|
||||||
|
if self.override_document_type_id:
|
||||||
|
document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
|
||||||
|
|
||||||
|
if self.override_tag_ids:
|
||||||
|
for tag_id in self.override_tag_ids:
|
||||||
|
document.tags.add(Tag.objects.get(pk=tag_id))
|
||||||
|
|
||||||
def _write(self, document, source, target):
|
def _write(self, document, source, target):
|
||||||
with open(source, "rb") as read_file:
|
with open(source, "rb") as read_file:
|
||||||
with open(target, "wb") as write_file:
|
with open(target, "wb") as write_file:
|
||||||
@@ -196,13 +272,3 @@ class Consumer:
|
|||||||
return
|
return
|
||||||
self.log("debug", "Encrypting")
|
self.log("debug", "Encrypting")
|
||||||
write_file.write(GnuPG.encrypted(read_file))
|
write_file.write(GnuPG.encrypted(read_file))
|
||||||
|
|
||||||
def _cleanup_doc(self, doc):
|
|
||||||
self.log("debug", "Deleting document {}".format(doc))
|
|
||||||
os.unlink(doc)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _is_duplicate(doc):
|
|
||||||
with open(doc, "rb") as f:
|
|
||||||
checksum = hashlib.md5(f.read()).hexdigest()
|
|
||||||
return Document.objects.filter(checksum=checksum).exists()
|
|
||||||
|
@@ -1,9 +1,11 @@
|
|||||||
import os
|
import os
|
||||||
|
import tempfile
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from time import mktime
|
from time import mktime
|
||||||
|
|
||||||
from django import forms
|
from django import forms
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
from django_q.tasks import async_task
|
||||||
from pathvalidate import validate_filename, ValidationError
|
from pathvalidate import validate_filename, ValidationError
|
||||||
|
|
||||||
|
|
||||||
@@ -18,15 +20,6 @@ class UploadForm(forms.Form):
|
|||||||
raise forms.ValidationError("That filename is suspicious.")
|
raise forms.ValidationError("That filename is suspicious.")
|
||||||
return self.cleaned_data.get("document")
|
return self.cleaned_data.get("document")
|
||||||
|
|
||||||
def get_filename(self, i=None):
|
|
||||||
return os.path.join(
|
|
||||||
settings.CONSUMPTION_DIR,
|
|
||||||
"{}_{}".format(
|
|
||||||
str(i),
|
|
||||||
self.cleaned_data.get("document").name
|
|
||||||
) if i else self.cleaned_data.get("document").name
|
|
||||||
)
|
|
||||||
|
|
||||||
def save(self):
|
def save(self):
|
||||||
"""
|
"""
|
||||||
Since the consumer already does a lot of work, it's easier just to save
|
Since the consumer already does a lot of work, it's easier just to save
|
||||||
@@ -35,15 +28,13 @@ class UploadForm(forms.Form):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
document = self.cleaned_data.get("document").read()
|
document = self.cleaned_data.get("document").read()
|
||||||
|
original_filename = self.cleaned_data.get("document").name
|
||||||
|
|
||||||
t = int(mktime(datetime.now().timetuple()))
|
t = int(mktime(datetime.now().timetuple()))
|
||||||
|
|
||||||
file_name = self.get_filename()
|
with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f:
|
||||||
i = 0
|
|
||||||
while os.path.exists(file_name):
|
|
||||||
i += 1
|
|
||||||
file_name = self.get_filename(i)
|
|
||||||
|
|
||||||
with open(file_name, "wb") as f:
|
|
||||||
f.write(document)
|
f.write(document)
|
||||||
os.utime(file_name, times=(t, t))
|
os.utime(f.name, times=(t, t))
|
||||||
|
|
||||||
|
async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))
|
||||||
|
@@ -1,249 +0,0 @@
|
|||||||
import datetime
|
|
||||||
import imaplib
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import time
|
|
||||||
import uuid
|
|
||||||
from base64 import b64decode
|
|
||||||
from email import policy
|
|
||||||
from email.parser import BytesParser
|
|
||||||
|
|
||||||
from dateutil import parser
|
|
||||||
from django.conf import settings
|
|
||||||
|
|
||||||
from .models import Correspondent
|
|
||||||
|
|
||||||
|
|
||||||
class MailFetcherError(Exception):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class InvalidMessageError(MailFetcherError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class Loggable(object):
|
|
||||||
|
|
||||||
def __init__(self, group=None):
|
|
||||||
self.logger = logging.getLogger(__name__)
|
|
||||||
self.logging_group = group or uuid.uuid4()
|
|
||||||
|
|
||||||
def log(self, level, message):
|
|
||||||
getattr(self.logger, level)(message, extra={
|
|
||||||
"group": self.logging_group
|
|
||||||
})
|
|
||||||
|
|
||||||
|
|
||||||
class Message(Loggable):
|
|
||||||
"""
|
|
||||||
A crude, but simple email message class. We assume that there's a subject
|
|
||||||
and n attachments, and that we don't care about the message body.
|
|
||||||
"""
|
|
||||||
|
|
||||||
SECRET = os.getenv("PAPERLESS_EMAIL_SECRET")
|
|
||||||
|
|
||||||
def __init__(self, data, group=None):
|
|
||||||
"""
|
|
||||||
Cribbed heavily from
|
|
||||||
https://www.ianlewis.org/en/parsing-email-attachments-python
|
|
||||||
"""
|
|
||||||
|
|
||||||
Loggable.__init__(self, group=group)
|
|
||||||
|
|
||||||
self.subject = None
|
|
||||||
self.time = None
|
|
||||||
self.attachment = None
|
|
||||||
|
|
||||||
message = BytesParser(policy=policy.default).parsebytes(data)
|
|
||||||
self.subject = str(message["Subject"]).replace("\r\n", "")
|
|
||||||
self.body = str(message.get_body())
|
|
||||||
|
|
||||||
self.check_subject()
|
|
||||||
self.check_body()
|
|
||||||
|
|
||||||
self._set_time(message)
|
|
||||||
|
|
||||||
self.log("info", 'Importing email: "{}"'.format(self.subject))
|
|
||||||
|
|
||||||
attachments = []
|
|
||||||
for part in message.walk():
|
|
||||||
|
|
||||||
content_disposition = part.get("Content-Disposition")
|
|
||||||
if not content_disposition:
|
|
||||||
continue
|
|
||||||
|
|
||||||
dispositions = content_disposition.strip().split(";")
|
|
||||||
if len(dispositions) < 2:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not dispositions[0].lower() == "attachment" and \
|
|
||||||
"filename" not in dispositions[1].lower():
|
|
||||||
continue
|
|
||||||
|
|
||||||
file_data = part.get_payload()
|
|
||||||
|
|
||||||
attachments.append(Attachment(
|
|
||||||
b64decode(file_data), content_type=part.get_content_type()))
|
|
||||||
|
|
||||||
if len(attachments) == 0:
|
|
||||||
raise InvalidMessageError(
|
|
||||||
"There don't appear to be any attachments to this message")
|
|
||||||
|
|
||||||
if len(attachments) > 1:
|
|
||||||
raise InvalidMessageError(
|
|
||||||
"There's more than one attachment to this message. It cannot "
|
|
||||||
"be indexed automatically."
|
|
||||||
)
|
|
||||||
|
|
||||||
self.attachment = attachments[0]
|
|
||||||
|
|
||||||
def __bool__(self):
|
|
||||||
return bool(self.attachment)
|
|
||||||
|
|
||||||
def check_subject(self):
|
|
||||||
if self.subject is None:
|
|
||||||
raise InvalidMessageError("Message does not have a subject")
|
|
||||||
if not Correspondent.SAFE_REGEX.match(self.subject):
|
|
||||||
raise InvalidMessageError("Message subject is unsafe: {}".format(
|
|
||||||
self.subject))
|
|
||||||
|
|
||||||
def check_body(self):
|
|
||||||
if self.SECRET not in self.body:
|
|
||||||
raise InvalidMessageError("The secret wasn't in the body")
|
|
||||||
|
|
||||||
def _set_time(self, message):
|
|
||||||
self.time = datetime.datetime.now()
|
|
||||||
message_time = message.get("Date")
|
|
||||||
if message_time:
|
|
||||||
try:
|
|
||||||
self.time = parser.parse(message_time)
|
|
||||||
except (ValueError, AttributeError):
|
|
||||||
pass # We assume that "now" is ok
|
|
||||||
|
|
||||||
@property
|
|
||||||
def file_name(self):
|
|
||||||
return "{}.{}".format(self.subject, self.attachment.suffix)
|
|
||||||
|
|
||||||
|
|
||||||
class Attachment(object):
|
|
||||||
|
|
||||||
SAFE_SUFFIX_REGEX = re.compile(
|
|
||||||
r"^(application/(pdf))|(image/(png|jpeg|gif|tiff))$")
|
|
||||||
|
|
||||||
def __init__(self, data, content_type):
|
|
||||||
|
|
||||||
self.content_type = content_type
|
|
||||||
self.data = data
|
|
||||||
self.suffix = None
|
|
||||||
|
|
||||||
m = self.SAFE_SUFFIX_REGEX.match(self.content_type)
|
|
||||||
if not m:
|
|
||||||
raise MailFetcherError(
|
|
||||||
"Not-awesome file type: {}".format(self.content_type))
|
|
||||||
self.suffix = m.group(2) or m.group(4)
|
|
||||||
|
|
||||||
def read(self):
|
|
||||||
return self.data
|
|
||||||
|
|
||||||
|
|
||||||
class MailFetcher(Loggable):
|
|
||||||
|
|
||||||
def __init__(self, consume=settings.CONSUMPTION_DIR):
|
|
||||||
|
|
||||||
Loggable.__init__(self)
|
|
||||||
|
|
||||||
self._connection = None
|
|
||||||
self._host = os.getenv("PAPERLESS_CONSUME_MAIL_HOST")
|
|
||||||
self._port = os.getenv("PAPERLESS_CONSUME_MAIL_PORT")
|
|
||||||
self._username = os.getenv("PAPERLESS_CONSUME_MAIL_USER")
|
|
||||||
self._password = os.getenv("PAPERLESS_CONSUME_MAIL_PASS")
|
|
||||||
self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")
|
|
||||||
|
|
||||||
self._enabled = bool(self._host)
|
|
||||||
if self._enabled and Message.SECRET is None:
|
|
||||||
raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined")
|
|
||||||
|
|
||||||
self.last_checked = time.time()
|
|
||||||
self.consume = consume
|
|
||||||
|
|
||||||
def pull(self):
|
|
||||||
"""
|
|
||||||
Fetch all available mail at the target address and store it locally in
|
|
||||||
the consumption directory so that the file consumer can pick it up and
|
|
||||||
do its thing.
|
|
||||||
"""
|
|
||||||
|
|
||||||
if self._enabled:
|
|
||||||
|
|
||||||
# Reset the grouping id for each fetch
|
|
||||||
self.logging_group = uuid.uuid4()
|
|
||||||
|
|
||||||
self.log("debug", "Checking mail")
|
|
||||||
|
|
||||||
for message in self._get_messages():
|
|
||||||
|
|
||||||
self.log("info", 'Storing email: "{}"'.format(message.subject))
|
|
||||||
|
|
||||||
t = int(time.mktime(message.time.timetuple()))
|
|
||||||
file_name = os.path.join(self.consume, message.file_name)
|
|
||||||
with open(file_name, "wb") as f:
|
|
||||||
f.write(message.attachment.data)
|
|
||||||
os.utime(file_name, times=(t, t))
|
|
||||||
|
|
||||||
self.last_checked = time.time()
|
|
||||||
|
|
||||||
def _get_messages(self):
|
|
||||||
|
|
||||||
r = []
|
|
||||||
try:
|
|
||||||
|
|
||||||
self._connect()
|
|
||||||
self._login()
|
|
||||||
|
|
||||||
for message in self._fetch():
|
|
||||||
if message:
|
|
||||||
r.append(message)
|
|
||||||
|
|
||||||
self._connection.expunge()
|
|
||||||
self._connection.close()
|
|
||||||
self._connection.logout()
|
|
||||||
|
|
||||||
except MailFetcherError as e:
|
|
||||||
self.log("error", str(e))
|
|
||||||
|
|
||||||
return r
|
|
||||||
|
|
||||||
def _connect(self):
|
|
||||||
try:
|
|
||||||
self._connection = imaplib.IMAP4_SSL(self._host, self._port)
|
|
||||||
except OSError as e:
|
|
||||||
msg = "Problem connecting to {}: {}".format(self._host, e.strerror)
|
|
||||||
raise MailFetcherError(msg)
|
|
||||||
|
|
||||||
def _login(self):
|
|
||||||
|
|
||||||
login = self._connection.login(self._username, self._password)
|
|
||||||
if not login[0] == "OK":
|
|
||||||
raise MailFetcherError("Can't log into mail: {}".format(login[1]))
|
|
||||||
|
|
||||||
inbox = self._connection.select(self._inbox)
|
|
||||||
if not inbox[0] == "OK":
|
|
||||||
raise MailFetcherError("Can't find the inbox: {}".format(inbox[1]))
|
|
||||||
|
|
||||||
def _fetch(self):
|
|
||||||
|
|
||||||
for num in self._connection.search(None, "ALL")[1][0].split():
|
|
||||||
|
|
||||||
__, data = self._connection.fetch(num, "(RFC822)")
|
|
||||||
|
|
||||||
message = None
|
|
||||||
try:
|
|
||||||
message = Message(data[0][1], self.logging_group)
|
|
||||||
except InvalidMessageError as e:
|
|
||||||
self.log("error", str(e))
|
|
||||||
else:
|
|
||||||
self._connection.store(num, "+FLAGS", "\\Deleted")
|
|
||||||
|
|
||||||
if message:
|
|
||||||
yield message
|
|
@@ -3,10 +3,10 @@ import os
|
|||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
|
from django_q.tasks import async_task
|
||||||
from watchdog.events import FileSystemEventHandler
|
from watchdog.events import FileSystemEventHandler
|
||||||
from watchdog.observers import Observer
|
from watchdog.observers import Observer
|
||||||
|
from watchdog.observers.polling import PollingObserver
|
||||||
from documents.consumer import Consumer
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from inotify_simple import INotify, flags
|
from inotify_simple import INotify, flags
|
||||||
@@ -16,13 +16,10 @@ except ImportError:
|
|||||||
|
|
||||||
class Handler(FileSystemEventHandler):
|
class Handler(FileSystemEventHandler):
|
||||||
|
|
||||||
def __init__(self, consumer):
|
|
||||||
self.consumer = consumer
|
|
||||||
|
|
||||||
def _consume(self, file):
|
def _consume(self, file):
|
||||||
if os.path.isfile(file):
|
if os.path.isfile(file):
|
||||||
try:
|
try:
|
||||||
self.consumer.try_consume_file(file)
|
async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Catch all so that the consumer won't crash.
|
# Catch all so that the consumer won't crash.
|
||||||
logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
|
logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
|
||||||
@@ -37,7 +34,7 @@ class Handler(FileSystemEventHandler):
|
|||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
"""
|
"""
|
||||||
On every iteration of an infinite loop, consume what we can from the
|
On every iteration of an infinite loop, consume what we can from the
|
||||||
consumption directory, and fetch any mail available.
|
consumption directory.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
@@ -45,12 +42,6 @@ class Command(BaseCommand):
|
|||||||
self.verbosity = 0
|
self.verbosity = 0
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
self.file_consumer = None
|
|
||||||
self.mail_fetcher = None
|
|
||||||
self.first_iteration = True
|
|
||||||
|
|
||||||
self.consumer = Consumer()
|
|
||||||
|
|
||||||
BaseCommand.__init__(self, *args, **kwargs)
|
BaseCommand.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
def add_arguments(self, parser):
|
def add_arguments(self, parser):
|
||||||
@@ -66,9 +57,6 @@ class Command(BaseCommand):
|
|||||||
self.verbosity = options["verbosity"]
|
self.verbosity = options["verbosity"]
|
||||||
directory = options["directory"]
|
directory = options["directory"]
|
||||||
|
|
||||||
for d in (settings.ORIGINALS_DIR, settings.THUMBNAIL_DIR):
|
|
||||||
os.makedirs(d, exist_ok=True)
|
|
||||||
|
|
||||||
logging.getLogger(__name__).info(
|
logging.getLogger(__name__).info(
|
||||||
"Starting document consumer at {}".format(
|
"Starting document consumer at {}".format(
|
||||||
directory
|
directory
|
||||||
@@ -78,11 +66,16 @@ class Command(BaseCommand):
|
|||||||
# Consume all files as this is not done initially by the watchdog
|
# Consume all files as this is not done initially by the watchdog
|
||||||
for entry in os.scandir(directory):
|
for entry in os.scandir(directory):
|
||||||
if entry.is_file():
|
if entry.is_file():
|
||||||
self.consumer.try_consume_file(entry.path)
|
async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
|
||||||
|
|
||||||
# Start the watchdog. Woof!
|
# Start the watchdog. Woof!
|
||||||
|
if settings.CONSUMER_POLLING > 0:
|
||||||
|
logging.getLogger(__name__).info('Using polling instead of file'
|
||||||
|
'system notifications.')
|
||||||
|
observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
|
||||||
|
else:
|
||||||
observer = Observer()
|
observer = Observer()
|
||||||
event_handler = Handler(self.consumer)
|
event_handler = Handler()
|
||||||
observer.schedule(event_handler, directory, recursive=True)
|
observer.schedule(event_handler, directory, recursive=True)
|
||||||
observer.start()
|
observer.start()
|
||||||
try:
|
try:
|
||||||
|
@@ -9,13 +9,11 @@ from django_q.tasks import schedule
|
|||||||
def add_schedules(apps, schema_editor):
|
def add_schedules(apps, schema_editor):
|
||||||
schedule('documents.tasks.train_classifier', name="Train the classifier", schedule_type=Schedule.HOURLY)
|
schedule('documents.tasks.train_classifier', name="Train the classifier", schedule_type=Schedule.HOURLY)
|
||||||
schedule('documents.tasks.index_optimize', name="Optimize the index", schedule_type=Schedule.DAILY)
|
schedule('documents.tasks.index_optimize', name="Optimize the index", schedule_type=Schedule.DAILY)
|
||||||
schedule('documents.tasks.consume_mail', name="Check E-Mail", schedule_type=Schedule.MINUTES, minutes=10)
|
|
||||||
|
|
||||||
|
|
||||||
def remove_schedules(apps, schema_editor):
|
def remove_schedules(apps, schema_editor):
|
||||||
Schedule.objects.filter(func='documents.tasks.train_classifier').delete()
|
Schedule.objects.filter(func='documents.tasks.train_classifier').delete()
|
||||||
Schedule.objects.filter(func='documents.tasks.index_optimize').delete()
|
Schedule.objects.filter(func='documents.tasks.index_optimize').delete()
|
||||||
Schedule.objects.filter(func='documents.tasks.consume_mail').delete()
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
class Migration(migrations.Migration):
|
||||||
|
@@ -113,6 +113,7 @@ class DocumentType(MatchingModel):
|
|||||||
|
|
||||||
class Document(models.Model):
|
class Document(models.Model):
|
||||||
|
|
||||||
|
# TODO: why do we need an explicit list
|
||||||
TYPE_PDF = "pdf"
|
TYPE_PDF = "pdf"
|
||||||
TYPE_PNG = "png"
|
TYPE_PNG = "png"
|
||||||
TYPE_JPG = "jpg"
|
TYPE_JPG = "jpg"
|
||||||
@@ -291,7 +292,7 @@ class FileInfo:
|
|||||||
non_separated_word=r"([\w,. ]|([^\s]-))"
|
non_separated_word=r"([\w,. ]|([^\s]-))"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
# TODO: what is this used for
|
||||||
formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
|
formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
|
||||||
REGEXES = OrderedDict([
|
REGEXES = OrderedDict([
|
||||||
("created-correspondent-title-tags", re.compile(
|
("created-correspondent-title-tags", re.compile(
|
||||||
|
@@ -41,15 +41,16 @@ def get_parser_class(doc):
|
|||||||
Determine the appropriate parser class based on the file
|
Determine the appropriate parser class based on the file
|
||||||
"""
|
"""
|
||||||
|
|
||||||
parsers = []
|
|
||||||
for response in document_consumer_declaration.send(None):
|
|
||||||
parsers.append(response[1])
|
|
||||||
|
|
||||||
options = []
|
options = []
|
||||||
for parser in parsers:
|
|
||||||
result = parser(doc)
|
# Sein letzter Befehl war: KOMMT! Und sie kamen. Alle. Sogar die Parser.
|
||||||
if result:
|
|
||||||
options.append(result)
|
for response in document_consumer_declaration.send(None):
|
||||||
|
parser_declaration = response[1]
|
||||||
|
parser_test = parser_declaration["test"]
|
||||||
|
|
||||||
|
if parser_test(doc):
|
||||||
|
options.append(parser_declaration)
|
||||||
|
|
||||||
if not options:
|
if not options:
|
||||||
return None
|
return None
|
||||||
|
@@ -6,14 +6,10 @@ from whoosh.writing import AsyncWriter
|
|||||||
from documents import index
|
from documents import index
|
||||||
from documents.classifier import DocumentClassifier, \
|
from documents.classifier import DocumentClassifier, \
|
||||||
IncompatibleClassifierVersionError
|
IncompatibleClassifierVersionError
|
||||||
from documents.mail import MailFetcher
|
from documents.consumer import Consumer, ConsumerError
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
|
|
||||||
|
|
||||||
def consume_mail():
|
|
||||||
MailFetcher().pull()
|
|
||||||
|
|
||||||
|
|
||||||
def index_optimize():
|
def index_optimize():
|
||||||
index.open_index().optimize()
|
index.open_index().optimize()
|
||||||
|
|
||||||
@@ -54,3 +50,27 @@ def train_classifier():
|
|||||||
logging.getLogger(__name__).error(
|
logging.getLogger(__name__).error(
|
||||||
"Classifier error: " + str(e)
|
"Classifier error: " + str(e)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def consume_file(path,
|
||||||
|
override_filename=None,
|
||||||
|
override_title=None,
|
||||||
|
override_correspondent_id=None,
|
||||||
|
override_document_type_id=None,
|
||||||
|
override_tag_ids=None):
|
||||||
|
|
||||||
|
document = Consumer().try_consume_file(
|
||||||
|
path,
|
||||||
|
override_filename=override_filename,
|
||||||
|
override_title=override_title,
|
||||||
|
override_correspondent_id=override_correspondent_id,
|
||||||
|
override_document_type_id=override_document_type_id,
|
||||||
|
override_tag_ids=override_tag_ids)
|
||||||
|
|
||||||
|
if document:
|
||||||
|
return "Success. New document id {} created".format(
|
||||||
|
document.pk
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ConsumerError("Unknown error: Returned document was null, but "
|
||||||
|
"no error message was given.")
|
||||||
|
File diff suppressed because it is too large
Load Diff
@@ -1,208 +0,0 @@
|
|||||||
Return-Path: <sender@example.com>
|
|
||||||
X-Original-To: sender@mailbox4.mailhost.com
|
|
||||||
Delivered-To: sender@mailbox4.mailhost.com
|
|
||||||
Received: from mx8.mailhost.com (mail8.mailhost.com [75.126.24.68])
|
|
||||||
by mailbox4.mailhost.com (Postfix) with ESMTP id B62BD5498001
|
|
||||||
for <sender@mailbox4.mailhost.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
|
|
||||||
Received: from localhost (localhost.localdomain [127.0.0.1])
|
|
||||||
by mx8.mailhost.com (Postfix) with ESMTP id B41796F190D
|
|
||||||
for <sender@mailbox4.mailhost.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
|
|
||||||
X-Spam-Flag: NO
|
|
||||||
X-Spam-Score: 0
|
|
||||||
X-Spam-Level:
|
|
||||||
X-Spam-Status: No, score=0 tagged_above=-999 required=3
|
|
||||||
tests=[RCVD_IN_DNSWL_NONE=-0.0001]
|
|
||||||
Received: from mx8.mailhost.com ([127.0.0.1])
|
|
||||||
by localhost (mail8.mailhost.com [127.0.0.1]) (amavisd-new, port 10024)
|
|
||||||
with ESMTP id 3cj6d28FXsS3 for <sender@mailbox4.mailhost.com>;
|
|
||||||
Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
|
|
||||||
Received: from smtp.mailhost.com (smtp.mailhost.com [74.55.86.74])
|
|
||||||
by mx8.mailhost.com (Postfix) with ESMTP id 527D76F1529
|
|
||||||
for <paperless@example.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
|
|
||||||
Received: from [10.114.0.19] (nl3x.mullvad.net [46.166.136.162])
|
|
||||||
by smtp.mailhost.com (Postfix) with ESMTP id 9C52420C6FDA
|
|
||||||
for <paperless@example.com>; Thu, 4 Feb 2016 22:01:16 +0000 (UTC)
|
|
||||||
To: paperless@example.com
|
|
||||||
From: Daniel Quinn <sender@example.com>
|
|
||||||
Subject: Test 0
|
|
||||||
Message-ID: <56B3CA2A.6030806@example.com>
|
|
||||||
Date: Thu, 4 Feb 2016 22:01:14 +0000
|
|
||||||
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101
|
|
||||||
Thunderbird/38.5.0
|
|
||||||
MIME-Version: 1.0
|
|
||||||
Content-Type: multipart/mixed;
|
|
||||||
boundary="------------090701020702030809070008"
|
|
||||||
|
|
||||||
This is a multi-part message in MIME format.
|
|
||||||
--------------090701020702030809070008
|
|
||||||
Content-Type: text/plain; charset=utf-8
|
|
||||||
Content-Transfer-Encoding: 7bit
|
|
||||||
|
|
||||||
The secret word is "paperless" :-)
|
|
||||||
|
|
||||||
--------------090701020702030809070008
|
|
||||||
Content-Type: application/pdf;
|
|
||||||
name="test0.pdf"
|
|
||||||
Content-Transfer-Encoding: base64
|
|
||||||
Content-Disposition: attachment;
|
|
||||||
filename="test0.pdf"
|
|
||||||
|
|
||||||
JVBERi0xLjQKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0
|
|
||||||
ZURlY29kZT4+CnN0cmVhbQp4nFWLQQvCMAyF7/kVOQutSdeuHZSA0+3gbVDwIN6c3gR38e/b
|
|
||||||
bF4kkPfyvReyjB94IyFVF7pgG0ze4TLDZYevLamzPKEvEFqbMEZfq+WO+5GRHZbHNROLy+So
|
|
||||||
UfFi6g7/RyusEpUl9VsQxQTlHR2oV3wUEzOdhOnXG1aw/o1yK2cYCkww4RdbUCevCmVuZHN0
|
|
||||||
cmVhbQplbmRvYmoKCjMgMCBvYmoKMTM5CmVuZG9iagoKNSAwIG9iago8PC9MZW5ndGggNiAw
|
|
||||||
IFIvRmlsdGVyL0ZsYXRlRGVjb2RlL0xlbmd0aDEgMTA4MjQ+PgpzdHJlYW0KeJzlOWt0G9WZ
|
|
||||||
95uRbNmWLckPWY4SaRTFedmybI8T4rw8sS3ZiZ1YfqWSCbFkS7YEtiQkJSE8GlNeOQ5pUmh5
|
|
||||||
Zkt2l+XQNl3GhLaBpcWw0D19UGALLRRS0gM9nD0lxVBK9wCx97tXI0UJAc727L8d+c587/u9
|
|
||||||
7p0rOZXYEyJaMkV4Io1OBuLOqmqBEPJLQqB0dG9K2NRTsQHhM4Rw/zkWH5+870e7PiRE9Rgh
|
|
||||||
+Y+NT+wf+/b3e4YI0YYJKX41HAoEfxj6vUjIIgltrA0jYef8/nzEr0F8WXgydY2bP7QO8WOI
|
|
||||||
SxOx0cDxxbUmxN9AfOlk4Jr4apWLI8SMKBGigcmQpYXrRBx9KtobjyVTQbJsgZDl91B+PBGK
|
|
||||||
d9838hzipwjhjyIN8EMvLYJ5FOd4lTovX1NQWKQtLtGR/3eX+jCpIJ3qTURH4ux+wcWfIFXk
|
|
||||||
XkIW3qXY+ft898LH/5deaNKPe8hD5DFymLxGrlAYbuIhEbIHKbnX0+QlpNLLQ4bId8n055g9
|
|
||||||
QU4hPy3nJ0doJJe8PORucpL8xwWzeMgkuQ59+QF5DRrIz7BVYuQD0JAbyXNo9QOkbb+UKa4E
|
|
||||||
b2MMHMuhvk7u5w6RbdzbiNxLOZyT05NnyTHYjZZTGOfhbMQbP2P0NnID3vtJmOxFmF3qTZ/+
|
|
||||||
jhQs/AWjuoFsI18jW8hEjsaT8ABfiPUbIA9gTp9mNGeGmd/JX8n9kOPO3YnIN8g4jgBg7Nxh
|
|
||||||
fsvnZOh/ffGDpBhW8dWk4FJcrono5j/mGhc+5JeRQjK4MJehLXQt/IUPzEdVw6rF6k2qX3zR
|
|
||||||
HHnfUE2iNln44/x180H1DvVDWK2HcePouHzI5x0c6O/r9fTs2N7dtW1rZ4fb1d7WukVq2bxp
|
|
||||||
44b1zesuW7umod5Z56hduWJ59TL7UpvVVG7Q60qKiwoLNPl5ahXPAakVZPC7ZL5aMLgDdpc9
|
|
||||||
0OmoFVymcLuj1mV3+2UhIMj4UC23d3Yykj0gC35BXo6PQA7ZL0soOXaRpJSWlLKSoBc2ko10
|
|
||||||
CrsgP99uF07BUK8X4cPtdp8gn2XwdgarljOkGBGbDTWYV9RbwSW794anXX70EWaKCtvsbaFC
|
|
||||||
Ry2ZKSxCsAgheaU9PgMrNwMDuJWu9TMc0RTTaTFSVyAoe3q9rnazzeZz1G6VS+ztjEXamEk5
|
|
||||||
r03OZyaFCHWdHBJmamenbz+lJyP+Gm3QHgzs8sp8AHWnedf09G2yoUZeZW+XV137tgkjD8m1
|
|
||||||
9naXXEOtdvVl5+k6PyXI6mq9XZj+K8Fw7GffvZASUCh51fq/EgrKXJsMfV4bvcxuzPX0tNsu
|
|
||||||
uKf904FTC1MjdkFvn57RaqfjLkw38XjRxKmFJw6ZZfftPlnvD8N6nxK6u69LLuu93Ctz1W4h
|
|
||||||
HEAK/rXYbevMNkNWxvN5bIJpweRghm02moZDpyQygog81etN4wIZMT9KJGeNT+b8lDOb4VQM
|
|
||||||
Us5UhpNV99uxtl393mlZVb01aHdhxg8F5KkR7K4raWHsernkI7PNPl1qEJqdPiYroFdbgxFB
|
|
||||||
Vi/HJKFWrgL2DVWZ1jOk5KP046wZJ1huKBWa7WiG2nHZXX7lb2/YhAYETHRnTboRBryy1I6A
|
|
||||||
FFAq5pqpd6JGwI8Fi7SzYspOe1wut7dmq0vdckX6vUxFUZPL22TiH1W0ZKeLrSvBNe1vT7tA
|
|
||||||
bdl7vY8TceHMTJNgPimSJuJrp8LGNuyy5a5pb3BMtvrNQVx3Y4LXbJMlH1bYZ/eGfLTtMEOr
|
|
||||||
zphZc/hYrwx4u/rtXb1D3nWKI2kGNaeqdl1kxu41p81gA8qaao3g5cy8DwX1SBDcCNhbN+Jd
|
|
||||||
zq/W4NBjwhmVNm7rRsELZpKRRjfkVYIr1K7IUfwCo2raTm2dGWt5FEU7bZ1mm8+Wvhy1HLIF
|
|
||||||
ZWLU0NCkdmZYuE0hQ4P92dbJSDSXJtr0gtcesvvsYUGWPF4aG00Py7KSDJZzpVYDF2A5ycI0
|
|
||||||
ERuyMwhNpuyuMecmV+5geBbtvIi9NcMWpjX2rv5patyuGCTo+VaZ0BaW1hnMbC+gC9qOe6+g
|
|
||||||
xyXNFvT0jCTRxRxeT43Ytwan7f3ejUwa95MbzNfSuUpJF3QNtDpqcWtrnbHDwd4ZCQ72D3kf
|
|
||||||
1+O58OCA91EOuDZ/q29mGfK8jwv40mBUjlIpkSICRailPkQ0TN78uETIFOOqGIHho6eAMJom
|
|
||||||
QwMyeopL0/TpiZaziSTCIUeV5kgZaRXSNGnaFKOxa4bQlEmFakkjFUharpgzzwAlPYqUJ/Ac
|
|
||||||
WwDkpBaKwTyDWn2MfAqmZgokc1piCiWktIcHB89PPTjkPanFt7OZ3XGiVnphu5jCWGx8rbiE
|
|
||||||
IG2U633hab+PLjZixNLgH8hg34xlsm9GR/K0cqE91CoX2VspvYXSW9L0PErPxxYFI6D6FNbe
|
|
||||||
IwPtgMu9NlySwqKfmaf1Z2mlfLipTOv/6MCMVeP3hqfxDFoOG6XTpVwRp+ErjFqigQJeoykw
|
|
||||||
8AW831fAl3KEG/aR0hYj6IxwxghPGeGIEQ4YYdgISBQY/ao5I7xghOOMFzdCjxGsjJGmy0Z4
|
|
||||||
gLFiTE0yQj0TIEZ4k3GnGL2eUTYssHnSakcYo4fx5hhdzsyRVhCYzhwzNMummWJcdM2ZmeOK
|
|
||||||
7HV15koo1+6L6J/hUB5pqTEQ0cTuBtHkHN59hWgohcpmg9hQb1tzmcG+VAd2g81gX1EHNWCo
|
|
||||||
rIANr4jnrjC3qY61my0/v6bhlTVm1d3lL8GG+edeyi/65CrzGnqgAlKOJ7c/4neCJeQJaT8p
|
|
||||||
L68qLikpqCqwWJcs8viWkHJEKqs8Pm1lRRnHqdWGPp9af9wKZ6wwawW9FYgVmhE5aoW4FfxW
|
|
||||||
8FhBskK9FQQrWBkbWVMZLrJeZJqyFY7n0HOTk0hckAAldoy6RaSAyNJQCs0Ye/rTUA/l+ZtB
|
|
||||||
bDRWYOA0G032pfkKuGKNDdz5nT9qufb6xPxVNzy0+6YD88F9t0Mj/1G4btXGr9927q4qh6OK
|
|
||||||
231iybkyCqk5kwMXTg2eT0vV3aQIvy39gzRGtNo8g6HSyBf0+wgPep6vkCpKPb4KndagM3h8
|
|
||||||
uorySlBVQvOHlXC0Erh4JfgrwVMJUiXMVoJcCccZKlSCvhJIJcwxCormSl7YIzQFwywL2fKT
|
|
||||||
RSb9r7D4LAEGUQk+z750+ZqmtZgA/nzQ10mOWkmqdUiF/zhfdfwWqFG9mcalT9bTOHmhiq7B
|
|
||||||
gYV3uV/zz5GVxCc12fLLFxVjS6xaXWzjKystHp+5Us8XeXz5vHFqNcRXg381eFaDsBoeWQ3D
|
|
||||||
q6FnNWT8JVgewmpUSrA26QKhg1kPV6wRK41i45omJ9RxzN3KCvuK5faleRXlxkoLz/165vvu
|
|
||||||
79Q7GrqueeZeX2hX43eOjt/vXL0m0Tu4fcedQy120Nx+dEnpOze1P3Rt0xJb+6j7+iPW5yed
|
|
||||||
nvbmHYsa69p20q8ZpHPhXf5q/mlixt1lUmoxaKqrVYJWW6Xi8di/tHBpr89UYTAsxooZrAZO
|
|
||||||
yxsMRFNozFdhjBWkwuMj+qkVMLwCpBWAwBVYBEw+MbEhljY708knzawn0yvQoESp9N8KDNbQ
|
|
||||||
tBlaYE3TcrYu16yF/BKoKBcb114GL933jT3z82WJmfe3Hr/ncMe2YP/Sdf8E5KZbh4+0jzby
|
|
||||||
T3/1a+duqXLsToBp93VbeNWdgV3OPc/b5y0q9e6obDWxNYs1c6huJEbSIa0oLCnJL+P5SpNK
|
|
||||||
W6T1+Aryi3S4pg29PmJ8wASyCVpM4DTRMiUybSSKivfNpc2NjbSH1NhABvuaFhArxAq7oRzr
|
|
||||||
dFlFCcAO//B1N4RafvvbDfXr++03lyfGuTsdK155ZeDcgS2t+i0mK8u5B3Puxh6qIIvJYWmo
|
|
||||||
CkC3SFOhq1hiqSKY6CprFSa6qkpbWmr0+Er1WnWvT2uctYBsgeMWOGqBKQvELeC3gMcCxAKb
|
|
||||||
8SFZoN4CggX0FphjciiU2R2yO+MVSnFoRUzOzMJINx5bGxXlFqBpx2CwBQ3YdYKhArDlbE3L
|
|
||||||
QbXpwPjab9bX/8vO13/xq6cgMn93OAZ37ILXSqfv9ZQWrbPWvQvqjz6YH+uDYw8/ePJeGus2
|
|
||||||
jPUd3C/LcMecknrKVUWkqkqv0lusZXqPrwz3A4yY5GOD5eurUIGr7PVxRtwGO3J3RsI2wSlG
|
|
||||||
SQN+RldWvxLk+Z0v04HnNz4WXnWeXTA0leJKWr4JcNHT9gNWPMNyu8D9+uq75w/87uWJWN63
|
|
||||||
oT01/9/z1qmbrx7yJeY/dQ/BH/4GUGm75UOT4+PHqxzw/E/+bQX3joHVcwfG+CjWsxA77Anp
|
|
||||||
RoO6iKhJpUlT4vFp9Fy5BwMSTEBMcMYEHhPUm0BvgjmGvmiCWdZ1x01w1ARTJoibwG8CyQRp
|
|
||||||
lQ0PMJKHkeoZVc8YufrHmWZaDe9XfO6bMbtdZpdpNkFYfL0tsy/mNyn7DPYC/+h858uvvvrG
|
|
||||||
b3732FdvvWnPvhtvnoLX5w3z7//507/95dVnnjjz1o+fTb8baR52YB6MxC9txCwY1UbMgg7f
|
|
||||||
hhq9sZwv7/XxRvR8c24kcyyGdABIf8QEw3TxZd3fnd3MxVxfq7E/BQPbFA10UxTSa5Df0XBi
|
|
||||||
aP6y/3rttuOX1fSn5j/85+/dMdG8bBW8/6dz1vmPH3LOh1/+gY36akZfT/Mn0NdvScOktFil
|
|
||||||
KigtqDSpy4xl2IpGnQqPpX2+Yr1RW4D+Vxxn2Z7NJL/5TE49CCtgtm5yJpw0RTBBbtpzX9NE
|
|
||||||
eUUrj5yXNH0H0K5UenQFXY1VtGOh+fj1E18Hcd/8nzUdT7TMXQMW0J6wcu9UOT69r8rRvaIZ
|
|
||||||
yrkxfFPRGPGdnFeF9WiAR6UFgzZv8WIbWbnS4bBpebGxoc7ja9CttC02aB01Do/PqqupqMrL
|
|
||||||
Kygo7/MV6FfgMYev7vPx+r0i7BRhrQjLRDCKkCfCRyK8LcLLIvxUhAdFuEuEERHAI0K7CPVM
|
|
||||||
rlwElQjhuYzgYyKkRJBEaGJs5H0owusizIogMxs3ixAUFRNpGX1G7EURnhXheyIcZWJXibBB
|
|
||||||
BCEzx7r0BMdF8IswkJmjnGm+zTS/KcIUTi/V5PDNTPdt5gAnM4E4mx5n1YmgUdbL8BcfMy88
|
|
||||||
heYcxM6r5wjlbE6Z45lyPsuc0CqzJzTWAOyEVknvVZA9ppVw+edPbcsvOrZ1PSy59izZ/kL7
|
|
||||||
3P75wduPL3K5WioMh+dbDw0Oem86PL9z3z4o4/0165uaa1rn/6Qc5LwnNIXFqrVbMmi/b8m5
|
|
||||||
quyBh/WRE5vhD9hHi8msdAMpKzMVabX5pvwllsV40l2sK0PEaPL4Co0VpbRt9LRtHrTA2xZ4
|
|
||||||
1gL4QlFZoBmRb1ogZYGgBQYs0G6BJgsss4CZsfHNxuW+1/Bt9qIFsq+8LD03o8N/18n3wnPv
|
|
||||||
RRls3/6v69Pn3t7BITz4Xnn11aDl/bXN2WOvt39YOfcq58HbFt6C/eQVPPeapCKSl6ct5gvu
|
|
||||||
v5wvIy3KmRP3qpwDJ+x3NTW53KLo3tXQ2dkgut3s/y30Pzblq28Z1m38K2dN/9b/yzuXdJ7/
|
|
||||||
JXfhrbwqNf0FXJMloV6+bd5FvpJLueDS5zXjN8a3SLWKkHKumdTwS8gAR397Pkw6ES/Hpwd5
|
|
||||||
23DsQHgHPs2oU4NPJ0eUX9KfgR3wDLcaP8e4t/kh/pcqj+ohtSlvY97P895VZtWTRhoDi0SP
|
|
||||||
/bILgX/nf0p4xrVANOvbzqyfgJI7FZgj+WRMgXk8i04qsAplDiqwmpSQexQ4j+jIQwqcT64l
|
|
||||||
P1BgDX43dipwASmBNgUuhCj0KnARWcw9lf0vVx33ugIXkzV8gQKXkEX8Zuq9iv46f4L3KjAQ
|
|
||||||
QaVSYI6UqJYpME/WqhoVWIUyYQVWk8WqgwqcRyyqBxU4n3yoekaBNWSl+ocKXEAWq3+vwIXc
|
|
||||||
G+qPFbiIrNP8RoG1ZFdBiQIXkysLrlTgEtJU8HJ7ZDySilwbCgrBQCogjMbi+xOR8XBKWDm6
|
|
||||||
Smisb6gXOmKx8YmQ0BZLxGOJQCoSi9YVtl0s1ij0oYnOQKpW2BodreuOjITSskJ/KBEZ6wuN
|
|
||||||
75kIJLYkR0PRYCghOISLJS7Gd4YSSYo01tXX1zWc514sHEkKASGVCARDk4HEVUJs7EJHhERo
|
|
||||||
PJJMhRJIjESFwbr+OsETSIWiKSEQDQoDWcWesbHIaIgRR0OJVACFY6kwunrlnkQkGYyM0tmS
|
|
||||||
ddkIctLRnwrtDQnbA6lUKBmLtgaSOBd6NhCJxpK1wr5wZDQs7AskhWAoGRmPInNkv3ChjoDc
|
|
||||||
AMYSjcb2osm9oVr0eywRSoYj0XEhSUNWtIVUOJCiQU+GUonIaGBiYj/WbDKOWiNYpH2RVBgn
|
|
||||||
ngwlhR2hfUJfbDIQ/W5d2hXMzRgmVYhMxhOxvcxHR3I0EQpFcbJAMDASmYik0Fo4kAiMYsYw
|
|
||||||
bZHRJMsIJkKIB6IO155ELB5CT7/S0X1eEB1MZzMZm9iLM1PpaCgUpDOi23tDE6iEE0/EYlfR
|
|
||||||
eMZiCXQ0mAo7cjwfi0VTqBoTAsEgBo7Zio3umaR1wjSnMs4FRhMx5MUnAim0MpmsC6dS8fVO
|
|
||||||
5759++oCSmlGsTJ1aNn5RbzU/nhIqUeCWpmc6MbyR2np9rD60iD6t3YLPXHMjxudExSBWiHT
|
|
||||||
mg11DcoUmMZIPJWsS0Ym6mKJcWePu5u0kwgZx5HCcS0JkSARcAQQDyA0SmIkTvaTBJMKI1Ug
|
|
||||||
K5G6Cp+NpJ404BBIB0rFkD+B+gJpQziBWvQeYHZjJErq8FtE25daa0SoT/Gik2nXIrQV9UfR
|
|
||||||
QjfqjSA3165A+hklgvss1Rwne9CPAFK2kCRqhVAmyCQE4sDxZTa+jL+TQckspxH9qsdPHXp/
|
|
||||||
Kd0vsxxBWwLLdYpxqK+TzP+rkBZDvS/KiIByIVa/JHJCDAsyq9T2IEr0MykP06S5SLHZokxq
|
|
||||||
4BIz9uCMY6g/ymqZkRxltmlPpC3HEA4rWb0SM55gHgSZXia2JM782Rpcujv6mXd72ZzbGZ3i
|
|
||||||
ScZrRTypxJXO2QDzIoZUmot96AmdN8zgAMtnkGnTLosqmiPYd8IXziMougGlLlE2x17FS6pT
|
|
||||||
q+R7jN2TbN4oziEw/9JVvnBugeUpwLKervQkclNMdhTpE/jZr6yzScxKeq4RZSXtY+syrEQ8
|
|
||||||
yewKZAc+97GuiLG6RW1LWY3PZyXdN2NKpwpMN45wjEWRyaOD1YZGEmKeUijA1v4IakywudO+
|
|
||||||
hVl3BFhtQ0qtUyyCTL6CSqTU6zijOIiL9QVd8SElp1/BnaL7khbTGcztTVqTCeZvMsd2lHkb
|
|
||||||
zMaYzjaVmlBmSkc8wXakq7L1GWP9ls5okFlzfE7Ox1huUsqsMeZRED/piqd7K4a6e1g90usp
|
|
||||||
3c2pz2QuwPIbU/TibF9KKb5MsvURZh0YJ+vxbOlE7+injvVh7qoZVdZMneKz8+/Wo37FWQZz
|
|
||||||
10ci68sk+titrP5odtXtyVm/mUr04x7UzfaLuNI/biVzwkUW6Kq5eNdsYPvlhVGkuzGCeIr5
|
|
||||||
k2S5rGMxjCO/B2foZufo9DcHG/p0iWumwLNlBEIEIAzjpIxYwU92wDAZhC1kE0j4lJDXis82
|
|
||||||
xOmzDjaRKZTbhPTNiG9E+gbcPK14b8HRg+MIDhWOtEQ9Sjjx6VRwB+K1qPEC3oENSm1BKn1u
|
|
||||||
Q7wTnx3K0410Fz5dCr4VcXwSP+TjQbyF3Z8ClXQSzpyDF86BcA4OfAKeT2Dqg6MfcO/PrbI+
|
|
||||||
MvfUHNfz3vB7j7zH178HuvdAQ87qz3rO+s/Gzx4/m1eoexe05E9geOvMOuubm04P/n7TG4Pk
|
|
||||||
NEZ2uv605/TUafm0+jTwg2/wRqt+Vpitn43PTs2+OHtmdm5WM/WToz/hfvyk06p70vokZz3Z
|
|
||||||
c/LASd7/MOgetj7Mee73388dPQa6Y9ZjzmP8fffWWe/tsFjvvmuF9cxdc3dxpxZmT95VbHA/
|
|
||||||
CT3QTTZhDnec5Besj2ypgO0Ylg7vVhxOHD04YjiO4MDvPShuxeGEbmkdP/wtKLrDfEfNHdfd
|
|
||||||
cegOdfzWqVuP3spP3XL0Fu6RvU/t5ZKeVdZYtMYa7VhtrRJNg/kiP5iH0+Ds0taR6pVu/7Bk
|
|
||||||
HUahy4fqrUMdq6xlYumgGgNWoaCOt/ItfA8f44/wT/H5mj6PxdqL44xnzsNJngKtW9dj7XH2
|
|
||||||
8KcWzkihLhta2xbfNrWN3+peZe3sWGfVdVg7nB0vdLzZ8V5H3nAHPIB/7kfcT7l5yb3K6Zbc
|
|
||||||
Fpt7cad50ChWDBpAN6gXdYMcYKFFMujULeg4nW5Yd0DH60gL4aaMoIZTcHRmoL+mputU/kJf
|
|
||||||
l6zxXC7DQbm6n96l3iE576BMBocu984AfN13y+HDpHVJl9zY75X9S3xdchABiQJTCOiXzBhJ
|
|
||||||
qy+ZTNWwC2pqEN6Dd1KzpwaJu5NpKsnySU0SkrhHJZkS1FCBNA54r6E8JFA9QO3dSUJvlFmT
|
|
||||||
VqLaScUcU07fGGDa/T/LhW2oCmVuZHN0cmVhbQplbmRvYmoKCjYgMCBvYmoKNjI5MQplbmRv
|
|
||||||
YmoKCjcgMCBvYmoKPDwvVHlwZS9Gb250RGVzY3JpcHRvci9Gb250TmFtZS9CQUFBQUErTGli
|
|
||||||
ZXJhdGlvblNlcmlmCi9GbGFncyA0Ci9Gb250QkJveFstNTQzIC0zMDMgMTI3NyA5ODFdL0l0
|
|
||||||
YWxpY0FuZ2xlIDAKL0FzY2VudCA4OTEKL0Rlc2NlbnQgLTIxNgovQ2FwSGVpZ2h0IDk4MQov
|
|
||||||
U3RlbVYgODAKL0ZvbnRGaWxlMiA1IDAgUgo+PgplbmRvYmoKCjggMCBvYmoKPDwvTGVuZ3Ro
|
|
||||||
IDI5Mi9GaWx0ZXIvRmxhdGVEZWNvZGU+PgpzdHJlYW0KeJxdkctuwyAQRfd8Bct0EfmROA/J
|
|
||||||
spQmseRFH6rbD3BgnCLVGGGy8N+XmUlbqQvQmZl7BxiSY3NqrAnJqx9VC0H2xmoP03jzCuQF
|
|
||||||
rsaKLJfaqHCPaFdD50QSve08BRga249lKZK3WJuCn+XioMcLPIjkxWvwxl7l4uPYxri9OfcF
|
|
||||||
A9ggU1FVUkMf+zx17rkbICHXstGxbMK8jJY/wfvsQOYUZ3wVNWqYXKfAd/YKokzTSpZ1XQmw
|
|
||||||
+l8tK9hy6dVn56M0i9I0LdZV5Jx4s0NeMe+R18TbFXJBnKfIG9ZkyFvWUJ8d5wvkPTPlD8w1
|
|
||||||
8iMz9Tyyl/Qnzp+Qz8xn5JrPPdOj7rfH5+H8f8Ym1c37ODL6JJoVTslY+P1HNzp00foG7l+O
|
|
||||||
gwplbmRzdHJlYW0KZW5kb2JqCgo5IDAgb2JqCjw8L1R5cGUvRm9udC9TdWJ0eXBlL1RydWVU
|
|
||||||
eXBlL0Jhc2VGb250L0JBQUFBQStMaWJlcmF0aW9uU2VyaWYKL0ZpcnN0Q2hhciAwCi9MYXN0
|
|
||||||
Q2hhciAxNQovV2lkdGhzWzc3NyA2MTAgNTAwIDI3NyAzODkgMjUwIDQ0MyAyNzcgNDQzIDUw
|
|
||||||
MCA1MDAgNDQzIDUwMCA3NzcgNTAwIDI1MApdCi9Gb250RGVzY3JpcHRvciA3IDAgUgovVG9V
|
|
||||||
bmljb2RlIDggMCBSCj4+CmVuZG9iagoKMTAgMCBvYmoKPDwvRjEgOSAwIFIKPj4KZW5kb2Jq
|
|
||||||
CgoxMSAwIG9iago8PC9Gb250IDEwIDAgUgovUHJvY1NldFsvUERGL1RleHRdCj4+CmVuZG9i
|
|
||||||
agoKMSAwIG9iago8PC9UeXBlL1BhZ2UvUGFyZW50IDQgMCBSL1Jlc291cmNlcyAxMSAwIFIv
|
|
||||||
TWVkaWFCb3hbMCAwIDU5NSA4NDJdL0dyb3VwPDwvUy9UcmFuc3BhcmVuY3kvQ1MvRGV2aWNl
|
|
||||||
UkdCL0kgdHJ1ZT4+L0NvbnRlbnRzIDIgMCBSPj4KZW5kb2JqCgo0IDAgb2JqCjw8L1R5cGUv
|
|
||||||
UGFnZXMKL1Jlc291cmNlcyAxMSAwIFIKL01lZGlhQm94WyAwIDAgNTk1IDg0MiBdCi9LaWRz
|
|
||||||
WyAxIDAgUiBdCi9Db3VudCAxPj4KZW5kb2JqCgoxMiAwIG9iago8PC9UeXBlL0NhdGFsb2cv
|
|
||||||
UGFnZXMgNCAwIFIKL09wZW5BY3Rpb25bMSAwIFIgL1hZWiBudWxsIG51bGwgMF0KL0xhbmco
|
|
||||||
ZW4tR0IpCj4+CmVuZG9iagoKMTMgMCBvYmoKPDwvQ3JlYXRvcjxGRUZGMDA1NzAwNzIwMDY5
|
|
||||||
MDA3NDAwNjUwMDcyPgovUHJvZHVjZXI8RkVGRjAwNEMwMDY5MDA2MjAwNzIwMDY1MDA0RjAw
|
|
||||||
NjYwMDY2MDA2OTAwNjMwMDY1MDAyMDAwMzUwMDJFMDAzMD4KL0NyZWF0aW9uRGF0ZShEOjIw
|
|
||||||
MTYwMjA0MjIwMDAyWicpPj4KZW5kb2JqCgp4cmVmCjAgMTQKMDAwMDAwMDAwMCA2NTUzNSBm
|
|
||||||
IAowMDAwMDA3NTA5IDAwMDAwIG4gCjAwMDAwMDAwMTkgMDAwMDAgbiAKMDAwMDAwMDIyOSAw
|
|
||||||
MDAwMCBuIAowMDAwMDA3NjUyIDAwMDAwIG4gCjAwMDAwMDAyNDkgMDAwMDAgbiAKMDAwMDAw
|
|
||||||
NjYyNSAwMDAwMCBuIAowMDAwMDA2NjQ2IDAwMDAwIG4gCjAwMDAwMDY4NDEgMDAwMDAgbiAK
|
|
||||||
MDAwMDAwNzIwMiAwMDAwMCBuIAowMDAwMDA3NDIyIDAwMDAwIG4gCjAwMDAwMDc0NTQgMDAw
|
|
||||||
MDAgbiAKMDAwMDAwNzc1MSAwMDAwMCBuIAowMDAwMDA3ODQ4IDAwMDAwIG4gCnRyYWlsZXIK
|
|
||||||
PDwvU2l6ZSAxNC9Sb290IDEyIDAgUgovSW5mbyAxMyAwIFIKL0lEIFsgPDRFN0ZCMEZCMjA4
|
|
||||||
ODBCNURBQkIzQTNEOTQxNDlBRTQ3Pgo8NEU3RkIwRkIyMDg4MEI1REFCQjNBM0Q5NDE0OUFF
|
|
||||||
NDc+IF0KL0RvY0NoZWNrc3VtIC8yQTY0RDMzNzRFQTVEODMwNTRDNEI2RDFEMUY4QzU1RQo+
|
|
||||||
PgpzdGFydHhyZWYKODAxOAolJUVPRgo=
|
|
||||||
--------------090701020702030809070008--
|
|
218
src/documents/tests/test_api.py
Normal file
218
src/documents/tests/test_api.py
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from unittest import mock
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
from django.contrib.auth.models import User
|
||||||
|
from django.test import override_settings
|
||||||
|
from rest_framework.test import APITestCase, APIClient
|
||||||
|
|
||||||
|
from documents.models import Document, Correspondent, DocumentType, Tag
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentApiTest(APITestCase):
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.scratch_dir = tempfile.mkdtemp()
|
||||||
|
self.media_dir = tempfile.mkdtemp()
|
||||||
|
self.originals_dir = os.path.join(self.media_dir, "documents", "originals")
|
||||||
|
self.thumbnail_dir = os.path.join(self.media_dir, "documents", "thumbnails")
|
||||||
|
|
||||||
|
os.makedirs(self.originals_dir, exist_ok=True)
|
||||||
|
os.makedirs(self.thumbnail_dir, exist_ok=True)
|
||||||
|
|
||||||
|
override_settings(
|
||||||
|
SCRATCH_DIR=self.scratch_dir,
|
||||||
|
MEDIA_ROOT=self.media_dir,
|
||||||
|
ORIGINALS_DIR=self.originals_dir,
|
||||||
|
THUMBNAIL_DIR=self.thumbnail_dir
|
||||||
|
).enable()
|
||||||
|
|
||||||
|
user = User.objects.create_superuser(username="temp_admin")
|
||||||
|
self.client.force_login(user=user)
|
||||||
|
|
||||||
|
def tearDown(self):
|
||||||
|
shutil.rmtree(self.scratch_dir, ignore_errors=True)
|
||||||
|
shutil.rmtree(self.media_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
def testDocuments(self):
|
||||||
|
|
||||||
|
response = self.client.get("/api/documents/").data
|
||||||
|
|
||||||
|
self.assertEqual(response['count'], 0)
|
||||||
|
|
||||||
|
c = Correspondent.objects.create(name="c", pk=41)
|
||||||
|
dt = DocumentType.objects.create(name="dt", pk=63)
|
||||||
|
tag = Tag.objects.create(name="t", pk=85)
|
||||||
|
|
||||||
|
doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123")
|
||||||
|
|
||||||
|
doc.tags.add(tag)
|
||||||
|
|
||||||
|
response = self.client.get("/api/documents/", format='json')
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
self.assertEqual(response.data['count'], 1)
|
||||||
|
|
||||||
|
returned_doc = response.data['results'][0]
|
||||||
|
self.assertEqual(returned_doc['id'], doc.id)
|
||||||
|
self.assertEqual(returned_doc['title'], doc.title)
|
||||||
|
self.assertEqual(returned_doc['correspondent']['name'], c.name)
|
||||||
|
self.assertEqual(returned_doc['document_type']['name'], dt.name)
|
||||||
|
self.assertEqual(returned_doc['correspondent']['id'], c.id)
|
||||||
|
self.assertEqual(returned_doc['document_type']['id'], dt.id)
|
||||||
|
self.assertEqual(returned_doc['correspondent']['id'], returned_doc['correspondent_id'])
|
||||||
|
self.assertEqual(returned_doc['document_type']['id'], returned_doc['document_type_id'])
|
||||||
|
self.assertEqual(len(returned_doc['tags']), 1)
|
||||||
|
self.assertEqual(returned_doc['tags'][0]['name'], tag.name)
|
||||||
|
self.assertEqual(returned_doc['tags'][0]['id'], tag.id)
|
||||||
|
self.assertListEqual(returned_doc['tags_id'], [tag.id])
|
||||||
|
|
||||||
|
c2 = Correspondent.objects.create(name="c2")
|
||||||
|
|
||||||
|
returned_doc['correspondent_id'] = c2.pk
|
||||||
|
returned_doc['title'] = "the new title"
|
||||||
|
|
||||||
|
response = self.client.put('/api/documents/{}/'.format(doc.pk), returned_doc, format='json')
|
||||||
|
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
|
||||||
|
doc_after_save = Document.objects.get(id=doc.id)
|
||||||
|
|
||||||
|
self.assertEqual(doc_after_save.correspondent, c2)
|
||||||
|
self.assertEqual(doc_after_save.title, "the new title")
|
||||||
|
|
||||||
|
self.client.delete("/api/documents/{}/".format(doc_after_save.pk))
|
||||||
|
|
||||||
|
self.assertEqual(len(Document.objects.all()), 0)
|
||||||
|
|
||||||
|
def test_document_actions(self):
|
||||||
|
|
||||||
|
_, filename = tempfile.mkstemp(dir=self.originals_dir)
|
||||||
|
|
||||||
|
content = b"This is a test"
|
||||||
|
content_thumbnail = b"thumbnail content"
|
||||||
|
|
||||||
|
with open(filename, "wb") as f:
|
||||||
|
f.write(content)
|
||||||
|
|
||||||
|
doc = Document.objects.create(title="none", filename=os.path.basename(filename), file_type="pdf")
|
||||||
|
|
||||||
|
with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
|
||||||
|
f.write(content_thumbnail)
|
||||||
|
|
||||||
|
response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
|
||||||
|
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
self.assertEqual(response.content, content)
|
||||||
|
|
||||||
|
response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))
|
||||||
|
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
self.assertEqual(response.content, content)
|
||||||
|
|
||||||
|
response = self.client.get('/api/documents/{}/thumb/'.format(doc.pk))
|
||||||
|
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
self.assertEqual(response.content, content_thumbnail)
|
||||||
|
|
||||||
|
def test_document_actions_not_existing_file(self):
|
||||||
|
|
||||||
|
doc = Document.objects.create(title="none", filename=os.path.basename("asd"), file_type="pdf")
|
||||||
|
|
||||||
|
response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
|
||||||
|
self.assertEqual(response.status_code, 404)
|
||||||
|
|
||||||
|
response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))
|
||||||
|
self.assertEqual(response.status_code, 404)
|
||||||
|
|
||||||
|
response = self.client.get('/api/documents/{}/thumb/'.format(doc.pk))
|
||||||
|
self.assertEqual(response.status_code, 404)
|
||||||
|
|
||||||
|
def test_document_filters(self):
|
||||||
|
|
||||||
|
doc1 = Document.objects.create(title="none1", checksum="A")
|
||||||
|
doc2 = Document.objects.create(title="none2", checksum="B")
|
||||||
|
doc3 = Document.objects.create(title="none3", checksum="C")
|
||||||
|
|
||||||
|
tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
|
||||||
|
tag_2 = Tag.objects.create(name="t2")
|
||||||
|
tag_3 = Tag.objects.create(name="t3")
|
||||||
|
|
||||||
|
doc1.tags.add(tag_inbox)
|
||||||
|
doc2.tags.add(tag_2)
|
||||||
|
doc3.tags.add(tag_2)
|
||||||
|
doc3.tags.add(tag_3)
|
||||||
|
|
||||||
|
response = self.client.get("/api/documents/?is_in_inbox=true")
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
results = response.data['results']
|
||||||
|
self.assertEqual(len(results), 1)
|
||||||
|
self.assertEqual(results[0]['id'], doc1.id)
|
||||||
|
|
||||||
|
response = self.client.get("/api/documents/?is_in_inbox=false")
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
results = response.data['results']
|
||||||
|
self.assertEqual(len(results), 2)
|
||||||
|
self.assertEqual(results[0]['id'], doc2.id)
|
||||||
|
self.assertEqual(results[1]['id'], doc3.id)
|
||||||
|
|
||||||
|
response = self.client.get("/api/documents/?tags__id__in={},{}".format(tag_inbox.id, tag_3.id))
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
results = response.data['results']
|
||||||
|
self.assertEqual(len(results), 2)
|
||||||
|
self.assertEqual(results[0]['id'], doc1.id)
|
||||||
|
self.assertEqual(results[1]['id'], doc3.id)
|
||||||
|
|
||||||
|
response = self.client.get("/api/documents/?tags__id__all={},{}".format(tag_2.id, tag_3.id))
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
results = response.data['results']
|
||||||
|
self.assertEqual(len(results), 1)
|
||||||
|
self.assertEqual(results[0]['id'], doc3.id)
|
||||||
|
|
||||||
|
response = self.client.get("/api/documents/?tags__id__all={},{}".format(tag_inbox.id, tag_3.id))
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
results = response.data['results']
|
||||||
|
self.assertEqual(len(results), 0)
|
||||||
|
|
||||||
|
response = self.client.get("/api/documents/?tags__id__all={}a{}".format(tag_inbox.id, tag_3.id))
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
results = response.data['results']
|
||||||
|
self.assertEqual(len(results), 3)
|
||||||
|
|
||||||
|
@mock.patch("documents.index.autocomplete")
def test_search_autocomplete(self, m):
    """Check status codes and result counts of the autocomplete endpoint.

    The index lookup is replaced by a stub that echoes the term ``limit``
    times, so the length of the response reveals the limit that was used.
    """
    def echo_term(ix, term, limit):
        return [term] * limit

    m.side_effect = echo_term

    endpoint = "/api/search/autocomplete/"
    # (query string, expected HTTP status, expected result count or None
    # when the request is rejected)
    scenarios = (
        ("?term=test", 200, 10),
        ("?term=test&limit=20", 200, 20),
        ("?term=test&limit=-1", 400, None),
        ("", 400, None),
        ("?term=", 200, 10),
    )

    for query, expected_status, expected_count in scenarios:
        response = self.client.get(endpoint + query)
        self.assertEqual(response.status_code, expected_status)
        if expected_count is not None:
            self.assertEqual(len(response.data), expected_count)
|
||||||
|
|
||||||
|
def test_statistics(self):
    """Three documents, one of them carrying an inbox tag, must be
    reported as 3 total / 1 in the inbox by the statistics endpoint."""
    created = [
        Document.objects.create(title=title, checksum=checksum)
        for title, checksum in (("none1", "A"), ("none2", "B"), ("none3", "C"))
    ]

    inbox_tag = Tag.objects.create(name="t1", is_inbox_tag=True)
    # Only the first document is placed in the inbox.
    created[0].tags.add(inbox_tag)

    response = self.client.get("/api/statistics/")
    self.assertEqual(response.status_code, 200)

    stats = response.data
    self.assertEqual(stats['documents_total'], 3)
    self.assertEqual(stats['documents_inbox'], 1)
|
85
src/documents/tests/test_classifier.py
Normal file
85
src/documents/tests/test_classifier.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
import tempfile
|
||||||
|
|
||||||
|
from django.test import TestCase, override_settings
|
||||||
|
|
||||||
|
from documents.classifier import DocumentClassifier
|
||||||
|
from documents.models import Correspondent, Document, Tag, DocumentType
|
||||||
|
|
||||||
|
|
||||||
|
class TestClassifier(TestCase):
    """Tests for training, prediction and persistence of DocumentClassifier."""

    def setUp(self):
        # Each test works on its own, freshly constructed classifier.
        self.classifier = DocumentClassifier()

    def generate_test_data(self):
        """Create the correspondents, tags, document type and documents
        that the training and prediction tests operate on."""
        self.c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        self.c2 = Correspondent.objects.create(name="c2")
        self.t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        self.t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_ANY, pk=34, is_inbox_tag=True)
        self.t3 = Tag.objects.create(name="t3", matching_algorithm=Tag.MATCH_AUTO, pk=45)
        self.dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)

        self.doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=self.c1, checksum="A", document_type=self.dt)
        self.doc2 = Document.objects.create(title="doc1", content="this is another document, but from c2", correspondent=self.c2, checksum="B")
        self.doc_inbox = Document.objects.create(title="doc235", content="aa", checksum="C")

        self.doc1.tags.add(self.t1)
        self.doc2.tags.add(self.t1)
        self.doc2.tags.add(self.t3)
        self.doc_inbox.tags.add(self.t2)

    def testNoTrainingData(self):
        """Training without any documents must raise a ValueError."""
        with self.assertRaises(ValueError) as raised:
            self.classifier.train()
        self.assertEqual(str(raised.exception), "No training data available.")

    def testEmpty(self):
        """A single document without any assignments leaves all three
        sub-classifiers unset and makes every prediction empty."""
        Document.objects.create(title="WOW", checksum="3457", content="ASD")
        self.classifier.train()
        self.assertIsNone(self.classifier.document_type_classifier)
        self.assertIsNone(self.classifier.tags_classifier)
        self.assertIsNone(self.classifier.correspondent_classifier)

        self.assertListEqual(self.classifier.predict_tags(""), [])
        self.assertIsNone(self.classifier.predict_document_type(""))
        self.assertIsNone(self.classifier.predict_correspondent(""))

    def testTrain(self):
        """After training, the learned classes are checked against the
        primary keys of the auto-matching correspondent and tags."""
        self.generate_test_data()
        self.classifier.train()
        self.assertListEqual(list(self.classifier.correspondent_classifier.classes_), [-1, self.c1.pk])
        self.assertListEqual(list(self.classifier.tags_binarizer.classes_), [self.t1.pk, self.t3.pk])

    def testPredict(self):
        """Predictions on the training documents reproduce their labels."""
        self.generate_test_data()
        self.classifier.train()
        self.assertEqual(self.classifier.predict_correspondent(self.doc1.content), self.c1.pk)
        self.assertIsNone(self.classifier.predict_correspondent(self.doc2.content))
        self.assertTupleEqual(self.classifier.predict_tags(self.doc1.content), (self.t1.pk,))
        self.assertTupleEqual(self.classifier.predict_tags(self.doc2.content), (self.t1.pk, self.t3.pk))
        self.assertEqual(self.classifier.predict_document_type(self.doc1.content), self.dt.pk)
        self.assertIsNone(self.classifier.predict_document_type(self.doc2.content))

    def testDatasetHashing(self):
        """train() returns a truthy value the first time and a falsy one
        when called again on unchanged data."""
        self.generate_test_data()

        self.assertTrue(self.classifier.train())
        self.assertFalse(self.classifier.train())

    def testSaveClassifier(self):
        """A saved and reloaded classifier must not retrain on the same data."""
        self.generate_test_data()

        # The temporary directory is created per test run and removed
        # afterwards, instead of being created once at import time and
        # left behind on disk.
        with tempfile.TemporaryDirectory() as data_dir:
            with override_settings(DATA_DIR=data_dir):
                self.classifier.train()
                self.classifier.save_classifier()

                reloaded = DocumentClassifier()
                reloaded.reload()
                self.assertFalse(reloaded.train())
|
@@ -1,8 +1,17 @@
|
|||||||
|
import os
|
||||||
import re
|
import re
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from unittest import mock
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
from django.test import TestCase
|
from django.conf import settings
|
||||||
|
from django.db import DatabaseError
|
||||||
|
from django.test import TestCase, override_settings
|
||||||
|
|
||||||
from ..models import FileInfo, Tag
|
from ..consumer import Consumer, ConsumerError
|
||||||
|
from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
|
||||||
|
from ..parsers import DocumentParser, ParseError
|
||||||
|
|
||||||
|
|
||||||
class TestAttributes(TestCase):
|
class TestAttributes(TestCase):
|
||||||
@@ -394,3 +403,254 @@ class TestFieldPermutations(TestCase):
|
|||||||
self.assertEqual(info.created.year, 2019)
|
self.assertEqual(info.created.year, 2019)
|
||||||
self.assertEqual(info.created.month, 9)
|
self.assertEqual(info.created.month, 9)
|
||||||
self.assertEqual(info.created.day, 8)
|
self.assertEqual(info.created.day, 8)
|
||||||
|
|
||||||
|
|
||||||
|
class DummyParser(DocumentParser):
    """Stub parser that always succeeds and returns fixed text."""

    def get_thumbnail(self):
        # not important during tests
        raise NotImplementedError()

    def __init__(self, path, logging_group, scratch_dir):
        """Create the parser and an empty placeholder thumbnail file
        inside ``scratch_dir``."""
        super(DummyParser, self).__init__(path, logging_group)
        # mkstemp returns an already-open OS-level descriptor; only the
        # path is needed, so close the descriptor right away instead of
        # leaking it for every parser instance.
        thumb_fd, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
        os.close(thumb_fd)

    def get_optimised_thumbnail(self):
        return self.fake_thumb

    def get_text(self):
        return "The Text"
|
||||||
|
|
||||||
|
|
||||||
|
class FaultyParser(DocumentParser):
    """Stub parser whose text extraction always fails with ParseError."""

    def get_thumbnail(self):
        # not important during tests
        raise NotImplementedError()

    def __init__(self, path, logging_group, scratch_dir):
        """Create the parser and an empty placeholder thumbnail file
        inside ``scratch_dir``."""
        super(FaultyParser, self).__init__(path, logging_group)
        # mkstemp returns an already-open OS-level descriptor; only the
        # path is needed, so close the descriptor right away instead of
        # leaking it for every parser instance.
        thumb_fd, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
        os.close(thumb_fd)

    def get_optimised_thumbnail(self):
        return self.fake_thumb

    def get_text(self):
        raise ParseError("Does not compute.")
|
||||||
|
|
||||||
|
|
||||||
|
class TestConsumer(TestCase):
    """Tests for Consumer.try_consume_file, run against stub parsers and
    temporary scratch, media and consumption directories."""

    def make_dummy_parser(self, path, logging_group):
        return DummyParser(path, logging_group, self.scratch_dir)

    def make_faulty_parser(self, path, logging_group):
        return FaultyParser(path, logging_group, self.scratch_dir)

    def setUp(self):
        self.scratch_dir = tempfile.mkdtemp()
        self.media_dir = tempfile.mkdtemp()
        self.consumption_dir = tempfile.mkdtemp()

        settings_override = override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=os.path.join(self.media_dir, "documents", "originals"),
            THUMBNAIL_DIR=os.path.join(self.media_dir, "documents", "thumbnails"),
            CONSUMPTION_DIR=self.consumption_dir
        )
        settings_override.enable()
        # Undo the override after each test; otherwise the temporary
        # paths stay active for every test that runs afterwards.
        self.addCleanup(settings_override.disable)

        # Make parser discovery return the dummy parser for every file.
        patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
        m = patcher.start()
        self.addCleanup(patcher.stop)
        m.return_value = [(None, {
            "parser": self.make_dummy_parser,
            "test": lambda _: True,
            "weight": 0
        })]

        self.consumer = Consumer()

    def tearDown(self):
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)
        shutil.rmtree(self.consumption_dir, ignore_errors=True)

    def get_test_file(self):
        """Return the path of a new, empty ``.pdf`` file in the scratch dir."""
        fd, path = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir)
        # Only the path is handed out; close the descriptor mkstemp opened
        # so it is not leaked on every call.
        os.close(fd)
        return path

    def testNormalOperation(self):
        """A consumed file yields a document with the parser's text, a
        title taken from the file name, stored original and thumbnail
        files, and the source file is gone afterwards."""
        filename = self.get_test_file()
        document = self.consumer.try_consume_file(filename)

        self.assertEqual(document.content, "The Text")
        self.assertEqual(document.title, os.path.splitext(os.path.basename(filename))[0])
        self.assertIsNone(document.correspondent)
        self.assertIsNone(document.document_type)
        self.assertEqual(document.filename, "0000001.pdf")

        self.assertTrue(os.path.isfile(document.source_path))
        self.assertTrue(os.path.isfile(document.thumbnail_path))

        self.assertFalse(os.path.isfile(filename))

    def testOverrideFilename(self):
        """Correspondent and title are parsed from the overriding file name."""
        filename = self.get_test_file()
        overrideFilename = "My Bank - Statement for November.pdf"

        document = self.consumer.try_consume_file(filename, override_filename=overrideFilename)

        self.assertEqual(document.correspondent.name, "My Bank")
        self.assertEqual(document.title, "Statement for November")

    def testOverrideTitle(self):
        document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title")
        self.assertEqual(document.title, "Override Title")

    def testOverrideCorrespondent(self):
        c = Correspondent.objects.create(name="test")

        document = self.consumer.try_consume_file(self.get_test_file(), override_correspondent_id=c.pk)
        self.assertEqual(document.correspondent.id, c.id)

    def testOverrideDocumentType(self):
        dt = DocumentType.objects.create(name="test")

        document = self.consumer.try_consume_file(self.get_test_file(), override_document_type_id=dt.pk)
        self.assertEqual(document.document_type.id, dt.id)

    def testOverrideTags(self):
        """Only the tags passed via override_tag_ids end up on the document."""
        t1 = Tag.objects.create(name="t1")
        t2 = Tag.objects.create(name="t2")
        t3 = Tag.objects.create(name="t3")
        document = self.consumer.try_consume_file(self.get_test_file(), override_tag_ids=[t1.id, t3.id])

        self.assertIn(t1, document.tags.all())
        self.assertNotIn(t2, document.tags.all())
        self.assertIn(t3, document.tags.all())

    def testNotAFile(self):
        try:
            self.consumer.try_consume_file("non-existing-file")
        except ConsumerError as e:
            self.assertTrue(str(e).endswith('It is not a file'))
            return

        self.fail("Should throw exception")

    @override_settings(CONSUMPTION_DIR=None)
    def testConsumptionDirUnset(self):
        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "The CONSUMPTION_DIR settings variable does not appear to be set.")
            return

        self.fail("Should throw exception")

    @override_settings(CONSUMPTION_DIR="asd")
    def testNoConsumptionDir(self):
        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "Consumption directory asd does not exist")
            return

        self.fail("Should throw exception")

    def testDuplicates(self):
        """Consuming a second file with the same (empty) content is rejected."""
        self.consumer.try_consume_file(self.get_test_file())

        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertTrue(str(e).endswith("It is a duplicate."))
            return

        self.fail("Should throw exception")

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def testNoParsers(self, m):
        m.return_value = []

        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            # NOTE(review): the spelling presumably mirrors the message
            # raised by the consumer - confirm before correcting it here.
            self.assertTrue(str(e).startswith("No parsers abvailable"))
            return

        self.fail("Should throw exception")

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def testFaultyParser(self, m):
        """A ParseError from the parser surfaces as a ConsumerError with
        the same message."""
        m.return_value = [(None, {
            "parser": self.make_faulty_parser,
            "test": lambda _: True,
            "weight": 0
        })]

        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "Does not compute.")
            return

        self.fail("Should throw exception.")

    @mock.patch("documents.consumer.Consumer._write")
    def testPostSaveError(self, m):
        """If writing the files fails, the source file is kept and no
        document remains in the database."""
        filename = self.get_test_file()
        m.side_effect = OSError("NO.")
        try:
            self.consumer.try_consume_file(filename)
        except ConsumerError as e:
            self.assertEqual(str(e), "NO.")
        else:
            self.fail("Should raise exception")

        # file not deleted
        self.assertTrue(os.path.isfile(filename))

        # Database empty
        self.assertEqual(len(Document.objects.all()), 0)

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def testFilenameHandling(self):
        """With a filename format configured, the stored file name is
        built from correspondent and title."""
        filename = self.get_test_file()

        document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")

        self.assertEqual(document.title, "new docs")
        self.assertEqual(document.correspondent.name, "Bank")
        self.assertEqual(document.filename, "bank/new-docs-0000001.pdf")

    @mock.patch("documents.consumer.DocumentClassifier")
    def testClassifyDocument(self, m):
        """Predictions of the (mocked) classifier are applied to the
        consumed document."""
        correspondent = Correspondent.objects.create(name="test")
        dtype = DocumentType.objects.create(name="test")
        t1 = Tag.objects.create(name="t1")
        t2 = Tag.objects.create(name="t2")

        m.return_value = MagicMock()
        m.return_value.predict_correspondent.return_value = correspondent.pk
        m.return_value.predict_document_type.return_value = dtype.pk
        m.return_value.predict_tags.return_value = [t1.pk]

        document = self.consumer.try_consume_file(self.get_test_file())

        self.assertEqual(document.correspondent, correspondent)
        self.assertEqual(document.document_type, dtype)
        self.assertIn(t1, document.tags.all())
        self.assertNotIn(t2, document.tags.all())
|
||||||
|
@@ -1,90 +0,0 @@
|
|||||||
import base64
|
|
||||||
import os
|
|
||||||
from hashlib import md5
|
|
||||||
from unittest import mock
|
|
||||||
|
|
||||||
import magic
|
|
||||||
from django.conf import settings
|
|
||||||
from django.test import TestCase
|
|
||||||
|
|
||||||
from ..mail import Message, Attachment
|
|
||||||
|
|
||||||
|
|
||||||
class TestMessage(TestCase):
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
|
|
||||||
TestCase.__init__(self, *args, **kwargs)
|
|
||||||
self.sample = os.path.join(
|
|
||||||
settings.BASE_DIR,
|
|
||||||
"documents",
|
|
||||||
"tests",
|
|
||||||
"samples",
|
|
||||||
"mail.txt"
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_init(self):
|
|
||||||
|
|
||||||
with open(self.sample, "rb") as f:
|
|
||||||
|
|
||||||
with mock.patch("logging.StreamHandler.emit") as __:
|
|
||||||
message = Message(f.read())
|
|
||||||
|
|
||||||
self.assertTrue(message)
|
|
||||||
self.assertEqual(message.subject, "Test 0")
|
|
||||||
|
|
||||||
data = message.attachment.read()
|
|
||||||
|
|
||||||
self.assertEqual(
|
|
||||||
md5(data).hexdigest(), "7c89655f9e9eb7dd8cde8568e8115d59")
|
|
||||||
|
|
||||||
self.assertEqual(
|
|
||||||
message.attachment.content_type, "application/pdf")
|
|
||||||
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
|
|
||||||
self.assertEqual(m.id_buffer(data), "application/pdf")
|
|
||||||
|
|
||||||
|
|
||||||
class TestInlineMessage(TestCase):
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
|
|
||||||
TestCase.__init__(self, *args, **kwargs)
|
|
||||||
self.sample = os.path.join(
|
|
||||||
settings.BASE_DIR,
|
|
||||||
"documents",
|
|
||||||
"tests",
|
|
||||||
"samples",
|
|
||||||
"inline_mail.txt"
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_init(self):
|
|
||||||
|
|
||||||
with open(self.sample, "rb") as f:
|
|
||||||
|
|
||||||
with mock.patch("logging.StreamHandler.emit") as __:
|
|
||||||
message = Message(f.read())
|
|
||||||
|
|
||||||
self.assertTrue(message)
|
|
||||||
self.assertEqual(message.subject, "Paperless Inline Image")
|
|
||||||
|
|
||||||
data = message.attachment.read()
|
|
||||||
|
|
||||||
self.assertEqual(
|
|
||||||
md5(data).hexdigest(), "30c00a7b42913e65f7fdb0be40b9eef3")
|
|
||||||
|
|
||||||
self.assertEqual(
|
|
||||||
message.attachment.content_type, "image/png")
|
|
||||||
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
|
|
||||||
self.assertEqual(m.id_buffer(data), "image/png")
|
|
||||||
|
|
||||||
|
|
||||||
class TestAttachment(TestCase):
|
|
||||||
|
|
||||||
def test_init(self):
|
|
||||||
data = base64.encodebytes(b"0")
|
|
||||||
self.assertEqual(Attachment(data, "application/pdf").suffix, "pdf")
|
|
||||||
self.assertEqual(Attachment(data, "image/png").suffix, "png")
|
|
||||||
self.assertEqual(Attachment(data, "image/jpeg").suffix, "jpeg")
|
|
||||||
self.assertEqual(Attachment(data, "image/gif").suffix, "gif")
|
|
||||||
self.assertEqual(Attachment(data, "image/tiff").suffix, "tiff")
|
|
||||||
self.assertEqual(Attachment(data, "image/png").read(), data)
|
|
@@ -14,7 +14,7 @@ class TestParserDiscovery(TestCase):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
m.return_value = (
|
m.return_value = (
|
||||||
(None, lambda _: {"weight": 0, "parser": DummyParser}),
|
(None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
@@ -32,8 +32,8 @@ class TestParserDiscovery(TestCase):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
m.return_value = (
|
m.return_value = (
|
||||||
(None, lambda _: {"weight": 0, "parser": DummyParser1}),
|
(None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
|
||||||
(None, lambda _: {"weight": 1, "parser": DummyParser2}),
|
(None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
@@ -43,7 +43,7 @@ class TestParserDiscovery(TestCase):
|
|||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||||
def test__get_parser_class_0_parsers(self, m, *args):
|
def test__get_parser_class_0_parsers(self, m, *args):
|
||||||
m.return_value = ((None, lambda _: None),)
|
m.return_value = []
|
||||||
with TemporaryDirectory() as tmpdir:
|
with TemporaryDirectory() as tmpdir:
|
||||||
self.assertIsNone(
|
self.assertIsNone(
|
||||||
get_parser_class("doc.pdf")
|
get_parser_class("doc.pdf")
|
||||||
|
@@ -52,7 +52,7 @@ class CorrespondentViewSet(ModelViewSet):
|
|||||||
pagination_class = StandardPagination
|
pagination_class = StandardPagination
|
||||||
permission_classes = (IsAuthenticated,)
|
permission_classes = (IsAuthenticated,)
|
||||||
filter_backends = (DjangoFilterBackend, OrderingFilter)
|
filter_backends = (DjangoFilterBackend, OrderingFilter)
|
||||||
filter_class = CorrespondentFilterSet
|
filterset_class = CorrespondentFilterSet
|
||||||
ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")
|
ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")
|
||||||
|
|
||||||
|
|
||||||
@@ -63,7 +63,7 @@ class TagViewSet(ModelViewSet):
|
|||||||
pagination_class = StandardPagination
|
pagination_class = StandardPagination
|
||||||
permission_classes = (IsAuthenticated,)
|
permission_classes = (IsAuthenticated,)
|
||||||
filter_backends = (DjangoFilterBackend, OrderingFilter)
|
filter_backends = (DjangoFilterBackend, OrderingFilter)
|
||||||
filter_class = TagFilterSet
|
filterset_class = TagFilterSet
|
||||||
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
|
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
|
||||||
|
|
||||||
|
|
||||||
@@ -74,7 +74,7 @@ class DocumentTypeViewSet(ModelViewSet):
|
|||||||
pagination_class = StandardPagination
|
pagination_class = StandardPagination
|
||||||
permission_classes = (IsAuthenticated,)
|
permission_classes = (IsAuthenticated,)
|
||||||
filter_backends = (DjangoFilterBackend, OrderingFilter)
|
filter_backends = (DjangoFilterBackend, OrderingFilter)
|
||||||
filter_class = DocumentTypeFilterSet
|
filterset_class = DocumentTypeFilterSet
|
||||||
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
|
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
|
||||||
|
|
||||||
|
|
||||||
@@ -89,7 +89,7 @@ class DocumentViewSet(RetrieveModelMixin,
|
|||||||
pagination_class = StandardPagination
|
pagination_class = StandardPagination
|
||||||
permission_classes = (IsAuthenticated,)
|
permission_classes = (IsAuthenticated,)
|
||||||
filter_backends = (DjangoFilterBackend, SearchFilter, OrderingFilter)
|
filter_backends = (DjangoFilterBackend, SearchFilter, OrderingFilter)
|
||||||
filter_class = DocumentFilterSet
|
filterset_class = DocumentFilterSet
|
||||||
search_fields = ("title", "correspondent__name", "content")
|
search_fields = ("title", "correspondent__name", "content")
|
||||||
ordering_fields = (
|
ordering_fields = (
|
||||||
"id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
|
"id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
|
||||||
@@ -170,7 +170,7 @@ class LogViewSet(ReadOnlyModelViewSet):
|
|||||||
pagination_class = StandardPagination
|
pagination_class = StandardPagination
|
||||||
permission_classes = (IsAuthenticated,)
|
permission_classes = (IsAuthenticated,)
|
||||||
filter_backends = (DjangoFilterBackend, OrderingFilter)
|
filter_backends = (DjangoFilterBackend, OrderingFilter)
|
||||||
filter_class = LogFilterSet
|
filterset_class = LogFilterSet
|
||||||
ordering_fields = ("created",)
|
ordering_fields = ("created",)
|
||||||
|
|
||||||
|
|
||||||
@@ -223,17 +223,16 @@ class SearchAutoCompleteView(APIView):
|
|||||||
if 'term' in request.query_params:
|
if 'term' in request.query_params:
|
||||||
term = request.query_params['term']
|
term = request.query_params['term']
|
||||||
else:
|
else:
|
||||||
term = None
|
return HttpResponseBadRequest("Term required")
|
||||||
|
|
||||||
if 'limit' in request.query_params:
|
if 'limit' in request.query_params:
|
||||||
limit = int(request.query_params['limit'])
|
limit = int(request.query_params['limit'])
|
||||||
|
if limit <= 0:
|
||||||
|
return HttpResponseBadRequest("Invalid limit")
|
||||||
else:
|
else:
|
||||||
limit = 10
|
limit = 10
|
||||||
|
|
||||||
if term is not None:
|
|
||||||
return Response(index.autocomplete(self.ix, term, limit))
|
return Response(index.autocomplete(self.ix, term, limit))
|
||||||
else:
|
|
||||||
return Response([])
|
|
||||||
|
|
||||||
|
|
||||||
class StatisticsView(APIView):
|
class StatisticsView(APIView):
|
||||||
|
@@ -1,4 +1,5 @@
|
|||||||
import json
|
import json
|
||||||
|
import math
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
@@ -79,6 +80,7 @@ INSTALLED_APPS = [
|
|||||||
"documents.apps.DocumentsConfig",
|
"documents.apps.DocumentsConfig",
|
||||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||||
"paperless_text.apps.PaperlessTextConfig",
|
"paperless_text.apps.PaperlessTextConfig",
|
||||||
|
"paperless_mail.apps.PaperlessMailConfig",
|
||||||
|
|
||||||
"django.contrib.admin",
|
"django.contrib.admin",
|
||||||
|
|
||||||
@@ -262,24 +264,58 @@ LOGGING = {
|
|||||||
# Task queue #
|
# Task queue #
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
# Sensible defaults for multitasking:
|
||||||
|
# use a fair balance between worker processes and threads per worker so that
|
||||||
|
# both consuming many documents in parallel and consuming large documents is
|
||||||
|
# reasonably fast.
|
||||||
|
# Favors threads per worker on smaller systems and never exceeds cpu_count()
|
||||||
|
# in total.
|
||||||
|
|
||||||
|
def default_task_workers():
    """Return the default number of task worker processes.

    This is the integer part of the square root of the CPU count, but never
    less than 1. Falls back to 1 when the CPU count cannot be determined.
    """
    try:
        cpu_total = multiprocessing.cpu_count()
    except NotImplementedError:
        return 1

    estimate = math.floor(math.sqrt(cpu_total))
    return estimate if estimate > 1 else 1
|
||||||
|
|
||||||
|
|
||||||
|
TASK_WORKERS = int(os.getenv("PAPERLESS_TASK_WORKERS", default_task_workers()))
|
||||||
|
|
||||||
Q_CLUSTER = {
|
Q_CLUSTER = {
|
||||||
'name': 'paperless',
|
'name': 'paperless',
|
||||||
'catch_up': False,
|
'catch_up': False,
|
||||||
|
'workers': TASK_WORKERS,
|
||||||
'redis': os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
|
'redis': os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def default_threads_per_worker():
    """Return the default number of threads each task worker may use.

    The CPU count is divided by TASK_WORKERS and rounded down, with a
    minimum of 1. Falls back to 1 when the CPU count cannot be determined.
    """
    try:
        cpu_total = multiprocessing.cpu_count()
    except NotImplementedError:
        return 1

    share = math.floor(cpu_total / TASK_WORKERS)
    return share if share > 1 else 1
|
||||||
|
|
||||||
|
|
||||||
|
# os.getenv returns a string whenever the variable is set, so convert the
# result; this keeps the setting an int in both cases, like TASK_WORKERS.
THREADS_PER_WORKER = int(os.getenv("PAPERLESS_THREADS_PER_WORKER", default_threads_per_worker()))
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
# Paperless Specific Settings #
|
# Paperless Specific Settings #
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
|
||||||
|
CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))
|
||||||
|
|
||||||
CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
|
CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
|
||||||
|
|
||||||
# The default language that tesseract will attempt to use when parsing
|
# The default language that tesseract will attempt to use when parsing
|
||||||
# documents. It should be a 3-letter language code consistent with ISO 639.
|
# documents. It should be a 3-letter language code consistent with ISO 639.
|
||||||
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
||||||
|
|
||||||
# The amount of threads to use for OCR
|
|
||||||
OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", multiprocessing.cpu_count()))
|
|
||||||
|
|
||||||
# OCR all documents?
|
# OCR all documents?
|
||||||
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
|
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
|
||||||
@@ -324,5 +360,6 @@ FILENAME_PARSE_TRANSFORMS = []
|
|||||||
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
|
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
|
||||||
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
|
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
|
||||||
|
|
||||||
|
# TODO: this should not have a prefix.
|
||||||
# Specify the filename format for out files
|
# Specify the filename format for out files
|
||||||
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")
|
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")
|
||||||
|
@@ -1,7 +1,7 @@
|
|||||||
from django.conf.urls import include, url
|
from django.conf.urls import include
|
||||||
from django.contrib import admin
|
from django.contrib import admin
|
||||||
from django.contrib.auth.decorators import login_required
|
from django.contrib.auth.decorators import login_required
|
||||||
from django.urls import path
|
from django.urls import path, re_path
|
||||||
from django.views.decorators.csrf import csrf_exempt
|
from django.views.decorators.csrf import csrf_exempt
|
||||||
from django.views.generic import RedirectView
|
from django.views.generic import RedirectView
|
||||||
from rest_framework.routers import DefaultRouter
|
from rest_framework.routers import DefaultRouter
|
||||||
@@ -30,32 +30,32 @@ api_router.register(r"tags", TagViewSet)
|
|||||||
urlpatterns = [
|
urlpatterns = [
|
||||||
|
|
||||||
# API
|
# API
|
||||||
url(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
|
re_path(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
|
||||||
url(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
|
re_path(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
|
||||||
url(r"^api/search/", SearchView.as_view(), name="search"),
|
re_path(r"^api/search/", SearchView.as_view(), name="search"),
|
||||||
url(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
|
re_path(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
|
||||||
url(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
|
re_path(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
|
||||||
|
|
||||||
# Favicon
|
# Favicon
|
||||||
url(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
|
re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
|
||||||
|
|
||||||
# The Django admin
|
# The Django admin
|
||||||
url(r"admin/", admin.site.urls),
|
re_path(r"admin/", admin.site.urls),
|
||||||
|
|
||||||
# These redirects are here to support clients that use the old FetchView.
|
# These redirects are here to support clients that use the old FetchView.
|
||||||
url(
|
re_path(
|
||||||
r"^fetch/doc/(?P<pk>\d+)$",
|
r"^fetch/doc/(?P<pk>\d+)$",
|
||||||
RedirectView.as_view(url='/api/documents/%(pk)s/download/'),
|
RedirectView.as_view(url='/api/documents/%(pk)s/download/'),
|
||||||
),
|
),
|
||||||
url(
|
re_path(
|
||||||
r"^fetch/thumb/(?P<pk>\d+)$",
|
r"^fetch/thumb/(?P<pk>\d+)$",
|
||||||
RedirectView.as_view(url='/api/documents/%(pk)s/thumb/'),
|
RedirectView.as_view(url='/api/documents/%(pk)s/thumb/'),
|
||||||
),
|
),
|
||||||
url(
|
re_path(
|
||||||
r"^fetch/preview/(?P<pk>\d+)$",
|
r"^fetch/preview/(?P<pk>\d+)$",
|
||||||
RedirectView.as_view(url='/api/documents/%(pk)s/preview/'),
|
RedirectView.as_view(url='/api/documents/%(pk)s/preview/'),
|
||||||
),
|
),
|
||||||
url(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
|
re_path(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
|
||||||
|
|
||||||
# Frontend assets TODO: this is pretty bad.
|
# Frontend assets TODO: this is pretty bad.
|
||||||
path('assets/<path:path>', RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
|
path('assets/<path:path>', RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
|
||||||
@@ -63,7 +63,7 @@ urlpatterns = [
|
|||||||
path('accounts/', include('django.contrib.auth.urls')),
|
path('accounts/', include('django.contrib.auth.urls')),
|
||||||
|
|
||||||
# Root of the Frontend
|
# Root of the Frontend
|
||||||
url(r".*", login_required(IndexView.as_view())),
|
re_path(r".*", login_required(IndexView.as_view())),
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
0
src/paperless_mail/__init__.py
Normal file
0
src/paperless_mail/__init__.py
Normal file
27
src/paperless_mail/admin.py
Normal file
27
src/paperless_mail/admin.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
from django.contrib import admin
|
||||||
|
from django import forms
|
||||||
|
|
||||||
|
from paperless_mail.models import MailAccount, MailRule
|
||||||
|
|
||||||
|
|
||||||
|
class MailAccountForm(forms.ModelForm):
    """Model form for MailAccount that renders the password as a password input."""

    # Overrides the default text input of the model's password field.
    password = forms.CharField(widget=forms.PasswordInput)

    class Meta:
        model = MailAccount
        fields = '__all__'
|
||||||
|
|
||||||
|
|
||||||
|
class MailAccountAdmin(admin.ModelAdmin):
    """Admin configuration for mail accounts."""

    # Use the dedicated form so the password is entered through a password
    # input; without this the form defined in this module is never used.
    form = MailAccountForm

    # Columns shown in the admin change list.
    list_display = ("name", "imap_server", "username")
|
||||||
|
|
||||||
|
|
||||||
|
class MailRuleAdmin(admin.ModelAdmin):
    """Admin configuration for mail rules."""

    # Columns shown in the admin change list.
    list_display = (
        "name",
        "account",
        "folder",
        "action",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Expose both mail models in the Django admin with their admin classes.
for model, model_admin in (
        (MailAccount, MailAccountAdmin),
        (MailRule, MailRuleAdmin)):
    admin.site.register(model, model_admin)
|
7
src/paperless_mail/apps.py
Normal file
7
src/paperless_mail/apps.py
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
|
||||||
|
class PaperlessMailConfig(AppConfig):
    """Django application configuration for the paperless_mail app."""

    # Python path of the application package.
    name = 'paperless_mail'

    # Human-readable application name.
    verbose_name = 'Paperless Mail'
|
227
src/paperless_mail/mail.py
Normal file
227
src/paperless_mail/mail.py
Normal file
@@ -0,0 +1,227 @@
|
|||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from datetime import timedelta, date
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
from django.utils.text import slugify
|
||||||
|
from django_q.tasks import async_task
|
||||||
|
from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
|
||||||
|
MailboxFolderSelectError
|
||||||
|
|
||||||
|
from documents.models import Correspondent
|
||||||
|
from paperless_mail.models import MailAccount, MailRule
|
||||||
|
|
||||||
|
|
||||||
|
class MailError(Exception):
    """Raised when logging in, fetching or processing mail for an account fails."""
|
||||||
|
|
||||||
|
|
||||||
|
class BaseMailAction:
    """Base class for the action applied to mails after consumption.

    The defaults add no search criteria and do nothing after consumption;
    subclasses override whichever hook they need.
    """

    def get_criteria(self):
        """Return extra search criteria as keyword arguments (none here)."""
        return dict()

    def post_consume(self, M, message_uids, parameter):
        """Hook called with the mailbox, the consumed uids and the rule parameter.

        The base implementation intentionally does nothing.
        """
        return None
|
||||||
|
|
||||||
|
|
||||||
|
class DeleteMailAction(BaseMailAction):
    """Action that deletes the consumed mails from the mailbox."""

    def post_consume(self, M, message_uids, parameter):
        """Delete the given uids via the mailbox; ``parameter`` is unused."""
        M.delete(message_uids)
|
||||||
|
|
||||||
|
|
||||||
|
class MarkReadMailAction(BaseMailAction):
    """Action that marks consumed mails as seen and skips mails already seen."""

    def get_criteria(self):
        """Return ``seen=False`` so the search only matches unseen mails."""
        return dict(seen=False)

    def post_consume(self, M, message_uids, parameter):
        """Set the seen state of the given uids to True; ``parameter`` is unused."""
        M.seen(message_uids, True)
|
||||||
|
|
||||||
|
|
||||||
|
class MoveMailAction(BaseMailAction):
    """Action that moves the consumed mails to another folder."""

    def post_consume(self, M, message_uids, parameter):
        """Move the given uids; ``parameter`` is passed as the target folder."""
        target_folder = parameter
        M.move(message_uids, target_folder)
|
||||||
|
|
||||||
|
|
||||||
|
class FlagMailAction(BaseMailAction):
    """Action that flags consumed mails and skips mails already flagged."""

    def get_criteria(self):
        """Return ``flagged=False`` so the search only matches unflagged mails."""
        return dict(flagged=False)

    def post_consume(self, M, message_uids, parameter):
        """Set the FLAGGED flag on the given uids; ``parameter`` is unused."""
        flags = [MailMessageFlags.FLAGGED]
        M.flag(message_uids, flags, True)
|
||||||
|
|
||||||
|
|
||||||
|
def get_rule_action(rule):
    """Return a new action object matching ``rule.action``.

    Raises:
        ValueError: if ``rule.action`` is not one of the MailRule.ACTION_*
            constants handled here.
    """
    action_classes = {
        MailRule.ACTION_FLAG: FlagMailAction,
        MailRule.ACTION_DELETE: DeleteMailAction,
        MailRule.ACTION_MOVE: MoveMailAction,
        MailRule.ACTION_MARK_READ: MarkReadMailAction,
    }

    action_class = action_classes.get(rule.action)
    if action_class is None:
        raise ValueError("Unknown action.")

    return action_class()
|
||||||
|
|
||||||
|
|
||||||
|
def make_criterias(rule):
    """Build the keyword arguments used to search the mailbox for a rule.

    The result always contains ``date_gte`` (today minus
    ``rule.maximum_age`` days), the non-empty from/subject/body filters of
    the rule, and finally whatever criteria the rule's action contributes.
    """
    oldest_allowed = date.today() - timedelta(days=rule.maximum_age)
    criterias = dict(date_gte=oldest_allowed)

    optional_filters = (
        ("from_", rule.filter_from),
        ("subject", rule.filter_subject),
        ("body", rule.filter_body),
    )
    for key, value in optional_filters:
        if value:
            criterias[key] = value

    # Criteria of the action are applied last, so they win on key clashes.
    criterias.update(get_rule_action(rule).get_criteria())
    return criterias
|
||||||
|
|
||||||
|
|
||||||
|
def handle_mail_account(account):
    """Run every rule of a mail account and queue matching attachments.

    For each rule the configured folder is selected, matching mails are
    fetched without marking them seen, and each mail is passed to
    ``handle_message``. Mails that produced at least one processed file are
    then handed to the rule's post-consume action.

    Args:
        account: the MailAccount to connect to; its ``rules`` are processed.

    Returns:
        The total number of files processed across all rules.

    Raises:
        ValueError: if ``account.imap_security`` is not a known option.
        MailError: if login, folder selection, fetching, message processing
            or the post-consume action fails. The original exception is
            attached as the cause.
    """
    # imap_port is nullable on the model; fall back to the standard IMAP
    # ports (143 for plain/STARTTLS, 993 for SSL) instead of passing None
    # through to the mailbox constructor.
    if account.imap_security == MailAccount.IMAP_SECURITY_NONE:
        mailbox = MailBoxUnencrypted(
            account.imap_server, account.imap_port or 143)
    elif account.imap_security == MailAccount.IMAP_SECURITY_STARTTLS:
        mailbox = MailBox(
            account.imap_server, account.imap_port or 143, starttls=True)
    elif account.imap_security == MailAccount.IMAP_SECURITY_SSL:
        mailbox = MailBox(account.imap_server, account.imap_port or 993)
    else:
        raise ValueError("Unknown IMAP security")

    total_processed_files = 0

    with mailbox as M:

        try:
            M.login(account.username, account.password)
        except Exception as e:
            raise MailError(
                f"Error while authenticating account {account.name}") from e

        for rule in account.rules.all():

            try:
                M.folder.set(rule.folder)
            except MailboxFolderSelectError as e:
                raise MailError(
                    f"Rule {rule.name}: Folder {rule.folder} does not exist "
                    f"in account {account.name}") from e

            criterias = make_criterias(rule)

            try:
                messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
            except Exception as e:
                raise MailError(
                    f"Rule {rule.name}: Error while fetching folder "
                    f"{rule.folder} of account {account.name}") from e

            # uids of the mails that yielded at least one processed file;
            # only those are passed to the post-consume action.
            post_consume_messages = []

            for message in messages:
                try:
                    processed_files = handle_message(message, rule)
                except Exception as e:
                    raise MailError(
                        f"Rule {rule.name}: Error while processing mail "
                        f"{message.uid} of account {account.name}") from e
                if processed_files > 0:
                    post_consume_messages.append(message.uid)

                total_processed_files += processed_files

            try:
                get_rule_action(rule).post_consume(
                    M,
                    post_consume_messages,
                    rule.action_parameter)
            except Exception as e:
                raise MailError(
                    f"Rule {rule.name}: Error while processing post-consume "
                    f"actions for account {account.name}") from e

    return total_processed_files
|
||||||
|
|
||||||
|
|
||||||
|
def get_title(message, att, rule):
    """Return the document title selected by ``rule.assign_title_from``.

    Either the mail subject, or the attachment's base filename without its
    extension.

    Raises:
        ValueError: if the rule's title selector is not known.
    """
    selector = rule.assign_title_from

    if selector == MailRule.TITLE_FROM_SUBJECT:
        return message.subject

    if selector == MailRule.TITLE_FROM_FILENAME:
        stem, _extension = os.path.splitext(os.path.basename(att.filename))
        return stem

    raise ValueError("Unknown title selector.")
|
||||||
|
|
||||||
|
|
||||||
|
def get_correspondent(message, rule):
    """Return the correspondent to assign according to the rule, or None.

    Depending on ``rule.assign_correspondent_from`` this is nothing, the
    correspondent configured on the rule, or a correspondent looked up (and
    created if missing) by the sender's address or display name.

    Raises:
        ValueError: if the rule's correspondent selector is not known.
    """
    selector = rule.assign_correspondent_from

    if selector == MailRule.CORRESPONDENT_FROM_NOTHING:
        return None

    if selector == MailRule.CORRESPONDENT_FROM_CUSTOM:
        return rule.assign_correspondent

    if selector == MailRule.CORRESPONDENT_FROM_EMAIL:
        correspondent_name = message.from_
    elif selector == MailRule.CORRESPONDENT_FROM_NAME:
        # Prefer the sender's name; fall back to the address when the name
        # is missing or empty.
        if message.from_values and \
                'name' in message.from_values \
                and message.from_values['name']:
            correspondent_name = message.from_values['name']
        else:
            correspondent_name = message.from_
    else:
        raise ValueError("Unknown correspondent selector")

    # The slug is only used when a new correspondent has to be created.
    correspondent, _created = Correspondent.objects.get_or_create(
        name=correspondent_name,
        defaults={"slug": slugify(correspondent_name)})
    return correspondent
|
||||||
|
|
||||||
|
|
||||||
|
def handle_message(message, rule):
    """Queue the PDF attachments of a mail for consumption.

    Each attachment with content type ``application/pdf`` is written to a
    temporary file in ``settings.SCRATCH_DIR`` and handed to
    ``documents.tasks.consume_file`` via ``async_task``, together with the
    title, correspondent, document type and tag derived from the rule.

    Returns:
        The number of attachments that were queued (0 when the mail has no
        attachments).
    """
    if not message.attachments:
        return 0

    correspondent = get_correspondent(message, rule)
    tag = rule.assign_tag
    doc_type = rule.assign_document_type

    processed_attachments = 0

    for att in message.attachments:

        title = get_title(message, att, rule)

        # TODO: check with parsers what files types are supported
        if att.content_type != 'application/pdf':
            continue

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
        # mkstemp returns an already-open OS-level descriptor; write through
        # it so it is closed again instead of leaking one per attachment.
        fd, temp_filename = tempfile.mkstemp(
            prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
        with os.fdopen(fd, 'wb') as f:
            f.write(att.payload)

        async_task(
            "documents.tasks.consume_file",
            path=temp_filename,
            override_filename=att.filename,
            override_title=title,
            override_correspondent_id=correspondent.id if correspondent else None,
            override_document_type_id=doc_type.id if doc_type else None,
            override_tag_ids=[tag.id] if tag else None,
            task_name=f"Mail: {att.filename}"
        )

        processed_attachments += 1

    return processed_attachments
|
0
src/paperless_mail/management/__init__.py
Normal file
0
src/paperless_mail/management/__init__.py
Normal file
0
src/paperless_mail/management/commands/__init__.py
Normal file
0
src/paperless_mail/management/commands/__init__.py
Normal file
13
src/paperless_mail/management/commands/mail_fetcher.py
Normal file
13
src/paperless_mail/management/commands/mail_fetcher.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
|
||||||
|
from paperless_mail import mail, tasks
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Management command that processes all mail accounts once."""

    # Shown by ``manage.py help mail_fetcher``; previously effectively empty.
    help = "Process all configured mail accounts once."

    def handle(self, *args, **options):
        """Run the mail processing task synchronously."""
        tasks.process_mail_accounts()
|
48
src/paperless_mail/migrations/0001_initial.py
Normal file
48
src/paperless_mail/migrations/0001_initial.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
# Generated by Django 3.1.3 on 2020-11-15 22:54
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
import django.db.models.deletion
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Initial migration: creates the MailAccount and MailRule tables."""

    initial = True

    dependencies = [
        ('documents', '1002_auto_20201111_1105'),
    ]

    operations = [
        migrations.CreateModel(
            name='MailAccount',
            fields=[
                ('id', models.AutoField(
                    auto_created=True, primary_key=True, serialize=False,
                    verbose_name='ID')),
                ('name', models.CharField(max_length=256, unique=True)),
                ('imap_server', models.CharField(max_length=256)),
                ('imap_port', models.IntegerField(blank=True, null=True)),
                ('imap_security', models.PositiveIntegerField(
                    choices=[
                        (1, 'No encryption'),
                        (2, 'Use SSL'),
                        (3, 'Use STARTTLS'),
                    ],
                    default=2)),
                ('username', models.CharField(max_length=256)),
                ('password', models.CharField(max_length=256)),
            ],
        ),
        migrations.CreateModel(
            name='MailRule',
            fields=[
                ('id', models.AutoField(
                    auto_created=True, primary_key=True, serialize=False,
                    verbose_name='ID')),
                ('name', models.CharField(max_length=256)),
                ('folder', models.CharField(default='INBOX', max_length=256)),
                ('filter_from', models.CharField(
                    blank=True, max_length=256, null=True)),
                ('filter_subject', models.CharField(
                    blank=True, max_length=256, null=True)),
                ('filter_body', models.CharField(
                    blank=True, max_length=256, null=True)),
                ('maximum_age', models.PositiveIntegerField(default=30)),
                ('action', models.PositiveIntegerField(
                    choices=[
                        (1, 'Delete'),
                        (2, 'Move to specified folder'),
                        (3, "Mark as read, don't process read mails"),
                        (4, "Flag the mail, don't process flagged mails"),
                    ],
                    default=3,
                    help_text='The action applied to the mail. This action is only performed when documents were consumed from the mail. Mails without attachments will remain entirely untouched.')),
                ('action_parameter', models.CharField(
                    blank=True,
                    help_text='Additional parameter for the action selected above, i.e., the target folder of the move to folder action.',
                    max_length=256,
                    null=True)),
                ('assign_title_from', models.PositiveIntegerField(
                    choices=[
                        (1, 'Use subject as title'),
                        (2, 'Use attachment filename as title'),
                    ],
                    default=1)),
                ('assign_correspondent_from', models.PositiveIntegerField(
                    choices=[
                        (1, 'Do not assign a correspondent'),
                        (2, 'Use mail address'),
                        (3, 'Use name (or mail address if not available)'),
                        (4, 'Use correspondent selected below'),
                    ],
                    default=1)),
                ('account', models.ForeignKey(
                    on_delete=django.db.models.deletion.CASCADE,
                    related_name='rules',
                    to='paperless_mail.mailaccount')),
                ('assign_correspondent', models.ForeignKey(
                    blank=True, null=True,
                    on_delete=django.db.models.deletion.SET_NULL,
                    to='documents.correspondent')),
                ('assign_document_type', models.ForeignKey(
                    blank=True, null=True,
                    on_delete=django.db.models.deletion.SET_NULL,
                    to='documents.documenttype')),
                ('assign_tag', models.ForeignKey(
                    blank=True, null=True,
                    on_delete=django.db.models.deletion.SET_NULL,
                    to='documents.tag')),
            ],
        ),
    ]
|
32
src/paperless_mail/migrations/0002_auto_20201117_1334.py
Normal file
32
src/paperless_mail/migrations/0002_auto_20201117_1334.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
# Generated by Django 3.1.3 on 2020-11-17 13:34
|
||||||
|
|
||||||
|
from django.db import migrations
|
||||||
|
from django.db.migrations import RunPython
|
||||||
|
from django_q.models import Schedule
|
||||||
|
from django_q.tasks import schedule
|
||||||
|
|
||||||
|
|
||||||
|
def add_schedules(apps, schema_editor):
    """Create the schedule that runs the mail processing task every 10 minutes."""
    schedule_options = dict(
        name="Check all e-mail accounts",
        schedule_type=Schedule.MINUTES,
        minutes=10,
    )
    schedule('paperless_mail.tasks.process_mail_accounts', **schedule_options)
|
||||||
|
|
||||||
|
|
||||||
|
def remove_schedules(apps, schema_editor):
    """Delete every schedule that runs the mail processing task."""
    # Use the historical model from the app registry rather than the live
    # class, so this migration keeps working if django_q's Schedule model
    # changes later.
    historical_schedule = apps.get_model('django_q', 'Schedule')
    historical_schedule.objects.filter(
        func='paperless_mail.tasks.process_mail_accounts').delete()
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Data migration that adds the periodic mail check schedule."""

    dependencies = [
        ('paperless_mail', '0001_initial'),
        ('django_q', '0013_task_attempt_count'),
    ]

    operations = [
        # Forward: add the schedule. Backward: remove it again.
        migrations.RunPython(add_schedules, remove_schedules),
    ]
|
||||||
|
|
||||||
|
|
0
src/paperless_mail/migrations/__init__.py
Normal file
0
src/paperless_mail/migrations/__init__.py
Normal file
137
src/paperless_mail/models.py
Normal file
137
src/paperless_mail/models.py
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
from django.db import models
|
||||||
|
|
||||||
|
# Create your models here.
|
||||||
|
from django.db import models
|
||||||
|
|
||||||
|
import documents.models as document_models
|
||||||
|
|
||||||
|
|
||||||
|
class MailAccount(models.Model):
    """Connection settings and credentials of one IMAP account."""

    # Values stored in ``imap_security``.
    IMAP_SECURITY_NONE = 1
    IMAP_SECURITY_SSL = 2
    IMAP_SECURITY_STARTTLS = 3

    IMAP_SECURITY_OPTIONS = (
        (IMAP_SECURITY_NONE, "No encryption"),
        (IMAP_SECURITY_SSL, "Use SSL"),
        (IMAP_SECURITY_STARTTLS, "Use STARTTLS"),
    )

    # Unique display name of the account.
    name = models.CharField(max_length=256, unique=True)

    imap_server = models.CharField(max_length=256)

    # Optional; may be left empty.
    imap_port = models.IntegerField(blank=True, null=True)

    imap_security = models.PositiveIntegerField(
        choices=IMAP_SECURITY_OPTIONS,
        default=IMAP_SECURITY_SSL,
    )

    username = models.CharField(max_length=256)

    # NOTE(review): stored in a plain CharField.
    password = models.CharField(max_length=256)

    def __str__(self):
        """Return the account name."""
        return self.name
|
||||||
|
|
||||||
|
|
||||||
|
class MailRule(models.Model):
    """A rule describing which mails of an account to consume and how.

    A rule selects mails in one folder by age and optional filters, decides
    which metadata is assigned to the resulting documents, and names the
    action applied to the mail afterwards.
    """

    # Values stored in ``action``.
    ACTION_DELETE = 1
    ACTION_MOVE = 2
    ACTION_MARK_READ = 3
    ACTION_FLAG = 4

    ACTIONS = (
        (ACTION_DELETE, "Delete"),
        (ACTION_MOVE, "Move to specified folder"),
        (ACTION_MARK_READ, "Mark as read, don't process read mails"),
        (ACTION_FLAG, "Flag the mail, don't process flagged mails"),
    )

    # Values stored in ``assign_title_from``.
    TITLE_FROM_SUBJECT = 1
    TITLE_FROM_FILENAME = 2

    TITLE_SELECTOR = (
        (TITLE_FROM_SUBJECT, "Use subject as title"),
        (TITLE_FROM_FILENAME, "Use attachment filename as title"),
    )

    # Values stored in ``assign_correspondent_from``.
    CORRESPONDENT_FROM_NOTHING = 1
    CORRESPONDENT_FROM_EMAIL = 2
    CORRESPONDENT_FROM_NAME = 3
    CORRESPONDENT_FROM_CUSTOM = 4

    CORRESPONDENT_SELECTOR = (
        (CORRESPONDENT_FROM_NOTHING, "Do not assign a correspondent"),
        (CORRESPONDENT_FROM_EMAIL, "Use mail address"),
        (CORRESPONDENT_FROM_NAME, "Use name (or mail address if not available)"),
        (CORRESPONDENT_FROM_CUSTOM, "Use correspondent selected below"),
    )

    name = models.CharField(max_length=256)

    # Rules are deleted together with their account.
    account = models.ForeignKey(
        MailAccount,
        related_name="rules",
        on_delete=models.CASCADE,
    )

    folder = models.CharField(default='INBOX', max_length=256)

    # Optional filters; empty values are not applied.
    filter_from = models.CharField(max_length=256, null=True, blank=True)
    filter_subject = models.CharField(max_length=256, null=True, blank=True)
    filter_body = models.CharField(max_length=256, null=True, blank=True)

    # Maximum age of mails in days.
    maximum_age = models.PositiveIntegerField(default=30)

    action = models.PositiveIntegerField(
        choices=ACTIONS,
        default=ACTION_MARK_READ,
        help_text="The action applied to the mail. This action is only "
                  "performed when documents were consumed from the mail. "
                  "Mails without attachments will remain entirely "
                  "untouched.",
    )

    action_parameter = models.CharField(
        max_length=256, blank=True, null=True,
        help_text="Additional parameter for the action selected above, i.e., "
                  "the target folder of the move to folder action.",
    )

    assign_title_from = models.PositiveIntegerField(
        choices=TITLE_SELECTOR,
        default=TITLE_FROM_SUBJECT,
    )

    assign_tag = models.ForeignKey(
        document_models.Tag,
        null=True,
        blank=True,
        on_delete=models.SET_NULL,
    )

    assign_document_type = models.ForeignKey(
        document_models.DocumentType,
        null=True,
        blank=True,
        on_delete=models.SET_NULL,
    )

    assign_correspondent_from = models.PositiveIntegerField(
        choices=CORRESPONDENT_SELECTOR,
        default=CORRESPONDENT_FROM_NOTHING,
    )

    # Correspondent for the CORRESPONDENT_FROM_CUSTOM selector.
    assign_correspondent = models.ForeignKey(
        document_models.Correspondent,
        null=True,
        blank=True,
        on_delete=models.SET_NULL,
    )

    def __str__(self):
        """Return the rule name."""
        return self.name
|
23
src/paperless_mail/tasks.py
Normal file
23
src/paperless_mail/tasks.py
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
from paperless_mail import mail
|
||||||
|
from paperless_mail.models import MailAccount
|
||||||
|
|
||||||
|
|
||||||
|
def process_mail_accounts():
    """Process every mail account and return a one-line summary string."""
    total_new_documents = sum(
        mail.handle_mail_account(account)
        for account in MailAccount.objects.all()
    )

    if total_new_documents > 0:
        return f"Added {total_new_documents} document(s)."
    return "No new documents were added."
|
||||||
|
|
||||||
|
|
||||||
|
def process_mail_account(name):
    """Process the single mail account with the given name.

    Logs an error and does nothing else when no account with that name
    exists.
    """
    # Managers have no ``find``; filter().first() returns None when the
    # account does not exist instead of raising.
    account = MailAccount.objects.filter(name=name).first()
    if account is None:
        logging.error("Unknown mail account: %s", name)
        return

    mail.handle_mail_account(account)
|
0
src/paperless_mail/tests/__init__.py
Normal file
0
src/paperless_mail/tests/__init__.py
Normal file
352
src/paperless_mail/tests/test_mail.py
Normal file
352
src/paperless_mail/tests/test_mail.py
Normal file
@@ -0,0 +1,352 @@
|
|||||||
|
import uuid
|
||||||
|
from collections import namedtuple
|
||||||
|
from typing import ContextManager
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
|
from django.test import TestCase
|
||||||
|
from imap_tools import MailMessageFlags, MailboxFolderSelectError
|
||||||
|
|
||||||
|
from documents.models import Correspondent
|
||||||
|
from paperless_mail.mail import get_correspondent, get_title, handle_message, handle_mail_account, MailError
|
||||||
|
from paperless_mail.models import MailRule, MailAccount
|
||||||
|
|
||||||
|
|
||||||
|
class BogusFolderManager:
    """Test double for a mailbox folder manager knowing "INBOX" and "spam"."""

    current_folder = "INBOX"

    def set(self, new_folder):
        """Select ``new_folder``; raise MailboxFolderSelectError if unknown."""
        if new_folder in ("INBOX", "spam"):
            self.current_folder = new_folder
        else:
            raise MailboxFolderSelectError(None, "uhm")
|
||||||
|
|
||||||
|
|
||||||
|
class BogusMailBox(ContextManager):
    """In-memory test double for a mailbox.

    Holds a list of fake messages and implements just enough of the mailbox
    interface (login, folder, fetch, seen, delete, flag, move) for the mail
    handling code to run against it.
    """

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __init__(self):
        self.messages = []
        self.messages_spam = []

    def login(self, username, password):
        """Accept only admin/secret; raise for any other credentials."""
        if not (username == 'admin' and password == 'secret'):
            raise Exception()

    # Shared folder manager (class attribute).
    folder = BogusFolderManager()

    def fetch(self, criteria, mark_seen):
        """Return the messages matching a simplified reading of ``criteria``.

        The criteria object is stringified and split on spaces, so only the
        first word after SUBJECT/BODY/FROM is used for matching.
        """
        tokens = str(criteria).strip('()').split(" ")

        def argument_of(keyword):
            # The token following the keyword, without surrounding quotes.
            return tokens[tokens.index(keyword) + 1].strip('"')

        msg = self.messages

        if 'UNSEEN' in tokens:
            msg = [m for m in msg if not m.seen]

        if 'SUBJECT' in tokens:
            subject = argument_of('SUBJECT')
            msg = [m for m in msg if subject in m.subject]

        if 'BODY' in tokens:
            body = argument_of('BODY')
            msg = [m for m in msg if body in m.body]

        if 'FROM' in tokens:
            from_ = argument_of('FROM')
            msg = [m for m in msg if from_ in m.from_]

        if 'UNFLAGGED' in tokens:
            msg = [m for m in msg if not m.flagged]

        return list(msg)

    def seen(self, uid_list, seen_val):
        """Set the seen state of every message whose uid is in ``uid_list``."""
        for message in self.messages:
            if message.uid in uid_list:
                message.seen = seen_val

    def delete(self, uid_list):
        """Drop every message whose uid is in ``uid_list``."""
        self.messages = [m for m in self.messages if m.uid not in uid_list]

    def flag(self, uid_list, flag_set, value):
        """Apply the FLAGGED flag (the only one supported) to matching messages."""
        for message in self.messages:
            if message.uid in uid_list:
                for flag in flag_set:
                    if flag == MailMessageFlags.FLAGGED:
                        message.flagged = value

    def move(self, uid_list, folder):
        """Move matching messages to the spam list; any other folder raises."""
        if folder != "spam":
            raise Exception()

        # Store the moved messages themselves; appending a lazy filter
        # object would leave messages_spam holding one opaque iterator per
        # call instead of the messages.
        self.messages_spam.extend(
            m for m in self.messages if m.uid in uid_list
        )
        self.messages = [m for m in self.messages if m.uid not in uid_list]
|
||||||
|
|
||||||
|
|
||||||
|
def create_message(num_attachments=1, body="", subject="the suject", from_="noone@mail.com", seen=False, flagged=False):
    """Build a fake mail message with ``num_attachments`` PDF attachments.

    The message and each attachment are fresh, field-less namedtuple classes
    used as plain attribute holders; the message gets a random uuid as uid.
    """

    def make_attachment():
        # Every attachment is an identical small fake PDF.
        holder = namedtuple('Attachment', [])
        holder.filename = 'some_file.pdf'
        holder.content_type = 'application/pdf'
        holder.payload = b'content of the attachment'
        return holder

    message = namedtuple('MailMessage', [])

    message.uid = uuid.uuid4()
    message.subject = subject
    message.attachments = [make_attachment() for _ in range(num_attachments)]
    message.from_ = from_
    message.body = body
    message.seen = seen
    message.flagged = flagged

    return message
|
||||||
|
|
||||||
|
|
||||||
|
class TestMail(TestCase):
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
patcher = mock.patch('paperless_mail.mail.MailBox')
|
||||||
|
m = patcher.start()
|
||||||
|
self.bogus_mailbox = BogusMailBox()
|
||||||
|
m.return_value = self.bogus_mailbox
|
||||||
|
self.addCleanup(patcher.stop)
|
||||||
|
|
||||||
|
patcher = mock.patch('paperless_mail.mail.async_task')
|
||||||
|
self.async_task = patcher.start()
|
||||||
|
self.addCleanup(patcher.stop)
|
||||||
|
|
||||||
|
self.reset_bogus_mailbox()
|
||||||
|
|
||||||
|
def reset_bogus_mailbox(self):
|
||||||
|
self.bogus_mailbox.messages = []
|
||||||
|
self.bogus_mailbox.messages_spam = []
|
||||||
|
self.bogus_mailbox.messages.append(create_message(subject="Invoice 1", from_="amazon@amazon.de", body="cables", seen=True, flagged=False))
|
||||||
|
self.bogus_mailbox.messages.append(create_message(subject="Invoice 2", body="from my favorite electronic store", seen=False, flagged=True))
|
||||||
|
self.bogus_mailbox.messages.append(create_message(subject="Claim your $10M price now!", from_="amazon@amazon-some-indian-site.org", seen=False))
|
||||||
|
|
||||||
|
def test_get_correspondent(self):
|
||||||
|
message = namedtuple('MailMessage', [])
|
||||||
|
message.from_ = "someone@somewhere.com"
|
||||||
|
message.from_values = {'name': "Someone!", 'email': "someone@somewhere.com"}
|
||||||
|
|
||||||
|
message2 = namedtuple('MailMessage', [])
|
||||||
|
message2.from_ = "me@localhost.com"
|
||||||
|
message2.from_values = {'name': "", 'email': "fake@localhost.com"}
|
||||||
|
|
||||||
|
me_localhost = Correspondent.objects.create(name=message2.from_)
|
||||||
|
someone_else = Correspondent.objects.create(name="someone else")
|
||||||
|
|
||||||
|
rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING)
|
||||||
|
self.assertIsNone(get_correspondent(message, rule))
|
||||||
|
|
||||||
|
rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)
|
||||||
|
c = get_correspondent(message, rule)
|
||||||
|
self.assertIsNotNone(c)
|
||||||
|
self.assertEqual(c.name, "someone@somewhere.com")
|
||||||
|
c = get_correspondent(message2, rule)
|
||||||
|
self.assertIsNotNone(c)
|
||||||
|
self.assertEqual(c.name, "me@localhost.com")
|
||||||
|
self.assertEqual(c.id, me_localhost.id)
|
||||||
|
|
||||||
|
rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME)
|
||||||
|
c = get_correspondent(message, rule)
|
||||||
|
self.assertIsNotNone(c)
|
||||||
|
self.assertEqual(c.name, "Someone!")
|
||||||
|
c = get_correspondent(message2, rule)
|
||||||
|
self.assertIsNotNone(c)
|
||||||
|
self.assertEqual(c.id, me_localhost.id)
|
||||||
|
|
||||||
|
rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else)
|
||||||
|
c = get_correspondent(message, rule)
|
||||||
|
self.assertEqual(c, someone_else)
|
||||||
|
|
||||||
|
def test_get_title(self):
|
||||||
|
message = namedtuple('MailMessage', [])
|
||||||
|
message.subject = "the message title"
|
||||||
|
att = namedtuple('Attachment', [])
|
||||||
|
att.filename = "this_is_the_file.pdf"
|
||||||
|
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME)
|
||||||
|
self.assertEqual(get_title(message, att, rule), "this_is_the_file")
|
||||||
|
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_SUBJECT)
|
||||||
|
self.assertEqual(get_title(message, att, rule), "the message title")
|
||||||
|
|
||||||
|
def test_handle_message(self):
    """Check which attachments of a mail are handed over for consumption.

    The mail carries two ``application/pdf`` attachments and one
    ``application/invalid`` attachment. The test expects handle_message
    to return 2 and ``self.async_task`` to be called twice, once for each
    PDF, with title and file name overrides derived from the file name.
    """
    mail = namedtuple('MailMessage', [])
    mail.subject = "the message title"

    attachment_specs = (
        ("test1.pdf", 'application/pdf'),
        ("test2.pdf", 'application/pdf'),
        ("test3.pdf", 'application/invalid'),
    )
    attachments = []
    for filename, content_type in attachment_specs:
        # A fresh holder class per attachment so attributes are not shared.
        holder = namedtuple('Attachment', [])
        holder.filename = filename
        holder.content_type = content_type
        holder.payload = b"attachment contents"
        attachments.append(holder)
    mail.attachments = attachments

    rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME)

    processed = handle_message(mail, rule)

    self.assertEqual(processed, 2)

    recorded_calls = self.async_task.call_args_list
    self.assertEqual(len(recorded_calls), 2)

    for recorded, stem in zip(recorded_calls, ("test1", "test2")):
        # Each recorded call unpacks into (positional args, keyword args).
        _, keywords = recorded
        self.assertEqual(keywords['override_title'], stem)
        self.assertEqual(keywords['override_filename'], stem + ".pdf")
|
||||||
|
|
||||||
|
@mock.patch("paperless_mail.mail.async_task")
def test_handle_empty_message(self, m):
    """Check that a mail without attachments queues nothing.

    ``m`` is the mock that replaces ``paperless_mail.mail.async_task``
    for the duration of this test.
    """
    empty_mail = namedtuple('MailMessage', [])
    empty_mail.attachments = []

    processed = handle_message(empty_mail, MailRule())

    self.assertFalse(m.called)
    self.assertEqual(processed, 0)
|
||||||
|
|
||||||
|
def test_handle_mail_account_mark_read(self):
    """Check the ACTION_MARK_READ rule action.

    Before processing, the test mailbox reports two UNSEEN messages and
    nothing has been queued; afterwards two tasks have been queued and no
    UNSEEN message is left.
    """
    account = MailAccount.objects.create(
        name="test", imap_server="", username="admin", password="secret")
    MailRule.objects.create(
        name="testrule", account=account,
        action=MailRule.ACTION_MARK_READ)

    def unseen_count():
        return len(self.bogus_mailbox.fetch("UNSEEN", False))

    self.assertEqual(self.async_task.call_count, 0)
    self.assertEqual(unseen_count(), 2)

    handle_mail_account(account)

    self.assertEqual(self.async_task.call_count, 2)
    self.assertEqual(unseen_count(), 0)
|
||||||
|
|
||||||
|
def test_handle_mail_account_delete(self):
    """Check the ACTION_DELETE rule action with a subject filter.

    The rule filters on the subject "Invoice". The test expects two
    queued tasks and the mailbox to shrink from three messages to one.
    """
    account = MailAccount.objects.create(
        name="test", imap_server="", username="admin", password="secret")
    MailRule.objects.create(
        name="testrule", account=account,
        action=MailRule.ACTION_DELETE, filter_subject="Invoice")

    def message_count():
        return len(self.bogus_mailbox.messages)

    self.assertEqual(self.async_task.call_count, 0)
    self.assertEqual(message_count(), 3)

    handle_mail_account(account)

    self.assertEqual(self.async_task.call_count, 2)
    self.assertEqual(message_count(), 1)
|
||||||
|
|
||||||
|
def test_handle_mail_account_flag(self):
    """Check the ACTION_FLAG rule action with a subject filter.

    The rule filters on the subject "Invoice". The test expects exactly
    one queued task and the number of UNFLAGGED messages to drop from two
    to one.
    """
    account = MailAccount.objects.create(
        name="test", imap_server="", username="admin", password="secret")
    MailRule.objects.create(
        name="testrule", account=account,
        action=MailRule.ACTION_FLAG, filter_subject="Invoice")

    def unflagged_count():
        return len(self.bogus_mailbox.fetch("UNFLAGGED", False))

    self.assertEqual(self.async_task.call_count, 0)
    self.assertEqual(unflagged_count(), 2)

    handle_mail_account(account)

    self.assertEqual(self.async_task.call_count, 1)
    self.assertEqual(unflagged_count(), 1)
|
||||||
|
|
||||||
|
def test_handle_mail_account_move(self):
    """Check the ACTION_MOVE rule action.

    The rule filters on the subject "Claim" and uses "spam" as the
    action parameter. The test expects one queued task, one message
    fewer in ``messages`` and one message more in ``messages_spam``.
    """
    account = MailAccount.objects.create(
        name="test", imap_server="", username="admin", password="secret")
    MailRule.objects.create(
        name="testrule", account=account,
        action=MailRule.ACTION_MOVE, action_parameter="spam",
        filter_subject="Claim")

    def assert_state(queued, in_main, in_spam):
        # Same three checks, in the same order, before and after the run.
        self.assertEqual(self.async_task.call_count, queued)
        self.assertEqual(len(self.bogus_mailbox.messages), in_main)
        self.assertEqual(len(self.bogus_mailbox.messages_spam), in_spam)

    assert_state(0, 3, 0)

    handle_mail_account(account)

    assert_state(1, 2, 1)
|
||||||
|
|
||||||
|
def test_errors(self):
    """Check that handle_mail_account reports failures as MailError.

    Three scenarios are exercised, each with its own account:

    1. an account created with the password "wrong", expected to fail
       with a message starting with "Error while authenticating account";
    2. a rule pointing at the folder "uuuh", expected to fail with a
       message containing "uuuh does not exist";
    3. a move rule whose action parameter is "doesnotexist", expected to
       fail with a message containing
       "Error while processing post-consume actions".

    ``assertRaises`` is used as a context manager, so a missing exception
    fails the test and the raised exception is available for inspection.
    """
    account = MailAccount.objects.create(
        name="test", imap_server="", username="admin", password="wrong")

    with self.assertRaises(MailError) as raised:
        handle_mail_account(account)
    self.assertTrue(
        str(raised.exception).startswith(
            "Error while authenticating account"))

    account = MailAccount.objects.create(
        name="test2", imap_server="", username="admin", password="secret")
    MailRule.objects.create(name="testrule", account=account, folder="uuuh")

    with self.assertRaises(MailError) as raised:
        handle_mail_account(account)
    self.assertIn("uuuh does not exist", str(raised.exception))

    account = MailAccount.objects.create(
        name="test3", imap_server="", username="admin", password="secret")
    MailRule.objects.create(
        name="testrule", account=account,
        action=MailRule.ACTION_MOVE, action_parameter="doesnotexist",
        filter_subject="Claim")

    with self.assertRaises(MailError) as raised:
        handle_mail_account(account)
    self.assertIn(
        "Error while processing post-consume actions",
        str(raised.exception))
|
||||||
|
|
||||||
|
def test_filters(self):
    """Check the subject, body and sender filters of a delete rule.

    The same rule is reconfigured several times. Before every run the
    mailbox must hold three messages; after the run the test checks how
    many messages are left and the cumulative number of queued tasks
    (``self.async_task`` is not reset between runs).
    """
    account = MailAccount.objects.create(
        name="test3", imap_server="", username="admin", password="secret")
    rule = MailRule.objects.create(
        name="testrule", account=account,
        action=MailRule.ACTION_DELETE, filter_subject="Claim")

    def run_and_check(remaining, total_queued):
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), remaining)
        self.assertEqual(self.async_task.call_count, total_queued)

    self.assertEqual(self.async_task.call_count, 0)

    # First run uses the rule exactly as created (subject filter only).
    run_and_check(2, 1)

    # (field updates applied in order, messages left, cumulative tasks)
    scenarios = (
        ((("filter_subject", None), ("filter_body", "electronic")), 2, 2),
        ((("filter_from", "amazon"), ("filter_body", None)), 1, 4),
        ((("filter_from", "amazon"), ("filter_body", "cables"),
          ("filter_subject", "Invoice")), 2, 5),
    )
    for updates, remaining, total_queued in scenarios:
        self.reset_bogus_mailbox()
        for field_name, value in updates:
            setattr(rule, field_name, value)
        rule.save()
        run_and_check(remaining, total_queued)
|
3
src/paperless_mail/views.py
Normal file
3
src/paperless_mail/views.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
from django.shortcuts import render
|
||||||
|
|
||||||
|
# Create your views here.
|
@@ -1,5 +1,7 @@
|
|||||||
from django.apps import AppConfig
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
from paperless_tesseract.signals import tesseract_consumer_declaration
|
||||||
|
|
||||||
|
|
||||||
class PaperlessTesseractConfig(AppConfig):
|
class PaperlessTesseractConfig(AppConfig):
|
||||||
|
|
||||||
@@ -9,8 +11,6 @@ class PaperlessTesseractConfig(AppConfig):
|
|||||||
|
|
||||||
from documents.signals import document_consumer_declaration
|
from documents.signals import document_consumer_declaration
|
||||||
|
|
||||||
from .signals import ConsumerDeclaration
|
document_consumer_declaration.connect(tesseract_consumer_declaration)
|
||||||
|
|
||||||
document_consumer_declaration.connect(ConsumerDeclaration.handle)
|
|
||||||
|
|
||||||
AppConfig.ready(self)
|
AppConfig.ready(self)
|
||||||
|
@@ -2,7 +2,7 @@ import itertools
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
from multiprocessing.pool import Pool
|
from multiprocessing.pool import ThreadPool
|
||||||
|
|
||||||
import langdetect
|
import langdetect
|
||||||
import pdftotext
|
import pdftotext
|
||||||
@@ -151,7 +151,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
self.log("info", "Running unpaper on {} pages...".format(len(pnms)))
|
self.log("info", "Running unpaper on {} pages...".format(len(pnms)))
|
||||||
|
|
||||||
# Run unpaper in parallel on converted images
|
# Run unpaper in parallel on converted images
|
||||||
with Pool(processes=settings.OCR_THREADS) as pool:
|
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
|
||||||
pnms = pool.map(run_unpaper, pnms)
|
pnms = pool.map(run_unpaper, pnms)
|
||||||
|
|
||||||
return sorted(filter(lambda __: os.path.isfile(__), pnms))
|
return sorted(filter(lambda __: os.path.isfile(__), pnms))
|
||||||
@@ -166,7 +166,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
def _ocr(self, imgs, lang):
|
def _ocr(self, imgs, lang):
|
||||||
self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
|
self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
|
||||||
with Pool(processes=settings.OCR_THREADS) as pool:
|
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
|
||||||
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
||||||
return r
|
return r
|
||||||
|
|
||||||
|
@@ -3,21 +3,16 @@ import re
|
|||||||
from .parsers import RasterisedDocumentParser
|
from .parsers import RasterisedDocumentParser
|
||||||
|
|
||||||
|
|
||||||
class ConsumerDeclaration:
|
def tesseract_consumer_declaration(sender, **kwargs):
|
||||||
|
|
||||||
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def handle(cls, sender, **kwargs):
|
|
||||||
return cls.test
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def test(cls, doc):
|
|
||||||
|
|
||||||
if cls.MATCHING_FILES.match(doc.lower()):
|
|
||||||
return {
|
return {
|
||||||
"parser": RasterisedDocumentParser,
|
"parser": RasterisedDocumentParser,
|
||||||
"weight": 0
|
"weight": 0,
|
||||||
|
"test": tesseract_consumer_test
|
||||||
}
|
}
|
||||||
|
|
||||||
return None
|
|
||||||
|
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
|
||||||
|
|
||||||
|
|
||||||
|
def tesseract_consumer_test(doc):
|
||||||
|
return MATCHING_FILES.match(doc.lower())
|
||||||
|
@@ -1,6 +1,6 @@
|
|||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
|
|
||||||
from ..signals import ConsumerDeclaration
|
from paperless_tesseract.signals import tesseract_consumer_test
|
||||||
|
|
||||||
|
|
||||||
class SignalsTestCase(TestCase):
|
class SignalsTestCase(TestCase):
|
||||||
@@ -20,7 +20,7 @@ class SignalsTestCase(TestCase):
|
|||||||
for prefix in prefixes:
|
for prefix in prefixes:
|
||||||
for suffix in suffixes:
|
for suffix in suffixes:
|
||||||
name = "{}.{}".format(prefix, suffix)
|
name = "{}.{}".format(prefix, suffix)
|
||||||
self.assertTrue(ConsumerDeclaration.test(name))
|
self.assertTrue(tesseract_consumer_test(name))
|
||||||
|
|
||||||
def test_test_handles_various_file_names_false(self):
|
def test_test_handles_various_file_names_false(self):
|
||||||
|
|
||||||
@@ -30,7 +30,7 @@ class SignalsTestCase(TestCase):
|
|||||||
for prefix in prefixes:
|
for prefix in prefixes:
|
||||||
for suffix in suffixes:
|
for suffix in suffixes:
|
||||||
name = "{}.{}".format(prefix, suffix)
|
name = "{}.{}".format(prefix, suffix)
|
||||||
self.assertFalse(ConsumerDeclaration.test(name))
|
self.assertFalse(tesseract_consumer_test(name))
|
||||||
|
|
||||||
self.assertFalse(ConsumerDeclaration.test(""))
|
self.assertFalse(tesseract_consumer_test(""))
|
||||||
self.assertFalse(ConsumerDeclaration.test("doc"))
|
self.assertFalse(tesseract_consumer_test("doc"))
|
||||||
|
@@ -1,5 +1,7 @@
|
|||||||
from django.apps import AppConfig
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
from paperless_text.signals import text_consumer_declaration
|
||||||
|
|
||||||
|
|
||||||
class PaperlessTextConfig(AppConfig):
|
class PaperlessTextConfig(AppConfig):
|
||||||
|
|
||||||
@@ -9,8 +11,6 @@ class PaperlessTextConfig(AppConfig):
|
|||||||
|
|
||||||
from documents.signals import document_consumer_declaration
|
from documents.signals import document_consumer_declaration
|
||||||
|
|
||||||
from .signals import ConsumerDeclaration
|
document_consumer_declaration.connect(text_consumer_declaration)
|
||||||
|
|
||||||
document_consumer_declaration.connect(ConsumerDeclaration.handle)
|
|
||||||
|
|
||||||
AppConfig.ready(self)
|
AppConfig.ready(self)
|
||||||
|
@@ -3,21 +3,16 @@ import re
|
|||||||
from .parsers import TextDocumentParser
|
from .parsers import TextDocumentParser
|
||||||
|
|
||||||
|
|
||||||
class ConsumerDeclaration:
|
def text_consumer_declaration(sender, **kwargs):
|
||||||
|
|
||||||
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def handle(cls, sender, **kwargs):
|
|
||||||
return cls.test
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def test(cls, doc):
|
|
||||||
|
|
||||||
if cls.MATCHING_FILES.match(doc.lower()):
|
|
||||||
return {
|
return {
|
||||||
"parser": TextDocumentParser,
|
"parser": TextDocumentParser,
|
||||||
"weight": 10
|
"weight": 10,
|
||||||
|
"test": text_consumer_test
|
||||||
}
|
}
|
||||||
|
|
||||||
return None
|
|
||||||
|
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
|
||||||
|
|
||||||
|
|
||||||
|
def text_consumer_test(doc):
|
||||||
|
return MATCHING_FILES.match(doc.lower())
|
||||||
|
@@ -6,7 +6,6 @@ ignore = E501
|
|||||||
DJANGO_SETTINGS_MODULE=paperless.settings
|
DJANGO_SETTINGS_MODULE=paperless.settings
|
||||||
addopts = --pythonwarnings=all
|
addopts = --pythonwarnings=all
|
||||||
env =
|
env =
|
||||||
PAPERLESS_PASSPHRASE=THISISNOTASECRET
|
|
||||||
PAPERLESS_SECRET=paperless
|
PAPERLESS_SECRET=paperless
|
||||||
PAPERLESS_EMAIL_SECRET=paperless
|
PAPERLESS_EMAIL_SECRET=paperless
|
||||||
|
|
||||||
@@ -15,4 +14,4 @@ env =
|
|||||||
source =
|
source =
|
||||||
./
|
./
|
||||||
omit =
|
omit =
|
||||||
*/tests
|
*/tests/*
|
||||||
|
Reference in New Issue
Block a user