Merge branch 'dev'

This commit is contained in:
Jonas Winkler 2020-11-18 02:02:26 +01:00
commit 8395bdfdf6
56 changed files with 2164 additions and 8913 deletions

View File

@ -1,3 +1,4 @@
/src-ui/.vscode
/src-ui/node_modules
/src-ui/dist
.git
@ -5,3 +6,7 @@
/consume
/media
/data
/docs
.pytest_cache
/dist
/scripts

View File

@ -5,23 +5,18 @@ python:
- "3.7"
- "3.8"
services:
- docker
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr
install:
- pip install --upgrade pipenv
- pipenv install --dev
- pipenv install --system --dev
script:
- cd src/
- pipenv run pytest --cov
- pipenv run pycodestyle
- cd ..
- docker build --tag=jonaswinkler/paperless-ng .
after_success:
- pipenv run coveralls

View File

@ -29,6 +29,7 @@ watchdog = "*"
pathvalidate = "*"
django-q = "*"
redis = "*"
imap-tools = "*"
[dev-packages]
coveralls = "*"

10
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "c0dfeedbac2e9b705267336349e6f72ba650ff9184affae06046db32299e2c87"
"sha256": "d6416e6844126b09200b9839a3abdcf3c24ef5cf70052b8f134d8bc804552c17"
},
"pipfile-spec": 6,
"requires": {},
@ -123,6 +123,14 @@
"index": "pypi",
"version": "==20.0.4"
},
"imap-tools": {
"hashes": [
"sha256:070929b8ec429c0aad94588a37a2962eed656a119ab61dcf91489f20fe983f5d",
"sha256:6232cd43748741496446871e889eb137351fc7a7e7f4c7888cd8c0fa28e20cda"
],
"index": "pypi",
"version": "==0.31.0"
},
"joblib": {
"hashes": [
"sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72",

Binary file not shown.

After

Width:  |  Height:  |  Size: 70 KiB

BIN
docs/_static/recommended_workflow.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 67 KiB

View File

@ -294,10 +294,14 @@ Documents can be stored in Paperless using GnuPG encryption.
.. danger::
Decryption is depreceated since paperless-ng 1.0 and doesn't really provide any
Decryption is depreceated since paperless-ng 0.9 and doesn't really provide any
additional security, since you have to store the passphrase in a configuration
file on the same system as the encrypted documents for paperless to work. Also,
paperless provides transparent access to your encrypted documents.
file on the same system as the encrypted documents for paperless to work.
Furthermore, the entire text content of the documents is stored plain in the
database, even if your documents are encrypted. Filenames are not encrypted as
well.
Also, the web server provides transparent access to your encrypted documents.
Consider running paperless on an encrypted filesystem instead, which will then
at least provide security against physical hardware theft.

View File

@ -3,25 +3,168 @@
The REST API
************
.. warning::
This section is not updated to paperless-ng yet.
Paperless makes use of the `Django REST Framework`_ standard API interface
because of its inherent awesomeness. Conveniently, the system is also
self-documenting, so to learn more about the access points, schema, what's
accepted and what isn't, you need only visit ``/api`` on your local Paperless
installation.
Paperless makes use of the `Django REST Framework`_ standard API interface.
It provides a browsable API for most of its endpoints, which you can inspect
at ``http://<paperless-host>:<port>/api/``. This also documents most of the
available filters and ordering fields.
.. _Django REST Framework: http://django-rest-framework.org/
The API provides 5 main endpoints:
* ``/api/correspondents/``: Full CRUD support.
* ``/api/document_types/``: Full CRUD support.
* ``/api/documents/``: Full CRUD support, except POSTing new documents. See below.
* ``/api/logs/``: Read-Only.
* ``/api/tags/``: Full CRUD support.
All of these endpoints except for the logging endpoint
allow you to fetch, edit and delete individual objects
by appending their primary key to the path, for example ``/api/documents/454/``.
In addition to that, the document endpoint offers these additional actions on
individual documents:
* ``/api/documents/<pk>/download/``: Download the original document.
* ``/api/documents/<pk>/thumb/``: Download the PNG thumbnail of a document.
* ``/api/documents/<pk>/preview/``: Display the original document inline,
without downloading it.
.. hint::
Paperless used to provide these functionality at ``/fetch/<pk>/preview``,
``/fetch/<pk>/thumb`` and ``/fetch/<pk>/doc``. Redirects to the new URLs
are in place. However, if you use these old URLs to access documents, you
should update your app or script to use the new URLs.
Searching for documents
#######################
Paperless-ng offers API endpoints for full text search. These are as follows:
``/api/search/``
================
Get search results based on a query.
Query parameters:
* ``query``: The query string. See
`here <https://whoosh.readthedocs.io/en/latest/querylang.html>`_
for details on the syntax.
* ``page``: Specify the page you want to retrieve. Each page
contains 10 search results and the first page is ``page=1``, which
is the default if this is omitted.
Result list object returned by the endpoint:
.. code:: json
{
"count": 1,
"page": 1,
"page_count": 1,
"results": [
]
}
* ``count``: The approximate total number of results.
* ``page``: The page returned to you. This might be different from
the page you requested, if you requested a page that is behind
the last page. In that case, the last page is returned.
* ``page_count``: The total number of pages.
* ``results``: A list of result objects on the current page.
Result object:
.. code:: json
{
"id": 1,
"highlights": [
],
"score": 6.34234,
"rank": 23,
"document": {
}
* ``id``: the primary key of the found document
* ``highlights``: an object containing parseable highlights for the result.
See below.
* ``score``: The score assigned to the document. A higher score indicates a
better match with the query. Search results are sorted descending by score.
* ``rank``: the position of the document within the entire search results list.
* ``document``: The full json of the document, as returned by
``/api/documents/<id>/``.
Highlights object:
Highlights are provided as a list of fragments. A fragment is a longer section of
text from the original document.
Each fragment contains a list of strings, and some of them are marked as a highlight.
.. code:: json
"highlights": [
[
{"text": "This is a sample text with a "},
{"text": "highlighted", "term": 0},
{"text": " word."}
],
[
{"text": "Another", "term": 1},
{"text": " fragment with a highlight."}
]
]
When ``term`` is present within a string, the word within ``text`` should be highlighted.
The term index groups multiple matches together and words with the same index
should get identical highlighting.
A client may use this example to produce the following output:
... This is a sample text with a **highlighted** word. ... **Another** fragment with a highlight. ...
``/api/search/autocomplete/``
=============================
Get auto completions for a partial search term.
Query parameters:
* ``term``: The incomplete term.
* ``limit``: Amount of results. Defaults to 10.
Results returned by the endpoint are ordered by importance of the term in the
document index. The first result is the term that has the highest Tf/Idf score
in the index.
.. code:: json
[
"term1",
"term3",
"term6",
"term4"
]
.. _api-file_uploads:
POSTing Documents
=================
POSTing documents
#################
File uploads in an API are hard and so far as I've been able to tell, there's
no standard way of accepting them, so rather than crowbar file uploads into the
REST API and endure that headache, I've left that process to a simple HTTP
POST.
The API provides a special endpoint for file uploads:
``/api/documents/post_document/``
POST a multipart form to this endpoint, where the form field ``document`` contains
the document that you want to upload to paperless. The filename is sanitized and
then used to store the document in the consumption folder, where the consumer will
detect the document and process it as any other document.
The endpoint will immediately return "OK." if the document was stored in the
consumption directory.

View File

@ -8,10 +8,8 @@ Changelog
paperless-ng 0.9.0
##################
* **Deprecated:** GnuPG. Don't use it. If you're still using it, be aware that it
offers no protection at all, since the passphrase is stored alongside with the
encrypted documents itself. This features will most likely be removed in future
versions.
* **Deprecated:** GnuPG. :ref:`See this note on the state of GnuPG in paperless-ng. <utilities-encyption>`
This features will most likely be removed in future versions.
* **Added:** New frontend. Features:
@ -38,6 +36,25 @@ paperless-ng 0.9.0
multi user solution, however, it allows more than one user to access the website
and set some basic permissions / renew passwords.
* **Modified [breaking]:** All new mail consumer with customizable filters, actions and
multiple account support. Replaces the old mail consumer. The new mail consumer
needs different configuration but can be configured to act exactly like the old
consumer.
* **Modified:** Changes to the consumer:
* Now uses the excellent watchdog library that should make sure files are
discovered no matter what the platform is.
* The consumer now uses a task scheduler to run consumption processes in parallel.
This means that consuming many documents should be much faster on systems with
many cores.
* Concurrency is controlled with the new settings ``PAPERLESS_TASK_WORKERS``
and ``PAPERLESS_THREADS_PER_WORKER``. See TODO for details on concurrency.
* The consumer no longer blocks the database for extended periods of time.
* An issue with tesseract running multiple threads per page and slowing down
the consumer was fixed.
* **Modified [breaking]:** REST Api changes:
* New filters added, other filters removed (case sensitive filters, slug filters)
@ -64,8 +81,8 @@ paperless-ng 0.9.0
* Rework of the code of the tesseract parser. This is now a lot cleaner.
* Rework of the filename handling code. It was a mess.
* Fixed some issues with the document exporter not exporting all documents when encountering duplicate filenames.
* Consumer rework: now uses the excellent watchdog library, lots of code removed.
* Added a task scheduler that takes care of checking mail, training the classifier and maintaining the document search index.
* Added a task scheduler that takes care of checking mail, training the classifier, maintaining the document search index
and consuming documents.
* Updated dependencies. Now uses Pipenv all around.
* Updated Dockerfile and docker-compose. Now uses ``supervisord`` to run everything paperless-related in a single container.
@ -77,6 +94,8 @@ paperless-ng 0.9.0
* ``PAPERLESS_DEBUG`` defaults to ``false``.
* The presence of ``PAPERLESS_DBHOST`` now determines whether to use PostgreSQL or
sqlite.
* ``PAPERLESS_OCR_THREADS`` is gone and replaced with ``PAPERLESS_TASK_WORKERS`` and
``PAPERLESS_THREADS_PER_WORKER``. Refer to the config example for details.
* Many more small changes here and there. The usual stuff.

View File

@ -20,7 +20,3 @@ places.
Copy ``paperless.conf.example`` to any of these locations and adjust it to your
needs.
.. warning::
TBD: explain config options.

View File

@ -36,6 +36,10 @@ The old admin is still there and accessible!
.. image:: _static/paperless-9-admin.png
Fancy mail filters!
.. image:: _static/paperless-11-mail-filters.png
Mobile support in the future? This doesn't really work yet.
.. image:: _static/paperless-10-mobile.png

View File

@ -23,6 +23,77 @@ There are multiple options available.
that need to be compiled, and that's already done for you in the release.
Overview of Paperless-ng
########################
Compared to paperless, paperless-ng works a little different under the hood and has
more moving parts that work together. While this increases the complexity of
the system, it also brings many benefits.
Paperless consists of the following components:
* **The webserver:** This is pretty much the same as in paperless. It serves
the administration pages, the API, and the new frontend. This is the main
tool you'll be using to interact with paperless. You may start the webserver
with
.. code:: shell-session
$ cd /path/to/paperless/src/
$ pipenv run gunicorn -c /usr/src/paperless/gunicorn.conf.py -b 0.0.0.0:8000 paperless.wsgi
or by any other means such as Apache ``mod_wsgi``.
* **The consumer:** This is what watches your consumption folder for documents.
However, the consumer itself does not consume really consume your documents anymore.
It rather notifies a task processor that a new file is ready for consumption.
I suppose it should be named differently.
This also used to check your emails, but that's now gone elsewhere as well.
Start the consumer with the management command ``document_consumer``:
.. code:: shell-session
$ cd /path/to/paperless/src/
$ pipenv run python3 manage.py document_consumer
* **The task processor:** Paperless relies on `Django Q <https://django-q.readthedocs.io/en/latest/>`_
for doing much of the heavy lifting. This is a task queue that accepts tasks from
multiple sources and processes tasks in parallel. It also comes with a scheduler that executes
certain commands periodically.
This task processor is responsible for:
* Consuming documents. When the consumer finds new documents, it notifies the task processor to
start a consumption task.
* Consuming emails. It periodically checks your configured accounts for new mails and
produces consumption tasks for any documents it finds.
* The task processor also performs the consumption of any documents you upload through
the web interface.
* Maintain the search index and the automatic matching algorithm. These are things that paperless
needs to do from time to time in order to operate properly.
This allows paperless to process multiple documents from your consumption folder in parallel! On
a modern multicore system, consumption with full ocr is blazing fast.
The task processor comes with a built-in admin interface that you can use to see whenever any of the
tasks fail and inspect the errors.
You may start the task processor by executing:
.. code:: shell-session
$ cd /path/to/paperless/src/
$ pipenv run python3 manage.py qcluster
* A `redis <https://redis.io/>`_ message broker: This is a really lightweight service that is responsible
for getting the tasks from the webserver and consumer to the task scheduler. These run in different
processes (maybe even on different machines!), and therefore, this is necessary.
* A database server. Paperless supports PostgreSQL and sqlite for storing its data. However, with the
added concurrency, it is strongly advised to use PostgreSQL, as sqlite has its limits in that regard.
Installation
############
@ -31,10 +102,12 @@ You can go multiple routes with setting up and running Paperless:
* The `docker route`_
* The `bare metal route`_
The `docker route`_ is quick & easy. This is the recommended route.
The `docker route`_ is quick & easy. This is the recommended route. This configures all the stuff
from above automatically so that it just works and uses sensible defaults for all configuration options.
The `bare metal route`_ is more complicated to setup but makes it easier
should you want to contribute some code back.
should you want to contribute some code back. You need to configure and
run the above mentioned components yourself.
Docker Route
============

View File

@ -2,9 +2,38 @@
Troubleshooting
***************
.. warning::
No files are added by the consumer
##################################
Check for the following issues:
* Ensure that the directory you're putting your documents in is the folder
paperless is watching. With docker, this setting is performed in the
``docker-compose.yml`` file. Without docker, look at the ``CONSUMPTION_DIR``
setting. Don't adjust this setting if you're using docker.
* Ensure that redis is up and running. Paperless does its task processing
asynchronously, and for documents to arrive at the task processor, it needs
redis to run.
* Ensure that the task processor is running. Docker does this automatically.
Manually invoke the task processor by executing
.. code:: shell-session
$ python3 manage.py qcluster
* Look at the output of paperless and inspect it for any errors.
* Go to the admin interface, and check if there are failed tasks. If so, the
tasks will contain an error message.
Consumer fails to pickup any new files
######################################
If you notice, that the consumer will only pickup files in the consumption
directory at startup, but won't find any other files added later, check out
the configuration file and enable filesystem polling with the setting
``PAPERLESS_CONSUMER_POLLING``.
This section is not updated to paperless-ng yet.
Consumer warns ``OCR for XX failed``
####################################

View File

@ -27,7 +27,7 @@ Each document has a couple of fields that you can assign to them:
a document either originates form, or is sent to.
* A *tag* is a label that you can assign to documents. Think of labels as more
powerful folders: Multiple documents can be grouped together with a single
tag, however, a single document can also have multiple tags. This is not
tag, however, a single document can also have multiple tags. This is not
possible with folders. The reason folders are not implemented in paperless
is simply that tags are much more versatile than folders.
* A *document type* is used to demarkate the type of a document such as letter,
@ -86,49 +86,63 @@ files from the scanner. Typically, you're looking at an FTP server like
IMAP (Email)
============
Another handy way to get documents into your database is to email them to
yourself. The typical use-case would be to be out for lunch and want to send a
copy of the receipt back to your system at home. Paperless can be taught to
pull emails down from an arbitrary account and dump them into the consumption
directory where the consumer will follow the
usual pattern on consuming the document.
You can tell paperless-ng to consume documents from your email accounts.
This is a very flexible and powerful feature, if you regularly received documents
via mail that you need to archive. The mail consumer can be configured by using the
admin interface in the following manner:
.. hint::
1. Define e-mail accounts.
2. Define mail rules for your account.
It's disabled by default. By setting the values below it will be enabled.
It's been tested in a limited environment, so it may not work for you (please
submit a pull request if you can!)
These rules perform the following:
.. danger::
1. Connect to the mail server.
2. Fetch all matching mails (as defined by folder, maximum age and the filters)
3. Check if there are any consumable attachments.
4. If so, instruct paperless to consume the attachments and optionally
use the metadata provided in the rule for the new document.
5. If documents were consumed from a mail, the rule action is performed
on that mail.
It's designed to **delete mail from the server once consumed**. So don't go
pointing this to your personal email account and wonder where all your stuff
went.
Paperless will completely ignore mails that do not match your filters. It will also
only perform the action on mails that it has consumed documents from.
.. hint::
The actions all ensure that the same mail is not consumed twice by different means.
These are as follows:
Currently, only one photo (attachment) per email will work.
* **Delete:** Immediately deletes mail that paperless has consumed documents from.
Use with caution.
* **Mark as read:** Mark consumed mail as read. Paperless will not consume documents
from already read mails. If you read a mail before paperless sees it, it will be
ignored.
* **Flag:** Sets the 'important' flag on mails with consumed documents. Paperless
will not consume flagged mails.
* **Move to folder:** Moves consumed mails out of the way so that paperless wont
consume them again.
So, with all that in mind, here's what you do to get it running:
.. caution::
1. Setup a new email account somewhere, or if you're feeling daring, create a
folder in an existing email box and note the path to that folder.
2. In ``/etc/paperless.conf`` set all of the appropriate values in
``PATHS AND FOLDERS`` and ``SECURITY``.
If you decided to use a subfolder of an existing account, then make sure you
set ``PAPERLESS_CONSUME_MAIL_INBOX`` accordingly here. You also have to set
the ``PAPERLESS_EMAIL_SECRET`` to something you can remember 'cause you'll
have to include that in every email you send.
3. Restart paperless. Paperless will check
the configured email account at startup and from then on every 10 minutes
for something new and pulls down whatever it finds.
4. Send yourself an email! Note that the subject is treated as the file name,
so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll
get what you expect. Also, you must include the aforementioned secret
string in every email so the fetcher knows that it's safe to import.
Note that Paperless only allows the email title to consist of safe characters
to be imported. These consist of alpha-numeric characters and ``-_ ,.'``.
The mail consumer will perform these actions on all mails it has consumed
documents from. Keep in mind that the actual consumption process may fail
for some reason, leaving you with missing documents in paperless.
.. note::
With the correct set of rules, you can completely automate your email documents.
Create rules for every correspondent you receive digital documents from and
paperless will read them automatically. The default acion "mark as read" is
pretty tame and will not cause any damage or data loss whatsoever.
.. note::
Paperless will process the rules in the order defined in the admin page.
You can define catch-all rules and have them executed last to consume
any documents not matched by previous rules. Such a rule may assign an "Unknown
mail document" tag to consumed documents so you can inspect them further.
Paperless is set up to check your mails every 10 minutes. This can be configured on the
'Scheduled tasks' page in the admin.
REST API
@ -136,6 +150,7 @@ REST API
You can also submit a document using the REST API, see :ref:`api-file_uploads` for details.
.. _usage-recommended_workflow:
The recommended workflow
@ -147,6 +162,10 @@ is as follows. This workflow also takes into account that some documents
have to be kept in physical form, but still ensures that you get all the
advantages for these documents as well.
The following diagram shows how easy it is to manage your documents.
.. image:: _static/recommended_workflow.png
Preparations in paperless
=========================
@ -156,7 +175,7 @@ Preparations in paperless
Processing of the physical documents
====================================
Keep a physical inbox. Whenever you receive a document that you need to
Keep a physical inbox. Whenever you receive a document that you need to
archive, put it into your inbox. Regulary, do the following for all documents
in your inbox:

View File

@ -59,22 +59,6 @@ PAPERLESS_CONSUMPTION_DIR="../consume"
#PAPERLESS_STATIC_URL="/static/"
# These values are required if you want paperless to check a particular email
# box every 10 minutes and attempt to consume documents from there. If you
# don't define a HOST, mail checking will just be disabled.
#PAPERLESS_CONSUME_MAIL_HOST=""
#PAPERLESS_CONSUME_MAIL_PORT=""
#PAPERLESS_CONSUME_MAIL_USER=""
#PAPERLESS_CONSUME_MAIL_PASS=""
# Override the default IMAP inbox here. If not set Paperless defaults to
# "INBOX".
#PAPERLESS_CONSUME_MAIL_INBOX="INBOX"
# Any email sent to the target account that does not contain this text will be
# ignored.
PAPERLESS_EMAIL_SECRET=""
# Specify a filename format for the document (directories are supported)
# Use the following placeholders:
# * {correspondent}
@ -143,6 +127,35 @@ PAPERLESS_EMAIL_SECRET=""
#### Software Tweaks ####
###############################################################################
# Paperless does multiple things in the background: Maintain the search index,
# maintain the automatic matching algorithm, check emails, consume documents,
# etc. This variable specifies how many things it will do in parallel.
#PAPERLESS_TASK_WORKERS=1
# Furthermore, paperless uses multiple threads when consuming documents to
# speed up OCR. This variable specifies how many pages paperless will process
# in parallel on a single document.
#PAPERLESS_THREADS_PER_WORKER=1
# Ensure that the product
# PAPERLESS_TASK_WORKERS * PAPERLESS_THREADS_PER_WORKER
# does not exceed your CPU core count or else paperless will be extremely slow.
# If you want paperless to process many documents in parallel, choose a high
# worker count. If you want paperless to process very large documents faster,
# use a higher thread per worker count.
# The default is a balance between the two, according to your CPU core count,
# with a slight favor towards threads per worker, and using as much cores as
# possible.
# If you only specify PAPERLESS_TASK_WORKERS, paperless will adjust
# PAPERLESS_THREADS_PER_WORKER automatically.
# If paperless won't find documents added to your consume folder, it might
# not be able to automatically detect filesystem changes. In that case,
# specify a polling interval in seconds below, which will then cause paperless
# to periodically check your consumption directory for changes.
#PAPERLESS_CONSUMER_POLLING=10
# When the consumer detects a duplicate document, it will not touch the
# original document. This default behavior can be changed here.
#PAPERLESS_CONSUMER_DELETE_DUPLICATES="false"
@ -186,12 +199,6 @@ PAPERLESS_EMAIL_SECRET=""
#
# By default, Paperless will attempt to use all available CPU cores to process
# a document, but if you would like to limit that, you can set this value to
# an integer:
#PAPERLESS_OCR_THREADS=1
# Customize the default language that tesseract will attempt to use when
# parsing documents. The default language is used whenever
# - No language could be detected on a document

View File

@ -2,6 +2,15 @@
set -e
VERSION=$1
if [ -z "$VERSION" ]
then
echo "Need a version string."
exit 1
fi
# source root directory of paperless
PAPERLESS_ROOT=$(git rev-parse --show-toplevel)
@ -42,6 +51,7 @@ mkdir "$PAPERLESS_DIST_APP/docker"
# the application itself
cp "$PAPERLESS_ROOT/.env" \
"$PAPERLESS_ROOT/.dockerignore" \
"$PAPERLESS_ROOT/CONTRIBUTING.md" \
"$PAPERLESS_ROOT/LICENSE" \
"$PAPERLESS_ROOT/Pipfile" \
@ -80,10 +90,12 @@ cp "$PAPERLESS_ROOT/docker/supervisord.conf" "$PAPERLESS_DIST_APP/docker/"
cd "$PAPERLESS_DIST_APP"
docker-compose build
docker build . -t "jonaswinkler/paperless-ng:$VERSION"
docker push "jonaswinkler/paperless-ng:$VERSION"
# works. package the app!
cd "$PAPERLESS_DIST"
tar -cJf paperless-ng.tar.xz paperless-ng/
tar -cJf "paperless-ng-$VERSION.tar.xz" paperless-ng/

View File

@ -12,7 +12,7 @@ from django.utils import timezone
from paperless.db import GnuPG
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory
from .models import Document, FileInfo
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class
from .signals import (
document_consumption_finished,
@ -25,139 +25,204 @@ class ConsumerError(Exception):
class Consumer:
"""
Loop over every file found in CONSUMPTION_DIR and:
1. Convert it to a greyscale pnm
2. Use tesseract on the pnm
3. Store the document in the MEDIA_ROOT with optional encryption
4. Store the OCR'd text in the database
5. Delete the document and image(s)
"""
def __init__(self, consume=settings.CONSUMPTION_DIR,
scratch=settings.SCRATCH_DIR):
def __init__(self):
self.logger = logging.getLogger(__name__)
self.logging_group = None
self.path = None
self.filename = None
self.override_title = None
self.override_correspondent_id = None
self.override_tag_ids = None
self.override_document_type_id = None
self.consume = consume
self.scratch = scratch
def pre_check_file_exists(self):
if not os.path.isfile(self.path):
raise ConsumerError("Cannot consume {}: It is not a file".format(
self.path))
self.classifier = DocumentClassifier()
os.makedirs(self.scratch, exist_ok=True)
self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
if settings.PASSPHRASE:
self.storage_type = Document.STORAGE_TYPE_GPG
if not self.consume:
def pre_check_consumption_dir(self):
if not settings.CONSUMPTION_DIR:
raise ConsumerError(
"The CONSUMPTION_DIR settings variable does not appear to be "
"set."
"set.")
if not os.path.isdir(settings.CONSUMPTION_DIR):
raise ConsumerError(
"Consumption directory {} does not exist".format(
settings.CONSUMPTION_DIR))
def pre_check_regex(self):
if not re.match(FileInfo.REGEXES["title"], self.filename):
raise ConsumerError(
"Filename {} does not seem to be safe to "
"consume".format(self.filename))
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
if Document.objects.filter(checksum=checksum).exists():
if settings.CONSUMER_DELETE_DUPLICATES:
os.unlink(self.path)
raise ConsumerError(
"Not consuming {}: It is a duplicate.".format(self.filename)
)
if not os.path.exists(self.consume):
raise ConsumerError(
"Consumption directory {} does not exist".format(self.consume))
def pre_check_directories(self):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
})
@transaction.atomic
def try_consume_file(self, file):
def try_consume_file(self,
path,
override_filename=None,
override_title=None,
override_correspondent_id=None,
override_document_type_id=None,
override_tag_ids=None):
"""
Return True if file was consumed
Return the document object if it was successfully created.
"""
self.path = path
self.filename = override_filename or os.path.basename(path)
self.override_title = override_title
self.override_correspondent_id = override_correspondent_id
self.override_document_type_id = override_document_type_id
self.override_tag_ids = override_tag_ids
# this is for grouping logging entries for this particular file
# together.
self.logging_group = uuid.uuid4()
if not re.match(FileInfo.REGEXES["title"], file):
return False
# Make sure that preconditions for consuming the file are met.
doc = file
self.pre_check_file_exists()
self.pre_check_consumption_dir()
self.pre_check_directories()
self.pre_check_regex()
self.pre_check_duplicate()
if self._is_duplicate(doc):
self.log(
"warning",
"Skipping {} as it appears to be a duplicate".format(doc)
)
if settings.CONSUMER_DELETE_DUPLICATES:
self._cleanup_doc(doc)
return False
self.log("info", "Consuming {}".format(self.filename))
self.log("info", "Consuming {}".format(doc))
# Determine the parser class.
parser_class = get_parser_class(doc)
parser_class = get_parser_class(self.filename)
if not parser_class:
self.log(
"error", "No parsers could be found for {}".format(doc))
return False
raise ConsumerError("No parsers abvailable for {}".format(self.filename))
else:
self.log("info", "Parser: {}".format(parser_class.__name__))
self.log("debug", "Parser: {}".format(parser_class.__name__))
# Notify all listeners that we're going to do some work.
document_consumption_started.send(
sender=self.__class__,
filename=doc,
filename=self.path,
logging_group=self.logging_group
)
document_parser = parser_class(doc, self.logging_group)
# This doesn't parse the document yet, but gives us a parser.
document_parser = parser_class(self.path, self.logging_group)
# However, this already created working directories which we have to
# clean up.
# Parse the document. This may take some time.
try:
self.log("info", "Generating thumbnail for {}...".format(doc))
self.log("debug", "Generating thumbnail for {}...".format(self.filename))
thumbnail = document_parser.get_optimised_thumbnail()
self.log("debug", "Parsing {}...".format(self.filename))
text = document_parser.get_text()
date = document_parser.get_date()
document = self._store(
text,
doc,
thumbnail,
date
)
except ParseError as e:
self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e))
document_parser.cleanup()
return False
else:
document_parser.cleanup()
self._cleanup_doc(doc)
raise ConsumerError(e)
self.log(
"info",
"Document {} consumption finished".format(document)
)
# Prepare the document classifier.
# TODO: I don't really like to do this here, but this way we avoid
# reloading the classifier multiple times, since there are multiple
# post-consume hooks that all require the classifier.
try:
classifier = DocumentClassifier()
classifier.reload()
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
logging.getLogger(__name__).warning(
"Cannot classify documents: {}.".format(e))
classifier = None
try:
self.classifier.reload()
classifier = self.classifier
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
# now that everything is done, we can start to store the document
# in the system. This will be a transaction and reasonably fast.
try:
with transaction.atomic():
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group,
classifier=classifier
)
return True
# store the document.
document = self._store(
text=text,
date=date
)
def _store(self, text, doc, thumbnail, date):
# If we get here, it was successful. Proceed with post-consume
# hooks. If they fail, nothing will get changed.
file_info = FileInfo.from_path(doc)
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group,
classifier=classifier
)
stats = os.stat(doc)
# After everything is in the database, copy the files into
# place. If this fails, we'll also rollback the transaction.
create_source_path_directory(document.source_path)
self._write(document, self.path, document.source_path)
self._write(document, thumbnail, document.thumbnail_path)
# Delete the file only if it was successfully consumed
self.log("debug", "Deleting file {}".format(self.path))
os.unlink(self.path)
except Exception as e:
raise ConsumerError(e)
finally:
document_parser.cleanup()
self.log(
"info",
"Document {} consumption finished".format(document)
)
return document
def _store(self, text, date):
# If someone gave us the original filename, use it instead of doc.
file_info = FileInfo.from_path(self.filename)
stats = os.stat(self.path)
self.log("debug", "Saving record to database")
created = file_info.created or date or timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime))
with open(doc, "rb") as f:
if settings.PASSPHRASE:
storage_type = Document.STORAGE_TYPE_GPG
else:
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
with open(self.path, "rb") as f:
document = Document.objects.create(
correspondent=file_info.correspondent,
title=file_info.title,
@ -166,7 +231,7 @@ class Consumer:
checksum=hashlib.md5(f.read()).hexdigest(),
created=created,
modified=created,
storage_type=self.storage_type
storage_type=storage_type
)
relevant_tags = set(file_info.tags)
@ -175,19 +240,30 @@ class Consumer:
self.log("debug", "Tagging with {}".format(tag_names))
document.tags.add(*relevant_tags)
self.apply_overrides(document)
document.filename = generate_filename(document)
create_source_path_directory(document.source_path)
self._write(document, doc, document.source_path)
self._write(document, thumbnail, document.thumbnail_path)
# We need to save the document twice, since we need the PK of the
# document in order to create its filename above.
document.save()
return document
def apply_overrides(self, document):
if self.override_title:
document.title = self.override_title
if self.override_correspondent_id:
document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
if self.override_document_type_id:
document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
if self.override_tag_ids:
for tag_id in self.override_tag_ids:
document.tags.add(Tag.objects.get(pk=tag_id))
def _write(self, document, source, target):
with open(source, "rb") as read_file:
with open(target, "wb") as write_file:
@ -196,13 +272,3 @@ class Consumer:
return
self.log("debug", "Encrypting")
write_file.write(GnuPG.encrypted(read_file))
def _cleanup_doc(self, doc):
self.log("debug", "Deleting document {}".format(doc))
os.unlink(doc)
@staticmethod
def _is_duplicate(doc):
with open(doc, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
return Document.objects.filter(checksum=checksum).exists()

View File

@ -1,9 +1,11 @@
import os
import tempfile
from datetime import datetime
from time import mktime
from django import forms
from django.conf import settings
from django_q.tasks import async_task
from pathvalidate import validate_filename, ValidationError
@ -18,15 +20,6 @@ class UploadForm(forms.Form):
raise forms.ValidationError("That filename is suspicious.")
return self.cleaned_data.get("document")
def get_filename(self, i=None):
return os.path.join(
settings.CONSUMPTION_DIR,
"{}_{}".format(
str(i),
self.cleaned_data.get("document").name
) if i else self.cleaned_data.get("document").name
)
def save(self):
"""
Since the consumer already does a lot of work, it's easier just to save
@ -35,15 +28,13 @@ class UploadForm(forms.Form):
"""
document = self.cleaned_data.get("document").read()
original_filename = self.cleaned_data.get("document").name
t = int(mktime(datetime.now().timetuple()))
file_name = self.get_filename()
i = 0
while os.path.exists(file_name):
i += 1
file_name = self.get_filename(i)
with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f:
with open(file_name, "wb") as f:
f.write(document)
os.utime(file_name, times=(t, t))
os.utime(f.name, times=(t, t))
async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))

View File

@ -1,249 +0,0 @@
import datetime
import imaplib
import logging
import os
import re
import time
import uuid
from base64 import b64decode
from email import policy
from email.parser import BytesParser
from dateutil import parser
from django.conf import settings
from .models import Correspondent
class MailFetcherError(Exception):
pass
class InvalidMessageError(MailFetcherError):
pass
class Loggable(object):
def __init__(self, group=None):
self.logger = logging.getLogger(__name__)
self.logging_group = group or uuid.uuid4()
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
})
class Message(Loggable):
"""
A crude, but simple email message class. We assume that there's a subject
and n attachments, and that we don't care about the message body.
"""
SECRET = os.getenv("PAPERLESS_EMAIL_SECRET")
def __init__(self, data, group=None):
"""
Cribbed heavily from
https://www.ianlewis.org/en/parsing-email-attachments-python
"""
Loggable.__init__(self, group=group)
self.subject = None
self.time = None
self.attachment = None
message = BytesParser(policy=policy.default).parsebytes(data)
self.subject = str(message["Subject"]).replace("\r\n", "")
self.body = str(message.get_body())
self.check_subject()
self.check_body()
self._set_time(message)
self.log("info", 'Importing email: "{}"'.format(self.subject))
attachments = []
for part in message.walk():
content_disposition = part.get("Content-Disposition")
if not content_disposition:
continue
dispositions = content_disposition.strip().split(";")
if len(dispositions) < 2:
continue
if not dispositions[0].lower() == "attachment" and \
"filename" not in dispositions[1].lower():
continue
file_data = part.get_payload()
attachments.append(Attachment(
b64decode(file_data), content_type=part.get_content_type()))
if len(attachments) == 0:
raise InvalidMessageError(
"There don't appear to be any attachments to this message")
if len(attachments) > 1:
raise InvalidMessageError(
"There's more than one attachment to this message. It cannot "
"be indexed automatically."
)
self.attachment = attachments[0]
def __bool__(self):
return bool(self.attachment)
def check_subject(self):
if self.subject is None:
raise InvalidMessageError("Message does not have a subject")
if not Correspondent.SAFE_REGEX.match(self.subject):
raise InvalidMessageError("Message subject is unsafe: {}".format(
self.subject))
def check_body(self):
if self.SECRET not in self.body:
raise InvalidMessageError("The secret wasn't in the body")
def _set_time(self, message):
self.time = datetime.datetime.now()
message_time = message.get("Date")
if message_time:
try:
self.time = parser.parse(message_time)
except (ValueError, AttributeError):
pass # We assume that "now" is ok
@property
def file_name(self):
return "{}.{}".format(self.subject, self.attachment.suffix)
class Attachment(object):
SAFE_SUFFIX_REGEX = re.compile(
r"^(application/(pdf))|(image/(png|jpeg|gif|tiff))$")
def __init__(self, data, content_type):
self.content_type = content_type
self.data = data
self.suffix = None
m = self.SAFE_SUFFIX_REGEX.match(self.content_type)
if not m:
raise MailFetcherError(
"Not-awesome file type: {}".format(self.content_type))
self.suffix = m.group(2) or m.group(4)
def read(self):
return self.data
class MailFetcher(Loggable):
def __init__(self, consume=settings.CONSUMPTION_DIR):
Loggable.__init__(self)
self._connection = None
self._host = os.getenv("PAPERLESS_CONSUME_MAIL_HOST")
self._port = os.getenv("PAPERLESS_CONSUME_MAIL_PORT")
self._username = os.getenv("PAPERLESS_CONSUME_MAIL_USER")
self._password = os.getenv("PAPERLESS_CONSUME_MAIL_PASS")
self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")
self._enabled = bool(self._host)
if self._enabled and Message.SECRET is None:
raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined")
self.last_checked = time.time()
self.consume = consume
def pull(self):
"""
Fetch all available mail at the target address and store it locally in
the consumption directory so that the file consumer can pick it up and
do its thing.
"""
if self._enabled:
# Reset the grouping id for each fetch
self.logging_group = uuid.uuid4()
self.log("debug", "Checking mail")
for message in self._get_messages():
self.log("info", 'Storing email: "{}"'.format(message.subject))
t = int(time.mktime(message.time.timetuple()))
file_name = os.path.join(self.consume, message.file_name)
with open(file_name, "wb") as f:
f.write(message.attachment.data)
os.utime(file_name, times=(t, t))
self.last_checked = time.time()
def _get_messages(self):
r = []
try:
self._connect()
self._login()
for message in self._fetch():
if message:
r.append(message)
self._connection.expunge()
self._connection.close()
self._connection.logout()
except MailFetcherError as e:
self.log("error", str(e))
return r
def _connect(self):
try:
self._connection = imaplib.IMAP4_SSL(self._host, self._port)
except OSError as e:
msg = "Problem connecting to {}: {}".format(self._host, e.strerror)
raise MailFetcherError(msg)
def _login(self):
login = self._connection.login(self._username, self._password)
if not login[0] == "OK":
raise MailFetcherError("Can't log into mail: {}".format(login[1]))
inbox = self._connection.select(self._inbox)
if not inbox[0] == "OK":
raise MailFetcherError("Can't find the inbox: {}".format(inbox[1]))
def _fetch(self):
for num in self._connection.search(None, "ALL")[1][0].split():
__, data = self._connection.fetch(num, "(RFC822)")
message = None
try:
message = Message(data[0][1], self.logging_group)
except InvalidMessageError as e:
self.log("error", str(e))
else:
self._connection.store(num, "+FLAGS", "\\Deleted")
if message:
yield message

View File

@ -3,10 +3,10 @@ import os
from django.conf import settings
from django.core.management.base import BaseCommand
from django_q.tasks import async_task
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
from documents.consumer import Consumer
from watchdog.observers.polling import PollingObserver
try:
from inotify_simple import INotify, flags
@ -16,13 +16,10 @@ except ImportError:
class Handler(FileSystemEventHandler):
def __init__(self, consumer):
self.consumer = consumer
def _consume(self, file):
if os.path.isfile(file):
try:
self.consumer.try_consume_file(file)
async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
except Exception as e:
# Catch all so that the consumer won't crash.
logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
@ -37,7 +34,7 @@ class Handler(FileSystemEventHandler):
class Command(BaseCommand):
"""
On every iteration of an infinite loop, consume what we can from the
consumption directory, and fetch any mail available.
consumption directory.
"""
def __init__(self, *args, **kwargs):
@ -45,12 +42,6 @@ class Command(BaseCommand):
self.verbosity = 0
self.logger = logging.getLogger(__name__)
self.file_consumer = None
self.mail_fetcher = None
self.first_iteration = True
self.consumer = Consumer()
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
@ -66,9 +57,6 @@ class Command(BaseCommand):
self.verbosity = options["verbosity"]
directory = options["directory"]
for d in (settings.ORIGINALS_DIR, settings.THUMBNAIL_DIR):
os.makedirs(d, exist_ok=True)
logging.getLogger(__name__).info(
"Starting document consumer at {}".format(
directory
@ -78,11 +66,16 @@ class Command(BaseCommand):
# Consume all files as this is not done initially by the watchdog
for entry in os.scandir(directory):
if entry.is_file():
self.consumer.try_consume_file(entry.path)
async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
# Start the watchdog. Woof!
observer = Observer()
event_handler = Handler(self.consumer)
if settings.CONSUMER_POLLING > 0:
logging.getLogger(__name__).info('Using polling instead of file'
'system notifications.')
observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
else:
observer = Observer()
event_handler = Handler()
observer.schedule(event_handler, directory, recursive=True)
observer.start()
try:

View File

@ -9,13 +9,11 @@ from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule('documents.tasks.train_classifier', name="Train the classifier", schedule_type=Schedule.HOURLY)
schedule('documents.tasks.index_optimize', name="Optimize the index", schedule_type=Schedule.DAILY)
schedule('documents.tasks.consume_mail', name="Check E-Mail", schedule_type=Schedule.MINUTES, minutes=10)
def remove_schedules(apps, schema_editor):
Schedule.objects.filter(func='documents.tasks.train_classifier').delete()
Schedule.objects.filter(func='documents.tasks.index_optimize').delete()
Schedule.objects.filter(func='documents.tasks.consume_mail').delete()
class Migration(migrations.Migration):

View File

@ -113,6 +113,7 @@ class DocumentType(MatchingModel):
class Document(models.Model):
# TODO: why do we need an explicit list
TYPE_PDF = "pdf"
TYPE_PNG = "png"
TYPE_JPG = "jpg"
@ -291,7 +292,7 @@ class FileInfo:
non_separated_word=r"([\w,. ]|([^\s]-))"
)
)
# TODO: what is this used for
formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
REGEXES = OrderedDict([
("created-correspondent-title-tags", re.compile(

View File

@ -41,15 +41,16 @@ def get_parser_class(doc):
Determine the appropriate parser class based on the file
"""
parsers = []
for response in document_consumer_declaration.send(None):
parsers.append(response[1])
options = []
for parser in parsers:
result = parser(doc)
if result:
options.append(result)
# Sein letzter Befehl war: KOMMT! Und sie kamen. Alle. Sogar die Parser.
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
parser_test = parser_declaration["test"]
if parser_test(doc):
options.append(parser_declaration)
if not options:
return None

View File

@ -6,14 +6,10 @@ from whoosh.writing import AsyncWriter
from documents import index
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from documents.mail import MailFetcher
from documents.consumer import Consumer, ConsumerError
from documents.models import Document
def consume_mail():
MailFetcher().pull()
def index_optimize():
index.open_index().optimize()
@ -54,3 +50,27 @@ def train_classifier():
logging.getLogger(__name__).error(
"Classifier error: " + str(e)
)
def consume_file(path,
override_filename=None,
override_title=None,
override_correspondent_id=None,
override_document_type_id=None,
override_tag_ids=None):
document = Consumer().try_consume_file(
path,
override_filename=override_filename,
override_title=override_title,
override_correspondent_id=override_correspondent_id,
override_document_type_id=override_document_type_id,
override_tag_ids=override_tag_ids)
if document:
return "Success. New document id {} created".format(
document.pk
)
else:
raise ConsumerError("Unknown error: Returned document was null, but "
"no error message was given.")

File diff suppressed because it is too large Load Diff

View File

@ -1,208 +0,0 @@
Return-Path: <sender@example.com>
X-Original-To: sender@mailbox4.mailhost.com
Delivered-To: sender@mailbox4.mailhost.com
Received: from mx8.mailhost.com (mail8.mailhost.com [75.126.24.68])
by mailbox4.mailhost.com (Postfix) with ESMTP id B62BD5498001
for <sender@mailbox4.mailhost.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
Received: from localhost (localhost.localdomain [127.0.0.1])
by mx8.mailhost.com (Postfix) with ESMTP id B41796F190D
for <sender@mailbox4.mailhost.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
X-Spam-Flag: NO
X-Spam-Score: 0
X-Spam-Level:
X-Spam-Status: No, score=0 tagged_above=-999 required=3
tests=[RCVD_IN_DNSWL_NONE=-0.0001]
Received: from mx8.mailhost.com ([127.0.0.1])
by localhost (mail8.mailhost.com [127.0.0.1]) (amavisd-new, port 10024)
with ESMTP id 3cj6d28FXsS3 for <sender@mailbox4.mailhost.com>;
Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
Received: from smtp.mailhost.com (smtp.mailhost.com [74.55.86.74])
by mx8.mailhost.com (Postfix) with ESMTP id 527D76F1529
for <paperless@example.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
Received: from [10.114.0.19] (nl3x.mullvad.net [46.166.136.162])
by smtp.mailhost.com (Postfix) with ESMTP id 9C52420C6FDA
for <paperless@example.com>; Thu, 4 Feb 2016 22:01:16 +0000 (UTC)
To: paperless@example.com
From: Daniel Quinn <sender@example.com>
Subject: Test 0
Message-ID: <56B3CA2A.6030806@example.com>
Date: Thu, 4 Feb 2016 22:01:14 +0000
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101
Thunderbird/38.5.0
MIME-Version: 1.0
Content-Type: multipart/mixed;
boundary="------------090701020702030809070008"
This is a multi-part message in MIME format.
--------------090701020702030809070008
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 7bit
The secret word is "paperless" :-)
--------------090701020702030809070008
Content-Type: application/pdf;
name="test0.pdf"
Content-Transfer-Encoding: base64
Content-Disposition: attachment;
filename="test0.pdf"
JVBERi0xLjQKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0
ZURlY29kZT4+CnN0cmVhbQp4nFWLQQvCMAyF7/kVOQutSdeuHZSA0+3gbVDwIN6c3gR38e/b
bF4kkPfyvReyjB94IyFVF7pgG0ze4TLDZYevLamzPKEvEFqbMEZfq+WO+5GRHZbHNROLy+So
UfFi6g7/RyusEpUl9VsQxQTlHR2oV3wUEzOdhOnXG1aw/o1yK2cYCkww4RdbUCevCmVuZHN0
cmVhbQplbmRvYmoKCjMgMCBvYmoKMTM5CmVuZG9iagoKNSAwIG9iago8PC9MZW5ndGggNiAw
IFIvRmlsdGVyL0ZsYXRlRGVjb2RlL0xlbmd0aDEgMTA4MjQ+PgpzdHJlYW0KeJzlOWt0G9WZ
95uRbNmWLckPWY4SaRTFedmybI8T4rw8sS3ZiZ1YfqWSCbFkS7YEtiQkJSE8GlNeOQ5pUmh5
Zkt2l+XQNl3GhLaBpcWw0D19UGALLRRS0gM9nD0lxVBK9wCx97tXI0UJAc727L8d+c587/u9
7p0rOZXYEyJaMkV4Io1OBuLOqmqBEPJLQqB0dG9K2NRTsQHhM4Rw/zkWH5+870e7PiRE9Rgh
+Y+NT+wf+/b3e4YI0YYJKX41HAoEfxj6vUjIIgltrA0jYef8/nzEr0F8WXgydY2bP7QO8WOI
SxOx0cDxxbUmxN9AfOlk4Jr4apWLI8SMKBGigcmQpYXrRBx9KtobjyVTQbJsgZDl91B+PBGK
d9838hzipwjhjyIN8EMvLYJ5FOd4lTovX1NQWKQtLtGR/3eX+jCpIJ3qTURH4ux+wcWfIFXk
XkIW3qXY+ft898LH/5deaNKPe8hD5DFymLxGrlAYbuIhEbIHKbnX0+QlpNLLQ4bId8n055g9
QU4hPy3nJ0doJJe8PORucpL8xwWzeMgkuQ59+QF5DRrIz7BVYuQD0JAbyXNo9QOkbb+UKa4E
b2MMHMuhvk7u5w6RbdzbiNxLOZyT05NnyTHYjZZTGOfhbMQbP2P0NnID3vtJmOxFmF3qTZ/+
jhQs/AWjuoFsI18jW8hEjsaT8ABfiPUbIA9gTp9mNGeGmd/JX8n9kOPO3YnIN8g4jgBg7Nxh
fsvnZOh/ffGDpBhW8dWk4FJcrono5j/mGhc+5JeRQjK4MJehLXQt/IUPzEdVw6rF6k2qX3zR
HHnfUE2iNln44/x180H1DvVDWK2HcePouHzI5x0c6O/r9fTs2N7dtW1rZ4fb1d7WukVq2bxp
44b1zesuW7umod5Z56hduWJ59TL7UpvVVG7Q60qKiwoLNPl5ahXPAakVZPC7ZL5aMLgDdpc9
0OmoFVymcLuj1mV3+2UhIMj4UC23d3Yykj0gC35BXo6PQA7ZL0soOXaRpJSWlLKSoBc2ko10
CrsgP99uF07BUK8X4cPtdp8gn2XwdgarljOkGBGbDTWYV9RbwSW794anXX70EWaKCtvsbaFC
Ry2ZKSxCsAgheaU9PgMrNwMDuJWu9TMc0RTTaTFSVyAoe3q9rnazzeZz1G6VS+ztjEXamEk5
r03OZyaFCHWdHBJmamenbz+lJyP+Gm3QHgzs8sp8AHWnedf09G2yoUZeZW+XV137tgkjD8m1
9naXXEOtdvVl5+k6PyXI6mq9XZj+K8Fw7GffvZASUCh51fq/EgrKXJsMfV4bvcxuzPX0tNsu
uKf904FTC1MjdkFvn57RaqfjLkw38XjRxKmFJw6ZZfftPlnvD8N6nxK6u69LLuu93Ctz1W4h
HEAK/rXYbevMNkNWxvN5bIJpweRghm02moZDpyQygog81etN4wIZMT9KJGeNT+b8lDOb4VQM
Us5UhpNV99uxtl393mlZVb01aHdhxg8F5KkR7K4raWHsernkI7PNPl1qEJqdPiYroFdbgxFB
Vi/HJKFWrgL2DVWZ1jOk5KP046wZJ1huKBWa7WiG2nHZXX7lb2/YhAYETHRnTboRBryy1I6A
FFAq5pqpd6JGwI8Fi7SzYspOe1wut7dmq0vdckX6vUxFUZPL22TiH1W0ZKeLrSvBNe1vT7tA
bdl7vY8TceHMTJNgPimSJuJrp8LGNuyy5a5pb3BMtvrNQVx3Y4LXbJMlH1bYZ/eGfLTtMEOr
zphZc/hYrwx4u/rtXb1D3nWKI2kGNaeqdl1kxu41p81gA8qaao3g5cy8DwX1SBDcCNhbN+Jd
zq/W4NBjwhmVNm7rRsELZpKRRjfkVYIr1K7IUfwCo2raTm2dGWt5FEU7bZ1mm8+Wvhy1HLIF
ZWLU0NCkdmZYuE0hQ4P92dbJSDSXJtr0gtcesvvsYUGWPF4aG00Py7KSDJZzpVYDF2A5ycI0
ERuyMwhNpuyuMecmV+5geBbtvIi9NcMWpjX2rv5patyuGCTo+VaZ0BaW1hnMbC+gC9qOe6+g
xyXNFvT0jCTRxRxeT43Ytwan7f3ejUwa95MbzNfSuUpJF3QNtDpqcWtrnbHDwd4ZCQ72D3kf
1+O58OCA91EOuDZ/q29mGfK8jwv40mBUjlIpkSICRailPkQ0TN78uETIFOOqGIHho6eAMJom
QwMyeopL0/TpiZaziSTCIUeV5kgZaRXSNGnaFKOxa4bQlEmFakkjFUharpgzzwAlPYqUJ/Ac
WwDkpBaKwTyDWn2MfAqmZgokc1piCiWktIcHB89PPTjkPanFt7OZ3XGiVnphu5jCWGx8rbiE
IG2U633hab+PLjZixNLgH8hg34xlsm9GR/K0cqE91CoX2VspvYXSW9L0PErPxxYFI6D6FNbe
IwPtgMu9NlySwqKfmaf1Z2mlfLipTOv/6MCMVeP3hqfxDFoOG6XTpVwRp+ErjFqigQJeoykw
8AW831fAl3KEG/aR0hYj6IxwxghPGeGIEQ4YYdgISBQY/ao5I7xghOOMFzdCjxGsjJGmy0Z4
gLFiTE0yQj0TIEZ4k3GnGL2eUTYssHnSakcYo4fx5hhdzsyRVhCYzhwzNMummWJcdM2ZmeOK
7HV15koo1+6L6J/hUB5pqTEQ0cTuBtHkHN59hWgohcpmg9hQb1tzmcG+VAd2g81gX1EHNWCo
rIANr4jnrjC3qY61my0/v6bhlTVm1d3lL8GG+edeyi/65CrzGnqgAlKOJ7c/4neCJeQJaT8p
L68qLikpqCqwWJcs8viWkHJEKqs8Pm1lRRnHqdWGPp9af9wKZ6wwawW9FYgVmhE5aoW4FfxW
8FhBskK9FQQrWBkbWVMZLrJeZJqyFY7n0HOTk0hckAAldoy6RaSAyNJQCs0Ye/rTUA/l+ZtB
bDRWYOA0G032pfkKuGKNDdz5nT9qufb6xPxVNzy0+6YD88F9t0Mj/1G4btXGr9927q4qh6OK
231iybkyCqk5kwMXTg2eT0vV3aQIvy39gzRGtNo8g6HSyBf0+wgPep6vkCpKPb4KndagM3h8
uorySlBVQvOHlXC0Erh4JfgrwVMJUiXMVoJcCccZKlSCvhJIJcwxCormSl7YIzQFwywL2fKT
RSb9r7D4LAEGUQk+z750+ZqmtZgA/nzQ10mOWkmqdUiF/zhfdfwWqFG9mcalT9bTOHmhiq7B
gYV3uV/zz5GVxCc12fLLFxVjS6xaXWzjKystHp+5Us8XeXz5vHFqNcRXg381eFaDsBoeWQ3D
q6FnNWT8JVgewmpUSrA26QKhg1kPV6wRK41i45omJ9RxzN3KCvuK5faleRXlxkoLz/165vvu
79Q7GrqueeZeX2hX43eOjt/vXL0m0Tu4fcedQy120Nx+dEnpOze1P3Rt0xJb+6j7+iPW5yed
nvbmHYsa69p20q8ZpHPhXf5q/mlixt1lUmoxaKqrVYJWW6Xi8di/tHBpr89UYTAsxooZrAZO
yxsMRFNozFdhjBWkwuMj+qkVMLwCpBWAwBVYBEw+MbEhljY708knzawn0yvQoESp9N8KDNbQ
tBlaYE3TcrYu16yF/BKoKBcb114GL933jT3z82WJmfe3Hr/ncMe2YP/Sdf8E5KZbh4+0jzby
T3/1a+duqXLsToBp93VbeNWdgV3OPc/b5y0q9e6obDWxNYs1c6huJEbSIa0oLCnJL+P5SpNK
W6T1+Aryi3S4pg29PmJ8wASyCVpM4DTRMiUybSSKivfNpc2NjbSH1NhABvuaFhArxAq7oRzr
dFlFCcAO//B1N4RafvvbDfXr++03lyfGuTsdK155ZeDcgS2t+i0mK8u5B3Puxh6qIIvJYWmo
CkC3SFOhq1hiqSKY6CprFSa6qkpbWmr0+Er1WnWvT2uctYBsgeMWOGqBKQvELeC3gMcCxAKb
8SFZoN4CggX0FphjciiU2R2yO+MVSnFoRUzOzMJINx5bGxXlFqBpx2CwBQ3YdYKhArDlbE3L
QbXpwPjab9bX/8vO13/xq6cgMn93OAZ37ILXSqfv9ZQWrbPWvQvqjz6YH+uDYw8/ePJeGus2
jPUd3C/LcMecknrKVUWkqkqv0lusZXqPrwz3A4yY5GOD5eurUIGr7PVxRtwGO3J3RsI2wSlG
SQN+RldWvxLk+Z0v04HnNz4WXnWeXTA0leJKWr4JcNHT9gNWPMNyu8D9+uq75w/87uWJWN63
oT01/9/z1qmbrx7yJeY/dQ/BH/4GUGm75UOT4+PHqxzw/E/+bQX3joHVcwfG+CjWsxA77Anp
RoO6iKhJpUlT4vFp9Fy5BwMSTEBMcMYEHhPUm0BvgjmGvmiCWdZ1x01w1ARTJoibwG8CyQRp
lQ0PMJKHkeoZVc8YufrHmWZaDe9XfO6bMbtdZpdpNkFYfL0tsy/mNyn7DPYC/+h858uvvvrG
b3732FdvvWnPvhtvnoLX5w3z7//507/95dVnnjjz1o+fTb8baR52YB6MxC9txCwY1UbMgg7f
hhq9sZwv7/XxRvR8c24kcyyGdABIf8QEw3TxZd3fnd3MxVxfq7E/BQPbFA10UxTSa5Df0XBi
aP6y/3rttuOX1fSn5j/85+/dMdG8bBW8/6dz1vmPH3LOh1/+gY36akZfT/Mn0NdvScOktFil
KigtqDSpy4xl2IpGnQqPpX2+Yr1RW4D+Vxxn2Z7NJL/5TE49CCtgtm5yJpw0RTBBbtpzX9NE
eUUrj5yXNH0H0K5UenQFXY1VtGOh+fj1E18Hcd/8nzUdT7TMXQMW0J6wcu9UOT69r8rRvaIZ
yrkxfFPRGPGdnFeF9WiAR6UFgzZv8WIbWbnS4bBpebGxoc7ja9CttC02aB01Do/PqqupqMrL
Kygo7/MV6FfgMYev7vPx+r0i7BRhrQjLRDCKkCfCRyK8LcLLIvxUhAdFuEuEERHAI0K7CPVM
rlwElQjhuYzgYyKkRJBEaGJs5H0owusizIogMxs3ixAUFRNpGX1G7EURnhXheyIcZWJXibBB
BCEzx7r0BMdF8IswkJmjnGm+zTS/KcIUTi/V5PDNTPdt5gAnM4E4mx5n1YmgUdbL8BcfMy88
heYcxM6r5wjlbE6Z45lyPsuc0CqzJzTWAOyEVknvVZA9ppVw+edPbcsvOrZ1PSy59izZ/kL7
3P75wduPL3K5WioMh+dbDw0Oem86PL9z3z4o4/0165uaa1rn/6Qc5LwnNIXFqrVbMmi/b8m5
quyBh/WRE5vhD9hHi8msdAMpKzMVabX5pvwllsV40l2sK0PEaPL4Co0VpbRt9LRtHrTA2xZ4
1gL4QlFZoBmRb1ogZYGgBQYs0G6BJgsss4CZsfHNxuW+1/Bt9qIFsq+8LD03o8N/18n3wnPv
RRls3/6v69Pn3t7BITz4Xnn11aDl/bXN2WOvt39YOfcq58HbFt6C/eQVPPeapCKSl6ct5gvu
v5wvIy3KmRP3qpwDJ+x3NTW53KLo3tXQ2dkgut3s/y30Pzblq28Z1m38K2dN/9b/yzuXdJ7/
JXfhrbwqNf0FXJMloV6+bd5FvpJLueDS5zXjN8a3SLWKkHKumdTwS8gAR397Pkw6ES/Hpwd5
23DsQHgHPs2oU4NPJ0eUX9KfgR3wDLcaP8e4t/kh/pcqj+ohtSlvY97P895VZtWTRhoDi0SP
/bILgX/nf0p4xrVANOvbzqyfgJI7FZgj+WRMgXk8i04qsAplDiqwmpSQexQ4j+jIQwqcT64l
P1BgDX43dipwASmBNgUuhCj0KnARWcw9lf0vVx33ugIXkzV8gQKXkEX8Zuq9iv46f4L3KjAQ
QaVSYI6UqJYpME/WqhoVWIUyYQVWk8WqgwqcRyyqBxU4n3yoekaBNWSl+ocKXEAWq3+vwIXc
G+qPFbiIrNP8RoG1ZFdBiQIXkysLrlTgEtJU8HJ7ZDySilwbCgrBQCogjMbi+xOR8XBKWDm6
Smisb6gXOmKx8YmQ0BZLxGOJQCoSi9YVtl0s1ij0oYnOQKpW2BodreuOjITSskJ/KBEZ6wuN
75kIJLYkR0PRYCghOISLJS7Gd4YSSYo01tXX1zWc514sHEkKASGVCARDk4HEVUJs7EJHhERo
PJJMhRJIjESFwbr+OsETSIWiKSEQDQoDWcWesbHIaIgRR0OJVACFY6kwunrlnkQkGYyM0tmS
ddkIctLRnwrtDQnbA6lUKBmLtgaSOBd6NhCJxpK1wr5wZDQs7AskhWAoGRmPInNkv3ChjoDc
AMYSjcb2osm9oVr0eywRSoYj0XEhSUNWtIVUOJCiQU+GUonIaGBiYj/WbDKOWiNYpH2RVBgn
ngwlhR2hfUJfbDIQ/W5d2hXMzRgmVYhMxhOxvcxHR3I0EQpFcbJAMDASmYik0Fo4kAiMYsYw
bZHRJMsIJkKIB6IO155ELB5CT7/S0X1eEB1MZzMZm9iLM1PpaCgUpDOi23tDE6iEE0/EYlfR
eMZiCXQ0mAo7cjwfi0VTqBoTAsEgBo7Zio3umaR1wjSnMs4FRhMx5MUnAim0MpmsC6dS8fVO
5759++oCSmlGsTJ1aNn5RbzU/nhIqUeCWpmc6MbyR2np9rD60iD6t3YLPXHMjxudExSBWiHT
mg11DcoUmMZIPJWsS0Ym6mKJcWePu5u0kwgZx5HCcS0JkSARcAQQDyA0SmIkTvaTBJMKI1Ug
K5G6Cp+NpJ404BBIB0rFkD+B+gJpQziBWvQeYHZjJErq8FtE25daa0SoT/Gik2nXIrQV9UfR
QjfqjSA3165A+hklgvss1Rwne9CPAFK2kCRqhVAmyCQE4sDxZTa+jL+TQckspxH9qsdPHXp/
Kd0vsxxBWwLLdYpxqK+TzP+rkBZDvS/KiIByIVa/JHJCDAsyq9T2IEr0MykP06S5SLHZokxq
4BIz9uCMY6g/ymqZkRxltmlPpC3HEA4rWb0SM55gHgSZXia2JM782Rpcujv6mXd72ZzbGZ3i
ScZrRTypxJXO2QDzIoZUmot96AmdN8zgAMtnkGnTLosqmiPYd8IXziMougGlLlE2x17FS6pT
q+R7jN2TbN4oziEw/9JVvnBugeUpwLKervQkclNMdhTpE/jZr6yzScxKeq4RZSXtY+syrEQ8
yewKZAc+97GuiLG6RW1LWY3PZyXdN2NKpwpMN45wjEWRyaOD1YZGEmKeUijA1v4IakywudO+
hVl3BFhtQ0qtUyyCTL6CSqTU6zijOIiL9QVd8SElp1/BnaL7khbTGcztTVqTCeZvMsd2lHkb
zMaYzjaVmlBmSkc8wXakq7L1GWP9ls5okFlzfE7Ox1huUsqsMeZRED/piqd7K4a6e1g90usp
3c2pz2QuwPIbU/TibF9KKb5MsvURZh0YJ+vxbOlE7+injvVh7qoZVdZMneKz8+/Wo37FWQZz
10ci68sk+titrP5odtXtyVm/mUr04x7UzfaLuNI/biVzwkUW6Kq5eNdsYPvlhVGkuzGCeIr5
k2S5rGMxjCO/B2foZufo9DcHG/p0iWumwLNlBEIEIAzjpIxYwU92wDAZhC1kE0j4lJDXis82
xOmzDjaRKZTbhPTNiG9E+gbcPK14b8HRg+MIDhWOtEQ9Sjjx6VRwB+K1qPEC3oENSm1BKn1u
Q7wTnx3K0410Fz5dCr4VcXwSP+TjQbyF3Z8ClXQSzpyDF86BcA4OfAKeT2Dqg6MfcO/PrbI+
MvfUHNfz3vB7j7zH178HuvdAQ87qz3rO+s/Gzx4/m1eoexe05E9geOvMOuubm04P/n7TG4Pk
NEZ2uv605/TUafm0+jTwg2/wRqt+Vpitn43PTs2+OHtmdm5WM/WToz/hfvyk06p70vokZz3Z
c/LASd7/MOgetj7Mee73388dPQa6Y9ZjzmP8fffWWe/tsFjvvmuF9cxdc3dxpxZmT95VbHA/
CT3QTTZhDnec5Besj2ypgO0Ylg7vVhxOHD04YjiO4MDvPShuxeGEbmkdP/wtKLrDfEfNHdfd
cegOdfzWqVuP3spP3XL0Fu6RvU/t5ZKeVdZYtMYa7VhtrRJNg/kiP5iH0+Ds0taR6pVu/7Bk
HUahy4fqrUMdq6xlYumgGgNWoaCOt/ItfA8f44/wT/H5mj6PxdqL44xnzsNJngKtW9dj7XH2
8KcWzkihLhta2xbfNrWN3+peZe3sWGfVdVg7nB0vdLzZ8V5H3nAHPIB/7kfcT7l5yb3K6Zbc
Fpt7cad50ChWDBpAN6gXdYMcYKFFMujULeg4nW5Yd0DH60gL4aaMoIZTcHRmoL+mputU/kJf
l6zxXC7DQbm6n96l3iE576BMBocu984AfN13y+HDpHVJl9zY75X9S3xdchABiQJTCOiXzBhJ
qy+ZTNWwC2pqEN6Dd1KzpwaJu5NpKsnySU0SkrhHJZkS1FCBNA54r6E8JFA9QO3dSUJvlFmT
VqLaScUcU07fGGDa/T/LhW2oCmVuZHN0cmVhbQplbmRvYmoKCjYgMCBvYmoKNjI5MQplbmRv
YmoKCjcgMCBvYmoKPDwvVHlwZS9Gb250RGVzY3JpcHRvci9Gb250TmFtZS9CQUFBQUErTGli
ZXJhdGlvblNlcmlmCi9GbGFncyA0Ci9Gb250QkJveFstNTQzIC0zMDMgMTI3NyA5ODFdL0l0
YWxpY0FuZ2xlIDAKL0FzY2VudCA4OTEKL0Rlc2NlbnQgLTIxNgovQ2FwSGVpZ2h0IDk4MQov
U3RlbVYgODAKL0ZvbnRGaWxlMiA1IDAgUgo+PgplbmRvYmoKCjggMCBvYmoKPDwvTGVuZ3Ro
IDI5Mi9GaWx0ZXIvRmxhdGVEZWNvZGU+PgpzdHJlYW0KeJxdkctuwyAQRfd8Bct0EfmROA/J
spQmseRFH6rbD3BgnCLVGGGy8N+XmUlbqQvQmZl7BxiSY3NqrAnJqx9VC0H2xmoP03jzCuQF
rsaKLJfaqHCPaFdD50QSve08BRga249lKZK3WJuCn+XioMcLPIjkxWvwxl7l4uPYxri9OfcF
A9ggU1FVUkMf+zx17rkbICHXstGxbMK8jJY/wfvsQOYUZ3wVNWqYXKfAd/YKokzTSpZ1XQmw
+l8tK9hy6dVn56M0i9I0LdZV5Jx4s0NeMe+R18TbFXJBnKfIG9ZkyFvWUJ8d5wvkPTPlD8w1
8iMz9Tyyl/Qnzp+Qz8xn5JrPPdOj7rfH5+H8f8Ym1c37ODL6JJoVTslY+P1HNzp00foG7l+O
gwplbmRzdHJlYW0KZW5kb2JqCgo5IDAgb2JqCjw8L1R5cGUvRm9udC9TdWJ0eXBlL1RydWVU
eXBlL0Jhc2VGb250L0JBQUFBQStMaWJlcmF0aW9uU2VyaWYKL0ZpcnN0Q2hhciAwCi9MYXN0
Q2hhciAxNQovV2lkdGhzWzc3NyA2MTAgNTAwIDI3NyAzODkgMjUwIDQ0MyAyNzcgNDQzIDUw
MCA1MDAgNDQzIDUwMCA3NzcgNTAwIDI1MApdCi9Gb250RGVzY3JpcHRvciA3IDAgUgovVG9V
bmljb2RlIDggMCBSCj4+CmVuZG9iagoKMTAgMCBvYmoKPDwvRjEgOSAwIFIKPj4KZW5kb2Jq
CgoxMSAwIG9iago8PC9Gb250IDEwIDAgUgovUHJvY1NldFsvUERGL1RleHRdCj4+CmVuZG9i
agoKMSAwIG9iago8PC9UeXBlL1BhZ2UvUGFyZW50IDQgMCBSL1Jlc291cmNlcyAxMSAwIFIv
TWVkaWFCb3hbMCAwIDU5NSA4NDJdL0dyb3VwPDwvUy9UcmFuc3BhcmVuY3kvQ1MvRGV2aWNl
UkdCL0kgdHJ1ZT4+L0NvbnRlbnRzIDIgMCBSPj4KZW5kb2JqCgo0IDAgb2JqCjw8L1R5cGUv
UGFnZXMKL1Jlc291cmNlcyAxMSAwIFIKL01lZGlhQm94WyAwIDAgNTk1IDg0MiBdCi9LaWRz
WyAxIDAgUiBdCi9Db3VudCAxPj4KZW5kb2JqCgoxMiAwIG9iago8PC9UeXBlL0NhdGFsb2cv
UGFnZXMgNCAwIFIKL09wZW5BY3Rpb25bMSAwIFIgL1hZWiBudWxsIG51bGwgMF0KL0xhbmco
ZW4tR0IpCj4+CmVuZG9iagoKMTMgMCBvYmoKPDwvQ3JlYXRvcjxGRUZGMDA1NzAwNzIwMDY5
MDA3NDAwNjUwMDcyPgovUHJvZHVjZXI8RkVGRjAwNEMwMDY5MDA2MjAwNzIwMDY1MDA0RjAw
NjYwMDY2MDA2OTAwNjMwMDY1MDAyMDAwMzUwMDJFMDAzMD4KL0NyZWF0aW9uRGF0ZShEOjIw
MTYwMjA0MjIwMDAyWicpPj4KZW5kb2JqCgp4cmVmCjAgMTQKMDAwMDAwMDAwMCA2NTUzNSBm
IAowMDAwMDA3NTA5IDAwMDAwIG4gCjAwMDAwMDAwMTkgMDAwMDAgbiAKMDAwMDAwMDIyOSAw
MDAwMCBuIAowMDAwMDA3NjUyIDAwMDAwIG4gCjAwMDAwMDAyNDkgMDAwMDAgbiAKMDAwMDAw
NjYyNSAwMDAwMCBuIAowMDAwMDA2NjQ2IDAwMDAwIG4gCjAwMDAwMDY4NDEgMDAwMDAgbiAK
MDAwMDAwNzIwMiAwMDAwMCBuIAowMDAwMDA3NDIyIDAwMDAwIG4gCjAwMDAwMDc0NTQgMDAw
MDAgbiAKMDAwMDAwNzc1MSAwMDAwMCBuIAowMDAwMDA3ODQ4IDAwMDAwIG4gCnRyYWlsZXIK
PDwvU2l6ZSAxNC9Sb290IDEyIDAgUgovSW5mbyAxMyAwIFIKL0lEIFsgPDRFN0ZCMEZCMjA4
ODBCNURBQkIzQTNEOTQxNDlBRTQ3Pgo8NEU3RkIwRkIyMDg4MEI1REFCQjNBM0Q5NDE0OUFF
NDc+IF0KL0RvY0NoZWNrc3VtIC8yQTY0RDMzNzRFQTVEODMwNTRDNEI2RDFEMUY4QzU1RQo+
PgpzdGFydHhyZWYKODAxOAolJUVPRgo=
--------------090701020702030809070008--

View File

@ -0,0 +1,218 @@
import os
import shutil
import tempfile
from unittest import mock
from unittest.mock import MagicMock
from django.contrib.auth.models import User
from django.test import override_settings
from rest_framework.test import APITestCase, APIClient
from documents.models import Document, Correspondent, DocumentType, Tag
class DocumentApiTest(APITestCase):
def setUp(self):
self.scratch_dir = tempfile.mkdtemp()
self.media_dir = tempfile.mkdtemp()
self.originals_dir = os.path.join(self.media_dir, "documents", "originals")
self.thumbnail_dir = os.path.join(self.media_dir, "documents", "thumbnails")
os.makedirs(self.originals_dir, exist_ok=True)
os.makedirs(self.thumbnail_dir, exist_ok=True)
override_settings(
SCRATCH_DIR=self.scratch_dir,
MEDIA_ROOT=self.media_dir,
ORIGINALS_DIR=self.originals_dir,
THUMBNAIL_DIR=self.thumbnail_dir
).enable()
user = User.objects.create_superuser(username="temp_admin")
self.client.force_login(user=user)
def tearDown(self):
shutil.rmtree(self.scratch_dir, ignore_errors=True)
shutil.rmtree(self.media_dir, ignore_errors=True)
def testDocuments(self):
response = self.client.get("/api/documents/").data
self.assertEqual(response['count'], 0)
c = Correspondent.objects.create(name="c", pk=41)
dt = DocumentType.objects.create(name="dt", pk=63)
tag = Tag.objects.create(name="t", pk=85)
doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123")
doc.tags.add(tag)
response = self.client.get("/api/documents/", format='json')
self.assertEqual(response.status_code, 200)
self.assertEqual(response.data['count'], 1)
returned_doc = response.data['results'][0]
self.assertEqual(returned_doc['id'], doc.id)
self.assertEqual(returned_doc['title'], doc.title)
self.assertEqual(returned_doc['correspondent']['name'], c.name)
self.assertEqual(returned_doc['document_type']['name'], dt.name)
self.assertEqual(returned_doc['correspondent']['id'], c.id)
self.assertEqual(returned_doc['document_type']['id'], dt.id)
self.assertEqual(returned_doc['correspondent']['id'], returned_doc['correspondent_id'])
self.assertEqual(returned_doc['document_type']['id'], returned_doc['document_type_id'])
self.assertEqual(len(returned_doc['tags']), 1)
self.assertEqual(returned_doc['tags'][0]['name'], tag.name)
self.assertEqual(returned_doc['tags'][0]['id'], tag.id)
self.assertListEqual(returned_doc['tags_id'], [tag.id])
c2 = Correspondent.objects.create(name="c2")
returned_doc['correspondent_id'] = c2.pk
returned_doc['title'] = "the new title"
response = self.client.put('/api/documents/{}/'.format(doc.pk), returned_doc, format='json')
self.assertEqual(response.status_code, 200)
doc_after_save = Document.objects.get(id=doc.id)
self.assertEqual(doc_after_save.correspondent, c2)
self.assertEqual(doc_after_save.title, "the new title")
self.client.delete("/api/documents/{}/".format(doc_after_save.pk))
self.assertEqual(len(Document.objects.all()), 0)
def test_document_actions(self):
_, filename = tempfile.mkstemp(dir=self.originals_dir)
content = b"This is a test"
content_thumbnail = b"thumbnail content"
with open(filename, "wb") as f:
f.write(content)
doc = Document.objects.create(title="none", filename=os.path.basename(filename), file_type="pdf")
with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
f.write(content_thumbnail)
response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content, content)
response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content, content)
response = self.client.get('/api/documents/{}/thumb/'.format(doc.pk))
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content, content_thumbnail)
def test_document_actions_not_existing_file(self):
doc = Document.objects.create(title="none", filename=os.path.basename("asd"), file_type="pdf")
response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
self.assertEqual(response.status_code, 404)
response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))
self.assertEqual(response.status_code, 404)
response = self.client.get('/api/documents/{}/thumb/'.format(doc.pk))
self.assertEqual(response.status_code, 404)
def test_document_filters(self):
doc1 = Document.objects.create(title="none1", checksum="A")
doc2 = Document.objects.create(title="none2", checksum="B")
doc3 = Document.objects.create(title="none3", checksum="C")
tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
tag_2 = Tag.objects.create(name="t2")
tag_3 = Tag.objects.create(name="t3")
doc1.tags.add(tag_inbox)
doc2.tags.add(tag_2)
doc3.tags.add(tag_2)
doc3.tags.add(tag_3)
response = self.client.get("/api/documents/?is_in_inbox=true")
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 1)
self.assertEqual(results[0]['id'], doc1.id)
response = self.client.get("/api/documents/?is_in_inbox=false")
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 2)
self.assertEqual(results[0]['id'], doc2.id)
self.assertEqual(results[1]['id'], doc3.id)
response = self.client.get("/api/documents/?tags__id__in={},{}".format(tag_inbox.id, tag_3.id))
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 2)
self.assertEqual(results[0]['id'], doc1.id)
self.assertEqual(results[1]['id'], doc3.id)
response = self.client.get("/api/documents/?tags__id__all={},{}".format(tag_2.id, tag_3.id))
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 1)
self.assertEqual(results[0]['id'], doc3.id)
response = self.client.get("/api/documents/?tags__id__all={},{}".format(tag_inbox.id, tag_3.id))
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 0)
response = self.client.get("/api/documents/?tags__id__all={}a{}".format(tag_inbox.id, tag_3.id))
self.assertEqual(response.status_code, 200)
results = response.data['results']
self.assertEqual(len(results), 3)
@mock.patch("documents.index.autocomplete")
def test_search_autocomplete(self, m):
m.side_effect = lambda ix, term, limit: [term for _ in range(limit)]
response = self.client.get("/api/search/autocomplete/?term=test")
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 10)
response = self.client.get("/api/search/autocomplete/?term=test&limit=20")
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 20)
response = self.client.get("/api/search/autocomplete/?term=test&limit=-1")
self.assertEqual(response.status_code, 400)
response = self.client.get("/api/search/autocomplete/")
self.assertEqual(response.status_code, 400)
response = self.client.get("/api/search/autocomplete/?term=")
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 10)
def test_statistics(self):
doc1 = Document.objects.create(title="none1", checksum="A")
doc2 = Document.objects.create(title="none2", checksum="B")
doc3 = Document.objects.create(title="none3", checksum="C")
tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
doc1.tags.add(tag_inbox)
response = self.client.get("/api/statistics/")
self.assertEqual(response.status_code, 200)
self.assertEqual(response.data['documents_total'], 3)
self.assertEqual(response.data['documents_inbox'], 1)

View File

@ -0,0 +1,85 @@
import tempfile
from django.test import TestCase, override_settings
from documents.classifier import DocumentClassifier
from documents.models import Correspondent, Document, Tag, DocumentType
class TestClassifier(TestCase):
def setUp(self):
self.classifier = DocumentClassifier()
def generate_test_data(self):
self.c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
self.c2 = Correspondent.objects.create(name="c2")
self.t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
self.t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_ANY, pk=34, is_inbox_tag=True)
self.t3 = Tag.objects.create(name="t3", matching_algorithm=Tag.MATCH_AUTO, pk=45)
self.dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)
self.doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=self.c1, checksum="A", document_type=self.dt)
self.doc2 = Document.objects.create(title="doc1", content="this is another document, but from c2", correspondent=self.c2, checksum="B")
self.doc_inbox = Document.objects.create(title="doc235", content="aa", checksum="C")
self.doc1.tags.add(self.t1)
self.doc2.tags.add(self.t1)
self.doc2.tags.add(self.t3)
self.doc_inbox.tags.add(self.t2)
def testNoTrainingData(self):
try:
self.classifier.train()
except ValueError as e:
self.assertEqual(str(e), "No training data available.")
else:
self.fail("Should raise exception")
def testEmpty(self):
Document.objects.create(title="WOW", checksum="3457", content="ASD")
self.classifier.train()
self.assertIsNone(self.classifier.document_type_classifier)
self.assertIsNone(self.classifier.tags_classifier)
self.assertIsNone(self.classifier.correspondent_classifier)
self.assertListEqual(self.classifier.predict_tags(""), [])
self.assertIsNone(self.classifier.predict_document_type(""))
self.assertIsNone(self.classifier.predict_correspondent(""))
def testTrain(self):
self.generate_test_data()
self.classifier.train()
self.assertListEqual(list(self.classifier.correspondent_classifier.classes_), [-1, self.c1.pk])
self.assertListEqual(list(self.classifier.tags_binarizer.classes_), [self.t1.pk, self.t3.pk])
def testPredict(self):
self.generate_test_data()
self.classifier.train()
self.assertEqual(self.classifier.predict_correspondent(self.doc1.content), self.c1.pk)
self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None)
self.assertTupleEqual(self.classifier.predict_tags(self.doc1.content), (self.t1.pk,))
self.assertTupleEqual(self.classifier.predict_tags(self.doc2.content), (self.t1.pk, self.t3.pk))
self.assertEqual(self.classifier.predict_document_type(self.doc1.content), self.dt.pk)
self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None)
def testDatasetHashing(self):
self.generate_test_data()
self.assertTrue(self.classifier.train())
self.assertFalse(self.classifier.train())
@override_settings(DATA_DIR=tempfile.mkdtemp())
def testSaveClassifier(self):
self.generate_test_data()
self.classifier.train()
self.classifier.save_classifier()
newClassifier = DocumentClassifier()
newClassifier.reload()
self.assertFalse(newClassifier.train())

View File

@ -1,8 +1,17 @@
import os
import re
import shutil
import tempfile
from unittest import mock
from unittest.mock import MagicMock
from django.test import TestCase
from django.conf import settings
from django.db import DatabaseError
from django.test import TestCase, override_settings
from ..models import FileInfo, Tag
from ..consumer import Consumer, ConsumerError
from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
from ..parsers import DocumentParser, ParseError
class TestAttributes(TestCase):
@ -394,3 +403,254 @@ class TestFieldPermutations(TestCase):
self.assertEqual(info.created.year, 2019)
self.assertEqual(info.created.month, 9)
self.assertEqual(info.created.day, 8)
class DummyParser(DocumentParser):
def get_thumbnail(self):
# not important during tests
raise NotImplementedError()
def __init__(self, path, logging_group, scratch_dir):
super(DummyParser, self).__init__(path, logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
def get_optimised_thumbnail(self):
return self.fake_thumb
def get_text(self):
return "The Text"
class FaultyParser(DocumentParser):
def get_thumbnail(self):
# not important during tests
raise NotImplementedError()
def __init__(self, path, logging_group, scratch_dir):
super(FaultyParser, self).__init__(path, logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
def get_optimised_thumbnail(self):
return self.fake_thumb
def get_text(self):
raise ParseError("Does not compute.")
class TestConsumer(TestCase):
def make_dummy_parser(self, path, logging_group):
return DummyParser(path, logging_group, self.scratch_dir)
def make_faulty_parser(self, path, logging_group):
return FaultyParser(path, logging_group, self.scratch_dir)
def setUp(self):
self.scratch_dir = tempfile.mkdtemp()
self.media_dir = tempfile.mkdtemp()
self.consumption_dir = tempfile.mkdtemp()
override_settings(
SCRATCH_DIR=self.scratch_dir,
MEDIA_ROOT=self.media_dir,
ORIGINALS_DIR=os.path.join(self.media_dir, "documents", "originals"),
THUMBNAIL_DIR=os.path.join(self.media_dir, "documents", "thumbnails"),
CONSUMPTION_DIR=self.consumption_dir
).enable()
patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
m = patcher.start()
m.return_value = [(None, {
"parser": self.make_dummy_parser,
"test": lambda _: True,
"weight": 0
})]
self.addCleanup(patcher.stop)
self.consumer = Consumer()
def tearDown(self):
shutil.rmtree(self.scratch_dir, ignore_errors=True)
shutil.rmtree(self.media_dir, ignore_errors=True)
shutil.rmtree(self.consumption_dir, ignore_errors=True)
def get_test_file(self):
fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir)
return f
def testNormalOperation(self):
filename = self.get_test_file()
document = self.consumer.try_consume_file(filename)
self.assertEqual(document.content, "The Text")
self.assertEqual(document.title, os.path.splitext(os.path.basename(filename))[0])
self.assertIsNone(document.correspondent)
self.assertIsNone(document.document_type)
self.assertEqual(document.filename, "0000001.pdf")
self.assertTrue(os.path.isfile(
document.source_path
))
self.assertTrue(os.path.isfile(
document.thumbnail_path
))
self.assertFalse(os.path.isfile(filename))
def testOverrideFilename(self):
filename = self.get_test_file()
overrideFilename = "My Bank - Statement for November.pdf"
document = self.consumer.try_consume_file(filename, override_filename=overrideFilename)
self.assertEqual(document.correspondent.name, "My Bank")
self.assertEqual(document.title, "Statement for November")
def testOverrideTitle(self):
document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title")
self.assertEqual(document.title, "Override Title")
def testOverrideCorrespondent(self):
c = Correspondent.objects.create(name="test")
document = self.consumer.try_consume_file(self.get_test_file(), override_correspondent_id=c.pk)
self.assertEqual(document.correspondent.id, c.id)
def testOverrideDocumentType(self):
dt = DocumentType.objects.create(name="test")
document = self.consumer.try_consume_file(self.get_test_file(), override_document_type_id=dt.pk)
self.assertEqual(document.document_type.id, dt.id)
def testOverrideTags(self):
t1 = Tag.objects.create(name="t1")
t2 = Tag.objects.create(name="t2")
t3 = Tag.objects.create(name="t3")
document = self.consumer.try_consume_file(self.get_test_file(), override_tag_ids=[t1.id, t3.id])
self.assertIn(t1, document.tags.all())
self.assertNotIn(t2, document.tags.all())
self.assertIn(t3, document.tags.all())
def testNotAFile(self):
try:
self.consumer.try_consume_file("non-existing-file")
except ConsumerError as e:
self.assertTrue(str(e).endswith('It is not a file'))
return
self.fail("Should throw exception")
@override_settings(CONSUMPTION_DIR=None)
def testConsumptionDirUnset(self):
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertEqual(str(e), "The CONSUMPTION_DIR settings variable does not appear to be set.")
return
self.fail("Should throw exception")
@override_settings(CONSUMPTION_DIR="asd")
def testNoConsumptionDir(self):
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertEqual(str(e), "Consumption directory asd does not exist")
return
self.fail("Should throw exception")
def testDuplicates(self):
self.consumer.try_consume_file(self.get_test_file())
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertTrue(str(e).endswith("It is a duplicate."))
return
self.fail("Should throw exception")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def testNoParsers(self, m):
m.return_value = []
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertTrue(str(e).startswith("No parsers abvailable"))
return
self.fail("Should throw exception")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def testFaultyParser(self, m):
m.return_value = [(None, {
"parser": self.make_faulty_parser,
"test": lambda _: True,
"weight": 0
})]
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertEqual(str(e), "Does not compute.")
return
self.fail("Should throw exception.")
@mock.patch("documents.consumer.Consumer._write")
def testPostSaveError(self, m):
filename = self.get_test_file()
m.side_effect = OSError("NO.")
try:
self.consumer.try_consume_file(filename)
except ConsumerError as e:
self.assertEqual(str(e), "NO.")
else:
self.fail("Should raise exception")
# file not deleted
self.assertTrue(os.path.isfile(filename))
# Database empty
self.assertEqual(len(Document.objects.all()), 0)
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
def testFilenameHandling(self):
filename = self.get_test_file()
document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")
print(document.source_path)
print("===")
self.assertEqual(document.title, "new docs")
self.assertEqual(document.correspondent.name, "Bank")
self.assertEqual(document.filename, "bank/new-docs-0000001.pdf")
@mock.patch("documents.consumer.DocumentClassifier")
def testClassifyDocument(self, m):
correspondent = Correspondent.objects.create(name="test")
dtype = DocumentType.objects.create(name="test")
t1 = Tag.objects.create(name="t1")
t2 = Tag.objects.create(name="t2")
m.return_value = MagicMock()
m.return_value.predict_correspondent.return_value = correspondent.pk
m.return_value.predict_document_type.return_value = dtype.pk
m.return_value.predict_tags.return_value = [t1.pk]
document = self.consumer.try_consume_file(self.get_test_file())
self.assertEqual(document.correspondent, correspondent)
self.assertEqual(document.document_type, dtype)
self.assertIn(t1, document.tags.all())
self.assertNotIn(t2, document.tags.all())

View File

@ -1,90 +0,0 @@
import base64
import os
from hashlib import md5
from unittest import mock
import magic
from django.conf import settings
from django.test import TestCase
from ..mail import Message, Attachment
class TestMessage(TestCase):
def __init__(self, *args, **kwargs):
TestCase.__init__(self, *args, **kwargs)
self.sample = os.path.join(
settings.BASE_DIR,
"documents",
"tests",
"samples",
"mail.txt"
)
def test_init(self):
with open(self.sample, "rb") as f:
with mock.patch("logging.StreamHandler.emit") as __:
message = Message(f.read())
self.assertTrue(message)
self.assertEqual(message.subject, "Test 0")
data = message.attachment.read()
self.assertEqual(
md5(data).hexdigest(), "7c89655f9e9eb7dd8cde8568e8115d59")
self.assertEqual(
message.attachment.content_type, "application/pdf")
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
self.assertEqual(m.id_buffer(data), "application/pdf")
class TestInlineMessage(TestCase):
def __init__(self, *args, **kwargs):
TestCase.__init__(self, *args, **kwargs)
self.sample = os.path.join(
settings.BASE_DIR,
"documents",
"tests",
"samples",
"inline_mail.txt"
)
def test_init(self):
with open(self.sample, "rb") as f:
with mock.patch("logging.StreamHandler.emit") as __:
message = Message(f.read())
self.assertTrue(message)
self.assertEqual(message.subject, "Paperless Inline Image")
data = message.attachment.read()
self.assertEqual(
md5(data).hexdigest(), "30c00a7b42913e65f7fdb0be40b9eef3")
self.assertEqual(
message.attachment.content_type, "image/png")
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
self.assertEqual(m.id_buffer(data), "image/png")
class TestAttachment(TestCase):
def test_init(self):
data = base64.encodebytes(b"0")
self.assertEqual(Attachment(data, "application/pdf").suffix, "pdf")
self.assertEqual(Attachment(data, "image/png").suffix, "png")
self.assertEqual(Attachment(data, "image/jpeg").suffix, "jpeg")
self.assertEqual(Attachment(data, "image/gif").suffix, "gif")
self.assertEqual(Attachment(data, "image/tiff").suffix, "tiff")
self.assertEqual(Attachment(data, "image/png").read(), data)

View File

@ -14,7 +14,7 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
(None, lambda _: {"weight": 0, "parser": DummyParser}),
(None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
)
self.assertEqual(
@ -32,8 +32,8 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
(None, lambda _: {"weight": 0, "parser": DummyParser1}),
(None, lambda _: {"weight": 1, "parser": DummyParser2}),
(None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
(None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
)
self.assertEqual(
@ -43,7 +43,7 @@ class TestParserDiscovery(TestCase):
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test__get_parser_class_0_parsers(self, m, *args):
m.return_value = ((None, lambda _: None),)
m.return_value = []
with TemporaryDirectory() as tmpdir:
self.assertIsNone(
get_parser_class("doc.pdf")

View File

@ -52,7 +52,7 @@ class CorrespondentViewSet(ModelViewSet):
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filter_class = CorrespondentFilterSet
filterset_class = CorrespondentFilterSet
ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")
@ -63,7 +63,7 @@ class TagViewSet(ModelViewSet):
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filter_class = TagFilterSet
filterset_class = TagFilterSet
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
@ -74,7 +74,7 @@ class DocumentTypeViewSet(ModelViewSet):
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filter_class = DocumentTypeFilterSet
filterset_class = DocumentTypeFilterSet
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
@ -89,7 +89,7 @@ class DocumentViewSet(RetrieveModelMixin,
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, SearchFilter, OrderingFilter)
filter_class = DocumentFilterSet
filterset_class = DocumentFilterSet
search_fields = ("title", "correspondent__name", "content")
ordering_fields = (
"id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
@ -170,7 +170,7 @@ class LogViewSet(ReadOnlyModelViewSet):
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filter_class = LogFilterSet
filterset_class = LogFilterSet
ordering_fields = ("created",)
@ -223,17 +223,16 @@ class SearchAutoCompleteView(APIView):
if 'term' in request.query_params:
term = request.query_params['term']
else:
term = None
return HttpResponseBadRequest("Term required")
if 'limit' in request.query_params:
limit = int(request.query_params['limit'])
if limit <= 0:
return HttpResponseBadRequest("Invalid limit")
else:
limit = 10
if term is not None:
return Response(index.autocomplete(self.ix, term, limit))
else:
return Response([])
return Response(index.autocomplete(self.ix, term, limit))
class StatisticsView(APIView):

View File

@ -1,4 +1,5 @@
import json
import math
import multiprocessing
import os
import re
@ -79,6 +80,7 @@ INSTALLED_APPS = [
"documents.apps.DocumentsConfig",
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig",
"django.contrib.admin",
@ -262,24 +264,58 @@ LOGGING = {
# Task queue #
###############################################################################
# Sensible defaults for multitasking:
# use a fair balance between worker processes and threads epr worker so that
# both consuming many documents in parallel and consuming large documents is
# reasonably fast.
# Favors threads per worker on smaller systems and never exceeds cpu_count()
# in total.
def default_task_workers():
try:
return max(
math.floor(math.sqrt(multiprocessing.cpu_count())),
1
)
except NotImplementedError:
return 1
TASK_WORKERS = int(os.getenv("PAPERLESS_TASK_WORKERS", default_task_workers()))
Q_CLUSTER = {
'name': 'paperless',
'catch_up': False,
'workers': TASK_WORKERS,
'redis': os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
}
def default_threads_per_worker():
try:
return max(
math.floor(multiprocessing.cpu_count() / TASK_WORKERS),
1
)
except NotImplementedError:
return 1
THREADS_PER_WORKER = os.getenv("PAPERLESS_THREADS_PER_WORKER", default_threads_per_worker())
###############################################################################
# Paperless Specific Settings #
###############################################################################
CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))
CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
# The default language that tesseract will attempt to use when parsing
# documents. It should be a 3-letter language code consistent with ISO 639.
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
# The amount of threads to use for OCR
OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", multiprocessing.cpu_count()))
# OCR all documents?
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
@ -324,5 +360,6 @@ FILENAME_PARSE_TRANSFORMS = []
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
# TODO: this should not have a prefix.
# Specify the filename format for out files
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")

View File

@ -1,7 +1,7 @@
from django.conf.urls import include, url
from django.conf.urls import include
from django.contrib import admin
from django.contrib.auth.decorators import login_required
from django.urls import path
from django.urls import path, re_path
from django.views.decorators.csrf import csrf_exempt
from django.views.generic import RedirectView
from rest_framework.routers import DefaultRouter
@ -30,32 +30,32 @@ api_router.register(r"tags", TagViewSet)
urlpatterns = [
# API
url(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
url(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
url(r"^api/search/", SearchView.as_view(), name="search"),
url(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
url(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
re_path(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
re_path(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
re_path(r"^api/search/", SearchView.as_view(), name="search"),
re_path(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
re_path(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
# Favicon
url(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
# The Django admin
url(r"admin/", admin.site.urls),
re_path(r"admin/", admin.site.urls),
# These redirects are here to support clients that use the old FetchView.
url(
re_path(
r"^fetch/doc/(?P<pk>\d+)$",
RedirectView.as_view(url='/api/documents/%(pk)s/download/'),
),
url(
re_path(
r"^fetch/thumb/(?P<pk>\d+)$",
RedirectView.as_view(url='/api/documents/%(pk)s/thumb/'),
),
url(
re_path(
r"^fetch/preview/(?P<pk>\d+)$",
RedirectView.as_view(url='/api/documents/%(pk)s/preview/'),
),
url(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
re_path(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
# Frontend assets TODO: this is pretty bad.
path('assets/<path:path>', RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
@ -63,7 +63,7 @@ urlpatterns = [
path('accounts/', include('django.contrib.auth.urls')),
# Root of the Frontent
url(r".*", login_required(IndexView.as_view())),
re_path(r".*", login_required(IndexView.as_view())),
]

View File

View File

@ -0,0 +1,27 @@
from django.contrib import admin
from django import forms
from paperless_mail.models import MailAccount, MailRule
class MailAccountForm(forms.ModelForm):
password = forms.CharField(widget=forms.PasswordInput)
class Meta:
fields = '__all__'
model = MailAccount
class MailAccountAdmin(admin.ModelAdmin):
list_display = ("name", "imap_server", "username")
class MailRuleAdmin(admin.ModelAdmin):
list_display = ("name", "account", "folder", "action")
admin.site.register(MailAccount, MailAccountAdmin)
admin.site.register(MailRule, MailRuleAdmin)

View File

@ -0,0 +1,7 @@
from django.apps import AppConfig
class PaperlessMailConfig(AppConfig):
name = 'paperless_mail'
verbose_name = 'Paperless Mail'

227
src/paperless_mail/mail.py Normal file
View File

@ -0,0 +1,227 @@
import os
import tempfile
from datetime import timedelta, date
from django.conf import settings
from django.utils.text import slugify
from django_q.tasks import async_task
from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
MailboxFolderSelectError
from documents.models import Correspondent
from paperless_mail.models import MailAccount, MailRule
class MailError(Exception):
pass
class BaseMailAction:
def get_criteria(self):
return {}
def post_consume(self, M, message_uids, parameter):
pass
class DeleteMailAction(BaseMailAction):
def post_consume(self, M, message_uids, parameter):
M.delete(message_uids)
class MarkReadMailAction(BaseMailAction):
def get_criteria(self):
return {'seen': False}
def post_consume(self, M, message_uids, parameter):
M.seen(message_uids, True)
class MoveMailAction(BaseMailAction):
def post_consume(self, M, message_uids, parameter):
M.move(message_uids, parameter)
class FlagMailAction(BaseMailAction):
def get_criteria(self):
return {'flagged': False}
def post_consume(self, M, message_uids, parameter):
M.flag(message_uids, [MailMessageFlags.FLAGGED], True)
def get_rule_action(rule):
if rule.action == MailRule.ACTION_FLAG:
return FlagMailAction()
elif rule.action == MailRule.ACTION_DELETE:
return DeleteMailAction()
elif rule.action == MailRule.ACTION_MOVE:
return MoveMailAction()
elif rule.action == MailRule.ACTION_MARK_READ:
return MarkReadMailAction()
else:
raise ValueError("Unknown action.")
def make_criterias(rule):
maximum_age = date.today() - timedelta(days=rule.maximum_age)
criterias = {
"date_gte": maximum_age
}
if rule.filter_from:
criterias["from_"] = rule.filter_from
if rule.filter_subject:
criterias["subject"] = rule.filter_subject
if rule.filter_body:
criterias["body"] = rule.filter_body
return {**criterias, **get_rule_action(rule).get_criteria()}
def handle_mail_account(account):
if account.imap_security == MailAccount.IMAP_SECURITY_NONE:
mailbox = MailBoxUnencrypted(account.imap_server, account.imap_port)
elif account.imap_security == MailAccount.IMAP_SECURITY_STARTTLS:
mailbox = MailBox(account.imap_server, account.imap_port, starttls=True)
elif account.imap_security == MailAccount.IMAP_SECURITY_SSL:
mailbox = MailBox(account.imap_server, account.imap_port)
else:
raise ValueError("Unknown IMAP security")
total_processed_files = 0
with mailbox as M:
try:
M.login(account.username, account.password)
except Exception:
raise MailError(
f"Error while authenticating account {account.name}")
for rule in account.rules.all():
try:
M.folder.set(rule.folder)
except MailboxFolderSelectError:
raise MailError(
f"Rule {rule.name}: Folder {rule.folder} does not exist "
f"in account {account.name}")
criterias = make_criterias(rule)
try:
messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while fetching folder "
f"{rule.folder} of account {account.name}")
post_consume_messages = []
for message in messages:
try:
processed_files = handle_message(message, rule)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while processing mail "
f"{message.uid} of account {account.name}")
if processed_files > 0:
post_consume_messages.append(message.uid)
total_processed_files += processed_files
try:
get_rule_action(rule).post_consume(
M,
post_consume_messages,
rule.action_parameter)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while processing post-consume "
f"actions for account {account.name}")
return total_processed_files
def get_title(message, att, rule):
if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
title = message.subject
elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME:
title = os.path.splitext(os.path.basename(att.filename))[0]
else:
raise ValueError("Unknown title selector.")
return title
def get_correspondent(message, rule):
if rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NOTHING:
correspondent = None
elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_EMAIL:
correspondent_name = message.from_
correspondent = Correspondent.objects.get_or_create(
name=correspondent_name, defaults={
"slug": slugify(correspondent_name)
})[0]
elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NAME:
if message.from_values and \
'name' in message.from_values \
and message.from_values['name']:
correspondent_name = message.from_values['name']
else:
correspondent_name = message.from_
correspondent = Correspondent.objects.get_or_create(
name=correspondent_name, defaults={
"slug": slugify(correspondent_name)
})[0]
elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_CUSTOM:
correspondent = rule.assign_correspondent
else:
raise ValueError("Unknwown correspondent selector")
return correspondent
def handle_message(message, rule):
if not message.attachments:
return 0
correspondent = get_correspondent(message, rule)
tag = rule.assign_tag
doc_type = rule.assign_document_type
processed_attachments = 0
for att in message.attachments:
title = get_title(message, att, rule)
# TODO: check with parsers what files types are supported
if att.content_type == 'application/pdf':
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
_, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
with open(temp_filename, 'wb') as f:
f.write(att.payload)
async_task(
"documents.tasks.consume_file",
path=temp_filename,
override_filename=att.filename,
override_title=title,
override_correspondent_id=correspondent.id if correspondent else None,
override_document_type_id=doc_type.id if doc_type else None,
override_tag_ids=[tag.id] if tag else None,
task_name=f"Mail: {att.filename}"
)
processed_attachments += 1
return processed_attachments

View File

@ -0,0 +1,13 @@
from django.core.management.base import BaseCommand
from paperless_mail import mail, tasks
class Command(BaseCommand):
help = """
""".replace(" ", "")
def handle(self, *args, **options):
tasks.process_mail_accounts()

View File

@ -0,0 +1,48 @@
# Generated by Django 3.1.3 on 2020-11-15 22:54
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
initial = True
dependencies = [
('documents', '1002_auto_20201111_1105'),
]
operations = [
migrations.CreateModel(
name='MailAccount',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=256, unique=True)),
('imap_server', models.CharField(max_length=256)),
('imap_port', models.IntegerField(blank=True, null=True)),
('imap_security', models.PositiveIntegerField(choices=[(1, 'No encryption'), (2, 'Use SSL'), (3, 'Use STARTTLS')], default=2)),
('username', models.CharField(max_length=256)),
('password', models.CharField(max_length=256)),
],
),
migrations.CreateModel(
name='MailRule',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=256)),
('folder', models.CharField(default='INBOX', max_length=256)),
('filter_from', models.CharField(blank=True, max_length=256, null=True)),
('filter_subject', models.CharField(blank=True, max_length=256, null=True)),
('filter_body', models.CharField(blank=True, max_length=256, null=True)),
('maximum_age', models.PositiveIntegerField(default=30)),
('action', models.PositiveIntegerField(choices=[(1, 'Delete'), (2, 'Move to specified folder'), (3, "Mark as read, don't process read mails"), (4, "Flag the mail, don't process flagged mails")], default=3, help_text='The action applied to the mail. This action is only performed when documents were consumed from the mail. Mails without attachments will remain entirely untouched.')),
('action_parameter', models.CharField(blank=True, help_text='Additional parameter for the action selected above, i.e., the target folder of the move to folder action.', max_length=256, null=True)),
('assign_title_from', models.PositiveIntegerField(choices=[(1, 'Use subject as title'), (2, 'Use attachment filename as title')], default=1)),
('assign_correspondent_from', models.PositiveIntegerField(choices=[(1, 'Do not assign a correspondent'), (2, 'Use mail address'), (3, 'Use name (or mail address if not available)'), (4, 'Use correspondent selected below')], default=1)),
('account', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='rules', to='paperless_mail.mailaccount')),
('assign_correspondent', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.correspondent')),
('assign_document_type', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.documenttype')),
('assign_tag', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.tag')),
],
),
]

View File

@ -0,0 +1,32 @@
# Generated by Django 3.1.3 on 2020-11-17 13:34
from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule('paperless_mail.tasks.process_mail_accounts',
name="Check all e-mail accounts",
schedule_type=Schedule.MINUTES,
minutes=10)
def remove_schedules(apps, schema_editor):
Schedule.objects.filter(
func='paperless_mail.tasks.process_mail_accounts').delete()
class Migration(migrations.Migration):
dependencies = [
('paperless_mail', '0001_initial'),
('django_q', '0013_task_attempt_count'),
]
operations = [
RunPython(add_schedules, remove_schedules)
]

View File

@ -0,0 +1,137 @@
from django.db import models
# Create your models here.
from django.db import models
import documents.models as document_models
class MailAccount(models.Model):
IMAP_SECURITY_NONE = 1
IMAP_SECURITY_SSL = 2
IMAP_SECURITY_STARTTLS = 3
IMAP_SECURITY_OPTIONS = (
(IMAP_SECURITY_NONE, "No encryption"),
(IMAP_SECURITY_SSL, "Use SSL"),
(IMAP_SECURITY_STARTTLS, "Use STARTTLS"),
)
name = models.CharField(max_length=256, unique=True)
imap_server = models.CharField(max_length=256)
imap_port = models.IntegerField(blank=True, null=True)
imap_security = models.PositiveIntegerField(
choices=IMAP_SECURITY_OPTIONS,
default=IMAP_SECURITY_SSL
)
username = models.CharField(max_length=256)
password = models.CharField(max_length=256)
def __str__(self):
return self.name
class MailRule(models.Model):
ACTION_DELETE = 1
ACTION_MOVE = 2
ACTION_MARK_READ = 3
ACTION_FLAG = 4
ACTIONS = (
(ACTION_DELETE, "Delete"),
(ACTION_MOVE, "Move to specified folder"),
(ACTION_MARK_READ, "Mark as read, don't process read mails"),
(ACTION_FLAG, "Flag the mail, don't process flagged mails")
)
TITLE_FROM_SUBJECT = 1
TITLE_FROM_FILENAME = 2
TITLE_SELECTOR = (
(TITLE_FROM_SUBJECT, "Use subject as title"),
(TITLE_FROM_FILENAME, "Use attachment filename as title")
)
CORRESPONDENT_FROM_NOTHING = 1
CORRESPONDENT_FROM_EMAIL = 2
CORRESPONDENT_FROM_NAME = 3
CORRESPONDENT_FROM_CUSTOM = 4
CORRESPONDENT_SELECTOR = (
(CORRESPONDENT_FROM_NOTHING, "Do not assign a correspondent"),
(CORRESPONDENT_FROM_EMAIL, "Use mail address"),
(CORRESPONDENT_FROM_NAME, "Use name (or mail address if not available)"),
(CORRESPONDENT_FROM_CUSTOM, "Use correspondent selected below")
)
name = models.CharField(max_length=256)
account = models.ForeignKey(
MailAccount,
related_name="rules",
on_delete=models.CASCADE
)
folder = models.CharField(default='INBOX', max_length=256)
filter_from = models.CharField(max_length=256, null=True, blank=True)
filter_subject = models.CharField(max_length=256, null=True, blank=True)
filter_body = models.CharField(max_length=256, null=True, blank=True)
maximum_age = models.PositiveIntegerField(default=30)
action = models.PositiveIntegerField(
choices=ACTIONS,
default=ACTION_MARK_READ,
help_text="The action applied to the mail. This action is only "
"performed when documents were consumed from the mail. "
"Mails without attachments will remain entirely "
"untouched."
)
action_parameter = models.CharField(
max_length=256, blank=True, null=True,
help_text="Additional parameter for the action selected above, i.e., "
"the target folder of the move to folder action."
)
assign_title_from = models.PositiveIntegerField(
choices=TITLE_SELECTOR,
default=TITLE_FROM_SUBJECT
)
assign_tag = models.ForeignKey(
document_models.Tag,
null=True,
blank=True,
on_delete=models.SET_NULL
)
assign_document_type = models.ForeignKey(
document_models.DocumentType,
null=True,
blank=True,
on_delete=models.SET_NULL
)
assign_correspondent_from = models.PositiveIntegerField(
choices=CORRESPONDENT_SELECTOR,
default=CORRESPONDENT_FROM_NOTHING
)
assign_correspondent = models.ForeignKey(
document_models.Correspondent,
null=True,
blank=True,
on_delete=models.SET_NULL
)
def __str__(self):
return self.name

View File

@ -0,0 +1,23 @@
import logging
from paperless_mail import mail
from paperless_mail.models import MailAccount
def process_mail_accounts():
total_new_documents = 0
for account in MailAccount.objects.all():
total_new_documents += mail.handle_mail_account(account)
if total_new_documents > 0:
return f"Added {total_new_documents} document(s)."
else:
return "No new documents were added."
def process_mail_account(name):
account = MailAccount.objects.find(name=name)
if account:
mail.handle_mail_account(account)
else:
logging.error("Unknown mail acccount: {}".format(name))

View File

View File

@ -0,0 +1,352 @@
import uuid
from collections import namedtuple
from typing import ContextManager
from unittest import mock
from django.test import TestCase
from imap_tools import MailMessageFlags, MailboxFolderSelectError
from documents.models import Correspondent
from paperless_mail.mail import get_correspondent, get_title, handle_message, handle_mail_account, MailError
from paperless_mail.models import MailRule, MailAccount
class BogusFolderManager:
current_folder = "INBOX"
def set(self, new_folder):
if new_folder not in ["INBOX", "spam"]:
raise MailboxFolderSelectError(None, "uhm")
self.current_folder = new_folder
class BogusMailBox(ContextManager):
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
pass
def __init__(self):
self.messages = []
self.messages_spam = []
def login(self, username, password):
if not (username == 'admin' and password == 'secret'):
raise Exception()
folder = BogusFolderManager()
def fetch(self, criteria, mark_seen):
msg = self.messages
criteria = str(criteria).strip('()').split(" ")
if 'UNSEEN' in criteria:
msg = filter(lambda m: not m.seen, msg)
if 'SUBJECT' in criteria:
subject = criteria[criteria.index('SUBJECT') + 1].strip('"')
msg = filter(lambda m: subject in m.subject, msg)
if 'BODY' in criteria:
body = criteria[criteria.index('BODY') + 1].strip('"')
msg = filter(lambda m: body in m.body, msg)
if 'FROM' in criteria:
from_ = criteria[criteria.index('FROM') + 1].strip('"')
msg = filter(lambda m: from_ in m.from_, msg)
if 'UNFLAGGED' in criteria:
msg = filter(lambda m: not m.flagged, msg)
return list(msg)
def seen(self, uid_list, seen_val):
for message in self.messages:
if message.uid in uid_list:
message.seen = seen_val
def delete(self, uid_list):
self.messages = list(filter(lambda m: m.uid not in uid_list, self.messages))
def flag(self, uid_list, flag_set, value):
for message in self.messages:
if message.uid in uid_list:
for flag in flag_set:
if flag == MailMessageFlags.FLAGGED:
message.flagged = value
def move(self, uid_list, folder):
if folder == "spam":
self.messages_spam.append(
filter(lambda m: m.uid in uid_list, self.messages)
)
self.messages = list(
filter(lambda m: m.uid not in uid_list, self.messages)
)
else:
raise Exception()
def create_message(num_attachments=1, body="", subject="the suject", from_="noone@mail.com", seen=False, flagged=False):
message = namedtuple('MailMessage', [])
message.uid = uuid.uuid4()
message.subject = subject
message.attachments = []
message.from_ = from_
message.body = body
for i in range(num_attachments):
attachment = namedtuple('Attachment', [])
attachment.filename = 'some_file.pdf'
attachment.content_type = 'application/pdf'
attachment.payload = b'content of the attachment'
message.attachments.append(attachment)
message.seen = seen
message.flagged = flagged
return message
class TestMail(TestCase):
def setUp(self):
patcher = mock.patch('paperless_mail.mail.MailBox')
m = patcher.start()
self.bogus_mailbox = BogusMailBox()
m.return_value = self.bogus_mailbox
self.addCleanup(patcher.stop)
patcher = mock.patch('paperless_mail.mail.async_task')
self.async_task = patcher.start()
self.addCleanup(patcher.stop)
self.reset_bogus_mailbox()
def reset_bogus_mailbox(self):
self.bogus_mailbox.messages = []
self.bogus_mailbox.messages_spam = []
self.bogus_mailbox.messages.append(create_message(subject="Invoice 1", from_="amazon@amazon.de", body="cables", seen=True, flagged=False))
self.bogus_mailbox.messages.append(create_message(subject="Invoice 2", body="from my favorite electronic store", seen=False, flagged=True))
self.bogus_mailbox.messages.append(create_message(subject="Claim your $10M price now!", from_="amazon@amazon-some-indian-site.org", seen=False))
def test_get_correspondent(self):
message = namedtuple('MailMessage', [])
message.from_ = "someone@somewhere.com"
message.from_values = {'name': "Someone!", 'email': "someone@somewhere.com"}
message2 = namedtuple('MailMessage', [])
message2.from_ = "me@localhost.com"
message2.from_values = {'name': "", 'email': "fake@localhost.com"}
me_localhost = Correspondent.objects.create(name=message2.from_)
someone_else = Correspondent.objects.create(name="someone else")
rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING)
self.assertIsNone(get_correspondent(message, rule))
rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)
c = get_correspondent(message, rule)
self.assertIsNotNone(c)
self.assertEqual(c.name, "someone@somewhere.com")
c = get_correspondent(message2, rule)
self.assertIsNotNone(c)
self.assertEqual(c.name, "me@localhost.com")
self.assertEqual(c.id, me_localhost.id)
rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME)
c = get_correspondent(message, rule)
self.assertIsNotNone(c)
self.assertEqual(c.name, "Someone!")
c = get_correspondent(message2, rule)
self.assertIsNotNone(c)
self.assertEqual(c.id, me_localhost.id)
rule = MailRule(assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else)
c = get_correspondent(message, rule)
self.assertEqual(c, someone_else)
def test_get_title(self):
message = namedtuple('MailMessage', [])
message.subject = "the message title"
att = namedtuple('Attachment', [])
att.filename = "this_is_the_file.pdf"
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME)
self.assertEqual(get_title(message, att, rule), "this_is_the_file")
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_SUBJECT)
self.assertEqual(get_title(message, att, rule), "the message title")
def test_handle_message(self):
message = namedtuple('MailMessage', [])
message.subject = "the message title"
att = namedtuple('Attachment', [])
att.filename = "test1.pdf"
att.content_type = 'application/pdf'
att.payload = b"attachment contents"
att2 = namedtuple('Attachment', [])
att2.filename = "test2.pdf"
att2.content_type = 'application/pdf'
att2.payload = b"attachment contents"
att3 = namedtuple('Attachment', [])
att3.filename = "test3.pdf"
att3.content_type = 'application/invalid'
att3.payload = b"attachment contents"
message.attachments = [att, att2, att3]
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME)
result = handle_message(message, rule)
self.assertEqual(result, 2)
self.assertEqual(len(self.async_task.call_args_list), 2)
args1, kwargs1 = self.async_task.call_args_list[0]
args2, kwargs2 = self.async_task.call_args_list[1]
self.assertEqual(kwargs1['override_title'], "test1")
self.assertEqual(kwargs1['override_filename'], "test1.pdf")
self.assertEqual(kwargs2['override_title'], "test2")
self.assertEqual(kwargs2['override_filename'], "test2.pdf")
@mock.patch("paperless_mail.mail.async_task")
def test_handle_empty_message(self, m):
message = namedtuple('MailMessage', [])
message.attachments = []
rule = MailRule()
result = handle_message(message, rule)
self.assertFalse(m.called)
self.assertEqual(result, 0)
def test_handle_mail_account_mark_read(self):
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MARK_READ)
self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 2)
handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 2)
self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 0)
def test_handle_mail_account_delete(self):
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_DELETE, filter_subject="Invoice")
self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 2)
self.assertEqual(len(self.bogus_mailbox.messages), 1)
def test_handle_mail_account_flag(self):
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_FLAG, filter_subject="Invoice")
self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2)
handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 1)
self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 1)
def test_handle_mail_account_move(self):
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="spam", filter_subject="Claim")
self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
self.assertEqual(len(self.bogus_mailbox.messages_spam), 0)
handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 1)
self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)
def test_errors(self):
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong")
try:
handle_mail_account(account)
except MailError as e:
self.assertTrue(str(e).startswith("Error while authenticating account"))
else:
self.fail("Should raise exception")
account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh")
try:
handle_mail_account(account)
except MailError as e:
self.assertTrue("uuuh does not exist" in str(e))
else:
self.fail("Should raise exception")
account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim")
try:
handle_mail_account(account)
except MailError as e:
self.assertTrue("Error while processing post-consume actions" in str(e))
else:
self.fail("Should raise exception")
def test_filters(self):
account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_DELETE, filter_subject="Claim")
self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account)
self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(self.async_task.call_count, 1)
self.reset_bogus_mailbox()
rule.filter_subject = None
rule.filter_body = "electronic"
rule.save()
self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account)
self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(self.async_task.call_count, 2)
self.reset_bogus_mailbox()
rule.filter_from = "amazon"
rule.filter_body = None
rule.save()
self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account)
self.assertEqual(len(self.bogus_mailbox.messages), 1)
self.assertEqual(self.async_task.call_count, 4)
self.reset_bogus_mailbox()
rule.filter_from = "amazon"
rule.filter_body = "cables"
rule.filter_subject = "Invoice"
rule.save()
self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account)
self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(self.async_task.call_count, 5)

View File

@ -0,0 +1,3 @@
from django.shortcuts import render
# Create your views here.

View File

@ -1,5 +1,7 @@
from django.apps import AppConfig
from paperless_tesseract.signals import tesseract_consumer_declaration
class PaperlessTesseractConfig(AppConfig):
@ -9,8 +11,6 @@ class PaperlessTesseractConfig(AppConfig):
from documents.signals import document_consumer_declaration
from .signals import ConsumerDeclaration
document_consumer_declaration.connect(ConsumerDeclaration.handle)
document_consumer_declaration.connect(tesseract_consumer_declaration)
AppConfig.ready(self)

View File

@ -2,7 +2,7 @@ import itertools
import os
import re
import subprocess
from multiprocessing.pool import Pool
from multiprocessing.pool import ThreadPool
import langdetect
import pdftotext
@ -151,7 +151,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log("info", "Running unpaper on {} pages...".format(len(pnms)))
# Run unpaper in parallel on converted images
with Pool(processes=settings.OCR_THREADS) as pool:
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
pnms = pool.map(run_unpaper, pnms)
return sorted(filter(lambda __: os.path.isfile(__), pnms))
@ -166,7 +166,7 @@ class RasterisedDocumentParser(DocumentParser):
def _ocr(self, imgs, lang):
self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
with Pool(processes=settings.OCR_THREADS) as pool:
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
return r

View File

@ -3,21 +3,16 @@ import re
from .parsers import RasterisedDocumentParser
class ConsumerDeclaration:
def tesseract_consumer_declaration(sender, **kwargs):
return {
"parser": RasterisedDocumentParser,
"weight": 0,
"test": tesseract_consumer_test
}
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
@classmethod
def handle(cls, sender, **kwargs):
return cls.test
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
@classmethod
def test(cls, doc):
if cls.MATCHING_FILES.match(doc.lower()):
return {
"parser": RasterisedDocumentParser,
"weight": 0
}
return None
def tesseract_consumer_test(doc):
return MATCHING_FILES.match(doc.lower())

View File

@ -1,6 +1,6 @@
from django.test import TestCase
from ..signals import ConsumerDeclaration
from paperless_tesseract.signals import tesseract_consumer_test
class SignalsTestCase(TestCase):
@ -20,7 +20,7 @@ class SignalsTestCase(TestCase):
for prefix in prefixes:
for suffix in suffixes:
name = "{}.{}".format(prefix, suffix)
self.assertTrue(ConsumerDeclaration.test(name))
self.assertTrue(tesseract_consumer_test(name))
def test_test_handles_various_file_names_false(self):
@ -30,7 +30,7 @@ class SignalsTestCase(TestCase):
for prefix in prefixes:
for suffix in suffixes:
name = "{}.{}".format(prefix, suffix)
self.assertFalse(ConsumerDeclaration.test(name))
self.assertFalse(tesseract_consumer_test(name))
self.assertFalse(ConsumerDeclaration.test(""))
self.assertFalse(ConsumerDeclaration.test("doc"))
self.assertFalse(tesseract_consumer_test(""))
self.assertFalse(tesseract_consumer_test("doc"))

View File

@ -1,5 +1,7 @@
from django.apps import AppConfig
from paperless_text.signals import text_consumer_declaration
class PaperlessTextConfig(AppConfig):
@ -9,8 +11,6 @@ class PaperlessTextConfig(AppConfig):
from documents.signals import document_consumer_declaration
from .signals import ConsumerDeclaration
document_consumer_declaration.connect(ConsumerDeclaration.handle)
document_consumer_declaration.connect(text_consumer_declaration)
AppConfig.ready(self)

View File

@ -3,21 +3,16 @@ import re
from .parsers import TextDocumentParser
class ConsumerDeclaration:
def text_consumer_declaration(sender, **kwargs):
return {
"parser": TextDocumentParser,
"weight": 10,
"test": text_consumer_test
}
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
@classmethod
def handle(cls, sender, **kwargs):
return cls.test
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
@classmethod
def test(cls, doc):
if cls.MATCHING_FILES.match(doc.lower()):
return {
"parser": TextDocumentParser,
"weight": 10
}
return None
def text_consumer_test(doc):
return MATCHING_FILES.match(doc.lower())

View File

@ -6,7 +6,6 @@ ignore = E501
DJANGO_SETTINGS_MODULE=paperless.settings
addopts = --pythonwarnings=all
env =
PAPERLESS_PASSPHRASE=THISISNOTASECRET
PAPERLESS_SECRET=paperless
PAPERLESS_EMAIL_SECRET=paperless
@ -15,4 +14,4 @@ env =
source =
./
omit =
*/tests
*/tests/*