Merge branch 'dev'
commit 5573a84335
Pipfile (4 changed lines)
@@ -8,6 +8,9 @@ url = "https://www.piwheels.org/simple"
 verify_ssl = true
 name = "piwheels"
 
+[requires]
+python_version = "3.6"
+
 [packages]
 dateparser = "~=0.7.6"
 django = "~=3.1.3"
@@ -35,6 +38,7 @@ scikit-learn="~=0.23.2"
 whitenoise = "~=5.2.0"
 watchdog = "*"
 whoosh="~=2.7.4"
+inotify-simple = "*"
 
 [dev-packages]
 coveralls = "*"
Pipfile.lock (generated, 50 changed lines)
@@ -1,10 +1,12 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "ae2643b9cf0cf5741ae149fb6bc0c480de41329ce48e773eb4b5d760bc5e2244"
+            "sha256": "d6432a18280c092c108e998f00bcd377c0c55ef18f26cb0b8eb64f9618b9f383"
        },
         "pipfile-spec": 6,
-        "requires": {},
+        "requires": {
+            "python_version": "3.6"
+        },
         "sources": [
             {
                 "name": "pypi",
@@ -129,6 +131,14 @@
             "index": "pypi",
             "version": "==0.32.0"
         },
+        "inotify-simple": {
+            "hashes": [
+                "sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128",
+                "sha256:854f9ac752cc1fcff6ca34e9d3d875c9a94c9b7d6eb377f63be2d481a566c6ee"
+            ],
+            "index": "pypi",
+            "version": "==1.3.5"
+        },
         "joblib": {
             "hashes": [
                 "sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72",
@@ -663,11 +673,11 @@
         },
         "faker": {
             "hashes": [
-                "sha256:3f5d379e4b5ce92a8afe3c2ce59d7c43886370dd3bf9495a936b91888debfc81",
-                "sha256:8c0e8a06acef4b9312902e2ce18becabe62badd3a6632180bd0680c6ee111473"
+                "sha256:5398268e1d751ffdb3ed36b8a790ed98659200599b368eec38a02eed15bce997",
+                "sha256:d4183b8f57316de3be27cd6c3b40e9f9343d27c95c96179f027316c58c2c239e"
             ],
             "markers": "python_version >= '3.5'",
-            "version": "==4.17.0"
+            "version": "==4.17.1"
         },
         "filelock": {
             "hashes": [
@@ -693,6 +703,22 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==1.2.0"
         },
+        "importlib-metadata": {
+            "hashes": [
+                "sha256:030f3b1bdb823ecbe4a9659e14cc861ce5af403fe99863bae173ec5fe00ab132",
+                "sha256:caeee3603f5dcf567864d1be9b839b0bcfdf1383e3e7be33ce2dead8144ff19c"
+            ],
+            "markers": "python_version < '3.8'",
+            "version": "==2.1.0"
+        },
+        "importlib-resources": {
+            "hashes": [
+                "sha256:7b51f0106c8ec564b1bef3d9c588bc694ce2b92125bbb6278f4f2f5b54ec3592",
+                "sha256:a3d34a8464ce1d5d7c92b0ea4e921e696d86f2aa212e684451cb1482c8d84ed5"
+            ],
+            "markers": "python_version < '3.7'",
+            "version": "==3.3.0"
+        },
         "iniconfig": {
             "hashes": [
                 "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
@@ -999,11 +1025,19 @@
         },
         "virtualenv": {
             "hashes": [
-                "sha256:b0011228208944ce71052987437d3843e05690b2f23d1c7da4263fde104c97a2",
-                "sha256:b8d6110f493af256a40d65e29846c69340a947669eec8ce784fcf3dd3af28380"
+                "sha256:07cff122e9d343140366055f31be4dcd61fd598c69d11cd33a9d9c8df4546dd7",
+                "sha256:e0aac7525e880a429764cefd3aaaff54afb5d9f25c82627563603f5d7de5a6e5"
             ],
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==20.1.0"
+            "version": "==20.2.1"
+        },
+        "zipp": {
+            "hashes": [
+                "sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108",
+                "sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb"
+            ],
+            "markers": "python_version < '3.8'",
+            "version": "==3.4.0"
         }
     }
 }
@@ -15,7 +15,7 @@ services:
       POSTGRES_PASSWORD: paperless
 
   webserver:
-    image: jonaswinkler/paperless-ng:0.9.2
+    image: jonaswinkler/paperless-ng:0.9.3
     restart: always
     depends_on:
       - db
@@ -5,7 +5,7 @@ services:
     restart: always
 
   webserver:
-    image: jonaswinkler/paperless-ng:0.9.2
+    image: jonaswinkler/paperless-ng:0.9.3
     restart: always
     depends_on:
       - broker
@@ -30,7 +30,7 @@ Options available to docker installations:
 Paperless uses 3 volumes:
 
 * ``paperless_media``: This is where your documents are stored.
-* ``paperless_data``: This is where auxilliary data is stored. This
+* ``paperless_data``: This is where auxillary data is stored. This
   folder also contains the SQLite database, if you use it.
 * ``paperless_pgdata``: Exists only if you use PostgreSQL and contains
   the database.
@@ -109,7 +109,7 @@ B. If you built the image yourself, grab the new archive and replace your curre
 .. hint::
 
     You can usually keep your ``docker-compose.env`` file, since this file will
-    never include mandantory configuration options. However, it is worth checking
+    never include mandatory configuration options. However, it is worth checking
     out the new version of this file, since it might have new recommendations
     on what to configure.
@@ -126,8 +126,8 @@ After grabbing the new release and unpacking the contents, do the following:
 
     $ pip install --upgrade pipenv
     $ cd /path/to/paperless
-    $ pipenv install
     $ pipenv clean
+    $ pipenv install
 
 This creates a new virtual environment (or uses your existing environment)
 and installs all dependencies into it.
@@ -247,12 +247,12 @@ your already processed documents.
 
 When multiple document types or correspondents match a single document,
 the retagger won't assign these to the document. Specify ``--use-first``
-to override this behaviour and just use the first correspondent or type
+to override this behavior and just use the first correspondent or type
 it finds. This option does not apply to tags, since any amount of tags
 can be applied to a document.
 
 Finally, ``-f`` specifies that you wish to overwrite already assigned
-correspondents, types and/or tags. The default behaviour is to not
+correspondents, types and/or tags. The default behavior is to not
 assign correspondents and types to documents that have this data already
 assigned. ``-f`` works differently for tags: By default, only additional tags get
 added to documents, no tags will be removed. With ``-f``, tags that don't
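The merge semantics in this hunk are easy to misread, so here is a minimal
Python sketch of the documented rules (illustrative only, not the actual
document_retagger code; the dict-based document is an assumption):

.. code:: python

    # Sketch of the retagger rules: ambiguous correspondent matches are
    # skipped unless --use-first; existing assignments are kept unless -f;
    # tags are only ever added unless -f is given.
    def apply_matches(doc, matched_correspondents, matched_tags,
                      use_first=False, force=False):
        if len(matched_correspondents) == 1 or (
                matched_correspondents and use_first):
            candidate = matched_correspondents[0]
        else:
            candidate = None  # multiple matches and no --use-first: skip

        if candidate and (force or doc.get("correspondent") is None):
            doc["correspondent"] = candidate

        if force:
            doc["tags"] = set(matched_tags)      # stale tags are removed
        else:
            doc["tags"] = set(doc.get("tags", ())) | set(matched_tags)
        return doc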
@@ -341,7 +341,7 @@ Documents can be stored in Paperless using GnuPG encryption.
 
 .. danger::
 
-    Encryption is depreceated since paperless-ng 0.9 and doesn't really provide any
+    Encryption is deprecated since paperless-ng 0.9 and doesn't really provide any
     additional security, since you have to store the passphrase in a configuration
     file on the same system as the encrypted documents for paperless to work.
     Furthermore, the entire text content of the documents is stored plain in the
@@ -353,39 +353,23 @@ Documents can be stored in Paperless using GnuPG encryption.
     Consider running paperless on an encrypted filesystem instead, which will then
     at least provide security against physical hardware theft.
 
-.. code::
-
-    change_storage_type [--passphrase PASSPHRASE] {gpg,unencrypted} {gpg,unencrypted}
-
-    positional arguments:
-      {gpg,unencrypted}     The state you want to change your documents from
-      {gpg,unencrypted}     The state you want to change your documents to
-
-    optional arguments:
-      --passphrase PASSPHRASE
-
 Enabling encryption
 -------------------
 
-Basic usage to enable encryption of your document store (**USE A MORE SECURE PASSPHRASE**):
-
-(Note: If ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
-
-.. code::
-
-    change_storage_type [--passphrase SECR3TP4SSPHRA$E] unencrypted gpg
+Enabling encryption is no longer supported.
 
 
 Disabling encryption
 --------------------
 
-Basic usage to enable encryption of your document store:
+Basic usage to disable encryption of your document store:
 
-(Note: Again, if ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
+(Note: If ``PAPERLESS_PASSPHRASE`` isn't set already, you need to specify it here)
 
 .. code::
 
-    change_storage_type [--passphrase SECR3TP4SSPHRA$E] gpg unencrypted
+    decrypt_documents [--passphrase SECR3TP4SSPHRA$E]
 
 
 .. _Pipenv: https://pipenv.pypa.io/en/latest/
@@ -84,6 +84,8 @@ to the filename.
 
     PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
 
+
+.. _advanced-matching:
 
 Matching tags, correspondents and document types
 ################################################
@@ -145,7 +147,9 @@ America are tagged with the tag "bofa_123" and the matching algorithm of this
 tag is set to *Auto*, this neural network will examine your documents and
 automatically learn when to assign this tag.
 
-There are a couple caveats you need to keep in mind when using this feature:
+Paperless tries to hide much of the involved complexity with this approach.
+However, there are a couple caveats you need to keep in mind when using this
+feature:
 
 * Changes to your documents are not immediately reflected by the matching
   algorithm. The neural network needs to be *trained* on your documents after
@@ -165,6 +169,11 @@ There are a couple caveats you need to keep in mind when using this feature:
   has the correspondent "Very obscure web shop I bought something five years
   ago", it will probably not assign this correspondent automatically if you buy
   something from them again. The more documents, the better.
+* Paperless also needs a reasonable amount of negative examples to decide when
+  not to assign a certain tag, correspondent or type. This will usually be the
+  case as you start filling up paperless with documents. Example: If all your
+  documents are either from "Webshop" and "Bank", paperless will assign one of
+  these correspondents to ANY new document, if both are set to automatic matching.
 
 Hooking into the consumption process
 ####################################
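The negative-examples caveat added above is a general property of classifiers:
a model that has only ever seen the classes "Webshop" and "Bank" must answer
one of the two for every input. A small scikit-learn sketch of that failure
mode (illustrative data, not paperless code):

.. code:: python

    # With only two correspondents in the training set, every new document
    # is forced into one of them -- there is no "neither" class.
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.neural_network import MLPClassifier

    docs = ["your order has shipped", "invoice for your order",
            "your account statement", "interest rate notice"]
    labels = ["Webshop", "Webshop", "Bank", "Bank"]

    vectorizer = CountVectorizer()
    clf = MLPClassifier(tol=0.01, max_iter=2000).fit(
        vectorizer.fit_transform(docs), labels)

    # A dentist's bill is unrelated to both, but still gets a label:
    print(clf.predict(vectorizer.transform(["dental cleaning appointment"])))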
@@ -253,7 +262,7 @@ By default, paperless stores your documents in the media directory and renames t
 using the identifier which it has assigned to each document. You will end up getting
 files like ``0000123.pdf`` in your media directory. This isn't necessarily a bad
 thing, because you normally don't have to access these files manually. However, if
-you wish to name your files differently, you can do that by adjustng the
+you wish to name your files differently, you can do that by adjusting the
 ``PAPERLESS_FILENAME_FORMAT`` settings variable.
 
 This variable allows you to configure the filename (folders are allowed!) using
@@ -278,7 +287,7 @@ will create a directory structure as follows:
         my_new_shoes-0000004.pdf
 
 Paperless appends the unique identifier of each document to the filename. This
-avoides filename clashes.
+avoids filename clashes.
 
 .. danger::
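The formatting itself is plain template substitution. A rough sketch of the
naming scheme this section describes (the real logic lives in paperless' file
handling module; the field names here are illustrative):

.. code:: python

    # PAPERLESS_FILENAME_FORMAT-style naming: the unique document id is
    # appended so that identical metadata can never produce the same path.
    def format_filename(fmt, doc):
        stem = fmt.format(correspondent=doc["correspondent"],
                          title=doc["title"])
        return f"{stem}-{doc['id']:07}.pdf"

    doc = {"id": 4, "correspondent": "my_shoe_shop", "title": "my_new_shoes"}
    print(format_filename("{correspondent}/{title}", doc))
    # my_shoe_shop/my_new_shoes-0000004.pdf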
@@ -94,7 +94,7 @@ Result object:
     }
 
 * ``id``: the primary key of the found document
-* ``highlights``: an object containing parseable highlights for the result.
+* ``highlights``: an object containing parsable highlights for the result.
   See below.
 * ``score``: The score assigned to the document. A higher score indicates a
   better match with the query. Search results are sorted descending by score.
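A sketch of consuming these fields from the search endpoint (the
``/api/search/`` path and the BasicAuth credentials are assumptions for
illustration, not verified against this exact version):

.. code:: python

    # Read id, score and highlights from a paperless-ng search result.
    import requests

    resp = requests.get("http://localhost:8000/api/search/",
                        params={"query": "invoice"},
                        auth=("paperless", "paperless"))
    resp.raise_for_status()

    for result in resp.json()["results"]:
        print(result["id"], result["score"])
        for fragment in result["highlights"]:
            # each fragment is parsable data for rendering a snippet
            print("  ", fragment)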
@@ -5,6 +5,24 @@
 Changelog
 *********
 
+paperless-ng 0.9.3
+##################
+
+* Setting ``PAPERLESS_AUTO_LOGIN_USERNAME`` replaces ``PAPERLESS_DISABLE_LOGIN``.
+  You have to specify your username.
+* Added a simple sanity checker that checks your documents for missing or orphaned files,
+  files with wrong checksums, inaccessible files, and documents with empty content.
+* It is no longer possible to encrypt your documents. For the time being, paperless will
+  continue to operate with already encrypted documents.
+* Fixes:
+
+  * Paperless now uses inotify again, since the watchdog was causing issues which I was not
+    aware of.
+  * Issue with the automatic classifier not working with only one tag.
+  * A couple issues with the search index being opened too eagerly.
+
+* Added lots of tests for various parts of the application.
+
 paperless-ng 0.9.2
 ##################
@@ -52,7 +70,7 @@ paperless-ng 0.9.0
 * **Added:** New frontend. Features:
 
   * Single page application: It's much more responsive than the django admin pages.
-  * Dashboard. Shows recently scanned documents, or todos, or other documents
+  * Dashboard. Shows recently scanned documents, or todo notes, or other documents
     at wish. Allows uploading of documents. Shows basic statistics.
   * Better document list with multiple display options.
   * Full text search with result highlighting, auto completion and scoring based
@@ -102,7 +120,7 @@ paperless-ng 0.9.0
 
 * **Modified [breaking]:** PostgreSQL:
 
-  * If ``PAPERLESS_DBHOST`` is specified in the settings, paperless uses postgresql instead of sqlite.
+  * If ``PAPERLESS_DBHOST`` is specified in the settings, paperless uses PostgreSQL instead of SQLite.
     Username, database and password all default to ``paperless`` if not specified.
 
 * **Modified [breaking]:** document_retagger management command rework. See
@@ -130,7 +148,7 @@ paperless-ng 0.9.0
   Certain language specifics such as umlauts may not get picked up properly.
 * ``PAPERLESS_DEBUG`` defaults to ``false``.
 * The presence of ``PAPERLESS_DBHOST`` now determines whether to use PostgreSQL or
-  sqlite.
+  SQLite.
 * ``PAPERLESS_OCR_THREADS`` is gone and replaced with ``PAPERLESS_TASK_WORKERS`` and
   ``PAPERLESS_THREADS_PER_WORKER``. Refer to the config example for details.
 * ``PAPERLESS_OPTIMIZE_THUMBNAILS`` allows you to disable or enable thumbnail
@@ -138,8 +156,11 @@ paperless-ng 0.9.0
 
 * Many more small changes here and there. The usual stuff.
 
+Paperless
+#########
+
 2.7.0
-#####
+=====
 
 * `syntonym`_ submitted a pull request to catch IMAP connection errors `#475`_.
 * `Stéphane Brunner`_ added ``psycopg2`` to the Pipfile `#489`_. He also fixed
@@ -156,7 +177,7 @@ paperless-ng 0.9.0
 
 
 2.6.1
-#####
+=====
 
 * We now have a logo, complete with a favicon :-)
 * Removed some problematic tests.
@@ -168,7 +189,7 @@ paperless-ng 0.9.0
 
 
 2.6.0
-#####
+=====
 
 * Allow an infinite number of logs to be deleted. Thanks to `Ulli`_ for noting
   the problem in `#433`_.
@@ -189,7 +210,7 @@ paperless-ng 0.9.0
 
 
 2.5.0
-#####
+=====
 
 * **New dependency**: Paperless now optimises thumbnail generation with
   `optipng`_, so you'll need to install that somewhere in your PATH or declare
@@ -233,7 +254,7 @@ paperless-ng 0.9.0
 
 
 2.4.0
-#####
+=====
 
 * A new set of actions are now available thanks to `jonaswinkler`_'s very first
   pull request! You can now do nifty things like tag documents in bulk, or set
@@ -254,7 +275,7 @@ paperless-ng 0.9.0
 
 
 2.3.0
-#####
+=====
 
 * Support for consuming plain text & markdown documents was added by
   `Joshua Taillon`_! This was a long-requested feature, and it's addition is
@@ -272,14 +293,14 @@ paperless-ng 0.9.0
 
 
 2.2.1
-#####
+=====
 
 * `Kyle Lucy`_ reported a bug quickly after the release of 2.2.0 where we broke
   the ``DISABLE_LOGIN`` feature: `#392`_.
 
 
 2.2.0
-#####
+=====
 
 * Thanks to `dadosch`_, `Wolfgang Mader`_, and `Tim Brooks`_ this is the first
   version of Paperless that supports Django 2.0! As a result of their hard
@@ -296,7 +317,7 @@ paperless-ng 0.9.0
 
 
 2.1.0
-#####
+=====
 
 * `Enno Lohmeier`_ added three simple features that make Paperless a lot more
   user (and developer) friendly:
@@ -315,7 +336,7 @@ paperless-ng 0.9.0
 
 
 2.0.0
-#####
+=====
 
 This is a big release as we've changed a core-functionality of Paperless: we no
 longer encrypt files with GPG by default.
@@ -347,7 +368,7 @@ Special thanks to `erikarvstedt`_, `matthewmoto`_, and `mcronce`_ who did the
 bulk of the work on this big change.
 
 1.4.0
-#####
+=====
 
 * `Quentin Dawans`_ has refactored the document consumer to allow for some
   command-line options. Notably, you can now direct it to consume from a
@@ -382,7 +403,7 @@ bulk of the work on this big change.
   to some excellent work from `erikarvstedt`_ on `#351`_
 
 1.3.0
-#####
+=====
 
 * You can now run Paperless without a login, though you'll still have to create
   at least one user. This is thanks to a pull-request from `matthewmoto`_:
@@ -405,7 +426,7 @@ bulk of the work on this big change.
   problem and helping me find where to fix it.
 
 1.2.0
-#####
+=====
 
 * New Docker image, now based on Alpine, thanks to the efforts of `addadi`_
   and `Pit`_. This new image is dramatically smaller than the Debian-based
@@ -424,7 +445,7 @@ bulk of the work on this big change.
   in the document text.
 
 1.1.0
-#####
+=====
 
 * Fix for `#283`_, a redirect bug which broke interactions with
   paperless-desktop. Thanks to `chris-aeviator`_ for reporting it.
@@ -434,7 +455,7 @@ bulk of the work on this big change.
   `Dan Panzarella`_
 
 1.0.0
-#####
+=====
 
 * Upgrade to Django 1.11. **You'll need to run
   ``pip install -r requirements.txt`` after the usual ``git pull`` to
@@ -453,14 +474,14 @@ bulk of the work on this big change.
   `Lukas Winkler`_'s issue `#278`_
 
 0.8.0
-#####
+=====
 
 * Paperless can now run in a subdirectory on a host (``/paperless``), rather
   than always running in the root (``/``) thanks to `maphy-psd`_'s work on
   `#255`_.
 
 0.7.0
-#####
+=====
 
 * **Potentially breaking change**: As per `#235`_, Paperless will no longer
   automatically delete documents attached to correspondents when those
@@ -472,7 +493,7 @@ bulk of the work on this big change.
   `Kusti Skytén`_ for posting the correct solution in the Github issue.
 
 0.6.0
-#####
+=====
 
 * Abandon the shared-secret trick we were using for the POST API in favour
   of BasicAuth or Django session.
@@ -486,7 +507,7 @@ bulk of the work on this big change.
   the help with this feature.
 
 0.5.0
-#####
+=====
 
 * Support for fuzzy matching in the auto-tagger & auto-correspondent systems
   thanks to `Jake Gysland`_'s patch `#220`_.
@@ -504,13 +525,13 @@ bulk of the work on this big change.
 * Amended the Django Admin configuration to have nice headers (`#230`_)
 
 0.4.1
-#####
+=====
 
 * Fix for `#206`_ wherein the pluggable parser didn't recognise files with
   all-caps suffixes like ``.PDF``
 
 0.4.0
-#####
+=====
 
 * Introducing reminders. See `#199`_ for more information, but the short
   explanation is that you can now attach simple notes & times to documents
@@ -520,7 +541,7 @@ bulk of the work on this big change.
   like to make use of this feature in his project.
 
 0.3.6
-#####
+=====
 
 * Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
   correspondent or the tags for a document.
@@ -534,7 +555,7 @@ bulk of the work on this big change.
   documentation is on its way.
 
 0.3.5
-#####
+=====
 
 * A serious facelift for the documents listing page wherein we drop the
   tabular layout in favour of a tiled interface.
@@ -545,7 +566,7 @@ bulk of the work on this big change.
   consumption.
 
 0.3.4
-#####
+=====
 
 * Removal of django-suit due to a licensing conflict I bumped into in 0.3.3.
   Note that you *can* use Django Suit with Paperless, but only in a
@@ -558,26 +579,26 @@ bulk of the work on this big change.
   API thanks to @thomasbrueggemann. See `#179`_.
 
 0.3.3
-#####
+=====
 
 * Thumbnails in the UI and a Django-suit -based face-lift courtesy of @ekw!
 * Timezone, items per page, and default language are now all configurable,
   also thanks to @ekw.
 
 0.3.2
-#####
+=====
 
 * Fix for `#172`_: defaulting ALLOWED_HOSTS to ``["*"]`` and allowing the
   user to set her own value via ``PAPERLESS_ALLOWED_HOSTS`` should the need
   arise.
 
 0.3.1
-#####
+=====
 
 * Added a default value for ``CONVERT_BINARY``
 
 0.3.0
-#####
+=====
 
 * Updated to using django-filter 1.x
 * Added some system checks so new users aren't confused by misconfigurations.
@@ -590,7 +611,7 @@ bulk of the work on this big change.
   ``PAPERLESS_SHARED_SECRET`` respectively instead.
 
 0.2.0
-#####
+=====
 
 * `#150`_: The media root is now a variable you can set in
   ``paperless.conf``.
@@ -618,7 +639,7 @@ bulk of the work on this big change.
   to `Martin Honermeyer`_ and `Tim White`_ for working with me on this.
 
 0.1.1
-#####
+=====
 
 * Potentially **Breaking Change**: All references to "sender" in the code
   have been renamed to "correspondent" to better reflect the nature of the
@@ -642,7 +663,7 @@ bulk of the work on this big change.
   to be imported but made unavailable.
 
 0.1.0
-#####
+=====
 
 * Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and
   `Tikitu de Jager`_ for this one, and especially to `Pit`_
@@ -661,14 +682,14 @@ bulk of the work on this big change.
 * Added tox with pep8 checking
 
 0.0.6
-#####
+=====
 
 * Added support for parallel OCR (significant work from `Pit`_)
 * Sped up the language detection (significant work from `Pit`_)
 * Added simple logging
 
 0.0.5
-#####
+=====
 
 * Added support for image files as documents (png, jpg, gif, tiff)
 * Added a crude means of HTTP POST for document imports
@@ -677,7 +698,7 @@ bulk of the work on this big change.
 * Documentation for the above as well as data migration
 
 0.0.4
-#####
+=====
 
 * Added automated tagging basted on keyword matching
 * Cleaned up the document listing page
@@ -685,19 +706,19 @@ bulk of the work on this big change.
 * Added ``pytz`` to the list of requirements
 
 0.0.3
-#####
+=====
 
 * Added basic tagging
 
 0.0.2
-#####
+=====
 
 * Added language detection
 * Added datestamps to ``document_exporter``.
 * Changed ``settings.TESSERACT_LANGUAGE`` to ``settings.OCR_LANGUAGE``.
 
 0.0.1
-#####
+=====
 
 * Initial release
@@ -35,22 +35,22 @@ PAPERLESS_DBHOST=<hostname>
 
 PAPERLESS_DBPORT=<port>
     Adjust port if necessary.
 
     Default is 5432.
 
 PAPERLESS_DBNAME=<name>
     Database name in PostgreSQL.
 
     Defaults to "paperless".
 
 PAPERLESS_DBUSER=<name>
     Database user in PostgreSQL.
 
     Defaults to "paperless".
 
 PAPERLESS_DBPASS=<password>
    Database password for PostgreSQL.
 
     Defaults to "paperless".
@@ -69,7 +69,7 @@ PAPERLESS_CONSUMPTION_DIR=<path>
     Defaults to "../consume", relative to the "src" directory.
 
 PAPERLESS_DATA_DIR=<path>
-    This is where paperless stores all its data (search index, sqlite database,
+    This is where paperless stores all its data (search index, SQLite database,
     classification model, etc).
 
     Defaults to "../data", relative to the "src" directory.
@@ -100,7 +100,7 @@ Hosting & Security
 ##################
 
 PAPERLESS_SECRET_KEY=<key>
-    Paperless uses this to make session tokens. If you exose paperless on the
+    Paperless uses this to make session tokens. If you expose paperless on the
     internet, you need to change this, since the default secret is well known.
 
     Use any sequence of characters. The more, the better. You don't need to
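One way to generate such a key (a generic Python one-liner, not something
paperless ships):

.. code:: python

    # Print a long random secret suitable for PAPERLESS_SECRET_KEY.
    import secrets
    print(secrets.token_urlsafe(64))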
@@ -113,7 +113,7 @@ PAPERLESS_ALLOWED_HOSTS<comma-separated-list>
     really should set this value to the domain name you're using. Failing to do
     so leaves you open to HTTP host header attacks:
     https://docs.djangoproject.com/en/3.1/topics/security/#host-header-validation
 
     Just remember that this is a comma-separated list, so "example.com" is fine,
     as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
@@ -132,15 +132,25 @@ PAPERLESS_FORCE_SCRIPT_NAME=<path>
     .. note::
 
         I don't know if this works in paperless-ng. Probably not.
 
     Defaults to none, which hosts paperless at "/".
 
 PAPERLESS_STATIC_URL=<path>
     Override the STATIC_URL here. Unless you're hosting Paperless off a
     subdomain like /paperless/, you probably don't need to change this.
 
     Defaults to "/static/".
 
+PAPERLESS_AUTO_LOGIN_USERNAME=<username>
+    Specify a username here so that paperless will automatically perform login
+    with the selected user.
+
+    .. danger::
+
+        Do not use this when exposing paperless on the internet. There are no
+        checks in place that would prevent you from doing this.
+
+    Defaults to none, which disables this feature.
+
 Software tweaks
 ###############
@@ -156,11 +166,11 @@ PAPERLESS_THREADS_PER_WORKER=<num>
     in parallel on a single document.
 
     .. caution::
 
        Ensure that the product
 
            PAPERLESS_TASK_WORKERS * PAPERLESS_THREADS_PER_WORKER
 
        does not exceed your CPU core count or else paperless will be extremely slow.
 
    If you want paperless to process many documents in parallel, choose a high
    worker count. If you want paperless to process very large documents faster,
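A sketch of picking values that respect the constraint in the caution above
(illustrative; paperless computes its own defaults):

.. code:: python

    # Choose worker/thread counts so that workers * threads <= CPU cores.
    import multiprocessing

    cores = multiprocessing.cpu_count()
    workers = max(1, cores // 2)              # bias towards parallel documents
    threads_per_worker = max(1, cores // workers)
    assert workers * threads_per_worker <= cores
    print(f"PAPERLESS_TASK_WORKERS={workers}")
    print(f"PAPERLESS_THREADS_PER_WORKER={threads_per_worker}")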
@@ -197,10 +207,10 @@ PAPERLESS_OCR_PAGES=<num>
 PAPERLESS_OCR_LANGUAGE=<lang>
     Customize the default language that tesseract will attempt to use when
     parsing documents. The default language is used whenever
 
     * No language could be detected on a document
     * No tesseract data files are available for the detected language
 
     It should be a 3-letter language code consistent with ISO
     639: https://www.loc.gov/standards/iso639-2/php/code_list.php
@@ -220,7 +230,7 @@ PAPERLESS_CONSUMER_POLLING=<num>
     specify a polling interval in seconds here, which will then cause paperless
     to periodically check your consumption directory for changes.
 
-    Defaults to 0, which disables polling and uses filesystem notifiactions.
+    Defaults to 0, which disables polling and uses filesystem notifications.
 
 PAPERLESS_CONSUMER_DELETE_DUPLICATES=<bool>
     When the consumer detects a duplicate document, it will not touch the
@@ -234,7 +244,7 @@ PAPERLESS_CONVERT_MEMORY_LIMIT=<num>
     such cases, try setting this to a reasonably low value, like 32. The
     default is to use whatever is necessary to do everything without writing to
     disk, and units are in megabytes.
 
     For more information on how to use this value, you should search
     the web for "MAGICK_MEMORY_LIMIT".
@@ -245,7 +255,7 @@ PAPERLESS_CONVERT_TMPDIR=<path>
     /tmp as tmpfs, you should set this to a path that's on a physical disk, like
     /home/your_user/tmp or something. ImageMagick will use this as scratch space
     when crunching through very large documents.
 
     For more information on how to use this value, you should search
     the web for "MAGICK_TMPDIR".
@@ -264,7 +274,7 @@ PAPERLESS_CONVERT_DENSITY=<num>
     Default is 300.
 
 PAPERLESS_OPTIMIZE_THUMBNAILS=<bool>
-    Use optipng to optimize thumbnails. This usually reduces the sice of
+    Use optipng to optimize thumbnails. This usually reduces the size of
     thumbnails by about 20%, but uses considerable compute time during
     consumption.
@@ -282,7 +292,7 @@ PAPERLESS_FILENAME_DATE_ORDER=<format>
     Use this setting to enable checking the document filename for date
     information. The date order can be set to any option as specified in
     https://dateparser.readthedocs.io/en/latest/settings.html#date-order.
-    The filename will be checked first, and if nothing is found, the document
+    The filename will be checked first, and if nothing is found, the document
     text will be checked as normal.
 
     Defaults to none, which disables this feature.
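How the date order resolves ambiguous dates, using the same `dateparser`
library paperless already depends on (see the Pipfile above):

.. code:: python

    # DATE_ORDER disambiguates purely numeric dates such as 02.03.2020.
    import dateparser

    print(dateparser.parse("02.03.2020", settings={"DATE_ORDER": "DMY"}))
    # 2020-03-02 00:00:00  (2 March)
    print(dateparser.parse("02.03.2020", settings={"DATE_ORDER": "MDY"}))
    # 2020-02-03 00:00:00  (3 February)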
@@ -85,7 +85,7 @@ quoted, or triple-quoted string will do:
     problematic_string = 'This is a "string" with "quotes" in it'
 
 In HTML templates, please use double-quotes for tag attributes, and single
-quotes for arguments passed to Django tempalte tags:
+quotes for arguments passed to Django template tags:
 
 .. code:: html
@@ -17,7 +17,7 @@ is
 
 .. caution::
 
-    Dont mess with this folder. Don't change permissions and don't move
+    Do not mess with this folder. Don't change permissions and don't move
     files around manually. This folder is meant to be entirely managed by docker
     and paperless.
@@ -36,9 +36,9 @@ file extensions do not matter.
 
 **A:** The short answer is yes. I've tested it on a Raspberry Pi 3 B.
 The long answer is that certain parts of
-Paperless will run very slow, such as the tesseract OCR. On Rasperry Pi,
+Paperless will run very slow, such as the tesseract OCR. On Raspberry Pi,
 try to OCR documents before feeding them into paperless so that paperless can
-reuse the text. The web interface should be alot snappier, since it runs
+reuse the text. The web interface should be a lot snappier, since it runs
 in your browser and paperless has to do much less work to serve the data.
 
 .. note::
@@ -8,7 +8,7 @@ Scanner recommendations
 As Paperless operates by watching a folder for new files, doesn't care what
 scanner you use, but sometimes finding a scanner that will write to an FTP,
 NFS, or SMB server can be difficult. This page is here to help you find one
-that works right for you based on recommentations from other Paperless users.
+that works right for you based on recommendations from other Paperless users.
 
 +---------+----------------+-----+-----+-----+----------------+
 | Brand   | Model          | Supports        | Recommended By |
@@ -21,7 +21,7 @@ Extensive filtering mechanisms:
 
 .. image:: _static/screenshots/documents-filter.png
 
-Side-by-side editing of documents. Optmized for 1080p.
+Side-by-side editing of documents. Optimized for 1080p.
 
 .. image:: _static/screenshots/editing.png
@@ -85,7 +85,7 @@ Paperless consists of the following components:
     needs to do from time to time in order to operate properly.
 
     This allows paperless to process multiple documents from your consumption folder in parallel! On
-    a modern multicore system, consumption with full ocr is blazing fast.
+    a modern multi core system, consumption with full ocr is blazing fast.
 
     The task processor comes with a built-in admin interface that you can use to see whenever any of the
     tasks fail and inspect the errors (i.e., wrong email credentials, errors during consuming a specific
@@ -265,15 +265,17 @@ Migration to paperless-ng is then performed in a few simple steps:
    ``docker-compose.env`` to your needs.
    See `docker route`_ for details on which edits are advised.
 
-6. Start paperless-ng.
+6. In order to find your existing documents with the new search feature, you need
+   to invoke a one-time operation that will create the search index:
 
-   .. code:: bash
+   .. code:: shell-session
 
-      $ docker-compose up
+      $ docker-compose run --rm webserver document_index reindex
 
+   This will migrate your database and create the search index. After that,
+   paperless will take care of maintaining the index by itself.
+
-   If you see everything working (you should see some migrations getting
-   applied, for instance), you can gracefully stop paperless-ng with Ctrl-C
-   and then start paperless-ng as usual with
+7. Start paperless-ng.
 
    .. code:: bash
@@ -281,11 +283,11 @@ Migration to paperless-ng is then performed in a few simple steps:
 
    This will run paperless in the background and automatically start it on system boot.
 
-7. Paperless installed a permanent redirect to ``admin/`` in your browser. This
+8. Paperless installed a permanent redirect to ``admin/`` in your browser. This
    redirect is still in place and prevents access to the new UI. Clear
    browsing cache in order to fix this.
 
-8. Optionally, follow the instructions below to migrate your existing data to PostgreSQL.
+9. Optionally, follow the instructions below to migrate your existing data to PostgreSQL.
 
 
 .. _setup-sqlite_to_psql:
@@ -322,7 +324,7 @@ management commands as below.
       $ cd /path/to/paperless
       $ docker-compose run --rm webserver /bin/bash
 
-   This will lauch the container and initialize the PostgreSQL database.
+   This will launch the container and initialize the PostgreSQL database.
 
    b) Without docker, open a shell in your virtual environment, switch to
      the ``src`` directory and create the database schema:
@@ -357,6 +359,35 @@ management commands as below.
 7. Start paperless.
 
 
+Moving back to paperless
+========================
+
+Let's say you migrated to Paperless-ng and used it for a while, but decided that
+you don't like it and want to move back (If you do, send me a mail about what
+part you didn't like!), you can totally do that with a few simple steps.
+
+Paperless-ng modified the database schema slightly, however, these changes can
+be reverted while keeping your current data, so that your current data will
+be compatible with original Paperless.
+
+Execute this:
+
+.. code:: shell-session
+
+   $ cd /path/to/paperless
+   $ docker-compose run --rm webserver migrate documents 0023
+
+Or without docker:
+
+.. code:: shell-session
+
+   $ cd /path/to/paperless/src
+   $ python3 manage.py migrate documents 0023
+
+After that, you need to clear your cookies (Paperless-ng comes with updated
+dependencies that do cookie-processing differently) and probably your cache
+as well.
+
 .. _setup-less_powerful_devices:
@@ -372,7 +403,7 @@ configuring some options in paperless can help improve performance immensely:
 * ``PAPERLESS_TASK_WORKERS`` and ``PAPERLESS_THREADS_PER_WORKER`` are configured
   to use all cores. The Raspberry Pi models 3 and up have 4 cores, meaning that
   paperless will use 2 workers and 2 threads per worker. This may result in
-  slugish response times during consumption, so you might want to lower these
+  sluggish response times during consumption, so you might want to lower these
   settings (example: 2 workers and 1 thread to always have some computing power
   left for other tasks).
 * Keep ``PAPERLESS_OCR_ALWAYS`` at its default value 'false' and consider OCR'ing
@@ -5,13 +5,13 @@ Usage Overview
 Paperless is an application that manages your personal documents. With
 the help of a document scanner (see :ref:`scanners`), paperless transforms
 your wieldy physical document binders into a searchable archive and
-provices many utilities for finding and managing your documents.
+provides many utilities for finding and managing your documents.
 
 
 Terms and definitions
 #####################
 
-Paperless esentially consists of two different parts for managing your
+Paperless essentially consists of two different parts for managing your
 documents:
 
 * The *consumer* watches a specified folder and adds all documents in that
@@ -30,12 +30,12 @@ Each document has a couple of fields that you can assign to them:
   tag, however, a single document can also have multiple tags. This is not
   possible with folders. The reason folders are not implemented in paperless
   is simply that tags are much more versatile than folders.
-* A *document type* is used to demarkate the type of a document such as letter,
+* A *document type* is used to demarcate the type of a document such as letter,
   bank statement, invoice, contract, etc. It is used to identify what a document
   is about.
 * The *date added* of a document is the date the document was scanned into
   paperless. You cannot and should not change this date.
-* The *date created* of a document is the date the document was intially issued.
+* The *date created* of a document is the date the document was initially issued.
   This can be the date you bought a product, the date you signed a contract, or
   the date a letter was sent to you.
 * The *archive serial number* (short: ASN) of a document is the identifier of
@@ -131,7 +131,7 @@ These are as follows:
 
 With the correct set of rules, you can completely automate your email documents.
 Create rules for every correspondent you receive digital documents from and
-paperless will read them automatically. The default acion "mark as read" is
+paperless will read them automatically. The default action "mark as read" is
 pretty tame and will not cause any damage or data loss whatsoever.
 
 You can also setup a special folder in your mail account for paperless and use
@@ -182,7 +182,7 @@ Processing of the physical documents
 ====================================
 
 Keep a physical inbox. Whenever you receive a document that you need to
-archive, put it into your inbox. Regulary, do the following for all documents
+archive, put it into your inbox. Regularly, do the following for all documents
 in your inbox:
 
 1. For each document, decide if you need to keep the document in physical
@@ -217,18 +217,24 @@ Once you have scanned in a document, proceed in paperless as follows.
 
 1. If the document has an ASN, assign the ASN to the document.
 2. Assign a correspondent to the document (i.e., your employer, bank, etc)
-   This isnt strictly necessary but helps in finding a document when you need
+   This isn't strictly necessary but helps in finding a document when you need
    it.
 3. Assign a document type (i.e., invoice, bank statement, etc) to the document
-   This isnt strictly necessary but helps in finding a document when you need
+   This isn't strictly necessary but helps in finding a document when you need
   it.
 4. Assign a proper title to the document (the name of an item you bought, the
   subject of the letter, etc)
-5. Check that the date of the document is corrent. Paperless tries to read
+5. Check that the date of the document is correct. Paperless tries to read
   the date from the content of the document, but this fails sometimes if the
   OCR is bad or multiple dates appear on the document.
 6. Remove inbox tags from the documents.
 
+.. hint::
+
+   You can setup manual matching rules for your correspondents and tags and
+   paperless will assign them automatically. After consuming a couple documents,
+   you can even ask paperless to *learn* when to assign tags and correspondents
+   by itself. For details on this feature, see :ref:`advanced-matching`.
+
 Task management
 ===============
@@ -29,6 +29,7 @@
 #PAPERLESS_CORS_ALLOWED_HOSTS=localhost:8080,example.com,localhost:8000
 #PAPERLESS_FORCE_SCRIPT_NAME=
 #PAPERLESS_STATIC_URL=/static/
+#PAPERLESS_AUTO_LOGIN_USERNAME=
 
 # Software tweaks
@@ -1,5 +1,19 @@
 #!/bin/bash
 
+# Release checklist
+# - wait for travis build.
+# adjust src/paperless/version.py
+# changelog in the documentation
+# adjust versions in docker/hub/*
+# If docker-compose was modified: all compose files are the same.
+
+# Steps:
+# run release script "dev", push
+# if it works: new tag, merge into master
+# on master: make release "lastest", push
+# on master: make release "version-tag", push
+# publish release files
+
 set -e
@@ -23,7 +23,7 @@ import { TagEditDialogComponent } from './components/manage/tag-list/tag-edit-di
 import { DocumentTypeEditDialogComponent } from './components/manage/document-type-list/document-type-edit-dialog/document-type-edit-dialog.component';
 import { TagComponent } from './components/common/tag/tag.component';
 import { SearchComponent } from './components/search/search.component';
-import { ResultHightlightComponent } from './components/search/result-hightlight/result-hightlight.component';
+import { ResultHighlightComponent } from './components/search/result-highlight/result-highlight.component';
 import { PageHeaderComponent } from './components/common/page-header/page-header.component';
 import { AppFrameComponent } from './components/app-frame/app-frame.component';
 import { ToastsComponent } from './components/common/toasts/toasts.component';
@@ -65,7 +65,7 @@ import { WidgetFrameComponent } from './components/dashboard/widgets/widget-fram
     DocumentTypeEditDialogComponent,
     TagComponent,
     SearchComponent,
-    ResultHightlightComponent,
+    ResultHighlightComponent,
     PageHeaderComponent,
     AppFrameComponent,
     ToastsComponent,
@@ -11,7 +11,7 @@
       <h5 class="card-title" *ngIf="document.archive_serial_number">#{{document.archive_serial_number}}</h5>
     </div>
     <p class="card-text">
-      <app-result-hightlight *ngIf="getDetailsAsHighlight()" class="result-content" [highlights]="getDetailsAsHighlight()"></app-result-hightlight>
+      <app-result-highlight *ngIf="getDetailsAsHighlight()" class="result-content" [highlights]="getDetailsAsHighlight()"></app-result-highlight>
       <span *ngIf="getDetailsAsString()" class="result-content">{{getDetailsAsString()}}</span>
     </p>
@@ -1,20 +1,20 @@
 import { ComponentFixture, TestBed } from '@angular/core/testing';
 
-import { ResultHightlightComponent } from './result-hightlight.component';
+import { ResultHighlightComponent } from './result-highlight.component';
 
-describe('ResultHightlightComponent', () => {
-  let component: ResultHightlightComponent;
-  let fixture: ComponentFixture<ResultHightlightComponent>;
+describe('ResultHighlightComponent', () => {
+  let component: ResultHighlightComponent;
+  let fixture: ComponentFixture<ResultHighlightComponent>;
 
   beforeEach(async () => {
     await TestBed.configureTestingModule({
-      declarations: [ ResultHightlightComponent ]
+      declarations: [ ResultHighlightComponent ]
     })
     .compileComponents();
   });
 
   beforeEach(() => {
-    fixture = TestBed.createComponent(ResultHightlightComponent);
+    fixture = TestBed.createComponent(ResultHighlightComponent);
     component = fixture.componentInstance;
     fixture.detectChanges();
   });
@@ -2,11 +2,11 @@ import { Component, Input, OnInit } from '@angular/core';
 import { SearchHitHighlight } from 'src/app/data/search-result';
 
 @Component({
-  selector: 'app-result-hightlight',
-  templateUrl: './result-hightlight.component.html',
-  styleUrls: ['./result-hightlight.component.scss']
+  selector: 'app-result-highlight',
+  templateUrl: './result-highlight.component.html',
+  styleUrls: ['./result-highlight.component.scss']
 })
-export class ResultHightlightComponent implements OnInit {
+export class ResultHighlightComponent implements OnInit {
 
   constructor() { }
@@ -1 +1,2 @@
-from .checks import changed_password_check
+# this is here so that django finds the checks.
+from .checks import *
@@ -4,12 +4,13 @@ import os
 import pickle
 import re
 
-from django.conf import settings
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.neural_network import MLPClassifier
-from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
+from sklearn.utils.multiclass import type_of_target
 
 from documents.models import Document, MatchingModel
+from paperless import settings
 
 
 class IncompatibleClassifierVersionError(Exception):
@@ -27,7 +28,7 @@ def preprocess_content(content):
 
 class DocumentClassifier(object):
 
-    FORMAT_VERSION = 5
+    FORMAT_VERSION = 6
 
     def __init__(self):
        # mtime of the model file on disk. used to prevent reloading when
@@ -54,6 +55,8 @@ class DocumentClassifier(object):
                     "Cannor load classifier, incompatible versions.")
             else:
                 if self.classifier_version > 0:
+                    # Don't be confused by this check. It's simply here
+                    # so that we wont log anything on initial reload.
                     logger.info("Classifier updated on disk, "
                                 "reloading classifier models")
                 self.data_hash = pickle.load(f)
@@ -122,9 +125,14 @@ class DocumentClassifier(object):
         labels_tags_unique = set([tag for tags in labels_tags for tag in tags])
 
         num_tags = len(labels_tags_unique)
 
-        # substract 1 since -1 (null) is also part of the classes.
-        num_correspondents = len(set(labels_correspondent)) - 1
-        num_document_types = len(set(labels_document_type)) - 1
+        # substract 1 since -1 (null) is also part of the classes.
+
+        # union with {-1} accounts for cases where all documents have
+        # correspondents and types assigned, so -1 isnt part of labels_x, which
+        # it usually is.
+        num_correspondents = len(set(labels_correspondent) | {-1}) - 1
+        num_document_types = len(set(labels_document_type) | {-1}) - 1
 
         logging.getLogger(__name__).debug(
             "{} documents, {} tag(s), {} correspondent(s), "
@@ -145,12 +153,23 @@ class DocumentClassifier(object):
         )
         data_vectorized = self.data_vectorizer.fit_transform(data)
 
-        self.tags_binarizer = MultiLabelBinarizer()
-        labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags)
-
         # Step 3: train the classifiers
         if num_tags > 0:
             logging.getLogger(__name__).debug("Training tags classifier...")
+
+            if num_tags == 1:
+                # Special case where only one tag has auto:
+                # Fallback to binary classification.
+                labels_tags = [label[0] if len(label) == 1 else -1
+                               for label in labels_tags]
+                self.tags_binarizer = LabelBinarizer()
+                labels_tags_vectorized = self.tags_binarizer.fit_transform(
+                    labels_tags).ravel()
+            else:
+                self.tags_binarizer = MultiLabelBinarizer()
+                labels_tags_vectorized = self.tags_binarizer.fit_transform(
+                    labels_tags)
+
             self.tags_classifier = MLPClassifier(tol=0.01)
             self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
         else:
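The reason for the special case and the trailing ``.ravel()``: the two
binarizers produce differently shaped outputs. A quick standalone sketch:

.. code:: python

    # LabelBinarizer on a two-class problem yields an (n, 1) column vector,
    # which the classifier expects flattened -- hence .ravel() above.
    # MultiLabelBinarizer yields an (n, n_classes) indicator matrix.
    from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

    print(LabelBinarizer().fit_transform([12, -1, 12]))
    # [[1], [0], [1]] -> shape (3, 1); ravel() gives [1, 0, 1]

    print(MultiLabelBinarizer().fit_transform([[12, 13], [13], []]))
    # [[1, 1], [0, 1], [0, 0]] -> one column per tag id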
@@ -222,6 +241,16 @@ class DocumentClassifier(object):
             X = self.data_vectorizer.transform([preprocess_content(content)])
             y = self.tags_classifier.predict(X)
             tags_ids = self.tags_binarizer.inverse_transform(y)[0]
-            return tags_ids
+            if type_of_target(y).startswith('multilabel'):
+                # the usual case when there are multiple tags.
+                return list(tags_ids)
+            elif type_of_target(y) == 'binary' and tags_ids != -1:
+                # This is for when we have binary classification with only one
+                # tag and the result is to assign this tag.
+                return [tags_ids]
+            else:
+                # Usually binary as well with -1 as the result, but we're
+                # going to catch everything else here as well.
+                return []
         else:
             return []
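The ``type_of_target`` checks above distinguish those same two shapes at
prediction time (sketch):

.. code:: python

    # type_of_target inspects the prediction array's shape and contents.
    import numpy as np
    from sklearn.utils.multiclass import type_of_target

    print(type_of_target(np.array([[0, 1], [1, 0]])))  # multilabel-indicator
    print(type_of_target(np.array([0, 1, 1])))         # binary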
@@ -8,7 +8,6 @@ from django.conf import settings
 from django.db import transaction
 from django.utils import timezone
 
-from paperless.db import GnuPG
 from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
 from .file_handling import generate_filename, create_source_path_directory
 from .loggers import LoggingMixin
@@ -40,17 +39,6 @@ class Consumer(LoggingMixin):
             raise ConsumerError("Cannot consume {}: It is not a file".format(
                 self.path))
 
-    def pre_check_consumption_dir(self):
-        if not settings.CONSUMPTION_DIR:
-            raise ConsumerError(
-                "The CONSUMPTION_DIR settings variable does not appear to be "
-                "set.")
-
-        if not os.path.isdir(settings.CONSUMPTION_DIR):
-            raise ConsumerError(
-                "Consumption directory {} does not exist".format(
-                    settings.CONSUMPTION_DIR))
-
     def pre_check_duplicate(self):
         with open(self.path, "rb") as f:
             checksum = hashlib.md5(f.read()).hexdigest()
@@ -92,7 +80,6 @@ class Consumer(LoggingMixin):
         # Make sure that preconditions for consuming the file are met.
 
         self.pre_check_file_exists()
-        self.pre_check_consumption_dir()
         self.pre_check_directories()
         self.pre_check_duplicate()
@@ -208,10 +195,7 @@ class Consumer(LoggingMixin):
         created = file_info.created or date or timezone.make_aware(
             datetime.datetime.fromtimestamp(stats.st_mtime))
 
-        if settings.PASSPHRASE:
-            storage_type = Document.STORAGE_TYPE_GPG
-        else:
-            storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 
         with open(self.path, "rb") as f:
             document = Document.objects.create(
@@ -260,8 +244,4 @@ class Consumer(LoggingMixin):
     def _write(self, document, source, target):
         with open(source, "rb") as read_file:
             with open(target, "wb") as write_file:
-                if document.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
-                    write_file.write(read_file.read())
-                    return
-                self.log("debug", "Encrypting")
-                write_file.write(GnuPG.encrypted(read_file))
+                write_file.write(read_file.read())
@@ -64,15 +64,15 @@ def get_schema():
 
 
 def open_index(recreate=False):
-    if exists_in(settings.INDEX_DIR) and not recreate:
-        return open_dir(settings.INDEX_DIR)
-    else:
-        # TODO: this is not thread safe. If 2 instances try to create the index
-        # at the same time, this fails. This currently prevents parallel
-        # tests.
-        if not os.path.isdir(settings.INDEX_DIR):
-            os.makedirs(settings.INDEX_DIR, exist_ok=True)
-        return create_in(settings.INDEX_DIR, get_schema())
+    try:
+        if exists_in(settings.INDEX_DIR) and not recreate:
+            return open_dir(settings.INDEX_DIR)
+    except Exception as e:
+        logger.error(f"Error while opening the index: {e}, recreating.")
+
+    if not os.path.isdir(settings.INDEX_DIR):
+        os.makedirs(settings.INDEX_DIR, exist_ok=True)
+    return create_in(settings.INDEX_DIR, get_schema())
 
 
 def update_document(writer, doc):
@@ -1,9 +1,14 @@
 import logging
 import uuid
 
+from django.conf import settings
+
 
 class PaperlessHandler(logging.Handler):
     def emit(self, record):
+        if settings.DISABLE_DBHANDLER:
+            return
+
         # We have to do the import here or Django will barf when it tries to
         # load this because the apps aren't loaded at that point
         from .models import Log
@@ -17,16 +17,6 @@ class Command(BaseCommand):
 
     def add_arguments(self, parser):
 
-        parser.add_argument(
-            "from",
-            choices=("gpg", "unencrypted"),
-            help="The state you want to change your documents from"
-        )
-        parser.add_argument(
-            "to",
-            choices=("gpg", "unencrypted"),
-            help="The state you want to change your documents to"
-        )
         parser.add_argument(
             "--passphrase",
             help="If PAPERLESS_PASSPHRASE isn't set already, you need to "
|
||||
except KeyboardInterrupt:
|
||||
return
|
||||
|
||||
if options["from"] == options["to"]:
|
||||
raise CommandError(
|
||||
'The "from" and "to" values can\'t be the same.'
|
||||
)
|
||||
|
||||
passphrase = options["passphrase"] or settings.PASSPHRASE
|
||||
if not passphrase:
|
||||
raise CommandError(
|
||||
@@ -62,10 +47,7 @@ class Command(BaseCommand):
                 "by declaring it in your environment or your config."
             )
 
-        if options["from"] == "gpg" and options["to"] == "unencrypted":
-            self.__gpg_to_unencrypted(passphrase)
-        elif options["from"] == "unencrypted" and options["to"] == "gpg":
-            self.__unencrypted_to_gpg(passphrase)
+        self.__gpg_to_unencrypted(passphrase)
 
     @staticmethod
     def __gpg_to_unencrypted(passphrase):
@ -79,42 +61,28 @@ class Command(BaseCommand):
|
||||
document).encode('utf-8'), "green"))
|
||||
|
||||
old_paths = [document.source_path, document.thumbnail_path]
|
||||
|
||||
raw_document = GnuPG.decrypted(document.source_file, passphrase)
|
||||
raw_thumb = GnuPG.decrypted(document.thumbnail_file, passphrase)
|
||||
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
|
||||
ext = os.path.splitext(document.filename)[1]
|
||||
|
||||
if not ext == '.gpg':
|
||||
raise CommandError(
|
||||
f"Abort: encrypted file {document.source_path} does not "
|
||||
f"end with .gpg")
|
||||
|
||||
document.filename = os.path.splitext(document.filename)[0]
|
||||
|
||||
with open(document.source_path, "wb") as f:
|
||||
f.write(raw_document)
|
||||
|
||||
with open(document.thumbnail_path, "wb") as f:
|
||||
f.write(raw_thumb)
|
||||
|
||||
document.save(update_fields=("storage_type",))
|
||||
|
||||
for path in old_paths:
|
||||
os.unlink(path)
|
||||
|
||||
@staticmethod
|
||||
def __unencrypted_to_gpg(passphrase):
|
||||
|
||||
unencrypted_files = Document.objects.filter(
|
||||
storage_type=Document.STORAGE_TYPE_UNENCRYPTED)
|
||||
|
||||
for document in unencrypted_files:
|
||||
|
||||
print(coloured("Encrypting {}".format(document), "green"))
|
||||
|
||||
old_paths = [document.source_path, document.thumbnail_path]
|
||||
with open(document.source_path, "rb") as raw_document:
|
||||
with open(document.thumbnail_path, "rb") as raw_thumb:
|
||||
document.storage_type = Document.STORAGE_TYPE_GPG
|
||||
with open(document.source_path, "wb") as f:
|
||||
f.write(GnuPG.encrypted(raw_document, passphrase))
|
||||
with open(document.thumbnail_path, "wb") as f:
|
||||
f.write(GnuPG.encrypted(raw_thumb, passphrase))
|
||||
|
||||
document.save(update_fields=("storage_type",))
|
||||
document.save(update_fields=("storage_type", "filename"))
|
||||
|
||||
for path in old_paths:
|
||||
os.unlink(path)
|
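With the unencrypted-to-gpg direction removed, this command now only decrypts. The new test_management_decrypt.py later in this commit drives it exactly like this; a minimal sketch, assuming PAPERLESS_PASSPHRASE (or --passphrase) is configured:

    from django.core.management import call_command

    # Decrypts every GPG-stored original and thumbnail in place, strips the
    # trailing ".gpg" from filenames, and flips storage_type to unencrypted.
    call_command('decrypt_documents')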
@@ -1,11 +1,11 @@
import logging
import os
from time import sleep

from django.conf import settings
from django.core.management.base import BaseCommand
from django.core.management.base import BaseCommand, CommandError
from django_q.tasks import async_task
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
from watchdog.observers.polling import PollingObserver

try:

@@ -13,25 +13,54 @@ try:
except ImportError:
    INotify = flags = None

logger = logging.getLogger(__name__)


def _consume(file):
    try:
        if os.path.isfile(file):
            async_task("documents.tasks.consume_file",
                       file,
                       task_name=os.path.basename(file)[:100])
        else:
            logger.debug(
                f"Not consuming file {file}: File has moved.")

    except Exception as e:
        # Catch all so that the consumer won't crash.
        # This is also what the test case is listening for to check for
        # errors.
        logger.error(
            "Error while consuming document: {}".format(e))


def _consume_wait_unmodified(file, num_tries=20, wait_time=1):
    mtime = -1
    current_try = 0
    while current_try < num_tries:
        try:
            new_mtime = os.stat(file).st_mtime
        except FileNotFoundError:
            logger.debug(f"File {file} moved while waiting for it to remain "
                         f"unmodified.")
            return
        if new_mtime == mtime:
            _consume(file)
            return
        mtime = new_mtime
        sleep(wait_time)
        current_try += 1

    logger.error(f"Timeout while waiting on file {file} to remain unmodified.")


class Handler(FileSystemEventHandler):

    def _consume(self, file):
        if os.path.isfile(file):
            try:
                async_task("documents.tasks.consume_file",
                           file,
                           task_name=os.path.basename(file)[:100])
            except Exception as e:
                # Catch all so that the consumer won't crash.
                logging.getLogger(__name__).error(
                    "Error while consuming document: {}".format(e))

    def on_created(self, event):
        self._consume(event.src_path)
        _consume_wait_unmodified(event.src_path)

    def on_moved(self, event):
        self._consume(event.src_path)
        _consume_wait_unmodified(event.dest_path)


class Command(BaseCommand):

@@ -40,12 +69,15 @@ class Command(BaseCommand):
    consumption directory.
    """

    # This is here primarily for the tests and is irrelevant in production.
    stop_flag = False

    def __init__(self, *args, **kwargs):

        self.verbosity = 0
        self.logger = logging.getLogger(__name__)

        BaseCommand.__init__(self, *args, **kwargs)
        self.observer = None

    def add_arguments(self, parser):
        parser.add_argument(

@@ -54,38 +86,66 @@ class Command(BaseCommand):
            nargs="?",
            help="The consumption directory."
        )

    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]
        directory = options["directory"]

        logging.getLogger(__name__).info(
            "Starting document consumer at {}".format(
                directory
            )
        parser.add_argument(
            "--oneshot",
            action="store_true",
            help="Run only once."
        )

        # Consume all files as this is not done initially by the watchdog
        for entry in os.scandir(directory):
            if entry.is_file():
                async_task("documents.tasks.consume_file",
                           entry.path,
                           task_name=os.path.basename(entry.path)[:100])
    def handle(self, *args, **options):
        directory = options["directory"]

        # Start the watchdog. Woof!
        if settings.CONSUMER_POLLING > 0:
            logging.getLogger(__name__).info(
                "Using polling instead of file system notifications.")
            observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
        if not directory:
            raise CommandError(
                "CONSUMPTION_DIR does not appear to be set."
            )

        if not os.path.isdir(directory):
            raise CommandError(
                f"Consumption directory {directory} does not exist")

        for entry in os.scandir(directory):
            _consume(entry.path)

        if options["oneshot"]:
            return

        if settings.CONSUMER_POLLING == 0 and INotify:
            self.handle_inotify(directory)
        else:
            observer = Observer()
            event_handler = Handler()
            observer.schedule(event_handler, directory, recursive=True)
            observer.start()
            self.handle_polling(directory)

        logger.debug("Consumer exiting.")

    def handle_polling(self, directory):
        logging.getLogger(__name__).info(
            f"Polling directory for changes: {directory}")
        self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
        self.observer.schedule(Handler(), directory, recursive=False)
        self.observer.start()
        try:
            while observer.is_alive():
                observer.join(1)
            while self.observer.is_alive():
                self.observer.join(1)
                if self.stop_flag:
                    self.observer.stop()
        except KeyboardInterrupt:
            observer.stop()
            observer.join()
            self.observer.stop()
            self.observer.join()

    def handle_inotify(self, directory):
        logging.getLogger(__name__).info(
            f"Using inotify to watch directory for changes: {directory}")

        inotify = INotify()
        descriptor = inotify.add_watch(
            directory, flags.CLOSE_WRITE | flags.MOVED_TO)
        try:
            while not self.stop_flag:
                for event in inotify.read(timeout=1000, read_delay=1000):
                    file = os.path.join(directory, event.name)
                    _consume(file)
        except KeyboardInterrupt:
            pass

        inotify.rm_watch(descriptor)
        inotify.close()
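The reworked consumer validates the directory up front, consumes files that already exist, honours the new --oneshot flag, and prefers inotify over watchdog polling whenever inotify_simple imports. The new management tests later in this commit exercise it like this; a minimal sketch:

    from django.core.management import call_command

    # Consume whatever is already in CONSUMPTION_DIR, then exit instead of
    # watching; raises CommandError if the directory is unset or missing.
    call_command('document_consumer', '--oneshot')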
@@ -22,13 +22,6 @@ class Command(Renderable, BaseCommand):

    def add_arguments(self, parser):
        parser.add_argument("target")
        parser.add_argument(
            "--legacy",
            action="store_true",
            help="Don't try to export all of the document data, just dump the "
                 "original document files out in a format that makes "
                 "re-consuming them easy."
        )

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)

@@ -44,10 +37,7 @@ class Command(Renderable, BaseCommand):
        if not os.access(self.target, os.W_OK):
            raise CommandError("That path doesn't appear to be writable")

        if options["legacy"]:
            self.dump_legacy()
        else:
            self.dump()
        self.dump()

    def dump(self):

@@ -102,33 +92,3 @@ class Command(Renderable, BaseCommand):

        with open(os.path.join(self.target, "manifest.json"), "w") as f:
            json.dump(manifest, f, indent=2)

    def dump_legacy(self):

        for document in Document.objects.all():

            target = os.path.join(
                self.target, self._get_legacy_file_name(document))

            print("Exporting: {}".format(target))

            with open(target, "wb") as f:
                f.write(GnuPG.decrypted(document.source_file))
                t = int(time.mktime(document.created.timetuple()))
                os.utime(target, times=(t, t))

    @staticmethod
    def _get_legacy_file_name(doc):

        if not doc.correspondent and not doc.title:
            return os.path.basename(doc.source_path)

        created = doc.created.strftime("%Y%m%d%H%M%SZ")
        tags = ",".join([t.slug for t in doc.tags.all()])

        if tags:
            return "{} - {} - {} - {}{}".format(
                created, doc.correspondent, doc.title, tags, doc.file_type)

        return "{} - {} - {}{}".format(
            created, doc.correspondent, doc.title, doc.file_type)

@@ -82,8 +82,6 @@ class Command(Renderable, BaseCommand):

    def _import_files_from_manifest(self):

        storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        if settings.PASSPHRASE:
            storage_type = Document.STORAGE_TYPE_GPG

        for record in self.manifest:

@@ -105,23 +103,8 @@ class Command(Renderable, BaseCommand):

            create_source_path_directory(document.source_path)

            if settings.PASSPHRASE:

                with open(document_path, "rb") as unencrypted:
                    with open(document.source_path, "wb") as encrypted:
                        print("Encrypting {} and saving it to {}".format(
                            doc_file, document.source_path))
                        encrypted.write(GnuPG.encrypted(unencrypted))

                with open(thumbnail_path, "rb") as unencrypted:
                    with open(document.thumbnail_path, "wb") as encrypted:
                        print("Encrypting {} and saving it to {}".format(
                            thumb_file, document.thumbnail_path))
                        encrypted.write(GnuPG.encrypted(unencrypted))

            else:
                print(f"Moving {document_path} to {document.source_path}")
                shutil.copy(document_path, document.source_path)
                shutil.copy(thumbnail_path, document.thumbnail_path)
            print(f"Moving {document_path} to {document.source_path}")
            shutil.copy(document_path, document.source_path)
            shutil.copy(thumbnail_path, document.thumbnail_path)

            document.save()

@@ -5,23 +5,6 @@ from django.db import migrations, models
import django.db.models.deletion


def make_index(apps, schema_editor):
    Document = apps.get_model("documents", "Document")
    documents = Document.objects.all()
    print()
    try:
        print("  --> Creating document index...")
        from whoosh.writing import AsyncWriter
        from documents import index
        ix = index.open_index(recreate=True)
        with AsyncWriter(ix) as writer:
            for document in documents:
                index.update_document(writer, document)
    except ImportError:
        # index may not be relevant anymore
        print("  --> Cannot create document index.")


def logs_set_default_group(apps, schema_editor):
    Log = apps.get_model('documents', 'Log')
    for log in Log.objects.all():

@@ -99,8 +82,4 @@ class Migration(migrations.Migration):
            code=django.db.migrations.operations.special.RunPython.noop,
            reverse_code=logs_set_default_group
        ),
        migrations.RunPython(
            code=make_index,
            reverse_code=django.db.migrations.operations.special.RunPython.noop,
        ),
    ]
26  src/documents/migrations/1004_sanity_check_schedule.py  Normal file
@@ -0,0 +1,26 @@
# Generated by Django 3.1.3 on 2020-11-25 14:53

from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule


def add_schedules(apps, schema_editor):
    schedule('documents.tasks.sanity_check', name="Perform sanity check", schedule_type=Schedule.WEEKLY)


def remove_schedules(apps, schema_editor):
    Schedule.objects.filter(func='documents.tasks.sanity_check').delete()


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1003_mime_types'),
        ('django_q', '0013_task_attempt_count'),
    ]

    operations = [
        RunPython(add_schedules, remove_schedules)
    ]
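django_q's schedule() helper used above simply inserts a Schedule row, which is fine inside a one-shot migration but would duplicate the entry if add_schedules ever ran twice. A hedged sketch of an idempotent variant (get_or_create is plain Django ORM, not what the migration uses):

    from django_q.models import Schedule

    # Register the weekly sanity check only if it is not present yet.
    Schedule.objects.get_or_create(
        func='documents.tasks.sanity_check',
        defaults={
            'name': "Perform sanity check",
            'schedule_type': Schedule.WEEKLY,
        },
    )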
@@ -230,6 +230,7 @@ class Document(models.Model):

    @property
    def file_type(self):
        # TODO: this is not stable across python versions
        return mimetypes.guess_extension(str(self.mime_type))

    @property
94  src/documents/sanity_checker.py  Normal file
@@ -0,0 +1,94 @@
import hashlib
import os

from django.conf import settings

from documents.models import Document


class SanityMessage:
    message = None


class SanityWarning(SanityMessage):
    def __init__(self, message):
        self.message = message

    def __str__(self):
        return f"Warning: {self.message}"


class SanityError(SanityMessage):
    def __init__(self, message):
        self.message = message

    def __str__(self):
        return f"ERROR: {self.message}"


class SanityFailedError(Exception):

    def __init__(self, messages):
        self.messages = messages

    def __str__(self):
        message_string = "\n".join([str(m) for m in self.messages])
        return (
            f"The following issues were found by the sanity checker:\n"
            f"{message_string}\n\n===============\n\n")


def check_sanity():
    messages = []

    present_files = []
    for root, subdirs, files in os.walk(settings.MEDIA_ROOT):
        for f in files:
            present_files.append(os.path.normpath(os.path.join(root, f)))

    for doc in Document.objects.all():
        # Check thumbnail
        if not os.path.isfile(doc.thumbnail_path):
            messages.append(SanityError(
                f"Thumbnail of document {doc.pk} does not exist."))
        else:
            present_files.remove(os.path.normpath(doc.thumbnail_path))
            try:
                with doc.thumbnail_file as f:
                    f.read()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read thumbnail file of document {doc.pk}: {e}"
                ))

        # Check document
        if not os.path.isfile(doc.source_path):
            messages.append(SanityError(
                f"Original of document {doc.pk} does not exist."))
        else:
            present_files.remove(os.path.normpath(doc.source_path))
            checksum = None
            try:
                with doc.source_file as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read original file of document {doc.pk}: {e}"))

            if checksum and not checksum == doc.checksum:
                messages.append(SanityError(
                    f"Checksum mismatch of document {doc.pk}. "
                    f"Stored: {doc.checksum}, actual: {checksum}."
                ))

        if not doc.content:
            messages.append(SanityWarning(
                f"Document {doc.pk} has no content."
            ))

    for extra_file in present_files:
        messages.append(SanityWarning(
            f"Orphaned file in media dir: {extra_file}"
        ))

    return messages
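check_sanity() collects SanityWarning and SanityError objects instead of raising, so callers decide how strict to be. The sanity_check task added to tasks.py below wraps it exactly like this; a minimal sketch:

    from documents import sanity_checker
    from documents.sanity_checker import SanityFailedError

    messages = sanity_checker.check_sanity()
    if len(messages) > 0:
        # Bundles every warning and error into one exception message.
        raise SanityFailedError(messages)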
@@ -93,14 +93,11 @@ class DocumentSerializer(serializers.ModelSerializer):
            "document_type_id",
            "title",
            "content",
            "mime_type",
            "tags",
            "tags_id",
            "checksum",
            "created",
            "modified",
            "added",
            "file_name",
            "archive_serial_number"
        )

@@ -3,11 +3,12 @@ import logging
from django.conf import settings
from whoosh.writing import AsyncWriter

from documents import index
from documents import index, sanity_checker
from documents.classifier import DocumentClassifier, \
    IncompatibleClassifierVersionError
from documents.consumer import Consumer, ConsumerError
from documents.models import Document
from documents.sanity_checker import SanityFailedError


def index_optimize():

@@ -74,3 +75,12 @@ def consume_file(path,
    else:
        raise ConsumerError("Unknown error: Returned document was null, but "
                            "no error message was given.")


def sanity_check():
    messages = sanity_checker.check_sanity()

    if len(messages) > 0:
        raise SanityFailedError(messages)
    else:
        return "No issues detected."
BIN  src/documents/tests/samples/originals/0000001.pdf  Normal file (binary file not shown)
BIN  src/documents/tests/samples/originals/0000002.pdf.gpg  Normal file (binary file not shown)
BIN  src/documents/tests/samples/simple.pdf  Normal file (binary file not shown)
BIN  src/documents/tests/samples/simple.zip  Normal file (binary file not shown)
BIN  src/documents/tests/samples/thumb/0000001.png  Normal file (binary file not shown; 7.7 KiB)
BIN  src/documents/tests/samples/thumb/0000002.png.gpg  Normal file (binary file not shown)
@@ -1,40 +1,24 @@
import os
import shutil
import tempfile
from unittest import mock

from django.contrib.auth.models import User
from django.test import override_settings
from pathvalidate import ValidationError
from rest_framework.test import APITestCase

from documents import index
from documents.models import Document, Correspondent, DocumentType, Tag
from documents.tests.utils import DirectoriesMixin


class DocumentApiTest(APITestCase):
class DocumentApiTest(DirectoriesMixin, APITestCase):

    def setUp(self):
        self.scratch_dir = tempfile.mkdtemp()
        self.media_dir = tempfile.mkdtemp()
        self.originals_dir = os.path.join(self.media_dir, "documents", "originals")
        self.thumbnail_dir = os.path.join(self.media_dir, "documents", "thumbnails")

        os.makedirs(self.originals_dir, exist_ok=True)
        os.makedirs(self.thumbnail_dir, exist_ok=True)

        override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=self.originals_dir,
            THUMBNAIL_DIR=self.thumbnail_dir
        ).enable()
        super(DocumentApiTest, self).setUp()

        user = User.objects.create_superuser(username="temp_admin")
        self.client.force_login(user=user)

    def tearDown(self):
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)

    def testDocuments(self):

        response = self.client.get("/api/documents/").data

@@ -87,7 +71,7 @@ class DocumentApiTest(APITestCase):

    def test_document_actions(self):

        _, filename = tempfile.mkstemp(dir=self.originals_dir)
        _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir)

        content = b"This is a test"
        content_thumbnail = b"thumbnail content"

@@ -97,7 +81,7 @@ class DocumentApiTest(APITestCase):

        doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf")

        with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
        with open(os.path.join(self.dirs.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
            f.write(content_thumbnail)

        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))

@@ -179,6 +163,109 @@ class DocumentApiTest(APITestCase):
        results = response.data['results']
        self.assertEqual(len(results), 3)

    def test_search_no_query(self):
        response = self.client.get("/api/search/")
        results = response.data['results']

        self.assertEqual(len(results), 0)

    def test_search(self):
        d1=Document.objects.create(title="invoice", content="the thing i bought at a shop and paid with bank account", checksum="A", pk=1)
        d2=Document.objects.create(title="bank statement 1", content="things i paid for in august", pk=2, checksum="B")
        d3=Document.objects.create(title="bank statement 3", content="things i paid for in september", pk=3, checksum="C")
        with index.open_index(False).writer() as writer:
            # Note to future self: there is a reason we dont use a model signal handler to update the index: some operations edit many documents at once
            # (retagger, renamer) and we don't want to open a writer for each of these, but rather perform the entire operation with one writer.
            # That's why we cant open the writer in a model on_save handler or something.
            index.update_document(writer, d1)
            index.update_document(writer, d2)
            index.update_document(writer, d3)
        response = self.client.get("/api/search/?query=bank")
        results = response.data['results']
        self.assertEqual(response.data['count'], 3)
        self.assertEqual(response.data['page'], 1)
        self.assertEqual(response.data['page_count'], 1)
        self.assertEqual(len(results), 3)

        response = self.client.get("/api/search/?query=september")
        results = response.data['results']
        self.assertEqual(response.data['count'], 1)
        self.assertEqual(response.data['page'], 1)
        self.assertEqual(response.data['page_count'], 1)
        self.assertEqual(len(results), 1)

        response = self.client.get("/api/search/?query=statement")
        results = response.data['results']
        self.assertEqual(response.data['count'], 2)
        self.assertEqual(response.data['page'], 1)
        self.assertEqual(response.data['page_count'], 1)
        self.assertEqual(len(results), 2)

        response = self.client.get("/api/search/?query=sfegdfg")
        results = response.data['results']
        self.assertEqual(response.data['count'], 0)
        self.assertEqual(response.data['page'], 0)
        self.assertEqual(response.data['page_count'], 0)
        self.assertEqual(len(results), 0)

    def test_search_multi_page(self):
        with index.open_index(False).writer() as writer:
            for i in range(55):
                doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content="content")
                index.update_document(writer, doc)

        # This is here so that we test that no document gets returned twice (might happen if the paging is not working)
        seen_ids = []

        for i in range(1, 6):
            response = self.client.get(f"/api/search/?query=content&page={i}")
            results = response.data['results']
            self.assertEqual(response.data['count'], 55)
            self.assertEqual(response.data['page'], i)
            self.assertEqual(response.data['page_count'], 6)
            self.assertEqual(len(results), 10)

            for result in results:
                self.assertNotIn(result['id'], seen_ids)
                seen_ids.append(result['id'])

        response = self.client.get(f"/api/search/?query=content&page=6")
        results = response.data['results']
        self.assertEqual(response.data['count'], 55)
        self.assertEqual(response.data['page'], 6)
        self.assertEqual(response.data['page_count'], 6)
        self.assertEqual(len(results), 5)

        for result in results:
            self.assertNotIn(result['id'], seen_ids)
            seen_ids.append(result['id'])

        response = self.client.get(f"/api/search/?query=content&page=7")
        results = response.data['results']
        self.assertEqual(response.data['count'], 55)
        self.assertEqual(response.data['page'], 6)
        self.assertEqual(response.data['page_count'], 6)
        self.assertEqual(len(results), 5)

    def test_search_invalid_page(self):
        with index.open_index(False).writer() as writer:
            for i in range(15):
                doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content="content")
                index.update_document(writer, doc)

        first_page = self.client.get(f"/api/search/?query=content&page=1").data
        second_page = self.client.get(f"/api/search/?query=content&page=2").data
        should_be_first_page_1 = self.client.get(f"/api/search/?query=content&page=0").data
        should_be_first_page_2 = self.client.get(f"/api/search/?query=content&page=dgfd").data
        should_be_first_page_3 = self.client.get(f"/api/search/?query=content&page=").data
        should_be_first_page_4 = self.client.get(f"/api/search/?query=content&page=-7868").data

        self.assertDictEqual(first_page, should_be_first_page_1)
        self.assertDictEqual(first_page, should_be_first_page_2)
        self.assertDictEqual(first_page, should_be_first_page_3)
        self.assertDictEqual(first_page, should_be_first_page_4)
        self.assertNotEqual(len(first_page['results']), len(second_page['results']))

    @mock.patch("documents.index.autocomplete")
    def test_search_autocomplete(self, m):
        m.side_effect = lambda ix, term, limit: [term for _ in range(limit)]

@@ -215,3 +302,42 @@ class DocumentApiTest(APITestCase):
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.data['documents_total'], 3)
        self.assertEqual(response.data['documents_inbox'], 1)

    @mock.patch("documents.forms.async_task")
    def test_upload(self, m):

        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"document": f})

        self.assertEqual(response.status_code, 200)

        m.assert_called_once()

        args, kwargs = m.call_args
        self.assertEqual(kwargs['override_filename'], "simple.pdf")

    @mock.patch("documents.forms.async_task")
    def test_upload_invalid_form(self, m):

        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"documenst": f})
        self.assertEqual(response.status_code, 400)
        m.assert_not_called()

    @mock.patch("documents.forms.async_task")
    def test_upload_invalid_file(self, m):

        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.zip"), "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"document": f})
        self.assertEqual(response.status_code, 400)
        m.assert_not_called()

    @mock.patch("documents.forms.async_task")
    @mock.patch("documents.forms.validate_filename")
    def test_upload_invalid_filename(self, validate_filename, async_task):
        validate_filename.side_effect = ValidationError()
        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"document": f})
        self.assertEqual(response.status_code, 400)

        async_task.assert_not_called()
@@ -1,24 +1,29 @@
import tempfile
from time import sleep
from unittest import mock

from django.test import TestCase, override_settings

from documents.classifier import DocumentClassifier
from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError
from documents.models import Correspondent, Document, Tag, DocumentType
from documents.tests.utils import DirectoriesMixin


class TestClassifier(TestCase):
class TestClassifier(DirectoriesMixin, TestCase):

    def setUp(self):

        super(TestClassifier, self).setUp()
        self.classifier = DocumentClassifier()

    def generate_test_data(self):
        self.c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        self.c2 = Correspondent.objects.create(name="c2")
        self.c3 = Correspondent.objects.create(name="c3", matching_algorithm=Correspondent.MATCH_AUTO)
        self.t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        self.t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_ANY, pk=34, is_inbox_tag=True)
        self.t3 = Tag.objects.create(name="t3", matching_algorithm=Tag.MATCH_AUTO, pk=45)
        self.dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)
        self.dt2 = DocumentType.objects.create(name="dt2", matching_algorithm=DocumentType.MATCH_AUTO)

        self.doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=self.c1, checksum="A", document_type=self.dt)
        self.doc2 = Document.objects.create(title="doc1", content="this is another document, but from c2", correspondent=self.c2, checksum="B")

@@ -59,8 +64,8 @@ class TestClassifier(TestCase):
        self.classifier.train()
        self.assertEqual(self.classifier.predict_correspondent(self.doc1.content), self.c1.pk)
        self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None)
        self.assertTupleEqual(self.classifier.predict_tags(self.doc1.content), (self.t1.pk,))
        self.assertTupleEqual(self.classifier.predict_tags(self.doc2.content), (self.t1.pk, self.t3.pk))
        self.assertListEqual(self.classifier.predict_tags(self.doc1.content), [self.t1.pk])
        self.assertListEqual(self.classifier.predict_tags(self.doc2.content), [self.t1.pk, self.t3.pk])
        self.assertEqual(self.classifier.predict_document_type(self.doc1.content), self.dt.pk)
        self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None)

@@ -71,6 +76,44 @@ class TestClassifier(TestCase):
        self.assertTrue(self.classifier.train())
        self.assertFalse(self.classifier.train())

    def testVersionIncreased(self):

        self.generate_test_data()
        self.assertTrue(self.classifier.train())
        self.assertFalse(self.classifier.train())

        self.classifier.save_classifier()

        classifier2 = DocumentClassifier()

        current_ver = DocumentClassifier.FORMAT_VERSION
        with mock.patch("documents.classifier.DocumentClassifier.FORMAT_VERSION", current_ver+1):
            # assure that we won't load old classifiers.
            self.assertRaises(IncompatibleClassifierVersionError, classifier2.reload)

            self.classifier.save_classifier()

            # assure that we can load the classifier after saving it.
            classifier2.reload()

    def testReload(self):

        self.generate_test_data()
        self.assertTrue(self.classifier.train())
        self.classifier.save_classifier()

        classifier2 = DocumentClassifier()
        classifier2.reload()
        v1 = classifier2.classifier_version

        # change the classifier after some time.
        sleep(1)
        self.classifier.save_classifier()

        classifier2.reload()
        v2 = classifier2.classifier_version
        self.assertNotEqual(v1, v2)

    @override_settings(DATA_DIR=tempfile.mkdtemp())
    def testSaveClassifier(self):

@@ -83,3 +126,112 @@ class TestClassifier(TestCase):
        new_classifier = DocumentClassifier()
        new_classifier.reload()
        self.assertFalse(new_classifier.train())

    def test_one_correspondent_predict(self):
        c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A")

        self.classifier.train()
        self.assertEqual(self.classifier.predict_correspondent(doc1.content), c1.pk)

    def test_one_correspondent_predict_manydocs(self):
        c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A")
        doc2 = Document.objects.create(title="doc2", content="this is a document from noone", checksum="B")

        self.classifier.train()
        self.assertEqual(self.classifier.predict_correspondent(doc1.content), c1.pk)
        self.assertIsNone(self.classifier.predict_correspondent(doc2.content))

    def test_one_type_predict(self):
        dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1",
                                       checksum="A", document_type=dt)

        self.classifier.train()
        self.assertEqual(self.classifier.predict_document_type(doc1.content), dt.pk)

    def test_one_type_predict_manydocs(self):
        dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1",
                                       checksum="A", document_type=dt)

        doc2 = Document.objects.create(title="doc1", content="this is a document from c2",
                                       checksum="B")

        self.classifier.train()
        self.assertEqual(self.classifier.predict_document_type(doc1.content), dt.pk)
        self.assertIsNone(self.classifier.predict_document_type(doc2.content))

    def test_one_tag_predict(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")

        doc1.tags.add(t1)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])

    def test_one_tag_predict_unassigned(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")

        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [])

    def test_two_tags_predict_singledoc(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121)

        doc4 = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D")

        doc4.tags.add(t1)
        doc4.tags.add(t2)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc4.content), [t1.pk, t2.pk])

    def test_two_tags_predict(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        doc2 = Document.objects.create(title="doc1", content="this is a document from c2", checksum="B")
        doc3 = Document.objects.create(title="doc1", content="this is a document from c3", checksum="C")
        doc4 = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D")

        doc1.tags.add(t1)
        doc2.tags.add(t2)

        doc4.tags.add(t1)
        doc4.tags.add(t2)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
        self.assertListEqual(self.classifier.predict_tags(doc2.content), [t2.pk])
        self.assertListEqual(self.classifier.predict_tags(doc3.content), [])
        self.assertListEqual(self.classifier.predict_tags(doc4.content), [t1.pk, t2.pk])

    def test_one_tag_predict_multi(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        doc2 = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B")

        doc1.tags.add(t1)
        doc2.tags.add(t1)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
        self.assertListEqual(self.classifier.predict_tags(doc2.content), [t1.pk])

    def test_one_tag_predict_multi_2(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        doc2 = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B")

        doc1.tags.add(t1)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
        self.assertListEqual(self.classifier.predict_tags(doc2.content), [])
@@ -1,12 +1,12 @@
import os
import re
import shutil
import tempfile
from unittest import mock
from unittest.mock import MagicMock

from django.test import TestCase, override_settings

from .utils import DirectoriesMixin
from ..consumer import Consumer, ConsumerError
from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
from ..parsers import DocumentParser, ParseError

@@ -408,26 +408,16 @@ def fake_magic_from_file(file, mime=False):


@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(TestCase):
class TestConsumer(DirectoriesMixin, TestCase):

    def make_dummy_parser(self, path, logging_group):
        return DummyParser(path, logging_group, self.scratch_dir)
        return DummyParser(path, logging_group, self.dirs.scratch_dir)

    def make_faulty_parser(self, path, logging_group):
        return FaultyParser(path, logging_group, self.scratch_dir)
        return FaultyParser(path, logging_group, self.dirs.scratch_dir)

    def setUp(self):
        self.scratch_dir = tempfile.mkdtemp()
        self.media_dir = tempfile.mkdtemp()
        self.consumption_dir = tempfile.mkdtemp()

        override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=os.path.join(self.media_dir, "documents", "originals"),
            THUMBNAIL_DIR=os.path.join(self.media_dir, "documents", "thumbnails"),
            CONSUMPTION_DIR=self.consumption_dir
        ).enable()
        super(TestConsumer, self).setUp()

        patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
        m = patcher.start()

@@ -441,13 +431,8 @@ class TestConsumer(TestCase):

        self.consumer = Consumer()

    def tearDown(self):
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)
        shutil.rmtree(self.consumption_dir, ignore_errors=True)

    def get_test_file(self):
        fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir)
        fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.dirs.scratch_dir)
        return f

    def testNormalOperation(self):

@@ -516,26 +501,6 @@ class TestConsumer(TestCase):

        self.fail("Should throw exception")

    @override_settings(CONSUMPTION_DIR=None)
    def testConsumptionDirUnset(self):
        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "The CONSUMPTION_DIR settings variable does not appear to be set.")
            return

        self.fail("Should throw exception")

    @override_settings(CONSUMPTION_DIR="asd")
    def testNoConsumptionDir(self):
        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "Consumption directory asd does not exist")
            return

        self.fail("Should throw exception")

    def testDuplicates(self):
        self.consumer.try_consume_file(self.get_test_file())
@@ -2,7 +2,7 @@ import logging
import uuid
from unittest import mock

from django.test import TestCase
from django.test import TestCase, override_settings

from ..models import Log

@@ -14,6 +14,7 @@ class TestPaperlessLog(TestCase):
        self.logger = logging.getLogger(
            "documents.management.commands.document_consumer")

    @override_settings(DISABLE_DBHANDLER=False)
    def test_that_it_saves_at_all(self):

        kw = {"group": uuid.uuid4()}

@@ -38,6 +39,7 @@ class TestPaperlessLog(TestCase):
        self.logger.critical("This is a critical message", extra=kw)
        self.assertEqual(Log.objects.all().count(), 5)

    @override_settings(DISABLE_DBHANDLER=False)
    def test_groups(self):

        kw1 = {"group": uuid.uuid4()}
210  src/documents/tests/test_management_consumer.py  Normal file
@@ -0,0 +1,210 @@
import filecmp
import os
import shutil
from threading import Thread
from time import sleep
from unittest import mock

from django.conf import settings
from django.core.management import call_command, CommandError
from django.test import override_settings, TestCase

from documents.consumer import ConsumerError
from documents.management.commands import document_consumer
from documents.tests.utils import DirectoriesMixin


class ConsumerThread(Thread):

    def __init__(self):
        super().__init__()
        self.cmd = document_consumer.Command()

    def run(self) -> None:
        self.cmd.handle(directory=settings.CONSUMPTION_DIR, oneshot=False)

    def stop(self):
        # Consumer checks this every second.
        self.cmd.stop_flag = True


def chunked(size, source):
    for i in range(0, len(source), size):
        yield source[i:i+size]


class TestConsumer(DirectoriesMixin, TestCase):

    sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")

    def setUp(self) -> None:
        super(TestConsumer, self).setUp()
        self.t = None
        patcher = mock.patch("documents.management.commands.document_consumer.async_task")
        self.task_mock = patcher.start()
        self.addCleanup(patcher.stop)

    def t_start(self):
        self.t = ConsumerThread()
        self.t.start()
        # give the consumer some time to do initial work
        sleep(1)

    def tearDown(self) -> None:
        if self.t:
            # set the stop flag
            self.t.stop()
            # wait for the consumer to exit.
            self.t.join()

        super(TestConsumer, self).tearDown()

    def wait_for_task_mock_call(self):
        n = 0
        while n < 100:
            if self.task_mock.call_count > 0:
                # give task_mock some time to finish and raise errors
                sleep(1)
                return
            n += 1
            sleep(0.1)
        self.fail("async_task was never called")

    # A bogus async_task that will simply check the file for
    # completeness and raise an exception otherwise.
    def bogus_task(self, func, filename, **kwargs):
        eq = filecmp.cmp(filename, self.sample_file, shallow=False)
        if not eq:
            print("Consumed an INVALID file.")
            raise ConsumerError("Incomplete File READ FAILED")
        else:
            print("Consumed a perfectly valid file.")

    def slow_write_file(self, target, incomplete=False):
        with open(self.sample_file, 'rb') as f:
            pdf_bytes = f.read()

        if incomplete:
            pdf_bytes = pdf_bytes[:len(pdf_bytes) - 100]

        with open(target, 'wb') as f:
            # this will take 2 seconds, since the file is about 20k.
            print("Start writing file.")
            for b in chunked(1000, pdf_bytes):
                f.write(b)
                sleep(0.1)
            print("file completed.")

    def test_consume_file(self):
        self.t_start()

        f = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        shutil.copy(self.sample_file, f)

        self.wait_for_task_mock_call()

        self.task_mock.assert_called_once()

        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], f)

    @override_settings(CONSUMER_POLLING=1)
    def test_consume_file_polling(self):
        self.test_consume_file()

    def test_consume_existing_file(self):
        f = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        shutil.copy(self.sample_file, f)

        self.t_start()
        self.task_mock.assert_called_once()

        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], f)

    @override_settings(CONSUMER_POLLING=1)
    def test_consume_existing_file_polling(self):
        self.test_consume_existing_file()

    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_pdf(self, error_logger):

        self.task_mock.side_effect = self.bogus_task

        self.t_start()

        fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf")

        self.slow_write_file(fname)

        self.wait_for_task_mock_call()

        error_logger.assert_not_called()

        self.task_mock.assert_called_once()

        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], fname)

    @override_settings(CONSUMER_POLLING=1)
    def test_slow_write_pdf_polling(self):
        self.test_slow_write_pdf()

    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_and_move(self, error_logger):

        self.task_mock.side_effect = self.bogus_task

        self.t_start()

        fname = os.path.join(self.dirs.consumption_dir, "my_file.~df")
        fname2 = os.path.join(self.dirs.consumption_dir, "my_file.pdf")

        self.slow_write_file(fname)
        shutil.move(fname, fname2)

        self.wait_for_task_mock_call()

        self.task_mock.assert_called_once()

        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], fname2)

        error_logger.assert_not_called()

    @override_settings(CONSUMER_POLLING=1)
    def test_slow_write_and_move_polling(self):
        self.test_slow_write_and_move()

    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_incomplete(self, error_logger):

        self.task_mock.side_effect = self.bogus_task

        self.t_start()

        fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        self.slow_write_file(fname, incomplete=True)

        self.wait_for_task_mock_call()

        self.task_mock.assert_called_once()
        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], fname)

        # assert that we have an error logged with this invalid file.
        error_logger.assert_called_once()

    @override_settings(CONSUMER_POLLING=1)
    def test_slow_write_incomplete_polling(self):
        self.test_slow_write_incomplete()

    @override_settings(CONSUMPTION_DIR="does_not_exist")
    def test_consumption_directory_invalid(self):

        self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')

    @override_settings(CONSUMPTION_DIR="")
    def test_consumption_directory_unset(self):

        self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')
56  src/documents/tests/test_management_decrypt.py  Normal file
@@ -0,0 +1,56 @@
import hashlib
import json
import os
import shutil
import tempfile
from unittest import mock

from django.core.management import call_command
from django.test import TestCase, override_settings

from documents.management.commands import document_exporter
from documents.models import Document, Tag, DocumentType, Correspondent


class TestDecryptDocuments(TestCase):

    @override_settings(
        ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
        THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
        PASSPHRASE="test"
    )
    @mock.patch("documents.management.commands.decrypt_documents.input")
    def test_decrypt(self, m):

        media_dir = tempfile.mkdtemp()
        originals_dir = os.path.join(media_dir, "documents", "originals")
        thumb_dir = os.path.join(media_dir, "documents", "thumbnails")
        os.makedirs(originals_dir, exist_ok=True)
        os.makedirs(thumb_dir, exist_ok=True)

        override_settings(
            ORIGINALS_DIR=originals_dir,
            THUMBNAIL_DIR=thumb_dir,
            PASSPHRASE="test"
        ).enable()

        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg"))
        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "thumb", "0000002.png.gpg"), os.path.join(thumb_dir, "0000002.png.gpg"))

        Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)

        call_command('decrypt_documents')

        doc = Document.objects.get(id=2)

        self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED)
        self.assertEqual(doc.filename, "0000002.pdf")
        self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000002.pdf")))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(os.path.join(thumb_dir, "0000002.png")))
        self.assertTrue(os.path.isfile(doc.thumbnail_path))

        with doc.source_file as f:
            checksum = hashlib.md5(f.read()).hexdigest()
            self.assertEqual(checksum, doc.checksum)
53  src/documents/tests/test_management_exporter.py  Normal file
@@ -0,0 +1,53 @@
import hashlib
import json
import os
import tempfile

from django.core.management import call_command
from django.test import TestCase, override_settings

from documents.management.commands import document_exporter
from documents.models import Document, Tag, DocumentType, Correspondent


class TestExporter(TestCase):

    @override_settings(
        ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
        THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
        PASSPHRASE="test"
    )
    def test_exporter(self):
        file = os.path.join(os.path.dirname(__file__), "samples", "originals", "0000001.pdf")

        with open(file, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()

        Document.objects.create(checksum=checksum, title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
        Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
        Tag.objects.create(name="t")
        DocumentType.objects.create(name="dt")
        Correspondent.objects.create(name="c")

        target = tempfile.mkdtemp()

        call_command('document_exporter', target)

        with open(os.path.join(target, "manifest.json")) as f:
            manifest = json.load(f)

        self.assertEqual(len(manifest), 5)

        for element in manifest:
            if element['model'] == 'documents.document':
                fname = os.path.join(target, element[document_exporter.EXPORTER_FILE_NAME])
                self.assertTrue(os.path.exists(fname))
                self.assertTrue(os.path.exists(os.path.join(target, element[document_exporter.EXPORTER_THUMBNAIL_NAME])))

                with open(fname, "rb") as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                self.assertEqual(checksum, element['fields']['checksum'])

        Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf")

        self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target)
58
src/documents/tests/test_management_retagger.py
Normal file
58
src/documents/tests/test_management_retagger.py
Normal file
@ -0,0 +1,58 @@
from django.core.management import call_command
from django.test import TestCase

from documents.models import Document, Tag, Correspondent, DocumentType
from documents.tests.utils import DirectoriesMixin


class TestRetagger(DirectoriesMixin, TestCase):

    def make_models(self):
        self.d1 = Document.objects.create(checksum="A", title="A", content="first document")
        self.d2 = Document.objects.create(checksum="B", title="B", content="second document")
        self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document")

        self.tag_first = Tag.objects.create(name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY)
        self.tag_second = Tag.objects.create(name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY)

        self.correspondent_first = Correspondent.objects.create(
            name="c1", match="first", matching_algorithm=Correspondent.MATCH_ANY)
        self.correspondent_second = Correspondent.objects.create(
            name="c2", match="second", matching_algorithm=Correspondent.MATCH_ANY)

        self.doctype_first = DocumentType.objects.create(
            name="dt1", match="first", matching_algorithm=DocumentType.MATCH_ANY)
        self.doctype_second = DocumentType.objects.create(
            name="dt2", match="second", matching_algorithm=DocumentType.MATCH_ANY)

    def get_updated_docs(self):
        return Document.objects.get(title="A"), Document.objects.get(title="B"), Document.objects.get(title="C")

    def setUp(self) -> None:
        super(TestRetagger, self).setUp()
        self.make_models()

    def test_add_tags(self):
        call_command('document_retagger', '--tags')
        d_first, d_second, d_unrelated = self.get_updated_docs()

        self.assertEqual(d_first.tags.count(), 1)
        self.assertEqual(d_second.tags.count(), 1)
        self.assertEqual(d_unrelated.tags.count(), 0)

        self.assertEqual(d_first.tags.first(), self.tag_first)
        self.assertEqual(d_second.tags.first(), self.tag_second)

    def test_add_type(self):
        call_command('document_retagger', '--document_type')
        d_first, d_second, d_unrelated = self.get_updated_docs()

        self.assertEqual(d_first.document_type, self.doctype_first)
        self.assertEqual(d_second.document_type, self.doctype_second)

    def test_add_correspondent(self):
        call_command('document_retagger', '--correspondent')
        d_first, d_second, d_unrelated = self.get_updated_docs()

        self.assertEqual(d_first.correspondent, self.correspondent_first)
        self.assertEqual(d_second.correspondent, self.correspondent_second)
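
A quick sketch of the command these tests drive. The tests pass one flag per run; passing several at once is an assumption here, though each flag independently selects which field gets re-matched:

    from django.core.management import call_command

    # Re-evaluate the matching rules against existing documents.
    call_command('document_retagger', '--tags', '--correspondent', '--document_type')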

@@ -1,3 +1,5 @@
+import shutil
+import tempfile
from random import randint

from django.contrib.admin.models import LogEntry
@@ -215,6 +217,13 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
        self.doc_contains = Document.objects.create(
            content="I contain the keyword.", mime_type="application/pdf")
+
+        self.index_dir = tempfile.mkdtemp()
+        # TODO: we should not need the index here.
+        override_settings(INDEX_DIR=self.index_dir).enable()
+
+    def tearDown(self) -> None:
+        shutil.rmtree(self.index_dir, ignore_errors=True)

    def test_tag_applied_any(self):
        t1 = Tag.objects.create(
            name="test", match="keyword", matching_algorithm=Tag.MATCH_ANY)
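
Worth noting: override_settings(...).enable() applies the override for the remainder of the process, and because the object is discarded here it is never disable()d — the tearDown above only removes the directory itself. A minimal sketch of the enable/disable pairing (the INDEX_DIR value is illustrative):

    from django.test import override_settings

    override = override_settings(INDEX_DIR="/tmp/test-index")
    override.enable()    # settings.INDEX_DIR now points at the temp dir
    # ... code under test reads settings.INDEX_DIR ...
    override.disable()   # restore the original value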

59
src/documents/tests/utils.py
Normal file
@@ -0,0 +1,59 @@
import os
import shutil
import tempfile
from collections import namedtuple

from django.test import override_settings


def setup_directories():

    dirs = namedtuple("Dirs", ())

    dirs.data_dir = tempfile.mkdtemp()
    dirs.scratch_dir = tempfile.mkdtemp()
    dirs.media_dir = tempfile.mkdtemp()
    dirs.consumption_dir = tempfile.mkdtemp()
    dirs.index_dir = os.path.join(dirs.data_dir, "index")
    dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals")
    dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails")

    os.makedirs(dirs.index_dir, exist_ok=True)
    os.makedirs(dirs.originals_dir, exist_ok=True)
    os.makedirs(dirs.thumbnail_dir, exist_ok=True)

    override_settings(
        DATA_DIR=dirs.data_dir,
        SCRATCH_DIR=dirs.scratch_dir,
        MEDIA_ROOT=dirs.media_dir,
        ORIGINALS_DIR=dirs.originals_dir,
        THUMBNAIL_DIR=dirs.thumbnail_dir,
        CONSUMPTION_DIR=dirs.consumption_dir,
        INDEX_DIR=dirs.index_dir,
        MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle")
    ).enable()

    return dirs


def remove_dirs(dirs):
    shutil.rmtree(dirs.media_dir, ignore_errors=True)
    shutil.rmtree(dirs.data_dir, ignore_errors=True)
    shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
    shutil.rmtree(dirs.consumption_dir, ignore_errors=True)


class DirectoriesMixin:

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dirs = None

    def setUp(self) -> None:
        self.dirs = setup_directories()
        super(DirectoriesMixin, self).setUp()

    def tearDown(self) -> None:
        super(DirectoriesMixin, self).tearDown()
        remove_dirs(self.dirs)
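
A sketch of how the mixin is meant to be used (the test case below is hypothetical): listing DirectoriesMixin before TestCase makes its setUp run first, so the temporary directories and overridden settings are in place before the test body executes.

    from django.test import TestCase

    from documents.tests.utils import DirectoriesMixin


    class TestSomething(DirectoriesMixin, TestCase):

        def test_uses_temp_dirs(self):
            # self.dirs holds the fresh per-test directories set up by the mixin
            self.assertTrue(self.dirs.consumption_dir)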
@@ -149,13 +149,25 @@ class DocumentViewSet(RetrieveModelMixin,
        else:
            return HttpResponseBadRequest(str(form.errors))

+    @action(methods=['get'], detail=True)
+    def metadata(self, request, pk=None):
+        try:
+            doc = Document.objects.get(pk=pk)
+            return Response({
+                "paperless__checksum": doc.checksum,
+                "paperless__mime_type": doc.mime_type,
+                "paperless__filename": doc.filename,
+            })
+        except Document.DoesNotExist:
+            raise Http404()
+
    @action(methods=['get'], detail=True)
    def preview(self, request, pk=None):
        try:
            response = self.file_response(pk, "inline")
            return response
-        except FileNotFoundError:
-            raise Http404("Document source file does not exist")
+        except (FileNotFoundError, Document.DoesNotExist):
+            raise Http404()

    @action(methods=['get'], detail=True)
    @cache_control(public=False, max_age=315360000)
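
Illustrative only — the URL pattern follows DRF's detail-route convention and is not spelled out in the diff: the new metadata action exposes a few internal fields over the API.

    from rest_framework.test import APIClient

    client = APIClient()
    # authentication elided; the action hangs off the documents viewset
    response = client.get("/api/documents/17/metadata/")
    # expected shape of response.data (values made up):
    # {"paperless__checksum": "...",
    #  "paperless__mime_type": "application/pdf",
    #  "paperless__filename": "0000017.pdf"}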
@@ -163,15 +175,15 @@ class DocumentViewSet(RetrieveModelMixin,
        try:
            return HttpResponse(Document.objects.get(id=pk).thumbnail_file,
                                content_type='image/png')
-        except FileNotFoundError:
-            raise Http404("Document thumbnail does not exist")
+        except (FileNotFoundError, Document.DoesNotExist):
+            raise Http404()

    @action(methods=['get'], detail=True)
    def download(self, request, pk=None):
        try:
            return self.file_response(pk, "attachment")
-        except FileNotFoundError:
-            raise Http404("Document source file does not exist")
+        except (FileNotFoundError, Document.DoesNotExist):
+            raise Http404()


class LogViewSet(ReadOnlyModelViewSet):
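
A brief aside on this change: Document.objects.get raises Document.DoesNotExist for an unknown id, which previously escaped these handlers and surfaced as a server error. Catching it alongside FileNotFoundError turns a missing document and a missing file into the same 404, trading the descriptive messages for a uniform response.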
@@ -190,7 +202,9 @@ class SearchView(APIView):

    permission_classes = (IsAuthenticated,)

-    ix = index.open_index()
+    def __init__(self, *args, **kwargs):
+        super(SearchView, self).__init__(*args, **kwargs)
+        self.ix = index.open_index()

    def add_infos_to_hit(self, r):
        doc = Document.objects.get(id=r['id'])
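
Why this matters: a class-body attribute is evaluated once, at module import, so the index was opened before test settings could repoint INDEX_DIR; moving the call into __init__ defers it until the view is instantiated. A toy illustration:

    def open_resource():
        # stand-in for index.open_index(); reads whatever settings are current
        return object()


    class Eager:
        resource = open_resource()      # evaluated once, at module import


    class Lazy:
        def __init__(self):
            self.resource = open_resource()  # evaluated per instance, after overrides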
@@ -210,6 +224,9 @@ class SearchView(APIView):
        except (ValueError, TypeError):
            page = 1

+        if page < 1:
+            page = 1
+
        with index.query_page(self.ix, query, page) as result_page:
            return Response(
                {'count': len(result_page),
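
The extra clamp covers inputs like page=0 or a negative number, which parse fine as integers but slip past the except clause; whoosh pages are 1-based, so such a value would otherwise blow up deeper in the pager.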
@@ -229,7 +246,9 @@ class SearchAutoCompleteView(APIView):

    permission_classes = (IsAuthenticated,)

-    ix = index.open_index()
+    def __init__(self, *args, **kwargs):
+        super(SearchAutoCompleteView, self).__init__(*args, **kwargs)
+        self.ix = index.open_index()

    def get(self, request, format=None):
        if 'term' in request.query_params:
@@ -1,8 +1,19 @@
from django.conf import settings
from django.contrib.auth.models import User
+from django.utils.deprecation import MiddlewareMixin
from rest_framework import authentication


+class AutoLoginMiddleware(MiddlewareMixin):
+
+    def process_request(self, request):
+        try:
+            request.user = User.objects.get(
+                username=settings.AUTO_LOGIN_USERNAME)
+        except User.DoesNotExist:
+            pass
+
+
class AngularApiAuthenticationOverride(authentication.BaseAuthentication):
    """ This class is here to provide authentication to the angular dev server
    during development. This is disabled in production.
@@ -144,6 +144,15 @@ TEMPLATES = [
# Security #
###############################################################################

+AUTO_LOGIN_USERNAME = os.getenv("PAPERLESS_AUTO_LOGIN_USERNAME")
+
+if AUTO_LOGIN_USERNAME:
+    _index = MIDDLEWARE.index('django.contrib.auth.middleware.AuthenticationMiddleware')
+    # This overrides everything the auth middleware is doing but still allows
+    # regular login in case the provided user does not exist.
+    MIDDLEWARE.insert(_index+1, 'paperless.auth.AutoLoginMiddleware')
+
+
if DEBUG:
    X_FRAME_OPTIONS = ''
    # this should really be 'allow-from uri' but its not supported in any mayor
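
To make the ordering concrete, an illustrative excerpt of the resulting list (not the full MIDDLEWARE from settings.py):

    MIDDLEWARE = [
        # ...
        'django.contrib.auth.middleware.AuthenticationMiddleware',
        'paperless.auth.AutoLoginMiddleware',  # inserted at _index + 1
        # ...
    ]

Placed directly after AuthenticationMiddleware, the auto-login middleware can overwrite whatever user authentication resolved; if PAPERLESS_AUTO_LOGIN_USERNAME names a nonexistent user it does nothing and the regular login flow still applies.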
@@ -241,6 +250,8 @@ USE_TZ = True
# Logging #
###############################################################################

+DISABLE_DBHANDLER = __get_boolean("PAPERLESS_DISABLE_DBHANDLER")
+
LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,
@@ -1 +1 @@
-__version__ = (0, 9, 2)
+__version__ = (0, 9, 3)

@@ -0,0 +1,2 @@
# this is here so that django finds the checks.
from .checks import *

25
src/paperless_tesseract/checks.py
Normal file
@@ -0,0 +1,25 @@
import subprocess

from django.conf import settings
from django.core.checks import Error, register


def get_tesseract_langs():
    with subprocess.Popen(['tesseract', '--list-langs'],
                          stdout=subprocess.PIPE) as p:
        stdout, stderr = p.communicate()

    return stdout.decode().strip().split("\n")[1:]


@register()
def check_default_language_available(app_configs, **kwargs):
    langs = get_tesseract_langs()

    if settings.OCR_LANGUAGE not in langs:
        return [Error(
            f"The default ocr language {settings.OCR_LANGUAGE} is "
            f"not installed. Paperless cannot OCR your documents "
            f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")]
    else:
        return []
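
A small illustration of what get_tesseract_langs() parses — the sample output is assumed; actual languages depend on the installation:

    # `tesseract --list-langs` prints a banner line followed by one
    # language code per line; [1:] drops the banner.
    raw = "List of available languages (3):\ndeu\neng\nosd\n"
    langs = raw.strip().split("\n")[1:]
    assert langs == ["deu", "eng", "osd"]

Because @register() hooks the check into Django's system-check framework at import time, importing the module from the package __init__ (the two-line file above) is what makes a missing PAPERLESS_OCR_LANGUAGE surface on ordinary manage.py invocations.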
@@ -3,10 +3,9 @@ exclude = migrations, paperless/settings.py, .tox, */tests/*

[tool:pytest]
DJANGO_SETTINGS_MODULE=paperless.settings
-addopts = --pythonwarnings=all
+addopts = --pythonwarnings=all --cov --cov-report=html -n auto
env =
    PAPERLESS_SECRET=paperless
    PAPERLESS_EMAIL_SECRET=paperless
+    PAPERLESS_DISABLE_DBHANDLER=true


[coverage:run]
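
A closing note on the new test options: --cov and --cov-report=html come from pytest-cov, while -n auto comes from pytest-xdist and fans the suite out across all available CPU cores. Setting PAPERLESS_DISABLE_DBHANDLER here presumably keeps those parallel test runs from writing log entries into the database via the handler gated by the new DISABLE_DBHANDLER setting.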