Compare commits

..

70 Commits
2.2.0 ... 2.5.0

Author SHA1 Message Date
Daniel Quinn
2ef2bf873e Version bump: 2.5.0 2018-10-07 16:30:36 +01:00
Daniel Quinn
0bb7d27269 pep8 2018-10-07 16:30:02 +01:00
Daniel Quinn
ce5e8b2658 Rework user hack for "login-free" sessions #394 2018-10-07 16:27:41 +01:00
Daniel Quinn
3f572afb8b Add a little more read-only info for documents 2018-10-07 16:26:05 +01:00
Daniel Quinn
5c3cb1e4ab Rework how slugs are generated/referenced #393 2018-10-07 16:25:51 +01:00
Daniel Quinn
c7f4bfe4f3 Add migration that should have come in some time ago 2018-10-07 16:23:03 +01:00
Daniel Quinn
65d6599964 Fix formatting 2018-10-07 16:22:52 +01:00
Daniel Quinn
5d32e89c44 Wrap each document consumption in a transaction 2018-10-07 14:56:56 +01:00
Daniel Quinn
750ab5bf85 Use optipng to optimise document thumbnails 2018-10-07 14:56:38 +01:00
Daniel Quinn
2a3f766b93 Consolidate get_date onto the DocumentParser parent class 2018-10-07 14:56:02 +01:00
Daniel Quinn
14bb52b6a4 Wrap document consumption in a transaction #262 2018-10-07 13:12:22 +01:00
Daniel Quinn
b5176d207e Hopefully fix Travis 2018-10-01 20:40:43 +01:00
Daniel Quinn
e4044d0df9 Update version number & changelog 2018-10-01 20:40:32 +01:00
Daniel Quinn
bacdd51fd7 Merge pull request #413 from euri10/master
Fix issue where tesseract langages weren't installed properly
2018-10-01 19:40:04 +00:00
Daniel Quinn
8010d72f18 Tweak the date guesser to not allow dates prior to 1900 (#414) 2018-10-01 20:03:47 +01:00
euri10
9dd76f1b87 Fix issue where tesseract langages weren't installed properly 2018-09-24 13:30:10 +02:00
Daniel Quinn
a511d34d69 Fix implementation of django-filter 2018-09-23 15:47:14 +01:00
Daniel Quinn
35c5b8e263 Add note about tweaks to psql connections 2018-09-23 14:05:35 +01:00
Daniel Quinn
8726b0316c Add note about import/export process changes 2018-09-23 14:03:38 +01:00
Daniel Quinn
acf6caca2f Add a tox test for Python 3.7 2018-09-23 14:01:35 +01:00
Daniel Quinn
b20d7eca03 Tweak settings.py to allow for TRUST-based PostgreSQL auth 2018-09-23 14:01:15 +01:00
Daniel Quinn
d17497fd5b Move the unique key on checksums to migration 15
This shouldn't affect anyone, since this migration is pretty old, but it
allows people using PostgreSQL to actually run Paperless.
2018-09-23 14:00:27 +01:00
Daniel Quinn
090565d84c Tweak the import/export system to handle encryption choices better
Now when you export a document, the `storage_type` value is always
`unencrypted` (since that's what it is when it's exported anyway), and
the flag is set by the importing script instead, based on the existence
of a `PAPERLESS_PASSPHRASE` environment variable, indicating that
encryption is enabled.
2018-09-23 13:58:40 +01:00
Daniel Quinn
79e1e60238 Fix typo 2018-09-23 12:59:56 +01:00
Daniel Quinn
ff111f1bde Update changelog for new stuff from #405 2018-09-23 12:54:49 +01:00
Daniel Quinn
6db788a550 Add docs for indentation & spacing 2018-09-23 12:54:39 +01:00
Daniel Quinn
f4a09013d7 Merge branch 'jonaswinkler-new-features' 2018-09-23 12:42:02 +01:00
Daniel Quinn
4130dd3465 Conform code to standards 2018-09-23 12:41:28 +01:00
Daniel Quinn
117d7dad04 Improve the unknown language error message 2018-09-23 12:41:14 +01:00
Daniel Quinn
b420281be0 Remove numpy, scikit-learn, and scipy as they weren't being used 2018-09-23 12:40:46 +01:00
Daniel Quinn
17f8953a49 Merge branch 'new-features' of git://github.com/jonaswinkler/paperless into jonaswinkler-new-features 2018-09-23 11:57:44 +01:00
Daniel Quinn
9682a6f6fc Add a contribution guide 2018-09-22 16:22:03 +01:00
Daniel Quinn
425bbe34ef Make the names of the sample files visible 2018-09-22 16:17:18 +01:00
Daniel Quinn
60ee08adec Reduce duplication in docker-compose.env.example
See #404 for more info on where this came from.
2018-09-22 15:27:22 +01:00
Daniel Quinn
b4b4d8f25e Add an example for pdf2pdfocr with the pre-consume hook 2018-09-22 14:00:00 +01:00
Daniel Quinn
cce6b43062 Clean up release notes 2018-09-22 13:59:50 +01:00
Jonas Winkler
fb6f2e07c9 Added a bunch of new features:
- Debug mode is now configurable in the configuration file. This way, we don't have to edit versioned files to disable it on production systems.
- Recent correspondents filter (enable in configuration file)
- Document actions: Edit tags and correspondents on multiple documents at once
- Replaced month list filter with date drilldown
- Sortable document count columns on Tag and Correspondent admin
- Last correspondence column on Correspondent admin
- Save and edit next functionality for document editing
2018-09-13 15:19:25 +02:00
Daniel Quinn
2edf65dd1e Bump to 2.3.0 2018-09-09 21:51:44 +01:00
Daniel Quinn
9a739bdbab Merge pull request #401 from ahyear/patch-1
add migrate commande to docker update process
2018-09-09 21:26:56 +01:00
Daniel Quinn
66db06590d Merge branch 'jat255-ENH_config_inline_or_attach' 2018-09-09 21:22:42 +01:00
Daniel Quinn
7cef108785 Streamline how we handle boolean values in settings.py 2018-09-09 21:22:07 +01:00
Daniel Quinn
a86a20ef0f Make the example file contain the default value 2018-09-09 21:16:53 +01:00
Daniel Quinn
f94347abc0 Merge branch 'ENH_config_inline_or_attach' of git://github.com/jat255/paperless into jat255-ENH_config_inline_or_attach 2018-09-09 21:15:14 +01:00
Daniel Quinn
46cbd10ba0 Merge pull request #399 from jat255/ENH_convert_only_one_page
Speed up thumbnail generation for PDFs
2018-09-09 21:12:42 +01:00
Daniel Quinn
2a96c648e8 Merge pull request #396 from dubit0/postgres_mysql_fix
Fix document checks with PostgreSQL and MySQL backends.
2018-09-09 21:10:36 +01:00
Daniel Quinn
75648cc74b Merge branch 'jat255-ENH_text_consumer' 2018-09-09 21:03:58 +01:00
Daniel Quinn
0472fe4e9e Reorder imports 2018-09-09 21:03:37 +01:00
Daniel Quinn
c99f5923d5 Rename parsers to DATE_REGEX
In moving the `parsers` variable into the package-level, it lost the
context, so a more descriptive name was needed.
2018-09-09 21:02:30 +01:00
Daniel Quinn
ef302abed7 Fix pycodestyle complaints 2018-09-09 20:55:37 +01:00
Daniel Quinn
2dc35cc856 Merge branch 'ENH_text_consumer' of git://github.com/jat255/paperless into jat255-ENH_text_consumer 2018-09-09 20:52:59 +01:00
Daniel Quinn
f4c399f0dd Merge pull request #398 from ddddavidmartin/bump_pyocr_version_for_tesseract_4_support
Bump required version for Pyocr to support the latest tesseract 4.
2018-09-09 20:01:51 +01:00
Daniel Quinn
5342db6ada Fix pycodestyle complaints
Apparently, pycodestyle updated itself to now check for invalid escape
sequences, which only complain if the regex in use isn't a raw string
(r"").
2018-09-09 20:00:12 +01:00
Daniel Quinn
5c39fff51b Add tox to dev dependencies 2018-09-09 19:59:47 +01:00
ahyear
ed0e40d3e6 add migrate commande to docker update process 2018-09-06 15:32:41 +02:00
Joshua Taillon
652ead2f5c remove debugging print statement 2018-09-05 23:05:37 -04:00
Joshua Taillon
be9757894a add INLINE_DOC to settings.py 2018-09-05 23:03:30 -04:00
Joshua Taillon
22378789e2 add option for inline vs. attachment for document rendering 2018-09-05 22:58:38 -04:00
Joshua Taillon
72c828170e move date-matching regex pattern to base parser module for use by all subclasses 2018-09-05 21:13:36 -04:00
Joshua Taillon
cac63494f0 change tesseract parser to only convert first page to save (potentially) massive amounts of work 2018-09-05 15:18:35 -04:00
Daniel Quinn
939a67bd4b Add empty requirements for rtd to reference 2018-09-05 11:16:42 +01:00
Daniel Quinn
fbc6a58f5a Add credits for 2.2.0 that I forgot 2018-09-05 10:59:06 +01:00
Daniel Quinn
01a358d2b0 Re-flow text to keep it <80c wide 2018-09-05 10:58:41 +01:00
David Martin
6b447628ed Bump required version for Pyocr to support the latest tesseract 4.
This recently changed in the official tesseract engine [0]. -psm is
not allowed as an option anymore and --psm has to be used instead. The
latest pyocr enables support for this [1].

[0] tesseract-ocr/tesseract@ee201e1
[1] 5abd0a566a
2018-09-05 13:03:42 +10:00
Thomas Niederprüm
2308d5a613 Catch ProgrammingError in Document checks.
When running PostgreSQL or MariaDB/MySQL backends, a query to a non-existent
table will raise a "ProgrammingError". This patch properly catches this error.
Without this patch all management calls to manage.py will lead to an error when
running PostgreSQL or MariaDB as a backend.
2018-09-04 20:11:48 +02:00
Joshua Taillon
23bf79274c Merge branch 'master' into ENH_text_consumer 2018-09-03 23:47:30 -04:00
Joshua Taillon
4849249d86 explicitly add txt, md, and csv types for consumer and viewer; fix thumbnail generation 2018-09-03 23:46:13 -04:00
Daniel Quinn
ee20af71e8 Bump for 2.2.1 2018-09-03 00:27:40 +01:00
Daniel Quinn
3c8aa3ba42 Don't try to remove SessionAuthenticationMiddleware
It was remove entirely in Django 2.0
2018-09-03 00:25:10 +01:00
Daniel Quinn
778ffa488d Add Tim to the credits for 2.2.0 2018-09-02 21:53:52 +01:00
Joshua Taillon
d6fedbec52 first stab at text consumer 2018-08-30 23:32:41 -04:00
44 changed files with 1295 additions and 266 deletions

2
.gitignore vendored
View File

@@ -81,3 +81,5 @@ docker-compose.env
scripts/import-for-development
scripts/nuke
# Static files collected by the collectstatic command
static/

View File

@@ -2,7 +2,7 @@ language: python
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr tesseract-ocr-eng
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr tesseract-ocr-eng tesseract-ocr-cat
sudo: false

View File

@@ -1,4 +1,4 @@
FROM alpine:3.7
FROM alpine:3.8
LABEL maintainer="The Paperless Project https://github.com/danielquinn/paperless" \
contributors="Guy Addadi <addadi@gmail.com>, Pit Kleyersburg <pitkley@googlemail.com>, \
@@ -12,11 +12,10 @@ COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
ENV PAPERLESS_EXPORT_DIR=/export \
PAPERLESS_CONSUMPTION_DIR=/consume
# Install dependencies
RUN apk --no-cache --update add \
python3 gnupg libmagic bash shadow curl \
sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \
apk --no-cache add --virtual .build-dependencies \
RUN apk update --no-cache && apk add python3 gnupg libmagic bash shadow curl \
sudo poppler tesseract-ocr imagemagick ghostscript unpaper optipng && \
apk add --virtual .build-dependencies \
python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
# Install python dependencies
python3 -m ensurepip && \

View File

@@ -36,3 +36,5 @@ pytest-xdist = "*"
[dev-packages]
ipython = "*"
sphinx = "*"
tox = "*"

83
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "e20c2294bcafd346ee57901df94a515a12976ed192dc37df848b39b56bdd1f4b"
"sha256": "6d8bad24aa5d0c102b13b5ae27acba04836cd5a07a4003cb2763de1e0a3406b7"
},
"pipfile-spec": 6,
"requires": {},
@@ -19,7 +19,7 @@
"sha256:37228cda29411948b422fae072f57e31d3396d2ee1c9783775980ee9c9990af6",
"sha256:58587dd4dc3daefad0487f6d9ae32b4542b185e1c36db6993290e7c41ca2b47c"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.5"
},
"atomicwrites": {
@@ -27,7 +27,7 @@
"sha256:0312ad34fcad8fac3704d441f7b317e50af620823353ec657a53e981f92920c0",
"sha256:ec9ae8adaae229e4f8446952d204a3e4b5fdd2d099f9be3aaf556120135fb3ee"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.2.1"
},
"attrs": {
@@ -85,7 +85,7 @@
"sha256:e05cb4d9aad6233d67e0541caa7e511fa4047ed7750ec2510d466e806e0255d6",
"sha256:f3f501f345f24383c0000395b26b726e46758b71393267aeae0bd36f8b3ade80"
],
"markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.2.*' and python_version < '4' and python_version != '3.1.*'",
"markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4'",
"version": "==4.5.1"
},
"coveralls": {
@@ -163,7 +163,7 @@
"sha256:a7a84d5fa07a089186a329528f127c9d73b9de57f1a1131b82bb5320ee651f6a",
"sha256:fc155a6b553c66c838d1a22dba1dc9f5f505c43285a878c6f74a79c024750b83"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.5.0"
},
"factory-boy": {
@@ -179,6 +179,7 @@
"sha256:ea7cfd3aeb1544732d08bd9cfba40c5b78e3a91e17b1a0698ab81bfc5554c628",
"sha256:f6d67f04abfb2b4bea7afc7fa6c18cf4c523a67956e455668be9ae42bccc21ad"
],
"markers": "python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.2.*' and python_version >= '2.7'",
"version": "==0.9.0"
},
"filemagic": {
@@ -282,7 +283,7 @@
"sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1",
"sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==0.7.1"
},
"py": {
@@ -290,7 +291,7 @@
"sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1",
"sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.6.0"
},
"pycodestyle": {
@@ -303,26 +304,26 @@
},
"pyocr": {
"hashes": [
"sha256:bdc4d43bf9b63c2a9a4b2c9a1a623a0e63c8e6600eede5dbe866b31f3a5f2207"
"sha256:b6ba6263fd92da56627dff6d263d991a2246aacd117d1788f11b93f419ca395f"
],
"index": "pypi",
"version": "==0.5.2"
"version": "==0.5.3"
},
"pytest": {
"hashes": [
"sha256:2d7c49e931316cc7d1638a3e5f54f5d7b4e5225972b3c9838f3584788d27f349",
"sha256:ad0c7db7b5d4081631e0155f5c61b80ad76ce148551aaafe3a718d65a7508b18"
"sha256:453cbbbe5ce6db38717d282b758b917de84802af4288910c12442984bde7b823",
"sha256:a8a07f84e680482eb51e244370aaf2caa6301ef265f37c2bdefb3dd3b663f99d"
],
"index": "pypi",
"version": "==3.7.4"
"version": "==3.8.0"
},
"pytest-cov": {
"hashes": [
"sha256:03aa752cf11db41d281ea1d807d954c4eda35cfa1b21d6971966cc041bbf6e2d",
"sha256:890fe5565400902b0c78b5357004aab1c814115894f4f21370e2433256a3eeec"
"sha256:513c425e931a0344944f84ea47f3956be0e416d95acbd897a44970c8d926d5d7",
"sha256:e360f048b7dae3f2f2a9a4d067b2dd6b6a015d384d1577c994a43f3f7cbad762"
],
"index": "pypi",
"version": "==2.5.1"
"version": "==2.6.0"
},
"pytest-django": {
"hashes": [
@@ -344,6 +345,7 @@
"sha256:e4500cd0509ec4a26535f7d4112a8cc0f17d3a41c29ffd4eab479d2a55b30805",
"sha256:f275cb48a73fc61a6710726348e1da6d68a978f0ec0c54ece5a5fae5977e5a08"
],
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==0.2"
},
"pytest-sugar": {
@@ -457,7 +459,7 @@
"sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf",
"sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5"
],
"markers": "python_version >= '2.6' and python_version != '3.3.*' and python_version < '4' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'",
"markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4' and python_version != '3.3.*'",
"version": "==1.23"
}
},
@@ -521,10 +523,11 @@
},
"imagesize": {
"hashes": [
"sha256:3620cc0cadba3f7475f9940d22431fc4d407269f1be59ec9b8edcca26440cf18",
"sha256:5b326e4678b6925158ccc66a9fa3122b6106d7c876ee32d7de6ce59385b96315"
"sha256:3f349de3eb99145973fefb7dbe38554414e5c30abd0c8e4b970a7c9d09f3a1d8",
"sha256:f3832918bc3c66617f92e35f5d70729187676313caa60c187eb0f28b8fe5e3b5"
],
"version": "==1.0.0"
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.1.0"
},
"ipython": {
"hashes": [
@@ -590,6 +593,14 @@
],
"version": "==0.7.4"
},
"pluggy": {
"hashes": [
"sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1",
"sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1"
],
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==0.7.1"
},
"prompt-toolkit": {
"hashes": [
"sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381",
@@ -605,6 +616,14 @@
],
"version": "==0.6.0"
},
"py": {
"hashes": [
"sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1",
"sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6"
],
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.6.0"
},
"pygments": {
"hashes": [
"sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d",
@@ -656,20 +675,28 @@
},
"sphinx": {
"hashes": [
"sha256:a07050845cc9a2f4026a6035cc8ed795a5ce7be6528bbc82032385c10807dfe7",
"sha256:d719de667218d763e8fd144b7fcfeefd8d434a6201f76bf9f0f0c1fa6f47fcdb"
"sha256:217a7705adcb573da5bbe1e0f5cab4fa0bd89fd9342c9159121746f593c2d5a4",
"sha256:a602513f385f1d5785ff1ca420d9c7eb1a1b63381733b2f0ea8188a391314a86"
],
"index": "pypi",
"version": "==1.7.8"
"version": "==1.7.9"
},
"sphinxcontrib-websupport": {
"hashes": [
"sha256:68ca7ff70785cbe1e7bccc71a48b5b6d965d79ca50629606c7861a21b206d9dd",
"sha256:9de47f375baf1ea07cdb3436ff39d7a9c76042c10a769c52353ec46e4e8fc3b9"
],
"markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.1.0"
},
"tox": {
"hashes": [
"sha256:37cf240781b662fb790710c6998527e65ca6851eace84d1595ee71f7af4e85f7",
"sha256:eb61aa5bcce65325538686f09848f04ef679b5cd9b83cc491272099b28739600"
],
"index": "pypi",
"version": "==3.2.1"
},
"traitlets": {
"hashes": [
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
@@ -682,9 +709,17 @@
"sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf",
"sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5"
],
"markers": "python_version >= '2.6' and python_version != '3.3.*' and python_version < '4' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'",
"markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4' and python_version != '3.3.*'",
"version": "==1.23"
},
"virtualenv": {
"hashes": [
"sha256:2ce32cd126117ce2c539f0134eb89de91a8413a29baac49cbab3eb50e2026669",
"sha256:ca07b4c0b54e14a91af9f34d0919790b016923d157afda5efdde55c96718f752"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"version": "==16.0.0"
},
"wcwidth": {
"hashes": [
"sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",

View File

@@ -1,38 +1,22 @@
# Environment variables to set for Paperless
# Commented out variables will be replaced by a default within Paperless.
# Commented out variables will be replaced with a default within Paperless.
#
# In addition to what you see here, you can also define any values you find in
# paperless.conf.example here. Values like:
#
# * PAPERLESS_PASSPHRASE
# * PAPERLESS_CONSUMPTION_DIR
# * PAPERLESS_CONSUME_MAIL_HOST
#
# ...are all explained in that file but can be defined here, since the Docker
# installation doesn't make use of paperless.conf.
# Passphrase Paperless uses to encrypt and decrypt your documents, if you want
# encryption at all.
# PAPERLESS_PASSPHRASE=CHANGE_ME
# The amount of threads to use for text recognition
# PAPERLESS_OCR_THREADS=4
# Additional languages to install for text recognition
# Additional languages to install for text recognition. Note that this is
# different from PAPERLESS_OCR_LANGUAGE (default=eng), which defines the
# default language used when guessing the language from the OCR output.
# PAPERLESS_OCR_LANGUAGES=deu ita
# You can change the default user and group id to a custom one
# USERMAP_UID=1000
# USERMAP_GID=1000
###############################################################################
#### Mail Consumption ####
###############################################################################
# These values are required if you want paperless to check a particular email
# box every 10 minutes and attempt to consume documents from there. If you
# don't define a HOST, mail checking will just be disabled.
# Don't use quotes after = or it will crash your docker
# PAPERLESS_CONSUME_MAIL_HOST=
# PAPERLESS_CONSUME_MAIL_PORT=
# PAPERLESS_CONSUME_MAIL_USER=
# PAPERLESS_CONSUME_MAIL_PASS=
# Override the default IMAP inbox here. If it's not set, Paperless defaults to
# INBOX.
# PAPERLESS_CONSUME_MAIL_INBOX=INBOX
# Any email sent to the target account that does not contain this text will be
# ignored. Mail checking won't work without this.
# PAPERLESS_EMAIL_SECRET=

View File

@@ -1,16 +1,111 @@
Changelog
#########
2.5.0
=====
* **New dependency**: Paperless now optimises thumbnail generation with
`optipng`_, so you'll need to install that somewhere in your PATH or declare
its location in ``PAPERLESS_OPTIPNG_BINARY``. The Docker image has already
been updated on the Docker Hub, so you just need to pull the latest one from
there if you're a Docker user.
* "Login free" instances of Paperless were breaking whenever you tried to edit
objects in the admin: adding/deleting tags or correspondents, or even fixing
spelling. This was due to the "user hack" we were applying to sessions that
weren't using a login, as that hack user didn't have a valid id. The fix was
to attribute the first user id in the system to this hack user. `#394`_
* A problem in how we handle slug values on Tags and Correspondents required a
few changes to how we handle this field `#393`_:
1. Slugs are no longer editable. They're derived from the name of the tag or
correspondent at save time, so if you wanna change the slug, you have to
change the name, and even then you're restricted to the rules of the
``slugify()`` function. The slug value is still visible in the admin
though.
2. I've added a migration to go over all existing tags & correspondents and
rewrite the ``.slug`` values to ones conforming to the ``slugify()``
rules.
3. The consumption process now uses the same rules as ``.save()`` in
determining a slug and using that to check for an existing
tag/correspondent.
* An annoying bug in the date capture code was causing some bogus dates to be
attached to documents, which in turn busted the UI. Thanks to `Andrew Peng`_
for reporting this. `#414`_.
* A bug in the Dockerfile meant that Tesseract language files weren't being
installed correctly. `euri10`_ was quick to provide a fix: `#406`_, `#413`_.
* Document consumption is now wrapped in a transaction as per an old ticket
`#262`_.
* The ``get_date()`` functionality of the parsers has been consolidated onto
the ``DocumentParser`` class since much of that code was redundant anyway.
2.4.0
=====
* A new set of actions are now available thanks to `jonaswinkler`_'s very first
pull request! You can now do nifty things like tag documents in bulk, or set
correspondents in bulk. `#405`_
* The import/export system is now a little smarter. By default, documents are
tagged as ``unencrypted``, since exports are by their nature unencrypted.
It's now in the import step that we decide the storage type. This allows you
to export from an encrypted system and import into an unencrypted one, or
vice-versa.
* The migration history has been slightly modified to accomodate PostgreSQL
users. Additionally, you can now tell paperless to use PostgreSQL simply by
declaring ``PAPERLESS_DBUSER`` in your environment. This will attempt to
connect to your Postgres database without a password unless you also set
``PAPERLESS_DBPASS``.
* A bug was found in the REST API filter system that was the result of an
update of django-filter some time ago. This has now been patched `#412`_.
Thanks to `thepill`_ for spotting it!
2.3.0
=====
* Support for consuming plain text & markdown documents was added by
`Joshua Taillon`_! This was a long-requested feature, and it's addition is
likely to be greatly appreciated by the community: `#395`_ Thanks also to
`David Martin`_ for his assistance on the issue.
* `dubit0`_ found & fixed a bug that prevented management commands from running
before we had an operational database: `#396`_
* Joshua also added a simple update to the thumbnail generation process to
improve performance: `#399`_
* As his last bit of effort on this release, Joshua also added some code to
allow you to view the documents inline rather than download them as an
attachment. `#400`_
* Finally, `ahyear`_ found a slip in the Docker documentation and patched it.
`#401`_
2.2.1
=====
* `Kyle Lucy`_ reported a bug quickly after the release of 2.2.0 where we broke
the ``DISABLE_LOGIN`` feature: `#392`_.
2.2.0
=====
* Thanks to `dadosch`_ and `Wolfgang Mader`_, this is the first version of
Paperless that supports Django 2.0! As a result of their hard work, you can
now also run Paperless on Python 3.7 as well: `#386`_ & `#390`_.
* `Stéphane Brunner`_ added a few lines of code that made tagging interface a lot
easier on those of us with lots of different tags: `#391`_.
* Thanks to `dadosch`_, `Wolfgang Mader`_, and `Tim Brooks`_ this is the first
version of Paperless that supports Django 2.0! As a result of their hard
work, you can now also run Paperless on Python 3.7 as well: `#386`_ &
`#390`_.
* `Stéphane Brunner`_ added a few lines of code that made tagging interface a
lot easier on those of us with lots of different tags: `#391`_.
* `Kilian Koeltzsch`_ noticed a bug in how we capture & automatically create
tags, so that's fixed now too: `#384`_.
* `erikarvstedt`_ tweaked the behaviour of the test suite to be better behaved
for packaging environments: `#383`_.
* `Lukasz Soluch`_ added CORS support to make building a new Javascript-based
front-end cleaner & easier: `#387`_.
2.1.0
=====
@@ -464,8 +559,17 @@ bulk of the work on this big change.
.. _Mark McFate: https://github.com/SummittDweller
.. _dadosch: https://github.com/dadosch
.. _Wolfgang Mader: https://github.com/wmader
.. _Tim Brooks: https://github.com/brookst
.. _Stéphane Brunner: https://github.com/sbrunner
.. _Kilian Koeltzsch: https://github.com/kiliankoe
.. _Lukasz Soluch: https://github.com/LukaszSolo
.. _Joshua Taillon: https://github.com/jat255
.. _dubit0: https://github.com/dubit0
.. _ahyear: https://github.com/ahyear
.. _jonaswinkler: https://github.com/jonaswinkler
.. _thepill: https://github.com/thepill
.. _Andrew Peng: https://github.com/pengc99
.. _euri10: https://github.com/euri10
.. _#20: https://github.com/danielquinn/paperless/issues/20
.. _#44: https://github.com/danielquinn/paperless/issues/44
@@ -531,6 +635,7 @@ bulk of the work on this big change.
.. _#322: https://github.com/danielquinn/paperless/pull/322
.. _#328: https://github.com/danielquinn/paperless/pull/328
.. _#253: https://github.com/danielquinn/paperless/issues/253
.. _#262: https://github.com/danielquinn/paperless/issues/262
.. _#323: https://github.com/danielquinn/paperless/issues/323
.. _#344: https://github.com/danielquinn/paperless/pull/344
.. _#351: https://github.com/danielquinn/paperless/pull/351
@@ -540,10 +645,26 @@ bulk of the work on this big change.
.. _#374: https://github.com/danielquinn/paperless/pull/374
.. _#375: https://github.com/danielquinn/paperless/pull/375
.. _#376: https://github.com/danielquinn/paperless/pull/376
.. _#383: https://github.com/danielquinn/paperless/pull/383
.. _#384: https://github.com/danielquinn/paperless/issues/384
.. _#386: https://github.com/danielquinn/paperless/issues/386
.. _#387: https://github.com/danielquinn/paperless/pull/387
.. _#391: https://github.com/danielquinn/paperless/pull/391
.. _#390: https://github.com/danielquinn/paperless/pull/390
.. _#392: https://github.com/danielquinn/paperless/issues/392
.. _#393: https://github.com/danielquinn/paperless/issues/393
.. _#395: https://github.com/danielquinn/paperless/pull/395
.. _#394: https://github.com/danielquinn/paperless/issues/394
.. _#396: https://github.com/danielquinn/paperless/pull/396
.. _#399: https://github.com/danielquinn/paperless/pull/399
.. _#400: https://github.com/danielquinn/paperless/pull/400
.. _#401: https://github.com/danielquinn/paperless/pull/401
.. _#405: https://github.com/danielquinn/paperless/pull/405
.. _#406: https://github.com/danielquinn/paperless/issues/406
.. _#412: https://github.com/danielquinn/paperless/issues/412
.. _#413: https://github.com/danielquinn/paperless/pull/413
.. _#414: https://github.com/danielquinn/paperless/issues/414
.. _pipenv: https://docs.pipenv.org/
.. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/
.. _optipng: http://optipng.sourceforge.net/

View File

@@ -76,6 +76,31 @@ Pre-consumption script
* Document file name
A simple but common example for this would be creating a simple script like
this:
``/usr/local/bin/ocr-pdf``
.. code:: bash
#!/usr/bin/env bash
pdf2pdfocr.py -i ${1}
``/etc/paperless.conf``
.. code:: bash
...
PAPERLESS_PRE_CONSUME_SCRIPT="/usr/local/bin/ocr-pdf"
...
This will pass the path to the document about to be consumed to ``/usr/local/bin/ocr-pdf``,
which will in turn call `pdf2pdfocr.py`_ on your document, which will then
overwrite the file with an OCR'd version of the file and exit. At which point,
the consumption process will begin with the newly modified file.
.. _pdf2pdfocr.py: https://github.com/LeoFCardoso/pdf2pdfocr
.. _consumption-director-hook-variables-post:

141
docs/contributing.rst Normal file
View File

@@ -0,0 +1,141 @@
.. _contributing:
Contributing to Paperless
#########################
Maybe you've been using Paperless for a while and want to add a feature or two,
or maybe you've come across a bug that you have some ideas how to solve. The
beauty of Free software is that you can see what's wrong and help to get it
fixed for everyone!
How to Get Your Changes Rolled Into Paperless
=============================================
If you've found a bug, but don't know how to fix it, you can always post an
issue on `GitHub`_ in the hopes that someone will have the time to fix it for
you. If however you're the one with the time, pull requests are always
welcome, you just have to make sure that your code conforms to a few standards:
Pep8
----
It's the standard for all Python development, so it's `very well documented`_.
The short version is:
* Lines should wrap at 79 characters
* Use ``snake_case`` for variables, ``CamelCase`` for classes, and ``ALL_CAPS``
for constants.
* Space out your operators: ``stuff + 7`` instead of ``stuff+7``
* Two empty lines between classes, and functions, but 1 empty line between
class methods.
There's more to it than that, but if you follow those, you'll probably be
alright. When you submit your pull request, there's a pep8 checker that'll
look at your code to see if anything is off. If it finds anything, it'll
complain at you until you fix it.
Additional Style Guides
-----------------------
Where pep8 is ambiguous, I've tried to be a little more specific. These rules
aren't hard-and-fast, but if you can conform to them, I'll appreciate it and
spend less time trying to conform your PR before merging:
Function calls
..............
If you're calling a function and that necessitates more than one line of code,
please format it like this:
.. code:: python
my_function(
argument1,
kwarg1="x",
kwarg2="y"
another_really_long_kwarg="some big value"
a_kwarg_calling_another_long_function=another_function(
another_arg,
another_kwarg="kwarg!"
)
)
This is all in the interest of code uniformity rather than anything else. If
we stick to a style, everything is understandable in the same way.
Quoting Strings
...............
pep8 is a little too open-minded on this for my liking. Python strings should
be quoted with double quotes (``"``) except in cases where the resulting string
would require too much escaping of a double quote, in which case, a single
quoted, or triple-quoted string will do:
.. code:: python
my_string = "This is my string"
problematic_string = 'This is a "string" with "quotes" in it'
In HTML templates, please use double-quotes for tag attributes, and single
quotes for arguments passed to Django tempalte tags:
.. code:: html
<div class="stuff">
<a href="{% url 'some-url-name' pk='w00t' %}">link this</a>
</div>
This is to keep linters happy they look at an HTML file and see an attribute
closing the ``"`` before it should have been.
--
That's all there is in terms of guidelines, so I hope it's not too daunting.
Indentation & Spacing
.....................
When it comes to indentation:
* For Python, the rule is: follow pep8 and use 4 spaces.
* For Javascript, CSS, and HTML, please use 1 tab.
Additionally, Django templates making use of block elements like ``{% if %}``,
``{% for %}``, and ``{% block %}`` etc. should be indented:
Good:
.. code:: html
{% block stuff %}
<h1>This is the stuff</h1>
{% endblock %}
Bad:
.. code:: html
{% block stuff %}
<h1>This is the stuff</h1>
{% endblock %}
The Code of Conduct
===================
Paperless has a `code of conduct`_. It's a lot like the other ones you see out
there, with a few small changes, but basically it boils down to:
> Don't be an ass, or you might get banned.
I'm proud to say that the CoC has never had to be enforced because everyone has
been awesome, friendly, and professional.
.. _GitHub: https://github.com/danielquinn/paperless/issues
.. _very well documented: https://www.python.org/dev/peps/pep-0008/
.. _code of conduct: https://github.com/danielquinn/paperless/blob/master/CODE_OF_CONDUCT.md

View File

@@ -43,5 +43,6 @@ Contents
customising
extending
troubleshooting
contributing
scanners
changelog

View File

@@ -101,6 +101,7 @@ is similar:
$ cd /path/to/project
$ git pull
$ docker build -t paperless .
$ docker-compose run --rm comsumer migrate
$ docker-compose up -d
If ``git pull`` doesn't report any changes, there is no need to continue with

0
docs/requirements.txt Normal file
View File

View File

@@ -59,6 +59,11 @@ PAPERLESS_EMAIL_SECRET=""
#### Security ####
###############################################################################
# Controls whether django's debug mode is enabled. Disable this on production
# systems. Debug mode is enabled by default.
PAPERLESS_DEBUG="false"
# Paperless can be instructed to attempt to encrypt your PDF files with GPG
# using the PAPERLESS_PASSPHRASE specified below. If however you're not
# concerned about encrypting these files (for example if you have disk
@@ -89,9 +94,10 @@ PAPERLESS_EMAIL_SECRET=""
# as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
#PAPERLESS_ALLOWED_HOSTS="example.com,www.example.com"
# If you decide to use Paperless APIs in an ajax calls, you need to add your
# servers to the allowed hosts that can do CORS calls. By default Paperless allows
# calls from localhost:8080. The same rules as above how the list should look like.
# If you decide to use the Paperless API in an ajax call, you need to add your
# servers to the list of allowed hosts that can do CORS calls. By default
# Paperless allows calls from localhost:8080, but you'd like to change that,
# you can set this value to a comma-separated list.
#PAPERLESS_CORS_ALLOWED_HOSTS="localhost:8080,example.com,localhost:8000"
# To host paperless under a subpath url like example.com/paperless you set
@@ -116,6 +122,10 @@ PAPERLESS_EMAIL_SECRET=""
# http://paperless.readthedocs.org/en/latest/consumption.html#hooking-into-the-consumption-process
#PAPERLESS_POST_CONSUME_SCRIPT="/path/to/an/arbitrary/script.sh"
# By default, when clicking on a document within the web interface, the
# browser will prompt the user to save the document to disk. By setting this to
# "true", the document will instead be opened in the browser, if possible.
#PAPERLESS_INLINE_DOC="false"
#
# The following values use sensible defaults for modern systems, but if you're
@@ -198,3 +208,28 @@ PAPERLESS_EMAIL_SECRET=""
# positive integer, but if you don't define one in paperless.conf, a default of
# 100 will be used.
#PAPERLESS_LIST_PER_PAGE=100
# The number of years for which a correspondent will be included in the recent
# correspondents filter.
#PAPERLESS_RECENT_CORRESPONDENT_YEARS=1
###############################################################################
#### Third-Party Binaries ####
###############################################################################
# There are a few external software packages that Paperless expects to find on
# your system when it starts up. Unless you've done something creative with
# their installation, you probably won't need to edit any of these. However,
# if you've installed these programs somewhere where simply typing the name of
# the program doesn't automatically execute it (ie. the program isn't in your
# $PATH), then you'll need to specify the literal path for that program here.
# Convert (part of the ImageMagick suite)
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert
# Unpaper
#PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper
# Optipng (for optimising thumbnail sizes)
#PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng

24
requirements.txt Normal file → Executable file
View File

@@ -1,10 +1,10 @@
-i https://pypi.python.org/simple
apipkg==1.5; python_version != '3.1.*'
atomicwrites==1.2.1; python_version != '3.1.*'
apipkg==1.5; python_version != '3.3.*'
atomicwrites==1.2.1; python_version != '3.3.*'
attrs==18.2.0
certifi==2018.8.24
chardet==3.0.4
coverage==4.5.1; python_version != '3.1.*'
coverage==4.5.1; python_version < '4'
coveralls==1.5.0
dateparser==0.7.0
django-cors-headers==2.4.0
@@ -14,9 +14,9 @@ django-filter==2.0.0
django==2.0.8
djangorestframework==3.8.2
docopt==0.6.2
execnet==1.5.0; python_version != '3.1.*'
execnet==1.5.0; python_version != '3.3.*'
factory-boy==2.11.1
faker==0.9.0
faker==0.9.0; python_version >= '2.7'
filemagic==1.6
fuzzywuzzy==0.15.0
gunicorn==19.9.0
@@ -26,17 +26,17 @@ langdetect==1.0.7
more-itertools==4.3.0
pdftotext==2.1.0
pillow==5.2.0
pluggy==0.7.1; python_version != '3.1.*'
py==1.6.0; python_version != '3.1.*'
pluggy==0.7.1; python_version != '3.3.*'
py==1.6.0; python_version != '3.3.*'
pycodestyle==2.4.0
pyocr==0.5.2
pytest-cov==2.5.1
pyocr==0.5.3
pytest-cov==2.6.0
pytest-django==3.4.2
pytest-env==0.6.2
pytest-forked==0.2
pytest-forked==0.2; python_version != '3.3.*'
pytest-sugar==0.9.1
pytest-xdist==1.23.0
pytest==3.7.4
pytest==3.8.0
python-dateutil==2.7.3
python-dotenv==0.9.1
python-gnupg==0.4.3
@@ -48,4 +48,4 @@ six==1.11.0
termcolor==1.1.0
text-unidecode==1.2
tzlocal==1.5.1
urllib3==1.23; python_version != '3.0.*'
urllib3==1.23; python_version != '3.3.*'

146
src/documents/actions.py Normal file
View File

@@ -0,0 +1,146 @@
from django.contrib import messages
from django.contrib.admin import helpers
from django.contrib.admin.utils import model_ngettext
from django.core.exceptions import PermissionDenied
from django.template.response import TemplateResponse
from documents.models import Correspondent, Tag
def select_action(
modeladmin, request, queryset, title, action, modelclass,
success_message="", document_action=None, queryset_action=None):
opts = modeladmin.model._meta
app_label = opts.app_label
if not modeladmin.has_change_permission(request):
raise PermissionDenied
if request.POST.get('post'):
n = queryset.count()
selected_object = modelclass.objects.get(id=request.POST.get('obj_id'))
if n:
for document in queryset:
if document_action:
document_action(document, selected_object)
document_display = str(document)
modeladmin.log_change(request, document, document_display)
if queryset_action:
queryset_action(queryset, selected_object)
modeladmin.message_user(request, success_message % {
"selected_object": selected_object.name,
"count": n,
"items": model_ngettext(modeladmin.opts, n)
}, messages.SUCCESS)
# Return None to display the change list page again.
return None
context = dict(
modeladmin.admin_site.each_context(request),
title=title,
queryset=queryset,
opts=opts,
action_checkbox_name=helpers.ACTION_CHECKBOX_NAME,
media=modeladmin.media,
action=action,
objects=modelclass.objects.all(),
itemname=model_ngettext(modelclass, 1)
)
request.current_app = modeladmin.admin_site.name
return TemplateResponse(
request,
"admin/{}/{}/select_object.html".format(app_label, opts.model_name),
context
)
def simple_action(
modeladmin, request, queryset, success_message="",
document_action=None, queryset_action=None):
if not modeladmin.has_change_permission(request):
raise PermissionDenied
n = queryset.count()
if n:
for document in queryset:
if document_action:
document_action(document)
document_display = str(document)
modeladmin.log_change(request, document, document_display)
if queryset_action:
queryset_action(queryset)
modeladmin.message_user(request, success_message % {
"count": n, "items": model_ngettext(modeladmin.opts, n)
}, messages.SUCCESS)
# Return None to display the change list page again.
return None
def add_tag_to_selected(modeladmin, request, queryset):
return select_action(
modeladmin=modeladmin,
request=request,
queryset=queryset,
title="Add tag to multiple documents",
action="add_tag_to_selected",
modelclass=Tag,
success_message="Successfully added tag %(selected_object)s to "
"%(count)d %(items)s.",
document_action=lambda doc, tag: doc.tags.add(tag)
)
def remove_tag_from_selected(modeladmin, request, queryset):
return select_action(
modeladmin=modeladmin,
request=request,
queryset=queryset,
title="Remove tag from multiple documents",
action="remove_tag_from_selected",
modelclass=Tag,
success_message="Successfully removed tag %(selected_object)s from "
"%(count)d %(items)s.",
document_action=lambda doc, tag: doc.tags.remove(tag)
)
def set_correspondent_on_selected(modeladmin, request, queryset):
return select_action(
modeladmin=modeladmin,
request=request,
queryset=queryset,
title="Set correspondent on multiple documents",
action="set_correspondent_on_selected",
modelclass=Correspondent,
success_message="Successfully set correspondent %(selected_object)s "
"on %(count)d %(items)s.",
queryset_action=lambda qs, corr: qs.update(correspondent=corr)
)
def remove_correspondent_from_selected(modeladmin, request, queryset):
return simple_action(
modeladmin=modeladmin,
request=request,
queryset=queryset,
success_message="Successfully removed correspondent from %(count)d "
"%(items)s.",
queryset_action=lambda qs: qs.update(correspondent=None)
)
add_tag_to_selected.short_description = "Add tag to selected documents"
remove_tag_from_selected.short_description = \
"Remove tag from selected documents"
set_correspondent_on_selected.short_description = \
"Set correspondent on selected documents"
remove_correspondent_from_selected.short_description = \
"Remove correspondent from selected documents"

View File

@@ -1,42 +1,25 @@
from datetime import datetime
from datetime import datetime, timedelta
from django.conf import settings
from django.contrib import admin
from django.contrib.auth.models import User, Group
try:
from django.core.urlresolvers import reverse
except ImportError:
from django.urls import reverse
from django.contrib import admin, messages
from django.contrib.admin.templatetags.admin_urls import add_preserved_filters
from django.contrib.auth.models import Group, User
from django.db import models
from django.http import HttpResponseRedirect
from django.templatetags.static import static
from django.utils.safestring import mark_safe
from django.urls import reverse
from django.utils.html import format_html, format_html_join
from django.utils.http import urlquote
from django.utils.safestring import mark_safe
from .models import Correspondent, Tag, Document, Log
from documents.actions import (
add_tag_to_selected,
remove_correspondent_from_selected,
remove_tag_from_selected,
set_correspondent_on_selected
)
class MonthListFilter(admin.SimpleListFilter):
title = "Month"
# Parameter for the filter that will be used in the URL query.
parameter_name = "month"
def lookups(self, request, model_admin):
r = []
for document in Document.objects.all():
r.append((
document.created.strftime("%Y-%m"),
document.created.strftime("%B %Y")
))
return sorted(set(r), key=lambda x: x[0], reverse=True)
def queryset(self, request, queryset):
if not self.value():
return None
year, month = self.value().split("-")
return queryset.filter(created__year=year, created__month=month)
from .models import Correspondent, Document, Log, Tag
class FinancialYearFilter(admin.SimpleListFilter):
@@ -104,18 +87,61 @@ class FinancialYearFilter(admin.SimpleListFilter):
created__lte=self._fy_end(end))
class RecentCorrespondentFilter(admin.RelatedFieldListFilter):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.title = "correspondent (recent)"
def field_choices(self, field, request, model_admin):
years = settings.PAPERLESS_RECENT_CORRESPONDENT_YEARS
days = 365 * years
lookups = []
if years and years > 0:
correspondents = Correspondent.objects.filter(
documents__created__gte=datetime.now() - timedelta(days=days)
).distinct()
for c in correspondents:
lookups.append((c.id, c.name))
return lookups
class CommonAdmin(admin.ModelAdmin):
list_per_page = settings.PAPERLESS_LIST_PER_PAGE
class CorrespondentAdmin(CommonAdmin):
list_display = ("name", "match", "matching_algorithm", "document_count")
list_display = (
"name",
"match",
"matching_algorithm",
"document_count",
"last_correspondence"
)
list_filter = ("matching_algorithm",)
list_editable = ("match", "matching_algorithm")
readonly_fields = ("slug",)
def get_queryset(self, request):
qs = super(CorrespondentAdmin, self).get_queryset(request)
qs = qs.annotate(
document_count=models.Count("documents"),
last_correspondence=models.Max("documents__created")
)
return qs
def document_count(self, obj):
return obj.documents.count()
return obj.document_count
document_count.admin_order_field = "document_count"
def last_correspondence(self, obj):
return obj.last_correspondence
last_correspondence.admin_order_field = "last_correspondence"
class TagAdmin(CommonAdmin):
@@ -125,8 +151,16 @@ class TagAdmin(CommonAdmin):
list_filter = ("colour", "matching_algorithm")
list_editable = ("colour", "match", "matching_algorithm")
readonly_fields = ("slug",)
def get_queryset(self, request):
qs = super(TagAdmin, self).get_queryset(request)
qs = qs.annotate(document_count=models.Count("documents"))
return qs
def document_count(self, obj):
return obj.documents.count()
return obj.document_count
document_count.admin_order_field = "document_count"
class DocumentAdmin(CommonAdmin):
@@ -137,15 +171,33 @@ class DocumentAdmin(CommonAdmin):
}
search_fields = ("correspondent__name", "title", "content", "tags__name")
readonly_fields = ("added",)
readonly_fields = ("added", "file_type", "storage_type",)
list_display = ("title", "created", "added", "thumbnail", "correspondent",
"tags_")
list_filter = ("tags", "correspondent", FinancialYearFilter,
MonthListFilter)
list_filter = (
"tags",
("correspondent", RecentCorrespondentFilter),
"correspondent",
FinancialYearFilter
)
filter_horizontal = ("tags",)
ordering = ["-created", "correspondent"]
actions = [
add_tag_to_selected,
remove_tag_from_selected,
set_correspondent_on_selected,
remove_correspondent_from_selected
]
date_hierarchy = "created"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.document_queue = []
def has_add_permission(self, request):
return False
@@ -153,6 +205,79 @@ class DocumentAdmin(CommonAdmin):
return obj.created.date().strftime("%Y-%m-%d")
created_.short_description = "Created"
def changelist_view(self, request, extra_context=None):
response = super().changelist_view(
request,
extra_context=extra_context
)
if request.method == "GET":
cl = self.get_changelist_instance(request)
self.document_queue = [doc.id for doc in cl.queryset]
return response
def change_view(self, request, object_id=None, form_url='',
extra_context=None):
extra_context = extra_context or {}
if self.document_queue and object_id:
if int(object_id) in self.document_queue:
# There is a queue of documents
current_index = self.document_queue.index(int(object_id))
if current_index < len(self.document_queue) - 1:
# ... and there are still documents in the queue
extra_context["next_object"] = self.document_queue[
current_index + 1
]
return super(DocumentAdmin, self).change_view(
request,
object_id,
form_url,
extra_context=extra_context,
)
def response_change(self, request, obj):
# This is mostly copied from ModelAdmin.response_change()
opts = self.model._meta
preserved_filters = self.get_preserved_filters(request)
msg_dict = {
"name": opts.verbose_name,
"obj": format_html(
'<a href="{}">{}</a>',
urlquote(request.path),
obj
),
}
if "_saveandeditnext" in request.POST:
msg = format_html(
'The {name} "{obj}" was changed successfully. '
'Editing next object.',
**msg_dict
)
self.message_user(request, msg, messages.SUCCESS)
redirect_url = reverse(
"admin:{}_{}_change".format(opts.app_label, opts.model_name),
args=(request.POST["_next_object"],),
current_app=self.admin_site.name
)
redirect_url = add_preserved_filters(
{
"preserved_filters": preserved_filters,
"opts": opts
},
redirect_url
)
return HttpResponseRedirect(redirect_url)
return super().response_change(request, obj)
@mark_safe
def thumbnail(self, obj):
return self._html_tag(
"a",
@@ -165,8 +290,8 @@ class DocumentAdmin(CommonAdmin):
),
href=obj.download_url
)
thumbnail.allow_tags = True
@mark_safe
def tags_(self, obj):
r = ""
for tag in obj.tags.all():
@@ -183,10 +308,11 @@ class DocumentAdmin(CommonAdmin):
)
}
)
return mark_safe(r)
tags_.allow_tags = True
return r
@mark_safe
def document(self, obj):
# TODO: is this method even used anymore?
return self._html_tag(
"a",
self._html_tag(
@@ -199,7 +325,6 @@ class DocumentAdmin(CommonAdmin):
),
href=obj.download_url
)
document.allow_tags = True
@staticmethod
def _html_tag(kind, inside=None, **kwargs):

View File

@@ -2,7 +2,7 @@ import textwrap
from django.conf import settings
from django.core.checks import Error, register
from django.db.utils import OperationalError
from django.db.utils import OperationalError, ProgrammingError
@register()
@@ -14,7 +14,7 @@ def changed_password_check(app_configs, **kwargs):
try:
encrypted_doc = Document.objects.filter(
storage_type=Document.STORAGE_TYPE_GPG).first()
except OperationalError:
except (OperationalError, ProgrammingError):
return [] # No documents table yet
if encrypted_doc:

View File

@@ -1,3 +1,4 @@
from django.db import transaction
import datetime
import hashlib
import logging
@@ -111,8 +112,11 @@ class Consumer:
if not self.try_consume_file(file):
self._ignore.append((file, mtime))
@transaction.atomic
def try_consume_file(self, file):
"Return True if file was consumed"
"""
Return True if file was consumed
"""
if not re.match(FileInfo.REGEXES["title"], file):
return False
@@ -145,7 +149,7 @@ class Consumer:
parsed_document = parser_class(doc)
try:
thumbnail = parsed_document.get_thumbnail()
thumbnail = parsed_document.get_optimised_thumbnail()
date = parsed_document.get_date()
document = self._store(
parsed_document.get_text(),

View File

@@ -1,8 +1,14 @@
from django_filters.rest_framework import CharFilter, FilterSet, BooleanFilter
from django_filters.rest_framework import BooleanFilter, FilterSet
from .models import Correspondent, Document, Tag
CHAR_KWARGS = (
"startswith", "endswith", "contains",
"istartswith", "iendswith", "icontains"
)
class CorrespondentFilterSet(FilterSet):
class Meta:
@@ -31,34 +37,24 @@ class TagFilterSet(FilterSet):
class DocumentFilterSet(FilterSet):
CHAR_KWARGS = {
"lookup_expr": (
"startswith",
"endswith",
"contains",
"istartswith",
"iendswith",
"icontains"
)
}
correspondent__name = CharFilter(
field_name="correspondent__name", **CHAR_KWARGS)
correspondent__slug = CharFilter(
field_name="correspondent__slug", **CHAR_KWARGS)
tags__name = CharFilter(
field_name="tags__name", **CHAR_KWARGS)
tags__slug = CharFilter(
field_name="tags__slug", **CHAR_KWARGS)
tags__empty = BooleanFilter(
field_name="tags", lookup_expr="isnull", distinct=True)
tags_empty = BooleanFilter(
label="Is tagged",
field_name="tags",
lookup_expr="isnull",
exclude=True
)
class Meta:
model = Document
fields = {
"title": [
"startswith", "endswith", "contains",
"istartswith", "iendswith", "icontains"
],
"content": ["contains", "icontains"],
"title": CHAR_KWARGS,
"content": ("contains", "icontains"),
"correspondent__name": CHAR_KWARGS,
"correspondent__slug": CHAR_KWARGS,
"tags__name": CHAR_KWARGS,
"tags__slug": CHAR_KWARGS,
}

View File

@@ -55,7 +55,12 @@ class Command(Renderable, BaseCommand):
documents = Document.objects.all()
document_map = {d.pk: d for d in documents}
manifest = json.loads(serializers.serialize("json", documents))
for document_dict in manifest:
for index, document_dict in enumerate(manifest):
# Force output to unencrypted as that will be the current state.
# The importer will make the decision to encrypt or not.
manifest[index]["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED # NOQA: E501
document = document_map[document_dict["pk"]]

View File

@@ -94,7 +94,7 @@ class Command(Renderable, BaseCommand):
document_path = os.path.join(self.source, doc_file)
thumbnail_path = os.path.join(self.source, thumb_file)
if document.storage_type == Document.STORAGE_TYPE_GPG:
if settings.PASSPHRASE:
with open(document_path, "rb") as unencrypted:
with open(document.source_path, "wb") as encrypted:
@@ -112,3 +112,15 @@ class Command(Renderable, BaseCommand):
shutil.copy(document_path, document.source_path)
shutil.copy(thumbnail_path, document.thumbnail_path)
# Reset the storage type to whatever we've used while importing
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
if settings.PASSPHRASE:
storage_type = Document.STORAGE_TYPE_GPG
Document.objects.filter(
pk__in=[r["pk"] for r in self.manifest]
).update(
storage_type=storage_type
)

View File

@@ -158,9 +158,4 @@ class Migration(migrations.Migration):
name='modified',
field=models.DateTimeField(auto_now=True, db_index=True),
),
migrations.AlterField(
model_name='document',
name='checksum',
field=models.CharField(editable=False, help_text='The checksum of the original document (before it was encrypted). We use this to prevent duplicate document imports.', max_length=32, unique=True),
),
]

View File

@@ -12,6 +12,11 @@ class Migration(migrations.Migration):
]
operations = [
migrations.AlterField(
model_name='document',
name='checksum',
field=models.CharField(editable=False, help_text='The checksum of the original document (before it was encrypted). We use this to prevent duplicate document imports.', max_length=32, unique=True),
),
migrations.AddField(
model_name='correspondent',
name='is_insensitive',

View File

@@ -0,0 +1,52 @@
# Generated by Django 2.0.8 on 2018-10-07 14:20
from django.db import migrations, models
from django.utils.text import slugify
def re_slug_all_the_things(apps, schema_editor):
"""
Rewrite all slug values to make sure they're actually slugs before we brand
them as uneditable.
"""
Tag = apps.get_model("documents", "Tag")
Correspondent = apps.get_model("documents", "Tag")
for klass in (Tag, Correspondent):
for instance in klass.objects.all():
klass.objects.filter(
pk=instance.pk
).update(
slug=slugify(instance.slug)
)
class Migration(migrations.Migration):
dependencies = [
('documents', '0021_document_storage_type'),
]
operations = [
migrations.AlterModelOptions(
name='tag',
options={'ordering': ('name',)},
),
migrations.AlterField(
model_name='correspondent',
name='slug',
field=models.SlugField(blank=True, editable=False),
),
migrations.AlterField(
model_name='document',
name='file_type',
field=models.CharField(choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF'), ('txt', 'TXT'), ('csv', 'CSV'), ('md', 'MD')], editable=False, max_length=4),
),
migrations.AlterField(
model_name='tag',
name='slug',
field=models.SlugField(blank=True, editable=False),
),
migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop)
]

View File

@@ -1,24 +1,25 @@
# coding=utf-8
import dateutil.parser
import logging
import os
import re
import uuid
from collections import OrderedDict
import dateutil.parser
from django.conf import settings
from django.db import models
from django.template.defaultfilters import slugify
from django.utils import timezone
from django.utils.text import slugify
from fuzzywuzzy import fuzz
from django.conf import settings
from .managers import LogManager
try:
from django.core.urlresolvers import reverse
except ImportError:
from django.urls import reverse
from django.db import models
from django.template.defaultfilters import slugify
from django.utils import timezone
from .managers import LogManager
class MatchingModel(models.Model):
@@ -37,7 +38,7 @@ class MatchingModel(models.Model):
)
name = models.CharField(max_length=128, unique=True)
slug = models.SlugField(blank=True)
slug = models.SlugField(blank=True, editable=False)
match = models.CharField(max_length=256, blank=True)
matching_algorithm = models.PositiveIntegerField(
@@ -135,7 +136,7 @@ class MatchingModel(models.Model):
Example:
' some random words "with quotes " and spaces'
==>
["some", "random", "words", "with\s+quotes", "and", "spaces"]
["some", "random", "words", "with+quotes", "and", "spaces"]
"""
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub
@@ -147,9 +148,7 @@ class MatchingModel(models.Model):
def save(self, *args, **kwargs):
self.match = self.match.lower()
if not self.slug:
self.slug = slugify(self.name)
self.slug = slugify(self.name)
models.Model.save(self, *args, **kwargs)
@@ -192,7 +191,11 @@ class Document(models.Model):
TYPE_JPG = "jpg"
TYPE_GIF = "gif"
TYPE_TIF = "tiff"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
TYPE_TXT = "txt"
TYPE_CSV = "csv"
TYPE_MD = "md"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
TYPE_TXT, TYPE_CSV, TYPE_MD)
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"
@@ -365,51 +368,52 @@ class FileInfo:
)
)
formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
REGEXES = OrderedDict([
("created-correspondent-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("created-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("created-correspondent-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - "
r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("created-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("correspondent-title-tags", re.compile(
r"(?P<correspondent>.*) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("correspondent-title", re.compile(
r"(?P<correspondent>.*) - "
r"(?P<title>.*)?"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("title", re.compile(
r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
))
])
@@ -447,7 +451,7 @@ class FileInfo:
r = []
for t in tags.split(","):
r.append(Tag.objects.get_or_create(
slug=t.lower(),
slug=slugify(t),
defaults={"name": t}
)[0])
return tuple(r)

View File

@@ -1,8 +1,28 @@
import logging
import os
import re
import shutil
import subprocess
import tempfile
import dateparser
from django.conf import settings
from django.utils import timezone
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
DATE_REGEX = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b'
)
class ParseError(Exception):
@@ -16,6 +36,8 @@ class DocumentParser:
"""
SCRATCH = settings.SCRATCH_DIR
DATE_ORDER = settings.DATE_ORDER
OPTIPNG = settings.OPTIPNG_BINARY
def __init__(self, path):
self.document_path = path
@@ -29,6 +51,19 @@ class DocumentParser:
"""
raise NotImplementedError()
def optimise_thumbnail(self, in_path):
out_path = os.path.join(self.tempdir, "optipng.png")
args = (self.OPTIPNG, "-o5", in_path, "-out", out_path)
if not subprocess.Popen(args).wait() == 0:
raise ParseError("Optipng failed at {}".format(args))
return out_path
def get_optimised_thumbnail(self):
return self.optimise_thumbnail(self.get_thumbnail())
def get_text(self):
"""
Returns the text from the document and only the text.
@@ -39,7 +74,52 @@ class DocumentParser:
"""
Returns the date of the document.
"""
raise NotImplementedError()
date = None
date_string = None
try:
text = self.get_text()
except ParseError:
return None
next_year = timezone.now().year + 5 # Arbitrary 5 year future limit
# Iterate through all regex matches and try to parse the date
for m in re.finditer(DATE_REGEX, text):
date_string = m.group(0)
try:
date = dateparser.parse(
date_string,
settings={
"DATE_ORDER": self.DATE_ORDER,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE": True
}
)
except TypeError:
# Skip all matches that do not parse to a proper date
continue
if date is not None and next_year > date.year > 1900:
break
else:
date = None
if date is not None:
self.log(
"info",
"Detected document date {} based on string {}".format(
date.isoformat(),
date_string
)
)
else:
self.log("info", "Unable to detect date for document")
return date
def log(self, level, message):
getattr(self.logger, level)(message, extra={

View File

@@ -1,5 +1,21 @@
{% extends 'admin/change_form.html' %}
{% block content %}
{{ block.super }}
{% if next_object %}
<script type="text/javascript">//<![CDATA[
(function($){
$('<input type="submit" value="Save and edit next" name="_saveandeditnext" />')
.prependTo('div.submit-row');
$('<input type="hidden" value="{{next_object}}" name="_next_object" />')
.prependTo('div.submit-row');
})(django.jQuery);
//]]></script>
{% endif %}
{% endblock content %}
{% block footer %}
@@ -10,4 +26,4 @@
django.jQuery(".field-created input").first().attr("type", "date")
</script>
{% endblock footer %}
{% endblock footer %}

View File

@@ -0,0 +1,50 @@
{% extends "admin/base_site.html" %}
{% load i18n l10n admin_urls static %}
{% load staticfiles %}
{% block extrahead %}
{{ block.super }}
{{ media }}
<script type="text/javascript" src="{% static 'admin/js/cancel.js' %}"></script>
{% endblock %}
{% block bodyclass %}{{ block.super }} app-{{ opts.app_label }} model-{{ opts.model_name }} delete-confirmation delete-selected-confirmation{% endblock %}
{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
&rsaquo; <a href="{% url 'admin:app_list' app_label=opts.app_label %}">{{ opts.app_config.verbose_name }}</a>
&rsaquo; <a href="{% url opts|admin_urlname:'changelist' %}">{{ opts.verbose_name_plural|capfirst }}</a>
&rsaquo; {{ title }}
</div>
{% endblock %}
{% block content %}
<p>Please select the {{itemname}}.</p>
<form method="post">{% csrf_token %}
<div>
{% for obj in queryset %}
<input type="hidden" name="{{ action_checkbox_name }}" value="{{ obj.pk|unlocalize }}"/>
{% endfor %}
<p>
<select name="obj_id">
{% for obj in objects %}
<option value="{{ obj.id }}">{{ obj.name }}</option>
{% endfor %}
</select>
</p>
<input type="hidden" name="action" value="{{ action }}"/>
<input type="hidden" name="post" value="yes" />
<p>
<input type="submit" value="{% trans 'Confirm' %}" />
<a href="#" class="button cancel-link">{% trans "Go back" %}</a>
</p>
</div>
</form>
{% endblock %}

View File

@@ -166,7 +166,7 @@ class TestMatching(TestCase):
def test_match_regex(self):
self._test_matching(
"alpha\w+gamma",
r"alpha\w+gamma",
"MATCH_REGEX",
(
"I have alpha_and_gamma in me",

View File

@@ -1,6 +1,8 @@
from django.http import HttpResponse, HttpResponseBadRequest
from django.views.generic import DetailView, FormView, TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from django.conf import settings
from paperless.db import GnuPG
from paperless.mixins import SessionOrBasicAuthMixin
from paperless.views import StandardPagination
@@ -48,6 +50,9 @@ class FetchView(SessionOrBasicAuthMixin, DetailView):
Document.TYPE_JPG: "image/jpeg",
Document.TYPE_GIF: "image/gif",
Document.TYPE_TIF: "image/tiff",
Document.TYPE_CSV: "text/csv",
Document.TYPE_MD: "text/markdown",
Document.TYPE_TXT: "text/plain"
}
if self.kwargs["kind"] == "thumb":
@@ -60,8 +65,11 @@ class FetchView(SessionOrBasicAuthMixin, DetailView):
self._get_raw_data(self.object.source_file),
content_type=content_types[self.object.file_type]
)
response["Content-Disposition"] = 'attachment; filename="{}"'.format(
self.object.file_name)
DISPOSITION = 'inline' if settings.INLINE_DOC else 'attachment'
response["Content-Disposition"] = '{}; filename="{}"'.format(
DISPOSITION, self.object.file_name)
return response

View File

@@ -76,7 +76,12 @@ def binaries_check(app_configs, **kwargs):
error = "Paperless can't find {}. Without it, consumption is impossible."
hint = "Either it's not in your ${PATH} or it's not installed."
binaries = (settings.CONVERT_BINARY, settings.UNPAPER_BINARY, "tesseract")
binaries = (
settings.CONVERT_BINARY,
settings.OPTIPNG_BINARY,
settings.UNPAPER_BINARY,
"tesseract"
)
check_messages = []
for binary in binaries:

View File

@@ -1,15 +1,20 @@
from django.contrib.auth.models import User as DjangoUser
class User:
"""
This is a dummy django User used with our middleware to disable
login authentication if that is configured in paperless.conf
This is a dummy django User used with our middleware to disable
login authentication if that is configured in paperless.conf
"""
is_superuser = True
is_active = True
is_staff = True
is_authenticated = True
# Must be -1 to avoid colliding with real user ID's (which start at 1)
id = -1
@property
def id(self):
return DjangoUser.objects.order_by("pk").first().pk
@property
def pk(self):
@@ -17,9 +22,9 @@ class User:
"""
NOTE: These are here as a hack instead of being in the User definition
above due to the way pycodestyle handles lamdbdas.
See https://github.com/PyCQA/pycodestyle/issues/379 for more.
NOTE: These are here as a hack instead of being in the User definition
NOTE: above due to the way pycodestyle handles lamdbdas.
NOTE: See https://github.com/PyCQA/pycodestyle/issues/379 for more.
"""
User.has_module_perms = lambda *_: True

View File

@@ -22,6 +22,14 @@ elif os.path.exists("/usr/local/etc/paperless.conf"):
load_dotenv("/usr/local/etc/paperless.conf")
def __get_boolean(key, default="NO"):
"""
Return a boolean value based on whatever the user has supplied in the
environment based on whether the value "looks like" it's True or not.
"""
return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -39,7 +47,7 @@ SECRET_KEY = os.getenv(
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
DEBUG = __get_boolean("PAPERLESS_DEBUG", "YES")
LOGIN_URL = "admin:login"
@@ -67,6 +75,7 @@ INSTALLED_APPS = [
"documents.apps.DocumentsConfig",
"reminders.apps.RemindersConfig",
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"django.contrib.admin",
@@ -79,8 +88,6 @@ INSTALLED_APPS = [
if os.getenv("PAPERLESS_INSTALLED_APPS"):
INSTALLED_APPS += os.getenv("PAPERLESS_INSTALLED_APPS").split(",")
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
@@ -99,7 +106,6 @@ CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "localho
if bool(os.getenv("PAPERLESS_DISABLE_LOGIN", "false").lower() in ("yes", "y", "1", "t", "true")):
_index = MIDDLEWARE.index("django.contrib.auth.middleware.AuthenticationMiddleware")
MIDDLEWARE[_index] = "paperless.middleware.Middleware"
MIDDLEWARE.remove("django.contrib.auth.middleware.SessionAuthenticationMiddleware")
ROOT_URLCONF = 'paperless.urls'
@@ -138,13 +144,14 @@ DATABASES = {
}
}
if os.getenv("PAPERLESS_DBUSER") and os.getenv("PAPERLESS_DBPASS"):
if os.getenv("PAPERLESS_DBUSER"):
DATABASES["default"] = {
"ENGINE": "django.db.backends.postgresql_psycopg2",
"NAME": os.getenv("PAPERLESS_DBNAME", "paperless"),
"USER": os.getenv("PAPERLESS_DBUSER"),
"PASSWORD": os.getenv("PAPERLESS_DBPASS")
}
if os.getenv("PAPERLESS_DBPASS"):
DATABASES["default"]["PASSWORD"] = os.getenv("PAPERLESS_DBPASS")
# Password validation
@@ -224,12 +231,12 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
# OCR all documents?
OCR_ALWAYS = bool(os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in ("yes", "y", "1", "t", "true")) # NOQA
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS")
# If this is true, any failed attempts to OCR a PDF will result in the PDF
# being indexed anyway, with whatever we could get. If it's False, the file
# will simply be left in the CONSUMPTION_DIR.
FORGIVING_OCR = bool(os.getenv("PAPERLESS_FORGIVING_OCR", "YES").lower() in ("yes", "y", "1", "t", "true")) # NOQA
FORGIVING_OCR = __get_boolean("PAPERLESS_FORGIVING_OCR")
# GNUPG needs a home directory for some reason
GNUPG_HOME = os.getenv("HOME", "/tmp")
@@ -240,6 +247,9 @@ CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY")
# OptiPNG
OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
# Unpaper
UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
@@ -273,6 +283,9 @@ PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE")
PRE_CONSUME_SCRIPT = os.getenv("PAPERLESS_PRE_CONSUME_SCRIPT")
POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT")
# Whether to display a selected document inline, or download it as attachment:
INLINE_DOC = __get_boolean("PAPERLESS_INLINE_DOC")
# The number of items on each page in the web UI. This value must be a
# positive integer, but if you don't define one in paperless.conf, a default of
# 100 will be used.
@@ -283,3 +296,9 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")
# Specify the default date order (for autodetected dates)
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
# Specify for how many years a correspondent is considered recent. Recent
# correspondents will be shown in a separate "Recent correspondents" filter as
# well. Set to 0 to disable this filter.
PAPERLESS_RECENT_CORRESPONDENT_YEARS = int(os.getenv(
"PAPERLESS_RECENT_CORRESPONDENT_YEARS", 0))

View File

@@ -1 +1 @@
__version__ = (2, 2, 0)
__version__ = (2, 5, 0)

View File

@@ -4,7 +4,6 @@ import re
import subprocess
from multiprocessing.pool import Pool
import dateparser
import langdetect
import pyocr
from django.conf import settings
@@ -33,7 +32,6 @@ class RasterisedDocumentParser(DocumentParser):
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY
DATE_ORDER = settings.DATE_ORDER
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS
@@ -46,14 +44,18 @@ class RasterisedDocumentParser(DocumentParser):
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
out_path = os.path.join(self.tempdir, "convert.png")
# Run convert to get a decent thumbnail
run_convert(
self.CONVERT,
"-scale", "500x5000",
"-alpha", "remove",
self.document_path, os.path.join(self.tempdir, "convert-%04d.png")
"{}[0]".format(self.document_path),
out_path
)
return os.path.join(self.tempdir, "convert-0000.png")
return out_path
def _is_ocred(self):
@@ -171,8 +173,8 @@ class RasterisedDocumentParser(DocumentParser):
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
raise OCRError(
"The guessed language is not available in this instance of "
"Tesseract."
"The guessed language ({}) is not available in this instance "
"of Tesseract.".format(guessed_language)
)
def _ocr(self, imgs, lang):
@@ -201,54 +203,6 @@ class RasterisedDocumentParser(DocumentParser):
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
return text
def get_date(self):
date = None
datestring = None
try:
text = self.get_text()
except ParseError as e:
return None
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
pattern = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
# Iterate through all regex matches and try to parse the date
for m in re.finditer(pattern, text):
datestring = m.group(0)
try:
date = dateparser.parse(
datestring,
settings={'DATE_ORDER': self.DATE_ORDER,
'PREFER_DAY_OF_MONTH': 'first',
'RETURN_AS_TIMEZONE_AWARE': True})
except TypeError:
# Skip all matches that do not parse to a proper date
continue
if date is not None:
break
if date is not None:
self.log("info", "Detected document date " + date.isoformat() +
" based on string " + datestring)
else:
self.log("info", "Unable to detect date for document")
return date
def run_convert(*args):
@@ -272,8 +226,9 @@ def run_unpaper(args):
def strip_excess_whitespace(text):
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
no_leading_whitespace = re.sub(
"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace)
r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
no_trailing_whitespace = re.sub(
r"([^\S\n\r]+)$", '', no_leading_whitespace)
return no_trailing_whitespace

View File

@@ -5,7 +5,7 @@ from .parsers import RasterisedDocumentParser
class ConsumerDeclaration:
MATCHING_FILES = re.compile("^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
@classmethod
def handle(cls, sender, **kwargs):

View File

@@ -384,3 +384,42 @@ class TestDate(TestCase):
document.get_date(),
datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="01-07-0590 00:00:00"
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_crazy_date_past(self, *args):
document = RasterisedDocumentParser("/dev/null")
document.get_text()
self.assertIsNone(document.get_date())
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="01-07-2350 00:00:00"
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_crazy_date_future(self, *args):
document = RasterisedDocumentParser("/dev/null")
document.get_text()
self.assertIsNone(document.get_date())
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="01-07-0590 00:00:00"
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_crazy_date_past(self, *args):
document = RasterisedDocumentParser("/dev/null")
document.get_text()
self.assertIsNone(document.get_date())

View File

View File

@@ -0,0 +1,16 @@
from django.apps import AppConfig
class PaperlessTextConfig(AppConfig):
name = "paperless_text"
def ready(self):
from documents.signals import document_consumer_declaration
from .signals import ConsumerDeclaration
document_consumer_declaration.connect(ConsumerDeclaration.handle)
AppConfig.ready(self)

View File

@@ -0,0 +1,105 @@
import os
import subprocess
from django.conf import settings
from documents.parsers import DocumentParser, ParseError
class TextDocumentParser(DocumentParser):
"""
This parser directly parses a text document (.txt, .md, or .csv)
"""
CONVERT = settings.CONVERT_BINARY
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS
def __init__(self, path):
super().__init__(path)
self._text = None
def get_thumbnail(self):
"""
The thumbnail of a text file is just a 500px wide image of the text
rendered onto a letter-sized page.
"""
# The below is heavily cribbed from https://askubuntu.com/a/590951
bg_color = "white" # bg color
text_color = "black" # text color
psize = [500, 647] # icon size
n_lines = 50 # number of lines to show
out_path = os.path.join(self.tempdir, "convert.png")
temp_bg = os.path.join(self.tempdir, "bg.png")
temp_txlayer = os.path.join(self.tempdir, "tx.png")
picsize = "x".join([str(n) for n in psize])
txsize = "x".join([str(n - 8) for n in psize])
def create_bg():
work_size = ",".join([str(n - 1) for n in psize])
r = str(round(psize[0] / 10))
rounded = ",".join([r, r])
run_command(
self.CONVERT,
"-size ", picsize,
' xc:none -draw ',
'"fill ', bg_color, ' roundrectangle 0,0,', work_size, ",", rounded, '" ', # NOQA: E501
temp_bg
)
def read_text():
with open(self.document_path, 'r') as src:
lines = [l.strip() for l in src.readlines()]
text = "\n".join([l for l in lines[:n_lines]])
return text.replace('"', "'")
def create_txlayer():
run_command(
self.CONVERT,
"-background none",
"-fill",
text_color,
"-pointsize", "12",
"-border 4 -bordercolor none",
"-size ", txsize,
' caption:"', read_text(), '" ',
temp_txlayer
)
create_txlayer()
create_bg()
run_command(
self.CONVERT,
temp_bg,
temp_txlayer,
"-background None -layers merge ",
out_path
)
return out_path
def get_text(self):
if self._text is not None:
return self._text
with open(self.document_path, 'r') as f:
self._text = f.read()
return self._text
def run_command(*args):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
if settings.CONVERT_TMPDIR:
environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
if not subprocess.Popen(' '.join(args), env=environment,
shell=True).wait() == 0:
raise ParseError("Convert failed at {}".format(args))

View File

@@ -0,0 +1,23 @@
import re
from .parsers import TextDocumentParser
class ConsumerDeclaration:
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
@classmethod
def handle(cls, sender, **kwargs):
return cls.test
@classmethod
def test(cls, doc):
if cls.MATCHING_FILES.match(doc.lower()):
return {
"parser": TextDocumentParser,
"weight": 10
}
return None

View File

@@ -0,0 +1,19 @@
# Generated by Django 2.0.8 on 2018-10-07 14:20
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('reminders', '0001_initial'),
]
operations = [
migrations.AlterField(
model_name='reminder',
name='document',
field=models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, to='documents.Document'),
),
]

View File

@@ -4,7 +4,6 @@ from django.db import models
class Reminder(models.Model):
document = models.ForeignKey(
"documents.Document", on_delete=models.PROTECT
)
"documents.Document", on_delete=models.PROTECT)
date = models.DateTimeField()
note = models.TextField(blank=True)

View File

@@ -5,7 +5,7 @@
[tox]
skipsdist = True
envlist = py34, py35, py36, pycodestyle, doc
envlist = py34, py35, py36, py37, pycodestyle, doc
[testenv]
commands = pytest