Compare commits

...

30 Commits
2.2.1 ... 2.3.0

Author SHA1 Message Date
Daniel Quinn
2edf65dd1e Bump to 2.3.0 2018-09-09 21:51:44 +01:00
Daniel Quinn
9a739bdbab Merge pull request #401 from ahyear/patch-1
add migrate commande to docker update process
2018-09-09 21:26:56 +01:00
Daniel Quinn
66db06590d Merge branch 'jat255-ENH_config_inline_or_attach' 2018-09-09 21:22:42 +01:00
Daniel Quinn
7cef108785 Streamline how we handle boolean values in settings.py 2018-09-09 21:22:07 +01:00
Daniel Quinn
a86a20ef0f Make the example file contain the default value 2018-09-09 21:16:53 +01:00
Daniel Quinn
f94347abc0 Merge branch 'ENH_config_inline_or_attach' of git://github.com/jat255/paperless into jat255-ENH_config_inline_or_attach 2018-09-09 21:15:14 +01:00
Daniel Quinn
46cbd10ba0 Merge pull request #399 from jat255/ENH_convert_only_one_page
Speed up thumbnail generation for PDFs
2018-09-09 21:12:42 +01:00
Daniel Quinn
2a96c648e8 Merge pull request #396 from dubit0/postgres_mysql_fix
Fix document checks with PostgreSQL and MySQL backends.
2018-09-09 21:10:36 +01:00
Daniel Quinn
75648cc74b Merge branch 'jat255-ENH_text_consumer' 2018-09-09 21:03:58 +01:00
Daniel Quinn
0472fe4e9e Reorder imports 2018-09-09 21:03:37 +01:00
Daniel Quinn
c99f5923d5 Rename parsers to DATE_REGEX
In moving the `parsers` variable into the package-level, it lost the
context, so a more descriptive name was needed.
2018-09-09 21:02:30 +01:00
Daniel Quinn
ef302abed7 Fix pycodestyle complaints 2018-09-09 20:55:37 +01:00
Daniel Quinn
2dc35cc856 Merge branch 'ENH_text_consumer' of git://github.com/jat255/paperless into jat255-ENH_text_consumer 2018-09-09 20:52:59 +01:00
Daniel Quinn
f4c399f0dd Merge pull request #398 from ddddavidmartin/bump_pyocr_version_for_tesseract_4_support
Bump required version for Pyocr to support the latest tesseract 4.
2018-09-09 20:01:51 +01:00
Daniel Quinn
5342db6ada Fix pycodestyle complaints
Apparently, pycodestyle updated itself to now check for invalid escape
sequences, which only complain if the regex in use isn't a raw string
(r"").
2018-09-09 20:00:12 +01:00
Daniel Quinn
5c39fff51b Add tox to dev dependencies 2018-09-09 19:59:47 +01:00
ahyear
ed0e40d3e6 add migrate commande to docker update process 2018-09-06 15:32:41 +02:00
Joshua Taillon
652ead2f5c remove debugging print statement 2018-09-05 23:05:37 -04:00
Joshua Taillon
be9757894a add INLINE_DOC to settings.py 2018-09-05 23:03:30 -04:00
Joshua Taillon
22378789e2 add option for inline vs. attachment for document rendering 2018-09-05 22:58:38 -04:00
Joshua Taillon
72c828170e move date-matching regex pattern to base parser module for use by all subclasses 2018-09-05 21:13:36 -04:00
Joshua Taillon
cac63494f0 change tesseract parser to only convert first page to save (potentially) massive amounts of work 2018-09-05 15:18:35 -04:00
Daniel Quinn
939a67bd4b Add empty requirements for rtd to reference 2018-09-05 11:16:42 +01:00
Daniel Quinn
fbc6a58f5a Add credits for 2.2.0 that I forgot 2018-09-05 10:59:06 +01:00
Daniel Quinn
01a358d2b0 Re-flow text to keep it <80c wide 2018-09-05 10:58:41 +01:00
David Martin
6b447628ed Bump required version for Pyocr to support the latest tesseract 4.
This recently changed in the official tesseract engine [0]. -psm is
not allowed as an option anymore and --psm has to be used instead. The
latest pyocr enables support for this [1].

[0] tesseract-ocr/tesseract@ee201e1
[1] 5abd0a566a
2018-09-05 13:03:42 +10:00
Thomas Niederprüm
2308d5a613 Catch ProgrammingError in Document checks.
When running PostgreSQL or MariaDB/MySQL backends, a query to a non-existent
table will raise a "ProgrammingError". This patch properly catches this error.
Without this patch all management calls to manage.py will lead to an error when
running PostgreSQL or MariaDB as a backend.
2018-09-04 20:11:48 +02:00
Joshua Taillon
23bf79274c Merge branch 'master' into ENH_text_consumer 2018-09-03 23:47:30 -04:00
Joshua Taillon
4849249d86 explicitly add txt, md, and csv types for consumer and viewer; fix thumbnail generation 2018-09-03 23:46:13 -04:00
Joshua Taillon
d6fedbec52 first stab at text consumer 2018-08-30 23:32:41 -04:00
20 changed files with 348 additions and 74 deletions

View File

@@ -36,3 +36,5 @@ pytest-xdist = "*"
[dev-packages]
ipython = "*"
sphinx = "*"
tox = "*"

83
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "e20c2294bcafd346ee57901df94a515a12976ed192dc37df848b39b56bdd1f4b"
"sha256": "6d8bad24aa5d0c102b13b5ae27acba04836cd5a07a4003cb2763de1e0a3406b7"
},
"pipfile-spec": 6,
"requires": {},
@@ -19,7 +19,7 @@
"sha256:37228cda29411948b422fae072f57e31d3396d2ee1c9783775980ee9c9990af6",
"sha256:58587dd4dc3daefad0487f6d9ae32b4542b185e1c36db6993290e7c41ca2b47c"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.5"
},
"atomicwrites": {
@@ -27,7 +27,7 @@
"sha256:0312ad34fcad8fac3704d441f7b317e50af620823353ec657a53e981f92920c0",
"sha256:ec9ae8adaae229e4f8446952d204a3e4b5fdd2d099f9be3aaf556120135fb3ee"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.2.1"
},
"attrs": {
@@ -85,7 +85,7 @@
"sha256:e05cb4d9aad6233d67e0541caa7e511fa4047ed7750ec2510d466e806e0255d6",
"sha256:f3f501f345f24383c0000395b26b726e46758b71393267aeae0bd36f8b3ade80"
],
"markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.2.*' and python_version < '4' and python_version != '3.1.*'",
"markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4'",
"version": "==4.5.1"
},
"coveralls": {
@@ -163,7 +163,7 @@
"sha256:a7a84d5fa07a089186a329528f127c9d73b9de57f1a1131b82bb5320ee651f6a",
"sha256:fc155a6b553c66c838d1a22dba1dc9f5f505c43285a878c6f74a79c024750b83"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.5.0"
},
"factory-boy": {
@@ -179,6 +179,7 @@
"sha256:ea7cfd3aeb1544732d08bd9cfba40c5b78e3a91e17b1a0698ab81bfc5554c628",
"sha256:f6d67f04abfb2b4bea7afc7fa6c18cf4c523a67956e455668be9ae42bccc21ad"
],
"markers": "python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.2.*' and python_version >= '2.7'",
"version": "==0.9.0"
},
"filemagic": {
@@ -282,7 +283,7 @@
"sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1",
"sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==0.7.1"
},
"py": {
@@ -290,7 +291,7 @@
"sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1",
"sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.6.0"
},
"pycodestyle": {
@@ -303,26 +304,26 @@
},
"pyocr": {
"hashes": [
"sha256:bdc4d43bf9b63c2a9a4b2c9a1a623a0e63c8e6600eede5dbe866b31f3a5f2207"
"sha256:b6ba6263fd92da56627dff6d263d991a2246aacd117d1788f11b93f419ca395f"
],
"index": "pypi",
"version": "==0.5.2"
"version": "==0.5.3"
},
"pytest": {
"hashes": [
"sha256:2d7c49e931316cc7d1638a3e5f54f5d7b4e5225972b3c9838f3584788d27f349",
"sha256:ad0c7db7b5d4081631e0155f5c61b80ad76ce148551aaafe3a718d65a7508b18"
"sha256:453cbbbe5ce6db38717d282b758b917de84802af4288910c12442984bde7b823",
"sha256:a8a07f84e680482eb51e244370aaf2caa6301ef265f37c2bdefb3dd3b663f99d"
],
"index": "pypi",
"version": "==3.7.4"
"version": "==3.8.0"
},
"pytest-cov": {
"hashes": [
"sha256:03aa752cf11db41d281ea1d807d954c4eda35cfa1b21d6971966cc041bbf6e2d",
"sha256:890fe5565400902b0c78b5357004aab1c814115894f4f21370e2433256a3eeec"
"sha256:513c425e931a0344944f84ea47f3956be0e416d95acbd897a44970c8d926d5d7",
"sha256:e360f048b7dae3f2f2a9a4d067b2dd6b6a015d384d1577c994a43f3f7cbad762"
],
"index": "pypi",
"version": "==2.5.1"
"version": "==2.6.0"
},
"pytest-django": {
"hashes": [
@@ -344,6 +345,7 @@
"sha256:e4500cd0509ec4a26535f7d4112a8cc0f17d3a41c29ffd4eab479d2a55b30805",
"sha256:f275cb48a73fc61a6710726348e1da6d68a978f0ec0c54ece5a5fae5977e5a08"
],
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==0.2"
},
"pytest-sugar": {
@@ -457,7 +459,7 @@
"sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf",
"sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5"
],
"markers": "python_version >= '2.6' and python_version != '3.3.*' and python_version < '4' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'",
"markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4' and python_version != '3.3.*'",
"version": "==1.23"
}
},
@@ -521,10 +523,11 @@
},
"imagesize": {
"hashes": [
"sha256:3620cc0cadba3f7475f9940d22431fc4d407269f1be59ec9b8edcca26440cf18",
"sha256:5b326e4678b6925158ccc66a9fa3122b6106d7c876ee32d7de6ce59385b96315"
"sha256:3f349de3eb99145973fefb7dbe38554414e5c30abd0c8e4b970a7c9d09f3a1d8",
"sha256:f3832918bc3c66617f92e35f5d70729187676313caa60c187eb0f28b8fe5e3b5"
],
"version": "==1.0.0"
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.1.0"
},
"ipython": {
"hashes": [
@@ -590,6 +593,14 @@
],
"version": "==0.7.4"
},
"pluggy": {
"hashes": [
"sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1",
"sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1"
],
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==0.7.1"
},
"prompt-toolkit": {
"hashes": [
"sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381",
@@ -605,6 +616,14 @@
],
"version": "==0.6.0"
},
"py": {
"hashes": [
"sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1",
"sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6"
],
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.6.0"
},
"pygments": {
"hashes": [
"sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d",
@@ -656,20 +675,28 @@
},
"sphinx": {
"hashes": [
"sha256:a07050845cc9a2f4026a6035cc8ed795a5ce7be6528bbc82032385c10807dfe7",
"sha256:d719de667218d763e8fd144b7fcfeefd8d434a6201f76bf9f0f0c1fa6f47fcdb"
"sha256:217a7705adcb573da5bbe1e0f5cab4fa0bd89fd9342c9159121746f593c2d5a4",
"sha256:a602513f385f1d5785ff1ca420d9c7eb1a1b63381733b2f0ea8188a391314a86"
],
"index": "pypi",
"version": "==1.7.8"
"version": "==1.7.9"
},
"sphinxcontrib-websupport": {
"hashes": [
"sha256:68ca7ff70785cbe1e7bccc71a48b5b6d965d79ca50629606c7861a21b206d9dd",
"sha256:9de47f375baf1ea07cdb3436ff39d7a9c76042c10a769c52353ec46e4e8fc3b9"
],
"markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.1.0"
},
"tox": {
"hashes": [
"sha256:37cf240781b662fb790710c6998527e65ca6851eace84d1595ee71f7af4e85f7",
"sha256:eb61aa5bcce65325538686f09848f04ef679b5cd9b83cc491272099b28739600"
],
"index": "pypi",
"version": "==3.2.1"
},
"traitlets": {
"hashes": [
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
@@ -682,9 +709,17 @@
"sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf",
"sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5"
],
"markers": "python_version >= '2.6' and python_version != '3.3.*' and python_version < '4' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'",
"markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4' and python_version != '3.3.*'",
"version": "==1.23"
},
"virtualenv": {
"hashes": [
"sha256:2ce32cd126117ce2c539f0134eb89de91a8413a29baac49cbab3eb50e2026669",
"sha256:ca07b4c0b54e14a91af9f34d0919790b016923d157afda5efdde55c96718f752"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"version": "==16.0.0"
},
"wcwidth": {
"hashes": [
"sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",

View File

@@ -1,6 +1,23 @@
Changelog
#########
2.3.0
=====
* Support for consuming plain text & markdown documents was added by
`Joshua Taillon`_! This was a long-requested feature, and it's addition is
likely to be greatly appreciated by the community: `#395`_ Thanks also to
`David Martin`_ for his assistance on the issue.
* `dubit0`_ found & fixed a bug that prevented management commands from running
before we had an operational database: `#396`_
* Joshua also added a simple update to the thumbnail generation process to
improve performance: `#399`_
* As his last bit of effort on this release, Joshua also added some code to
allow you to view the documents inline rather than download them as an
attachment. `#400`_
* Finally, `ahyear`_ found a slip in the Docker documentation and patched it. `#401`_
2.2.1
=====
@@ -19,6 +36,10 @@ Changelog
easier on those of us with lots of different tags: `#391`_.
* `Kilian Koeltzsch`_ noticed a bug in how we capture & automatically create
tags, so that's fixed now too: `#384`_.
* `erikarvstedt`_ tweaked the behaviour of the test suite to be better behaved
for packaging environments: `#383`_.
* `Lukasz Soluch`_ added CORS support to make building a new Javascript-based front-end
cleaner & easier: `#387`_.
2.1.0
@@ -476,6 +497,10 @@ bulk of the work on this big change.
.. _Tim Brooks: https://github.com/brookst
.. _Stéphane Brunner: https://github.com/sbrunner
.. _Kilian Koeltzsch: https://github.com/kiliankoe
.. _Lukasz Soluch: https://github.com/LukaszSolo
.. _Joshua Taillon: https://github.com/jat255
.. _dubit0: https://github.com/dubit0
.. _ahyear: https://github.com/ahyear
.. _#20: https://github.com/danielquinn/paperless/issues/20
.. _#44: https://github.com/danielquinn/paperless/issues/44
@@ -550,11 +575,18 @@ bulk of the work on this big change.
.. _#374: https://github.com/danielquinn/paperless/pull/374
.. _#375: https://github.com/danielquinn/paperless/pull/375
.. _#376: https://github.com/danielquinn/paperless/pull/376
.. _#383: https://github.com/danielquinn/paperless/pull/383
.. _#384: https://github.com/danielquinn/paperless/issues/384
.. _#386: https://github.com/danielquinn/paperless/issues/386
.. _#387: https://github.com/danielquinn/paperless/pull/387
.. _#391: https://github.com/danielquinn/paperless/pull/391
.. _#390: https://github.com/danielquinn/paperless/pull/390
.. _#392: https://github.com/danielquinn/paperless/issues/392
.. _#395: https://github.com/danielquinn/paperless/pull/395
.. _#396: https://github.com/danielquinn/paperless/pull/396
.. _#399: https://github.com/danielquinn/paperless/pull/399
.. _#400: https://github.com/danielquinn/paperless/pull/400
.. _#401: https://github.com/danielquinn/paperless/pull/401
.. _pipenv: https://docs.pipenv.org/
.. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/

View File

@@ -101,6 +101,7 @@ is similar:
$ cd /path/to/project
$ git pull
$ docker build -t paperless .
$ docker-compose run --rm comsumer migrate
$ docker-compose up -d
If ``git pull`` doesn't report any changes, there is no need to continue with

0
docs/requirements.txt Normal file
View File

View File

@@ -89,9 +89,10 @@ PAPERLESS_EMAIL_SECRET=""
# as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
#PAPERLESS_ALLOWED_HOSTS="example.com,www.example.com"
# If you decide to use Paperless APIs in an ajax calls, you need to add your
# servers to the allowed hosts that can do CORS calls. By default Paperless allows
# calls from localhost:8080. The same rules as above how the list should look like.
# If you decide to use the Paperless API in an ajax call, you need to add your
# servers to the list of allowed hosts that can do CORS calls. By default
# Paperless allows calls from localhost:8080, but you'd like to change that,
# you can set this value to a comma-separated list.
#PAPERLESS_CORS_ALLOWED_HOSTS="localhost:8080,example.com,localhost:8000"
# To host paperless under a subpath url like example.com/paperless you set
@@ -116,6 +117,10 @@ PAPERLESS_EMAIL_SECRET=""
# http://paperless.readthedocs.org/en/latest/consumption.html#hooking-into-the-consumption-process
#PAPERLESS_POST_CONSUME_SCRIPT="/path/to/an/arbitrary/script.sh"
# By default, when clicking on a document within the web interface, the
# browser will prompt the user to save the document to disk. By setting this to
# "true", the document will instead be opened in the browser, if possible.
#PAPERLESS_INLINE_DOC="false"
#
# The following values use sensible defaults for modern systems, but if you're

View File

@@ -29,7 +29,7 @@ pillow==5.2.0
pluggy==0.7.1; python_version != '3.1.*'
py==1.6.0; python_version != '3.1.*'
pycodestyle==2.4.0
pyocr==0.5.2
pyocr==0.5.3
pytest-cov==2.5.1
pytest-django==3.4.2
pytest-env==0.6.2

View File

@@ -2,7 +2,7 @@ import textwrap
from django.conf import settings
from django.core.checks import Error, register
from django.db.utils import OperationalError
from django.db.utils import OperationalError, ProgrammingError
@register()
@@ -14,7 +14,7 @@ def changed_password_check(app_configs, **kwargs):
try:
encrypted_doc = Document.objects.filter(
storage_type=Document.STORAGE_TYPE_GPG).first()
except OperationalError:
except (OperationalError, ProgrammingError):
return [] # No documents table yet
if encrypted_doc:

View File

@@ -1,24 +1,24 @@
# coding=utf-8
import dateutil.parser
import logging
import os
import re
import uuid
from collections import OrderedDict
import dateutil.parser
from django.conf import settings
from django.db import models
from django.template.defaultfilters import slugify
from django.utils import timezone
from fuzzywuzzy import fuzz
from django.conf import settings
from .managers import LogManager
try:
from django.core.urlresolvers import reverse
except ImportError:
from django.urls import reverse
from django.db import models
from django.template.defaultfilters import slugify
from django.utils import timezone
from .managers import LogManager
class MatchingModel(models.Model):
@@ -135,7 +135,7 @@ class MatchingModel(models.Model):
Example:
' some random words "with quotes " and spaces'
==>
["some", "random", "words", "with\s+quotes", "and", "spaces"]
["some", "random", "words", "with+quotes", "and", "spaces"]
"""
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub
@@ -192,7 +192,11 @@ class Document(models.Model):
TYPE_JPG = "jpg"
TYPE_GIF = "gif"
TYPE_TIF = "tiff"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
TYPE_TXT = "txt"
TYPE_CSV = "csv"
TYPE_MD = "md"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
TYPE_TXT, TYPE_CSV, TYPE_MD)
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"
@@ -365,51 +369,52 @@ class FileInfo:
)
)
formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
REGEXES = OrderedDict([
("created-correspondent-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("created-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("created-correspondent-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - "
r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("created-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("correspondent-title-tags", re.compile(
r"(?P<correspondent>.*) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("correspondent-title", re.compile(
r"(?P<correspondent>.*) - "
r"(?P<title>.*)?"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("title", re.compile(
r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
))
])

View File

@@ -1,9 +1,25 @@
import logging
import shutil
import tempfile
import re
from django.conf import settings
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
DATE_REGEX = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b'
)
class ParseError(Exception):
pass

View File

@@ -166,7 +166,7 @@ class TestMatching(TestCase):
def test_match_regex(self):
self._test_matching(
"alpha\w+gamma",
r"alpha\w+gamma",
"MATCH_REGEX",
(
"I have alpha_and_gamma in me",

View File

@@ -1,6 +1,8 @@
from django.http import HttpResponse, HttpResponseBadRequest
from django.views.generic import DetailView, FormView, TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from django.conf import settings
from paperless.db import GnuPG
from paperless.mixins import SessionOrBasicAuthMixin
from paperless.views import StandardPagination
@@ -48,6 +50,9 @@ class FetchView(SessionOrBasicAuthMixin, DetailView):
Document.TYPE_JPG: "image/jpeg",
Document.TYPE_GIF: "image/gif",
Document.TYPE_TIF: "image/tiff",
Document.TYPE_CSV: "text/csv",
Document.TYPE_MD: "text/markdown",
Document.TYPE_TXT: "text/plain"
}
if self.kwargs["kind"] == "thumb":
@@ -60,8 +65,11 @@ class FetchView(SessionOrBasicAuthMixin, DetailView):
self._get_raw_data(self.object.source_file),
content_type=content_types[self.object.file_type]
)
response["Content-Disposition"] = 'attachment; filename="{}"'.format(
self.object.file_name)
DISPOSITION = 'inline' if settings.INLINE_DOC else 'attachment'
response["Content-Disposition"] = '{}; filename="{}"'.format(
DISPOSITION, self.object.file_name)
return response

View File

@@ -22,6 +22,14 @@ elif os.path.exists("/usr/local/etc/paperless.conf"):
load_dotenv("/usr/local/etc/paperless.conf")
def __get_boolean(key):
"""
Return a boolean value based on whatever the user has supplied in the
environment based on whether the value "looks like" it's True or not.
"""
return bool(os.getenv(key, "NO").lower() in ("yes", "y", "1", "t", "true"))
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -67,6 +75,7 @@ INSTALLED_APPS = [
"documents.apps.DocumentsConfig",
"reminders.apps.RemindersConfig",
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"django.contrib.admin",
@@ -221,12 +230,12 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
# OCR all documents?
OCR_ALWAYS = bool(os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in ("yes", "y", "1", "t", "true")) # NOQA
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS")
# If this is true, any failed attempts to OCR a PDF will result in the PDF
# being indexed anyway, with whatever we could get. If it's False, the file
# will simply be left in the CONSUMPTION_DIR.
FORGIVING_OCR = bool(os.getenv("PAPERLESS_FORGIVING_OCR", "YES").lower() in ("yes", "y", "1", "t", "true")) # NOQA
FORGIVING_OCR = __get_boolean("PAPERLESS_FORGIVING_OCR")
# GNUPG needs a home directory for some reason
GNUPG_HOME = os.getenv("HOME", "/tmp")
@@ -270,6 +279,9 @@ PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE")
PRE_CONSUME_SCRIPT = os.getenv("PAPERLESS_PRE_CONSUME_SCRIPT")
POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT")
# Whether to display a selected document inline, or download it as attachment:
INLINE_DOC = __get_boolean("PAPERLESS_INLINE_DOC")
# The number of items on each page in the web UI. This value must be a
# positive integer, but if you don't define one in paperless.conf, a default of
# 100 will be used.

View File

@@ -1 +1 @@
__version__ = (2, 2, 1)
__version__ = (2, 3, 0)

View File

@@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \
from pyocr.tesseract import TesseractError
import pdftotext
from documents.parsers import DocumentParser, ParseError
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
from .languages import ISO639
@@ -50,10 +50,11 @@ class RasterisedDocumentParser(DocumentParser):
self.CONVERT,
"-scale", "500x5000",
"-alpha", "remove",
self.document_path, os.path.join(self.tempdir, "convert-%04d.png")
"{}[0]".format(self.document_path),
os.path.join(self.tempdir, "convert.png")
)
return os.path.join(self.tempdir, "convert-0000.png")
return os.path.join(self.tempdir, "convert.png")
def _is_ocred(self):
@@ -210,22 +211,8 @@ class RasterisedDocumentParser(DocumentParser):
except ParseError as e:
return None
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
pattern = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
# Iterate through all regex matches and try to parse the date
for m in re.finditer(pattern, text):
for m in re.finditer(DATE_REGEX, text):
datestring = m.group(0)
try:
@@ -272,8 +259,9 @@ def run_unpaper(args):
def strip_excess_whitespace(text):
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
no_leading_whitespace = re.sub(
"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace)
r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
no_trailing_whitespace = re.sub(
r"([^\S\n\r]+)$", '', no_leading_whitespace)
return no_trailing_whitespace

View File

@@ -5,7 +5,7 @@ from .parsers import RasterisedDocumentParser
class ConsumerDeclaration:
MATCHING_FILES = re.compile("^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
@classmethod
def handle(cls, sender, **kwargs):

View File

View File

@@ -0,0 +1,16 @@
from django.apps import AppConfig
class PaperlessTextConfig(AppConfig):
name = "paperless_text"
def ready(self):
from documents.signals import document_consumer_declaration
from .signals import ConsumerDeclaration
document_consumer_declaration.connect(ConsumerDeclaration.handle)
AppConfig.ready(self)

View File

@@ -0,0 +1,131 @@
import os
import re
import subprocess
import dateparser
from django.conf import settings
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
class TextDocumentParser(DocumentParser):
"""
This parser directly parses a text document (.txt, .md, or .csv)
"""
CONVERT = settings.CONVERT_BINARY
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY
DATE_ORDER = settings.DATE_ORDER
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS
def __init__(self, path):
super().__init__(path)
self._text = None
def get_thumbnail(self):
"""
The thumbnail of a txt is just a 500px wide image of the text
rendered onto a letter-sized page.
"""
# The below is heavily cribbed from https://askubuntu.com/a/590951
bg_color = "white" # bg color
text_color = "black" # text color
psize = [500, 647] # icon size
n_lines = 50 # number of lines to show
output_file = os.path.join(self.tempdir, "convert-txt.png")
temp_bg = os.path.join(self.tempdir, "bg.png")
temp_txlayer = os.path.join(self.tempdir, "tx.png")
picsize = "x".join([str(n) for n in psize])
txsize = "x".join([str(n - 8) for n in psize])
def create_bg():
work_size = ",".join([str(n - 1) for n in psize])
r = str(round(psize[0] / 10))
rounded = ",".join([r, r])
run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ',
'"fill ', bg_color, ' roundrectangle 0,0,',
work_size, ",", rounded, '" ', temp_bg)
def read_text():
with open(self.document_path, 'r') as src:
lines = [l.strip() for l in src.readlines()]
text = "\n".join([l for l in lines[:n_lines]])
return text.replace('"', "'")
def create_txlayer():
run_command(self.CONVERT,
"-background none",
"-fill",
text_color,
"-pointsize", "12",
"-border 4 -bordercolor none",
"-size ", txsize,
' caption:"', read_text(), '" ',
temp_txlayer)
create_txlayer()
create_bg()
run_command(self.CONVERT, temp_bg, temp_txlayer,
"-background None -layers merge ", output_file)
return output_file
def get_text(self):
if self._text is not None:
return self._text
with open(self.document_path, 'r') as f:
self._text = f.read()
return self._text
def get_date(self):
date = None
datestring = None
try:
text = self.get_text()
except ParseError as e:
return None
# Iterate through all regex matches and try to parse the date
for m in re.finditer(DATE_REGEX, text):
datestring = m.group(0)
try:
date = dateparser.parse(
datestring,
settings={'DATE_ORDER': self.DATE_ORDER,
'PREFER_DAY_OF_MONTH': 'first',
'RETURN_AS_TIMEZONE_AWARE': True})
except TypeError:
# Skip all matches that do not parse to a proper date
continue
if date is not None:
break
if date is not None:
self.log("info", "Detected document date " + date.isoformat() +
" based on string " + datestring)
else:
self.log("info", "Unable to detect date for document")
return date
def run_command(*args):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
if settings.CONVERT_TMPDIR:
environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
if not subprocess.Popen(' '.join(args), env=environment,
shell=True).wait() == 0:
raise ParseError("Convert failed at {}".format(args))

View File

@@ -0,0 +1,23 @@
import re
from .parsers import TextDocumentParser
class ConsumerDeclaration:
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
@classmethod
def handle(cls, sender, **kwargs):
return cls.test
@classmethod
def test(cls, doc):
if cls.MATCHING_FILES.match(doc.lower()):
return {
"parser": TextDocumentParser,
"weight": 10
}
return None