diff --git a/Pipfile b/Pipfile index a9331f134..b1c30698d 100644 --- a/Pipfile +++ b/Pipfile @@ -36,3 +36,5 @@ pytest-xdist = "*" [dev-packages] ipython = "*" sphinx = "*" +tox = "*" + diff --git a/Pipfile.lock b/Pipfile.lock index 614ee0e78..71a46d37f 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "e20c2294bcafd346ee57901df94a515a12976ed192dc37df848b39b56bdd1f4b" + "sha256": "6d8bad24aa5d0c102b13b5ae27acba04836cd5a07a4003cb2763de1e0a3406b7" }, "pipfile-spec": 6, "requires": {}, @@ -19,7 +19,7 @@ "sha256:37228cda29411948b422fae072f57e31d3396d2ee1c9783775980ee9c9990af6", "sha256:58587dd4dc3daefad0487f6d9ae32b4542b185e1c36db6993290e7c41ca2b47c" ], - "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", "version": "==1.5" }, "atomicwrites": { @@ -27,7 +27,7 @@ "sha256:0312ad34fcad8fac3704d441f7b317e50af620823353ec657a53e981f92920c0", "sha256:ec9ae8adaae229e4f8446952d204a3e4b5fdd2d099f9be3aaf556120135fb3ee" ], - "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", "version": "==1.2.1" }, "attrs": { @@ -85,7 +85,7 @@ "sha256:e05cb4d9aad6233d67e0541caa7e511fa4047ed7750ec2510d466e806e0255d6", "sha256:f3f501f345f24383c0000395b26b726e46758b71393267aeae0bd36f8b3ade80" ], - "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.2.*' and python_version < '4' and python_version != '3.1.*'", + "markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version 
!= '3.0.*' and python_version != '3.1.*' and python_version < '4'", "version": "==4.5.1" }, "coveralls": { @@ -163,7 +163,7 @@ "sha256:a7a84d5fa07a089186a329528f127c9d73b9de57f1a1131b82bb5320ee651f6a", "sha256:fc155a6b553c66c838d1a22dba1dc9f5f505c43285a878c6f74a79c024750b83" ], - "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", "version": "==1.5.0" }, "factory-boy": { @@ -179,6 +179,7 @@ "sha256:ea7cfd3aeb1544732d08bd9cfba40c5b78e3a91e17b1a0698ab81bfc5554c628", "sha256:f6d67f04abfb2b4bea7afc7fa6c18cf4c523a67956e455668be9ae42bccc21ad" ], + "markers": "python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.2.*' and python_version >= '2.7'", "version": "==0.9.0" }, "filemagic": { @@ -282,7 +283,7 @@ "sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1", "sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1" ], - "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", "version": "==0.7.1" }, "py": { @@ -290,7 +291,7 @@ "sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1", "sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6" ], - "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", 
"version": "==1.6.0" }, "pycodestyle": { @@ -303,26 +304,26 @@ }, "pyocr": { "hashes": [ - "sha256:bdc4d43bf9b63c2a9a4b2c9a1a623a0e63c8e6600eede5dbe866b31f3a5f2207" + "sha256:b6ba6263fd92da56627dff6d263d991a2246aacd117d1788f11b93f419ca395f" ], "index": "pypi", - "version": "==0.5.2" + "version": "==0.5.3" }, "pytest": { "hashes": [ - "sha256:2d7c49e931316cc7d1638a3e5f54f5d7b4e5225972b3c9838f3584788d27f349", - "sha256:ad0c7db7b5d4081631e0155f5c61b80ad76ce148551aaafe3a718d65a7508b18" + "sha256:453cbbbe5ce6db38717d282b758b917de84802af4288910c12442984bde7b823", + "sha256:a8a07f84e680482eb51e244370aaf2caa6301ef265f37c2bdefb3dd3b663f99d" ], "index": "pypi", - "version": "==3.7.4" + "version": "==3.8.0" }, "pytest-cov": { "hashes": [ - "sha256:03aa752cf11db41d281ea1d807d954c4eda35cfa1b21d6971966cc041bbf6e2d", - "sha256:890fe5565400902b0c78b5357004aab1c814115894f4f21370e2433256a3eeec" + "sha256:513c425e931a0344944f84ea47f3956be0e416d95acbd897a44970c8d926d5d7", + "sha256:e360f048b7dae3f2f2a9a4d067b2dd6b6a015d384d1577c994a43f3f7cbad762" ], "index": "pypi", - "version": "==2.5.1" + "version": "==2.6.0" }, "pytest-django": { "hashes": [ @@ -344,6 +345,7 @@ "sha256:e4500cd0509ec4a26535f7d4112a8cc0f17d3a41c29ffd4eab479d2a55b30805", "sha256:f275cb48a73fc61a6710726348e1da6d68a978f0ec0c54ece5a5fae5977e5a08" ], + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", "version": "==0.2" }, "pytest-sugar": { @@ -457,7 +459,7 @@ "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5" ], - "markers": "python_version >= '2.6' and python_version != '3.3.*' and python_version < '4' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'", + "markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' 
and python_version < '4' and python_version != '3.3.*'", "version": "==1.23" } }, @@ -521,10 +523,11 @@ }, "imagesize": { "hashes": [ - "sha256:3620cc0cadba3f7475f9940d22431fc4d407269f1be59ec9b8edcca26440cf18", - "sha256:5b326e4678b6925158ccc66a9fa3122b6106d7c876ee32d7de6ce59385b96315" + "sha256:3f349de3eb99145973fefb7dbe38554414e5c30abd0c8e4b970a7c9d09f3a1d8", + "sha256:f3832918bc3c66617f92e35f5d70729187676313caa60c187eb0f28b8fe5e3b5" ], - "version": "==1.0.0" + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", + "version": "==1.1.0" }, "ipython": { "hashes": [ @@ -590,6 +593,14 @@ ], "version": "==0.7.4" }, + "pluggy": { + "hashes": [ + "sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1", + "sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1" + ], + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", + "version": "==0.7.1" + }, "prompt-toolkit": { "hashes": [ "sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381", @@ -605,6 +616,14 @@ ], "version": "==0.6.0" }, + "py": { + "hashes": [ + "sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1", + "sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6" + ], + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", + "version": "==1.6.0" + }, "pygments": { "hashes": [ "sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d", @@ -656,20 +675,28 @@ }, "sphinx": { "hashes": [ - "sha256:a07050845cc9a2f4026a6035cc8ed795a5ce7be6528bbc82032385c10807dfe7", - "sha256:d719de667218d763e8fd144b7fcfeefd8d434a6201f76bf9f0f0c1fa6f47fcdb" + 
"sha256:217a7705adcb573da5bbe1e0f5cab4fa0bd89fd9342c9159121746f593c2d5a4", + "sha256:a602513f385f1d5785ff1ca420d9c7eb1a1b63381733b2f0ea8188a391314a86" ], "index": "pypi", - "version": "==1.7.8" + "version": "==1.7.9" }, "sphinxcontrib-websupport": { "hashes": [ "sha256:68ca7ff70785cbe1e7bccc71a48b5b6d965d79ca50629606c7861a21b206d9dd", "sha256:9de47f375baf1ea07cdb3436ff39d7a9c76042c10a769c52353ec46e4e8fc3b9" ], - "markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'", + "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'", "version": "==1.1.0" }, + "tox": { + "hashes": [ + "sha256:37cf240781b662fb790710c6998527e65ca6851eace84d1595ee71f7af4e85f7", + "sha256:eb61aa5bcce65325538686f09848f04ef679b5cd9b83cc491272099b28739600" + ], + "index": "pypi", + "version": "==3.2.1" + }, "traitlets": { "hashes": [ "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835", @@ -682,9 +709,17 @@ "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5" ], - "markers": "python_version >= '2.6' and python_version != '3.3.*' and python_version < '4' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'", + "markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4' and python_version != '3.3.*'", "version": "==1.23" }, + "virtualenv": { + "hashes": [ + "sha256:2ce32cd126117ce2c539f0134eb89de91a8413a29baac49cbab3eb50e2026669", + "sha256:ca07b4c0b54e14a91af9f34d0919790b016923d157afda5efdde55c96718f752" + ], + "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.1.*'", + "version": "==16.0.0" 
+ }, "wcwidth": { "hashes": [ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", diff --git a/docs/changelog.rst b/docs/changelog.rst index f80445dde..804447855 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,23 @@ Changelog ######### +2.3.0 +===== + +* Support for consuming plain text & markdown documents was added by + `Joshua Taillon`_! This was a long-requested feature, and its addition is + likely to be greatly appreciated by the community: `#395`_. Thanks also to + `David Martin`_ for his assistance on the issue. +* `dubit0`_ found & fixed a bug that prevented management commands from running + before we had an operational database: `#396`_ +* Joshua also added a simple update to the thumbnail generation process to + improve performance: `#399`_ +* As his last bit of effort on this release, Joshua also added some code to + allow you to view the documents inline rather than download them as an + attachment. `#400`_ +* Finally, `ahyear`_ found a slip in the Docker documentation and patched it. `#401`_ + + 2.2.1 ===== @@ -19,6 +36,10 @@ Changelog easier on those of us with lots of different tags: `#391`_. * `Kilian Koeltzsch`_ noticed a bug in how we capture & automatically create tags, so that's fixed now too: `#384`_. +* `erikarvstedt`_ tweaked the behaviour of the test suite to be better behaved + for packaging environments: `#383`_. +* `Lukasz Soluch`_ added CORS support to make building a new Javascript-based front-end + cleaner & easier: `#387`_. 2.1.0 @@ -476,6 +497,10 @@ bulk of the work on this big change. .. _Tim Brooks: https://github.com/brookst .. _Stéphane Brunner: https://github.com/sbrunner .. _Kilian Koeltzsch: https://github.com/kiliankoe +.. _Lukasz Soluch: https://github.com/LukaszSolo +.. _Joshua Taillon: https://github.com/jat255 +.. _dubit0: https://github.com/dubit0 +.. _ahyear: https://github.com/ahyear .. _#20: https://github.com/danielquinn/paperless/issues/20 .. 
_#44: https://github.com/danielquinn/paperless/issues/44 @@ -550,11 +575,18 @@ bulk of the work on this big change. .. _#374: https://github.com/danielquinn/paperless/pull/374 .. _#375: https://github.com/danielquinn/paperless/pull/375 .. _#376: https://github.com/danielquinn/paperless/pull/376 +.. _#383: https://github.com/danielquinn/paperless/pull/383 .. _#384: https://github.com/danielquinn/paperless/issues/384 .. _#386: https://github.com/danielquinn/paperless/issues/386 +.. _#387: https://github.com/danielquinn/paperless/pull/387 .. _#391: https://github.com/danielquinn/paperless/pull/391 .. _#390: https://github.com/danielquinn/paperless/pull/390 .. _#392: https://github.com/danielquinn/paperless/issues/392 +.. _#395: https://github.com/danielquinn/paperless/pull/395 +.. _#396: https://github.com/danielquinn/paperless/pull/396 +.. _#399: https://github.com/danielquinn/paperless/pull/399 +.. _#400: https://github.com/danielquinn/paperless/pull/400 +.. _#401: https://github.com/danielquinn/paperless/pull/401 .. _pipenv: https://docs.pipenv.org/ .. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/ diff --git a/docs/migrating.rst b/docs/migrating.rst index d97d3d4bf..45646f058 100644 --- a/docs/migrating.rst +++ b/docs/migrating.rst @@ -101,6 +101,7 @@ is similar: $ cd /path/to/project $ git pull $ docker build -t paperless . 
+ $ docker-compose run --rm consumer migrate $ docker-compose up -d If ``git pull`` doesn't report any changes, there is no need to continue with diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..e69de29bb diff --git a/paperless.conf.example b/paperless.conf.example index 8aa33216f..15498a26a 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -89,9 +89,10 @@ PAPERLESS_EMAIL_SECRET="" # as is "example.com,www.example.com", but NOT " example.com" or "example.com," #PAPERLESS_ALLOWED_HOSTS="example.com,www.example.com" -# If you decide to use Paperless APIs in an ajax calls, you need to add your -# servers to the allowed hosts that can do CORS calls. By default Paperless allows -# calls from localhost:8080. The same rules as above how the list should look like. +# If you decide to use the Paperless API in an ajax call, you need to add your +# servers to the list of allowed hosts that can do CORS calls. By default +# Paperless allows calls from localhost:8080, but if you'd like to change that, +# you can set this value to a comma-separated list. #PAPERLESS_CORS_ALLOWED_HOSTS="localhost:8080,example.com,localhost:8000" # To host paperless under a subpath url like example.com/paperless you set @@ -116,6 +117,10 @@ PAPERLESS_EMAIL_SECRET="" # http://paperless.readthedocs.org/en/latest/consumption.html#hooking-into-the-consumption-process #PAPERLESS_POST_CONSUME_SCRIPT="/path/to/an/arbitrary/script.sh" +# By default, when clicking on a document within the web interface, the +# browser will prompt the user to save the document to disk. By setting this to +# "true", the document will instead be opened in the browser, if possible. 
+#PAPERLESS_INLINE_DOC="false" # # The following values use sensible defaults for modern systems, but if you're diff --git a/requirements.txt b/requirements.txt index 247d9993a..0476efef1 100755 --- a/requirements.txt +++ b/requirements.txt @@ -29,7 +29,7 @@ pillow==5.2.0 pluggy==0.7.1; python_version != '3.1.*' py==1.6.0; python_version != '3.1.*' pycodestyle==2.4.0 -pyocr==0.5.2 +pyocr==0.5.3 pytest-cov==2.5.1 pytest-django==3.4.2 pytest-env==0.6.2 diff --git a/src/documents/checks.py b/src/documents/checks.py index c80b63863..3310b1806 100644 --- a/src/documents/checks.py +++ b/src/documents/checks.py @@ -2,7 +2,7 @@ import textwrap from django.conf import settings from django.core.checks import Error, register -from django.db.utils import OperationalError +from django.db.utils import OperationalError, ProgrammingError @register() @@ -14,7 +14,7 @@ def changed_password_check(app_configs, **kwargs): try: encrypted_doc = Document.objects.filter( storage_type=Document.STORAGE_TYPE_GPG).first() - except OperationalError: + except (OperationalError, ProgrammingError): return [] # No documents table yet if encrypted_doc: diff --git a/src/documents/models.py b/src/documents/models.py index 36466bbac..c66bb5b0f 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -1,24 +1,24 @@ # coding=utf-8 -import dateutil.parser import logging import os import re import uuid - from collections import OrderedDict + +import dateutil.parser +from django.conf import settings +from django.db import models +from django.template.defaultfilters import slugify +from django.utils import timezone from fuzzywuzzy import fuzz -from django.conf import settings +from .managers import LogManager + try: from django.core.urlresolvers import reverse except ImportError: from django.urls import reverse -from django.db import models -from django.template.defaultfilters import slugify -from django.utils import timezone - -from .managers import LogManager class MatchingModel(models.Model): 
@@ -135,7 +135,7 @@ class MatchingModel(models.Model): Example: ' some random words "with quotes " and spaces' ==> - ["some", "random", "words", "with\s+quotes", "and", "spaces"] + ["some", "random", "words", "with+quotes", "and", "spaces"] """ findterms = re.compile(r'"([^"]+)"|(\S+)').findall normspace = re.compile(r"\s+").sub @@ -192,7 +192,11 @@ class Document(models.Model): TYPE_JPG = "jpg" TYPE_GIF = "gif" TYPE_TIF = "tiff" - TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,) + TYPE_TXT = "txt" + TYPE_CSV = "csv" + TYPE_MD = "md" + TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF, + TYPE_TXT, TYPE_CSV, TYPE_MD) STORAGE_TYPE_UNENCRYPTED = "unencrypted" STORAGE_TYPE_GPG = "gpg" @@ -365,51 +369,52 @@ class FileInfo: ) ) + formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv" REGEXES = OrderedDict([ ("created-correspondent-title-tags", re.compile( r"^(?P\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P.*) - " r"(?P.*) - " r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("created-title-tags", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<title>.*) - " r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("created-correspondent-title", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<correspondent>.*) - " r"(?P<title>.*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("created-title", re.compile( r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"(?P<title>.*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("correspondent-title-tags", re.compile( r"(?P<correspondent>.*) - " r"(?P<title>.*) - " r"(?P<tags>[a-z0-9\-,]*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + 
r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("correspondent-title", re.compile( r"(?P<correspondent>.*) - " r"(?P<title>.*)?" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )), ("title", re.compile( r"(?P<title>.*)" - r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$", + r"\.(?P<extension>{})$".format(formats), flags=re.IGNORECASE )) ]) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index c44e4c5bf..884f91ae4 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -1,9 +1,25 @@ import logging import shutil import tempfile +import re from django.conf import settings +# This regular expression will try to find dates in the document at +# hand and will match the following formats: +# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits +# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits +# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits +# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits +# - MONTH ZZZZ, with ZZZZ being 4 digits +# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits +DATE_REGEX = re.compile( + r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + + r'\b([0-9]{1,2}[\. 
]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + + r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + + r'\b([^\W\d_]{3,9} [0-9]{4})\b' +) + class ParseError(Exception): pass diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py index 55d25598a..e592237b6 100644 --- a/src/documents/tests/test_matchables.py +++ b/src/documents/tests/test_matchables.py @@ -166,7 +166,7 @@ class TestMatching(TestCase): def test_match_regex(self): self._test_matching( - "alpha\w+gamma", + r"alpha\w+gamma", "MATCH_REGEX", ( "I have alpha_and_gamma in me", diff --git a/src/documents/views.py b/src/documents/views.py index e297e0984..9cb66b59c 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1,6 +1,8 @@ from django.http import HttpResponse, HttpResponseBadRequest from django.views.generic import DetailView, FormView, TemplateView from django_filters.rest_framework import DjangoFilterBackend +from django.conf import settings + from paperless.db import GnuPG from paperless.mixins import SessionOrBasicAuthMixin from paperless.views import StandardPagination @@ -48,6 +50,9 @@ class FetchView(SessionOrBasicAuthMixin, DetailView): Document.TYPE_JPG: "image/jpeg", Document.TYPE_GIF: "image/gif", Document.TYPE_TIF: "image/tiff", + Document.TYPE_CSV: "text/csv", + Document.TYPE_MD: "text/markdown", + Document.TYPE_TXT: "text/plain" } if self.kwargs["kind"] == "thumb": @@ -60,8 +65,11 @@ class FetchView(SessionOrBasicAuthMixin, DetailView): self._get_raw_data(self.object.source_file), content_type=content_types[self.object.file_type] ) - response["Content-Disposition"] = 'attachment; filename="{}"'.format( - self.object.file_name) + + DISPOSITION = 'inline' if settings.INLINE_DOC else 'attachment' + + response["Content-Disposition"] = '{}; filename="{}"'.format( + DISPOSITION, self.object.file_name) return response diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 06cc1807f..956b90a7f 100755 --- a/src/paperless/settings.py +++ 
def __get_boolean(key, default="NO"):
    """
    Return a boolean value based on whatever the user has supplied in the
    environment based on whether the value "looks like" it's True or not.

    :param key: Name of the environment variable to read.
    :param default: String used when ``key`` is not set in the environment.
        It goes through the same "looks like True" test as a user-supplied
        value, so pass "YES" for settings that must be on unless the user
        switches them off.
    :return: True if the lower-cased value is one of "yes", "y", "1", "t" or
        "true"; False for anything else.
    """
    return os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true")


# OCR all documents?
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS")

# If this is true, any failed attempts to OCR a PDF will result in the PDF
# being indexed anyway, with whatever we could get. If it's False, the file
# will simply be left in the CONSUMPTION_DIR.
# This setting fell back to "YES" before it was routed through
# __get_boolean(), so the fallback is passed explicitly to keep it enabled
# for installations that never set the variable.
FORGIVING_OCR = __get_boolean("PAPERLESS_FORGIVING_OCR", "YES")

# Whether to display a selected document inline, or download it as attachment:
INLINE_DOC = __get_boolean("PAPERLESS_INLINE_DOC")
This value must be a # positive integer, but if you don't define one in paperless.conf, a default of # 100 will be used. diff --git a/src/paperless/version.py b/src/paperless/version.py index 0fbece706..c1b36d9c1 100644 --- a/src/paperless/version.py +++ b/src/paperless/version.py @@ -1 +1 @@ -__version__ = (2, 2, 1) +__version__ = (2, 3, 0) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index add65985a..e3c2ed361 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \ from pyocr.tesseract import TesseractError import pdftotext -from documents.parsers import DocumentParser, ParseError +from documents.parsers import DocumentParser, ParseError, DATE_REGEX from .languages import ISO639 @@ -50,10 +50,11 @@ class RasterisedDocumentParser(DocumentParser): self.CONVERT, "-scale", "500x5000", "-alpha", "remove", - self.document_path, os.path.join(self.tempdir, "convert-%04d.png") + "{}[0]".format(self.document_path), + os.path.join(self.tempdir, "convert.png") ) - return os.path.join(self.tempdir, "convert-0000.png") + return os.path.join(self.tempdir, "convert.png") def _is_ocred(self): @@ -210,22 +211,8 @@ class RasterisedDocumentParser(DocumentParser): except ParseError as e: return None - # This regular expression will try to find dates in the document at - # hand and will match the following formats: - # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits - # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits - # - MONTH ZZZZ, with ZZZZ being 4 digits - # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits - pattern = re.compile( - r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + - r'\b([0-9]{1,2}[\. 
]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + - r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + - r'\b([^\W\d_]{3,9} [0-9]{4})\b') - # Iterate through all regex matches and try to parse the date - for m in re.finditer(pattern, text): + for m in re.finditer(DATE_REGEX, text): datestring = m.group(0) try: @@ -272,8 +259,9 @@ def run_unpaper(args): def strip_excess_whitespace(text): collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) no_leading_whitespace = re.sub( - "([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) - no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace) + r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) + no_trailing_whitespace = re.sub( + r"([^\S\n\r]+)$", '', no_leading_whitespace) return no_trailing_whitespace diff --git a/src/paperless_tesseract/signals.py b/src/paperless_tesseract/signals.py index 2fa54f5d5..237f15c52 100644 --- a/src/paperless_tesseract/signals.py +++ b/src/paperless_tesseract/signals.py @@ -5,7 +5,7 @@ from .parsers import RasterisedDocumentParser class ConsumerDeclaration: - MATCHING_FILES = re.compile("^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$") + MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$") @classmethod def handle(cls, sender, **kwargs): diff --git a/src/paperless_text/__init__.py b/src/paperless_text/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/paperless_text/apps.py b/src/paperless_text/apps.py new file mode 100644 index 000000000..389167368 --- /dev/null +++ b/src/paperless_text/apps.py @@ -0,0 +1,16 @@ +from django.apps import AppConfig + + +class PaperlessTextConfig(AppConfig): + + name = "paperless_text" + + def ready(self): + + from documents.signals import document_consumer_declaration + + from .signals import ConsumerDeclaration + + document_consumer_declaration.connect(ConsumerDeclaration.handle) + + AppConfig.ready(self) diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py new file mode 100644 index 
# --- src/paperless_text/parsers.py ---
import os
import re
import subprocess

import dateparser
from django.conf import settings

from documents.parsers import DocumentParser, ParseError, DATE_REGEX


class TextDocumentParser(DocumentParser):
    """
    This parser directly parses a text document (.txt, .md, or .csv)
    """

    CONVERT = settings.CONVERT_BINARY
    DATE_ORDER = settings.DATE_ORDER

    # The attributes below are not referenced by any method of this class;
    # they are kept so the class exposes the same names as before.
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS

    def __init__(self, path):
        super().__init__(path)
        self._text = None  # cache filled by the first get_text() call

    def get_thumbnail(self):
        """
        The thumbnail of a txt is just a 500px wide image of the text
        rendered onto a letter-sized page.

        Three ``convert`` invocations are made: one renders the first
        ``n_lines`` lines of the document as a caption layer, one draws a
        rounded background, and one merges the two.

        :return: Path of the generated PNG inside ``self.tempdir``.
        :raises ParseError: If any ``convert`` invocation exits non-zero.
        """
        # The below is heavily cribbed from https://askubuntu.com/a/590951

        bg_color = "white"  # bg color
        text_color = "black"  # text color
        psize = [500, 647]  # icon size
        n_lines = 50  # number of lines to show
        output_file = os.path.join(self.tempdir, "convert-txt.png")

        temp_bg = os.path.join(self.tempdir, "bg.png")
        temp_txlayer = os.path.join(self.tempdir, "tx.png")
        picsize = "x".join(str(n) for n in psize)
        txsize = "x".join(str(n - 8) for n in psize)

        def create_bg():
            work_size = ",".join(str(n - 1) for n in psize)
            r = str(round(psize[0] / 10))
            rounded = ",".join([r, r])
            run_command(
                self.CONVERT,
                "-size", picsize,
                "xc:none",
                "-draw",
                "fill {} roundrectangle 0,0,{},{}".format(
                    bg_color, work_size, rounded),
                temp_bg
            )

        def read_text():
            with open(self.document_path, 'r') as src:
                lines = [l.strip() for l in src.readlines()]
            # No quote mangling is needed any more: the text is handed to
            # convert as a single argv element and never passes through a
            # shell.
            return "\n".join(lines[:n_lines])

        def create_txlayer():
            # The document text is untrusted input. It must stay one
            # argument; building a shell string out of it would let
            # "$(...)" or backticks inside the document run commands.
            # NOTE(review): ImageMagick itself may still give special
            # meaning to a caption starting with "@" or containing "%"
            # escapes -- confirm against the ImageMagick documentation.
            run_command(
                self.CONVERT,
                "-background", "none",
                "-fill", text_color,
                "-pointsize", "12",
                "-border", "4",
                "-bordercolor", "none",
                "-size", txsize,
                "caption:" + read_text(),
                temp_txlayer
            )

        create_txlayer()
        create_bg()
        run_command(
            self.CONVERT, temp_bg, temp_txlayer,
            "-background", "None",
            "-layers", "merge",
            output_file
        )

        return output_file

    def get_text(self):
        """
        Return the full content of the document, reading the file only on
        the first call and serving the cached string afterwards.
        """

        if self._text is not None:
            return self._text

        with open(self.document_path, 'r') as f:
            self._text = f.read()

        return self._text

    def get_date(self):
        """
        Return the first date found in the document text, or None.

        Every DATE_REGEX match is handed to ``dateparser`` in order of
        appearance; the first one that parses wins. None is also returned
        when reading the text raises ParseError.
        """
        date = None
        datestring = None

        try:
            text = self.get_text()
        except ParseError:
            return None

        # Iterate through all regex matches and try to parse the date
        for m in re.finditer(DATE_REGEX, text):
            datestring = m.group(0)

            try:
                date = dateparser.parse(
                    datestring,
                    settings={'DATE_ORDER': self.DATE_ORDER,
                              'PREFER_DAY_OF_MONTH': 'first',
                              'RETURN_AS_TIMEZONE_AWARE': True})
            except TypeError:
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None:
                break

        if date is not None:
            self.log("info", "Detected document date " + date.isoformat() +
                     " based on string " + datestring)
        else:
            self.log("info", "Unable to detect date for document")

        return date


def run_command(*args):
    """
    Run an external command and raise ParseError if it exits non-zero.

    Each positional argument is exactly one element of the argument vector
    (program name first). No shell is involved, so arguments are passed to
    the program verbatim and need no quoting. MAGICK_MEMORY_LIMIT and
    MAGICK_TMPDIR are added to the child's environment when the
    corresponding settings are set.
    """
    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
    if settings.CONVERT_TMPDIR:
        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR

    if subprocess.Popen(list(args), env=environment).wait() != 0:
        raise ParseError("Convert failed at {}".format(args))


# --- src/paperless_text/signals.py ---
import re

from .parsers import TextDocumentParser


class ConsumerDeclaration:
    """
    Offers TextDocumentParser for file names ending in .txt/.text, .md or
    .csv.
    """

    MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        """Signal receiver: hand back the callable that tests a document."""
        return cls.test

    @classmethod
    def test(cls, doc):
        """
        Return a parser declaration for ``doc`` (a file name/path string,
        matched case-insensitively by lower-casing it), or None if the
        extension is not one this parser handles.
        """

        if cls.MATCHING_FILES.match(doc.lower()):
            return {
                "parser": TextDocumentParser,
                "weight": 10
            }

        return None