Merge branch 'master' into dev

This commit is contained in:
Jonas Winkler 2018-09-12 11:47:35 +02:00
commit ef0d37985b
19 changed files with 345 additions and 71 deletions

View File

@ -36,3 +36,5 @@ pytest-xdist = "*"
[dev-packages]
ipython = "*"
sphinx = "*"
tox = "*"

83
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "e20c2294bcafd346ee57901df94a515a12976ed192dc37df848b39b56bdd1f4b"
"sha256": "6d8bad24aa5d0c102b13b5ae27acba04836cd5a07a4003cb2763de1e0a3406b7"
},
"pipfile-spec": 6,
"requires": {},
@ -19,7 +19,7 @@
"sha256:37228cda29411948b422fae072f57e31d3396d2ee1c9783775980ee9c9990af6",
"sha256:58587dd4dc3daefad0487f6d9ae32b4542b185e1c36db6993290e7c41ca2b47c"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.5"
},
"atomicwrites": {
@ -27,7 +27,7 @@
"sha256:0312ad34fcad8fac3704d441f7b317e50af620823353ec657a53e981f92920c0",
"sha256:ec9ae8adaae229e4f8446952d204a3e4b5fdd2d099f9be3aaf556120135fb3ee"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.2.1"
},
"attrs": {
@ -85,7 +85,7 @@
"sha256:e05cb4d9aad6233d67e0541caa7e511fa4047ed7750ec2510d466e806e0255d6",
"sha256:f3f501f345f24383c0000395b26b726e46758b71393267aeae0bd36f8b3ade80"
],
"markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.2.*' and python_version < '4' and python_version != '3.1.*'",
"markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4'",
"version": "==4.5.1"
},
"coveralls": {
@ -163,7 +163,7 @@
"sha256:a7a84d5fa07a089186a329528f127c9d73b9de57f1a1131b82bb5320ee651f6a",
"sha256:fc155a6b553c66c838d1a22dba1dc9f5f505c43285a878c6f74a79c024750b83"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.5.0"
},
"factory-boy": {
@ -179,6 +179,7 @@
"sha256:ea7cfd3aeb1544732d08bd9cfba40c5b78e3a91e17b1a0698ab81bfc5554c628",
"sha256:f6d67f04abfb2b4bea7afc7fa6c18cf4c523a67956e455668be9ae42bccc21ad"
],
"markers": "python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.2.*' and python_version >= '2.7'",
"version": "==0.9.0"
},
"filemagic": {
@ -282,7 +283,7 @@
"sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1",
"sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==0.7.1"
},
"py": {
@ -290,7 +291,7 @@
"sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1",
"sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.6.0"
},
"pycodestyle": {
@ -303,26 +304,26 @@
},
"pyocr": {
"hashes": [
"sha256:bdc4d43bf9b63c2a9a4b2c9a1a623a0e63c8e6600eede5dbe866b31f3a5f2207"
"sha256:b6ba6263fd92da56627dff6d263d991a2246aacd117d1788f11b93f419ca395f"
],
"index": "pypi",
"version": "==0.5.2"
"version": "==0.5.3"
},
"pytest": {
"hashes": [
"sha256:2d7c49e931316cc7d1638a3e5f54f5d7b4e5225972b3c9838f3584788d27f349",
"sha256:ad0c7db7b5d4081631e0155f5c61b80ad76ce148551aaafe3a718d65a7508b18"
"sha256:453cbbbe5ce6db38717d282b758b917de84802af4288910c12442984bde7b823",
"sha256:a8a07f84e680482eb51e244370aaf2caa6301ef265f37c2bdefb3dd3b663f99d"
],
"index": "pypi",
"version": "==3.7.4"
"version": "==3.8.0"
},
"pytest-cov": {
"hashes": [
"sha256:03aa752cf11db41d281ea1d807d954c4eda35cfa1b21d6971966cc041bbf6e2d",
"sha256:890fe5565400902b0c78b5357004aab1c814115894f4f21370e2433256a3eeec"
"sha256:513c425e931a0344944f84ea47f3956be0e416d95acbd897a44970c8d926d5d7",
"sha256:e360f048b7dae3f2f2a9a4d067b2dd6b6a015d384d1577c994a43f3f7cbad762"
],
"index": "pypi",
"version": "==2.5.1"
"version": "==2.6.0"
},
"pytest-django": {
"hashes": [
@ -344,6 +345,7 @@
"sha256:e4500cd0509ec4a26535f7d4112a8cc0f17d3a41c29ffd4eab479d2a55b30805",
"sha256:f275cb48a73fc61a6710726348e1da6d68a978f0ec0c54ece5a5fae5977e5a08"
],
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==0.2"
},
"pytest-sugar": {
@ -457,7 +459,7 @@
"sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf",
"sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5"
],
"markers": "python_version >= '2.6' and python_version != '3.3.*' and python_version < '4' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'",
"markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4' and python_version != '3.3.*'",
"version": "==1.23"
}
},
@ -521,10 +523,11 @@
},
"imagesize": {
"hashes": [
"sha256:3620cc0cadba3f7475f9940d22431fc4d407269f1be59ec9b8edcca26440cf18",
"sha256:5b326e4678b6925158ccc66a9fa3122b6106d7c876ee32d7de6ce59385b96315"
"sha256:3f349de3eb99145973fefb7dbe38554414e5c30abd0c8e4b970a7c9d09f3a1d8",
"sha256:f3832918bc3c66617f92e35f5d70729187676313caa60c187eb0f28b8fe5e3b5"
],
"version": "==1.0.0"
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.1.0"
},
"ipython": {
"hashes": [
@ -590,6 +593,14 @@
],
"version": "==0.7.4"
},
"pluggy": {
"hashes": [
"sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1",
"sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1"
],
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==0.7.1"
},
"prompt-toolkit": {
"hashes": [
"sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381",
@ -605,6 +616,14 @@
],
"version": "==0.6.0"
},
"py": {
"hashes": [
"sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1",
"sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6"
],
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.6.0"
},
"pygments": {
"hashes": [
"sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d",
@ -656,20 +675,28 @@
},
"sphinx": {
"hashes": [
"sha256:a07050845cc9a2f4026a6035cc8ed795a5ce7be6528bbc82032385c10807dfe7",
"sha256:d719de667218d763e8fd144b7fcfeefd8d434a6201f76bf9f0f0c1fa6f47fcdb"
"sha256:217a7705adcb573da5bbe1e0f5cab4fa0bd89fd9342c9159121746f593c2d5a4",
"sha256:a602513f385f1d5785ff1ca420d9c7eb1a1b63381733b2f0ea8188a391314a86"
],
"index": "pypi",
"version": "==1.7.8"
"version": "==1.7.9"
},
"sphinxcontrib-websupport": {
"hashes": [
"sha256:68ca7ff70785cbe1e7bccc71a48b5b6d965d79ca50629606c7861a21b206d9dd",
"sha256:9de47f375baf1ea07cdb3436ff39d7a9c76042c10a769c52353ec46e4e8fc3b9"
],
"markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'",
"markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
"version": "==1.1.0"
},
"tox": {
"hashes": [
"sha256:37cf240781b662fb790710c6998527e65ca6851eace84d1595ee71f7af4e85f7",
"sha256:eb61aa5bcce65325538686f09848f04ef679b5cd9b83cc491272099b28739600"
],
"index": "pypi",
"version": "==3.2.1"
},
"traitlets": {
"hashes": [
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
@ -682,9 +709,17 @@
"sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf",
"sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5"
],
"markers": "python_version >= '2.6' and python_version != '3.3.*' and python_version < '4' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'",
"markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4' and python_version != '3.3.*'",
"version": "==1.23"
},
"virtualenv": {
"hashes": [
"sha256:2ce32cd126117ce2c539f0134eb89de91a8413a29baac49cbab3eb50e2026669",
"sha256:ca07b4c0b54e14a91af9f34d0919790b016923d157afda5efdde55c96718f752"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.1.*'",
"version": "==16.0.0"
},
"wcwidth": {
"hashes": [
"sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",

View File

@ -1,6 +1,23 @@
Changelog
#########
2.3.0
=====
* Support for consuming plain text & markdown documents was added by
`Joshua Taillon`_! This was a long-requested feature, and it's addition is
likely to be greatly appreciated by the community: `#395`_ Thanks also to
`David Martin`_ for his assistance on the issue.
* `dubit0`_ found & fixed a bug that prevented management commands from running
before we had an operational database: `#396`_
* Joshua also added a simple update to the thumbnail generation process to
improve performance: `#399`_
* As his last bit of effort on this release, Joshua also added some code to
allow you to view the documents inline rather than download them as an
attachment. `#400`_
* Finally, `ahyear`_ found a slip in the Docker documentation and patched it. `#401`_
2.2.1
=====
@ -19,6 +36,10 @@ Changelog
easier on those of us with lots of different tags: `#391`_.
* `Kilian Koeltzsch`_ noticed a bug in how we capture & automatically create
tags, so that's fixed now too: `#384`_.
* `erikarvstedt`_ tweaked the behaviour of the test suite to be better behaved
for packaging environments: `#383`_.
* `Lukasz Soluch`_ added CORS support to make building a new Javascript-based front-end
cleaner & easier: `#387`_.
2.1.0
@ -476,6 +497,10 @@ bulk of the work on this big change.
.. _Tim Brooks: https://github.com/brookst
.. _Stéphane Brunner: https://github.com/sbrunner
.. _Kilian Koeltzsch: https://github.com/kiliankoe
.. _Lukasz Soluch: https://github.com/LukaszSolo
.. _Joshua Taillon: https://github.com/jat255
.. _dubit0: https://github.com/dubit0
.. _ahyear: https://github.com/ahyear
.. _#20: https://github.com/danielquinn/paperless/issues/20
.. _#44: https://github.com/danielquinn/paperless/issues/44
@ -550,11 +575,18 @@ bulk of the work on this big change.
.. _#374: https://github.com/danielquinn/paperless/pull/374
.. _#375: https://github.com/danielquinn/paperless/pull/375
.. _#376: https://github.com/danielquinn/paperless/pull/376
.. _#383: https://github.com/danielquinn/paperless/pull/383
.. _#384: https://github.com/danielquinn/paperless/issues/384
.. _#386: https://github.com/danielquinn/paperless/issues/386
.. _#387: https://github.com/danielquinn/paperless/pull/387
.. _#391: https://github.com/danielquinn/paperless/pull/391
.. _#390: https://github.com/danielquinn/paperless/pull/390
.. _#392: https://github.com/danielquinn/paperless/issues/392
.. _#395: https://github.com/danielquinn/paperless/pull/395
.. _#396: https://github.com/danielquinn/paperless/pull/396
.. _#399: https://github.com/danielquinn/paperless/pull/399
.. _#400: https://github.com/danielquinn/paperless/pull/400
.. _#401: https://github.com/danielquinn/paperless/pull/401
.. _pipenv: https://docs.pipenv.org/
.. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/

View File

@ -101,6 +101,7 @@ is similar:
$ cd /path/to/project
$ git pull
$ docker build -t paperless .
$ docker-compose run --rm comsumer migrate
$ docker-compose up -d
If ``git pull`` doesn't report any changes, there is no need to continue with

0
docs/requirements.txt Normal file
View File

View File

@ -111,9 +111,10 @@ PAPERLESS_DEBUG="false"
# as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
#PAPERLESS_ALLOWED_HOSTS="example.com,www.example.com"
# If you decide to use Paperless APIs in an ajax calls, you need to add your
# servers to the allowed hosts that can do CORS calls. By default Paperless allows
# calls from localhost:8080. The same rules as above how the list should look like.
# If you decide to use the Paperless API in an ajax call, you need to add your
# servers to the list of allowed hosts that can do CORS calls. By default
# Paperless allows calls from localhost:8080, but you'd like to change that,
# you can set this value to a comma-separated list.
#PAPERLESS_CORS_ALLOWED_HOSTS="localhost:8080,example.com,localhost:8000"
# To host paperless under a subpath url like example.com/paperless you set
@ -138,6 +139,10 @@ PAPERLESS_DEBUG="false"
# http://paperless.readthedocs.org/en/latest/consumption.html#hooking-into-the-consumption-process
#PAPERLESS_POST_CONSUME_SCRIPT="/path/to/an/arbitrary/script.sh"
# By default, when clicking on a document within the web interface, the
# browser will prompt the user to save the document to disk. By setting this to
# "true", the document will instead be opened in the browser, if possible.
#PAPERLESS_INLINE_DOC="false"
#
# The following values use sensible defaults for modern systems, but if you're

View File

@ -29,7 +29,7 @@ pillow==5.2.0
pluggy==0.7.1; python_version != '3.1.*'
py==1.6.0; python_version != '3.1.*'
pycodestyle==2.4.0
pyocr==0.5.2
pyocr==0.5.3
pytest-cov==2.5.1
pytest-django==3.4.2
pytest-env==0.6.2

View File

@ -1,24 +1,24 @@
# coding=utf-8
import dateutil.parser
import logging
import os
import re
import uuid
from collections import OrderedDict
import dateutil.parser
from django.conf import settings
from django.db import models
from django.template.defaultfilters import slugify
from django.utils import timezone
from fuzzywuzzy import fuzz
from django.conf import settings
from .managers import LogManager
try:
from django.core.urlresolvers import reverse
except ImportError:
from django.urls import reverse
from django.db import models
from django.template.defaultfilters import slugify
from django.utils import timezone
from .managers import LogManager
class MatchingModel(models.Model):
@ -94,7 +94,11 @@ class Document(models.Model):
TYPE_JPG = "jpg"
TYPE_GIF = "gif"
TYPE_TIF = "tiff"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
TYPE_TXT = "txt"
TYPE_CSV = "csv"
TYPE_MD = "md"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
TYPE_TXT, TYPE_CSV, TYPE_MD)
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"
@ -282,51 +286,52 @@ class FileInfo:
)
)
formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
REGEXES = OrderedDict([
("created-correspondent-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("created-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("created-correspondent-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - "
r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("created-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("correspondent-title-tags", re.compile(
r"(?P<correspondent>.*) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("correspondent-title", re.compile(
r"(?P<correspondent>.*) - "
r"(?P<title>.*)?"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
)),
("title", re.compile(
r"(?P<title>.*)"
r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
r"\.(?P<extension>{})$".format(formats),
flags=re.IGNORECASE
))
])

View File

@ -1,9 +1,25 @@
import logging
import shutil
import tempfile
import re
from django.conf import settings
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
DATE_REGEX = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b'
)
class ParseError(Exception):
pass

View File

@ -166,7 +166,7 @@ class TestMatching(TestCase):
def test_match_regex(self):
self._test_matching(
"alpha\w+gamma",
r"alpha\w+gamma",
"MATCH_REGEX",
(
"I have alpha_and_gamma in me",

View File

@ -1,6 +1,8 @@
from django.http import HttpResponse, HttpResponseBadRequest
from django.views.generic import DetailView, FormView, TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from django.conf import settings
from paperless.db import GnuPG
from paperless.mixins import SessionOrBasicAuthMixin
from paperless.views import StandardPagination
@ -48,6 +50,9 @@ class FetchView(SessionOrBasicAuthMixin, DetailView):
Document.TYPE_JPG: "image/jpeg",
Document.TYPE_GIF: "image/gif",
Document.TYPE_TIF: "image/tiff",
Document.TYPE_CSV: "text/csv",
Document.TYPE_MD: "text/markdown",
Document.TYPE_TXT: "text/plain"
}
if self.kwargs["kind"] == "thumb":
@ -60,8 +65,11 @@ class FetchView(SessionOrBasicAuthMixin, DetailView):
self._get_raw_data(self.object.source_file),
content_type=content_types[self.object.file_type]
)
response["Content-Disposition"] = 'attachment; filename="{}"'.format(
self.object.file_name)
DISPOSITION = 'inline' if settings.INLINE_DOC else 'attachment'
response["Content-Disposition"] = '{}; filename="{}"'.format(
DISPOSITION, self.object.file_name)
return response

View File

@ -22,6 +22,14 @@ elif os.path.exists("/usr/local/etc/paperless.conf"):
load_dotenv("/usr/local/etc/paperless.conf")
def __get_boolean(key):
"""
Return a boolean value based on whatever the user has supplied in the
environment based on whether the value "looks like" it's True or not.
"""
return bool(os.getenv(key, "NO").lower() in ("yes", "y", "1", "t", "true"))
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@ -67,6 +75,7 @@ INSTALLED_APPS = [
"documents.apps.DocumentsConfig",
"reminders.apps.RemindersConfig",
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"django.contrib.admin",
@ -226,12 +235,12 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
# OCR all documents?
OCR_ALWAYS = bool(os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in ("yes", "y", "1", "t", "true")) # NOQA
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS")
# If this is true, any failed attempts to OCR a PDF will result in the PDF
# being indexed anyway, with whatever we could get. If it's False, the file
# will simply be left in the CONSUMPTION_DIR.
FORGIVING_OCR = bool(os.getenv("PAPERLESS_FORGIVING_OCR", "YES").lower() in ("yes", "y", "1", "t", "true")) # NOQA
FORGIVING_OCR = __get_boolean("PAPERLESS_FORGIVING_OCR")
# GNUPG needs a home directory for some reason
GNUPG_HOME = os.getenv("HOME", "/tmp")
@ -275,6 +284,9 @@ PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE")
PRE_CONSUME_SCRIPT = os.getenv("PAPERLESS_PRE_CONSUME_SCRIPT")
POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT")
# Whether to display a selected document inline, or download it as attachment:
INLINE_DOC = __get_boolean("PAPERLESS_INLINE_DOC")
# The number of items on each page in the web UI. This value must be a
# positive integer, but if you don't define one in paperless.conf, a default of
# 100 will be used.

View File

@ -1 +1 @@
__version__ = (2, 2, 1)
__version__ = (2, 3, 0)

View File

@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \
from pyocr.tesseract import TesseractError
import pdftotext
from documents.parsers import DocumentParser, ParseError
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
from .languages import ISO639
@ -50,10 +50,11 @@ class RasterisedDocumentParser(DocumentParser):
self.CONVERT,
"-scale", "500x5000",
"-alpha", "remove",
self.document_path, os.path.join(self.tempdir, "convert-%04d.png")
"{}[0]".format(self.document_path),
os.path.join(self.tempdir, "convert.png")
)
return os.path.join(self.tempdir, "convert-0000.png")
return os.path.join(self.tempdir, "convert.png")
def _is_ocred(self):
@ -210,22 +211,8 @@ class RasterisedDocumentParser(DocumentParser):
except ParseError as e:
return None
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
pattern = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
# Iterate through all regex matches and try to parse the date
for m in re.finditer(pattern, text):
for m in re.finditer(DATE_REGEX, text):
datestring = m.group(0)
try:
@ -272,8 +259,9 @@ def run_unpaper(args):
def strip_excess_whitespace(text):
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
no_leading_whitespace = re.sub(
"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace)
r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
no_trailing_whitespace = re.sub(
r"([^\S\n\r]+)$", '', no_leading_whitespace)
return no_trailing_whitespace

View File

@ -5,7 +5,7 @@ from .parsers import RasterisedDocumentParser
class ConsumerDeclaration:
MATCHING_FILES = re.compile("^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
@classmethod
def handle(cls, sender, **kwargs):

View File

View File

@ -0,0 +1,16 @@
from django.apps import AppConfig
class PaperlessTextConfig(AppConfig):
name = "paperless_text"
def ready(self):
from documents.signals import document_consumer_declaration
from .signals import ConsumerDeclaration
document_consumer_declaration.connect(ConsumerDeclaration.handle)
AppConfig.ready(self)

View File

@ -0,0 +1,131 @@
import os
import re
import subprocess
import dateparser
from django.conf import settings
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
class TextDocumentParser(DocumentParser):
"""
This parser directly parses a text document (.txt, .md, or .csv)
"""
CONVERT = settings.CONVERT_BINARY
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY
DATE_ORDER = settings.DATE_ORDER
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS
def __init__(self, path):
super().__init__(path)
self._text = None
def get_thumbnail(self):
"""
The thumbnail of a txt is just a 500px wide image of the text
rendered onto a letter-sized page.
"""
# The below is heavily cribbed from https://askubuntu.com/a/590951
bg_color = "white" # bg color
text_color = "black" # text color
psize = [500, 647] # icon size
n_lines = 50 # number of lines to show
output_file = os.path.join(self.tempdir, "convert-txt.png")
temp_bg = os.path.join(self.tempdir, "bg.png")
temp_txlayer = os.path.join(self.tempdir, "tx.png")
picsize = "x".join([str(n) for n in psize])
txsize = "x".join([str(n - 8) for n in psize])
def create_bg():
work_size = ",".join([str(n - 1) for n in psize])
r = str(round(psize[0] / 10))
rounded = ",".join([r, r])
run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ',
'"fill ', bg_color, ' roundrectangle 0,0,',
work_size, ",", rounded, '" ', temp_bg)
def read_text():
with open(self.document_path, 'r') as src:
lines = [l.strip() for l in src.readlines()]
text = "\n".join([l for l in lines[:n_lines]])
return text.replace('"', "'")
def create_txlayer():
run_command(self.CONVERT,
"-background none",
"-fill",
text_color,
"-pointsize", "12",
"-border 4 -bordercolor none",
"-size ", txsize,
' caption:"', read_text(), '" ',
temp_txlayer)
create_txlayer()
create_bg()
run_command(self.CONVERT, temp_bg, temp_txlayer,
"-background None -layers merge ", output_file)
return output_file
def get_text(self):
if self._text is not None:
return self._text
with open(self.document_path, 'r') as f:
self._text = f.read()
return self._text
def get_date(self):
date = None
datestring = None
try:
text = self.get_text()
except ParseError as e:
return None
# Iterate through all regex matches and try to parse the date
for m in re.finditer(DATE_REGEX, text):
datestring = m.group(0)
try:
date = dateparser.parse(
datestring,
settings={'DATE_ORDER': self.DATE_ORDER,
'PREFER_DAY_OF_MONTH': 'first',
'RETURN_AS_TIMEZONE_AWARE': True})
except TypeError:
# Skip all matches that do not parse to a proper date
continue
if date is not None:
break
if date is not None:
self.log("info", "Detected document date " + date.isoformat() +
" based on string " + datestring)
else:
self.log("info", "Unable to detect date for document")
return date
def run_command(*args):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
if settings.CONVERT_TMPDIR:
environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
if not subprocess.Popen(' '.join(args), env=environment,
shell=True).wait() == 0:
raise ParseError("Convert failed at {}".format(args))

View File

@ -0,0 +1,23 @@
import re
from .parsers import TextDocumentParser
class ConsumerDeclaration:
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
@classmethod
def handle(cls, sender, **kwargs):
return cls.test
@classmethod
def test(cls, doc):
if cls.MATCHING_FILES.match(doc.lower()):
return {
"parser": TextDocumentParser,
"weight": 10
}
return None