Bump to 2.3.0

Merge pull request #401 from ahyear/patch-1
add migrate command to docker update process
2025-08-03 18:54:40 -05:00 · 2018-09-09 21:51:44 +01:00 · 2018-09-09 21:26:56 +01:00 · 2018-09-09 21:22:42 +01:00 · 2018-09-09 21:22:07 +01:00 · 2018-09-09 21:16:53 +01:00
20 changed files with 348 additions and 74 deletions
--- a/Pipfile
+++ b/Pipfile
@@ -36,3 +36,5 @@ pytest-xdist = "*"
 [dev-packages]
 ipython = "*"
 sphinx = "*"
+tox = "*"
+
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
    "_meta": {
        "hash": {
-            "sha256": "e20c2294bcafd346ee57901df94a515a12976ed192dc37df848b39b56bdd1f4b"
+            "sha256": "6d8bad24aa5d0c102b13b5ae27acba04836cd5a07a4003cb2763de1e0a3406b7"
        },
        "pipfile-spec": 6,
        "requires": {},
@@ -19,7 +19,7 @@
                "sha256:37228cda29411948b422fae072f57e31d3396d2ee1c9783775980ee9c9990af6",
                "sha256:58587dd4dc3daefad0487f6d9ae32b4542b185e1c36db6993290e7c41ca2b47c"
            ],
-            "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
+            "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
            "version": "==1.5"
        },
        "atomicwrites": {
@@ -27,7 +27,7 @@
                "sha256:0312ad34fcad8fac3704d441f7b317e50af620823353ec657a53e981f92920c0",
                "sha256:ec9ae8adaae229e4f8446952d204a3e4b5fdd2d099f9be3aaf556120135fb3ee"
            ],
-            "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
+            "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
            "version": "==1.2.1"
        },
        "attrs": {
@@ -85,7 +85,7 @@
                "sha256:e05cb4d9aad6233d67e0541caa7e511fa4047ed7750ec2510d466e806e0255d6",
                "sha256:f3f501f345f24383c0000395b26b726e46758b71393267aeae0bd36f8b3ade80"
            ],
-            "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.2.*' and python_version < '4' and python_version != '3.1.*'",
+            "markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4'",
            "version": "==4.5.1"
        },
        "coveralls": {
@@ -163,7 +163,7 @@
                "sha256:a7a84d5fa07a089186a329528f127c9d73b9de57f1a1131b82bb5320ee651f6a",
                "sha256:fc155a6b553c66c838d1a22dba1dc9f5f505c43285a878c6f74a79c024750b83"
            ],
-            "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
+            "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
            "version": "==1.5.0"
        },
        "factory-boy": {
@@ -179,6 +179,7 @@
                "sha256:ea7cfd3aeb1544732d08bd9cfba40c5b78e3a91e17b1a0698ab81bfc5554c628",
                "sha256:f6d67f04abfb2b4bea7afc7fa6c18cf4c523a67956e455668be9ae42bccc21ad"
            ],
+            "markers": "python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.2.*' and python_version >= '2.7'",
            "version": "==0.9.0"
        },
        "filemagic": {
@@ -282,7 +283,7 @@
                "sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1",
                "sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1"
            ],
-            "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
+            "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
            "version": "==0.7.1"
        },
        "py": {
@@ -290,7 +291,7 @@
                "sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1",
                "sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6"
            ],
-            "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'",
+            "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
            "version": "==1.6.0"
        },
        "pycodestyle": {
@@ -303,26 +304,26 @@
        },
        "pyocr": {
            "hashes": [
-                "sha256:bdc4d43bf9b63c2a9a4b2c9a1a623a0e63c8e6600eede5dbe866b31f3a5f2207"
+                "sha256:b6ba6263fd92da56627dff6d263d991a2246aacd117d1788f11b93f419ca395f"
            ],
            "index": "pypi",
-            "version": "==0.5.2"
+            "version": "==0.5.3"
        },
        "pytest": {
            "hashes": [
-                "sha256:2d7c49e931316cc7d1638a3e5f54f5d7b4e5225972b3c9838f3584788d27f349",
-                "sha256:ad0c7db7b5d4081631e0155f5c61b80ad76ce148551aaafe3a718d65a7508b18"
+                "sha256:453cbbbe5ce6db38717d282b758b917de84802af4288910c12442984bde7b823",
+                "sha256:a8a07f84e680482eb51e244370aaf2caa6301ef265f37c2bdefb3dd3b663f99d"
            ],
            "index": "pypi",
-            "version": "==3.7.4"
+            "version": "==3.8.0"
        },
        "pytest-cov": {
            "hashes": [
-                "sha256:03aa752cf11db41d281ea1d807d954c4eda35cfa1b21d6971966cc041bbf6e2d",
-                "sha256:890fe5565400902b0c78b5357004aab1c814115894f4f21370e2433256a3eeec"
+                "sha256:513c425e931a0344944f84ea47f3956be0e416d95acbd897a44970c8d926d5d7",
+                "sha256:e360f048b7dae3f2f2a9a4d067b2dd6b6a015d384d1577c994a43f3f7cbad762"
            ],
            "index": "pypi",
-            "version": "==2.5.1"
+            "version": "==2.6.0"
        },
        "pytest-django": {
            "hashes": [
@@ -344,6 +345,7 @@
                "sha256:e4500cd0509ec4a26535f7d4112a8cc0f17d3a41c29ffd4eab479d2a55b30805",
                "sha256:f275cb48a73fc61a6710726348e1da6d68a978f0ec0c54ece5a5fae5977e5a08"
            ],
+            "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
            "version": "==0.2"
        },
        "pytest-sugar": {
@@ -457,7 +459,7 @@
                "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf",
                "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5"
            ],
-            "markers": "python_version >= '2.6' and python_version != '3.3.*' and python_version < '4' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'",
+            "markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4' and python_version != '3.3.*'",
            "version": "==1.23"
        }
    },
@@ -521,10 +523,11 @@
        },
        "imagesize": {
            "hashes": [
-                "sha256:3620cc0cadba3f7475f9940d22431fc4d407269f1be59ec9b8edcca26440cf18",
-                "sha256:5b326e4678b6925158ccc66a9fa3122b6106d7c876ee32d7de6ce59385b96315"
+                "sha256:3f349de3eb99145973fefb7dbe38554414e5c30abd0c8e4b970a7c9d09f3a1d8",
+                "sha256:f3832918bc3c66617f92e35f5d70729187676313caa60c187eb0f28b8fe5e3b5"
            ],
-            "version": "==1.0.0"
+            "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
+            "version": "==1.1.0"
        },
        "ipython": {
            "hashes": [
@@ -590,6 +593,14 @@
            ],
            "version": "==0.7.4"
        },
+        "pluggy": {
+            "hashes": [
+                "sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1",
+                "sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1"
+            ],
+            "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
+            "version": "==0.7.1"
+        },
        "prompt-toolkit": {
            "hashes": [
                "sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381",
@@ -605,6 +616,14 @@
            ],
            "version": "==0.6.0"
        },
+        "py": {
+            "hashes": [
+                "sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1",
+                "sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6"
+            ],
+            "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
+            "version": "==1.6.0"
+        },
        "pygments": {
            "hashes": [
                "sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d",
@@ -656,20 +675,28 @@
        },
        "sphinx": {
            "hashes": [
-                "sha256:a07050845cc9a2f4026a6035cc8ed795a5ce7be6528bbc82032385c10807dfe7",
-                "sha256:d719de667218d763e8fd144b7fcfeefd8d434a6201f76bf9f0f0c1fa6f47fcdb"
+                "sha256:217a7705adcb573da5bbe1e0f5cab4fa0bd89fd9342c9159121746f593c2d5a4",
+                "sha256:a602513f385f1d5785ff1ca420d9c7eb1a1b63381733b2f0ea8188a391314a86"
            ],
            "index": "pypi",
-            "version": "==1.7.8"
+            "version": "==1.7.9"
        },
        "sphinxcontrib-websupport": {
            "hashes": [
                "sha256:68ca7ff70785cbe1e7bccc71a48b5b6d965d79ca50629606c7861a21b206d9dd",
                "sha256:9de47f375baf1ea07cdb3436ff39d7a9c76042c10a769c52353ec46e4e8fc3b9"
            ],
-            "markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'",
+            "markers": "python_version != '3.2.*' and python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*'",
            "version": "==1.1.0"
        },
+        "tox": {
+            "hashes": [
+                "sha256:37cf240781b662fb790710c6998527e65ca6851eace84d1595ee71f7af4e85f7",
+                "sha256:eb61aa5bcce65325538686f09848f04ef679b5cd9b83cc491272099b28739600"
+            ],
+            "index": "pypi",
+            "version": "==3.2.1"
+        },
        "traitlets": {
            "hashes": [
                "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
@@ -682,9 +709,17 @@
                "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf",
                "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5"
            ],
-            "markers": "python_version >= '2.6' and python_version != '3.3.*' and python_version < '4' and python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.0.*'",
+            "markers": "python_version >= '2.6' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version < '4' and python_version != '3.3.*'",
            "version": "==1.23"
        },
+        "virtualenv": {
+            "hashes": [
+                "sha256:2ce32cd126117ce2c539f0134eb89de91a8413a29baac49cbab3eb50e2026669",
+                "sha256:ca07b4c0b54e14a91af9f34d0919790b016923d157afda5efdde55c96718f752"
+            ],
+            "markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.1.*'",
+            "version": "==16.0.0"
+        },
        "wcwidth": {
            "hashes": [
                "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,6 +1,23 @@
 Changelog
 #########

+2.3.0
+=====
+
+* Support for consuming plain text & markdown documents was added by
+  `Joshua Taillon`_!  This was a long-requested feature, and its addition is
+  likely to be greatly appreciated by the community: `#395`_  Thanks also to
+  `David Martin`_ for his assistance on the issue.
+* `dubit0`_ found & fixed a bug that prevented management commands from running
+  before we had an operational database: `#396`_
+* Joshua also added a simple update to the thumbnail generation process to
+  improve performance: `#399`_
+* As his last bit of effort on this release, Joshua also added some code to
+  allow you to view the documents inline rather than download them as an
+  attachment. `#400`_
+* Finally, `ahyear`_ found a slip in the Docker documentation and patched it. `#401`_
+
+
 2.2.1
 =====

@@ -19,6 +36,10 @@ Changelog
  easier on those of us with lots of different tags: `#391`_.
 * `Kilian Koeltzsch`_ noticed a bug in how we capture & automatically create
  tags, so that's fixed now too: `#384`_.
+* `erikarvstedt`_ tweaked the behaviour of the test suite to be better behaved
+  for packaging environments: `#383`_.
+* `Lukasz Soluch`_ added CORS support to make building a new Javascript-based front-end
+  cleaner & easier: `#387`_.


 2.1.0
@@ -476,6 +497,10 @@ bulk of the work on this big change.
 .. _Tim Brooks: https://github.com/brookst
 .. _Stéphane Brunner: https://github.com/sbrunner
 .. _Kilian Koeltzsch: https://github.com/kiliankoe
+.. _Lukasz Soluch: https://github.com/LukaszSolo
+.. _Joshua Taillon: https://github.com/jat255
+.. _dubit0:  https://github.com/dubit0
+.. _ahyear:  https://github.com/ahyear

 .. _#20: https://github.com/danielquinn/paperless/issues/20
 .. _#44: https://github.com/danielquinn/paperless/issues/44
@@ -550,11 +575,18 @@ bulk of the work on this big change.
 .. _#374: https://github.com/danielquinn/paperless/pull/374
 .. _#375: https://github.com/danielquinn/paperless/pull/375
 .. _#376: https://github.com/danielquinn/paperless/pull/376
+.. _#383: https://github.com/danielquinn/paperless/pull/383
 .. _#384: https://github.com/danielquinn/paperless/issues/384
 .. _#386: https://github.com/danielquinn/paperless/issues/386
+.. _#387: https://github.com/danielquinn/paperless/pull/387
 .. _#391: https://github.com/danielquinn/paperless/pull/391
 .. _#390: https://github.com/danielquinn/paperless/pull/390
 .. _#392: https://github.com/danielquinn/paperless/issues/392
+.. _#395: https://github.com/danielquinn/paperless/pull/395
+.. _#396: https://github.com/danielquinn/paperless/pull/396
+.. _#399: https://github.com/danielquinn/paperless/pull/399
+.. _#400: https://github.com/danielquinn/paperless/pull/400
+.. _#401: https://github.com/danielquinn/paperless/pull/401

 .. _pipenv: https://docs.pipenv.org/
 .. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/
--- a/docs/migrating.rst
+++ b/docs/migrating.rst
@@ -101,6 +101,7 @@ is similar:
    $ cd /path/to/project
    $ git pull
    $ docker build -t paperless .
+    $ docker-compose run --rm consumer migrate
    $ docker-compose up -d

 If ``git pull`` doesn't report any changes, there is no need to continue with
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
--- a/paperless.conf.example
+++ b/paperless.conf.example
@@ -89,9 +89,10 @@ PAPERLESS_EMAIL_SECRET=""
 # as is "example.com,www.example.com", but NOT " example.com" or "example.com,"
 #PAPERLESS_ALLOWED_HOSTS="example.com,www.example.com"

-# If you decide to use Paperless APIs in an ajax calls, you need to add your
-# servers to the allowed hosts that can do CORS calls. By default Paperless allows 
-# calls from localhost:8080. The same rules as above how the list should look like.
+# If you decide to use the Paperless API in an ajax call, you need to add your
+# servers to the list of allowed hosts that can do CORS calls. By default
+# Paperless allows calls from localhost:8080, but if you'd like to change that,
+# you can set this value to a comma-separated list.
 #PAPERLESS_CORS_ALLOWED_HOSTS="localhost:8080,example.com,localhost:8000"

 # To host paperless under a subpath url like example.com/paperless you set
@@ -116,6 +117,10 @@ PAPERLESS_EMAIL_SECRET=""
 # http://paperless.readthedocs.org/en/latest/consumption.html#hooking-into-the-consumption-process
 #PAPERLESS_POST_CONSUME_SCRIPT="/path/to/an/arbitrary/script.sh"

+# By default, when clicking on a document within the web interface, the
+# browser will prompt the user to save the document to disk. By setting this to
+# "true", the document will instead be opened in the browser, if possible.
+#PAPERLESS_INLINE_DOC="false"

 #
 # The following values use sensible defaults for modern systems, but if you're
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,7 +29,7 @@ pillow==5.2.0
 pluggy==0.7.1; python_version != '3.1.*'
 py==1.6.0; python_version != '3.1.*'
 pycodestyle==2.4.0
-pyocr==0.5.2
+pyocr==0.5.3
 pytest-cov==2.5.1
 pytest-django==3.4.2
 pytest-env==0.6.2
--- a/src/documents/checks.py
+++ b/src/documents/checks.py
@@ -2,7 +2,7 @@ import textwrap

 from django.conf import settings
 from django.core.checks import Error, register
-from django.db.utils import OperationalError
+from django.db.utils import OperationalError, ProgrammingError


@register()
@@ -14,7 +14,7 @@ def changed_password_check(app_configs, **kwargs):
    try:
        encrypted_doc = Document.objects.filter(
            storage_type=Document.STORAGE_TYPE_GPG).first()
-    except OperationalError:
+    except (OperationalError, ProgrammingError):
        return []  # No documents table yet

    if encrypted_doc:
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -1,24 +1,24 @@
 # coding=utf-8

-import dateutil.parser
 import logging
 import os
 import re
 import uuid
-
 from collections import OrderedDict
+
+import dateutil.parser
+from django.conf import settings
+from django.db import models
+from django.template.defaultfilters import slugify
+from django.utils import timezone
 from fuzzywuzzy import fuzz

-from django.conf import settings
+from .managers import LogManager
+
 try:
    from django.core.urlresolvers import reverse
 except ImportError:
    from django.urls import reverse
-from django.db import models
-from django.template.defaultfilters import slugify
-from django.utils import timezone
-
-from .managers import LogManager


 class MatchingModel(models.Model):
@@ -135,7 +135,7 @@ class MatchingModel(models.Model):
        Example:
          '  some random  words "with   quotes  " and   spaces'
            ==>
-          ["some", "random", "words", "with\s+quotes", "and", "spaces"]
+          ["some", "random", "words", "with\\s+quotes", "and", "spaces"]
        """
        findterms = re.compile(r'"([^"]+)"|(\S+)').findall
        normspace = re.compile(r"\s+").sub
@@ -192,7 +192,11 @@ class Document(models.Model):
    TYPE_JPG = "jpg"
    TYPE_GIF = "gif"
    TYPE_TIF = "tiff"
-    TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
+    TYPE_TXT = "txt"
+    TYPE_CSV = "csv"
+    TYPE_MD = "md"
+    TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
+             TYPE_TXT, TYPE_CSV, TYPE_MD)

    STORAGE_TYPE_UNENCRYPTED = "unencrypted"
    STORAGE_TYPE_GPG = "gpg"
@@ -365,51 +369,52 @@ class FileInfo:
        )
    )

+    formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
    REGEXES = OrderedDict([
        ("created-correspondent-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
-            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
+            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        )),
        ("created-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
-            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
+            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        )),
        ("created-correspondent-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)"
-            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
+            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        )),
        ("created-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*)"
-            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
+            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        )),
        ("correspondent-title-tags", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
-            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
+            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        )),
        ("correspondent-title", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)?"
-            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
+            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        )),
        ("title", re.compile(
            r"(?P<title>.*)"
-            r"\.(?P<extension>pdf|jpe?g|png|gif|tiff?)$",
+            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        ))
    ])
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -1,9 +1,25 @@
 import logging
 import shutil
 import tempfile
+import re

 from django.conf import settings

+# This regular expression will try to find dates in the document at
+# hand and will match the following formats:
+# - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
+# - MONTH ZZZZ, with ZZZZ being 4 digits
+# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
+DATE_REGEX = re.compile(
+    r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
+    r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
+    r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
+    r'\b([^\W\d_]{3,9} [0-9]{4})\b'
+)
+

 class ParseError(Exception):
    pass
--- a/src/documents/tests/test_matchables.py
+++ b/src/documents/tests/test_matchables.py
@@ -166,7 +166,7 @@ class TestMatching(TestCase):
    def test_match_regex(self):

        self._test_matching(
-            "alpha\w+gamma",
+            r"alpha\w+gamma",
            "MATCH_REGEX",
            (
                "I have alpha_and_gamma in me",
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -1,6 +1,8 @@
 from django.http import HttpResponse, HttpResponseBadRequest
 from django.views.generic import DetailView, FormView, TemplateView
 from django_filters.rest_framework import DjangoFilterBackend
+from django.conf import settings
+
 from paperless.db import GnuPG
 from paperless.mixins import SessionOrBasicAuthMixin
 from paperless.views import StandardPagination
@@ -48,6 +50,9 @@ class FetchView(SessionOrBasicAuthMixin, DetailView):
            Document.TYPE_JPG: "image/jpeg",
            Document.TYPE_GIF: "image/gif",
            Document.TYPE_TIF: "image/tiff",
+            Document.TYPE_CSV: "text/csv",
+            Document.TYPE_MD:  "text/markdown",
+            Document.TYPE_TXT: "text/plain"
        }

        if self.kwargs["kind"] == "thumb":
@@ -60,8 +65,11 @@ class FetchView(SessionOrBasicAuthMixin, DetailView):
            self._get_raw_data(self.object.source_file),
            content_type=content_types[self.object.file_type]
        )
-        response["Content-Disposition"] = 'attachment; filename="{}"'.format(
-            self.object.file_name)
+
+        DISPOSITION = 'inline' if settings.INLINE_DOC else 'attachment'
+
+        response["Content-Disposition"] = '{}; filename="{}"'.format(
+            DISPOSITION, self.object.file_name)

        return response

--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -22,6 +22,14 @@ elif os.path.exists("/usr/local/etc/paperless.conf"):
    load_dotenv("/usr/local/etc/paperless.conf")


+def __get_boolean(key, default="NO"):
+    """
+    Return whether the environment variable ``key`` (or ``default`` when it
+    is unset) "looks like" True: one of yes/y/1/t/true, case-insensitive.
+    """
+    return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))
+
+
 # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

@@ -67,6 +75,7 @@ INSTALLED_APPS = [
    "documents.apps.DocumentsConfig",
    "reminders.apps.RemindersConfig",
    "paperless_tesseract.apps.PaperlessTesseractConfig",
+    "paperless_text.apps.PaperlessTextConfig",

    "django.contrib.admin",

@@ -221,12 +230,12 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
 OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")

 # OCR all documents?
-OCR_ALWAYS = bool(os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in ("yes", "y", "1", "t", "true"))  # NOQA
+OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS")

 # If this is true, any failed attempts to OCR a PDF will result in the PDF
 # being indexed anyway, with whatever we could get.  If it's False, the file
 # will simply be left in the CONSUMPTION_DIR.
-FORGIVING_OCR = bool(os.getenv("PAPERLESS_FORGIVING_OCR", "YES").lower() in ("yes", "y", "1", "t", "true"))  # NOQA
+FORGIVING_OCR = __get_boolean("PAPERLESS_FORGIVING_OCR", "YES")

 # GNUPG needs a home directory for some reason
 GNUPG_HOME = os.getenv("HOME", "/tmp")
@@ -270,6 +279,9 @@ PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE")
 PRE_CONSUME_SCRIPT = os.getenv("PAPERLESS_PRE_CONSUME_SCRIPT")
 POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT")

+# Whether to display a selected document inline, or download it as attachment:
+INLINE_DOC = __get_boolean("PAPERLESS_INLINE_DOC")
+
 # The number of items on each page in the web UI.  This value must be a
 # positive integer, but if you don't define one in paperless.conf, a default of
 # 100 will be used.
--- a/src/paperless/version.py
+++ b/src/paperless/version.py
@@ -1 +1 @@
-__version__ = (2, 2, 1)
+__version__ = (2, 3, 0)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -14,7 +14,7 @@ from pyocr.libtesseract.tesseract_raw import \
 from pyocr.tesseract import TesseractError

 import pdftotext
-from documents.parsers import DocumentParser, ParseError
+from documents.parsers import DocumentParser, ParseError, DATE_REGEX

 from .languages import ISO639

@@ -50,10 +50,11 @@ class RasterisedDocumentParser(DocumentParser):
            self.CONVERT,
            "-scale", "500x5000",
            "-alpha", "remove",
-            self.document_path, os.path.join(self.tempdir, "convert-%04d.png")
+            "{}[0]".format(self.document_path),
+            os.path.join(self.tempdir, "convert.png")
        )

-        return os.path.join(self.tempdir, "convert-0000.png")
+        return os.path.join(self.tempdir, "convert.png")

    def _is_ocred(self):

@@ -210,22 +211,8 @@ class RasterisedDocumentParser(DocumentParser):
        except ParseError as e:
            return None

-        # This regular expression will try to find dates in the document at
-        # hand and will match the following formats:
-        # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
-        # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
-        # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
-        # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
-        # - MONTH ZZZZ, with ZZZZ being 4 digits
-        # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
-        pattern = re.compile(
-            r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
-            r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
-            r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
-            r'\b([^\W\d_]{3,9} [0-9]{4})\b')
-
        # Iterate through all regex matches and try to parse the date
-        for m in re.finditer(pattern, text):
+        for m in re.finditer(DATE_REGEX, text):
            datestring = m.group(0)

            try:
@@ -272,8 +259,9 @@ def run_unpaper(args):
 def strip_excess_whitespace(text):
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
-        "([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
-    no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace)
+        r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
+    no_trailing_whitespace = re.sub(
+        r"([^\S\n\r]+)$", '', no_leading_whitespace)
    return no_trailing_whitespace


--- a/src/paperless_tesseract/signals.py
+++ b/src/paperless_tesseract/signals.py
@@ -5,7 +5,7 @@ from .parsers import RasterisedDocumentParser

 class ConsumerDeclaration:

-    MATCHING_FILES = re.compile("^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
+    MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")

    @classmethod
    def handle(cls, sender, **kwargs):
--- a/src/paperless_text/__init__.py
+++ b/src/paperless_text/__init__.py
--- a/src/paperless_text/apps.py
+++ b/src/paperless_text/apps.py
@@ -0,0 +1,16 @@
+from django.apps import AppConfig
+
+
+class PaperlessTextConfig(AppConfig):
+
+    name = "paperless_text"
+
+    def ready(self):
+
+        from documents.signals import document_consumer_declaration
+
+        from .signals import ConsumerDeclaration
+
+        document_consumer_declaration.connect(ConsumerDeclaration.handle)
+
+        AppConfig.ready(self)
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -0,0 +1,131 @@
+import os
+import re
+import subprocess
+
+import dateparser
+from django.conf import settings
+
+from documents.parsers import DocumentParser, ParseError, DATE_REGEX
+
+
+class TextDocumentParser(DocumentParser):
+    """
+    This parser directly parses a text document (.txt, .md, or .csv)
+    """
+
+    CONVERT = settings.CONVERT_BINARY
+    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
+    UNPAPER = settings.UNPAPER_BINARY
+    DATE_ORDER = settings.DATE_ORDER
+    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
+    OCR_ALWAYS = settings.OCR_ALWAYS
+
+    def __init__(self, path):
+        super().__init__(path)
+        self._text = None
+
+    def get_thumbnail(self):
+        """
+        The thumbnail of a txt is just a 500px wide image of the text
+        rendered onto a letter-sized page.
+        """
+        # The below is heavily cribbed from https://askubuntu.com/a/590951
+
+        bg_color = "white"  # bg color
+        text_color = "black"  # text color
+        psize = [500, 647]  # icon size
+        n_lines = 50  # number of lines to show
+        output_file = os.path.join(self.tempdir, "convert-txt.png")
+
+        temp_bg = os.path.join(self.tempdir, "bg.png")
+        temp_txlayer = os.path.join(self.tempdir, "tx.png")
+        picsize = "x".join([str(n) for n in psize])
+        txsize = "x".join([str(n - 8) for n in psize])
+
+        def create_bg():
+            work_size = ",".join([str(n - 1) for n in psize])
+            r = str(round(psize[0] / 10))
+            rounded = ",".join([r, r])
+            run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ',
+                        '"fill ', bg_color, ' roundrectangle 0,0,',
+                        work_size, ",", rounded, '" ', temp_bg)
+
+        def read_text():
+            with open(self.document_path, 'r') as src:
+                lines = [l.strip() for l in src.readlines()]
+                text = "\n".join([l for l in lines[:n_lines]])
+                return text.replace('"', "'")
+
+        def create_txlayer():
+            run_command(self.CONVERT,
+                        "-background none",
+                        "-fill",
+                        text_color,
+                        "-pointsize", "12",
+                        "-border 4 -bordercolor none",
+                        "-size ", txsize,
+                        ' caption:"', read_text(), '" ',
+                        temp_txlayer)
+
+        create_txlayer()
+        create_bg()
+        run_command(self.CONVERT, temp_bg, temp_txlayer,
+                    "-background None -layers merge ", output_file)
+
+        return output_file
+
+    def get_text(self):
+
+        if self._text is not None:
+            return self._text
+
+        with open(self.document_path, 'r') as f:
+            self._text = f.read()
+
+        return self._text
+
+    def get_date(self):
+        date = None
+        datestring = None
+
+        try:
+            text = self.get_text()
+        except ParseError as e:
+            return None
+
+        # Iterate through all regex matches and try to parse the date
+        for m in re.finditer(DATE_REGEX, text):
+            datestring = m.group(0)
+
+            try:
+                date = dateparser.parse(
+                           datestring,
+                           settings={'DATE_ORDER': self.DATE_ORDER,
+                                     'PREFER_DAY_OF_MONTH': 'first',
+                                     'RETURN_AS_TIMEZONE_AWARE': True})
+            except TypeError:
+                # Skip all matches that do not parse to a proper date
+                continue
+
+            if date is not None:
+                break
+
+        if date is not None:
+            self.log("info", "Detected document date " + date.isoformat() +
+                             " based on string " + datestring)
+        else:
+            self.log("info", "Unable to detect date for document")
+
+        return date
+
+
+def run_command(*args):
+    environment = os.environ.copy()
+    if settings.CONVERT_MEMORY_LIMIT:
+        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
+    if settings.CONVERT_TMPDIR:
+        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
+
+    if not subprocess.Popen(' '.join(args), env=environment,
+                            shell=True).wait() == 0:
+        raise ParseError("Convert failed at {}".format(args))
--- a/src/paperless_text/signals.py
+++ b/src/paperless_text/signals.py
@@ -0,0 +1,23 @@
+import re
+
+from .parsers import TextDocumentParser
+
+
+class ConsumerDeclaration:
+
+    MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
+
+    @classmethod
+    def handle(cls, sender, **kwargs):
+        return cls.test
+
+    @classmethod
+    def test(cls, doc):
+
+        if cls.MATCHING_FILES.match(doc.lower()):
+            return {
+                "parser": TextDocumentParser,
+                "weight": 10
+            }
+
+        return None
Author	SHA1	Message	Date
Daniel Quinn	2edf65dd1e	Bump to 2.3.0	2018-09-09 21:51:44 +01:00
Daniel Quinn	9a739bdbab	Merge pull request #401 from ahyear/patch-1 add migrate commande to docker update process	2018-09-09 21:26:56 +01:00
Daniel Quinn	66db06590d	Merge branch 'jat255-ENH_config_inline_or_attach'	2018-09-09 21:22:42 +01:00
Daniel Quinn	7cef108785	Streamline how we handle boolean values in settings.py	2018-09-09 21:22:07 +01:00
Daniel Quinn	a86a20ef0f	Make the example file contain the default value	2018-09-09 21:16:53 +01:00
Daniel Quinn	f94347abc0	Merge branch 'ENH_config_inline_or_attach' of git://github.com/jat255/paperless into jat255-ENH_config_inline_or_attach	2018-09-09 21:15:14 +01:00
Daniel Quinn	46cbd10ba0	Merge pull request #399 from jat255/ENH_convert_only_one_page Speed up thumbnail generation for PDFs	2018-09-09 21:12:42 +01:00
Daniel Quinn	2a96c648e8	Merge pull request #396 from dubit0/postgres_mysql_fix Fix document checks with PostgreSQL and MySQL backends.	2018-09-09 21:10:36 +01:00
Daniel Quinn	75648cc74b	Merge branch 'jat255-ENH_text_consumer'	2018-09-09 21:03:58 +01:00
Daniel Quinn	0472fe4e9e	Reorder imports	2018-09-09 21:03:37 +01:00
Daniel Quinn	c99f5923d5	Rename `parsers` to `DATE_REGEX` In moving the `parsers` variable into the package-level, it lost the context, so a more descriptive name was needed.	2018-09-09 21:02:30 +01:00
Daniel Quinn	ef302abed7	Fix pycodestyle complaints	2018-09-09 20:55:37 +01:00
Daniel Quinn	2dc35cc856	Merge branch 'ENH_text_consumer' of git://github.com/jat255/paperless into jat255-ENH_text_consumer	2018-09-09 20:52:59 +01:00
Daniel Quinn	f4c399f0dd	Merge pull request #398 from ddddavidmartin/bump_pyocr_version_for_tesseract_4_support Bump required version for Pyocr to support the latest tesseract 4.	2018-09-09 20:01:51 +01:00
Daniel Quinn	5342db6ada	Fix pycodestyle complaints Apparently, pycodestyle updated itself to now check for invalid escape sequences, which only complain if the regex in use isn't a raw string (r"").	2018-09-09 20:00:12 +01:00
Daniel Quinn	5c39fff51b	Add tox to dev dependencies	2018-09-09 19:59:47 +01:00
ahyear	ed0e40d3e6	add migrate commande to docker update process	2018-09-06 15:32:41 +02:00
Joshua Taillon	652ead2f5c	remove debugging print statement	2018-09-05 23:05:37 -04:00
Joshua Taillon	be9757894a	add INLINE_DOC to settings.py	2018-09-05 23:03:30 -04:00
Joshua Taillon	22378789e2	add option for inline vs. attachment for document rendering	2018-09-05 22:58:38 -04:00
Joshua Taillon	72c828170e	move date-matching regex pattern to base parser module for use by all subclasses	2018-09-05 21:13:36 -04:00
Joshua Taillon	cac63494f0	change tesseract parser to only convert first page to save (potentially) massive amounts of work	2018-09-05 15:18:35 -04:00
Daniel Quinn	939a67bd4b	Add empty requirements for rtd to reference	2018-09-05 11:16:42 +01:00
Daniel Quinn	fbc6a58f5a	Add credits for 2.2.0 that I forgot	2018-09-05 10:59:06 +01:00
Daniel Quinn	01a358d2b0	Re-flow text to keep it <80c wide	2018-09-05 10:58:41 +01:00
David Martin	6b447628ed	Bump required version for Pyocr to support the latest tesseract 4. This recently changed in the official tesseract engine [0]. -psm is not allowed as an option anymore and --psm has to be used instead. The latest pyocr enables support for this [1]. [0] tesseract-ocr/tesseract@ee201e1 [1] `5abd0a566a`	2018-09-05 13:03:42 +10:00
Thomas Niederprüm	2308d5a613	Catch ProgrammingError in Document checks. When running PostgreSQL or MariaDB/MySQL backends, a query to a non-existent table will raise a "ProgrammingError". This patch properly catches this error. Without this patch all management calls to manage.py will lead to an error when running PostgreSQL or MariaDB as a backend.	2018-09-04 20:11:48 +02:00
Joshua Taillon	23bf79274c	Merge branch 'master' into ENH_text_consumer	2018-09-03 23:47:30 -04:00
Joshua Taillon	4849249d86	explicitly add txt, md, and csv types for consumer and viewer; fix thumbnail generation	2018-09-03 23:46:13 -04:00
Joshua Taillon	d6fedbec52	first stab at text consumer	2018-08-30 23:32:41 -04:00