diff --git a/.gitignore b/.gitignore index 908fa9748..3c8b8ffea 100644 --- a/.gitignore +++ b/.gitignore @@ -57,7 +57,9 @@ docs/_build/ target/ # Stored PDFs -media/* +media/documents/*.gpg +media/documents/thumbnails/*.gpg +media/documents/originals/*.gpg # Sqlite database db.sqlite3 @@ -68,8 +70,9 @@ db.sqlite3 # Other stuff that doesn't belong virtualenv .vagrant +docker-compose.yml +docker-compose.env # Used for development scripts/import-for-development environment - diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..dcaaeab8d --- /dev/null +++ b/.travis.yml @@ -0,0 +1,18 @@ +language: python + +sudo: false + +matrix: + include: + - python: 3.4 + env: TOXENV=py34 + - python: 3.5 + env: TOXENV=py35 + - python: 3.5 + env: TOXENV=pep8 + +install: + - pip install --requirement requirements.txt + - pip install tox + +script: tox -c src/tox.ini diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..a13fa7b3f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,46 @@ +FROM python:3.5.1 +MAINTAINER Pit Kleyersburg + +# Install dependencies +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + sudo \ + tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \ + && rm -rf /var/lib/apt/lists/* + +# Install python dependencies +RUN mkdir -p /usr/src/paperless +WORKDIR /usr/src/paperless +COPY requirements.txt /usr/src/paperless/ +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +RUN mkdir -p /usr/src/paperless/src +RUN mkdir -p /usr/src/paperless/data +RUN mkdir -p /usr/src/paperless/media +COPY src/ /usr/src/paperless/src/ +COPY data/ /usr/src/paperless/data/ +COPY media/ /usr/src/paperless/media/ + +# Set consumption directory +ENV PAPERLESS_CONSUMPTION_DIR /consume +RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR + +# Migrate database +WORKDIR /usr/src/paperless/src +RUN ./manage.py migrate + +# Create user +RUN groupadd -g 1000 paperless \ + && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \ + && chown -Rh paperless:paperless /usr/src/paperless + +# Setup entrypoint +COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh +RUN chmod 755 /sbin/docker-entrypoint.sh + +# Mount volumes +VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume"] + +ENTRYPOINT ["/sbin/docker-entrypoint.sh"] +CMD ["--help"] diff --git a/README.rst b/README.rst index cf9d387cc..80043ff7a 100644 --- a/README.rst +++ b/README.rst @@ -3,6 +3,7 @@ Paperless |Documentation| |Chat| +|Travis| Scan, index, and archive all of your paper documents @@ -55,6 +56,7 @@ powerful tools. * `ImageMagick`_ converts the images between colour and greyscale. * `Tesseract`_ does the character recognition. +* `Unpaper`_ despeckles and and deskews the scanned image. * `GNU Privacy Guard`_ is used as the encryption backend. * `Python 3`_ is the language of the project. @@ -92,6 +94,7 @@ home. .. _this one: http://www.brother.ca/en-CA/Scanners/11/ProductDetail/ADS1500W?ProductDetail=productdetail .. _ImageMagick: http://imagemagick.org/ .. _Tesseract: https://github.com/tesseract-ocr +.. _Unpaper: https://www.flameeyes.eu/projects/unpaper .. _GNU Privacy Guard: https://gnupg.org/ .. _Python 3: https://python.org/ .. _Pillow: https://pypi.python.org/pypi/pillowfight/ @@ -105,4 +108,5 @@ home. .. 
|Chat| image:: https://badges.gitter.im/danielquinn/paperless.svg :alt: Join the chat at https://gitter.im/danielquinn/paperless :target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge - +.. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master + :target: https://travis-ci.org/danielquinn/paperless diff --git a/docker-compose.env.example b/docker-compose.env.example new file mode 100644 index 000000000..13c74b6ab --- /dev/null +++ b/docker-compose.env.example @@ -0,0 +1,15 @@ +# Environment variables to set for Paperless +# Commented out variables will be replaced by a default within Paperless. + +# Passphrase Paperless uses to encrypt and decrypt your documents +PAPERLESS_PASSPHRASE=CHANGE_ME + +# The amount of threads to use for text recognition +# PAPERLESS_OCR_THREADS=4 + +# Additional languages to install for text recognition +# PAPERLESS_OCR_LANGUAGES=deu ita + +# You can change the default user and group id to a custom one +# USERMAP_UID=1000 +# USERMAP_GID=1000 diff --git a/docker-compose.yml.example b/docker-compose.yml.example new file mode 100644 index 000000000..488fc83d2 --- /dev/null +++ b/docker-compose.yml.example @@ -0,0 +1,37 @@ +version: '2' + +services: + webserver: + image: paperless + ports: + # You can adapt the port you want Paperless to listen on by + # modifying the part before the `:`. + - "8000:8000" + volumes: + - data:/usr/src/paperless/data + - media:/usr/src/paperless/media + env_file: docker-compose.env + environment: + - PAPERLESS_OCR_LANGUAGES= + command: ["runserver", "0.0.0.0:8000"] + + consumer: + image: paperless + volumes: + - data:/usr/src/paperless/data + - media:/usr/src/paperless/media + # You have to adapt the local path you want the consumption + # directory to mount to by modifying the part before the ':'. + - /path/to/arbitrary/place:/consume + # Likewise, you can add a local path to mount a directory for + # exporting. This is not strictly needed for paperless to + # function, only if you're exporting your files: uncomment + # it and fill in a local path if you know you're going to + # want to export your documents. + # - /path/to/another/arbitrary/place:/export + env_file: docker-compose.env + command: ["document_consumer"] + +volumes: + data: + media: diff --git a/docs/Dockerfile b/docs/Dockerfile new file mode 100644 index 000000000..ee63aebb4 --- /dev/null +++ b/docs/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.5.1 +MAINTAINER Pit Kleyersburg + +# Install Sphinx and Pygments +RUN pip install Sphinx Pygments + +# Setup directories, copy data +RUN mkdir /build +COPY . /build +WORKDIR /build/docs + +# Build documentation +RUN make html + +# Start webserver +WORKDIR /build/docs/_build/html +EXPOSE 8000/tcp +CMD ["python3", "-m", "http.server"] diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 000000000..15ca9bc44 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,23 @@ +.. _api: + +The REST API +############ + +Paperless makes use of the `Django REST Framework`_ standard API interface +because of its inherent awesomeness. Conveniently, the system is also +self-documenting, so learn more about the access points, schema, what's +accepted and what isn't, you need only visit ``/api`` on your local Paperless +installation. + +.. _Django REST Framework: http://django-rest-framework.org/ + + +.. 
_api-uploading: + +Uploading +--------- + +File uploads in an API are hard and so far as I've been able to tell, there's +no standard way of accepting them, so rather than crowbar file uploads into the +REST API and endure that headache, I've left that process to a simple HTTP +POST, documented on the :ref:`consumption page `. diff --git a/docs/changelog.rst b/docs/changelog.rst index c56e7a367..f2ab6cabc 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,10 +1,51 @@ Changelog ######### +* 0.1.1 + + * Potentially **Breaking Change**: All references to "sender" in the code + have been renamed to "correspondent" to better reflect the nature of the + property (one could quite reasonably scan a document before sending it to + someone.) + * `#67`_: Rewrote the document exporter and added a new importer that allows + for full metadata retention without depending on the file name and + modification time. A big thanks to `Tikitu de Jager`_, `Pit`_, + `Florian Jung`_, and `Christopher Luu`_ for their code snippets and + contributing conversation that lead to this change. + * `#20`_: Added *unpaper* support to help in cleaning up the scanned image + before it's OCR'd. Thanks to `Pit`_ for this one. + * `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI. + * `#68`_: Added support for using a proper config file at + ``/etc/paperless.conf`` and modified the systemd unit files to use it. + * Refactored the Vagrant installation process to use environment variables + rather than asking the user to modify ``settings.py``. + * `#44`_: Harmonise environment variable names with constant names. + * `#60`_: Setup logging to actually use the Python native logging framework. + * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images + to be imported but made unavailable. + +* 0.1.0 + + * Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and + `Tikitu de Jager`_ for this one, and especially to `Pit`_ + who spearheadded this effort. + * A simple REST API is in place, but it should be considered unstable. + * Cleaned up the consumer to use temporary directories instead of a single + scratch space. (Thanks `Pit`_) + * Improved the efficiency of the consumer by parsing pages more intelligently + and introducing a threaded OCR process (thanks again `Pit`_). + * `#45`_: Cleaned up the logic for tag matching. Reported by `darkmatter`_. + * `#47`_: Auto-rotate landscape documents. Reported by `Paul`_ and fixed by + `Pit`_. + * `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_) + * `#54`_: Documented the re-tagger (`zedster`_) + * `#57`_: Make sure file is preserved on import failure (`darkmatter`_) + * Added tox with pep8 checking + * 0.0.6 - * Added support for parallel OCR (significant work from pitkley) - * Sped up the language detection (significant work from pitkley) + * Added support for parallel OCR (significant work from `Pit`_) + * Sped up the language detection (significant work from `Pit`_) * Added simple logging * 0.0.5 @@ -35,3 +76,26 @@ Changelog * 0.0.1 * Initial release + +.. _Brian Conn: https://github.com/TheConnMan +.. _Christopher Luu: https://github.com/nuudles +.. _Florian Jung: https://github.com/the01 +.. _Tikitu de Jager: https://github.com/tikitu +.. _Paul: https://github.com/polo2ro +.. _Pit: https://github.com/pitkley +.. _Wayne Werner: https://github.com/waynew +.. _darkmatter: https://github.com/darkmatter +.. _zedster: https://github.com/zedster + +.. _#20: https://github.com/danielquinn/paperless/issues/20 +.. 
_#44: https://github.com/danielquinn/paperless/issues/44 +.. _#45: https://github.com/danielquinn/paperless/issues/45 +.. _#47: https://github.com/danielquinn/paperless/issues/47 +.. _#48: https://github.com/danielquinn/paperless/issues/48 +.. _#53: https://github.com/danielquinn/paperless/issues/53 +.. _#54: https://github.com/danielquinn/paperless/issues/54 +.. _#57: https://github.com/danielquinn/paperless/issues/57 +.. _#60: https://github.com/danielquinn/paperless/issues/60 +.. _#67: https://github.com/danielquinn/paperless/issues/67 +.. _#68: https://github.com/danielquinn/paperless/issues/68 +.. _#71: https://github.com/danielquinn/paperless/issues/71 diff --git a/docs/consumption.rst b/docs/consumption.rst index 8b9b35433..eadf12823 100644 --- a/docs/consumption.rst +++ b/docs/consumption.rst @@ -40,14 +40,14 @@ follow the :ref:`consumer ` instructions to get it running. A Note on File Naming --------------------- -Any document you put into the consumption directory will be consumed, but if you -name the file right, it'll automatically set some values in the database for -you. This is is the logic the consumer follows: +Any document you put into the consumption directory will be consumed, but if +you name the file right, it'll automatically set some values in the database +for you. This is is the logic the consumer follows: -1. Try to find the sender, title, and tags in the file name following the - pattern: ``Sender - Title - tag,tag,tag.pdf``. -2. If that doesn't work, try to find the sender and title in the file name - following the pattern: ``Sender - Title.pdf``. +1. Try to find the correspondent, title, and tags in the file name following + the pattern: ``Correspondent - Title - tag,tag,tag.pdf``. +2. If that doesn't work, try to find the correspondent and title in the file + name following the pattern: ``Correspondent - Title.pdf``. 3. If that doesn't work, just assume that the name of the file is the title. So given the above, the following examples would work as you'd expect: @@ -97,9 +97,9 @@ So, with all that in mind, here's what you do to get it running: the configured email account every 10 minutes for something new and pull down whatever it finds. 4. Send yourself an email! Note that the subject is treated as the file name, - so if you set the subject to ``Sender - Title - tag,tag,tag``, you'll get - what you expect. Also, you must include the aforementioned secret string in - every email so the fetcher knows that it's safe to import. + so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll + get what you expect. Also, you must include the aforementioned secret + string in every email so the fetcher knows that it's safe to import. 5. After a few minutes, the consumer will poll your mailbox, pull down the message, and place the attachment in the consumption directory with the appropriate name. A few minutes later, the consumer will import it like any @@ -111,23 +111,22 @@ So, with all that in mind, here's what you do to get it running: HTTP POST ========= -Currently, the API is limited to only handling file uploads, it doesn't do tags -yet, and the URL schema isn't concrete, but it's a start. It's also not much of -a real API, it's just a URL that accepts an HTTP POST. +You can also submit a document via HTTP POST. It doesn't do tags yet, and the +URL schema isn't concrete, but it's a start. 
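As a concrete illustration of the naming convention described earlier on this page, a small
client-side helper along the following lines can build consumption-ready file names. This is
only a sketch: the ``consume()`` helper and the ``CONSUMPTION_DIR`` path are illustrative and
not part of Paperless itself; only the ``Correspondent - Title - tag,tag,tag.pdf`` pattern
comes from the documentation above.

.. code-block:: python

    import shutil
    from pathlib import Path

    # Illustrative only: drop a scan into the consumption directory using the
    # "Correspondent - Title - tag,tag,tag.pdf" naming pattern so the consumer
    # can pick up the metadata. Adjust CONSUMPTION_DIR to your own setup.
    CONSUMPTION_DIR = Path("/path/to/consume")

    def consume(source, correspondent=None, title=None, tags=()):
        source = Path(source)
        name = " - ".join(
            p for p in (correspondent, title or source.stem) if p)
        if tags:
            name += " - " + ",".join(tags)
        target = CONSUMPTION_DIR / (name + source.suffix.lower())
        shutil.copy2(source, target)
        return target

    # consume("scan0001.pdf", "Example Bank", "Statement", ("bills",))
    # -> /path/to/consume/Example Bank - Statement - bills.pdf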
-To push your document to *Paperless*, send an HTTP POST to the server with the +To push your document to Paperless, send an HTTP POST to the server with the following name/value pairs: -* ``sender``: The name of the document's sender. Note that there are - restrictions on what characters you can use here. Specifically, alphanumeric - characters, `-`, `,`, `.`, and `'` are ok, everything else it out. You also - can't use the sequence ` - ` (space, dash, space). +* ``correspondent``: The name of the document's correspondent. Note that there + are restrictions on what characters you can use here. Specifically, + alphanumeric characters, `-`, `,`, `.`, and `'` are ok, everything else it + out. You also can't use the sequence ` - ` (space, dash, space). * ``title``: The title of the document. The rules for characters is the same - here as the sender. -* ``signature``: For security reasons, we have the sender send a signature using - a "shared secret" method to make sure that random strangers don't start - uploading stuff to your server. The means of generating this signature is - defined below. + here as the correspondent. +* ``signature``: For security reasons, we have the correspondent send a + signature using a "shared secret" method to make sure that random strangers + don't start uploading stuff to your server. The means of generating this + signature is defined below. Specify ``enctype="multipart/form-data"``, and then POST your file with::: @@ -146,12 +145,12 @@ verification. In the case of *Paperless*, you configure the server with the secret by setting ``UPLOAD_SHARED_SECRET``. Then on your client, you generate your signature by -concatenating the sender, title, and the secret, and then using sha256 to -generate a hexdigest. +concatenating the correspondent, title, and the secret, and then using sha256 +to generate a hexdigest. If you're using Python, this is what that looks like: .. code:: python from hashlib import sha256 - signature = sha256(sender + title + secret).hexdigest() + signature = sha256(correspondent + title + secret).hexdigest() diff --git a/docs/index.rst b/docs/index.rst index fc78f6f23..47710d376 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -30,6 +30,7 @@ Contents requirements setup consumption + api utilities migrating changelog diff --git a/docs/migrating.rst b/docs/migrating.rst index 46083533a..d659620ac 100644 --- a/docs/migrating.rst +++ b/docs/migrating.rst @@ -4,31 +4,10 @@ Migrating, Updates, and Backups =============================== As *Paperless* is still under active development, there's a lot that can change -as software updates roll out. The thing you just need to remember for all of -this is that for the most part, **the database is expendable** so long as you -have your files. This is because the file name of the exported files includes -the name of the sender, the title, and the tags (if any) on each file. - - -.. _migrating-updates: - -Updates -------- - -For the most part, all you have to do to update *Paperless* is run ``git pull`` -on the directory containing the project files, and then use Django's ``migrate`` -command to execute any database schema updates that might have been rolled in -as part of the update: - -.. code:: bash - - $ cd /path/to/project - $ git pull - $ cd src - $ ./manage.py migrate - -Note that it's possible (even likely) that while ``git pull`` may update some -files, the ``migrate`` step may not update anything. This is totally normal. +as software updates roll out. 
You should backup often, so if anything goes +wrong during an update, you at least have a means of restoring to something +usable. Thankfully, there are automated ways of backing up, restoring, and +updating the software. .. _migrating-backup: @@ -38,20 +17,8 @@ Backing Up So you're bored of this whole project, or you want to make a remote backup of the unencrypted files for whatever reason. This is easy to do, simply use the -:ref:`exporter ` to dump your documents out into an -arbitrary directory. - -Additionally however, you'll need to back up the tags themselves. The file -names contain the tag names, but you still need to define the tags and their -matching algorithms in the database for things to work properly. We do this -with Django's ``dumpdata`` command, which produces JSON output. - -.. code:: bash - - $ cd /path/to/project - $ cd src - $ ./manage.py document_export /path/to/arbitrary/place/ - $ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json +:ref:`exporter ` to dump your documents and database out +into an arbitrary directory. .. _migrating-restoring: @@ -66,7 +33,7 @@ create an empty database (just follow the ``tags.json`` file you created as part of your backup. Lastly, copy your exported documents into the consumption directory and start up the consumer. -.. code:: bash +.. code-block:: shell-session $ cd /path/to/project $ rm data/db.sqlite3 # Delete the database @@ -77,3 +44,60 @@ exported documents into the consumption directory and start up the consumer. $ cp /path/to/exported/docs/* /path/to/consumption/dir/ $ ./manage.py document_consumer +Importing your data if you are :ref:`using Docker ` +is almost as simple: + +.. code-block:: shell-session + + # Stop and remove your current containers + $ docker-compose stop + $ docker-compose rm -f + + # Recreate them, add the superuser + $ docker-compose up -d + $ docker-compose run --rm webserver createsuperuser + + # Load the tags + $ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin - + + # Load your exported documents into the consumption directory + # (How you do this highly depends on how you have set this up) + $ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/ + +After loading the documents into the consumption directory the consumer will +immediately start consuming the documents. + + +.. _migrating-updates: + +Updates +------- + +For the most part, all you have to do to update *Paperless* is run ``git pull`` +on the directory containing the project files, and then use Django's ``migrate`` +command to execute any database schema updates that might have been rolled in +as part of the update: + +.. code-block:: shell-session + + $ cd /path/to/project + $ git pull + $ cd src + $ ./manage.py migrate + +Note that it's possible (even likely) that while ``git pull`` may update some +files, the ``migrate`` step may not update anything. This is totally normal. + +If you are :ref:`using Docker ` the update process +requires only one additional step: + +.. code-block:: shell-session + + $ cd /path/to/project + $ git pull + $ docker build -t paperless . + $ docker-compose up -d + $ docker-compose run --rm webserver migrate + +If ``git pull`` doesn't report any changes, there is no need to continue with +the remaining steps. 
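If you script your updates, the documented command sequences above translate directly into a
small wrapper. The snippet below is a sketch under the assumption that you run it on the host
with ``git``, ``docker`` and ``docker-compose`` on your ``PATH``; the function names and the
``PROJECT_DIR`` placeholder are illustrative, while the individual commands are exactly the
ones shown above.

.. code-block:: python

    import os
    import subprocess

    # Placeholder: wherever you cloned the Paperless project.
    PROJECT_DIR = "/path/to/project"

    def update_bare_metal():
        # git pull, then apply any pending schema migrations.
        subprocess.check_call(["git", "pull"], cwd=PROJECT_DIR)
        subprocess.check_call(["./manage.py", "migrate"],
                              cwd=os.path.join(PROJECT_DIR, "src"))

    def update_docker():
        # Mirrors the Docker steps above; skip the rest if `git pull`
        # reports no changes.
        subprocess.check_call(["git", "pull"], cwd=PROJECT_DIR)
        subprocess.check_call(["docker", "build", "-t", "paperless", "."],
                              cwd=PROJECT_DIR)
        subprocess.check_call(["docker-compose", "up", "-d"], cwd=PROJECT_DIR)
        subprocess.check_call(
            ["docker-compose", "run", "--rm", "webserver", "migrate"],
            cwd=PROJECT_DIR)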
diff --git a/docs/requirements.rst b/docs/requirements.rst index 1c4f989db..36bc234c0 100644 --- a/docs/requirements.rst +++ b/docs/requirements.rst @@ -10,11 +10,13 @@ should work) that has the following software installed on it: * `GNU Privacy Guard`_ * `Tesseract`_ * `Imagemagick`_ +* `unpaper`_ .. _Python3: https://python.org/ .. _GNU Privacy Guard: https://gnupg.org .. _Tesseract: https://github.com/tesseract-ocr .. _Imagemagick: http://imagemagick.org/ +.. _unpaper: https://www.flameeyes.eu/projects/unpaper Notably, you should confirm how you access your Python3 installation. Many Linux distributions will install Python3 in parallel to Python2, using the names @@ -101,3 +103,16 @@ you'd like to generate your own docs locally, you'll need to: $ pip install sphinx and then cd into the ``docs`` directory and type ``make html``. + +If you are using Docker, you can use the following commands to build the +documentation and run a webserver serving it on `port 8001`_: + +.. code:: bash + + $ pwd + /path/to/paperless + + $ docker build -t paperless:docs -f docs/Dockerfile . + $ docker run --rm -it -p "8001:8000" paperless:docs + +.. _port 8001: http://127.0.0.1:8001 diff --git a/docs/setup.rst b/docs/setup.rst index 24a9b9fa2..9992418c1 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -37,11 +37,19 @@ or just download the tarball and go that route: Installation & Configuration ---------------------------- -You can go two routes with setting up and running Paperless. The *Vagrant* -route is quick & easy, but means you're running a VM which comes with memory -consumption etc. Alternatively the standard, "bare metal" approach is a little -more complicated. +You can go multiple routes with setting up and running Paperless. The `Vagrant +route`_ is quick & easy, but means you're running a VM which comes with memory +consumption etc. We also `support Docker`_, which you can use natively under +Linux and in a VM with `Docker Machine`_ (this guide was written for native +Docker usage under Linux, you might have to adapt it for Docker Machine.) +Alternatively the standard, `bare metal`_ approach is a little more complicated, +but worth it because it makes it easier to should you want to contribute some +code back. +.. _Vagrant route: setup-installation-vagrant_ +.. _support Docker: setup-installation-docker_ +.. _bare metal: setup-installation-standard_ +.. _Docker Machine: https://docs.docker.com/machine/ .. _setup-installation-standard: @@ -91,33 +99,188 @@ Vagrant Method 2. Run ``vagrant up``. An instance will start up for you. When it's ready and provisioned... 3. Run ``vagrant ssh`` and once inside your new vagrant box, edit - ``/opt/paperless/src/paperless/settings.py`` and set the values for: - * ``CONSUMPTION_DIR``: this is where your documents will be dumped to be - consumed by Paperless. - * ``PASSPHRASE``: this is the passphrase Paperless uses to encrypt/decrypt - the original document. The default value attempts to source the - passphrase from the environment, so if you don't set it to a static value - here, you must set ``PAPERLESS_PASSPHRASE=some-secret-string`` on the - command line whenever invoking the consumer or webserver. -4. Initialise the database with ``/opt/paperless/src/manage.py migrate``. -5. Still inside your vagrant box, create a user for your Paperless instance with - ``/opt/paperless/src/manage.py createsuperuser``. 
Follow the prompts to + ``/etc/paperless.conf`` and set the values for: + * ``PAPERLESS_CONSUMPTION_DIR``: this is where your documents will be + dumped to be consumed by Paperless. + * ``PAPERLESS_PASSPHRASE``: this is the passphrase Paperless uses to + encrypt/decrypt the original document. + * ``PAPERLESS_SHARED_SECRET``: this is the "magic word" used when consuming + documents from mail or via the API. If you don't use either, leaving it + blank is just fine. +4. Exit the vagrant box and re-enter it with ``vagrant ssh`` again. This + updates the environment to make use of the changes you made to the config + file. +5. Initialise the database with ``/opt/paperless/src/manage.py migrate``. +6. Still inside your vagrant box, create a user for your Paperless instance + with ``/opt/paperless/src/manage.py createsuperuser``. Follow the prompts to create your user. -6. Start the webserver with ``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``. - You should now be able to visit your (empty) `Paperless webserver`_ at - ``172.28.128.4:8000``. You can login with the user/pass you created in #5. -7. In a separate window, run ``vagrant ssh`` again, but this time once inside +7. Start the webserver with + ``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``. You should now be + able to visit your (empty) `Paperless webserver`_ at ``172.28.128.4:8000``. + You can login with the user/pass you created in #6. +8. In a separate window, run ``vagrant ssh`` again, but this time once inside your vagrant instance, you should start the consumer script with ``/opt/paperless/src/manage.py document_consumer``. -8. Scan something. Put it in the ``CONSUMPTION_DIR``. -9. Wait a few minutes -10. Visit the document list on your webserver, and it should be there, indexed +9. Scan something. Put it in the ``CONSUMPTION_DIR``. +10. Wait a few minutes +11. Visit the document list on your webserver, and it should be there, indexed and downloadable. .. _Vagrant: https://vagrantup.com/ .. _Paperless server: http://172.28.128.4:8000 +.. _setup-installation-docker: + +Docker Method +............. + +1. Install `Docker`_. + + .. caution:: + + As mentioned earlier, this guide assumes that you use Docker natively + under Linux. If you are using `Docker Machine`_ under Mac OS X or Windows, + you will have to adapt IP addresses, volume-mounting, command execution + and maybe more. + +2. Install `docker-compose`_. [#compose]_ + + .. caution:: + + If you want to use the included ``docker-compose.yml.example`` file, you + need to have at least Docker version **1.10.0** and docker-compose + version **1.6.0**. + + See the `Docker installation guide`_ on how to install the current + version of Docker for your operating system or Linux distribution of + choice. To get an up-to-date version of docker-compose, follow the + `docker-compose installation guide`_ if your package repository doesn't + include it. + + .. _Docker installation guide: https://docs.docker.com/engine/installation/ + .. _docker-compose installation guide: https://docs.docker.com/compose/install/ + +3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml`` and + a copy of ``docker-compose.env.example`` as ``docker-compose.env``. You'll be + editing both these files: taking a copy ensures that you can ``git pull`` to + receive updates without risking merge conflicts with your modified versions + of the configuration files. +4. Modify ``docker-compose.yml`` to your preferences, following the instructions + in comments in the file. 
The only change that is a hard requirement is to + specify where the consumption directory should mount. +5. Modify ``docker-compose.env`` and adapt the following environment variables: + + ``PAPERLESS_PASSPHRASE`` + This is the passphrase Paperless uses to encrypt/decrypt the original + document. + + ``PAPERLESS_OCR_THREADS`` + This is the number of threads the OCR process will spawn to process + document pages in parallel. If the variable is not set, Python determines + the core-count of your CPU and uses that value. + + ``PAPERLESS_OCR_LANGUAGES`` + If you want the OCR to recognize other languages in addition to the default + English, set this parameter to a space separated list of three-letter + language-codes after `ISO 639-2/T`_. For a list of available languages -- + including their three letter codes -- see the `Debian packagelist`_. + + ``USERMAP_UID`` and ``USERMAP_GID`` + If you want to mount the consumption volume (directory ``/consume`` within + the containers) to a host-directory -- which you probably want to do -- + access rights might be an issue. The default user and group ``paperless`` + in the containers have an id of 1000. The containers will enforce that the + owning group of the consumption directory will be ``paperless`` to be able + to delete consumed documents. If your host-system has a group with an id of + 1000 and you don't want this group to have access rights to the consumption + directory, you can use ``USERMAP_GID`` to change the id in the container + and thus the one of the consumption directory. Furthermore, you can change + the id of the default user as well using ``USERMAP_UID``. + +6. Run ``docker-compose up -d``. This will create and start the necessary + containers. +7. To be able to login, you will need a super user. To create it, execute the + following command: + + .. code-block:: shell-session + + $ docker-compose run --rm webserver createsuperuser + + This will prompt you to set a username (default ``paperless``), an optional + e-mail address and finally a password. +8. The default ``docker-compose.yml`` exports the webserver on your local port + 8000. If you haven't adapted this, you should now be able to visit your + `Paperless webserver`_ at ``http://127.0.0.1:8000``. You can login with the + user and password you just created. +9. Add files to consumption directory the way you prefer to. Following are two + possible options: + + 1. Mount the consumption directory to a local host path by modifying your + ``docker-compose.yml``: + + .. code-block:: diff + + diff --git a/docker-compose.yml b/docker-compose.yml + --- a/docker-compose.yml + +++ b/docker-compose.yml + @@ -17,9 +18,8 @@ services: + volumes: + - paperless-data:/usr/src/paperless/data + - paperless-media:/usr/src/paperless/media + - - /consume + + - /local/path/you/choose:/consume + + .. danger:: + + While the consumption container will ensure at startup that it can + **delete** a consumed file from a host-mounted directory, it might not + be able to **read** the document in the first place if the access + rights to the file are incorrect. + + Make sure that the documents you put into the consumption directory + will either be readable by everyone (``chmod o+r file.pdf``) or + readable by the default user or group id 1000 (or the one you have set + with ``USERMAP_UID`` or ``USERMAP_GID`` respectively). + + 2. Use ``docker cp`` to copy your files directly into the container: + + .. 
code-block:: shell-session + + $ # Identify your containers + $ docker-compose ps + Name Command State Ports + ------------------------------------------------------------------------- + paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 + paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0 + + $ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume + + ``docker cp`` is a one-shot-command, just like ``cp``. This means that + every time you want to consume a new document, you will have to execute + ``docker cp`` again. You can of course automate this process, but option 1 + is generally the preferred one. + + .. danger:: + + ``docker cp`` will change the owning user and group of a copied file + to the acting user at the destination, which will be ``root``. + + You therefore need to ensure that the documents you want to copy into + the container are readable by everyone (``chmod o+r file.pdf``) before + copying them. + + +.. _Docker: https://www.docker.com/ +.. _docker-compose: https://docs.docker.com/compose/install/ +.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes +.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr- + +.. [#compose] You of course don't have to use docker-compose, but it + simplifies deployment immensely. If you know your way around Docker, feel + free to tinker around without using compose! + + .. _making-things-a-little-more-permanent: Making Things a Little more Permanent @@ -126,5 +289,9 @@ Making Things a Little more Permanent Once you've tested things and are happy with the work flow, you can automate the process of starting the webserver and consumer automatically. If you're running on a bare metal system that's using Systemd, you can use the service unit files -in the ``scripts`` directory to set this up. If you're on a SysV or other -startup system (like the Vagrant box), then you're currently on your own. +in the ``scripts`` directory to set this up. If you're on another startup +system or are using a Vagrant box, then you're currently on your own. If you are +using Docker, you can set a restart-policy_ in the ``docker-compose.yml`` to +have the containers automatically start with the Docker daemon. + +.. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart diff --git a/docs/utilities.rst b/docs/utilities.rst index 2b795d31a..ce3555b73 100644 --- a/docs/utilities.rst +++ b/docs/utilities.rst @@ -26,7 +26,7 @@ How to Use It The webserver is started via the ``manage.py`` script: -.. code:: bash +.. code-block:: shell-session $ /path/to/paperless/src/manage.py runserver @@ -64,7 +64,7 @@ How to Use It The consumer is started via the ``manage.py`` script: -.. code:: bash +.. code-block:: shell-session $ /path/to/paperless/src/manage.py document_consumer @@ -95,13 +95,110 @@ How to Use It This too is done via the ``manage.py`` script: +.. code-block:: shell-session + + $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/ + +This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you +to do with as you please. The files are accompanied with a special file, +``manifest.json`` which can be used to +:ref:`import the files ` at a later date if you wish. + + +.. _utilities-exporter-howto-docker: + +Docker +______ + +If you are :ref:`using Docker `, running the +expoorter is almost as easy. 
To mount a volume for exports, follow the +instructions in the ``docker-compose.yml.example`` file for the ``/export`` +volume (making the changes in your own ``docker-compose.yml`` file, of course). +Once you have the volume mounted, the command to run an export is: + +.. code-block:: shell-session + + $ docker-compose run --rm consumer document_exporter /export + +If you prefer to use ``docker run`` directly, supplying the necessary commandline +options: + +.. code-block:: shell-session + + $ # Identify your containers + $ docker-compose ps + Name Command State Ports + ------------------------------------------------------------------------- + paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 + paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0 + + $ # Make sure to replace your passphrase and remove or adapt the id mapping + $ docker run --rm \ + --volumes-from paperless_data_1 \ + --volume /path/to/arbitrary/place:/export \ + -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \ + -e USERMAP_UID=1000 -e USERMAP_GID=1000 \ + paperless document_exporter /export + + +.. _utilities-importer: + +The Importer +------------ + +Looking to transfer Paperless data from one instance to another, or just want +to restore from a backup? This is your go-to toy. + + +.. _utilities-importer-howto: + +How to Use It +............. + +The importer works just like the exporter. You point it at a directory, and +the script does the rest of the work: + +.. code-block:: shell-session + + $ /path/to/paperless/src/manage.py document_importer /path/to/somewhere/ + +Docker +______ + +Assuming that you've already gone through the steps above in the +:ref:`export ` section, then the easiest thing +to do is just re-use the ``/export`` path you already setup: + +.. code-block:: shell-session + + $ docker-compose run --rm consumer document_importer /export + +Similarly, if you're not using docker-compose, you can adjust the export +instructions above to do the import. + + +.. _utilities-retagger: + +The Re-tagger +------------- + +Say you've imported a few hundred documents and now want to introduce a tag +and apply its matching to all of the currently-imported docs. This problem is +common enough that there's a tool for it. + + +.. _utilities-retagger-howto: + +How to Use It +............. + +This too is done via the ``manage.py`` script: + .. code:: bash - $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere + $ /path/to/paperless/src/manage.py document_retagger -This will dump all of your PDFs into ``/path/to/somewhere`` for you to do with -as you please. The naming scheme on export is identical to that used for -import, so should you can now safely delete the entire project directly, -database, encrypted PDFs and all, and later create it all again simply by -running the consumer again and dumping all of these files into -``CONSUMPTION_DIR``. +That's it. It'll loop over all of the documents in your database and attempt +to match all of your tags to them. If one matches, it'll be applied. And +don't worry, you can run this as often as you like, it' won't double-tag +a document. 
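Conceptually, the re-tagger is just a loop over documents and tags. The sketch below uses the
``Tag.match_all()`` helper that appears in this changeset's consumer code; the ``retag_all()``
function itself is illustrative, and the actual ``document_retagger`` command may differ in
detail.

.. code-block:: python

    from documents.models import Document, Tag

    def retag_all():
        # Try every tag's matching algorithm against every document.
        for document in Document.objects.all():
            matched = Tag.match_all(document.content)
            if matched:
                # ManyToMany.add() is idempotent, which is why running the
                # re-tagger repeatedly never double-tags a document.
                document.tags.add(*matched)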
diff --git a/src/logger/__init__.py b/media/documents/originals/.keep similarity index 100% rename from src/logger/__init__.py rename to media/documents/originals/.keep diff --git a/src/logger/migrations/__init__.py b/media/documents/thumbnails/.keep similarity index 100% rename from src/logger/migrations/__init__.py rename to media/documents/thumbnails/.keep diff --git a/paperless.conf.example b/paperless.conf.example new file mode 100644 index 000000000..3ee429ea8 --- /dev/null +++ b/paperless.conf.example @@ -0,0 +1,33 @@ +# Sample paperless.conf +# Copy this file to /etc/paperless.conf and modify it to suit your needs. + +# This where your documents should go to be consumed. Make sure that it exists +# and that the user running the paperless service can read/write its contents +# before you start Paperless. +PAPERLESS_CONSUMPTION_DIR="" + +# These values are required if you want paperless to check a particular email +# box every 10 minutes and attempt to consume documents from there. If you +# don't define a HOST, mail checking will just be disabled. +PAPERLESS_CONSUME_MAIL_HOST="" +PAPERLESS_CONSUME_MAIL_PORT="" +PAPERLESS_CONSUME_MAIL_USER="" +PAPERLESS_CONSUME_MAIL_PASS="" + +# You must have a passphrase in order for Paperless to work at all. If you set +# this to "", GNUGPG will "encrypt" your PDF by writing it out as a zero-byte +# file. +# +# The passphrase you use here will be used when storing your documents in +# Paperless, but you can always export them in an unencrypted format by using +# document exporter. See the documentaiton for more information. +# +# One final note about the passphrase. Once you've consumed a document with +# one passphrase, DON'T CHANGE IT. Paperless assumes this to be a constant and +# can't properly export documents that were encrypted with an old passphrase if +# you've since changed it to a new one. +PAPERLESS_PASSPHRASE="secret" + +# If you intend to consume documents either via HTTP POST or by email, you must +# have a shared secret here. 
+PAPERLESS_SHARED_SECRET="" diff --git a/requirements.txt b/requirements.txt index 6dd8b32b5..6a133327a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,10 @@ -Django==1.9 +Django==1.9.2 django-extensions==1.6.1 +djangorestframework==3.3.2 +python-dotenv==0.3.0 filemagic==1.6 langdetect==1.0.5 -Pillow==3.0.0 +Pillow==3.1.1 pyocr==0.3.1 python-dateutil==2.4.2 python-gnupg==0.3.8 diff --git a/scripts/docker-entrypoint.sh b/scripts/docker-entrypoint.sh new file mode 100644 index 000000000..14d385469 --- /dev/null +++ b/scripts/docker-entrypoint.sh @@ -0,0 +1,74 @@ +#!/bin/bash +set -e + +# Source: https://github.com/sameersbn/docker-gitlab/ +map_uidgid() { + USERMAP_ORIG_UID=$(id -u paperless) + USERMAP_ORIG_UID=$(id -g paperless) + USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}} + USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID} + if [[ ${USERMAP_UID} != ${USERMAP_ORIG_UID} || ${USERMAP_GID} != ${USERMAP_ORIG_GID} ]]; then + echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID" + groupmod -g ${USERMAP_GID} paperless + sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd + fi +} + +set_permissions() { + # Set permissions for consumption directory + chgrp paperless "$PAPERLESS_CONSUMPTION_DIR" + chmod g+x "$PAPERLESS_CONSUMPTION_DIR" + + # Set permissions for application directory + chown -Rh paperless:paperless /usr/src/paperless +} + +initialize() { + map_uidgid + set_permissions +} + +install_languages() { + local langs="$1" + read -ra langs <<<"$langs" + + # Check that it is not empty + if [ ${#langs[@]} -eq 0 ]; then + return + fi + + # Update apt-lists + apt-get update + + # Loop over languages to be installed + for lang in "${langs[@]}"; do + pkg="tesseract-ocr-$lang" + if dpkg -s "$pkg" 2>&1 > /dev/null; then + continue + fi + + if ! apt-cache show "$pkg" 2>&1 > /dev/null; then + continue + fi + + apt-get install "$pkg" + done + + # Remove apt lists + rm -rf /var/lib/apt/lists/* +} + + +if [[ "$1" != "/"* ]]; then + initialize + + # Install additional languages if specified + if [ ! 
-z "$PAPERLESS_OCR_LANGUAGES" ]; then + install_languages "$PAPERLESS_OCR_LANGUAGES" + fi + + exec sudo -HEu paperless "/usr/src/paperless/src/manage.py" "$@" +fi + +exec "$@" + diff --git a/scripts/paperless-consumer.service b/scripts/paperless-consumer.service index 34d65dedb..79a27d3ce 100644 --- a/scripts/paperless-consumer.service +++ b/scripts/paperless-consumer.service @@ -2,10 +2,9 @@ Description=Paperless consumer [Service] -EnvironmentFile=/etc/conf.d/paperless User=paperless Group=paperless -ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer -v $PAPERLESS_CONSUMPTION_VERBOSITY +ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer [Install] WantedBy=multi-user.target diff --git a/scripts/paperless-webserver.service b/scripts/paperless-webserver.service index 1a2386471..9d20f5a1c 100644 --- a/scripts/paperless-webserver.service +++ b/scripts/paperless-webserver.service @@ -2,7 +2,6 @@ Description=Paperless webserver [Service] -EnvironmentFile=/etc/conf.d/paperless User=paperless Group=paperless ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py runserver 0.0.0.0:8000 diff --git a/scripts/vagrant-provision b/scripts/vagrant-provision index aa6ca5e14..940bf476c 100644 --- a/scripts/vagrant-provision +++ b/scripts/vagrant-provision @@ -1,13 +1,31 @@ #!/bin/bash -# install packages -sudo apt-get update -sudo apt-get build-dep -y python-imaging -sudo apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev -sudo apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git -sudo apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick +# Install packages +apt-get update +apt-get build-dep -y python-imaging +apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev +apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git +apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick unpaper -# setup python project -pushd /opt/paperless -sudo pip3 install -r requirements.txt -popd +# Python dependencies +pip3 install -r /opt/paperless/requirements.txt + +# Create the environment file +cat /opt/paperless/paperless.conf.example | sed -e 's#CONSUMPTION_DIR=""#CONSUMPTION_DIR="/home/vagrant/consumption"#' > /etc/paperless.conf +chmod 0640 /etc/paperless.conf +chown root:vagrant /etc/paperless.conf + +# Create the consumption directory +mkdir /home/vagrant/consumption +chown vagrant:vagrant /home/vagrant/consumption + +echo " + + +Now follow the remaining steps in the Vagrant section of the setup +documentation to complete the process: + +http://paperless.readthedocs.org/en/latest/setup.html#setup-installation-vagrant + + +" diff --git a/src/documents/admin.py b/src/documents/admin.py index 635b9ddf8..a5b523492 100644 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -3,7 +3,7 @@ from django.contrib.auth.models import User, Group from django.core.urlresolvers import reverse from django.templatetags.static import static -from .models import Sender, Tag, Document +from .models import Correspondent, Tag, Document, Log class MonthListFilter(admin.SimpleListFilter): @@ -45,39 +45,73 @@ class DocumentAdmin(admin.ModelAdmin): "all": ("paperless.css",) } - search_fields = ("sender__name", "title", "content") - list_display = ("created", "sender", "title", "tags_", "document") - list_filter = ("tags", "sender", MonthListFilter) + 
search_fields = ("correspondent__name", "title", "content") + list_display = ("created_", "correspondent", "title", "tags_", "document") + list_filter = ("tags", "correspondent", MonthListFilter) list_per_page = 25 + def created_(self, obj): + return obj.created.date().strftime("%Y-%m-%d") + def tags_(self, obj): r = "" for tag in obj.tags.all(): - r += '{}'.format( - tag.get_colour_display(), - "{}?tags__id__exact={}".format( - reverse("admin:documents_document_changelist"), - tag.pk - ), - tag.slug + colour = tag.get_colour_display() + r += self._html_tag( + "a", + tag.slug, + **{ + "class": "tag", + "style": "background-color: {};".format(colour), + "href": "{}?tags__id__exact={}".format( + reverse("admin:documents_document_changelist"), + tag.pk + ) + } ) return r tags_.allow_tags = True def document(self, obj): - return '' \ - '{} icon' \ - ''.format( - obj.download_url, - static("documents/img/{}.png".format(obj.file_type)), - obj.file_type, - obj.file_name - ) + return self._html_tag( + "a", + self._html_tag( + "img", + src=static("documents/img/{}.png".format(obj.file_type)), + width=22, + height=22, + alt=obj.file_type, + title=obj.file_name + ), + href=obj.download_url + ) document.allow_tags = True -admin.site.register(Sender) + @staticmethod + def _html_tag(kind, inside=None, **kwargs): + + attributes = [] + for lft, rgt in kwargs.items(): + attributes.append('{}="{}"'.format(lft, rgt)) + + if inside is not None: + return "<{kind} {attributes}>{inside}".format( + kind=kind, attributes=" ".join(attributes), inside=inside) + + return "<{} {}/>".format(kind, " ".join(attributes)) + + +class LogAdmin(admin.ModelAdmin): + + list_display = ("message", "level", "component") + list_filter = ("level", "component",) + + +admin.site.register(Correspondent) admin.site.register(Tag, TagAdmin) admin.site.register(Document, DocumentAdmin) +admin.site.register(Log, LogAdmin) + # Unless we implement multi-user, these default registrations don't make sense. admin.site.unregister(Group) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 5ca42813b..fbdbbc276 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,5 +1,8 @@ import datetime +import logging import tempfile +import uuid + from multiprocessing.pool import Pool import itertools @@ -17,20 +20,14 @@ from PIL import Image from django.conf import settings from django.utils import timezone from django.template.defaultfilters import slugify +from pyocr.tesseract import TesseractError -from logger.models import Log from paperless.db import GnuPG -from .models import Sender, Tag, Document +from .models import Correspondent, Tag, Document, Log from .languages import ISO639 -def image_to_string(args): - self, png, lang = args - with Image.open(os.path.join(self.SCRATCH, png)) as f: - return self.OCR.image_to_string(f, lang=lang) - - class OCRError(Exception): pass @@ -42,8 +39,8 @@ class ConsumerError(Exception): class Consumer(object): """ Loop over every file found in CONSUMPTION_DIR and: - 1. Convert it to a greyscale png - 2. Use tesseract on the png + 1. Convert it to a greyscale pnm + 2. Use tesseract on the pnm 3. Encrypt and store the document in the MEDIA_ROOT 4. Store the OCR'd text in the database 5. 
Delete the document and image(s) @@ -51,28 +48,29 @@ class Consumer(object): SCRATCH = settings.SCRATCH_DIR CONVERT = settings.CONVERT_BINARY + UNPAPER = settings.UNPAPER_BINARY CONSUME = settings.CONSUMPTION_DIR THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None - OCR = pyocr.get_available_tools()[0] DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE REGEX_TITLE = re.compile( r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE ) - REGEX_SENDER_TITLE = re.compile( + REGEX_CORRESPONDENT_TITLE = re.compile( r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE ) - REGEX_SENDER_TITLE_TAGS = re.compile( + REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE ) - def __init__(self, verbosity=1): + def __init__(self): - self.verbosity = verbosity + self.logger = logging.getLogger(__name__) + self.logging_group = None try: os.makedirs(self.SCRATCH) @@ -92,6 +90,12 @@ class Consumer(object): raise ConsumerError( "Consumption directory {} does not exist".format(self.CONSUME)) + def log(self, level, message): + getattr(self.logger, level)(message, extra={ + "group": self.logging_group, + "component": Log.COMPONENT_CONSUMER + }) + def consume(self): for doc in os.listdir(self.CONSUME): @@ -110,122 +114,156 @@ class Consumer(object): if self._is_ready(doc): continue - Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER) + self.logging_group = uuid.uuid4() + + self.log("info", "Consuming {}".format(doc)) tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) - pngs = self._get_greyscale(tempdir, doc) + imgs = self._get_greyscale(tempdir, doc) + thumbnail = self._get_thumbnail(tempdir, doc) try: - text = self._get_ocr(pngs) - self._store(text, doc) - except OCRError: + text = self._get_ocr(imgs) + self._store(text, doc, thumbnail) + except OCRError as e: self._ignore.append(doc) - Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER) + self.log("error", "OCR FAILURE for {}: {}".format(doc, e)) + self._cleanup_tempdir(tempdir) continue - finally: - self._cleanup(tempdir, doc) + else: + self._cleanup_tempdir(tempdir) + self._cleanup_doc(doc) def _get_greyscale(self, tempdir, doc): + """ + Greyscale images are easier for Tesseract to OCR + """ - Log.debug( - "Generating greyscale image from {}".format(doc), - Log.COMPONENT_CONSUMER - ) - - png = os.path.join(tempdir, "convert-%04d.jpg") + self.log("info", "Generating greyscale image from {}".format(doc)) + # Convert PDF to multiple PNMs + pnm = os.path.join(tempdir, "convert-%04d.pnm") subprocess.Popen(( self.CONVERT, "-density", "300", "-depth", "8", - "-type", "grayscale", doc, png + "-type", "grayscale", doc, pnm )).wait() - pngs = [os.path.join(tempdir, f) for f in os.listdir(tempdir) if f.startswith("convert")] - return sorted(filter(lambda f: os.path.isfile(f), pngs)) + # Get a list of converted images + pnms = [] + for f in os.listdir(tempdir): + if f.endswith(".pnm"): + pnms.append(os.path.join(tempdir, f)) - @staticmethod - def _guess_language(text): + # Run unpaper in parallel on converted images + with Pool(processes=self.THREADS) as pool: + pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms)) + + # Return list of converted images, processed with unpaper + pnms = [] + for f in os.listdir(tempdir): + if f.endswith(".unpaper.pnm"): + pnms.append(os.path.join(tempdir, f)) + + return sorted(filter(lambda __: os.path.isfile(__), pnms)) + + def _get_thumbnail(self, tempdir, doc): + """ + The thumbnail of a 
PDF is just a 500px wide image of the first page. + """ + + self.log("info", "Generating the thumbnail") + + subprocess.Popen(( + self.CONVERT, + "-scale", "500x5000", + "-alpha", "remove", + doc, + os.path.join(tempdir, "convert-%04d.png") + )).wait() + + return os.path.join(tempdir, "convert-0000.png") + + def _guess_language(self, text): try: guess = langdetect.detect(text) - Log.debug( - "Language detected: {}".format(guess), - Log.COMPONENT_CONSUMER - ) + self.log("debug", "Language detected: {}".format(guess)) return guess except Exception as e: - Log.warning( - "Language detection error: {}".format(e), Log.COMPONENT_MAIL) + self.log("warning", "Language detection error: {}".format(e)) - def _get_ocr(self, pngs): + def _get_ocr(self, imgs): """ Attempts to do the best job possible OCR'ing the document based on simple language detection trial & error. """ - if not pngs: - raise OCRError + if not imgs: + raise OCRError("No images found") - Log.debug("OCRing the document", Log.COMPONENT_CONSUMER) + self.log("info", "OCRing the document") # Since the division gets rounded down by int, this calculation works # for every edge-case, i.e. 1 - middle = int(len(pngs) / 2) - raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE) + middle = int(len(imgs) / 2) + raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE) guessed_language = self._guess_language(raw_text) if not guessed_language or guessed_language not in ISO639: - Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER) + self.log("warning", "Language detection failed!") if settings.FORGIVING_OCR: - Log.warning( - "As FORGIVING_OCR is enabled, we're going to make the best " - "with what we have.", - Log.COMPONENT_CONSUMER + self.log( + "warning", + "As FORGIVING_OCR is enabled, we're going to make the " + "best with what we have." ) - raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) + raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) return raw_text - raise OCRError + raise OCRError("Language detection failed") if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: - raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) + raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) return raw_text try: - return self._ocr(pngs, ISO639[guessed_language]) + return self._ocr(imgs, ISO639[guessed_language]) except pyocr.pyocr.tesseract.TesseractError: if settings.FORGIVING_OCR: - Log.warning( + self.log( + "warning", "OCR for {} failed, but we're going to stick with what " "we've got since FORGIVING_OCR is enabled.".format( guessed_language - ), - Log.COMPONENT_CONSUMER + ) ) - raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) + raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) return raw_text - raise OCRError + raise OCRError( + "The guessed language is not available in this instance of " + "Tesseract." + ) - def _assemble_ocr_sections(self, pngs, middle, text): + def _assemble_ocr_sections(self, imgs, middle, text): """ Given a `middle` value and the text that middle page represents, we OCR the remainder of the document and return the whole thing. """ - text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + text - text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE) + text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text + text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) return text - def _ocr(self, pngs, lang): + def _ocr(self, imgs, lang): """ Performs a single OCR attempt. 
""" - if not pngs: + if not imgs: return "" - Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER) + self.log("info", "Parsing for {}".format(lang)) with Pool(processes=self.THREADS) as pool: - r = pool.map( - image_to_string, itertools.product([self], pngs, [lang])) + r = pool.map(image_to_string, itertools.product(imgs, [lang])) r = " ".join(r) # Strip out excess white space to allow matching to go smoother @@ -233,16 +271,18 @@ class Consumer(object): def _guess_attributes_from_name(self, parseable): """ - We use a crude naming convention to make handling the sender, title, and - tags easier: - " - - <tags>.<suffix>" - "<sender> - <title>.<suffix>" + We use a crude naming convention to make handling the correspondent, + title, and tags easier: + "<correspondent> - <title> - <tags>.<suffix>" + "<correspondent> - <title>.<suffix>" "<title>.<suffix>" """ - def get_sender(sender_name): - return Sender.objects.get_or_create( - name=sender_name, defaults={"slug": slugify(sender_name)})[0] + def get_correspondent(correspondent_name): + return Correspondent.objects.get_or_create( + name=correspondent_name, + defaults={"slug": slugify(correspondent_name)} + )[0] def get_tags(tags): r = [] @@ -251,40 +291,47 @@ class Consumer(object): Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) return tuple(r) - # First attempt: "<sender> - <title> - <tags>.<suffix>" - m = re.match(self.REGEX_SENDER_TITLE_TAGS, parseable) + def get_suffix(suffix): + suffix = suffix.lower() + if suffix == "jpeg": + return "jpg" + return suffix + + # First attempt: "<correspondent> - <title> - <tags>.<suffix>" + m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable) if m: return ( - get_sender(m.group(1)), + get_correspondent(m.group(1)), m.group(2), get_tags(m.group(3)), - m.group(4) + get_suffix(m.group(4)) ) - # Second attempt: "<sender> - <title>.<suffix>" - m = re.match(self.REGEX_SENDER_TITLE, parseable) + # Second attempt: "<correspondent> - <title>.<suffix>" + m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable) if m: - return get_sender(m.group(1)), m.group(2), (), m.group(3) + return ( + get_correspondent(m.group(1)), + m.group(2), + (), + get_suffix(m.group(3)) + ) - # That didn't work, so we assume sender and tags are None + # That didn't work, so we assume correspondent and tags are None m = re.match(self.REGEX_TITLE, parseable) - return None, m.group(1), (), m.group(2) + return None, m.group(1), (), get_suffix(m.group(2)) - def _store(self, text, doc): + def _store(self, text, doc, thumbnail): sender, title, tags, file_type = self._guess_attributes_from_name(doc) - tags = list(tags) - - lower_text = text.lower() - relevant_tags = set( - [t for t in Tag.objects.all() if t.matches(lower_text)] + tags) + relevant_tags = set(list(Tag.match_all(text)) + list(tags)) stats = os.stat(doc) - Log.debug("Saving record to database", Log.COMPONENT_CONSUMER) + self.log("debug", "Saving record to database") document = Document.objects.create( - sender=sender, + correspondent=sender, title=title, content=text, file_type=file_type, @@ -296,22 +343,29 @@ class Consumer(object): if relevant_tags: tag_names = ", ".join([t.slug for t in relevant_tags]) - Log.debug( - "Tagging with {}".format(tag_names), Log.COMPONENT_CONSUMER) + self.log("debug", "Tagging with {}".format(tag_names)) document.tags.add(*relevant_tags) + # Encrypt and store the actual document with open(doc, "rb") as unencrypted: with open(document.source_path, "wb") as encrypted: - Log.debug("Encrypting", Log.COMPONENT_CONSUMER) + 
self.log("debug", "Encrypting the document") encrypted.write(GnuPG.encrypted(unencrypted)) - def _cleanup(self, tempdir, doc): - # Remove temporary directory recursively - Log.debug("Deleting directory {}".format(tempdir), Log.COMPONENT_CONSUMER) - shutil.rmtree(tempdir) + # Encrypt and store the thumbnail + with open(thumbnail, "rb") as unencrypted: + with open(document.thumbnail_path, "wb") as encrypted: + self.log("debug", "Encrypting the thumbnail") + encrypted.write(GnuPG.encrypted(unencrypted)) - # Remove doc - Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER) + self.log("info", "Completed") + + def _cleanup_tempdir(self, d): + self.log("debug", "Deleting directory {}".format(d)) + shutil.rmtree(d) + + def _cleanup_doc(self, doc): + self.log("debug", "Deleting document {}".format(doc)) os.unlink(doc) def _is_ready(self, doc): @@ -329,3 +383,23 @@ class Consumer(object): self.stats[doc] = t return False + + +def image_to_string(args): + img, lang = args + ocr = pyocr.get_available_tools()[0] + with Image.open(os.path.join(Consumer.SCRATCH, img)) as f: + if ocr.can_detect_orientation(): + try: + orientation = ocr.detect_orientation(f, lang=lang) + f = f.rotate(orientation["angle"], expand=1) + except TesseractError: + pass + return ocr.image_to_string(f, lang=lang) + + +def run_unpaper(args): + unpaper, pnm = args + subprocess.Popen(( + unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm") + )).wait() diff --git a/src/documents/forms.py b/src/documents/forms.py index d544917b4..d4c01745a 100644 --- a/src/documents/forms.py +++ b/src/documents/forms.py @@ -8,13 +8,13 @@ from time import mktime from django import forms from django.conf import settings -from .models import Document, Sender +from .models import Document, Correspondent from .consumer import Consumer class UploadForm(forms.Form): - SECRET = settings.UPLOAD_SHARED_SECRET + SECRET = settings.SHARED_SECRET TYPE_LOOKUP = { "application/pdf": Document.TYPE_PDF, "image/png": Document.TYPE_PNG, @@ -23,31 +23,36 @@ class UploadForm(forms.Form): "image/tiff": Document.TYPE_TIF, } - sender = forms.CharField( - max_length=Sender._meta.get_field("name").max_length, required=False) + correspondent = forms.CharField( + max_length=Correspondent._meta.get_field("name").max_length, + required=False + ) title = forms.CharField( - max_length=Document._meta.get_field("title").max_length, required=False) + max_length=Document._meta.get_field("title").max_length, + required=False + ) document = forms.FileField() signature = forms.CharField(max_length=256) - def clean_sender(self): + def clean_correspondent(self): """ I suppose it might look cleaner to use .get_or_create() here, but that - would also allow someone to fill up the db with bogus senders before all - validation was met. + would also allow someone to fill up the db with bogus correspondents + before all validation was met. 
""" - sender = self.cleaned_data.get("sender") - if not sender: + corresp = self.cleaned_data.get("correspondent") + if not corresp: return None - if not Sender.SAFE_REGEX.match(sender) or " - " in sender: - raise forms.ValidationError("That sender name is suspicious.") - return sender + if not Correspondent.SAFE_REGEX.match(corresp) or " - " in corresp: + raise forms.ValidationError( + "That correspondent name is suspicious.") + return corresp def clean_title(self): title = self.cleaned_data.get("title") if not title: return None - if not Sender.SAFE_REGEX.match(title) or " - " in title: + if not Correspondent.SAFE_REGEX.match(title) or " - " in title: raise forms.ValidationError("That title is suspicious.") def clean_document(self): @@ -59,10 +64,10 @@ class UploadForm(forms.Form): return document, self.TYPE_LOOKUP[file_type] def clean(self): - sender = self.clened_data("sender") + corresp = self.clened_data("correspondent") title = self.cleaned_data("title") signature = self.cleaned_data("signature") - if sha256(sender + title + self.SECRET).hexdigest() == signature: + if sha256(corresp + title + self.SECRET).hexdigest() == signature: return True return False @@ -73,13 +78,15 @@ class UploadForm(forms.Form): form do that as well. Think of it as a poor-man's queue server. """ - sender = self.clened_data("sender") + correspondent = self.clened_data("correspondent") title = self.cleaned_data("title") document, file_type = self.cleaned_data.get("document") t = int(mktime(datetime.now())) file_name = os.path.join( - Consumer.CONSUME, "{} - {}.{}".format(sender, title, file_type)) + Consumer.CONSUME, + "{} - {}.{}".format(correspondent, title, file_type) + ) with open(file_name, "wb") as f: f.write(document) diff --git a/src/documents/languages.py b/src/documents/languages.py index 2bfafe08a..5ea560654 100644 --- a/src/documents/languages.py +++ b/src/documents/languages.py @@ -185,10 +185,10 @@ ISO639 = { "yo": "yor", "za": "zha", - # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I have - # no idea which one is better, so I just picked the bigger file. + # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I + # have no idea which one is better, so I just picked the bigger file. "zh": "chi_tra", "zu": "zul" -} \ No newline at end of file +} diff --git a/src/documents/loggers.py b/src/documents/loggers.py new file mode 100644 index 000000000..3464478cc --- /dev/null +++ b/src/documents/loggers.py @@ -0,0 +1,30 @@ +import logging + + +class PaperlessLogger(logging.StreamHandler): + """ + A logger smart enough to know to log some kinds of messages to the database + for later retrieval in a pretty interface. 
+ """ + + def emit(self, record): + + logging.StreamHandler.emit(self, record) + + if not hasattr(record, "component"): + return + + # We have to do the import here or Django will barf when it tries to + # load this because the apps aren't loaded at that point + from .models import Log + + kwargs = { + "message": record.msg, + "component": record.component, + "level": record.levelno, + } + + if hasattr(record, "group"): + kwargs["group"] = record.group + + Log.objects.create(**kwargs) diff --git a/src/documents/mail.py b/src/documents/mail.py index 384567e60..5bacb5b5f 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -1,8 +1,10 @@ import datetime import imaplib +import logging import os import re import time +import uuid from base64 import b64decode from email import policy @@ -11,10 +13,8 @@ from dateutil import parser from django.conf import settings -from logger.models import Log - from .consumer import Consumer -from .models import Sender +from .models import Correspondent, Log class MailFetcherError(Exception): @@ -25,21 +25,34 @@ class InvalidMessageError(Exception): pass -class Message(object): +class Loggable(object): + + def __init__(self, group=None): + self.logger = logging.getLogger(__name__) + self.logging_group = group or uuid.uuid4() + + def log(self, level, message): + getattr(self.logger, level)(message, extra={ + "group": self.logging_group, + "component": Log.COMPONENT_MAIL + }) + + +class Message(Loggable): """ A crude, but simple email message class. We assume that there's a subject and n attachments, and that we don't care about the message body. """ - SECRET = settings.UPLOAD_SHARED_SECRET + SECRET = settings.SHARED_SECRET - def __init__(self, data, verbosity=1): + def __init__(self, data, group=None): """ Cribbed heavily from https://www.ianlewis.org/en/parsing-email-attachments-python """ - self.verbosity = verbosity + Loggable.__init__(self, group=group) self.subject = None self.time = None @@ -54,8 +67,7 @@ class Message(object): self._set_time(message) - Log.info( - 'Importing email: "{}"'.format(self.subject), Log.COMPONENT_MAIL) + self.log("info", 'Importing email: "{}"'.format(self.subject)) attachments = [] for part in message.walk(): @@ -91,7 +103,7 @@ class Message(object): def check_subject(self): if self.subject is None: raise InvalidMessageError("Message does not have a subject") - if not Sender.SAFE_REGEX.match(self.subject): + if not Correspondent.SAFE_REGEX.match(self.subject): raise InvalidMessageError("Message subject is unsafe: {}".format( self.subject)) @@ -134,9 +146,11 @@ class Attachment(object): return self.data -class MailFetcher(object): +class MailFetcher(Loggable): - def __init__(self, verbosity=1): + def __init__(self): + + Loggable.__init__(self) self._connection = None self._host = settings.MAIL_CONSUMPTION["HOST"] @@ -148,7 +162,6 @@ class MailFetcher(object): self._enabled = bool(self._host) self.last_checked = datetime.datetime.now() - self.verbosity = verbosity def pull(self): """ @@ -159,14 +172,14 @@ class MailFetcher(object): if self._enabled: - Log.info("Checking mail", Log.COMPONENT_MAIL) + # Reset the grouping id for each fetch + self.logging_group = uuid.uuid4() + + self.log("debug", "Checking mail") for message in self._get_messages(): - Log.debug( - 'Storing email: "{}"'.format(message.subject), - Log.COMPONENT_MAIL - ) + self.log("info", 'Storing email: "{}"'.format(message.subject)) t = int(time.mktime(message.time.timetuple())) file_name = os.path.join(Consumer.CONSUME, message.file_name) @@ -193,7 
+206,7 @@ class MailFetcher(object): self._connection.logout() except Exception as e: - Log.error(e, Log.COMPONENT_MAIL) + self.log("error", str(e)) return r @@ -218,9 +231,9 @@ class MailFetcher(object): message = None try: - message = Message(data[0][1], self.verbosity) + message = Message(data[0][1], self.logging_group) except InvalidMessageError as e: - Log.error(e, Log.COMPONENT_MAIL) + self.log("error", str(e)) else: self._connection.store(num, "+FLAGS", "\\Deleted") diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index ae72381e2..8116303b5 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -1,10 +1,12 @@ import datetime +import logging import os import time from django.conf import settings from django.core.management.base import BaseCommand, CommandError +from ...models import Log from ...consumer import Consumer, ConsumerError from ...mail import MailFetcher, MailFetcherError @@ -34,7 +36,7 @@ class Command(BaseCommand): self.verbosity = options["verbosity"] try: - self.file_consumer = Consumer(verbosity=self.verbosity) + self.file_consumer = Consumer() self.mail_fetcher = MailFetcher() except (ConsumerError, MailFetcherError) as e: raise CommandError(e) @@ -44,6 +46,13 @@ class Command(BaseCommand): except FileExistsError: pass + logging.getLogger(__name__).info( + "Starting document consumer at {}".format( + settings.CONSUMPTION_DIR + ), + extra={"component": Log.COMPONENT_CONSUMER} + ) + try: while True: self.loop() diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index ac448d8e8..913f7ae79 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -1,10 +1,12 @@ +import json import os import time from django.conf import settings from django.core.management.base import BaseCommand, CommandError +from django.core import serializers -from documents.models import Document +from documents.models import Document, Correspondent, Tag from paperless.db import GnuPG from ...mixins import Renderable @@ -14,21 +16,26 @@ class Command(Renderable, BaseCommand): help = """ Decrypt and rename all files in our collection into a given target - directory. Note that we don't export any of the parsed data since - that can always be re-collected via the consumer. + directory. And include a manifest file containing document data for + easy import. """.replace(" ", "") def add_arguments(self, parser): parser.add_argument("target") + parser.add_argument( + "--legacy", + action="store_true", + help="Don't try to export all of the document data, just dump the " + "original document files out in a format that makes " + "re-consuming them easy." 
+ ) def __init__(self, *args, **kwargs): - self.verbosity = 0 - self.target = None BaseCommand.__init__(self, *args, **kwargs) + self.target = None def handle(self, *args, **options): - self.verbosity = options["verbosity"] self.target = options["target"] if not os.path.exists(self.target): @@ -40,9 +47,22 @@ class Command(Renderable, BaseCommand): if not settings.PASSPHRASE: settings.PASSPHRASE = input("Please enter the passphrase: ") - for document in Document.objects.all(): + if options["legacy"]: + self.dump_legacy() + else: + self.dump() + + def dump(self): + + documents = Document.objects.all() + document_map = {d.pk: d for d in documents} + manifest = json.loads(serializers.serialize("json", documents)) + for document_dict in manifest: + + document = document_map[document_dict["pk"]] target = os.path.join(self.target, document.file_name) + document_dict["__exported_file_name__"] = target print("Exporting: {}".format(target)) @@ -50,3 +70,37 @@ class Command(Renderable, BaseCommand): f.write(GnuPG.decrypted(document.source_file)) t = int(time.mktime(document.created.timetuple())) os.utime(target, times=(t, t)) + + manifest += json.loads( + serializers.serialize("json", Correspondent.objects.all())) + + manifest += json.loads(serializers.serialize( + "json", Tag.objects.all())) + + with open(os.path.join(self.target, "manifest.json"), "w") as f: + json.dump(manifest, f, indent=2) + + def dump_legacy(self): + + for document in Document.objects.all(): + + target = os.path.join( + self.target, self._get_legacy_file_name(document)) + + print("Exporting: {}".format(target)) + + with open(target, "wb") as f: + f.write(GnuPG.decrypted(document.source_file)) + t = int(time.mktime(document.created.timetuple())) + os.utime(target, times=(t, t)) + + @staticmethod + def _get_legacy_file_name(doc): + if doc.correspondent and doc.title: + tags = ",".join([t.slug for t in doc.tags.all()]) + if tags: + return "{} - {} - {}.{}".format( + doc.correspondent, doc.title, tags, doc.file_type) + return "{} - {}.{}".format( + doc.correspondent, doc.title, doc.file_type) + return os.path.basename(doc.source_path) diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py new file mode 100644 index 000000000..63c961815 --- /dev/null +++ b/src/documents/management/commands/document_importer.py @@ -0,0 +1,99 @@ +import json +import os + +from django.conf import settings +from django.core.management.base import BaseCommand, CommandError +from django.core.management import call_command + +from documents.models import Document +from paperless.db import GnuPG + +from ...mixins import Renderable + + +class Command(Renderable, BaseCommand): + + help = """ + Using a manifest.json file, load the data from there, and import the + documents it refers to. 
+ """.replace(" ", "") + + def add_arguments(self, parser): + parser.add_argument("source") + + def __init__(self, *args, **kwargs): + BaseCommand.__init__(self, *args, **kwargs) + self.source = None + self.manifest = None + + def handle(self, *args, **options): + + self.source = options["source"] + + if not os.path.exists(self.source): + raise CommandError("That path doesn't exist") + + if not os.access(self.source, os.R_OK): + raise CommandError("That path doesn't appear to be readable") + + manifest_path = os.path.join(self.source, "manifest.json") + self._check_manifest_exists(manifest_path) + + with open(manifest_path) as f: + self.manifest = json.load(f) + + self._check_manifest() + + if not settings.PASSPHRASE: + raise CommandError( + "You need to define a passphrase before continuing. Please " + "consult the documentation for setting up Paperless." + ) + + # Fill up the database with whatever is in the manifest + call_command("loaddata", manifest_path) + + self._import_files_from_manifest() + + @staticmethod + def _check_manifest_exists(path): + if not os.path.exists(path): + raise CommandError( + "That directory doesn't appear to contain a manifest.json " + "file." + ) + + def _check_manifest(self): + + for record in self.manifest: + + if not record["model"] == "documents.document": + continue + + if "__exported_file_name__" not in record: + raise CommandError( + 'The manifest file contains a record which does not ' + 'refer to an actual document file.' + ) + + doc_file = record["__exported_file_name__"] + if not os.path.exists(os.path.join(self.source, doc_file)): + raise CommandError( + 'The manifest file refers to "{}" which does not ' + 'appear to be in the source directory.'.format(doc_file) + ) + + def _import_files_from_manifest(self): + + for record in self.manifest: + + if not record["model"] == "documents.document": + continue + + doc_file = record["__exported_file_name__"] + document = Document.objects.get(pk=record["pk"]) + with open(doc_file, "rb") as unencrypted: + with open(document.source_path, "wb") as encrypted: + print("Encrypting {} and saving it to {}".format( + doc_file, document.source_path)) + encrypted.write(GnuPG.encrypted(unencrypted)) diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py index d7519f53b..8f56e1eea 100644 --- a/src/documents/management/commands/document_retagger.py +++ b/src/documents/management/commands/document_retagger.py @@ -10,8 +10,8 @@ class Command(Renderable, BaseCommand): help = """ Using the current set of tagging rules, apply said rules to all documents in the database, effectively allowing you to back-tag all - previously indexed documents with tags created (or modified) after their - initial import. + previously indexed documents with tags created (or modified) after + their initial import. 
""".replace(" ", "") def __init__(self, *args, **kwargs): @@ -23,9 +23,10 @@ class Command(Renderable, BaseCommand): self.verbosity = options["verbosity"] for document in Document.objects.all(): + tags = Tag.objects.exclude( pk__in=document.tags.values_list("pk", flat=True)) - for tag in tags: - if tag.matches(document.content): - print('Tagging {} with "{}"'.format(document, tag)) - document.tags.add(tag) + + for tag in Tag.match_all(document.content, tags): + print('Tagging {} with "{}"'.format(document, tag)) + document.tags.add(tag) diff --git a/src/documents/management/commands/loaddata_stdin.py b/src/documents/management/commands/loaddata_stdin.py new file mode 100644 index 000000000..9cce7a047 --- /dev/null +++ b/src/documents/management/commands/loaddata_stdin.py @@ -0,0 +1,20 @@ +import sys + +from django.core.management.commands.loaddata import Command as LoadDataCommand + + +class Command(LoadDataCommand): + """ + Allow the loading of data from standard in. Sourced originally from: + https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 (MIT licensed) + """ + + def parse_name(self, fixture_name): + self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None) + if fixture_name == '-': + return '-', 'json', 'stdin' + + def find_fixtures(self, fixture_label): + if fixture_label == '-': + return [('-', None, '-')] + return super(Command, self).find_fixtures(fixture_label) diff --git a/src/documents/managers.py b/src/documents/managers.py new file mode 100644 index 000000000..e7b0751ca --- /dev/null +++ b/src/documents/managers.py @@ -0,0 +1,70 @@ +from django.conf import settings + +from django.db import models +from django.db.models.aggregates import Max + + +class GroupConcat(models.Aggregate): + """ + Theoretically, this should work in Sqlite, PostgreSQL, and MySQL, but I've + only ever tested it in Sqlite. 
+ """ + + ENGINE_SQLITE = 1 + ENGINE_POSTGRESQL = 2 + ENGINE_MYSQL = 3 + ENGINES = { + "django.db.backends.sqlite3": ENGINE_SQLITE, + "django.db.backends.postgresql_psycopg2": ENGINE_POSTGRESQL, + "django.db.backends.postgresql": ENGINE_POSTGRESQL, + "django.db.backends.mysql": ENGINE_MYSQL + } + + def __init__(self, expression, separator="\n", **extra): + + self.engine = self._get_engine() + self.function = self._get_function() + self.template = self._get_template(separator) + + models.Aggregate.__init__( + self, + expression, + output_field=models.CharField(), + **extra + ) + + def _get_engine(self): + engine = settings.DATABASES["default"]["ENGINE"] + try: + return self.ENGINES[engine] + except KeyError: + raise NotImplementedError( + "There's currently no support for {} when it comes to group " + "concatenation in Paperless".format(engine) + ) + + def _get_function(self): + if self.engine == self.ENGINE_POSTGRESQL: + return "STRING_AGG" + return "GROUP_CONCAT" + + def _get_template(self, separator): + if self.engine == self.ENGINE_MYSQL: + return "%(function)s(%(expressions)s, SEPARATOR '{}')".format( + separator) + return "%(function)s(%(expressions)s, '{}')".format(separator) + + +class LogQuerySet(models.query.QuerySet): + + def by_group(self): + return self.values("group").annotate( + time=Max("modified"), + messages=GroupConcat("message"), + ).order_by("-time") + + +class LogManager(models.Manager): + + def get_queryset(self): + return LogQuerySet(self.model, using=self._db) diff --git a/src/logger/migrations/0001_initial.py b/src/documents/migrations/0010_log.py similarity index 57% rename from src/logger/migrations/0001_initial.py rename to src/documents/migrations/0010_log.py index 029fe43c2..57cf804b7 100644 --- a/src/logger/migrations/0001_initial.py +++ b/src/documents/migrations/0010_log.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Generated by Django 1.9 on 2016-02-14 16:08 +# Generated by Django 1.9 on 2016-02-27 17:54 from __future__ import unicode_literals from django.db import migrations, models @@ -7,9 +7,8 @@ from django.db import migrations, models class Migration(migrations.Migration): - initial = True - dependencies = [ + ('documents', '0009_auto_20160214_0040'), ] operations = [ @@ -17,14 +16,15 @@ class Migration(migrations.Migration): name='Log', fields=[ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('time', models.DateTimeField(auto_now_add=True)), + ('group', models.UUIDField(blank=True)), ('message', models.TextField()), - ('level', models.PositiveIntegerField(choices=[(1, 'Error'), (2, 'Warning'), (3, 'Informational'), (4, 'Debugging')], default=3)), + ('level', models.PositiveIntegerField(choices=[(10, 'Debugging'), (20, 'Informational'), (30, 'Warning'), (40, 'Error'), (50, 'Critical')], default=20)), ('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])), + ('created', models.DateTimeField(auto_now_add=True)), + ('modified', models.DateTimeField(auto_now=True)), ], - ), - migrations.AlterModelOptions( - name='log', - options={'ordering': ('-time',)}, + options={ + 'ordering': ('-modified',), + }, ), ] diff --git a/src/documents/migrations/0011_auto_20160303_1929.py b/src/documents/migrations/0011_auto_20160303_1929.py new file mode 100644 index 000000000..af4ee4c66 --- /dev/null +++ b/src/documents/migrations/0011_auto_20160303_1929.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.2 on 2016-03-03 19:29 +from __future__ import 
unicode_literals + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '0010_log'), + ] + + operations = [ + migrations.RenameModel( + old_name='Sender', + new_name='Correspondent', + ), + migrations.AlterModelOptions( + name='document', + options={'ordering': ('correspondent', 'title')}, + ), + migrations.RenameField( + model_name='document', + old_name='sender', + new_name='correspondent', + ), + ] diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py new file mode 100644 index 000000000..91d384c22 --- /dev/null +++ b/src/documents/migrations/0012_auto_20160305_0040.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.2 on 2016-03-05 00:40 +from __future__ import unicode_literals + +import gnupg +import os +import re +import shutil +import subprocess +import tempfile + +from django.conf import settings +from django.db import migrations +from django.utils.termcolors import colorize as colourise # Spelling hurts me + + +class GnuPG(object): + """ + A handy singleton to use when handling encrypted files. + """ + + gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME) + + @classmethod + def decrypted(cls, file_handle): + return cls.gpg.decrypt_file( + file_handle, passphrase=settings.PASSPHRASE).data + + @classmethod + def encrypted(cls, file_handle): + return cls.gpg.encrypt_file( + file_handle, + recipients=None, + passphrase=settings.PASSPHRASE, + symmetric=True + ).data + + +def move_documents_and_create_thumbnails(apps, schema_editor): + + documents = os.listdir(os.path.join(settings.MEDIA_ROOT, "documents")) + + if set(documents) == {"originals", "thumbnails"}: + return + + print(colourise( + "\n\n" + " This is a one-time only migration to generate thumbnails for all of your\n" + " documents so that future UIs will have something to work with. If you have\n" + " a lot of documents though, this may take a while, so a coffee break may be\n" + " in order." 
+ "\n", opts=("bold",) + )) + + try: + os.makedirs(settings.SCRATCH_DIR) + except FileExistsError: + pass + + for f in sorted(documents): + + if not f.endswith("gpg"): + continue + + print(" {} {} {}".format( + colourise("*", fg="green"), + colourise("Generating a thumbnail for", fg="white"), + colourise(f, fg="cyan") + )) + + thumb_temp = tempfile.mkdtemp( + prefix="paperless", dir=settings.SCRATCH_DIR) + orig_temp = tempfile.mkdtemp( + prefix="paperless", dir=settings.SCRATCH_DIR) + + orig_source = os.path.join(settings.MEDIA_ROOT, "documents", f) + orig_target = os.path.join(orig_temp, f.replace(".gpg", "")) + + with open(orig_source, "rb") as encrypted: + with open(orig_target, "wb") as unencrypted: + unencrypted.write(GnuPG.decrypted(encrypted)) + + subprocess.Popen(( + settings.CONVERT_BINARY, + "-scale", "500x5000", + "-alpha", "remove", + orig_target, + os.path.join(thumb_temp, "convert-%04d.png") + )).wait() + + thumb_source = os.path.join(thumb_temp, "convert-0000.png") + thumb_target = os.path.join( + settings.MEDIA_ROOT, + "documents", + "thumbnails", + re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f) + ) + with open(thumb_source, "rb") as unencrypted: + with open(thumb_target, "wb") as encrypted: + encrypted.write(GnuPG.encrypted(unencrypted)) + + shutil.rmtree(thumb_temp) + shutil.rmtree(orig_temp) + + shutil.move( + os.path.join(settings.MEDIA_ROOT, "documents", f), + os.path.join(settings.MEDIA_ROOT, "documents", "originals", f), + ) + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '0011_auto_20160303_1929'), + ] + + operations = [ + migrations.RunPython(move_documents_and_create_thumbnails), + ] diff --git a/src/documents/mixins.py b/src/documents/mixins.py index 881589fa3..4d4e9783f 100644 --- a/src/documents/mixins.py +++ b/src/documents/mixins.py @@ -1,7 +1,7 @@ class Renderable(object): """ - A handy mixin to make it easier/cleaner to print output based on a verbosity - value. + A handy mixin to make it easier/cleaner to print output based on a + verbosity value. """ def _render(self, text, verbosity): diff --git a/src/documents/models.py b/src/documents/models.py index 447beaa66..0d79dba0a 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -1,5 +1,7 @@ +import logging import os import re +import uuid from django.conf import settings from django.core.urlresolvers import reverse @@ -7,6 +9,8 @@ from django.db import models from django.template.defaultfilters import slugify from django.utils import timezone +from .managers import LogManager + class SluggedModel(models.Model): @@ -25,7 +29,7 @@ class SluggedModel(models.Model): return self.name -class Sender(SluggedModel): +class Correspondent(SluggedModel): # This regex is probably more restrictive than it needs to be, but it's # better safe than sorry. @@ -36,7 +40,7 @@ class Sender(SluggedModel): class Tag(SluggedModel): - + COLOURS = ( (1, "#a6cee3"), (2, "#1f78b4"), @@ -71,9 +75,9 @@ class Tag(SluggedModel): default=MATCH_ANY, help_text=( "Which algorithm you want to use when matching text to the OCR'd " - "PDF. Here, \"any\" looks for any occurrence of any word provided " - "in the PDF, while \"all\" requires that every word provided " - "appear in the PDF, albeit not in the order provided. A " + "PDF. Here, \"any\" looks for any occurrence of any word " + "provided in the PDF, while \"all\" requires that every word " + "provided appear in the PDF, albeit not in the order provided. 
A " "\"literal\" match means that the text you enter must appear in " "the PDF exactly as you've entered it, and \"regular expression\" " "uses a regex to match the PDF. If you don't know what a regex " @@ -86,28 +90,40 @@ class Tag(SluggedModel): return "{}: \"{}\" ({})".format( self.name, self.match, self.get_matching_algorithm_display()) + @classmethod + def match_all(cls, text, tags=None): + + if tags is None: + tags = cls.objects.all() + + text = text.lower() + for tag in tags: + if tag.matches(text): + yield tag + def matches(self, text): + # Check that match is not empty if self.match.strip() == "": return False if self.matching_algorithm == self.MATCH_ALL: for word in self.match.split(" "): - if word not in text: + if not re.search(r"\b{}\b".format(word), text): return False return True if self.matching_algorithm == self.MATCH_ANY: for word in self.match.split(" "): - if word in text: + if re.search(r"\b{}\b".format(word), text): return True return False if self.matching_algorithm == self.MATCH_LITERAL: - return self.match in text + return bool(re.search(r"\b{}\b".format(self.match), text)) if self.matching_algorithm == self.MATCH_REGEX: - return re.search(re.compile(self.match), text) + return bool(re.search(re.compile(self.match), text)) raise NotImplementedError("Unsupported matching algorithm") @@ -125,8 +141,8 @@ class Document(models.Model): TYPE_TIF = "tiff" TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,) - sender = models.ForeignKey( - Sender, blank=True, null=True, related_name="documents") + correspondent = models.ForeignKey( + Correspondent, blank=True, null=True, related_name="documents") title = models.CharField(max_length=128, blank=True, db_index=True) content = models.TextField(db_index=True) file_type = models.CharField( @@ -140,14 +156,15 @@ class Document(models.Model): modified = models.DateTimeField(auto_now=True, editable=False) class Meta(object): - ordering = ("sender", "title") + ordering = ("correspondent", "title") def __str__(self): - created = self.created.strftime("%Y-%m-%d") - if self.sender and self.title: - return "{}: {}, {}".format(created, self.sender, self.title) - if self.sender or self.title: - return "{}: {}".format(created, self.sender or self.title) + created = self.created.strftime("%Y%m%d%H%M%S") + if self.correspondent and self.title: + return "{}: {} - {}".format( + created, self.correspondent, self.title) + if self.correspondent or self.title: + return "{}: {}".format(created, self.correspondent or self.title) return str(created) @property @@ -155,6 +172,7 @@ class Document(models.Model): return os.path.join( settings.MEDIA_ROOT, "documents", + "originals", "{:07}.{}.gpg".format(self.pk, self.file_type) ) @@ -164,14 +182,71 @@ class Document(models.Model): @property def file_name(self): - if self.sender and self.title: - tags = ",".join([t.slug for t in self.tags.all()]) - if tags: - return "{} - {} - {}.{}".format( - self.sender, self.title, tags, self.file_type) - return "{} - {}.{}".format(self.sender, self.title, self.file_type) - return os.path.basename(self.source_path) + return slugify(str(self)) + "." 
+ self.file_type @property def download_url(self): - return reverse("fetch", kwargs={"pk": self.pk}) + return reverse("fetch", kwargs={"kind": "doc", "pk": self.pk}) + + @property + def thumbnail_path(self): + return os.path.join( + settings.MEDIA_ROOT, + "documents", + "thumbnails", + "{:07}.png.gpg".format(self.pk) + ) + + @property + def thumbnail_file(self): + return open(self.thumbnail_path, "rb") + + @property + def thumbnail_url(self): + return reverse("fetch", kwargs={"kind": "thumb", "pk": self.pk}) + + +class Log(models.Model): + + LEVELS = ( + (logging.DEBUG, "Debugging"), + (logging.INFO, "Informational"), + (logging.WARNING, "Warning"), + (logging.ERROR, "Error"), + (logging.CRITICAL, "Critical"), + ) + + COMPONENT_CONSUMER = 1 + COMPONENT_MAIL = 2 + COMPONENTS = ( + (COMPONENT_CONSUMER, "Consumer"), + (COMPONENT_MAIL, "Mail Fetcher") + ) + + group = models.UUIDField(blank=True) + message = models.TextField() + level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO) + component = models.PositiveIntegerField(choices=COMPONENTS) + created = models.DateTimeField(auto_now_add=True) + modified = models.DateTimeField(auto_now=True) + + objects = LogManager() + + class Meta(object): + ordering = ("-modified",) + + def __str__(self): + return self.message + + def save(self, *args, **kwargs): + """ + To allow for the case where we don't want to group the message, we + shouldn't force the caller to specify a one-time group value. However, + allowing group=None means that the manager can't differentiate the + different un-grouped messages, so instead we set a random one here. + """ + + if not self.group: + self.group = uuid.uuid4() + + models.Model.save(self, *args, **kwargs) diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py new file mode 100644 index 000000000..db50d34ba --- /dev/null +++ b/src/documents/serialisers.py @@ -0,0 +1,55 @@ +from rest_framework import serializers + +from .models import Correspondent, Tag, Document, Log + + +class CorrespondentSerializer(serializers.HyperlinkedModelSerializer): + + class Meta(object): + model = Correspondent + fields = ("id", "slug", "name") + + +class TagSerializer(serializers.HyperlinkedModelSerializer): + + class Meta(object): + model = Tag + fields = ( + "id", "slug", "name", "colour", "match", "matching_algorithm") + + +class DocumentSerializer(serializers.ModelSerializer): + + correspondent = serializers.HyperlinkedRelatedField( + read_only=True, view_name="drf:correspondent-detail", allow_null=True) + tags = serializers.HyperlinkedRelatedField( + read_only=True, view_name="drf:tag-detail", many=True) + + class Meta(object): + model = Document + fields = ( + "id", + "correspondent", + "title", + "content", + "file_type", + "tags", + "created", + "modified", + "file_name", + "download_url", + "thumbnail_url", + ) + + +class LogSerializer(serializers.ModelSerializer): + + time = serializers.DateTimeField() + messages = serializers.CharField() + + class Meta(object): + model = Log + fields = ( + "time", + "messages" + ) diff --git a/src/documents/templates/documents/index.html b/src/documents/templates/documents/index.html new file mode 100644 index 000000000..ccde2d389 --- /dev/null +++ b/src/documents/templates/documents/index.html @@ -0,0 +1,10 @@ +<!DOCTYPE html> + +<html lang="en-gb"> + <head> + <title>Paperless + + + + + diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 7cee524c3..04f92f98c 100644 --- a/src/documents/tests/test_consumer.py +++ 
b/src/documents/tests/test_consumer.py @@ -4,18 +4,26 @@ from ..consumer import Consumer class TestAttachment(TestCase): - + TAGS = ("tag1", "tag2", "tag3") CONSUMER = Consumer() - + SUFFIXES = ( + "pdf", "png", "jpg", "jpeg", "gif", + "PDF", "PNG", "JPG", "JPEG", "GIF", + "PdF", "PnG", "JpG", "JPeG", "GiF", + ) + def _test_guess_attributes_from_name(self, path, sender, title, tags): - for suffix in ("pdf", "png", "jpg", "jpeg", "gif"): + for suffix in self.SUFFIXES: f = path.format(suffix) results = self.CONSUMER._guess_attributes_from_name(f) self.assertEqual(results[0].name, sender, f) self.assertEqual(results[1], title, f) self.assertEqual(tuple([t.slug for t in results[2]]), tags, f) - self.assertEqual(results[3], suffix, f) + if suffix.lower() == "jpeg": + self.assertEqual(results[3], "jpg", f) + else: + self.assertEqual(results[3], suffix.lower(), f) def test_guess_attributes_from_name0(self): self._test_guess_attributes_from_name( diff --git a/src/documents/tests/test_importer.py b/src/documents/tests/test_importer.py new file mode 100644 index 000000000..8880aba66 --- /dev/null +++ b/src/documents/tests/test_importer.py @@ -0,0 +1,36 @@ +from django.core.management.base import CommandError +from django.test import TestCase + +from ..management.commands.document_importer import Command + + +class TestImporter(TestCase): + + def __init__(self, *args, **kwargs): + TestCase.__init__(self, *args, **kwargs) + + def test_check_manifest_exists(self): + cmd = Command() + self.assertRaises( + CommandError, cmd._check_manifest_exists, "/tmp/manifest.json") + + def test_check_manifest(self): + + cmd = Command() + cmd.source = "/tmp" + + cmd.manifest = [{"model": "documents.document"}] + with self.assertRaises(CommandError) as cm: + cmd._check_manifest() + self.assertTrue( + 'The manifest file contains a record' in str(cm.exception)) + + cmd.manifest = [{ + "model": "documents.document", + "__exported_file_name__": "noexist.pdf" + }] + # self.assertRaises(CommandError, cmd._check_manifest) + with self.assertRaises(CommandError) as cm: + cmd._check_manifest() + self.assertTrue( + 'The manifest file refers to "noexist.pdf"' in str(cm.exception)) diff --git a/src/documents/tests/test_logger.py b/src/documents/tests/test_logger.py new file mode 100644 index 000000000..23cea13e7 --- /dev/null +++ b/src/documents/tests/test_logger.py @@ -0,0 +1,142 @@ +import logging +import uuid + +from unittest import mock + +from django.test import TestCase + +from ..models import Log + + +class TestPaperlessLog(TestCase): + + def __init__(self, *args, **kwargs): + TestCase.__init__(self, *args, **kwargs) + self.logger = logging.getLogger( + "documents.management.commands.document_consumer") + + def test_ignored(self): + with mock.patch("logging.StreamHandler.emit") as __: + self.assertEqual(Log.objects.all().count(), 0) + self.logger.info("This is an informational message") + self.logger.warning("This is an informational message") + self.logger.error("This is an informational message") + self.logger.critical("This is an informational message") + self.assertEqual(Log.objects.all().count(), 0) + + def test_that_it_saves_at_all(self): + + kw = { + "group": uuid.uuid4(), + "component": Log.COMPONENT_MAIL + } + + self.assertEqual(Log.objects.all().count(), 0) + + with mock.patch("logging.StreamHandler.emit") as __: + + # Debug messages are ignored by default + self.logger.debug("This is a debugging message", extra=kw) + self.assertEqual(Log.objects.all().count(), 0) + + self.logger.info("This is an informational 
message", extra=kw) + self.assertEqual(Log.objects.all().count(), 1) + + self.logger.warning("This is an warning message", extra=kw) + self.assertEqual(Log.objects.all().count(), 2) + + self.logger.error("This is an error message", extra=kw) + self.assertEqual(Log.objects.all().count(), 3) + + self.logger.critical("This is a critical message", extra=kw) + self.assertEqual(Log.objects.all().count(), 4) + + def test_groups(self): + + kw1 = { + "group": uuid.uuid4(), + "component": Log.COMPONENT_MAIL + } + kw2 = { + "group": uuid.uuid4(), + "component": Log.COMPONENT_MAIL + } + + self.assertEqual(Log.objects.all().count(), 0) + + with mock.patch("logging.StreamHandler.emit") as __: + + # Debug messages are ignored by default + self.logger.debug("This is a debugging message", extra=kw1) + self.assertEqual(Log.objects.all().count(), 0) + + self.logger.info("This is an informational message", extra=kw2) + self.assertEqual(Log.objects.all().count(), 1) + self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 1) + + self.logger.warning("This is an warning message", extra=kw1) + self.assertEqual(Log.objects.all().count(), 2) + self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 1) + + self.logger.error("This is an error message", extra=kw2) + self.assertEqual(Log.objects.all().count(), 3) + self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 2) + + self.logger.critical("This is a critical message", extra=kw1) + self.assertEqual(Log.objects.all().count(), 4) + self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 2) + + def test_components(self): + + c1 = Log.COMPONENT_CONSUMER + c2 = Log.COMPONENT_MAIL + kw1 = { + "group": uuid.uuid4(), + "component": c1 + } + kw2 = { + "group": kw1["group"], + "component": c2 + } + + self.assertEqual(Log.objects.all().count(), 0) + + with mock.patch("logging.StreamHandler.emit") as __: + + # Debug messages are ignored by default + self.logger.debug("This is a debugging message", extra=kw1) + self.assertEqual(Log.objects.all().count(), 0) + + self.logger.info("This is an informational message", extra=kw2) + self.assertEqual(Log.objects.all().count(), 1) + self.assertEqual(Log.objects.filter(component=c2).count(), 1) + + self.logger.warning("This is an warning message", extra=kw1) + self.assertEqual(Log.objects.all().count(), 2) + self.assertEqual(Log.objects.filter(component=c1).count(), 1) + + self.logger.error("This is an error message", extra=kw2) + self.assertEqual(Log.objects.all().count(), 3) + self.assertEqual(Log.objects.filter(component=c2).count(), 2) + + self.logger.critical("This is a critical message", extra=kw1) + self.assertEqual(Log.objects.all().count(), 4) + self.assertEqual(Log.objects.filter(component=c1).count(), 2) + + def test_groupped_query(self): + + kw = { + "group": uuid.uuid4(), + "component": Log.COMPONENT_MAIL + } + with mock.patch("logging.StreamHandler.emit") as __: + self.logger.info("Message 0", extra=kw) + self.logger.info("Message 1", extra=kw) + self.logger.info("Message 2", extra=kw) + self.logger.info("Message 3", extra=kw) + + self.assertEqual(Log.objects.all().by_group().count(), 1) + self.assertEqual( + Log.objects.all().by_group()[0]["messages"], + "Message 0\nMessage 1\nMessage 2\nMessage 3" + ) diff --git a/src/documents/tests/test_mail.py b/src/documents/tests/test_mail.py index 9a9480db4..256c77231 100644 --- a/src/documents/tests/test_mail.py +++ b/src/documents/tests/test_mail.py @@ -3,6 +3,7 @@ import os import magic from hashlib import md5 +from unittest import mock 
from django.conf import settings from django.test import TestCase @@ -27,7 +28,8 @@ class TestMessage(TestCase): with open(self.sample, "rb") as f: - message = Message(f.read(), verbosity=0) + with mock.patch("logging.StreamHandler.emit") as __: + message = Message(f.read()) self.assertTrue(message) self.assertEqual(message.subject, "Test 0") diff --git a/src/documents/tests/test_tags.py b/src/documents/tests/test_tags.py new file mode 100644 index 000000000..e0ab43244 --- /dev/null +++ b/src/documents/tests/test_tags.py @@ -0,0 +1,119 @@ +from django.test import TestCase + +from ..models import Tag + + +class TestTagMatching(TestCase): + + def test_match_all(self): + + t = Tag.objects.create( + name="Test 0", + match="alpha charlie gamma", + matching_algorithm=Tag.MATCH_ALL + ) + self.assertFalse(t.matches("I have alpha in me")) + self.assertFalse(t.matches("I have charlie in me")) + self.assertFalse(t.matches("I have gamma in me")) + self.assertFalse(t.matches("I have alpha and charlie in me")) + self.assertTrue(t.matches("I have alpha, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas in me")) + self.assertFalse(t.matches("I have bravo in me")) + + t = Tag.objects.create( + name="Test 1", + match="12 34 56", + matching_algorithm=Tag.MATCH_ALL + ) + self.assertFalse(t.matches("I have 12 in me")) + self.assertFalse(t.matches("I have 34 in me")) + self.assertFalse(t.matches("I have 56 in me")) + self.assertFalse(t.matches("I have 12 and 34 in me")) + self.assertTrue(t.matches("I have 12 34, and 56 in me")) + self.assertFalse(t.matches("I have 120, 34, and 56 in me")) + self.assertFalse(t.matches("I have 123456 in me")) + self.assertFalse(t.matches("I have 01234567 in me")) + + def test_match_any(self): + + t = Tag.objects.create( + name="Test 0", + match="alpha charlie gamma", + matching_algorithm=Tag.MATCH_ANY + ) + + self.assertTrue(t.matches("I have alpha in me")) + self.assertTrue(t.matches("I have charlie in me")) + self.assertTrue(t.matches("I have gamma in me")) + self.assertTrue(t.matches("I have alpha and charlie in me")) + self.assertFalse(t.matches("I have alphas in me")) + self.assertFalse(t.matches("I have bravo in me")) + + t = Tag.objects.create( + name="Test 1", + match="12 34 56", + matching_algorithm=Tag.MATCH_ANY + ) + self.assertTrue(t.matches("I have 12 in me")) + self.assertTrue(t.matches("I have 34 in me")) + self.assertTrue(t.matches("I have 56 in me")) + self.assertTrue(t.matches("I have 12 and 34 in me")) + self.assertTrue(t.matches("I have 12 34, and 56 in me")) + self.assertTrue(t.matches("I have 120, 34, and 560 in me")) + self.assertFalse(t.matches("I have 120, 340, and 560 in me")) + self.assertFalse(t.matches("I have 123456 in me")) + self.assertFalse(t.matches("I have 01234567 in me")) + + def test_match_literal(self): + + t = Tag.objects.create( + name="Test 0", + match="alpha charlie gamma", + matching_algorithm=Tag.MATCH_LITERAL + ) + + self.assertFalse(t.matches("I have alpha in me")) + self.assertFalse(t.matches("I have charlie in me")) + self.assertFalse(t.matches("I have gamma in me")) + self.assertFalse(t.matches("I have alpha and charlie in me")) + self.assertFalse(t.matches("I have alpha, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas, charlie, and gamma in me")) + self.assertTrue(t.matches("I have 'alpha charlie gamma' in me")) + self.assertFalse(t.matches("I have alphas in me")) + self.assertFalse(t.matches("I have bravo in 
me")) + + t = Tag.objects.create( + name="Test 1", + match="12 34 56", + matching_algorithm=Tag.MATCH_LITERAL + ) + self.assertFalse(t.matches("I have 12 in me")) + self.assertFalse(t.matches("I have 34 in me")) + self.assertFalse(t.matches("I have 56 in me")) + self.assertFalse(t.matches("I have 12 and 34 in me")) + self.assertFalse(t.matches("I have 12 34, and 56 in me")) + self.assertFalse(t.matches("I have 120, 34, and 560 in me")) + self.assertFalse(t.matches("I have 120, 340, and 560 in me")) + self.assertFalse(t.matches("I have 123456 in me")) + self.assertFalse(t.matches("I have 01234567 in me")) + self.assertTrue(t.matches("I have 12 34 56 in me")) + + def test_match_regex(self): + + t = Tag.objects.create( + name="Test 0", + match="alpha\w+gamma", + matching_algorithm=Tag.MATCH_REGEX + ) + + self.assertFalse(t.matches("I have alpha in me")) + self.assertFalse(t.matches("I have gamma in me")) + self.assertFalse(t.matches("I have alpha and charlie in me")) + self.assertTrue(t.matches("I have alpha_and_gamma in me")) + self.assertTrue(t.matches("I have alphas_and_gamma in me")) + self.assertFalse(t.matches("I have alpha,and,gamma in me")) + self.assertFalse(t.matches("I have alpha and gamma in me")) + self.assertFalse(t.matches("I have alpha, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas in me")) diff --git a/src/documents/views.py b/src/documents/views.py index c92b6af09..1dc23aa4f 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1,21 +1,41 @@ +from django.contrib.auth.mixins import LoginRequiredMixin from django.http import HttpResponse -from django.template.defaultfilters import slugify from django.views.decorators.csrf import csrf_exempt -from django.views.generic import FormView, DetailView +from django.views.generic import FormView, DetailView, TemplateView + +from rest_framework.mixins import ( + RetrieveModelMixin, UpdateModelMixin, DestroyModelMixin, ListModelMixin) +from rest_framework.pagination import PageNumberPagination +from rest_framework.permissions import IsAuthenticated +from rest_framework.viewsets import ( + ModelViewSet, ReadOnlyModelViewSet, GenericViewSet) from paperless.db import GnuPG -from .models import Document from .forms import UploadForm +from .models import Correspondent, Tag, Document, Log +from .serialisers import ( + CorrespondentSerializer, TagSerializer, DocumentSerializer, LogSerializer) -class PdfView(DetailView): +class IndexView(TemplateView): + + template_name = "documents/index.html" + + def get_context_data(self, **kwargs): + print(kwargs) + print(self.request.GET) + print(self.request.POST) + return TemplateView.get_context_data(self, **kwargs) + + +class FetchView(DetailView): model = Document def render_to_response(self, context, **response_kwargs): """ - Override the default to return the unencrypted PDF as raw data. + Override the default to return the unencrypted image/PDF as raw data. """ content_types = { @@ -26,19 +46,25 @@ class PdfView(DetailView): Document.TYPE_TIF: "image/tiff", } + if self.kwargs["kind"] == "thumb": + return HttpResponse( + GnuPG.decrypted(self.object.thumbnail_file), + content_type=content_types[Document.TYPE_PNG] + ) + response = HttpResponse( GnuPG.decrypted(self.object.source_file), content_type=content_types[self.object.file_type] ) response["Content-Disposition"] = 'attachment; filename="{}"'.format( - slugify(str(self.object)) + "." 
+ self.object.file_type) + self.object.file_name) return response -class PushView(FormView): +class PushView(LoginRequiredMixin, FormView): """ - A crude REST API for creating documents. + A crude REST-ish API for creating documents. """ form_class = UploadForm @@ -52,3 +78,45 @@ class PushView(FormView): def form_invalid(self, form): return HttpResponse("0") + + +class StandardPagination(PageNumberPagination): + page_size = 25 + page_size_query_param = "page-size" + max_page_size = 100000 + + +class CorrespondentViewSet(ModelViewSet): + model = Correspondent + queryset = Correspondent.objects.all() + serializer_class = CorrespondentSerializer + pagination_class = StandardPagination + permission_classes = (IsAuthenticated,) + + +class TagViewSet(ModelViewSet): + model = Tag + queryset = Tag.objects.all() + serializer_class = TagSerializer + pagination_class = StandardPagination + permission_classes = (IsAuthenticated,) + + +class DocumentViewSet(RetrieveModelMixin, + UpdateModelMixin, + DestroyModelMixin, + ListModelMixin, + GenericViewSet): + model = Document + queryset = Document.objects.all() + serializer_class = DocumentSerializer + pagination_class = StandardPagination + permission_classes = (IsAuthenticated,) + + +class LogViewSet(ReadOnlyModelViewSet): + model = Log + queryset = Log.objects.all().by_group() + serializer_class = LogSerializer + pagination_class = StandardPagination + permission_classes = (IsAuthenticated,) diff --git a/src/logger/admin.py b/src/logger/admin.py deleted file mode 100644 index dc9446821..000000000 --- a/src/logger/admin.py +++ /dev/null @@ -1,12 +0,0 @@ -from django.contrib import admin - -from .models import Log - - -class LogAdmin(admin.ModelAdmin): - - list_display = ("message", "level", "component") - list_filter = ("level", "component",) - - -admin.site.register(Log, LogAdmin) diff --git a/src/logger/apps.py b/src/logger/apps.py deleted file mode 100644 index 2c1a7d735..000000000 --- a/src/logger/apps.py +++ /dev/null @@ -1,5 +0,0 @@ -from django.apps import AppConfig - - -class LoggerConfig(AppConfig): - name = 'logger' diff --git a/src/logger/models.py b/src/logger/models.py deleted file mode 100644 index 48774c199..000000000 --- a/src/logger/models.py +++ /dev/null @@ -1,50 +0,0 @@ -from django.db import models - - -class Log(models.Model): - - LEVEL_ERROR = 1 - LEVEL_WARNING = 2 - LEVEL_INFO = 3 - LEVEL_DEBUG = 4 - LEVELS = ( - (LEVEL_ERROR, "Error"), - (LEVEL_WARNING, "Warning"), - (LEVEL_INFO, "Informational"), - (LEVEL_DEBUG, "Debugging"), - ) - - COMPONENT_CONSUMER = 1 - COMPONENT_MAIL = 2 - COMPONENTS = ( - (COMPONENT_CONSUMER, "Consumer"), - (COMPONENT_MAIL, "Mail Fetcher") - ) - - time = models.DateTimeField(auto_now_add=True) - message = models.TextField() - level = models.PositiveIntegerField(choices=LEVELS, default=LEVEL_INFO) - component = models.PositiveIntegerField(choices=COMPONENTS) - - class Meta(object): - ordering = ("-time",) - - @classmethod - def error(cls, message, component): - cls.objects.create( - message=message, level=cls.LEVEL_ERROR, component=component) - - @classmethod - def warning(cls, message, component): - cls.objects.create( - message=message, level=cls.LEVEL_WARNING, component=component) - - @classmethod - def info(cls, message, component): - cls.objects.create( - message=message, level=cls.LEVEL_INFO, component=component) - - @classmethod - def debug(cls, message, component): - cls.objects.create( - message=message, level=cls.LEVEL_DEBUG, component=component) diff --git a/src/logger/tests.py 
b/src/logger/tests.py deleted file mode 100644 index 7ce503c2d..000000000 --- a/src/logger/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase - -# Create your tests here. diff --git a/src/logger/views.py b/src/logger/views.py deleted file mode 100644 index 91ea44a21..000000000 --- a/src/logger/views.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.shortcuts import render - -# Create your views here. diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 444989990..b7daecaf8 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -12,6 +12,8 @@ https://docs.djangoproject.com/en/1.9/ref/settings/ import os +from dotenv import load_dotenv + # Build paths inside the project like this: os.path.join(BASE_DIR, ...) BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -42,7 +44,8 @@ INSTALLED_APPS = [ "django_extensions", "documents", - "logger", + + "rest_framework", ] @@ -87,12 +90,12 @@ DATABASES = { "NAME": os.path.join(BASE_DIR, "..", "data", "db.sqlite3"), } } -if os.environ.get("PAPERLESS_DBUSER") and os.environ.get("PAPERLESS_DBPASS"): +if os.getenv("PAPERLESS_DBUSER") and os.getenv("PAPERLESS_DBPASS"): DATABASES["default"] = { "ENGINE": "django.db.backends.postgresql_psycopg2", - "NAME": os.environ.get("PAPERLESS_DBNAME", "paperless"), - "USER": os.environ.get("PAPERLESS_DBUSER"), - "PASSWORD": os.environ.get("PAPERLESS_DBPASS") + "NAME": os.getenv("PAPERLESS_DBNAME", "paperless"), + "USER": os.getenv("PAPERLESS_DBUSER"), + "PASSWORD": os.getenv("PAPERLESS_DBPASS") } @@ -139,55 +142,119 @@ STATIC_URL = '/static/' MEDIA_URL = "/media/" -# Paperless-specific stuffs -# Change these paths if yours are different +# Paperless-specific stuff +# You shouldn't have to edit any of these values. Rather, you can set these +# values in /etc/paperless.conf instead. # ---------------------------------------------------------------------------- +# Tap paperless.conf if it's available +if os.path.exists("/etc/paperless.conf"): + load_dotenv("/etc/paperless.conf") + + +# Logging + +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "handlers": { + "consumer": { + "class": "documents.loggers.PaperlessLogger", + } + }, + "loggers": { + "documents": { + "handlers": ["consumer"], + "level": os.getenv("PAPERLESS_CONSUMER_LOG_LEVEL", "INFO"), + }, + }, +} + + # The default language that tesseract will attempt to use when parsing # documents. It should be a 3-letter language code consistent with ISO 639. OCR_LANGUAGE = "eng" # The amount of threads to use for OCR -OCR_THREADS = os.environ.get("PAPERLESS_OCR_THREADS") +OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS") -# If this is true, any failed attempts to OCR a PDF will result in the PDF being -# indexed anyway, with whatever we could get. If it's False, the file will -# simply be left in the CONSUMPTION_DIR. -FORGIVING_OCR = True +# If this is true, any failed attempts to OCR a PDF will result in the PDF +# being indexed anyway, with whatever we could get. If it's False, the file +# will simply be left in the CONSUMPTION_DIR. 
+FORGIVING_OCR = bool(os.getenv("PAPERLESS_FORGIVING_OCR", "YES").lower() in ("yes", "y", "1", "t", "true")) # GNUPG needs a home directory for some reason -GNUPG_HOME = os.environ.get("HOME", "/dev/null") +GNUPG_HOME = os.getenv("HOME", "/tmp") -# Convert is part of the Imagemagick package -CONVERT_BINARY = "/usr/bin/convert" +# Convert is part of the ImageMagick package +CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY") + +# Unpaper +UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") # This will be created if it doesn't exist -SCRATCH_DIR = "/tmp/paperless" +SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless") # This is where Paperless will look for PDFs to index -CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME") +CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR") # If you want to use IMAP mail consumption, populate this with useful values. -# If you leave HOST set to None, we assume you're not going to use this feature. +# If you leave HOST set to None, we assume you're not going to use this +# feature. MAIL_CONSUMPTION = { - "HOST": os.environ.get("PAPERLESS_CONSUME_MAIL_HOST"), - "PORT": os.environ.get("PAPERLESS_CONSUME_MAIL_PORT"), - "USERNAME": os.environ.get("PAPERLESS_CONSUME_MAIL_USER"), - "PASSWORD": os.environ.get("PAPERLESS_CONSUME_MAIL_PASS"), + "HOST": os.getenv("PAPERLESS_CONSUME_MAIL_HOST"), + "PORT": os.getenv("PAPERLESS_CONSUME_MAIL_PORT"), + "USERNAME": os.getenv("PAPERLESS_CONSUME_MAIL_USER"), + "PASSWORD": os.getenv("PAPERLESS_CONSUME_MAIL_PASS"), "USE_SSL": True, # If True, use SSL/TLS to connect "INBOX": "INBOX" # The name of the inbox on the server } -# This is used to encrypt the original documents and decrypt them later when you -# want to download them. Set it and change the permissions on this file to +# This is used to encrypt the original documents and decrypt them later when +# you want to download them. Set it and change the permissions on this file to # 0600, or set it to `None` and you'll be prompted for the passphrase at # runtime. The default looks for an environment variable. # DON'T FORGET TO SET THIS as leaving it blank may cause some strange things # with GPG, including an interesting case where it may "encrypt" zero-byte # files. -PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE") +PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE") -# If you intend to use the "API" to push files into the consumer, you'll need to -# provide a shared secret here. Leaving this as the default will disable the -# API. -UPLOAD_SHARED_SECRET = os.environ.get("PAPERLESS_SECRET", "") +# If you intend to use the "API" to push files into the consumer, you'll need +# to provide a shared secret here. Leaving this as the default will disable +# the API. +SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "") + +# +# TODO: Remove after 1.2 +# +# This logic is here to address issue #44, wherein we were using inconsistent +# constant names vs. environment variables. If you're using Paperless for the +# first time, you can safely ignore everything from here on, so long as you're +# correctly defining the variables as per the documentation. +# + + +def deprecated(before, after): + print( + "\n\n" + "WARNING: {before} has been renamed to {after}.\n" + "WARNING: Use of {before} will not work as of version 1.2." 
+ "\n\n".format( + before=before, + after=after + ) + ) + +if not CONVERT_BINARY: + CONVERT_BINARY = "convert" + if os.getenv("PAPERLESS_CONVERT"): + deprecated("PAPERLESS_CONVERT", "PAPERLESS_CONVERT_BINARY") + CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT", CONVERT_BINARY) + +if not CONSUMPTION_DIR and os.getenv("PAPERLESS_CONSUME"): + deprecated("PAPERLESS_CONSUME", "PAPERLESS_CONSUMPTION_DIR") + CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUME") + +if not SHARED_SECRET and os.getenv("PAPERLESS_SECRET"): + deprecated("PAPERLESS_SECRET", "PAPERLESS_SHARED_SECRET") + SHARED_SECRET = os.getenv("PAPERLESS_SECRET", "") diff --git a/src/paperless/urls.py b/src/paperless/urls.py index 060953676..a7775a588 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -15,15 +15,46 @@ Including another URLconf 3. Add a URL to urlpatterns: url(r'^blog/', include(blog_urls)) """ from django.conf import settings -from django.conf.urls import url, static +from django.conf.urls import url, static, include from django.contrib import admin -from documents.views import PdfView, PushView +from rest_framework.routers import DefaultRouter + +from documents.views import ( + IndexView, FetchView, PushView, + CorrespondentViewSet, TagViewSet, DocumentViewSet, LogViewSet +) + +router = DefaultRouter() +router.register(r'correspondents', CorrespondentViewSet) +router.register(r'tags', TagViewSet) +router.register(r'documents', DocumentViewSet) +router.register(r'logs', LogViewSet) urlpatterns = [ - url(r"^fetch/(?P\d+)$", PdfView.as_view(), name="fetch"), - url(r'', admin.site.urls), + + # API + url( + r"^api/auth/", + include('rest_framework.urls', namespace="rest_framework") + ), + url(r"^api/", include(router.urls, namespace="drf")), + + # Normal pages (coming soon) + # url(r"^$", IndexView.as_view(), name="index"), + + # File downloads + url( + r"^fetch/(?Pdoc|thumb)/(?P\d+)$", + FetchView.as_view(), + name="fetch" + ), + + # The Django admin + url(r"admin/", admin.site.urls), + url(r"", admin.site.urls), # This is going away + ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) -if settings.UPLOAD_SHARED_SECRET: +if settings.SHARED_SECRET: urlpatterns.insert(0, url(r"^push$", PushView.as_view(), name="push")) diff --git a/src/paperless/version.py b/src/paperless/version.py index 7afad8b77..d61abb655 100644 --- a/src/paperless/version.py +++ b/src/paperless/version.py @@ -1 +1 @@ -__version__ = (0, 0, 6) +__version__ = (0, 1, 1) diff --git a/src/tox.ini b/src/tox.ini new file mode 100644 index 000000000..1840b507e --- /dev/null +++ b/src/tox.ini @@ -0,0 +1,23 @@ +# Tox (http://tox.testrun.org/) is a tool for running tests +# in multiple virtualenvs. This configuration file will run the +# test suite on all supported python versions. To use it, "pip install tox" +# and then run "tox" from this directory. + +[tox] +skipsdist = True +envlist = py34, py35, pep8 + +[testenv] +commands = {envpython} manage.py test +deps = -r{toxinidir}/../requirements.txt +setenv = + PAPERLESS_CONSUME=/tmp + PAPERLESS_PASSPHRASE=THISISNOTASECRET + PAPERLESS_SECRET=paperless + +[testenv:pep8] +commands=pep8 +deps=pep8 + +[pep8] +exclude=.tox,migrations,paperless/settings.py