Merge pull request #1 from danielquinn/master

Sync with upstream
This commit is contained in:
Brandon Odegard 2016-03-08 10:54:26 -06:00
commit f5e0a89a3f
57 changed files with 2230 additions and 453 deletions

7
.gitignore vendored
View File

@ -57,7 +57,9 @@ docs/_build/
target/
# Stored PDFs
media/*
media/documents/*.gpg
media/documents/thumbnails/*.gpg
media/documents/originals/*.gpg
# Sqlite database
db.sqlite3
@ -68,8 +70,9 @@ db.sqlite3
# Other stuff that doesn't belong
virtualenv
.vagrant
docker-compose.yml
docker-compose.env
# Used for development
scripts/import-for-development
environment

18
.travis.yml Normal file
View File

@ -0,0 +1,18 @@
language: python
sudo: false
matrix:
include:
- python: 3.4
env: TOXENV=py34
- python: 3.5
env: TOXENV=py35
- python: 3.5
env: TOXENV=pep8
install:
- pip install --requirement requirements.txt
- pip install tox
script: tox -c src/tox.ini

46
Dockerfile Normal file
View File

@ -0,0 +1,46 @@
# Runtime image for Paperless: OCR toolchain + Python deps + application source,
# running behind an entrypoint wrapper as a dedicated non-root user.
FROM python:3.5.1
# NOTE(review): MAINTAINER is deprecated; prefer `LABEL maintainer="..."`.
MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
# Install dependencies
# tesseract/imagemagick/ghostscript/unpaper form the OCR pipeline; apt lists
# are removed in the same layer so they never bloat the image.
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
sudo \
tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
&& rm -rf /var/lib/apt/lists/*
# Install python dependencies
# requirements.txt is copied on its own first so this layer stays cached when
# only application source changes.
RUN mkdir -p /usr/src/paperless
WORKDIR /usr/src/paperless
COPY requirements.txt /usr/src/paperless/
RUN pip install --no-cache-dir -r requirements.txt
# Copy application
RUN mkdir -p /usr/src/paperless/src
RUN mkdir -p /usr/src/paperless/data
RUN mkdir -p /usr/src/paperless/media
COPY src/ /usr/src/paperless/src/
COPY data/ /usr/src/paperless/data/
COPY media/ /usr/src/paperless/media/
# Set consumption directory
ENV PAPERLESS_CONSUMPTION_DIR /consume
RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR
# Migrate database
# NOTE(review): running migrate at build time bakes an (empty) sqlite database
# into an image layer; confirm the entrypoint re-runs migrations at container
# start, otherwise schema updates require a rebuild.
WORKDIR /usr/src/paperless/src
RUN ./manage.py migrate
# Create user
# Fixed uid/gid 1000 so host-mounted volumes map cleanly; presumably the
# entrypoint remaps these via USERMAP_UID/USERMAP_GID (see docker-compose.env)
# — TODO confirm in scripts/docker-entrypoint.sh.
RUN groupadd -g 1000 paperless \
&& useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
&& chown -Rh paperless:paperless /usr/src/paperless
# Setup entrypoint
COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
RUN chmod 755 /sbin/docker-entrypoint.sh
# Mount volumes
# Declared only after data/ and media/ were populated, so their contents seed
# the volumes on first run.
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume"]
ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
CMD ["--help"]

View File

@ -3,6 +3,7 @@ Paperless
|Documentation|
|Chat|
|Travis|
Scan, index, and archive all of your paper documents
@ -55,6 +56,7 @@ powerful tools.
* `ImageMagick`_ converts the images between colour and greyscale.
* `Tesseract`_ does the character recognition.
* `Unpaper`_ despeckles and deskews the scanned image.
* `GNU Privacy Guard`_ is used as the encryption backend.
* `Python 3`_ is the language of the project.
@ -92,6 +94,7 @@ home.
.. _this one: http://www.brother.ca/en-CA/Scanners/11/ProductDetail/ADS1500W?ProductDetail=productdetail
.. _ImageMagick: http://imagemagick.org/
.. _Tesseract: https://github.com/tesseract-ocr
.. _Unpaper: https://www.flameeyes.eu/projects/unpaper
.. _GNU Privacy Guard: https://gnupg.org/
.. _Python 3: https://python.org/
.. _Pillow: https://pypi.python.org/pypi/pillowfight/
@ -105,4 +108,5 @@ home.
.. |Chat| image:: https://badges.gitter.im/danielquinn/paperless.svg
:alt: Join the chat at https://gitter.im/danielquinn/paperless
:target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
.. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master
:target: https://travis-ci.org/danielquinn/paperless

View File

@ -0,0 +1,15 @@
# Environment variables to set for Paperless
# Commented out variables will be replaced by a default within Paperless.
# Passphrase Paperless uses to encrypt and decrypt your documents
PAPERLESS_PASSPHRASE=CHANGE_ME
# The amount of threads to use for text recognition
# PAPERLESS_OCR_THREADS=4
# Additional languages to install for text recognition
# PAPERLESS_OCR_LANGUAGES=deu ita
# You can change the default user and group id to a custom one
# USERMAP_UID=1000
# USERMAP_GID=1000

View File

@ -0,0 +1,37 @@
version: '2'
services:
webserver:
image: paperless
ports:
# You can adapt the port you want Paperless to listen on by
# modifying the part before the `:`.
- "8000:8000"
volumes:
- data:/usr/src/paperless/data
- media:/usr/src/paperless/media
env_file: docker-compose.env
environment:
- PAPERLESS_OCR_LANGUAGES=
command: ["runserver", "0.0.0.0:8000"]
consumer:
image: paperless
volumes:
- data:/usr/src/paperless/data
- media:/usr/src/paperless/media
# You have to adapt the local path you want the consumption
# directory to mount to by modifying the part before the ':'.
- /path/to/arbitrary/place:/consume
# Likewise, you can add a local path to mount a directory for
# exporting. This is not strictly needed for paperless to
# function, only if you're exporting your files: uncomment
# it and fill in a local path if you know you're going to
# want to export your documents.
# - /path/to/another/arbitrary/place:/export
env_file: docker-compose.env
command: ["document_consumer"]
volumes:
data:
media:

18
docs/Dockerfile Normal file
View File

@ -0,0 +1,18 @@
# Builds the Sphinx documentation and serves the resulting static HTML.
FROM python:3.5.1
# NOTE(review): MAINTAINER is deprecated; prefer `LABEL maintainer="..."`.
MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
# Install Sphinx and Pygments
RUN pip install Sphinx Pygments
# Setup directories, copy data
RUN mkdir /build
COPY . /build
WORKDIR /build/docs
# Build documentation
RUN make html
# Start webserver
# python's http.server listens on 8000; publish with e.g. `-p 8001:8000`
# as described in docs/requirements.rst.
WORKDIR /build/docs/_build/html
EXPOSE 8000/tcp
CMD ["python3", "-m", "http.server"]

23
docs/api.rst Normal file
View File

@ -0,0 +1,23 @@
.. _api:
The REST API
############
Paperless makes use of the `Django REST Framework`_ standard API interface
because of its inherent awesomeness. Conveniently, the system is also
self-documenting, so to learn more about the access points, schema, what's
accepted and what isn't, you need only visit ``/api`` on your local Paperless
installation.
.. _Django REST Framework: http://django-rest-framework.org/
.. _api-uploading:
Uploading
---------
File uploads in an API are hard and so far as I've been able to tell, there's
no standard way of accepting them, so rather than crowbar file uploads into the
REST API and endure that headache, I've left that process to a simple HTTP
POST, documented on the :ref:`consumption page <consumption-http>`.

View File

@ -1,10 +1,51 @@
Changelog
#########
* 0.1.1
* Potentially **Breaking Change**: All references to "sender" in the code
have been renamed to "correspondent" to better reflect the nature of the
property (one could quite reasonably scan a document before sending it to
someone.)
* `#67`_: Rewrote the document exporter and added a new importer that allows
for full metadata retention without depending on the file name and
modification time. A big thanks to `Tikitu de Jager`_, `Pit`_,
`Florian Jung`_, and `Christopher Luu`_ for their code snippets and
contributing conversation that lead to this change.
* `#20`_: Added *unpaper* support to help in cleaning up the scanned image
before it's OCR'd. Thanks to `Pit`_ for this one.
* `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI.
* `#68`_: Added support for using a proper config file at
``/etc/paperless.conf`` and modified the systemd unit files to use it.
* Refactored the Vagrant installation process to use environment variables
rather than asking the user to modify ``settings.py``.
* `#44`_: Harmonise environment variable names with constant names.
* `#60`_: Setup logging to actually use the Python native logging framework.
* `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
to be imported but made unavailable.
* 0.1.0
* Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and
`Tikitu de Jager`_ for this one, and especially to `Pit`_
who spearheaded this effort.
* A simple REST API is in place, but it should be considered unstable.
* Cleaned up the consumer to use temporary directories instead of a single
scratch space. (Thanks `Pit`_)
* Improved the efficiency of the consumer by parsing pages more intelligently
and introducing a threaded OCR process (thanks again `Pit`_).
* `#45`_: Cleaned up the logic for tag matching. Reported by `darkmatter`_.
* `#47`_: Auto-rotate landscape documents. Reported by `Paul`_ and fixed by
`Pit`_.
* `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
* `#54`_: Documented the re-tagger (`zedster`_)
* `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
* Added tox with pep8 checking
* 0.0.6
* Added support for parallel OCR (significant work from pitkley)
* Sped up the language detection (significant work from pitkley)
* Added support for parallel OCR (significant work from `Pit`_)
* Sped up the language detection (significant work from `Pit`_)
* Added simple logging
* 0.0.5
@ -35,3 +76,26 @@ Changelog
* 0.0.1
* Initial release
.. _Brian Conn: https://github.com/TheConnMan
.. _Christopher Luu: https://github.com/nuudles
.. _Florian Jung: https://github.com/the01
.. _Tikitu de Jager: https://github.com/tikitu
.. _Paul: https://github.com/polo2ro
.. _Pit: https://github.com/pitkley
.. _Wayne Werner: https://github.com/waynew
.. _darkmatter: https://github.com/darkmatter
.. _zedster: https://github.com/zedster
.. _#20: https://github.com/danielquinn/paperless/issues/20
.. _#44: https://github.com/danielquinn/paperless/issues/44
.. _#45: https://github.com/danielquinn/paperless/issues/45
.. _#47: https://github.com/danielquinn/paperless/issues/47
.. _#48: https://github.com/danielquinn/paperless/issues/48
.. _#53: https://github.com/danielquinn/paperless/issues/53
.. _#54: https://github.com/danielquinn/paperless/issues/54
.. _#57: https://github.com/danielquinn/paperless/issues/57
.. _#60: https://github.com/danielquinn/paperless/issues/60
.. _#67: https://github.com/danielquinn/paperless/issues/67
.. _#68: https://github.com/danielquinn/paperless/issues/68
.. _#71: https://github.com/danielquinn/paperless/issues/71

View File

@ -40,14 +40,14 @@ follow the :ref:`consumer <utilities-consumer>` instructions to get it running.
A Note on File Naming
---------------------
Any document you put into the consumption directory will be consumed, but if you
name the file right, it'll automatically set some values in the database for
you. This is the logic the consumer follows:
Any document you put into the consumption directory will be consumed, but if
you name the file right, it'll automatically set some values in the database
for you. This is the logic the consumer follows:
1. Try to find the sender, title, and tags in the file name following the
pattern: ``Sender - Title - tag,tag,tag.pdf``.
2. If that doesn't work, try to find the sender and title in the file name
following the pattern: ``Sender - Title.pdf``.
1. Try to find the correspondent, title, and tags in the file name following
the pattern: ``Correspondent - Title - tag,tag,tag.pdf``.
2. If that doesn't work, try to find the correspondent and title in the file
name following the pattern: ``Correspondent - Title.pdf``.
3. If that doesn't work, just assume that the name of the file is the title.
So given the above, the following examples would work as you'd expect:
@ -97,9 +97,9 @@ So, with all that in mind, here's what you do to get it running:
the configured email account every 10 minutes for something new and pull down
whatever it finds.
4. Send yourself an email! Note that the subject is treated as the file name,
so if you set the subject to ``Sender - Title - tag,tag,tag``, you'll get
what you expect. Also, you must include the aforementioned secret string in
every email so the fetcher knows that it's safe to import.
so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll
get what you expect. Also, you must include the aforementioned secret
string in every email so the fetcher knows that it's safe to import.
5. After a few minutes, the consumer will poll your mailbox, pull down the
message, and place the attachment in the consumption directory with the
appropriate name. A few minutes later, the consumer will import it like any
@ -111,23 +111,22 @@ So, with all that in mind, here's what you do to get it running:
HTTP POST
=========
Currently, the API is limited to only handling file uploads, it doesn't do tags
yet, and the URL schema isn't concrete, but it's a start. It's also not much of
a real API, it's just a URL that accepts an HTTP POST.
You can also submit a document via HTTP POST. It doesn't do tags yet, and the
URL schema isn't concrete, but it's a start.
To push your document to *Paperless*, send an HTTP POST to the server with the
To push your document to Paperless, send an HTTP POST to the server with the
following name/value pairs:
* ``sender``: The name of the document's sender. Note that there are
restrictions on what characters you can use here. Specifically, alphanumeric
characters, `-`, `,`, `.`, and `'` are ok, everything else is out. You also
can't use the sequence ` - ` (space, dash, space).
* ``correspondent``: The name of the document's correspondent. Note that there
are restrictions on what characters you can use here. Specifically,
alphanumeric characters, `-`, `,`, `.`, and `'` are ok, everything else is
out. You also can't use the sequence ` - ` (space, dash, space).
* ``title``: The title of the document. The rules for characters are the same
here as the sender.
* ``signature``: For security reasons, we have the sender send a signature using
a "shared secret" method to make sure that random strangers don't start
uploading stuff to your server. The means of generating this signature is
defined below.
here as the correspondent.
* ``signature``: For security reasons, we have the correspondent send a
signature using a "shared secret" method to make sure that random strangers
don't start uploading stuff to your server. The means of generating this
signature is defined below.
Specify ``enctype="multipart/form-data"``, and then POST your file with:::
@ -146,12 +145,12 @@ verification.
In the case of *Paperless*, you configure the server with the secret by setting
``UPLOAD_SHARED_SECRET``. Then on your client, you generate your signature by
concatenating the sender, title, and the secret, and then using sha256 to
generate a hexdigest.
concatenating the correspondent, title, and the secret, and then using sha256
to generate a hexdigest.
If you're using Python, this is what that looks like:
.. code:: python
from hashlib import sha256
signature = sha256(sender + title + secret).hexdigest()
signature = sha256(correspondent + title + secret).hexdigest()

View File

@ -30,6 +30,7 @@ Contents
requirements
setup
consumption
api
utilities
migrating
changelog

View File

@ -4,31 +4,10 @@ Migrating, Updates, and Backups
===============================
As *Paperless* is still under active development, there's a lot that can change
as software updates roll out. The thing you just need to remember for all of
this is that for the most part, **the database is expendable** so long as you
have your files. This is because the file name of the exported files includes
the name of the sender, the title, and the tags (if any) on each file.
.. _migrating-updates:
Updates
-------
For the most part, all you have to do to update *Paperless* is run ``git pull``
on the directory containing the project files, and then use Django's ``migrate``
command to execute any database schema updates that might have been rolled in
as part of the update:
.. code:: bash
$ cd /path/to/project
$ git pull
$ cd src
$ ./manage.py migrate
Note that it's possible (even likely) that while ``git pull`` may update some
files, the ``migrate`` step may not update anything. This is totally normal.
as software updates roll out. You should backup often, so if anything goes
wrong during an update, you at least have a means of restoring to something
usable. Thankfully, there are automated ways of backing up, restoring, and
updating the software.
.. _migrating-backup:
@ -38,20 +17,8 @@ Backing Up
So you're bored of this whole project, or you want to make a remote backup of
the unencrypted files for whatever reason. This is easy to do, simply use the
:ref:`exporter <utilities-exporter>` to dump your documents out into an
arbitrary directory.
Additionally however, you'll need to back up the tags themselves. The file
names contain the tag names, but you still need to define the tags and their
matching algorithms in the database for things to work properly. We do this
with Django's ``dumpdata`` command, which produces JSON output.
.. code:: bash
$ cd /path/to/project
$ cd src
$ ./manage.py document_export /path/to/arbitrary/place/
$ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json
:ref:`exporter <utilities-exporter>` to dump your documents and database out
into an arbitrary directory.
.. _migrating-restoring:
@ -66,7 +33,7 @@ create an empty database (just follow the
``tags.json`` file you created as part of your backup. Lastly, copy your
exported documents into the consumption directory and start up the consumer.
.. code:: bash
.. code-block:: shell-session
$ cd /path/to/project
$ rm data/db.sqlite3 # Delete the database
@ -77,3 +44,60 @@ exported documents into the consumption directory and start up the consumer.
$ cp /path/to/exported/docs/* /path/to/consumption/dir/
$ ./manage.py document_consumer
Importing your data if you are :ref:`using Docker <setup-installation-docker>`
is almost as simple:
.. code-block:: shell-session
# Stop and remove your current containers
$ docker-compose stop
$ docker-compose rm -f
# Recreate them, add the superuser
$ docker-compose up -d
$ docker-compose run --rm webserver createsuperuser
# Load the tags
$ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin -
# Load your exported documents into the consumption directory
# (How you do this highly depends on how you have set this up)
$ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/
After loading the documents into the consumption directory the consumer will
immediately start consuming the documents.
.. _migrating-updates:
Updates
-------
For the most part, all you have to do to update *Paperless* is run ``git pull``
on the directory containing the project files, and then use Django's ``migrate``
command to execute any database schema updates that might have been rolled in
as part of the update:
.. code-block:: shell-session
$ cd /path/to/project
$ git pull
$ cd src
$ ./manage.py migrate
Note that it's possible (even likely) that while ``git pull`` may update some
files, the ``migrate`` step may not update anything. This is totally normal.
If you are :ref:`using Docker <setup-installation-docker>` the update process
requires only one additional step:
.. code-block:: shell-session
$ cd /path/to/project
$ git pull
$ docker build -t paperless .
$ docker-compose up -d
$ docker-compose run --rm webserver migrate
If ``git pull`` doesn't report any changes, there is no need to continue with
the remaining steps.

View File

@ -10,11 +10,13 @@ should work) that has the following software installed on it:
* `GNU Privacy Guard`_
* `Tesseract`_
* `Imagemagick`_
* `unpaper`_
.. _Python3: https://python.org/
.. _GNU Privacy Guard: https://gnupg.org
.. _Tesseract: https://github.com/tesseract-ocr
.. _Imagemagick: http://imagemagick.org/
.. _unpaper: https://www.flameeyes.eu/projects/unpaper
Notably, you should confirm how you access your Python3 installation. Many
Linux distributions will install Python3 in parallel to Python2, using the names
@ -101,3 +103,16 @@ you'd like to generate your own docs locally, you'll need to:
$ pip install sphinx
and then cd into the ``docs`` directory and type ``make html``.
If you are using Docker, you can use the following commands to build the
documentation and run a webserver serving it on `port 8001`_:
.. code:: bash
$ pwd
/path/to/paperless
$ docker build -t paperless:docs -f docs/Dockerfile .
$ docker run --rm -it -p "8001:8000" paperless:docs
.. _port 8001: http://127.0.0.1:8001

View File

@ -37,11 +37,19 @@ or just download the tarball and go that route:
Installation & Configuration
----------------------------
You can go two routes with setting up and running Paperless. The *Vagrant*
route is quick & easy, but means you're running a VM which comes with memory
consumption etc. Alternatively the standard, "bare metal" approach is a little
more complicated.
You can go multiple routes with setting up and running Paperless. The `Vagrant
route`_ is quick & easy, but means you're running a VM which comes with memory
consumption etc. We also `support Docker`_, which you can use natively under
Linux and in a VM with `Docker Machine`_ (this guide was written for native
Docker usage under Linux, you might have to adapt it for Docker Machine.)
Alternatively the standard, `bare metal`_ approach is a little more complicated,
but worth it because it makes it easier should you want to contribute some
code back.
.. _Vagrant route: setup-installation-vagrant_
.. _support Docker: setup-installation-docker_
.. _bare metal: setup-installation-standard_
.. _Docker Machine: https://docs.docker.com/machine/
.. _setup-installation-standard:
@ -91,33 +99,188 @@ Vagrant Method
2. Run ``vagrant up``. An instance will start up for you. When it's ready and
provisioned...
3. Run ``vagrant ssh`` and once inside your new vagrant box, edit
``/opt/paperless/src/paperless/settings.py`` and set the values for:
* ``CONSUMPTION_DIR``: this is where your documents will be dumped to be
consumed by Paperless.
* ``PASSPHRASE``: this is the passphrase Paperless uses to encrypt/decrypt
the original document. The default value attempts to source the
passphrase from the environment, so if you don't set it to a static value
here, you must set ``PAPERLESS_PASSPHRASE=some-secret-string`` on the
command line whenever invoking the consumer or webserver.
4. Initialise the database with ``/opt/paperless/src/manage.py migrate``.
5. Still inside your vagrant box, create a user for your Paperless instance with
``/opt/paperless/src/manage.py createsuperuser``. Follow the prompts to
``/etc/paperless.conf`` and set the values for:
* ``PAPERLESS_CONSUMPTION_DIR``: this is where your documents will be
dumped to be consumed by Paperless.
* ``PAPERLESS_PASSPHRASE``: this is the passphrase Paperless uses to
encrypt/decrypt the original document.
* ``PAPERLESS_SHARED_SECRET``: this is the "magic word" used when consuming
documents from mail or via the API. If you don't use either, leaving it
blank is just fine.
4. Exit the vagrant box and re-enter it with ``vagrant ssh`` again. This
updates the environment to make use of the changes you made to the config
file.
5. Initialise the database with ``/opt/paperless/src/manage.py migrate``.
6. Still inside your vagrant box, create a user for your Paperless instance
with ``/opt/paperless/src/manage.py createsuperuser``. Follow the prompts to
create your user.
6. Start the webserver with ``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``.
You should now be able to visit your (empty) `Paperless webserver`_ at
``172.28.128.4:8000``. You can login with the user/pass you created in #5.
7. In a separate window, run ``vagrant ssh`` again, but this time once inside
7. Start the webserver with
``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``. You should now be
able to visit your (empty) `Paperless webserver`_ at ``172.28.128.4:8000``.
You can login with the user/pass you created in #6.
8. In a separate window, run ``vagrant ssh`` again, but this time once inside
your vagrant instance, you should start the consumer script with
``/opt/paperless/src/manage.py document_consumer``.
8. Scan something. Put it in the ``CONSUMPTION_DIR``.
9. Wait a few minutes
10. Visit the document list on your webserver, and it should be there, indexed
9. Scan something. Put it in the ``CONSUMPTION_DIR``.
10. Wait a few minutes
11. Visit the document list on your webserver, and it should be there, indexed
and downloadable.
.. _Vagrant: https://vagrantup.com/
.. _Paperless server: http://172.28.128.4:8000
.. _setup-installation-docker:
Docker Method
.............
1. Install `Docker`_.
.. caution::
As mentioned earlier, this guide assumes that you use Docker natively
under Linux. If you are using `Docker Machine`_ under Mac OS X or Windows,
you will have to adapt IP addresses, volume-mounting, command execution
and maybe more.
2. Install `docker-compose`_. [#compose]_
.. caution::
If you want to use the included ``docker-compose.yml.example`` file, you
need to have at least Docker version **1.10.0** and docker-compose
version **1.6.0**.
See the `Docker installation guide`_ on how to install the current
version of Docker for your operating system or Linux distribution of
choice. To get an up-to-date version of docker-compose, follow the
`docker-compose installation guide`_ if your package repository doesn't
include it.
.. _Docker installation guide: https://docs.docker.com/engine/installation/
.. _docker-compose installation guide: https://docs.docker.com/compose/install/
3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml`` and
a copy of ``docker-compose.env.example`` as ``docker-compose.env``. You'll be
editing both these files: taking a copy ensures that you can ``git pull`` to
receive updates without risking merge conflicts with your modified versions
of the configuration files.
4. Modify ``docker-compose.yml`` to your preferences, following the instructions
in comments in the file. The only change that is a hard requirement is to
specify where the consumption directory should mount.
5. Modify ``docker-compose.env`` and adapt the following environment variables:
``PAPERLESS_PASSPHRASE``
This is the passphrase Paperless uses to encrypt/decrypt the original
document.
``PAPERLESS_OCR_THREADS``
This is the number of threads the OCR process will spawn to process
document pages in parallel. If the variable is not set, Python determines
the core-count of your CPU and uses that value.
``PAPERLESS_OCR_LANGUAGES``
If you want the OCR to recognize other languages in addition to the default
English, set this parameter to a space separated list of three-letter
language-codes after `ISO 639-2/T`_. For a list of available languages --
including their three letter codes -- see the `Debian packagelist`_.
``USERMAP_UID`` and ``USERMAP_GID``
If you want to mount the consumption volume (directory ``/consume`` within
the containers) to a host-directory -- which you probably want to do --
access rights might be an issue. The default user and group ``paperless``
in the containers have an id of 1000. The containers will enforce that the
owning group of the consumption directory will be ``paperless`` to be able
to delete consumed documents. If your host-system has a group with an id of
1000 and you don't want this group to have access rights to the consumption
directory, you can use ``USERMAP_GID`` to change the id in the container
and thus the one of the consumption directory. Furthermore, you can change
the id of the default user as well using ``USERMAP_UID``.
6. Run ``docker-compose up -d``. This will create and start the necessary
containers.
7. To be able to login, you will need a super user. To create it, execute the
following command:
.. code-block:: shell-session
$ docker-compose run --rm webserver createsuperuser
This will prompt you to set a username (default ``paperless``), an optional
e-mail address and finally a password.
8. The default ``docker-compose.yml`` exports the webserver on your local port
8000. If you haven't adapted this, you should now be able to visit your
`Paperless webserver`_ at ``http://127.0.0.1:8000``. You can login with the
user and password you just created.
9. Add files to consumption directory the way you prefer to. Following are two
possible options:
1. Mount the consumption directory to a local host path by modifying your
``docker-compose.yml``:
.. code-block:: diff
diff --git a/docker-compose.yml b/docker-compose.yml
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -17,9 +18,8 @@ services:
volumes:
- paperless-data:/usr/src/paperless/data
- paperless-media:/usr/src/paperless/media
- - /consume
+ - /local/path/you/choose:/consume
.. danger::
While the consumption container will ensure at startup that it can
**delete** a consumed file from a host-mounted directory, it might not
be able to **read** the document in the first place if the access
rights to the file are incorrect.
Make sure that the documents you put into the consumption directory
will either be readable by everyone (``chmod o+r file.pdf``) or
readable by the default user or group id 1000 (or the one you have set
with ``USERMAP_UID`` or ``USERMAP_GID`` respectively).
2. Use ``docker cp`` to copy your files directly into the container:
.. code-block:: shell-session
$ # Identify your containers
$ docker-compose ps
Name Command State Ports
-------------------------------------------------------------------------
paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0
paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0
$ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume
``docker cp`` is a one-shot-command, just like ``cp``. This means that
every time you want to consume a new document, you will have to execute
``docker cp`` again. You can of course automate this process, but option 1
is generally the preferred one.
.. danger::
``docker cp`` will change the owning user and group of a copied file
to the acting user at the destination, which will be ``root``.
You therefore need to ensure that the documents you want to copy into
the container are readable by everyone (``chmod o+r file.pdf``) before
copying them.
.. _Docker: https://www.docker.com/
.. _docker-compose: https://docs.docker.com/compose/install/
.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr-
.. [#compose] You of course don't have to use docker-compose, but it
simplifies deployment immensely. If you know your way around Docker, feel
free to tinker around without using compose!
.. _making-things-a-little-more-permanent:
Making Things a Little more Permanent
@ -126,5 +289,9 @@ Making Things a Little more Permanent
Once you've tested things and are happy with the work flow, you can automate the
process of starting the webserver and consumer. If you're running
on a bare metal system that's using Systemd, you can use the service unit files
in the ``scripts`` directory to set this up. If you're on a SysV or other
startup system (like the Vagrant box), then you're currently on your own.
in the ``scripts`` directory to set this up. If you're on another startup
system or are using a Vagrant box, then you're currently on your own. If you are
using Docker, you can set a restart-policy_ in the ``docker-compose.yml`` to
have the containers automatically start with the Docker daemon.
.. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart

View File

@ -26,7 +26,7 @@ How to Use It
The webserver is started via the ``manage.py`` script:
.. code:: bash
.. code-block:: shell-session
$ /path/to/paperless/src/manage.py runserver
@ -64,7 +64,7 @@ How to Use It
The consumer is started via the ``manage.py`` script:
.. code:: bash
.. code-block:: shell-session
$ /path/to/paperless/src/manage.py document_consumer
@ -95,13 +95,110 @@ How to Use It
This too is done via the ``manage.py`` script:
.. code-block:: shell-session
$ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/
This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you
to do with as you please. The files are accompanied with a special file,
``manifest.json`` which can be used to
:ref:`import the files <utilities-importer>` at a later date if you wish.
.. _utilities-exporter-howto-docker:
Docker
______
If you are :ref:`using Docker <setup-installation-docker>`, running the
exporter is almost as easy. To mount a volume for exports, follow the
instructions in the ``docker-compose.yml.example`` file for the ``/export``
volume (making the changes in your own ``docker-compose.yml`` file, of course).
Once you have the volume mounted, the command to run an export is:
.. code-block:: shell-session
$ docker-compose run --rm consumer document_exporter /export
If you prefer to use ``docker run`` directly, you can supply the necessary
commandline options yourself:
.. code-block:: shell-session
$ # Identify your containers
$ docker-compose ps
Name Command State Ports
-------------------------------------------------------------------------
paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0
paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0
$ # Make sure to replace your passphrase and remove or adapt the id mapping
$ docker run --rm \
--volumes-from paperless_data_1 \
--volume /path/to/arbitrary/place:/export \
-e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \
-e USERMAP_UID=1000 -e USERMAP_GID=1000 \
paperless document_exporter /export
.. _utilities-importer:
The Importer
------------
Looking to transfer Paperless data from one instance to another, or just want
to restore from a backup? This is your go-to toy.
.. _utilities-importer-howto:
How to Use It
.............
The importer works just like the exporter. You point it at a directory, and
the script does the rest of the work:
.. code-block:: shell-session
$ /path/to/paperless/src/manage.py document_importer /path/to/somewhere/
Docker
______
Assuming that you've already gone through the steps above in the
:ref:`export <utilities-exporter-howto-docker>` section, then the easiest thing
to do is just re-use the ``/export`` path you already setup:
.. code-block:: shell-session
$ docker-compose run --rm consumer document_importer /export
Similarly, if you're not using docker-compose, you can adjust the export
instructions above to do the import.
.. _utilities-retagger:
The Re-tagger
-------------
Say you've imported a few hundred documents and now want to introduce a tag
and apply its matching to all of the currently-imported docs. This problem is
common enough that there's a tool for it.
.. _utilities-retagger-howto:
How to Use It
.............
This too is done via the ``manage.py`` script:
.. code:: bash
$ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere
$ /path/to/paperless/src/manage.py document_retagger
This will dump all of your PDFs into ``/path/to/somewhere`` for you to do with
as you please. The naming scheme on export is identical to that used for
import, so you can now safely delete the entire project directory,
database, encrypted PDFs and all, and later create it all again simply by
running the consumer again and dumping all of these files into
``CONSUMPTION_DIR``.
That's it. It'll loop over all of the documents in your database and attempt
to match all of your tags to them. If one matches, it'll be applied. And
don't worry, you can run this as often as you like; it won't double-tag
a document.

33
paperless.conf.example Normal file
View File

@ -0,0 +1,33 @@
# Sample paperless.conf
# Copy this file to /etc/paperless.conf and modify it to suit your needs.
# This is where your documents should go to be consumed. Make sure that it exists
# and that the user running the paperless service can read/write its contents
# before you start Paperless.
PAPERLESS_CONSUMPTION_DIR=""
# These values are required if you want paperless to check a particular email
# box every 10 minutes and attempt to consume documents from there. If you
# don't define a HOST, mail checking will just be disabled.
PAPERLESS_CONSUME_MAIL_HOST=""
PAPERLESS_CONSUME_MAIL_PORT=""
PAPERLESS_CONSUME_MAIL_USER=""
PAPERLESS_CONSUME_MAIL_PASS=""
# You must have a passphrase in order for Paperless to work at all. If you set
# this to "", GnuPG will "encrypt" your PDF by writing it out as a zero-byte
# file.
#
# The passphrase you use here will be used when storing your documents in
# Paperless, but you can always export them in an unencrypted format by using
# the document exporter. See the documentation for more information.
#
# One final note about the passphrase. Once you've consumed a document with
# one passphrase, DON'T CHANGE IT. Paperless assumes this to be a constant and
# can't properly export documents that were encrypted with an old passphrase if
# you've since changed it to a new one.
PAPERLESS_PASSPHRASE="secret"
# If you intend to consume documents either via HTTP POST or by email, you must
# have a shared secret here.
PAPERLESS_SHARED_SECRET=""

View File

@ -1,8 +1,10 @@
Django==1.9
Django==1.9.2
django-extensions==1.6.1
djangorestframework==3.3.2
python-dotenv==0.3.0
filemagic==1.6
langdetect==1.0.5
Pillow==3.0.0
Pillow==3.1.1
pyocr==0.3.1
python-dateutil==2.4.2
python-gnupg==0.3.8

View File

@ -0,0 +1,74 @@
#!/bin/bash
set -e
# Source: https://github.com/sameersbn/docker-gitlab/
map_uidgid() {
    # Remap the in-image paperless user/group to the UID/GID supplied via
    # the USERMAP_UID / USERMAP_GID environment variables, so files written
    # to bind-mounted volumes end up owned by the host user.
    USERMAP_ORIG_UID=$(id -u paperless)
    # BUG FIX: this line previously assigned `id -g` to USERMAP_ORIG_UID,
    # clobbering the UID and leaving USERMAP_ORIG_GID forever unset.
    USERMAP_ORIG_GID=$(id -g paperless)
    # GID falls back to the requested UID (common convention), then to the
    # image's original GID; UID falls back to the image's original UID.
    USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}}
    USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
    if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then
        echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
        groupmod -g "${USERMAP_GID}" paperless
        # BUG FIX: the match pattern must use the ORIGINAL GID still present
        # in /etc/passwd (groupmod only edits /etc/group); it previously used
        # the NEW GID and therefore never matched when the GID was remapped.
        sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_ORIG_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
    fi
}
set_permissions() {
    # Let the paperless group traverse the consumption directory so the
    # consumer process can pick up documents dropped into it.
    chgrp paperless "$PAPERLESS_CONSUMPTION_DIR"
    chmod g+x "$PAPERLESS_CONSUMPTION_DIR"

    # Hand the whole application tree to the (possibly UID/GID-remapped)
    # paperless user. -h changes ownership of symlinks themselves rather
    # than following them.
    chown -Rh paperless:paperless /usr/src/paperless
}
initialize() {
    # Container start-up: apply any UID/GID remapping requested via the
    # environment, then fix file ownership/permissions to match.
    map_uidgid
    set_permissions
}
install_languages() {
    # Install extra tesseract OCR language packs.
    # $1: space-separated list of language suffixes (e.g. "deu fra"); each
    #     maps to the Debian package "tesseract-ocr-<lang>". Already
    #     installed or unknown packages are skipped silently.
    local langs="$1"
    read -ra langs <<<"$langs"

    # Nothing requested -> nothing to do.
    if [ ${#langs[@]} -eq 0 ]; then
        return
    fi

    # Refresh the package lists (they were removed when the image was built).
    apt-get update

    for lang in "${langs[@]}"; do
        pkg="tesseract-ocr-$lang"
        # Skip packages that are already installed.
        # BUG FIX: redirection order was "2>&1 > /dev/null", which leaked
        # stderr to the console instead of silencing it.
        if dpkg -s "$pkg" > /dev/null 2>&1; then
            continue
        fi
        # Skip names that don't exist in the configured repositories.
        if ! apt-cache show "$pkg" > /dev/null 2>&1; then
            continue
        fi
        # BUG FIX: -y is required — without it apt-get prompts for
        # confirmation and aborts in this non-interactive container.
        apt-get install -y --no-install-recommends "$pkg"
    done

    # Keep the image lean: drop the apt lists fetched above.
    rm -rf /var/lib/apt/lists/*
}
# Dispatch: if the first argument is not an absolute path, treat it as a
# manage.py subcommand (e.g. "document_consumer") — set up the container,
# then run it as the unprivileged paperless user. Otherwise execute the
# given command verbatim (e.g. an interactive shell).
if [[ "$1" != "/"* ]]; then
    initialize

    # Install additional languages if specified
    if [ ! -z "$PAPERLESS_OCR_LANGUAGES" ]; then
        install_languages "$PAPERLESS_OCR_LANGUAGES"
    fi

    # exec replaces this shell as PID 1; sudo -HEu drops to the paperless
    # user while preserving the environment (needed for PAPERLESS_* vars).
    exec sudo -HEu paperless "/usr/src/paperless/src/manage.py" "$@"
fi

exec "$@"

View File

@ -2,10 +2,9 @@
Description=Paperless consumer
[Service]
EnvironmentFile=/etc/conf.d/paperless
User=paperless
Group=paperless
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer -v $PAPERLESS_CONSUMPTION_VERBOSITY
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer
[Install]
WantedBy=multi-user.target

View File

@ -2,7 +2,6 @@
Description=Paperless webserver
[Service]
EnvironmentFile=/etc/conf.d/paperless
User=paperless
Group=paperless
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py runserver 0.0.0.0:8000

View File

@ -1,13 +1,31 @@
#!/bin/bash
# install packages
sudo apt-get update
sudo apt-get build-dep -y python-imaging
sudo apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
sudo apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
sudo apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick
# Install packages
apt-get update
apt-get build-dep -y python-imaging
apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick unpaper
# setup python project
pushd /opt/paperless
sudo pip3 install -r requirements.txt
popd
# Python dependencies
pip3 install -r /opt/paperless/requirements.txt
# Create the environment file
cat /opt/paperless/paperless.conf.example | sed -e 's#CONSUMPTION_DIR=""#CONSUMPTION_DIR="/home/vagrant/consumption"#' > /etc/paperless.conf
chmod 0640 /etc/paperless.conf
chown root:vagrant /etc/paperless.conf
# Create the consumption directory
mkdir /home/vagrant/consumption
chown vagrant:vagrant /home/vagrant/consumption
echo "
Now follow the remaining steps in the Vagrant section of the setup
documentation to complete the process:
http://paperless.readthedocs.org/en/latest/setup.html#setup-installation-vagrant
"

View File

@ -3,7 +3,7 @@ from django.contrib.auth.models import User, Group
from django.core.urlresolvers import reverse
from django.templatetags.static import static
from .models import Sender, Tag, Document
from .models import Correspondent, Tag, Document, Log
class MonthListFilter(admin.SimpleListFilter):
@ -45,39 +45,73 @@ class DocumentAdmin(admin.ModelAdmin):
"all": ("paperless.css",)
}
search_fields = ("sender__name", "title", "content")
list_display = ("created", "sender", "title", "tags_", "document")
list_filter = ("tags", "sender", MonthListFilter)
search_fields = ("correspondent__name", "title", "content")
list_display = ("created_", "correspondent", "title", "tags_", "document")
list_filter = ("tags", "correspondent", MonthListFilter)
list_per_page = 25
def created_(self, obj):
return obj.created.date().strftime("%Y-%m-%d")
def tags_(self, obj):
r = ""
for tag in obj.tags.all():
r += '<a class="tag" style="background-color: {};" href="{}">{}</a>'.format(
tag.get_colour_display(),
"{}?tags__id__exact={}".format(
reverse("admin:documents_document_changelist"),
tag.pk
),
tag.slug
colour = tag.get_colour_display()
r += self._html_tag(
"a",
tag.slug,
**{
"class": "tag",
"style": "background-color: {};".format(colour),
"href": "{}?tags__id__exact={}".format(
reverse("admin:documents_document_changelist"),
tag.pk
)
}
)
return r
tags_.allow_tags = True
def document(self, obj):
return '<a href="{}">' \
'<img src="{}" width="22" height="22" alt="{} icon" title="{}">' \
'</a>'.format(
obj.download_url,
static("documents/img/{}.png".format(obj.file_type)),
obj.file_type,
obj.file_name
)
return self._html_tag(
"a",
self._html_tag(
"img",
src=static("documents/img/{}.png".format(obj.file_type)),
width=22,
height=22,
alt=obj.file_type,
title=obj.file_name
),
href=obj.download_url
)
document.allow_tags = True
admin.site.register(Sender)
@staticmethod
def _html_tag(kind, inside=None, **kwargs):
attributes = []
for lft, rgt in kwargs.items():
attributes.append('{}="{}"'.format(lft, rgt))
if inside is not None:
return "<{kind} {attributes}>{inside}</{kind}>".format(
kind=kind, attributes=" ".join(attributes), inside=inside)
return "<{} {}/>".format(kind, " ".join(attributes))
class LogAdmin(admin.ModelAdmin):
list_display = ("message", "level", "component")
list_filter = ("level", "component",)
admin.site.register(Correspondent)
admin.site.register(Tag, TagAdmin)
admin.site.register(Document, DocumentAdmin)
admin.site.register(Log, LogAdmin)
# Unless we implement multi-user, these default registrations don't make sense.
admin.site.unregister(Group)

View File

@ -1,5 +1,8 @@
import datetime
import logging
import tempfile
import uuid
from multiprocessing.pool import Pool
import itertools
@ -17,20 +20,14 @@ from PIL import Image
from django.conf import settings
from django.utils import timezone
from django.template.defaultfilters import slugify
from pyocr.tesseract import TesseractError
from logger.models import Log
from paperless.db import GnuPG
from .models import Sender, Tag, Document
from .models import Correspondent, Tag, Document, Log
from .languages import ISO639
def image_to_string(args):
self, png, lang = args
with Image.open(os.path.join(self.SCRATCH, png)) as f:
return self.OCR.image_to_string(f, lang=lang)
class OCRError(Exception):
pass
@ -42,8 +39,8 @@ class ConsumerError(Exception):
class Consumer(object):
"""
Loop over every file found in CONSUMPTION_DIR and:
1. Convert it to a greyscale png
2. Use tesseract on the png
1. Convert it to a greyscale pnm
2. Use tesseract on the pnm
3. Encrypt and store the document in the MEDIA_ROOT
4. Store the OCR'd text in the database
5. Delete the document and image(s)
@ -51,28 +48,29 @@ class Consumer(object):
SCRATCH = settings.SCRATCH_DIR
CONVERT = settings.CONVERT_BINARY
UNPAPER = settings.UNPAPER_BINARY
CONSUME = settings.CONSUMPTION_DIR
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
OCR = pyocr.get_available_tools()[0]
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
REGEX_TITLE = re.compile(
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
REGEX_SENDER_TITLE = re.compile(
REGEX_CORRESPONDENT_TITLE = re.compile(
r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
REGEX_SENDER_TITLE_TAGS = re.compile(
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
def __init__(self, verbosity=1):
def __init__(self):
self.verbosity = verbosity
self.logger = logging.getLogger(__name__)
self.logging_group = None
try:
os.makedirs(self.SCRATCH)
@ -92,6 +90,12 @@ class Consumer(object):
raise ConsumerError(
"Consumption directory {} does not exist".format(self.CONSUME))
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group,
"component": Log.COMPONENT_CONSUMER
})
def consume(self):
for doc in os.listdir(self.CONSUME):
@ -110,122 +114,156 @@ class Consumer(object):
if self._is_ready(doc):
continue
Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)
self.logging_group = uuid.uuid4()
self.log("info", "Consuming {}".format(doc))
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
pngs = self._get_greyscale(tempdir, doc)
imgs = self._get_greyscale(tempdir, doc)
thumbnail = self._get_thumbnail(tempdir, doc)
try:
text = self._get_ocr(pngs)
self._store(text, doc)
except OCRError:
text = self._get_ocr(imgs)
self._store(text, doc, thumbnail)
except OCRError as e:
self._ignore.append(doc)
Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
self._cleanup_tempdir(tempdir)
continue
finally:
self._cleanup(tempdir, doc)
else:
self._cleanup_tempdir(tempdir)
self._cleanup_doc(doc)
def _get_greyscale(self, tempdir, doc):
"""
Greyscale images are easier for Tesseract to OCR
"""
Log.debug(
"Generating greyscale image from {}".format(doc),
Log.COMPONENT_CONSUMER
)
png = os.path.join(tempdir, "convert-%04d.jpg")
self.log("info", "Generating greyscale image from {}".format(doc))
# Convert PDF to multiple PNMs
pnm = os.path.join(tempdir, "convert-%04d.pnm")
subprocess.Popen((
self.CONVERT, "-density", "300", "-depth", "8",
"-type", "grayscale", doc, png
"-type", "grayscale", doc, pnm
)).wait()
pngs = [os.path.join(tempdir, f) for f in os.listdir(tempdir) if f.startswith("convert")]
return sorted(filter(lambda f: os.path.isfile(f), pngs))
# Get a list of converted images
pnms = []
for f in os.listdir(tempdir):
if f.endswith(".pnm"):
pnms.append(os.path.join(tempdir, f))
@staticmethod
def _guess_language(text):
# Run unpaper in parallel on converted images
with Pool(processes=self.THREADS) as pool:
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
# Return list of converted images, processed with unpaper
pnms = []
for f in os.listdir(tempdir):
if f.endswith(".unpaper.pnm"):
pnms.append(os.path.join(tempdir, f))
return sorted(filter(lambda __: os.path.isfile(__), pnms))
def _get_thumbnail(self, tempdir, doc):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
self.log("info", "Generating the thumbnail")
subprocess.Popen((
self.CONVERT,
"-scale", "500x5000",
"-alpha", "remove",
doc,
os.path.join(tempdir, "convert-%04d.png")
)).wait()
return os.path.join(tempdir, "convert-0000.png")
def _guess_language(self, text):
try:
guess = langdetect.detect(text)
Log.debug(
"Language detected: {}".format(guess),
Log.COMPONENT_CONSUMER
)
self.log("debug", "Language detected: {}".format(guess))
return guess
except Exception as e:
Log.warning(
"Language detection error: {}".format(e), Log.COMPONENT_MAIL)
self.log("warning", "Language detection error: {}".format(e))
def _get_ocr(self, pngs):
def _get_ocr(self, imgs):
"""
Attempts to do the best job possible OCR'ing the document based on
simple language detection trial & error.
"""
if not pngs:
raise OCRError
if not imgs:
raise OCRError("No images found")
Log.debug("OCRing the document", Log.COMPONENT_CONSUMER)
self.log("info", "OCRing the document")
# Since the division gets rounded down by int, this calculation works
# for every edge-case, i.e. 1
middle = int(len(pngs) / 2)
raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)
middle = int(len(imgs) / 2)
raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
guessed_language = self._guess_language(raw_text)
if not guessed_language or guessed_language not in ISO639:
Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER)
self.log("warning", "Language detection failed!")
if settings.FORGIVING_OCR:
Log.warning(
"As FORGIVING_OCR is enabled, we're going to make the best "
"with what we have.",
Log.COMPONENT_CONSUMER
self.log(
"warning",
"As FORGIVING_OCR is enabled, we're going to make the "
"best with what we have."
)
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
raise OCRError
raise OCRError("Language detection failed")
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
try:
return self._ocr(pngs, ISO639[guessed_language])
return self._ocr(imgs, ISO639[guessed_language])
except pyocr.pyocr.tesseract.TesseractError:
if settings.FORGIVING_OCR:
Log.warning(
self.log(
"warning",
"OCR for {} failed, but we're going to stick with what "
"we've got since FORGIVING_OCR is enabled.".format(
guessed_language
),
Log.COMPONENT_CONSUMER
)
)
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
raise OCRError
raise OCRError(
"The guessed language is not available in this instance of "
"Tesseract."
)
def _assemble_ocr_sections(self, pngs, middle, text):
def _assemble_ocr_sections(self, imgs, middle, text):
"""
Given a `middle` value and the text that middle page represents, we OCR
the remainder of the document and return the whole thing.
"""
text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
return text
def _ocr(self, pngs, lang):
def _ocr(self, imgs, lang):
"""
Performs a single OCR attempt.
"""
if not pngs:
if not imgs:
return ""
Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER)
self.log("info", "Parsing for {}".format(lang))
with Pool(processes=self.THREADS) as pool:
r = pool.map(
image_to_string, itertools.product([self], pngs, [lang]))
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
r = " ".join(r)
# Strip out excess white space to allow matching to go smoother
@ -233,16 +271,18 @@ class Consumer(object):
def _guess_attributes_from_name(self, parseable):
"""
We use a crude naming convention to make handling the sender, title, and
tags easier:
"<sender> - <title> - <tags>.<suffix>"
"<sender> - <title>.<suffix>"
We use a crude naming convention to make handling the correspondent,
title, and tags easier:
"<correspondent> - <title> - <tags>.<suffix>"
"<correspondent> - <title>.<suffix>"
"<title>.<suffix>"
"""
def get_sender(sender_name):
return Sender.objects.get_or_create(
name=sender_name, defaults={"slug": slugify(sender_name)})[0]
def get_correspondent(correspondent_name):
return Correspondent.objects.get_or_create(
name=correspondent_name,
defaults={"slug": slugify(correspondent_name)}
)[0]
def get_tags(tags):
r = []
@ -251,40 +291,47 @@ class Consumer(object):
Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
return tuple(r)
# First attempt: "<sender> - <title> - <tags>.<suffix>"
m = re.match(self.REGEX_SENDER_TITLE_TAGS, parseable)
def get_suffix(suffix):
suffix = suffix.lower()
if suffix == "jpeg":
return "jpg"
return suffix
# First attempt: "<correspondent> - <title> - <tags>.<suffix>"
m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable)
if m:
return (
get_sender(m.group(1)),
get_correspondent(m.group(1)),
m.group(2),
get_tags(m.group(3)),
m.group(4)
get_suffix(m.group(4))
)
# Second attempt: "<sender> - <title>.<suffix>"
m = re.match(self.REGEX_SENDER_TITLE, parseable)
# Second attempt: "<correspondent> - <title>.<suffix>"
m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable)
if m:
return get_sender(m.group(1)), m.group(2), (), m.group(3)
return (
get_correspondent(m.group(1)),
m.group(2),
(),
get_suffix(m.group(3))
)
# That didn't work, so we assume sender and tags are None
# That didn't work, so we assume correspondent and tags are None
m = re.match(self.REGEX_TITLE, parseable)
return None, m.group(1), (), m.group(2)
return None, m.group(1), (), get_suffix(m.group(2))
def _store(self, text, doc):
def _store(self, text, doc, thumbnail):
sender, title, tags, file_type = self._guess_attributes_from_name(doc)
tags = list(tags)
lower_text = text.lower()
relevant_tags = set(
[t for t in Tag.objects.all() if t.matches(lower_text)] + tags)
relevant_tags = set(list(Tag.match_all(text)) + list(tags))
stats = os.stat(doc)
Log.debug("Saving record to database", Log.COMPONENT_CONSUMER)
self.log("debug", "Saving record to database")
document = Document.objects.create(
sender=sender,
correspondent=sender,
title=title,
content=text,
file_type=file_type,
@ -296,22 +343,29 @@ class Consumer(object):
if relevant_tags:
tag_names = ", ".join([t.slug for t in relevant_tags])
Log.debug(
"Tagging with {}".format(tag_names), Log.COMPONENT_CONSUMER)
self.log("debug", "Tagging with {}".format(tag_names))
document.tags.add(*relevant_tags)
# Encrypt and store the actual document
with open(doc, "rb") as unencrypted:
with open(document.source_path, "wb") as encrypted:
Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
self.log("debug", "Encrypting the document")
encrypted.write(GnuPG.encrypted(unencrypted))
def _cleanup(self, tempdir, doc):
# Remove temporary directory recursively
Log.debug("Deleting directory {}".format(tempdir), Log.COMPONENT_CONSUMER)
shutil.rmtree(tempdir)
# Encrypt and store the thumbnail
with open(thumbnail, "rb") as unencrypted:
with open(document.thumbnail_path, "wb") as encrypted:
self.log("debug", "Encrypting the thumbnail")
encrypted.write(GnuPG.encrypted(unencrypted))
# Remove doc
Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
self.log("info", "Completed")
def _cleanup_tempdir(self, d):
self.log("debug", "Deleting directory {}".format(d))
shutil.rmtree(d)
def _cleanup_doc(self, doc):
self.log("debug", "Deleting document {}".format(doc))
os.unlink(doc)
def _is_ready(self, doc):
@ -329,3 +383,23 @@ class Consumer(object):
self.stats[doc] = t
return False
def image_to_string(args):
img, lang = args
ocr = pyocr.get_available_tools()[0]
with Image.open(os.path.join(Consumer.SCRATCH, img)) as f:
if ocr.can_detect_orientation():
try:
orientation = ocr.detect_orientation(f, lang=lang)
f = f.rotate(orientation["angle"], expand=1)
except TesseractError:
pass
return ocr.image_to_string(f, lang=lang)
def run_unpaper(args):
unpaper, pnm = args
subprocess.Popen((
unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm")
)).wait()

View File

@ -8,13 +8,13 @@ from time import mktime
from django import forms
from django.conf import settings
from .models import Document, Sender
from .models import Document, Correspondent
from .consumer import Consumer
class UploadForm(forms.Form):
SECRET = settings.UPLOAD_SHARED_SECRET
SECRET = settings.SHARED_SECRET
TYPE_LOOKUP = {
"application/pdf": Document.TYPE_PDF,
"image/png": Document.TYPE_PNG,
@ -23,31 +23,36 @@ class UploadForm(forms.Form):
"image/tiff": Document.TYPE_TIF,
}
sender = forms.CharField(
max_length=Sender._meta.get_field("name").max_length, required=False)
correspondent = forms.CharField(
max_length=Correspondent._meta.get_field("name").max_length,
required=False
)
title = forms.CharField(
max_length=Document._meta.get_field("title").max_length, required=False)
max_length=Document._meta.get_field("title").max_length,
required=False
)
document = forms.FileField()
signature = forms.CharField(max_length=256)
def clean_sender(self):
def clean_correspondent(self):
"""
I suppose it might look cleaner to use .get_or_create() here, but that
would also allow someone to fill up the db with bogus senders before all
validation was met.
would also allow someone to fill up the db with bogus correspondents
before all validation was met.
"""
sender = self.cleaned_data.get("sender")
if not sender:
corresp = self.cleaned_data.get("correspondent")
if not corresp:
return None
if not Sender.SAFE_REGEX.match(sender) or " - " in sender:
raise forms.ValidationError("That sender name is suspicious.")
return sender
if not Correspondent.SAFE_REGEX.match(corresp) or " - " in corresp:
raise forms.ValidationError(
"That correspondent name is suspicious.")
return corresp
def clean_title(self):
title = self.cleaned_data.get("title")
if not title:
return None
if not Sender.SAFE_REGEX.match(title) or " - " in title:
if not Correspondent.SAFE_REGEX.match(title) or " - " in title:
raise forms.ValidationError("That title is suspicious.")
def clean_document(self):
@ -59,10 +64,10 @@ class UploadForm(forms.Form):
return document, self.TYPE_LOOKUP[file_type]
def clean(self):
sender = self.clened_data("sender")
corresp = self.clened_data("correspondent")
title = self.cleaned_data("title")
signature = self.cleaned_data("signature")
if sha256(sender + title + self.SECRET).hexdigest() == signature:
if sha256(corresp + title + self.SECRET).hexdigest() == signature:
return True
return False
@ -73,13 +78,15 @@ class UploadForm(forms.Form):
form do that as well. Think of it as a poor-man's queue server.
"""
sender = self.clened_data("sender")
correspondent = self.clened_data("correspondent")
title = self.cleaned_data("title")
document, file_type = self.cleaned_data.get("document")
t = int(mktime(datetime.now()))
file_name = os.path.join(
Consumer.CONSUME, "{} - {}.{}".format(sender, title, file_type))
Consumer.CONSUME,
"{} - {}.{}".format(correspondent, title, file_type)
)
with open(file_name, "wb") as f:
f.write(document)

View File

@ -185,10 +185,10 @@ ISO639 = {
"yo": "yor",
"za": "zha",
# Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I have
# no idea which one is better, so I just picked the bigger file.
# Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I
# have no idea which one is better, so I just picked the bigger file.
"zh": "chi_tra",
"zu": "zul"
}
}

30
src/documents/loggers.py Normal file
View File

@ -0,0 +1,30 @@
import logging
class PaperlessLogger(logging.StreamHandler):
"""
A logger smart enough to know to log some kinds of messages to the database
for later retrieval in a pretty interface.
"""
def emit(self, record):
logging.StreamHandler.emit(self, record)
if not hasattr(record, "component"):
return
# We have to do the import here or Django will barf when it tries to
# load this because the apps aren't loaded at that point
from .models import Log
kwargs = {
"message": record.msg,
"component": record.component,
"level": record.levelno,
}
if hasattr(record, "group"):
kwargs["group"] = record.group
Log.objects.create(**kwargs)

View File

@ -1,8 +1,10 @@
import datetime
import imaplib
import logging
import os
import re
import time
import uuid
from base64 import b64decode
from email import policy
@ -11,10 +13,8 @@ from dateutil import parser
from django.conf import settings
from logger.models import Log
from .consumer import Consumer
from .models import Sender
from .models import Correspondent, Log
class MailFetcherError(Exception):
@ -25,21 +25,34 @@ class InvalidMessageError(Exception):
pass
class Message(object):
class Loggable(object):
def __init__(self, group=None):
self.logger = logging.getLogger(__name__)
self.logging_group = group or uuid.uuid4()
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group,
"component": Log.COMPONENT_MAIL
})
class Message(Loggable):
"""
A crude, but simple email message class. We assume that there's a subject
and n attachments, and that we don't care about the message body.
"""
SECRET = settings.UPLOAD_SHARED_SECRET
SECRET = settings.SHARED_SECRET
def __init__(self, data, verbosity=1):
def __init__(self, data, group=None):
"""
Cribbed heavily from
https://www.ianlewis.org/en/parsing-email-attachments-python
"""
self.verbosity = verbosity
Loggable.__init__(self, group=group)
self.subject = None
self.time = None
@ -54,8 +67,7 @@ class Message(object):
self._set_time(message)
Log.info(
'Importing email: "{}"'.format(self.subject), Log.COMPONENT_MAIL)
self.log("info", 'Importing email: "{}"'.format(self.subject))
attachments = []
for part in message.walk():
@ -91,7 +103,7 @@ class Message(object):
def check_subject(self):
if self.subject is None:
raise InvalidMessageError("Message does not have a subject")
if not Sender.SAFE_REGEX.match(self.subject):
if not Correspondent.SAFE_REGEX.match(self.subject):
raise InvalidMessageError("Message subject is unsafe: {}".format(
self.subject))
@ -134,9 +146,11 @@ class Attachment(object):
return self.data
class MailFetcher(object):
class MailFetcher(Loggable):
def __init__(self, verbosity=1):
def __init__(self):
Loggable.__init__(self)
self._connection = None
self._host = settings.MAIL_CONSUMPTION["HOST"]
@ -148,7 +162,6 @@ class MailFetcher(object):
self._enabled = bool(self._host)
self.last_checked = datetime.datetime.now()
self.verbosity = verbosity
def pull(self):
"""
@ -159,14 +172,14 @@ class MailFetcher(object):
if self._enabled:
Log.info("Checking mail", Log.COMPONENT_MAIL)
# Reset the grouping id for each fetch
self.logging_group = uuid.uuid4()
self.log("debug", "Checking mail")
for message in self._get_messages():
Log.debug(
'Storing email: "{}"'.format(message.subject),
Log.COMPONENT_MAIL
)
self.log("info", 'Storing email: "{}"'.format(message.subject))
t = int(time.mktime(message.time.timetuple()))
file_name = os.path.join(Consumer.CONSUME, message.file_name)
@ -193,7 +206,7 @@ class MailFetcher(object):
self._connection.logout()
except Exception as e:
Log.error(e, Log.COMPONENT_MAIL)
self.log("error", str(e))
return r
@ -218,9 +231,9 @@ class MailFetcher(object):
message = None
try:
message = Message(data[0][1], self.verbosity)
message = Message(data[0][1], self.logging_group)
except InvalidMessageError as e:
Log.error(e, Log.COMPONENT_MAIL)
self.log("error", str(e))
else:
self._connection.store(num, "+FLAGS", "\\Deleted")

View File

@ -1,10 +1,12 @@
import datetime
import logging
import os
import time
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from ...models import Log
from ...consumer import Consumer, ConsumerError
from ...mail import MailFetcher, MailFetcherError
@ -34,7 +36,7 @@ class Command(BaseCommand):
self.verbosity = options["verbosity"]
try:
self.file_consumer = Consumer(verbosity=self.verbosity)
self.file_consumer = Consumer()
self.mail_fetcher = MailFetcher()
except (ConsumerError, MailFetcherError) as e:
raise CommandError(e)
@ -44,6 +46,13 @@ class Command(BaseCommand):
except FileExistsError:
pass
logging.getLogger(__name__).info(
"Starting document consumer at {}".format(
settings.CONSUMPTION_DIR
),
extra={"component": Log.COMPONENT_CONSUMER}
)
try:
while True:
self.loop()

View File

@ -1,10 +1,12 @@
import json
import os
import time
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.core import serializers
from documents.models import Document
from documents.models import Document, Correspondent, Tag
from paperless.db import GnuPG
from ...mixins import Renderable
@ -14,21 +16,26 @@ class Command(Renderable, BaseCommand):
help = """
Decrypt and rename all files in our collection into a given target
directory. Note that we don't export any of the parsed data since
that can always be re-collected via the consumer.
directory. And include a manifest file containing document data for
easy import.
""".replace(" ", "")
def add_arguments(self, parser):
parser.add_argument("target")
parser.add_argument(
"--legacy",
action="store_true",
help="Don't try to export all of the document data, just dump the "
"original document files out in a format that makes "
"re-consuming them easy."
)
def __init__(self, *args, **kwargs):
self.verbosity = 0
self.target = None
BaseCommand.__init__(self, *args, **kwargs)
self.target = None
def handle(self, *args, **options):
self.verbosity = options["verbosity"]
self.target = options["target"]
if not os.path.exists(self.target):
@ -40,9 +47,22 @@ class Command(Renderable, BaseCommand):
if not settings.PASSPHRASE:
settings.PASSPHRASE = input("Please enter the passphrase: ")
for document in Document.objects.all():
if options["legacy"]:
self.dump_legacy()
else:
self.dump()
def dump(self):
documents = Document.objects.all()
document_map = {d.pk: d for d in documents}
manifest = json.loads(serializers.serialize("json", documents))
for document_dict in manifest:
document = document_map[document_dict["pk"]]
target = os.path.join(self.target, document.file_name)
document_dict["__exported_file_name__"] = target
print("Exporting: {}".format(target))
@ -50,3 +70,37 @@ class Command(Renderable, BaseCommand):
f.write(GnuPG.decrypted(document.source_file))
t = int(time.mktime(document.created.timetuple()))
os.utime(target, times=(t, t))
manifest += json.loads(
serializers.serialize("json", Correspondent.objects.all()))
manifest += json.loads(serializers.serialize(
"json", Tag.objects.all()))
with open(os.path.join(self.target, "manifest.json"), "w") as f:
json.dump(manifest, f, indent=2)
def dump_legacy(self):
    """
    Export every document as a decrypted file using the old (pre-manifest)
    naming scheme, preserving each document's created timestamp as mtime.
    """
    for doc in Document.objects.all():
        destination = os.path.join(
            self.target, self._get_legacy_file_name(doc))
        print("Exporting: {}".format(destination))
        with open(destination, "wb") as handle:
            handle.write(GnuPG.decrypted(doc.source_file))
        stamp = int(time.mktime(doc.created.timetuple()))
        os.utime(destination, times=(stamp, stamp))
@staticmethod
def _get_legacy_file_name(doc):
    """
    Build the legacy export file name for *doc*: "correspondent - title
    [- tags].ext" when both correspondent and title are set, otherwise
    fall back to the original file's base name.
    """
    if not (doc.correspondent and doc.title):
        return os.path.basename(doc.source_path)
    tag_list = ",".join(t.slug for t in doc.tags.all())
    if tag_list:
        return "{} - {} - {}.{}".format(
            doc.correspondent, doc.title, tag_list, doc.file_type)
    return "{} - {}.{}".format(doc.correspondent, doc.title, doc.file_type)

View File

@ -0,0 +1,99 @@
import json
import os
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.core.management import call_command
from documents.models import Document
from paperless.db import GnuPG
from ...mixins import Renderable
class Command(Renderable, BaseCommand):
    """Import a directory produced by the document exporter.

    Reads manifest.json from the source directory, loads the serialised
    model records into the database via the ``loaddata`` command, then
    re-encrypts each exported document file into its storage location.
    """

    help = """
        Using a manifest.json file, load the data from there, and import the
        documents it refers to.
    """.replace(" ", "")

    def add_arguments(self, parser):
        # Positional argument: the directory holding manifest.json + files.
        parser.add_argument("source")

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)
        self.source = None    # set in handle()
        self.manifest = None  # parsed manifest.json: list of record dicts

    def handle(self, *args, **options):
        self.source = options["source"]

        if not os.path.exists(self.source):
            raise CommandError("That path doesn't exist")

        if not os.access(self.source, os.R_OK):
            raise CommandError("That path doesn't appear to be readable")

        manifest_path = os.path.join(self.source, "manifest.json")
        self._check_manifest_exists(manifest_path)

        with open(manifest_path) as f:
            self.manifest = json.load(f)

        self._check_manifest()

        # The passphrase is needed below to re-encrypt the document files.
        if not settings.PASSPHRASE:
            raise CommandError(
                "You need to define a passphrase before continuing. Please "
                "consult the documentation for setting up Paperless."
            )

        # Fill up the database with whatever is in the manifest
        call_command("loaddata", manifest_path)

        self._import_files_from_manifest()

    @staticmethod
    def _check_manifest_exists(path):
        # Fail early with a friendly message if manifest.json is absent.
        if not os.path.exists(path):
            raise CommandError(
                "That directory doesn't appear to contain a manifest.json "
                "file."
            )

    def _check_manifest(self):
        # Validate every document record before touching the database:
        # each must carry an exported file name that actually exists.
        for record in self.manifest:

            if not record["model"] == "documents.document":
                continue

            if "__exported_file_name__" not in record:
                raise CommandError(
                    'The manifest file contains a record which does not '
                    'refer to an actual document file.'
                )

            doc_file = record["__exported_file_name__"]
            if not os.path.exists(os.path.join(self.source, doc_file)):
                raise CommandError(
                    'The manifest file refers to "{}" which does not '
                    'appear to be in the source directory.'.format(doc_file)
                )

    def _import_files_from_manifest(self):
        # Re-encrypt each exported file into the document's storage path.
        # NOTE(review): the file is opened as-is here, but existence was
        # checked relative to self.source above — these only agree when the
        # manifest stores absolute paths; verify against the exporter.
        for record in self.manifest:

            if not record["model"] == "documents.document":
                continue

            doc_file = record["__exported_file_name__"]
            document = Document.objects.get(pk=record["pk"])
            with open(doc_file, "rb") as unencrypted:
                with open(document.source_path, "wb") as encrypted:
                    print("Encrypting {} and saving it to {}".format(
                        doc_file, document.source_path))
                    encrypted.write(GnuPG.encrypted(unencrypted))

View File

@ -10,8 +10,8 @@ class Command(Renderable, BaseCommand):
help = """
Using the current set of tagging rules, apply said rules to all
documents in the database, effectively allowing you to back-tag all
previously indexed documents with tags created (or modified) after their
initial import.
previously indexed documents with tags created (or modified) after
their initial import.
""".replace(" ", "")
def __init__(self, *args, **kwargs):
@ -23,9 +23,10 @@ class Command(Renderable, BaseCommand):
self.verbosity = options["verbosity"]
for document in Document.objects.all():
tags = Tag.objects.exclude(
pk__in=document.tags.values_list("pk", flat=True))
for tag in tags:
if tag.matches(document.content):
print('Tagging {} with "{}"'.format(document, tag))
document.tags.add(tag)
for tag in Tag.match_all(document.content, tags):
print('Tagging {} with "{}"'.format(document, tag))
document.tags.add(tag)

View File

@ -0,0 +1,20 @@
import sys
from django.core.management.commands.loaddata import Command as LoadDataCommand
class Command(LoadDataCommand):
    """
    Allow the loading of data from standard in. Sourced originally from:
    https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 (MIT licensed)
    """

    def parse_name(self, fixture_name):
        """
        Treat "-" as a special fixture name meaning "read JSON from stdin";
        any other name is resolved by Django's regular loaddata machinery.
        """
        # Register a pseudo compression format whose opener yields stdin.
        self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)
        if fixture_name == '-':
            return '-', 'json', 'stdin'
        # BUG FIX: previously this fell through and implicitly returned
        # None, which broke loading any ordinary (non-stdin) fixture.
        return super(Command, self).parse_name(fixture_name)

    def find_fixtures(self, fixture_label):
        # The stdin pseudo-fixture cannot be found on disk; short-circuit.
        if fixture_label == '-':
            return [('-', None, '-')]
        return super(Command, self).find_fixtures(fixture_label)

70
src/documents/managers.py Normal file
View File

@ -0,0 +1,70 @@
from django.conf import settings
from django.db import models
from django.db.models.aggregates import Max
class GroupConcat(models.Aggregate):
    """
    Theoretically, this should work in Sqlite, PostgreSQL, and MySQL, but I've
    only ever tested it in Sqlite.
    """

    # Internal engine ids used to select the right SQL function/template.
    ENGINE_SQLITE = 1
    ENGINE_POSTGRESQL = 2
    ENGINE_MYSQL = 3
    ENGINES = {
        "django.db.backends.sqlite3": ENGINE_SQLITE,
        "django.db.backends.postgresql_psycopg2": ENGINE_POSTGRESQL,
        "django.db.backends.postgresql": ENGINE_POSTGRESQL,
        "django.db.backends.mysql": ENGINE_MYSQL
    }

    def __init__(self, expression, separator="\n", **extra):
        # The engine must be resolved first: both self.function and
        # self.template depend on it, and both must be set before
        # Aggregate.__init__ runs (Django reads them during SQL compilation).
        self.engine = self._get_engine()
        self.function = self._get_function()
        self.template = self._get_template(separator)
        models.Aggregate.__init__(
            self,
            expression,
            output_field=models.CharField(),
            **extra
        )

    def _get_engine(self):
        """Map settings.DATABASES["default"]["ENGINE"] to an ENGINE_* id."""
        engine = settings.DATABASES["default"]["ENGINE"]
        try:
            return self.ENGINES[engine]
        except KeyError:
            raise NotImplementedError(
                "There's currently no support for {} when it comes to group "
                "concatenation in Paperless".format(engine)
            )

    def _get_function(self):
        # PostgreSQL calls it STRING_AGG; Sqlite and MySQL use GROUP_CONCAT.
        if self.engine == self.ENGINE_POSTGRESQL:
            return "STRING_AGG"
        return "GROUP_CONCAT"

    def _get_template(self, separator):
        # MySQL needs a SEPARATOR keyword; the others take it as a plain
        # second argument.  NOTE(review): the separator is interpolated
        # directly into the SQL template — safe only for trusted values.
        if self.engine == self.ENGINE_MYSQL:
            return "%(function)s(%(expressions)s, SEPARATOR '{}')".format(
                separator)
        return "%(function)s(%(expressions)s, '{}')".format(separator)
class LogQuerySet(models.query.QuerySet):
    """QuerySet for Log rows with support for collapsing grouped messages."""

    def by_group(self):
        """Return one record per log group, newest modification first."""
        grouped = self.values("group")
        annotated = grouped.annotate(
            time=Max("modified"),
            messages=GroupConcat("message"),
        )
        return annotated.order_by("-time")
class LogManager(models.Manager):
    # Default manager returning LogQuerySet so callers get .by_group().

    def get_queryset(self):
        return LogQuerySet(self.model, using=self._db)

View File

@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9 on 2016-02-14 16:08
# Generated by Django 1.9 on 2016-02-27 17:54
from __future__ import unicode_literals
from django.db import migrations, models
@ -7,9 +7,8 @@ from django.db import migrations, models
class Migration(migrations.Migration):
initial = True
dependencies = [
('documents', '0009_auto_20160214_0040'),
]
operations = [
@ -17,14 +16,15 @@ class Migration(migrations.Migration):
name='Log',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('time', models.DateTimeField(auto_now_add=True)),
('group', models.UUIDField(blank=True)),
('message', models.TextField()),
('level', models.PositiveIntegerField(choices=[(1, 'Error'), (2, 'Warning'), (3, 'Informational'), (4, 'Debugging')], default=3)),
('level', models.PositiveIntegerField(choices=[(10, 'Debugging'), (20, 'Informational'), (30, 'Warning'), (40, 'Error'), (50, 'Critical')], default=20)),
('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])),
('created', models.DateTimeField(auto_now_add=True)),
('modified', models.DateTimeField(auto_now=True)),
],
),
migrations.AlterModelOptions(
name='log',
options={'ordering': ('-time',)},
options={
'ordering': ('-modified',),
},
),
]

View File

@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9.2 on 2016-03-03 19:29
from __future__ import unicode_literals
from django.db import migrations
class Migration(migrations.Migration):
    """Rename Sender to Correspondent: the model, Document's ordering
    option, and the foreign-key field on Document."""

    dependencies = [
        ('documents', '0010_log'),
    ]

    operations = [
        migrations.RenameModel(
            old_name='Sender',
            new_name='Correspondent',
        ),
        migrations.AlterModelOptions(
            name='document',
            options={'ordering': ('correspondent', 'title')},
        ),
        migrations.RenameField(
            model_name='document',
            old_name='sender',
            new_name='correspondent',
        ),
    ]

View File

@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9.2 on 2016-03-05 00:40
from __future__ import unicode_literals
import gnupg
import os
import re
import shutil
import subprocess
import tempfile
from django.conf import settings
from django.db import migrations
from django.utils.termcolors import colorize as colourise # Spelling hurts me
class GnuPG(object):
    """
    A handy singleton to use when handling encrypted files.
    """

    # Shared GPG handle; keyring lives under settings.GNUPG_HOME.
    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

    @classmethod
    def decrypted(cls, file_handle):
        # Return the decrypted bytes (.data) using the symmetric passphrase.
        return cls.gpg.decrypt_file(
            file_handle, passphrase=settings.PASSPHRASE).data

    @classmethod
    def encrypted(cls, file_handle):
        # Symmetric (passphrase-only) encryption; no recipient keys used.
        return cls.gpg.encrypt_file(
            file_handle,
            recipients=None,
            passphrase=settings.PASSPHRASE,
            symmetric=True
        ).data
def move_documents_and_create_thumbnails(apps, schema_editor):
    """One-off data migration: generate an encrypted PNG thumbnail for every
    stored document and relocate the originals under documents/originals/."""

    documents = os.listdir(os.path.join(settings.MEDIA_ROOT, "documents"))

    # Already migrated: only the two subdirectories remain.
    if set(documents) == {"originals", "thumbnails"}:
        return

    print(colourise(
        "\n\n"
        " This is a one-time only migration to generate thumbnails for all of your\n"
        " documents so that future UIs will have something to work with. If you have\n"
        " a lot of documents though, this may take a while, so a coffee break may be\n"
        " in order."
        "\n", opts=("bold",)
    ))

    try:
        os.makedirs(settings.SCRATCH_DIR)
    except FileExistsError:
        pass

    for f in sorted(documents):

        # NOTE(review): matches any name *ending* in "gpg", not only ".gpg".
        if not f.endswith("gpg"):
            continue

        print(" {} {} {}".format(
            colourise("*", fg="green"),
            colourise("Generating a thumbnail for", fg="white"),
            colourise(f, fg="cyan")
        ))

        # Decrypt the original into a scratch directory...
        thumb_temp = tempfile.mkdtemp(
            prefix="paperless", dir=settings.SCRATCH_DIR)
        orig_temp = tempfile.mkdtemp(
            prefix="paperless", dir=settings.SCRATCH_DIR)

        orig_source = os.path.join(settings.MEDIA_ROOT, "documents", f)
        orig_target = os.path.join(orig_temp, f.replace(".gpg", ""))

        with open(orig_source, "rb") as encrypted:
            with open(orig_target, "wb") as unencrypted:
                unencrypted.write(GnuPG.decrypted(encrypted))

        # ...then render page images with ImageMagick.
        # NOTE(review): the convert exit code is unchecked — a failed
        # conversion surfaces later as a missing convert-0000.png.
        subprocess.Popen((
            settings.CONVERT_BINARY,
            "-scale", "500x5000",
            "-alpha", "remove",
            orig_target,
            os.path.join(thumb_temp, "convert-%04d.png")
        )).wait()

        # Only the first page becomes the thumbnail; re-encrypt it into
        # documents/thumbnails/ using the original's numeric prefix.
        thumb_source = os.path.join(thumb_temp, "convert-0000.png")
        thumb_target = os.path.join(
            settings.MEDIA_ROOT,
            "documents",
            "thumbnails",
            re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f)
        )
        with open(thumb_source, "rb") as unencrypted:
            with open(thumb_target, "wb") as encrypted:
                encrypted.write(GnuPG.encrypted(unencrypted))

        shutil.rmtree(thumb_temp)
        shutil.rmtree(orig_temp)

        # Finally move the original into documents/originals/.
        shutil.move(
            os.path.join(settings.MEDIA_ROOT, "documents", f),
            os.path.join(settings.MEDIA_ROOT, "documents", "originals", f),
        )
class Migration(migrations.Migration):
    """Runs the one-off thumbnail generation / originals relocation above."""

    dependencies = [
        ('documents', '0011_auto_20160303_1929'),
    ]

    operations = [
        migrations.RunPython(move_documents_and_create_thumbnails),
    ]

View File

@ -1,7 +1,7 @@
class Renderable(object):
"""
A handy mixin to make it easier/cleaner to print output based on a verbosity
value.
A handy mixin to make it easier/cleaner to print output based on a
verbosity value.
"""
def _render(self, text, verbosity):

View File

@ -1,5 +1,7 @@
import logging
import os
import re
import uuid
from django.conf import settings
from django.core.urlresolvers import reverse
@ -7,6 +9,8 @@ from django.db import models
from django.template.defaultfilters import slugify
from django.utils import timezone
from .managers import LogManager
class SluggedModel(models.Model):
@ -25,7 +29,7 @@ class SluggedModel(models.Model):
return self.name
class Sender(SluggedModel):
class Correspondent(SluggedModel):
# This regex is probably more restrictive than it needs to be, but it's
# better safe than sorry.
@ -36,7 +40,7 @@ class Sender(SluggedModel):
class Tag(SluggedModel):
COLOURS = (
(1, "#a6cee3"),
(2, "#1f78b4"),
@ -71,9 +75,9 @@ class Tag(SluggedModel):
default=MATCH_ANY,
help_text=(
"Which algorithm you want to use when matching text to the OCR'd "
"PDF. Here, \"any\" looks for any occurrence of any word provided "
"in the PDF, while \"all\" requires that every word provided "
"appear in the PDF, albeit not in the order provided. A "
"PDF. Here, \"any\" looks for any occurrence of any word "
"provided in the PDF, while \"all\" requires that every word "
"provided appear in the PDF, albeit not in the order provided. A "
"\"literal\" match means that the text you enter must appear in "
"the PDF exactly as you've entered it, and \"regular expression\" "
"uses a regex to match the PDF. If you don't know what a regex "
@ -86,28 +90,40 @@ class Tag(SluggedModel):
return "{}: \"{}\" ({})".format(
self.name, self.match, self.get_matching_algorithm_display())
@classmethod
def match_all(cls, text, tags=None):
if tags is None:
tags = cls.objects.all()
text = text.lower()
for tag in tags:
if tag.matches(text):
yield tag
def matches(self, text):
# Check that match is not empty
if self.match.strip() == "":
return False
if self.matching_algorithm == self.MATCH_ALL:
for word in self.match.split(" "):
if word not in text:
if not re.search(r"\b{}\b".format(word), text):
return False
return True
if self.matching_algorithm == self.MATCH_ANY:
for word in self.match.split(" "):
if word in text:
if re.search(r"\b{}\b".format(word), text):
return True
return False
if self.matching_algorithm == self.MATCH_LITERAL:
return self.match in text
return bool(re.search(r"\b{}\b".format(self.match), text))
if self.matching_algorithm == self.MATCH_REGEX:
return re.search(re.compile(self.match), text)
return bool(re.search(re.compile(self.match), text))
raise NotImplementedError("Unsupported matching algorithm")
@ -125,8 +141,8 @@ class Document(models.Model):
TYPE_TIF = "tiff"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
sender = models.ForeignKey(
Sender, blank=True, null=True, related_name="documents")
correspondent = models.ForeignKey(
Correspondent, blank=True, null=True, related_name="documents")
title = models.CharField(max_length=128, blank=True, db_index=True)
content = models.TextField(db_index=True)
file_type = models.CharField(
@ -140,14 +156,15 @@ class Document(models.Model):
modified = models.DateTimeField(auto_now=True, editable=False)
class Meta(object):
ordering = ("sender", "title")
ordering = ("correspondent", "title")
def __str__(self):
created = self.created.strftime("%Y-%m-%d")
if self.sender and self.title:
return "{}: {}, {}".format(created, self.sender, self.title)
if self.sender or self.title:
return "{}: {}".format(created, self.sender or self.title)
created = self.created.strftime("%Y%m%d%H%M%S")
if self.correspondent and self.title:
return "{}: {} - {}".format(
created, self.correspondent, self.title)
if self.correspondent or self.title:
return "{}: {}".format(created, self.correspondent or self.title)
return str(created)
@property
@ -155,6 +172,7 @@ class Document(models.Model):
return os.path.join(
settings.MEDIA_ROOT,
"documents",
"originals",
"{:07}.{}.gpg".format(self.pk, self.file_type)
)
@ -164,14 +182,71 @@ class Document(models.Model):
@property
def file_name(self):
if self.sender and self.title:
tags = ",".join([t.slug for t in self.tags.all()])
if tags:
return "{} - {} - {}.{}".format(
self.sender, self.title, tags, self.file_type)
return "{} - {}.{}".format(self.sender, self.title, self.file_type)
return os.path.basename(self.source_path)
return slugify(str(self)) + "." + self.file_type
@property
def download_url(self):
return reverse("fetch", kwargs={"pk": self.pk})
return reverse("fetch", kwargs={"kind": "doc", "pk": self.pk})
@property
def thumbnail_path(self):
return os.path.join(
settings.MEDIA_ROOT,
"documents",
"thumbnails",
"{:07}.png.gpg".format(self.pk)
)
@property
def thumbnail_file(self):
return open(self.thumbnail_path, "rb")
@property
def thumbnail_url(self):
return reverse("fetch", kwargs={"kind": "thumb", "pk": self.pk})
class Log(models.Model):
    """A log record persisted to the database."""

    # Level choices mirror the stdlib logging numeric levels.
    LEVELS = (
        (logging.DEBUG, "Debugging"),
        (logging.INFO, "Informational"),
        (logging.WARNING, "Warning"),
        (logging.ERROR, "Error"),
        (logging.CRITICAL, "Critical"),
    )

    COMPONENT_CONSUMER = 1
    COMPONENT_MAIL = 2
    COMPONENTS = (
        (COMPONENT_CONSUMER, "Consumer"),
        (COMPONENT_MAIL, "Mail Fetcher")
    )

    # Correlates all messages emitted during one logical operation.
    group = models.UUIDField(blank=True)
    message = models.TextField()
    level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO)
    component = models.PositiveIntegerField(choices=COMPONENTS)
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    # Custom manager exposing the .by_group() aggregation.
    objects = LogManager()

    class Meta(object):
        ordering = ("-modified",)

    def __str__(self):
        return self.message

    def save(self, *args, **kwargs):
        """
        To allow for the case where we don't want to group the message, we
        shouldn't force the caller to specify a one-time group value. However,
        allowing group=None means that the manager can't differentiate the
        different un-grouped messages, so instead we set a random one here.
        """
        if not self.group:
            self.group = uuid.uuid4()
        models.Model.save(self, *args, **kwargs)

View File

@ -0,0 +1,55 @@
from rest_framework import serializers
from .models import Correspondent, Tag, Document, Log
class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):
    # REST representation of a Correspondent (id, slug, name only).

    class Meta(object):
        model = Correspondent
        fields = ("id", "slug", "name")
class TagSerializer(serializers.HyperlinkedModelSerializer):
    # REST representation of a Tag, including its matching configuration.

    class Meta(object):
        model = Tag
        fields = (
            "id", "slug", "name", "colour", "match", "matching_algorithm")
class DocumentSerializer(serializers.ModelSerializer):
    """Full document representation with hyperlinks to related objects."""

    # Related objects are exposed as read-only hyperlinks.
    correspondent = serializers.HyperlinkedRelatedField(
        read_only=True, view_name="drf:correspondent-detail", allow_null=True)
    tags = serializers.HyperlinkedRelatedField(
        read_only=True, view_name="drf:tag-detail", many=True)

    class Meta(object):
        model = Document
        fields = (
            "id",
            "correspondent",
            "title",
            "content",
            "file_type",
            "tags",
            "created",
            "modified",
            "file_name",
            "download_url",
            "thumbnail_url",
        )
class LogSerializer(serializers.ModelSerializer):
    """Serialises grouped log output."""

    # Declared explicitly because these are not plain columns on Log —
    # presumably they come from the by_group() aggregation (time = latest
    # modification, messages = concatenated text); verify against the view.
    time = serializers.DateTimeField()
    messages = serializers.CharField()

    class Meta(object):
        model = Log
        fields = (
            "time",
            "messages"
        )

View File

@ -0,0 +1,10 @@
<!DOCTYPE html>
<html lang="en-gb">
<head>
<title>Paperless</title>
<meta charset="utf-8">
</head>
<body>
</body>
</html>

View File

@ -4,18 +4,26 @@ from ..consumer import Consumer
class TestAttachment(TestCase):
TAGS = ("tag1", "tag2", "tag3")
CONSUMER = Consumer()
SUFFIXES = (
"pdf", "png", "jpg", "jpeg", "gif",
"PDF", "PNG", "JPG", "JPEG", "GIF",
"PdF", "PnG", "JpG", "JPeG", "GiF",
)
def _test_guess_attributes_from_name(self, path, sender, title, tags):
for suffix in ("pdf", "png", "jpg", "jpeg", "gif"):
for suffix in self.SUFFIXES:
f = path.format(suffix)
results = self.CONSUMER._guess_attributes_from_name(f)
self.assertEqual(results[0].name, sender, f)
self.assertEqual(results[1], title, f)
self.assertEqual(tuple([t.slug for t in results[2]]), tags, f)
self.assertEqual(results[3], suffix, f)
if suffix.lower() == "jpeg":
self.assertEqual(results[3], "jpg", f)
else:
self.assertEqual(results[3], suffix.lower(), f)
def test_guess_attributes_from_name0(self):
self._test_guess_attributes_from_name(

View File

@ -0,0 +1,36 @@
from django.core.management.base import CommandError
from django.test import TestCase
from ..management.commands.document_importer import Command
class TestImporter(TestCase):
    """Tests for the document_importer command's manifest validation.

    Improvements over the original: removed the no-op ``__init__`` override,
    removed a commented-out duplicate assertion, and replaced the
    ``assertTrue(x in y)`` anti-idiom with ``assertIn`` for better failure
    messages.
    """

    def test_check_manifest_exists(self):
        # A nonexistent manifest path must raise CommandError.
        cmd = Command()
        self.assertRaises(
            CommandError, cmd._check_manifest_exists, "/tmp/manifest.json")

    def test_check_manifest(self):
        cmd = Command()
        cmd.source = "/tmp"

        # A document record without an exported file name is rejected.
        cmd.manifest = [{"model": "documents.document"}]
        with self.assertRaises(CommandError) as cm:
            cmd._check_manifest()
        self.assertIn(
            'The manifest file contains a record', str(cm.exception))

        # A document record pointing at a missing file is rejected.
        cmd.manifest = [{
            "model": "documents.document",
            "__exported_file_name__": "noexist.pdf"
        }]
        with self.assertRaises(CommandError) as cm:
            cmd._check_manifest()
        self.assertIn(
            'The manifest file refers to "noexist.pdf"', str(cm.exception))

View File

@ -0,0 +1,142 @@
import logging
import uuid
from unittest import mock
from django.test import TestCase
from ..models import Log
class TestPaperlessLog(TestCase):
    """Tests that records emitted through the document_consumer logger are
    persisted as Log rows when the ``group``/``component`` extras are given,
    and ignored when they are not."""

    def __init__(self, *args, **kwargs):
        TestCase.__init__(self, *args, **kwargs)
        # Same logger name the consumer command uses, so whatever DB-backed
        # handler is attached to it receives these test records.
        self.logger = logging.getLogger(
            "documents.management.commands.document_consumer")

    def test_ignored(self):
        # Without the group/component extras nothing reaches the database.
        with mock.patch("logging.StreamHandler.emit") as __:
            self.assertEqual(Log.objects.all().count(), 0)
            self.logger.info("This is an informational message")
            self.logger.warning("This is an informational message")
            self.logger.error("This is an informational message")
            self.logger.critical("This is an informational message")
            self.assertEqual(Log.objects.all().count(), 0)

    def test_that_it_saves_at_all(self):
        # With the extras, each record at INFO or above adds one Log row.
        kw = {
            "group": uuid.uuid4(),
            "component": Log.COMPONENT_MAIL
        }

        self.assertEqual(Log.objects.all().count(), 0)

        with mock.patch("logging.StreamHandler.emit") as __:

            # Debug messages are ignored by default
            self.logger.debug("This is a debugging message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 0)

            self.logger.info("This is an informational message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 1)

            self.logger.warning("This is an warning message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 2)

            self.logger.error("This is an error message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 3)

            self.logger.critical("This is a critical message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 4)

    def test_groups(self):
        # Rows are attributed to the group passed in the extras.
        kw1 = {
            "group": uuid.uuid4(),
            "component": Log.COMPONENT_MAIL
        }
        kw2 = {
            "group": uuid.uuid4(),
            "component": Log.COMPONENT_MAIL
        }

        self.assertEqual(Log.objects.all().count(), 0)

        with mock.patch("logging.StreamHandler.emit") as __:

            # Debug messages are ignored by default
            self.logger.debug("This is a debugging message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 0)

            self.logger.info("This is an informational message", extra=kw2)
            self.assertEqual(Log.objects.all().count(), 1)
            self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 1)

            self.logger.warning("This is an warning message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 2)
            self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 1)

            self.logger.error("This is an error message", extra=kw2)
            self.assertEqual(Log.objects.all().count(), 3)
            self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 2)

            self.logger.critical("This is a critical message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 4)
            self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 2)

    def test_components(self):
        # Rows are attributed to the component passed in the extras.
        c1 = Log.COMPONENT_CONSUMER
        c2 = Log.COMPONENT_MAIL
        kw1 = {
            "group": uuid.uuid4(),
            "component": c1
        }
        kw2 = {
            "group": kw1["group"],
            "component": c2
        }

        self.assertEqual(Log.objects.all().count(), 0)

        with mock.patch("logging.StreamHandler.emit") as __:

            # Debug messages are ignored by default
            self.logger.debug("This is a debugging message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 0)

            self.logger.info("This is an informational message", extra=kw2)
            self.assertEqual(Log.objects.all().count(), 1)
            self.assertEqual(Log.objects.filter(component=c2).count(), 1)

            self.logger.warning("This is an warning message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 2)
            self.assertEqual(Log.objects.filter(component=c1).count(), 1)

            self.logger.error("This is an error message", extra=kw2)
            self.assertEqual(Log.objects.all().count(), 3)
            self.assertEqual(Log.objects.filter(component=c2).count(), 2)

            self.logger.critical("This is a critical message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 4)
            self.assertEqual(Log.objects.filter(component=c1).count(), 2)

    def test_groupped_query(self):
        # by_group() collapses same-group rows into one record whose
        # messages field is the newline-joined message text.
        kw = {
            "group": uuid.uuid4(),
            "component": Log.COMPONENT_MAIL
        }
        with mock.patch("logging.StreamHandler.emit") as __:
            self.logger.info("Message 0", extra=kw)
            self.logger.info("Message 1", extra=kw)
            self.logger.info("Message 2", extra=kw)
            self.logger.info("Message 3", extra=kw)

        self.assertEqual(Log.objects.all().by_group().count(), 1)
        self.assertEqual(
            Log.objects.all().by_group()[0]["messages"],
            "Message 0\nMessage 1\nMessage 2\nMessage 3"
        )

View File

@ -3,6 +3,7 @@ import os
import magic
from hashlib import md5
from unittest import mock
from django.conf import settings
from django.test import TestCase
@ -27,7 +28,8 @@ class TestMessage(TestCase):
with open(self.sample, "rb") as f:
message = Message(f.read(), verbosity=0)
with mock.patch("logging.StreamHandler.emit") as __:
message = Message(f.read())
self.assertTrue(message)
self.assertEqual(message.subject, "Test 0")

View File

@ -0,0 +1,119 @@
from django.test import TestCase
from ..models import Tag
class TestTagMatching(TestCase):
    """Exercises Tag.matches() for all four matching algorithms.

    Fix: the regex pattern for MATCH_REGEX is now a raw string — the
    original ``"alpha\\w+gamma"`` relied on an invalid escape sequence,
    which raises DeprecationWarning/SyntaxWarning on modern Python.  The
    byte value of the pattern is unchanged.
    """

    def test_match_all(self):
        # MATCH_ALL: every word must appear as a whole word.
        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_ALL
        )
        self.assertFalse(t.matches("I have alpha in me"))
        self.assertFalse(t.matches("I have charlie in me"))
        self.assertFalse(t.matches("I have gamma in me"))
        self.assertFalse(t.matches("I have alpha and charlie in me"))
        self.assertTrue(t.matches("I have alpha, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas in me"))
        self.assertFalse(t.matches("I have bravo in me"))

        # Numeric words must also match on word boundaries only.
        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_ALL
        )
        self.assertFalse(t.matches("I have 12 in me"))
        self.assertFalse(t.matches("I have 34 in me"))
        self.assertFalse(t.matches("I have 56 in me"))
        self.assertFalse(t.matches("I have 12 and 34 in me"))
        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
        self.assertFalse(t.matches("I have 120, 34, and 56 in me"))
        self.assertFalse(t.matches("I have 123456 in me"))
        self.assertFalse(t.matches("I have 01234567 in me"))

    def test_match_any(self):
        # MATCH_ANY: at least one word must appear as a whole word.
        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_ANY
        )
        self.assertTrue(t.matches("I have alpha in me"))
        self.assertTrue(t.matches("I have charlie in me"))
        self.assertTrue(t.matches("I have gamma in me"))
        self.assertTrue(t.matches("I have alpha and charlie in me"))
        self.assertFalse(t.matches("I have alphas in me"))
        self.assertFalse(t.matches("I have bravo in me"))

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_ANY
        )
        self.assertTrue(t.matches("I have 12 in me"))
        self.assertTrue(t.matches("I have 34 in me"))
        self.assertTrue(t.matches("I have 56 in me"))
        self.assertTrue(t.matches("I have 12 and 34 in me"))
        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
        self.assertTrue(t.matches("I have 120, 34, and 560 in me"))
        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
        self.assertFalse(t.matches("I have 123456 in me"))
        self.assertFalse(t.matches("I have 01234567 in me"))

    def test_match_literal(self):
        # MATCH_LITERAL: the full match string must appear verbatim.
        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_LITERAL
        )
        self.assertFalse(t.matches("I have alpha in me"))
        self.assertFalse(t.matches("I have charlie in me"))
        self.assertFalse(t.matches("I have gamma in me"))
        self.assertFalse(t.matches("I have alpha and charlie in me"))
        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
        self.assertTrue(t.matches("I have 'alpha charlie gamma' in me"))
        self.assertFalse(t.matches("I have alphas in me"))
        self.assertFalse(t.matches("I have bravo in me"))

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_LITERAL
        )
        self.assertFalse(t.matches("I have 12 in me"))
        self.assertFalse(t.matches("I have 34 in me"))
        self.assertFalse(t.matches("I have 56 in me"))
        self.assertFalse(t.matches("I have 12 and 34 in me"))
        self.assertFalse(t.matches("I have 12 34, and 56 in me"))
        self.assertFalse(t.matches("I have 120, 34, and 560 in me"))
        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
        self.assertFalse(t.matches("I have 123456 in me"))
        self.assertFalse(t.matches("I have 01234567 in me"))
        self.assertTrue(t.matches("I have 12 34 56 in me"))

    def test_match_regex(self):
        # MATCH_REGEX: the match string is used directly as a regex.
        t = Tag.objects.create(
            name="Test 0",
            match=r"alpha\w+gamma",
            matching_algorithm=Tag.MATCH_REGEX
        )
        self.assertFalse(t.matches("I have alpha in me"))
        self.assertFalse(t.matches("I have gamma in me"))
        self.assertFalse(t.matches("I have alpha and charlie in me"))
        self.assertTrue(t.matches("I have alpha_and_gamma in me"))
        self.assertTrue(t.matches("I have alphas_and_gamma in me"))
        self.assertFalse(t.matches("I have alpha,and,gamma in me"))
        self.assertFalse(t.matches("I have alpha and gamma in me"))
        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas in me"))

View File

@ -1,21 +1,41 @@
from django.contrib.auth.mixins import LoginRequiredMixin
from django.http import HttpResponse
from django.template.defaultfilters import slugify
from django.views.decorators.csrf import csrf_exempt
from django.views.generic import FormView, DetailView
from django.views.generic import FormView, DetailView, TemplateView
from rest_framework.mixins import (
RetrieveModelMixin, UpdateModelMixin, DestroyModelMixin, ListModelMixin)
from rest_framework.pagination import PageNumberPagination
from rest_framework.permissions import IsAuthenticated
from rest_framework.viewsets import (
ModelViewSet, ReadOnlyModelViewSet, GenericViewSet)
from paperless.db import GnuPG
from .models import Document
from .forms import UploadForm
from .models import Correspondent, Tag, Document, Log
from .serialisers import (
CorrespondentSerializer, TagSerializer, DocumentSerializer, LogSerializer)
class PdfView(DetailView):
class IndexView(TemplateView):
    """Render the (work-in-progress) document index page.

    Simply serves the static template; no extra context is added yet.
    """

    template_name = "documents/index.html"

    def get_context_data(self, **kwargs):
        # The original left three debug print() calls here (kwargs,
        # request.GET, request.POST); they spammed stdout on every
        # request and have been removed. Use super() rather than an
        # explicit-class call for the delegation.
        return super().get_context_data(**kwargs)
class FetchView(DetailView):
model = Document
def render_to_response(self, context, **response_kwargs):
"""
Override the default to return the unencrypted PDF as raw data.
Override the default to return the unencrypted image/PDF as raw data.
"""
content_types = {
@ -26,19 +46,25 @@ class PdfView(DetailView):
Document.TYPE_TIF: "image/tiff",
}
if self.kwargs["kind"] == "thumb":
return HttpResponse(
GnuPG.decrypted(self.object.thumbnail_file),
content_type=content_types[Document.TYPE_PNG]
)
response = HttpResponse(
GnuPG.decrypted(self.object.source_file),
content_type=content_types[self.object.file_type]
)
response["Content-Disposition"] = 'attachment; filename="{}"'.format(
slugify(str(self.object)) + "." + self.object.file_type)
self.object.file_name)
return response
class PushView(FormView):
class PushView(LoginRequiredMixin, FormView):
"""
A crude REST API for creating documents.
A crude REST-ish API for creating documents.
"""
form_class = UploadForm
@ -52,3 +78,45 @@ class PushView(FormView):
def form_invalid(self, form):
return HttpResponse("0")
class StandardPagination(PageNumberPagination):
    """Shared DRF pagination policy for all API endpoints."""
    # 25 results per page by default; clients may request a different
    # size with ?page-size=N, capped at max_page_size.
    page_size = 25
    page_size_query_param = "page-size"
    max_page_size = 100000
class CorrespondentViewSet(ModelViewSet):
    """Full CRUD REST endpoint for Correspondent objects.

    Requires an authenticated user; responses are paginated with the
    standard policy.
    """
    model = Correspondent
    queryset = Correspondent.objects.all()
    serializer_class = CorrespondentSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
class TagViewSet(ModelViewSet):
    """Full CRUD REST endpoint for Tag objects.

    Requires an authenticated user; responses are paginated with the
    standard policy.
    """
    model = Tag
    queryset = Tag.objects.all()
    serializer_class = TagSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
class DocumentViewSet(RetrieveModelMixin,
                      UpdateModelMixin,
                      DestroyModelMixin,
                      ListModelMixin,
                      GenericViewSet):
    """REST endpoint for documents: list, retrieve, update, delete.

    Composed from individual mixins instead of ModelViewSet so that no
    "create" action is exposed through the API.
    """
    model = Document
    queryset = Document.objects.all()
    serializer_class = DocumentSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
class LogViewSet(ReadOnlyModelViewSet):
    """Read-only REST endpoint for log entries.

    The queryset is pre-processed with by_group() on the Log manager
    (semantics defined on the model's manager — see documents models).
    """
    model = Log
    queryset = Log.objects.all().by_group()
    serializer_class = LogSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)

View File

@ -1,12 +0,0 @@
from django.contrib import admin
from .models import Log
class LogAdmin(admin.ModelAdmin):
    """Django admin configuration for Log entries."""
    # Show the message plus its severity and originating component in
    # the change list, and allow filtering on the latter two.
    list_display = ("message", "level", "component")
    list_filter = ("level", "component",)

View File

@ -1,5 +0,0 @@
from django.apps import AppConfig
class LoggerConfig(AppConfig):
name = 'logger'

View File

@ -1,50 +0,0 @@
from django.db import models
class Log(models.Model):
    """A database-backed application log entry.

    Stores a timestamped message together with a severity level and the
    component (consumer or mail fetcher) that produced it.  Class-method
    shortcuts are provided for each severity level.
    """

    # Severity levels, ordered from most to least severe.
    LEVEL_ERROR = 1
    LEVEL_WARNING = 2
    LEVEL_INFO = 3
    LEVEL_DEBUG = 4
    LEVELS = (
        (LEVEL_ERROR, "Error"),
        (LEVEL_WARNING, "Warning"),
        (LEVEL_INFO, "Informational"),
        (LEVEL_DEBUG, "Debugging"),
    )

    # Subsystems that emit log entries.
    COMPONENT_CONSUMER = 1
    COMPONENT_MAIL = 2
    COMPONENTS = (
        (COMPONENT_CONSUMER, "Consumer"),
        (COMPONENT_MAIL, "Mail Fetcher")
    )

    # Timestamp is set once, automatically, when the row is created.
    time = models.DateTimeField(auto_now_add=True)
    message = models.TextField()
    level = models.PositiveIntegerField(choices=LEVELS, default=LEVEL_INFO)
    component = models.PositiveIntegerField(choices=COMPONENTS)

    class Meta(object):
        # Newest entries first.
        ordering = ("-time",)

    @classmethod
    def error(cls, message, component):
        # Shortcut: persist an ERROR-level entry.
        cls.objects.create(
            message=message, level=cls.LEVEL_ERROR, component=component)

    @classmethod
    def warning(cls, message, component):
        # Shortcut: persist a WARNING-level entry.
        cls.objects.create(
            message=message, level=cls.LEVEL_WARNING, component=component)

    @classmethod
    def info(cls, message, component):
        # Shortcut: persist an INFO-level entry.
        cls.objects.create(
            message=message, level=cls.LEVEL_INFO, component=component)

    @classmethod
    def debug(cls, message, component):
        # Shortcut: persist a DEBUG-level entry.
        cls.objects.create(
            message=message, level=cls.LEVEL_DEBUG, component=component)

View File

@ -1,3 +0,0 @@
from django.test import TestCase
# Create your tests here.

View File

@ -1,3 +0,0 @@
from django.shortcuts import render
# Create your views here.

View File

@ -12,6 +12,8 @@ https://docs.djangoproject.com/en/1.9/ref/settings/
import os
from dotenv import load_dotenv
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@ -42,7 +44,8 @@ INSTALLED_APPS = [
"django_extensions",
"documents",
"logger",
"rest_framework",
]
@ -87,12 +90,12 @@ DATABASES = {
"NAME": os.path.join(BASE_DIR, "..", "data", "db.sqlite3"),
}
}
if os.environ.get("PAPERLESS_DBUSER") and os.environ.get("PAPERLESS_DBPASS"):
if os.getenv("PAPERLESS_DBUSER") and os.getenv("PAPERLESS_DBPASS"):
DATABASES["default"] = {
"ENGINE": "django.db.backends.postgresql_psycopg2",
"NAME": os.environ.get("PAPERLESS_DBNAME", "paperless"),
"USER": os.environ.get("PAPERLESS_DBUSER"),
"PASSWORD": os.environ.get("PAPERLESS_DBPASS")
"NAME": os.getenv("PAPERLESS_DBNAME", "paperless"),
"USER": os.getenv("PAPERLESS_DBUSER"),
"PASSWORD": os.getenv("PAPERLESS_DBPASS")
}
@ -139,55 +142,119 @@ STATIC_URL = '/static/'
MEDIA_URL = "/media/"
# Paperless-specific stuffs
# Change these paths if yours are different
# Paperless-specific stuff
# You shouldn't have to edit any of these values. Rather, you can set these
# values in /etc/paperless.conf instead.
# ----------------------------------------------------------------------------
# Tap paperless.conf if it's available
if os.path.exists("/etc/paperless.conf"):
load_dotenv("/etc/paperless.conf")
# Logging
# Dict-config logging: everything emitted on the "documents" logger is
# routed through the custom handler class documents.loggers.PaperlessLogger.
LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,
    "handlers": {
        "consumer": {
            "class": "documents.loggers.PaperlessLogger",
        }
    },
    "loggers": {
        "documents": {
            "handlers": ["consumer"],
            # Threshold is tunable via the environment; defaults to INFO.
            "level": os.getenv("PAPERLESS_CONSUMER_LOG_LEVEL", "INFO"),
        },
    },
}
# The default language that tesseract will attempt to use when parsing
# documents. It should be a 3-letter language code consistent with ISO 639.
OCR_LANGUAGE = "eng"
# The amount of threads to use for OCR
OCR_THREADS = os.environ.get("PAPERLESS_OCR_THREADS")
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
# If this is true, any failed attempts to OCR a PDF will result in the PDF being
# indexed anyway, with whatever we could get. If it's False, the file will
# simply be left in the CONSUMPTION_DIR.
FORGIVING_OCR = True
# If this is true, any failed attempts to OCR a PDF will result in the PDF
# being indexed anyway, with whatever we could get. If it's False, the file
# will simply be left in the CONSUMPTION_DIR.
FORGIVING_OCR = bool(os.getenv("PAPERLESS_FORGIVING_OCR", "YES").lower() in ("yes", "y", "1", "t", "true"))
# GNUPG needs a home directory for some reason
GNUPG_HOME = os.environ.get("HOME", "/dev/null")
GNUPG_HOME = os.getenv("HOME", "/tmp")
# Convert is part of the Imagemagick package
CONVERT_BINARY = "/usr/bin/convert"
# Convert is part of the ImageMagick package
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY")
# Unpaper
UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
# This will be created if it doesn't exist
SCRATCH_DIR = "/tmp/paperless"
SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")
# This is where Paperless will look for PDFs to index
CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME")
CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR")
# If you want to use IMAP mail consumption, populate this with useful values.
# If you leave HOST set to None, we assume you're not going to use this feature.
# If you leave HOST set to None, we assume you're not going to use this
# feature.
MAIL_CONSUMPTION = {
"HOST": os.environ.get("PAPERLESS_CONSUME_MAIL_HOST"),
"PORT": os.environ.get("PAPERLESS_CONSUME_MAIL_PORT"),
"USERNAME": os.environ.get("PAPERLESS_CONSUME_MAIL_USER"),
"PASSWORD": os.environ.get("PAPERLESS_CONSUME_MAIL_PASS"),
"HOST": os.getenv("PAPERLESS_CONSUME_MAIL_HOST"),
"PORT": os.getenv("PAPERLESS_CONSUME_MAIL_PORT"),
"USERNAME": os.getenv("PAPERLESS_CONSUME_MAIL_USER"),
"PASSWORD": os.getenv("PAPERLESS_CONSUME_MAIL_PASS"),
"USE_SSL": True, # If True, use SSL/TLS to connect
"INBOX": "INBOX" # The name of the inbox on the server
}
# This is used to encrypt the original documents and decrypt them later when you
# want to download them. Set it and change the permissions on this file to
# This is used to encrypt the original documents and decrypt them later when
# you want to download them. Set it and change the permissions on this file to
# 0600, or set it to `None` and you'll be prompted for the passphrase at
# runtime. The default looks for an environment variable.
# DON'T FORGET TO SET THIS as leaving it blank may cause some strange things
# with GPG, including an interesting case where it may "encrypt" zero-byte
# files.
PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")
PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE")
# If you intend to use the "API" to push files into the consumer, you'll need to
# provide a shared secret here. Leaving this as the default will disable the
# API.
UPLOAD_SHARED_SECRET = os.environ.get("PAPERLESS_SECRET", "")
# If you intend to use the "API" to push files into the consumer, you'll need
# to provide a shared secret here. Leaving this as the default will disable
# the API.
SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "")
#
# TODO: Remove after 1.2
#
# This logic is here to address issue #44, wherein we were using inconsistent
# constant names vs. environment variables. If you're using Paperless for the
# first time, you can safely ignore everything from here on, so long as you're
# correctly defining the variables as per the documentation.
#
def deprecated(before, after):
    """Print a loud warning that an environment variable was renamed.

    :param before: the old (deprecated) variable name
    :param after: the new variable name that replaces it
    """
    # Build the message first, then emit it in one print() call; the
    # blank lines around it make the warning stand out in console noise.
    message = (
        "\n\n"
        "WARNING: {before} has been renamed to {after}.\n"
        "WARNING: Use of {before} will not work as of version 1.2."
        "\n\n"
    ).format(before=before, after=after)
    print(message)
if not CONVERT_BINARY:
CONVERT_BINARY = "convert"
if os.getenv("PAPERLESS_CONVERT"):
deprecated("PAPERLESS_CONVERT", "PAPERLESS_CONVERT_BINARY")
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT", CONVERT_BINARY)
if not CONSUMPTION_DIR and os.getenv("PAPERLESS_CONSUME"):
deprecated("PAPERLESS_CONSUME", "PAPERLESS_CONSUMPTION_DIR")
CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUME")
if not SHARED_SECRET and os.getenv("PAPERLESS_SECRET"):
deprecated("PAPERLESS_SECRET", "PAPERLESS_SHARED_SECRET")
SHARED_SECRET = os.getenv("PAPERLESS_SECRET", "")

View File

@ -15,15 +15,46 @@ Including another URLconf
3. Add a URL to urlpatterns: url(r'^blog/', include(blog_urls))
"""
from django.conf import settings
from django.conf.urls import url, static
from django.conf.urls import url, static, include
from django.contrib import admin
from documents.views import PdfView, PushView
from rest_framework.routers import DefaultRouter
from documents.views import (
IndexView, FetchView, PushView,
CorrespondentViewSet, TagViewSet, DocumentViewSet, LogViewSet
)
router = DefaultRouter()
router.register(r'correspondents', CorrespondentViewSet)
router.register(r'tags', TagViewSet)
router.register(r'documents', DocumentViewSet)
router.register(r'logs', LogViewSet)
urlpatterns = [
url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"),
url(r'', admin.site.urls),
# API
url(
r"^api/auth/",
include('rest_framework.urls', namespace="rest_framework")
),
url(r"^api/", include(router.urls, namespace="drf")),
# Normal pages (coming soon)
# url(r"^$", IndexView.as_view(), name="index"),
# File downloads
url(
r"^fetch/(?P<kind>doc|thumb)/(?P<pk>\d+)$",
FetchView.as_view(),
name="fetch"
),
# The Django admin
url(r"admin/", admin.site.urls),
url(r"", admin.site.urls), # This is going away
] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
if settings.UPLOAD_SHARED_SECRET:
if settings.SHARED_SECRET:
urlpatterns.insert(0, url(r"^push$", PushView.as_view(), name="push"))

View File

@ -1 +1 @@
__version__ = (0, 0, 6)
__version__ = (0, 1, 1)

23
src/tox.ini Normal file
View File

@ -0,0 +1,23 @@
# Tox (http://tox.testrun.org/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.
[tox]
skipsdist = True
envlist = py34, py35, pep8
[testenv]
commands = {envpython} manage.py test
deps = -r{toxinidir}/../requirements.txt
setenv =
PAPERLESS_CONSUME=/tmp
PAPERLESS_PASSPHRASE=THISISNOTASECRET
PAPERLESS_SECRET=paperless
[testenv:pep8]
commands=pep8
deps=pep8
[pep8]
exclude=.tox,migrations,paperless/settings.py