mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
commit
f5e0a89a3f
7
.gitignore
vendored
7
.gitignore
vendored
@ -57,7 +57,9 @@ docs/_build/
|
||||
target/
|
||||
|
||||
# Stored PDFs
|
||||
media/*
|
||||
media/documents/*.gpg
|
||||
media/documents/thumbnails/*.gpg
|
||||
media/documents/originals/*.gpg
|
||||
|
||||
# Sqlite database
|
||||
db.sqlite3
|
||||
@ -68,8 +70,9 @@ db.sqlite3
|
||||
# Other stuff that doesn't belong
|
||||
virtualenv
|
||||
.vagrant
|
||||
docker-compose.yml
|
||||
docker-compose.env
|
||||
|
||||
# Used for development
|
||||
scripts/import-for-development
|
||||
environment
|
||||
|
||||
|
18
.travis.yml
Normal file
18
.travis.yml
Normal file
@ -0,0 +1,18 @@
|
||||
language: python
|
||||
|
||||
sudo: false
|
||||
|
||||
matrix:
|
||||
include:
|
||||
- python: 3.4
|
||||
env: TOXENV=py34
|
||||
- python: 3.5
|
||||
env: TOXENV=py35
|
||||
- python: 3.5
|
||||
env: TOXENV=pep8
|
||||
|
||||
install:
|
||||
- pip install --requirement requirements.txt
|
||||
- pip install tox
|
||||
|
||||
script: tox -c src/tox.ini
|
46
Dockerfile
Normal file
46
Dockerfile
Normal file
@ -0,0 +1,46 @@
|
||||
FROM python:3.5.1
|
||||
MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
|
||||
|
||||
# Install dependencies
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
sudo \
|
||||
tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install python dependencies
|
||||
RUN mkdir -p /usr/src/paperless
|
||||
WORKDIR /usr/src/paperless
|
||||
COPY requirements.txt /usr/src/paperless/
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy application
|
||||
RUN mkdir -p /usr/src/paperless/src
|
||||
RUN mkdir -p /usr/src/paperless/data
|
||||
RUN mkdir -p /usr/src/paperless/media
|
||||
COPY src/ /usr/src/paperless/src/
|
||||
COPY data/ /usr/src/paperless/data/
|
||||
COPY media/ /usr/src/paperless/media/
|
||||
|
||||
# Set consumption directory
|
||||
ENV PAPERLESS_CONSUMPTION_DIR /consume
|
||||
RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR
|
||||
|
||||
# Migrate database
|
||||
WORKDIR /usr/src/paperless/src
|
||||
RUN ./manage.py migrate
|
||||
|
||||
# Create user
|
||||
RUN groupadd -g 1000 paperless \
|
||||
&& useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
|
||||
&& chown -Rh paperless:paperless /usr/src/paperless
|
||||
|
||||
# Setup entrypoint
|
||||
COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
|
||||
RUN chmod 755 /sbin/docker-entrypoint.sh
|
||||
|
||||
# Mount volumes
|
||||
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume"]
|
||||
|
||||
ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
|
||||
CMD ["--help"]
|
@ -3,6 +3,7 @@ Paperless
|
||||
|
||||
|Documentation|
|
||||
|Chat|
|
||||
|Travis|
|
||||
|
||||
Scan, index, and archive all of your paper documents
|
||||
|
||||
@ -55,6 +56,7 @@ powerful tools.
|
||||
|
||||
* `ImageMagick`_ converts the images between colour and greyscale.
|
||||
* `Tesseract`_ does the character recognition.
|
||||
* `Unpaper`_ despeckles and deskews the scanned image.
|
||||
* `GNU Privacy Guard`_ is used as the encryption backend.
|
||||
* `Python 3`_ is the language of the project.
|
||||
|
||||
@ -92,6 +94,7 @@ home.
|
||||
.. _this one: http://www.brother.ca/en-CA/Scanners/11/ProductDetail/ADS1500W?ProductDetail=productdetail
|
||||
.. _ImageMagick: http://imagemagick.org/
|
||||
.. _Tesseract: https://github.com/tesseract-ocr
|
||||
.. _Unpaper: https://www.flameeyes.eu/projects/unpaper
|
||||
.. _GNU Privacy Guard: https://gnupg.org/
|
||||
.. _Python 3: https://python.org/
|
||||
.. _Pillow: https://pypi.python.org/pypi/pillowfight/
|
||||
@ -105,4 +108,5 @@ home.
|
||||
.. |Chat| image:: https://badges.gitter.im/danielquinn/paperless.svg
|
||||
:alt: Join the chat at https://gitter.im/danielquinn/paperless
|
||||
:target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
|
||||
|
||||
.. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master
|
||||
:target: https://travis-ci.org/danielquinn/paperless
|
||||
|
15
docker-compose.env.example
Normal file
15
docker-compose.env.example
Normal file
@ -0,0 +1,15 @@
|
||||
# Environment variables to set for Paperless
|
||||
# Commented out variables will be replaced by a default within Paperless.
|
||||
|
||||
# Passphrase Paperless uses to encrypt and decrypt your documents
|
||||
PAPERLESS_PASSPHRASE=CHANGE_ME
|
||||
|
||||
# The number of threads to use for text recognition
|
||||
# PAPERLESS_OCR_THREADS=4
|
||||
|
||||
# Additional languages to install for text recognition
|
||||
# PAPERLESS_OCR_LANGUAGES=deu ita
|
||||
|
||||
# You can change the default user and group id to a custom one
|
||||
# USERMAP_UID=1000
|
||||
# USERMAP_GID=1000
|
37
docker-compose.yml.example
Normal file
37
docker-compose.yml.example
Normal file
@ -0,0 +1,37 @@
|
||||
version: '2'
|
||||
|
||||
services:
|
||||
webserver:
|
||||
image: paperless
|
||||
ports:
|
||||
# You can adapt the port you want Paperless to listen on by
|
||||
# modifying the part before the `:`.
|
||||
- "8000:8000"
|
||||
volumes:
|
||||
- data:/usr/src/paperless/data
|
||||
- media:/usr/src/paperless/media
|
||||
env_file: docker-compose.env
|
||||
environment:
|
||||
- PAPERLESS_OCR_LANGUAGES=
|
||||
command: ["runserver", "0.0.0.0:8000"]
|
||||
|
||||
consumer:
|
||||
image: paperless
|
||||
volumes:
|
||||
- data:/usr/src/paperless/data
|
||||
- media:/usr/src/paperless/media
|
||||
# You have to adapt the local path you want the consumption
|
||||
# directory to mount to by modifying the part before the ':'.
|
||||
- /path/to/arbitrary/place:/consume
|
||||
# Likewise, you can add a local path to mount a directory for
|
||||
# exporting. This is not strictly needed for paperless to
|
||||
# function, only if you're exporting your files: uncomment
|
||||
# it and fill in a local path if you know you're going to
|
||||
# want to export your documents.
|
||||
# - /path/to/another/arbitrary/place:/export
|
||||
env_file: docker-compose.env
|
||||
command: ["document_consumer"]
|
||||
|
||||
volumes:
|
||||
data:
|
||||
media:
|
18
docs/Dockerfile
Normal file
18
docs/Dockerfile
Normal file
@ -0,0 +1,18 @@
|
||||
FROM python:3.5.1
|
||||
MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
|
||||
|
||||
# Install Sphinx and Pygments
|
||||
RUN pip install Sphinx Pygments
|
||||
|
||||
# Setup directories, copy data
|
||||
RUN mkdir /build
|
||||
COPY . /build
|
||||
WORKDIR /build/docs
|
||||
|
||||
# Build documentation
|
||||
RUN make html
|
||||
|
||||
# Start webserver
|
||||
WORKDIR /build/docs/_build/html
|
||||
EXPOSE 8000/tcp
|
||||
CMD ["python3", "-m", "http.server"]
|
23
docs/api.rst
Normal file
23
docs/api.rst
Normal file
@ -0,0 +1,23 @@
|
||||
.. _api:
|
||||
|
||||
The REST API
|
||||
############
|
||||
|
||||
Paperless makes use of the `Django REST Framework`_ standard API interface
|
||||
because of its inherent awesomeness. Conveniently, the system is also
|
||||
self-documenting, so to learn more about the access points, schema, what's
|
||||
accepted and what isn't, you need only visit ``/api`` on your local Paperless
|
||||
installation.
|
||||
|
||||
.. _Django REST Framework: http://django-rest-framework.org/
|
||||
|
||||
|
||||
.. _api-uploading:
|
||||
|
||||
Uploading
|
||||
---------
|
||||
|
||||
File uploads in an API are hard and so far as I've been able to tell, there's
|
||||
no standard way of accepting them, so rather than crowbar file uploads into the
|
||||
REST API and endure that headache, I've left that process to a simple HTTP
|
||||
POST, documented on the :ref:`consumption page <consumption-http>`.
|
@ -1,10 +1,51 @@
|
||||
Changelog
|
||||
#########
|
||||
|
||||
* 0.1.1
|
||||
|
||||
* Potentially **Breaking Change**: All references to "sender" in the code
|
||||
have been renamed to "correspondent" to better reflect the nature of the
|
||||
property (one could quite reasonably scan a document before sending it to
|
||||
someone.)
|
||||
* `#67`_: Rewrote the document exporter and added a new importer that allows
|
||||
for full metadata retention without depending on the file name and
|
||||
modification time. A big thanks to `Tikitu de Jager`_, `Pit`_,
|
||||
`Florian Jung`_, and `Christopher Luu`_ for their code snippets and
|
||||
contributing conversation that led to this change.
|
||||
* `#20`_: Added *unpaper* support to help in cleaning up the scanned image
|
||||
before it's OCR'd. Thanks to `Pit`_ for this one.
|
||||
* `#71`_ Added (encrypted) thumbnails in anticipation of a proper UI.
|
||||
* `#68`_: Added support for using a proper config file at
|
||||
``/etc/paperless.conf`` and modified the systemd unit files to use it.
|
||||
* Refactored the Vagrant installation process to use environment variables
|
||||
rather than asking the user to modify ``settings.py``.
|
||||
* `#44`_: Harmonise environment variable names with constant names.
|
||||
* `#60`_: Setup logging to actually use the Python native logging framework.
|
||||
* `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
|
||||
to be imported but made unavailable.
|
||||
|
||||
* 0.1.0
|
||||
|
||||
* Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and
|
||||
`Tikitu de Jager`_ for this one, and especially to `Pit`_
|
||||
who spearheaded this effort.
|
||||
* A simple REST API is in place, but it should be considered unstable.
|
||||
* Cleaned up the consumer to use temporary directories instead of a single
|
||||
scratch space. (Thanks `Pit`_)
|
||||
* Improved the efficiency of the consumer by parsing pages more intelligently
|
||||
and introducing a threaded OCR process (thanks again `Pit`_).
|
||||
* `#45`_: Cleaned up the logic for tag matching. Reported by `darkmatter`_.
|
||||
* `#47`_: Auto-rotate landscape documents. Reported by `Paul`_ and fixed by
|
||||
`Pit`_.
|
||||
* `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
|
||||
* `#54`_: Documented the re-tagger (`zedster`_)
|
||||
* `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
|
||||
* Added tox with pep8 checking
|
||||
|
||||
* 0.0.6
|
||||
|
||||
* Added support for parallel OCR (significant work from pitkley)
|
||||
* Sped up the language detection (significant work from pitkley)
|
||||
* Added support for parallel OCR (significant work from `Pit`_)
|
||||
* Sped up the language detection (significant work from `Pit`_)
|
||||
* Added simple logging
|
||||
|
||||
* 0.0.5
|
||||
@ -35,3 +76,26 @@ Changelog
|
||||
* 0.0.1
|
||||
|
||||
* Initial release
|
||||
|
||||
.. _Brian Conn: https://github.com/TheConnMan
|
||||
.. _Christopher Luu: https://github.com/nuudles
|
||||
.. _Florian Jung: https://github.com/the01
|
||||
.. _Tikitu de Jager: https://github.com/tikitu
|
||||
.. _Paul: https://github.com/polo2ro
|
||||
.. _Pit: https://github.com/pitkley
|
||||
.. _Wayne Werner: https://github.com/waynew
|
||||
.. _darkmatter: https://github.com/darkmatter
|
||||
.. _zedster: https://github.com/zedster
|
||||
|
||||
.. _#20: https://github.com/danielquinn/paperless/issues/20
|
||||
.. _#44: https://github.com/danielquinn/paperless/issues/44
|
||||
.. _#45: https://github.com/danielquinn/paperless/issues/45
|
||||
.. _#47: https://github.com/danielquinn/paperless/issues/47
|
||||
.. _#48: https://github.com/danielquinn/paperless/issues/48
|
||||
.. _#53: https://github.com/danielquinn/paperless/issues/53
|
||||
.. _#54: https://github.com/danielquinn/paperless/issues/54
|
||||
.. _#57: https://github.com/danielquinn/paperless/issues/57
|
||||
.. _#60: https://github.com/danielquinn/paperless/issues/60
|
||||
.. _#67: https://github.com/danielquinn/paperless/issues/67
|
||||
.. _#68: https://github.com/danielquinn/paperless/issues/68
|
||||
.. _#71: https://github.com/danielquinn/paperless/issues/71
|
||||
|
@ -40,14 +40,14 @@ follow the :ref:`consumer <utilities-consumer>` instructions to get it running.
|
||||
A Note on File Naming
|
||||
---------------------
|
||||
|
||||
Any document you put into the consumption directory will be consumed, but if you
|
||||
name the file right, it'll automatically set some values in the database for
|
||||
you. This is is the logic the consumer follows:
|
||||
Any document you put into the consumption directory will be consumed, but if
|
||||
you name the file right, it'll automatically set some values in the database
|
||||
for you. This is the logic the consumer follows:
|
||||
|
||||
1. Try to find the sender, title, and tags in the file name following the
|
||||
pattern: ``Sender - Title - tag,tag,tag.pdf``.
|
||||
2. If that doesn't work, try to find the sender and title in the file name
|
||||
following the pattern: ``Sender - Title.pdf``.
|
||||
1. Try to find the correspondent, title, and tags in the file name following
|
||||
the pattern: ``Correspondent - Title - tag,tag,tag.pdf``.
|
||||
2. If that doesn't work, try to find the correspondent and title in the file
|
||||
name following the pattern: ``Correspondent - Title.pdf``.
|
||||
3. If that doesn't work, just assume that the name of the file is the title.
|
||||
|
||||
So given the above, the following examples would work as you'd expect:
|
||||
@ -97,9 +97,9 @@ So, with all that in mind, here's what you do to get it running:
|
||||
the configured email account every 10 minutes for something new and pull down
|
||||
whatever it finds.
|
||||
4. Send yourself an email! Note that the subject is treated as the file name,
|
||||
so if you set the subject to ``Sender - Title - tag,tag,tag``, you'll get
|
||||
what you expect. Also, you must include the aforementioned secret string in
|
||||
every email so the fetcher knows that it's safe to import.
|
||||
so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll
|
||||
get what you expect. Also, you must include the aforementioned secret
|
||||
string in every email so the fetcher knows that it's safe to import.
|
||||
5. After a few minutes, the consumer will poll your mailbox, pull down the
|
||||
message, and place the attachment in the consumption directory with the
|
||||
appropriate name. A few minutes later, the consumer will import it like any
|
||||
@ -111,23 +111,22 @@ So, with all that in mind, here's what you do to get it running:
|
||||
HTTP POST
|
||||
=========
|
||||
|
||||
Currently, the API is limited to only handling file uploads, it doesn't do tags
|
||||
yet, and the URL schema isn't concrete, but it's a start. It's also not much of
|
||||
a real API, it's just a URL that accepts an HTTP POST.
|
||||
You can also submit a document via HTTP POST. It doesn't do tags yet, and the
|
||||
URL schema isn't concrete, but it's a start.
|
||||
|
||||
To push your document to *Paperless*, send an HTTP POST to the server with the
|
||||
To push your document to Paperless, send an HTTP POST to the server with the
|
||||
following name/value pairs:
|
||||
|
||||
* ``sender``: The name of the document's sender. Note that there are
|
||||
restrictions on what characters you can use here. Specifically, alphanumeric
|
||||
characters, `-`, `,`, `.`, and `'` are ok, everything else it out. You also
|
||||
can't use the sequence ` - ` (space, dash, space).
|
||||
* ``correspondent``: The name of the document's correspondent. Note that there
|
||||
are restrictions on what characters you can use here. Specifically,
|
||||
alphanumeric characters, `-`, `,`, `.`, and `'` are ok, everything else is
|
||||
out. You also can't use the sequence ` - ` (space, dash, space).
|
||||
* ``title``: The title of the document. The rules for characters are the same
|
||||
here as the sender.
|
||||
* ``signature``: For security reasons, we have the sender send a signature using
|
||||
a "shared secret" method to make sure that random strangers don't start
|
||||
uploading stuff to your server. The means of generating this signature is
|
||||
defined below.
|
||||
here as the correspondent.
|
||||
* ``signature``: For security reasons, we have the correspondent send a
|
||||
signature using a "shared secret" method to make sure that random strangers
|
||||
don't start uploading stuff to your server. The means of generating this
|
||||
signature is defined below.
|
||||
|
||||
Specify ``enctype="multipart/form-data"``, and then POST your file with::
|
||||
|
||||
@ -146,12 +145,12 @@ verification.
|
||||
|
||||
In the case of *Paperless*, you configure the server with the secret by setting
|
||||
``UPLOAD_SHARED_SECRET``. Then on your client, you generate your signature by
|
||||
concatenating the sender, title, and the secret, and then using sha256 to
|
||||
generate a hexdigest.
|
||||
concatenating the correspondent, title, and the secret, and then using sha256
|
||||
to generate a hexdigest.
|
||||
|
||||
If you're using Python, this is what that looks like:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from hashlib import sha256
|
||||
signature = sha256(sender + title + secret).hexdigest()
|
||||
signature = sha256(correspondent + title + secret).hexdigest()
|
||||
|
@ -30,6 +30,7 @@ Contents
|
||||
requirements
|
||||
setup
|
||||
consumption
|
||||
api
|
||||
utilities
|
||||
migrating
|
||||
changelog
|
||||
|
@ -4,31 +4,10 @@ Migrating, Updates, and Backups
|
||||
===============================
|
||||
|
||||
As *Paperless* is still under active development, there's a lot that can change
|
||||
as software updates roll out. The thing you just need to remember for all of
|
||||
this is that for the most part, **the database is expendable** so long as you
|
||||
have your files. This is because the file name of the exported files includes
|
||||
the name of the sender, the title, and the tags (if any) on each file.
|
||||
|
||||
|
||||
.. _migrating-updates:
|
||||
|
||||
Updates
|
||||
-------
|
||||
|
||||
For the most part, all you have to do to update *Paperless* is run ``git pull``
|
||||
on the directory containing the project files, and then use Django's ``migrate``
|
||||
command to execute any database schema updates that might have been rolled in
|
||||
as part of the update:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ cd /path/to/project
|
||||
$ git pull
|
||||
$ cd src
|
||||
$ ./manage.py migrate
|
||||
|
||||
Note that it's possible (even likely) that while ``git pull`` may update some
|
||||
files, the ``migrate`` step may not update anything. This is totally normal.
|
||||
as software updates roll out. You should back up often, so if anything goes
|
||||
wrong during an update, you at least have a means of restoring to something
|
||||
usable. Thankfully, there are automated ways of backing up, restoring, and
|
||||
updating the software.
|
||||
|
||||
|
||||
.. _migrating-backup:
|
||||
@ -38,20 +17,8 @@ Backing Up
|
||||
|
||||
So you're bored of this whole project, or you want to make a remote backup of
|
||||
the unencrypted files for whatever reason. This is easy to do, simply use the
|
||||
:ref:`exporter <utilities-exporter>` to dump your documents out into an
|
||||
arbitrary directory.
|
||||
|
||||
Additionally however, you'll need to back up the tags themselves. The file
|
||||
names contain the tag names, but you still need to define the tags and their
|
||||
matching algorithms in the database for things to work properly. We do this
|
||||
with Django's ``dumpdata`` command, which produces JSON output.
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ cd /path/to/project
|
||||
$ cd src
|
||||
$ ./manage.py document_export /path/to/arbitrary/place/
|
||||
$ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json
|
||||
:ref:`exporter <utilities-exporter>` to dump your documents and database out
|
||||
into an arbitrary directory.
|
||||
|
||||
|
||||
.. _migrating-restoring:
|
||||
@ -66,7 +33,7 @@ create an empty database (just follow the
|
||||
``tags.json`` file you created as part of your backup. Lastly, copy your
|
||||
exported documents into the consumption directory and start up the consumer.
|
||||
|
||||
.. code:: bash
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ cd /path/to/project
|
||||
$ rm data/db.sqlite3 # Delete the database
|
||||
@ -77,3 +44,60 @@ exported documents into the consumption directory and start up the consumer.
|
||||
$ cp /path/to/exported/docs/* /path/to/consumption/dir/
|
||||
$ ./manage.py document_consumer
|
||||
|
||||
Importing your data if you are :ref:`using Docker <setup-installation-docker>`
|
||||
is almost as simple:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
# Stop and remove your current containers
|
||||
$ docker-compose stop
|
||||
$ docker-compose rm -f
|
||||
|
||||
# Recreate them, add the superuser
|
||||
$ docker-compose up -d
|
||||
$ docker-compose run --rm webserver createsuperuser
|
||||
|
||||
# Load the tags
|
||||
$ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin -
|
||||
|
||||
# Load your exported documents into the consumption directory
|
||||
# (How you do this highly depends on how you have set this up)
|
||||
$ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/
|
||||
|
||||
After loading the documents into the consumption directory the consumer will
|
||||
immediately start consuming the documents.
|
||||
|
||||
|
||||
.. _migrating-updates:
|
||||
|
||||
Updates
|
||||
-------
|
||||
|
||||
For the most part, all you have to do to update *Paperless* is run ``git pull``
|
||||
on the directory containing the project files, and then use Django's ``migrate``
|
||||
command to execute any database schema updates that might have been rolled in
|
||||
as part of the update:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ cd /path/to/project
|
||||
$ git pull
|
||||
$ cd src
|
||||
$ ./manage.py migrate
|
||||
|
||||
Note that it's possible (even likely) that while ``git pull`` may update some
|
||||
files, the ``migrate`` step may not update anything. This is totally normal.
|
||||
|
||||
If you are :ref:`using Docker <setup-installation-docker>` the update process
|
||||
requires only one additional step:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ cd /path/to/project
|
||||
$ git pull
|
||||
$ docker build -t paperless .
|
||||
$ docker-compose up -d
|
||||
$ docker-compose run --rm webserver migrate
|
||||
|
||||
If ``git pull`` doesn't report any changes, there is no need to continue with
|
||||
the remaining steps.
|
||||
|
@ -10,11 +10,13 @@ should work) that has the following software installed on it:
|
||||
* `GNU Privacy Guard`_
|
||||
* `Tesseract`_
|
||||
* `Imagemagick`_
|
||||
* `unpaper`_
|
||||
|
||||
.. _Python3: https://python.org/
|
||||
.. _GNU Privacy Guard: https://gnupg.org
|
||||
.. _Tesseract: https://github.com/tesseract-ocr
|
||||
.. _Imagemagick: http://imagemagick.org/
|
||||
.. _unpaper: https://www.flameeyes.eu/projects/unpaper
|
||||
|
||||
Notably, you should confirm how you access your Python3 installation. Many
|
||||
Linux distributions will install Python3 in parallel to Python2, using the names
|
||||
@ -101,3 +103,16 @@ you'd like to generate your own docs locally, you'll need to:
|
||||
$ pip install sphinx
|
||||
|
||||
and then cd into the ``docs`` directory and type ``make html``.
|
||||
|
||||
If you are using Docker, you can use the following commands to build the
|
||||
documentation and run a webserver serving it on `port 8001`_:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ pwd
|
||||
/path/to/paperless
|
||||
|
||||
$ docker build -t paperless:docs -f docs/Dockerfile .
|
||||
$ docker run --rm -it -p "8001:8000" paperless:docs
|
||||
|
||||
.. _port 8001: http://127.0.0.1:8001
|
||||
|
215
docs/setup.rst
215
docs/setup.rst
@ -37,11 +37,19 @@ or just download the tarball and go that route:
|
||||
Installation & Configuration
|
||||
----------------------------
|
||||
|
||||
You can go two routes with setting up and running Paperless. The *Vagrant*
|
||||
route is quick & easy, but means you're running a VM which comes with memory
|
||||
consumption etc. Alternatively the standard, "bare metal" approach is a little
|
||||
more complicated.
|
||||
You can go multiple routes with setting up and running Paperless. The `Vagrant
|
||||
route`_ is quick & easy, but means you're running a VM which comes with memory
|
||||
consumption etc. We also `support Docker`_, which you can use natively under
|
||||
Linux and in a VM with `Docker Machine`_ (this guide was written for native
|
||||
Docker usage under Linux, you might have to adapt it for Docker Machine.)
|
||||
Alternatively the standard, `bare metal`_ approach is a little more complicated,
|
||||
but worth it because it makes it easier should you want to contribute some
|
||||
code back.
|
||||
|
||||
.. _Vagrant route: setup-installation-vagrant_
|
||||
.. _support Docker: setup-installation-docker_
|
||||
.. _bare metal: setup-installation-standard_
|
||||
.. _Docker Machine: https://docs.docker.com/machine/
|
||||
|
||||
.. _setup-installation-standard:
|
||||
|
||||
@ -91,33 +99,188 @@ Vagrant Method
|
||||
2. Run ``vagrant up``. An instance will start up for you. When it's ready and
|
||||
provisioned...
|
||||
3. Run ``vagrant ssh`` and once inside your new vagrant box, edit
|
||||
``/opt/paperless/src/paperless/settings.py`` and set the values for:
|
||||
* ``CONSUMPTION_DIR``: this is where your documents will be dumped to be
|
||||
consumed by Paperless.
|
||||
* ``PASSPHRASE``: this is the passphrase Paperless uses to encrypt/decrypt
|
||||
the original document. The default value attempts to source the
|
||||
passphrase from the environment, so if you don't set it to a static value
|
||||
here, you must set ``PAPERLESS_PASSPHRASE=some-secret-string`` on the
|
||||
command line whenever invoking the consumer or webserver.
|
||||
4. Initialise the database with ``/opt/paperless/src/manage.py migrate``.
|
||||
5. Still inside your vagrant box, create a user for your Paperless instance with
|
||||
``/opt/paperless/src/manage.py createsuperuser``. Follow the prompts to
|
||||
``/etc/paperless.conf`` and set the values for:
|
||||
* ``PAPERLESS_CONSUMPTION_DIR``: this is where your documents will be
|
||||
dumped to be consumed by Paperless.
|
||||
* ``PAPERLESS_PASSPHRASE``: this is the passphrase Paperless uses to
|
||||
encrypt/decrypt the original document.
|
||||
* ``PAPERLESS_SHARED_SECRET``: this is the "magic word" used when consuming
|
||||
documents from mail or via the API. If you don't use either, leaving it
|
||||
blank is just fine.
|
||||
4. Exit the vagrant box and re-enter it with ``vagrant ssh`` again. This
|
||||
updates the environment to make use of the changes you made to the config
|
||||
file.
|
||||
5. Initialise the database with ``/opt/paperless/src/manage.py migrate``.
|
||||
6. Still inside your vagrant box, create a user for your Paperless instance
|
||||
with ``/opt/paperless/src/manage.py createsuperuser``. Follow the prompts to
|
||||
create your user.
|
||||
6. Start the webserver with ``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``.
|
||||
You should now be able to visit your (empty) `Paperless webserver`_ at
|
||||
``172.28.128.4:8000``. You can login with the user/pass you created in #5.
|
||||
7. In a separate window, run ``vagrant ssh`` again, but this time once inside
|
||||
7. Start the webserver with
|
||||
``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``. You should now be
|
||||
able to visit your (empty) `Paperless webserver`_ at ``172.28.128.4:8000``.
|
||||
You can login with the user/pass you created in #6.
|
||||
8. In a separate window, run ``vagrant ssh`` again, but this time once inside
|
||||
your vagrant instance, you should start the consumer script with
|
||||
``/opt/paperless/src/manage.py document_consumer``.
|
||||
8. Scan something. Put it in the ``CONSUMPTION_DIR``.
|
||||
9. Wait a few minutes
|
||||
10. Visit the document list on your webserver, and it should be there, indexed
|
||||
9. Scan something. Put it in the ``CONSUMPTION_DIR``.
|
||||
10. Wait a few minutes
|
||||
11. Visit the document list on your webserver, and it should be there, indexed
|
||||
and downloadable.
|
||||
|
||||
.. _Vagrant: https://vagrantup.com/
|
||||
.. _Paperless server: http://172.28.128.4:8000
|
||||
|
||||
|
||||
.. _setup-installation-docker:
|
||||
|
||||
Docker Method
|
||||
.............
|
||||
|
||||
1. Install `Docker`_.
|
||||
|
||||
.. caution::
|
||||
|
||||
As mentioned earlier, this guide assumes that you use Docker natively
|
||||
under Linux. If you are using `Docker Machine`_ under Mac OS X or Windows,
|
||||
you will have to adapt IP addresses, volume-mounting, command execution
|
||||
and maybe more.
|
||||
|
||||
2. Install `docker-compose`_. [#compose]_
|
||||
|
||||
.. caution::
|
||||
|
||||
If you want to use the included ``docker-compose.yml.example`` file, you
|
||||
need to have at least Docker version **1.10.0** and docker-compose
|
||||
version **1.6.0**.
|
||||
|
||||
See the `Docker installation guide`_ on how to install the current
|
||||
version of Docker for your operating system or Linux distribution of
|
||||
choice. To get an up-to-date version of docker-compose, follow the
|
||||
`docker-compose installation guide`_ if your package repository doesn't
|
||||
include it.
|
||||
|
||||
.. _Docker installation guide: https://docs.docker.com/engine/installation/
|
||||
.. _docker-compose installation guide: https://docs.docker.com/compose/install/
|
||||
|
||||
3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml`` and
|
||||
a copy of ``docker-compose.env.example`` as ``docker-compose.env``. You'll be
|
||||
editing both these files: taking a copy ensures that you can ``git pull`` to
|
||||
receive updates without risking merge conflicts with your modified versions
|
||||
of the configuration files.
|
||||
4. Modify ``docker-compose.yml`` to your preferences, following the instructions
|
||||
in comments in the file. The only change that is a hard requirement is to
|
||||
specify where the consumption directory should mount.
|
||||
5. Modify ``docker-compose.env`` and adapt the following environment variables:
|
||||
|
||||
``PAPERLESS_PASSPHRASE``
|
||||
This is the passphrase Paperless uses to encrypt/decrypt the original
|
||||
document.
|
||||
|
||||
``PAPERLESS_OCR_THREADS``
|
||||
This is the number of threads the OCR process will spawn to process
|
||||
document pages in parallel. If the variable is not set, Python determines
|
||||
the core-count of your CPU and uses that value.
|
||||
|
||||
``PAPERLESS_OCR_LANGUAGES``
|
||||
If you want the OCR to recognize other languages in addition to the default
|
||||
English, set this parameter to a space separated list of three-letter
|
||||
language-codes after `ISO 639-2/T`_. For a list of available languages --
|
||||
including their three letter codes -- see the `Debian packagelist`_.
|
||||
|
||||
``USERMAP_UID`` and ``USERMAP_GID``
|
||||
If you want to mount the consumption volume (directory ``/consume`` within
|
||||
the containers) to a host-directory -- which you probably want to do --
|
||||
access rights might be an issue. The default user and group ``paperless``
|
||||
in the containers have an id of 1000. The containers will enforce that the
|
||||
owning group of the consumption directory will be ``paperless`` to be able
|
||||
to delete consumed documents. If your host-system has a group with an id of
|
||||
1000 and you don't want this group to have access rights to the consumption
|
||||
directory, you can use ``USERMAP_GID`` to change the id in the container
|
||||
and thus the one of the consumption directory. Furthermore, you can change
|
||||
the id of the default user as well using ``USERMAP_UID``.
|
||||
|
||||
6. Run ``docker-compose up -d``. This will create and start the necessary
|
||||
containers.
|
||||
7. To be able to login, you will need a super user. To create it, execute the
|
||||
following command:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ docker-compose run --rm webserver createsuperuser
|
||||
|
||||
This will prompt you to set a username (default ``paperless``), an optional
|
||||
e-mail address and finally a password.
|
||||
8. The default ``docker-compose.yml`` exports the webserver on your local port
|
||||
8000. If you haven't adapted this, you should now be able to visit your
|
||||
`Paperless webserver`_ at ``http://127.0.0.1:8000``. You can login with the
|
||||
user and password you just created.
|
||||
9. Add files to the consumption directory the way you prefer to. Following are two
|
||||
possible options:
|
||||
|
||||
1. Mount the consumption directory to a local host path by modifying your
|
||||
``docker-compose.yml``:
|
||||
|
||||
.. code-block:: diff
|
||||
|
||||
diff --git a/docker-compose.yml b/docker-compose.yml
|
||||
--- a/docker-compose.yml
|
||||
+++ b/docker-compose.yml
|
||||
@@ -17,9 +18,8 @@ services:
|
||||
volumes:
|
||||
- paperless-data:/usr/src/paperless/data
|
||||
- paperless-media:/usr/src/paperless/media
|
||||
- - /consume
|
||||
+ - /local/path/you/choose:/consume
|
||||
|
||||
.. danger::
|
||||
|
||||
While the consumption container will ensure at startup that it can
|
||||
**delete** a consumed file from a host-mounted directory, it might not
|
||||
be able to **read** the document in the first place if the access
|
||||
rights to the file are incorrect.
|
||||
|
||||
Make sure that the documents you put into the consumption directory
|
||||
will either be readable by everyone (``chmod o+r file.pdf``) or
|
||||
readable by the default user or group id 1000 (or the one you have set
|
||||
with ``USERMAP_UID`` or ``USERMAP_GID`` respectively).
|
||||
|
||||
2. Use ``docker cp`` to copy your files directly into the container:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ # Identify your containers
|
||||
$ docker-compose ps
|
||||
Name Command State Ports
|
||||
-------------------------------------------------------------------------
|
||||
paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0
|
||||
paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0
|
||||
|
||||
$ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume
|
||||
|
||||
``docker cp`` is a one-shot-command, just like ``cp``. This means that
|
||||
every time you want to consume a new document, you will have to execute
|
||||
``docker cp`` again. You can of course automate this process, but option 1
|
||||
is generally the preferred one.
|
||||
|
||||
.. danger::
|
||||
|
||||
``docker cp`` will change the owning user and group of a copied file
|
||||
to the acting user at the destination, which will be ``root``.
|
||||
|
||||
You therefore need to ensure that the documents you want to copy into
|
||||
the container are readable by everyone (``chmod o+r file.pdf``) before
|
||||
copying them.
|
||||
|
||||
|
||||
.. _Docker: https://www.docker.com/
|
||||
.. _docker-compose: https://docs.docker.com/compose/install/
|
||||
.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
|
||||
.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr-
|
||||
|
||||
.. [#compose] You of course don't have to use docker-compose, but it
|
||||
simplifies deployment immensely. If you know your way around Docker, feel
|
||||
free to tinker around without using compose!
|
||||
|
||||
|
||||
.. _making-things-a-little-more-permanent:
|
||||
|
||||
Making Things a Little more Permanent
|
||||
@ -126,5 +289,9 @@ Making Things a Little more Permanent
|
||||
Once you've tested things and are happy with the work flow, you can automate the
|
||||
process of starting the webserver and consumer. If you're running
|
||||
on a bare metal system that's using Systemd, you can use the service unit files
|
||||
in the ``scripts`` directory to set this up. If you're on a SysV or other
|
||||
startup system (like the Vagrant box), then you're currently on your own.
|
||||
in the ``scripts`` directory to set this up. If you're on another startup
|
||||
system or are using a Vagrant box, then you're currently on your own. If you are
|
||||
using Docker, you can set a restart-policy_ in the ``docker-compose.yml`` to
|
||||
have the containers automatically start with the Docker daemon.
|
||||
|
||||
.. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart
|
||||
|
@ -26,7 +26,7 @@ How to Use It
|
||||
|
||||
The webserver is started via the ``manage.py`` script:
|
||||
|
||||
.. code:: bash
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ /path/to/paperless/src/manage.py runserver
|
||||
|
||||
@ -64,7 +64,7 @@ How to Use It
|
||||
|
||||
The consumer is started via the ``manage.py`` script:
|
||||
|
||||
.. code:: bash
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_consumer
|
||||
|
||||
@ -95,13 +95,110 @@ How to Use It
|
||||
|
||||
This too is done via the ``manage.py`` script:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/
|
||||
|
||||
This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you
|
||||
to do with as you please. The files are accompanied with a special file,
|
||||
``manifest.json`` which can be used to
|
||||
:ref:`import the files <utilities-importer>` at a later date if you wish.
|
||||
|
||||
|
||||
.. _utilities-exporter-howto-docker:
|
||||
|
||||
Docker
|
||||
______
|
||||
|
||||
If you are :ref:`using Docker <setup-installation-docker>`, running the
|
||||
exporter is almost as easy. To mount a volume for exports, follow the
|
||||
instructions in the ``docker-compose.yml.example`` file for the ``/export``
|
||||
volume (making the changes in your own ``docker-compose.yml`` file, of course).
|
||||
Once you have the volume mounted, the command to run an export is:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ docker-compose run --rm consumer document_exporter /export
|
||||
|
||||
If you prefer to use ``docker run`` directly, supply the necessary command-line
|
||||
options:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ # Identify your containers
|
||||
$ docker-compose ps
|
||||
Name Command State Ports
|
||||
-------------------------------------------------------------------------
|
||||
paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0
|
||||
paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0
|
||||
|
||||
$ # Make sure to replace your passphrase and remove or adapt the id mapping
|
||||
$ docker run --rm \
|
||||
--volumes-from paperless_data_1 \
|
||||
--volume /path/to/arbitrary/place:/export \
|
||||
-e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \
|
||||
-e USERMAP_UID=1000 -e USERMAP_GID=1000 \
|
||||
paperless document_exporter /export
|
||||
|
||||
|
||||
.. _utilities-importer:
|
||||
|
||||
The Importer
|
||||
------------
|
||||
|
||||
Looking to transfer Paperless data from one instance to another, or just want
|
||||
to restore from a backup? This is your go-to toy.
|
||||
|
||||
|
||||
.. _utilities-importer-howto:
|
||||
|
||||
How to Use It
|
||||
.............
|
||||
|
||||
The importer works just like the exporter. You point it at a directory, and
|
||||
the script does the rest of the work:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_importer /path/to/somewhere/
|
||||
|
||||
Docker
|
||||
______
|
||||
|
||||
Assuming that you've already gone through the steps above in the
|
||||
:ref:`export <utilities-exporter-howto-docker>` section, then the easiest thing
|
||||
to do is just re-use the ``/export`` path you already set up:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ docker-compose run --rm consumer document_importer /export
|
||||
|
||||
Similarly, if you're not using docker-compose, you can adjust the export
|
||||
instructions above to do the import.
|
||||
|
||||
|
||||
.. _utilities-retagger:
|
||||
|
||||
The Re-tagger
|
||||
-------------
|
||||
|
||||
Say you've imported a few hundred documents and now want to introduce a tag
|
||||
and apply its matching to all of the currently-imported docs. This problem is
|
||||
common enough that there's a tool for it.
|
||||
|
||||
|
||||
.. _utilities-retagger-howto:
|
||||
|
||||
How to Use It
|
||||
.............
|
||||
|
||||
This too is done via the ``manage.py`` script:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere
|
||||
$ /path/to/paperless/src/manage.py document_retagger
|
||||
|
||||
This will dump all of your PDFs into ``/path/to/somewhere`` for you to do with
|
||||
as you please. The naming scheme on export is identical to that used for
|
||||
import, so should you can now safely delete the entire project directly,
|
||||
database, encrypted PDFs and all, and later create it all again simply by
|
||||
running the consumer again and dumping all of these files into
|
||||
``CONSUMPTION_DIR``.
|
||||
That's it. It'll loop over all of the documents in your database and attempt
|
||||
to match all of your tags to them. If one matches, it'll be applied. And
|
||||
don't worry, you can run this as often as you like, it won't double-tag
|
||||
a document.
|
||||
|
33
paperless.conf.example
Normal file
33
paperless.conf.example
Normal file
@ -0,0 +1,33 @@
|
||||
# Sample paperless.conf
|
||||
# Copy this file to /etc/paperless.conf and modify it to suit your needs.
|
||||
|
||||
# This is where your documents should go to be consumed. Make sure that it exists
|
||||
# and that the user running the paperless service can read/write its contents
|
||||
# before you start Paperless.
|
||||
PAPERLESS_CONSUMPTION_DIR=""
|
||||
|
||||
# These values are required if you want paperless to check a particular email
|
||||
# box every 10 minutes and attempt to consume documents from there. If you
|
||||
# don't define a HOST, mail checking will just be disabled.
|
||||
PAPERLESS_CONSUME_MAIL_HOST=""
|
||||
PAPERLESS_CONSUME_MAIL_PORT=""
|
||||
PAPERLESS_CONSUME_MAIL_USER=""
|
||||
PAPERLESS_CONSUME_MAIL_PASS=""
|
||||
|
||||
# You must have a passphrase in order for Paperless to work at all. If you set
|
||||
# this to "", GnuPG will "encrypt" your PDF by writing it out as a zero-byte
|
||||
# file.
|
||||
#
|
||||
# The passphrase you use here will be used when storing your documents in
|
||||
# Paperless, but you can always export them in an unencrypted format by using
|
||||
# the document exporter. See the documentation for more information.
|
||||
#
|
||||
# One final note about the passphrase. Once you've consumed a document with
|
||||
# one passphrase, DON'T CHANGE IT. Paperless assumes this to be a constant and
|
||||
# can't properly export documents that were encrypted with an old passphrase if
|
||||
# you've since changed it to a new one.
|
||||
PAPERLESS_PASSPHRASE="secret"
|
||||
|
||||
# If you intend to consume documents either via HTTP POST or by email, you must
|
||||
# have a shared secret here.
|
||||
PAPERLESS_SHARED_SECRET=""
|
@ -1,8 +1,10 @@
|
||||
Django==1.9
|
||||
Django==1.9.2
|
||||
django-extensions==1.6.1
|
||||
djangorestframework==3.3.2
|
||||
python-dotenv==0.3.0
|
||||
filemagic==1.6
|
||||
langdetect==1.0.5
|
||||
Pillow==3.0.0
|
||||
Pillow==3.1.1
|
||||
pyocr==0.3.1
|
||||
python-dateutil==2.4.2
|
||||
python-gnupg==0.3.8
|
||||
|
74
scripts/docker-entrypoint.sh
Normal file
74
scripts/docker-entrypoint.sh
Normal file
@ -0,0 +1,74 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Source: https://github.com/sameersbn/docker-gitlab/
|
||||
map_uidgid() {
    # Remap the in-container "paperless" user and group to the numeric ids
    # requested through the USERMAP_UID / USERMAP_GID environment variables.
    #
    # Defaults: USERMAP_GID falls back to USERMAP_UID when only the latter is
    # given, and both fall back to the ids the account currently has, in which
    # case nothing is changed.
    USERMAP_ORIG_UID=$(id -u paperless)
    # Fixed: this used to be assigned to USERMAP_ORIG_UID again, which clobbered
    # the uid and left USERMAP_ORIG_GID unset for the lines below.
    USERMAP_ORIG_GID=$(id -g paperless)
    USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}}
    USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
    if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then
        echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
        groupmod -g "${USERMAP_GID}" paperless
        # The pattern matches the *new* gid: this assumes groupmod has already
        # rewritten the primary gid of the account in /etc/passwd.
        sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
    fi
}
|
||||
|
||||
set_permissions() {
    # Hand the consumption directory to the "paperless" group and make sure
    # that group can actually work in it. Removing a file needs write *and*
    # search permission on the containing directory, so g+x alone (the
    # previous mode change) was not enough to delete consumed documents.
    # PAPERLESS_CONSUMPTION_DIR is expected to be provided by the environment.
    chgrp paperless "$PAPERLESS_CONSUMPTION_DIR"
    chmod g+wx "$PAPERLESS_CONSUMPTION_DIR"

    # The application tree is owned entirely by paperless; -h changes
    # symlinks themselves instead of the files they point to.
    chown -Rh paperless:paperless /usr/src/paperless
}
|
||||
|
||||
initialize() {
    # One-time container preparation. The id remapping runs first so that the
    # ownership changes made afterwards resolve "paperless" to the final ids.
    map_uidgid
    set_permissions
}
|
||||
|
||||
install_languages() {
    # Install additional Tesseract language packs.
    #
    # $1 is a whitespace-separated list of language codes; each code is mapped
    # to the package name "tesseract-ocr-<code>". Packages that are already
    # installed or that the package index does not know are skipped.
    local langs="$1"
    local lang pkg
    read -ra langs <<<"$langs"

    # Check that it is not empty
    if [ ${#langs[@]} -eq 0 ]; then
        return
    fi

    # Update apt-lists
    apt-get update

    # Loop over languages to be installed
    for lang in "${langs[@]}"; do
        pkg="tesseract-ocr-$lang"

        # Redirection order matters: send stdout to /dev/null first, then
        # point stderr at it. The previous "2>&1 > /dev/null" left stderr on
        # the console.
        if dpkg -s "$pkg" > /dev/null 2>&1; then
            continue
        fi

        if ! apt-cache show "$pkg" > /dev/null 2>&1; then
            echo "Package $pkg not found, skipping" >&2
            continue
        fi

        # -y: there is no terminal to answer the confirmation prompt at
        # container start, so without it the install would abort.
        apt-get install -y "$pkg"
    done

    # Remove apt lists
    rm -rf /var/lib/apt/lists/*
}
|
||||
|
||||
|
||||
# Dispatch on the first argument. An absolute path is treated as an arbitrary
# command and run as-is; anything else is taken to be a manage.py sub-command
# and is run as the paperless user after the container has been prepared.
if [[ "$1" == "/"* ]]; then
    exec "$@"
fi

initialize

# Install additional languages if specified
if [[ -n "$PAPERLESS_OCR_LANGUAGES" ]]; then
    install_languages "$PAPERLESS_OCR_LANGUAGES"
fi

exec sudo -HEu paperless "/usr/src/paperless/src/manage.py" "$@"
|
||||
|
@ -2,10 +2,9 @@
|
||||
Description=Paperless consumer
|
||||
|
||||
[Service]
|
||||
EnvironmentFile=/etc/conf.d/paperless
|
||||
User=paperless
|
||||
Group=paperless
|
||||
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer -v $PAPERLESS_CONSUMPTION_VERBOSITY
|
||||
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
@ -2,7 +2,6 @@
|
||||
Description=Paperless webserver
|
||||
|
||||
[Service]
|
||||
EnvironmentFile=/etc/conf.d/paperless
|
||||
User=paperless
|
||||
Group=paperless
|
||||
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py runserver 0.0.0.0:8000
|
||||
|
@ -1,13 +1,31 @@
|
||||
#!/bin/bash
|
||||
|
||||
# install packages
|
||||
sudo apt-get update
|
||||
sudo apt-get build-dep -y python-imaging
|
||||
sudo apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
|
||||
sudo apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
|
||||
sudo apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick
|
||||
# Install packages
|
||||
apt-get update
|
||||
apt-get build-dep -y python-imaging
|
||||
apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
|
||||
apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
|
||||
apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick unpaper
|
||||
|
||||
# setup python project
|
||||
pushd /opt/paperless
|
||||
sudo pip3 install -r requirements.txt
|
||||
popd
|
||||
# Python dependencies
|
||||
pip3 install -r /opt/paperless/requirements.txt
|
||||
|
||||
# Create the environment file
|
||||
cat /opt/paperless/paperless.conf.example | sed -e 's#CONSUMPTION_DIR=""#CONSUMPTION_DIR="/home/vagrant/consumption"#' > /etc/paperless.conf
|
||||
chmod 0640 /etc/paperless.conf
|
||||
chown root:vagrant /etc/paperless.conf
|
||||
|
||||
# Create the consumption directory
|
||||
mkdir /home/vagrant/consumption
|
||||
chown vagrant:vagrant /home/vagrant/consumption
|
||||
|
||||
echo "
|
||||
|
||||
|
||||
Now follow the remaining steps in the Vagrant section of the setup
|
||||
documentation to complete the process:
|
||||
|
||||
http://paperless.readthedocs.org/en/latest/setup.html#setup-installation-vagrant
|
||||
|
||||
|
||||
"
|
||||
|
@ -3,7 +3,7 @@ from django.contrib.auth.models import User, Group
|
||||
from django.core.urlresolvers import reverse
|
||||
from django.templatetags.static import static
|
||||
|
||||
from .models import Sender, Tag, Document
|
||||
from .models import Correspondent, Tag, Document, Log
|
||||
|
||||
|
||||
class MonthListFilter(admin.SimpleListFilter):
|
||||
@ -45,39 +45,73 @@ class DocumentAdmin(admin.ModelAdmin):
|
||||
"all": ("paperless.css",)
|
||||
}
|
||||
|
||||
search_fields = ("sender__name", "title", "content")
|
||||
list_display = ("created", "sender", "title", "tags_", "document")
|
||||
list_filter = ("tags", "sender", MonthListFilter)
|
||||
search_fields = ("correspondent__name", "title", "content")
|
||||
list_display = ("created_", "correspondent", "title", "tags_", "document")
|
||||
list_filter = ("tags", "correspondent", MonthListFilter)
|
||||
list_per_page = 25
|
||||
|
||||
def created_(self, obj):
    """
    Render the creation timestamp of ``obj`` as an ISO-style calendar date
    (``YYYY-MM-DD``), dropping the time of day.
    """
    created_on = obj.created.date()
    return created_on.strftime("%Y-%m-%d")
|
||||
|
||||
def tags_(self, obj):
|
||||
r = ""
|
||||
for tag in obj.tags.all():
|
||||
r += '<a class="tag" style="background-color: {};" href="{}">{}</a>'.format(
|
||||
tag.get_colour_display(),
|
||||
"{}?tags__id__exact={}".format(
|
||||
reverse("admin:documents_document_changelist"),
|
||||
tag.pk
|
||||
),
|
||||
tag.slug
|
||||
colour = tag.get_colour_display()
|
||||
r += self._html_tag(
|
||||
"a",
|
||||
tag.slug,
|
||||
**{
|
||||
"class": "tag",
|
||||
"style": "background-color: {};".format(colour),
|
||||
"href": "{}?tags__id__exact={}".format(
|
||||
reverse("admin:documents_document_changelist"),
|
||||
tag.pk
|
||||
)
|
||||
}
|
||||
)
|
||||
return r
|
||||
tags_.allow_tags = True
|
||||
|
||||
def document(self, obj):
|
||||
return '<a href="{}">' \
|
||||
'<img src="{}" width="22" height="22" alt="{} icon" title="{}">' \
|
||||
'</a>'.format(
|
||||
obj.download_url,
|
||||
static("documents/img/{}.png".format(obj.file_type)),
|
||||
obj.file_type,
|
||||
obj.file_name
|
||||
)
|
||||
return self._html_tag(
|
||||
"a",
|
||||
self._html_tag(
|
||||
"img",
|
||||
src=static("documents/img/{}.png".format(obj.file_type)),
|
||||
width=22,
|
||||
height=22,
|
||||
alt=obj.file_type,
|
||||
title=obj.file_name
|
||||
),
|
||||
href=obj.download_url
|
||||
)
|
||||
document.allow_tags = True
|
||||
|
||||
admin.site.register(Sender)
|
||||
@staticmethod
|
||||
def _html_tag(kind, inside=None, **kwargs):
|
||||
|
||||
attributes = []
|
||||
for lft, rgt in kwargs.items():
|
||||
attributes.append('{}="{}"'.format(lft, rgt))
|
||||
|
||||
if inside is not None:
|
||||
return "<{kind} {attributes}>{inside}</{kind}>".format(
|
||||
kind=kind, attributes=" ".join(attributes), inside=inside)
|
||||
|
||||
return "<{} {}/>".format(kind, " ".join(attributes))
|
||||
|
||||
|
||||
class LogAdmin(admin.ModelAdmin):

    # Columns shown in the change list.
    list_display = ("message", "level", "component")
    # Sidebar filters offered for narrowing the list.
    list_filter = ("level", "component")
|
||||
|
||||
|
||||
admin.site.register(Correspondent)
|
||||
admin.site.register(Tag, TagAdmin)
|
||||
admin.site.register(Document, DocumentAdmin)
|
||||
admin.site.register(Log, LogAdmin)
|
||||
|
||||
|
||||
# Unless we implement multi-user, these default registrations don't make sense.
|
||||
admin.site.unregister(Group)
|
||||
|
@ -1,5 +1,8 @@
|
||||
import datetime
|
||||
import logging
|
||||
import tempfile
|
||||
import uuid
|
||||
|
||||
from multiprocessing.pool import Pool
|
||||
|
||||
import itertools
|
||||
@ -17,20 +20,14 @@ from PIL import Image
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.template.defaultfilters import slugify
|
||||
from pyocr.tesseract import TesseractError
|
||||
|
||||
from logger.models import Log
|
||||
from paperless.db import GnuPG
|
||||
|
||||
from .models import Sender, Tag, Document
|
||||
from .models import Correspondent, Tag, Document, Log
|
||||
from .languages import ISO639
|
||||
|
||||
|
||||
def image_to_string(args):
|
||||
self, png, lang = args
|
||||
with Image.open(os.path.join(self.SCRATCH, png)) as f:
|
||||
return self.OCR.image_to_string(f, lang=lang)
|
||||
|
||||
|
||||
class OCRError(Exception):
|
||||
pass
|
||||
|
||||
@ -42,8 +39,8 @@ class ConsumerError(Exception):
|
||||
class Consumer(object):
|
||||
"""
|
||||
Loop over every file found in CONSUMPTION_DIR and:
|
||||
1. Convert it to a greyscale png
|
||||
2. Use tesseract on the png
|
||||
1. Convert it to a greyscale pnm
|
||||
2. Use tesseract on the pnm
|
||||
3. Encrypt and store the document in the MEDIA_ROOT
|
||||
4. Store the OCR'd text in the database
|
||||
5. Delete the document and image(s)
|
||||
@ -51,28 +48,29 @@ class Consumer(object):
|
||||
|
||||
SCRATCH = settings.SCRATCH_DIR
|
||||
CONVERT = settings.CONVERT_BINARY
|
||||
UNPAPER = settings.UNPAPER_BINARY
|
||||
CONSUME = settings.CONSUMPTION_DIR
|
||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||
|
||||
OCR = pyocr.get_available_tools()[0]
|
||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||
|
||||
REGEX_TITLE = re.compile(
|
||||
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
|
||||
flags=re.IGNORECASE
|
||||
)
|
||||
REGEX_SENDER_TITLE = re.compile(
|
||||
REGEX_CORRESPONDENT_TITLE = re.compile(
|
||||
r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
|
||||
flags=re.IGNORECASE
|
||||
)
|
||||
REGEX_SENDER_TITLE_TAGS = re.compile(
|
||||
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
|
||||
r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
|
||||
flags=re.IGNORECASE
|
||||
)
|
||||
|
||||
def __init__(self, verbosity=1):
|
||||
def __init__(self):
|
||||
|
||||
self.verbosity = verbosity
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.logging_group = None
|
||||
|
||||
try:
|
||||
os.makedirs(self.SCRATCH)
|
||||
@ -92,6 +90,12 @@ class Consumer(object):
|
||||
raise ConsumerError(
|
||||
"Consumption directory {} does not exist".format(self.CONSUME))
|
||||
|
||||
def log(self, level, message):
    """
    Send ``message`` to this consumer's logger.

    ``level`` is the name of the logger method to call (for example
    ``"info"`` or ``"debug"``).  The current logging group and the consumer
    component identifier are attached through ``extra``.
    """
    emit = getattr(self.logger, level)
    context = {
        "group": self.logging_group,
        "component": Log.COMPONENT_CONSUMER,
    }
    emit(message, extra=context)
|
||||
|
||||
def consume(self):
|
||||
|
||||
for doc in os.listdir(self.CONSUME):
|
||||
@ -110,122 +114,156 @@ class Consumer(object):
|
||||
if self._is_ready(doc):
|
||||
continue
|
||||
|
||||
Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)
|
||||
self.logging_group = uuid.uuid4()
|
||||
|
||||
self.log("info", "Consuming {}".format(doc))
|
||||
|
||||
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
|
||||
pngs = self._get_greyscale(tempdir, doc)
|
||||
imgs = self._get_greyscale(tempdir, doc)
|
||||
thumbnail = self._get_thumbnail(tempdir, doc)
|
||||
|
||||
try:
|
||||
text = self._get_ocr(pngs)
|
||||
self._store(text, doc)
|
||||
except OCRError:
|
||||
text = self._get_ocr(imgs)
|
||||
self._store(text, doc, thumbnail)
|
||||
except OCRError as e:
|
||||
self._ignore.append(doc)
|
||||
Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
|
||||
self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
|
||||
self._cleanup_tempdir(tempdir)
|
||||
continue
|
||||
finally:
|
||||
self._cleanup(tempdir, doc)
|
||||
else:
|
||||
self._cleanup_tempdir(tempdir)
|
||||
self._cleanup_doc(doc)
|
||||
|
||||
def _get_greyscale(self, tempdir, doc):
|
||||
"""
|
||||
Greyscale images are easier for Tesseract to OCR
|
||||
"""
|
||||
|
||||
Log.debug(
|
||||
"Generating greyscale image from {}".format(doc),
|
||||
Log.COMPONENT_CONSUMER
|
||||
)
|
||||
|
||||
png = os.path.join(tempdir, "convert-%04d.jpg")
|
||||
self.log("info", "Generating greyscale image from {}".format(doc))
|
||||
|
||||
# Convert PDF to multiple PNMs
|
||||
pnm = os.path.join(tempdir, "convert-%04d.pnm")
|
||||
subprocess.Popen((
|
||||
self.CONVERT, "-density", "300", "-depth", "8",
|
||||
"-type", "grayscale", doc, png
|
||||
"-type", "grayscale", doc, pnm
|
||||
)).wait()
|
||||
|
||||
pngs = [os.path.join(tempdir, f) for f in os.listdir(tempdir) if f.startswith("convert")]
|
||||
return sorted(filter(lambda f: os.path.isfile(f), pngs))
|
||||
# Get a list of converted images
|
||||
pnms = []
|
||||
for f in os.listdir(tempdir):
|
||||
if f.endswith(".pnm"):
|
||||
pnms.append(os.path.join(tempdir, f))
|
||||
|
||||
@staticmethod
|
||||
def _guess_language(text):
|
||||
# Run unpaper in parallel on converted images
|
||||
with Pool(processes=self.THREADS) as pool:
|
||||
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
|
||||
|
||||
# Return list of converted images, processed with unpaper
|
||||
pnms = []
|
||||
for f in os.listdir(tempdir):
|
||||
if f.endswith(".unpaper.pnm"):
|
||||
pnms.append(os.path.join(tempdir, f))
|
||||
|
||||
return sorted(filter(lambda __: os.path.isfile(__), pnms))
|
||||
|
||||
def _get_thumbnail(self, tempdir, doc):
|
||||
"""
|
||||
The thumbnail of a PDF is just a 500px wide image of the first page.
|
||||
"""
|
||||
|
||||
self.log("info", "Generating the thumbnail")
|
||||
|
||||
subprocess.Popen((
|
||||
self.CONVERT,
|
||||
"-scale", "500x5000",
|
||||
"-alpha", "remove",
|
||||
doc,
|
||||
os.path.join(tempdir, "convert-%04d.png")
|
||||
)).wait()
|
||||
|
||||
return os.path.join(tempdir, "convert-0000.png")
|
||||
|
||||
def _guess_language(self, text):
|
||||
try:
|
||||
guess = langdetect.detect(text)
|
||||
Log.debug(
|
||||
"Language detected: {}".format(guess),
|
||||
Log.COMPONENT_CONSUMER
|
||||
)
|
||||
self.log("debug", "Language detected: {}".format(guess))
|
||||
return guess
|
||||
except Exception as e:
|
||||
Log.warning(
|
||||
"Language detection error: {}".format(e), Log.COMPONENT_MAIL)
|
||||
self.log("warning", "Language detection error: {}".format(e))
|
||||
|
||||
def _get_ocr(self, pngs):
|
||||
def _get_ocr(self, imgs):
|
||||
"""
|
||||
Attempts to do the best job possible OCR'ing the document based on
|
||||
simple language detection trial & error.
|
||||
"""
|
||||
|
||||
if not pngs:
|
||||
raise OCRError
|
||||
if not imgs:
|
||||
raise OCRError("No images found")
|
||||
|
||||
Log.debug("OCRing the document", Log.COMPONENT_CONSUMER)
|
||||
self.log("info", "OCRing the document")
|
||||
|
||||
# Since the division gets rounded down by int, this calculation works
|
||||
# for every edge-case, i.e. 1
|
||||
middle = int(len(pngs) / 2)
|
||||
raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)
|
||||
middle = int(len(imgs) / 2)
|
||||
raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
|
||||
|
||||
guessed_language = self._guess_language(raw_text)
|
||||
|
||||
if not guessed_language or guessed_language not in ISO639:
|
||||
Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER)
|
||||
self.log("warning", "Language detection failed!")
|
||||
if settings.FORGIVING_OCR:
|
||||
Log.warning(
|
||||
"As FORGIVING_OCR is enabled, we're going to make the best "
|
||||
"with what we have.",
|
||||
Log.COMPONENT_CONSUMER
|
||||
self.log(
|
||||
"warning",
|
||||
"As FORGIVING_OCR is enabled, we're going to make the "
|
||||
"best with what we have."
|
||||
)
|
||||
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
|
||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||
return raw_text
|
||||
raise OCRError
|
||||
raise OCRError("Language detection failed")
|
||||
|
||||
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
|
||||
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
|
||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||
return raw_text
|
||||
|
||||
try:
|
||||
return self._ocr(pngs, ISO639[guessed_language])
|
||||
return self._ocr(imgs, ISO639[guessed_language])
|
||||
except pyocr.pyocr.tesseract.TesseractError:
|
||||
if settings.FORGIVING_OCR:
|
||||
Log.warning(
|
||||
self.log(
|
||||
"warning",
|
||||
"OCR for {} failed, but we're going to stick with what "
|
||||
"we've got since FORGIVING_OCR is enabled.".format(
|
||||
guessed_language
|
||||
),
|
||||
Log.COMPONENT_CONSUMER
|
||||
)
|
||||
)
|
||||
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
|
||||
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
|
||||
return raw_text
|
||||
raise OCRError
|
||||
raise OCRError(
|
||||
"The guessed language is not available in this instance of "
|
||||
"Tesseract."
|
||||
)
|
||||
|
||||
def _assemble_ocr_sections(self, pngs, middle, text):
|
||||
def _assemble_ocr_sections(self, imgs, middle, text):
|
||||
"""
|
||||
Given a `middle` value and the text that middle page represents, we OCR
|
||||
the remainder of the document and return the whole thing.
|
||||
"""
|
||||
text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
|
||||
text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
|
||||
text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
|
||||
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
|
||||
return text
|
||||
|
||||
def _ocr(self, pngs, lang):
|
||||
def _ocr(self, imgs, lang):
|
||||
"""
|
||||
Performs a single OCR attempt.
|
||||
"""
|
||||
|
||||
if not pngs:
|
||||
if not imgs:
|
||||
return ""
|
||||
|
||||
Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER)
|
||||
self.log("info", "Parsing for {}".format(lang))
|
||||
|
||||
with Pool(processes=self.THREADS) as pool:
|
||||
r = pool.map(
|
||||
image_to_string, itertools.product([self], pngs, [lang]))
|
||||
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
||||
r = " ".join(r)
|
||||
|
||||
# Strip out excess white space to allow matching to go smoother
|
||||
@ -233,16 +271,18 @@ class Consumer(object):
|
||||
|
||||
def _guess_attributes_from_name(self, parseable):
|
||||
"""
|
||||
We use a crude naming convention to make handling the sender, title, and
|
||||
tags easier:
|
||||
"<sender> - <title> - <tags>.<suffix>"
|
||||
"<sender> - <title>.<suffix>"
|
||||
We use a crude naming convention to make handling the correspondent,
|
||||
title, and tags easier:
|
||||
"<correspondent> - <title> - <tags>.<suffix>"
|
||||
"<correspondent> - <title>.<suffix>"
|
||||
"<title>.<suffix>"
|
||||
"""
|
||||
|
||||
def get_sender(sender_name):
|
||||
return Sender.objects.get_or_create(
|
||||
name=sender_name, defaults={"slug": slugify(sender_name)})[0]
|
||||
def get_correspondent(correspondent_name):
|
||||
return Correspondent.objects.get_or_create(
|
||||
name=correspondent_name,
|
||||
defaults={"slug": slugify(correspondent_name)}
|
||||
)[0]
|
||||
|
||||
def get_tags(tags):
|
||||
r = []
|
||||
@ -251,40 +291,47 @@ class Consumer(object):
|
||||
Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
|
||||
return tuple(r)
|
||||
|
||||
# First attempt: "<sender> - <title> - <tags>.<suffix>"
|
||||
m = re.match(self.REGEX_SENDER_TITLE_TAGS, parseable)
|
||||
def get_suffix(suffix):
|
||||
suffix = suffix.lower()
|
||||
if suffix == "jpeg":
|
||||
return "jpg"
|
||||
return suffix
|
||||
|
||||
# First attempt: "<correspondent> - <title> - <tags>.<suffix>"
|
||||
m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable)
|
||||
if m:
|
||||
return (
|
||||
get_sender(m.group(1)),
|
||||
get_correspondent(m.group(1)),
|
||||
m.group(2),
|
||||
get_tags(m.group(3)),
|
||||
m.group(4)
|
||||
get_suffix(m.group(4))
|
||||
)
|
||||
|
||||
# Second attempt: "<sender> - <title>.<suffix>"
|
||||
m = re.match(self.REGEX_SENDER_TITLE, parseable)
|
||||
# Second attempt: "<correspondent> - <title>.<suffix>"
|
||||
m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable)
|
||||
if m:
|
||||
return get_sender(m.group(1)), m.group(2), (), m.group(3)
|
||||
return (
|
||||
get_correspondent(m.group(1)),
|
||||
m.group(2),
|
||||
(),
|
||||
get_suffix(m.group(3))
|
||||
)
|
||||
|
||||
# That didn't work, so we assume sender and tags are None
|
||||
# That didn't work, so we assume correspondent and tags are None
|
||||
m = re.match(self.REGEX_TITLE, parseable)
|
||||
return None, m.group(1), (), m.group(2)
|
||||
return None, m.group(1), (), get_suffix(m.group(2))
|
||||
|
||||
def _store(self, text, doc):
|
||||
def _store(self, text, doc, thumbnail):
|
||||
|
||||
sender, title, tags, file_type = self._guess_attributes_from_name(doc)
|
||||
tags = list(tags)
|
||||
|
||||
lower_text = text.lower()
|
||||
relevant_tags = set(
|
||||
[t for t in Tag.objects.all() if t.matches(lower_text)] + tags)
|
||||
relevant_tags = set(list(Tag.match_all(text)) + list(tags))
|
||||
|
||||
stats = os.stat(doc)
|
||||
|
||||
Log.debug("Saving record to database", Log.COMPONENT_CONSUMER)
|
||||
self.log("debug", "Saving record to database")
|
||||
|
||||
document = Document.objects.create(
|
||||
sender=sender,
|
||||
correspondent=sender,
|
||||
title=title,
|
||||
content=text,
|
||||
file_type=file_type,
|
||||
@ -296,22 +343,29 @@ class Consumer(object):
|
||||
|
||||
if relevant_tags:
|
||||
tag_names = ", ".join([t.slug for t in relevant_tags])
|
||||
Log.debug(
|
||||
"Tagging with {}".format(tag_names), Log.COMPONENT_CONSUMER)
|
||||
self.log("debug", "Tagging with {}".format(tag_names))
|
||||
document.tags.add(*relevant_tags)
|
||||
|
||||
# Encrypt and store the actual document
|
||||
with open(doc, "rb") as unencrypted:
|
||||
with open(document.source_path, "wb") as encrypted:
|
||||
Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
|
||||
self.log("debug", "Encrypting the document")
|
||||
encrypted.write(GnuPG.encrypted(unencrypted))
|
||||
|
||||
def _cleanup(self, tempdir, doc):
|
||||
# Remove temporary directory recursively
|
||||
Log.debug("Deleting directory {}".format(tempdir), Log.COMPONENT_CONSUMER)
|
||||
shutil.rmtree(tempdir)
|
||||
# Encrypt and store the thumbnail
|
||||
with open(thumbnail, "rb") as unencrypted:
|
||||
with open(document.thumbnail_path, "wb") as encrypted:
|
||||
self.log("debug", "Encrypting the thumbnail")
|
||||
encrypted.write(GnuPG.encrypted(unencrypted))
|
||||
|
||||
# Remove doc
|
||||
Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
|
||||
self.log("info", "Completed")
|
||||
|
||||
def _cleanup_tempdir(self, d):
|
||||
self.log("debug", "Deleting directory {}".format(d))
|
||||
shutil.rmtree(d)
|
||||
|
||||
def _cleanup_doc(self, doc):
|
||||
self.log("debug", "Deleting document {}".format(doc))
|
||||
os.unlink(doc)
|
||||
|
||||
def _is_ready(self, doc):
|
||||
@ -329,3 +383,23 @@ class Consumer(object):
|
||||
self.stats[doc] = t
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def image_to_string(args):
    """
    OCR one image from the consumer's scratch directory and return the text.

    ``args`` is a two-item sequence ``(img, lang)``: the image's file name
    relative to ``Consumer.SCRATCH`` and the language handed to the OCR tool.
    The first tool reported by ``pyocr.get_available_tools()`` is used. When
    that tool can detect orientation, the image is rotated by the detected
    angle first; a ``TesseractError`` during detection is ignored and the
    image is OCR'd as it is.
    """
    file_name, language = args
    engine = pyocr.get_available_tools()[0]
    image_path = os.path.join(Consumer.SCRATCH, file_name)

    with Image.open(image_path) as opened:
        page = opened
        if engine.can_detect_orientation():
            try:
                detected = engine.detect_orientation(page, lang=language)
                page = page.rotate(detected["angle"], expand=1)
            except TesseractError:
                # Orientation detection is best-effort only
                pass
        return engine.image_to_string(page, lang=language)
|
||||
|
||||
|
||||
def run_unpaper(args):
    """
    Launch the unpaper program on one PNM file and wait for it to exit.

    ``args`` is a two-item sequence ``(unpaper, pnm)``: the executable to run
    and the input file. The output path given to the program is the input
    path with ".pnm" replaced by ".unpaper.pnm". The exit status is not
    inspected and nothing is returned.
    """
    binary, source = args
    destination = source.replace(".pnm", ".unpaper.pnm")
    process = subprocess.Popen((binary, source, destination))
    process.wait()
|
||||
|
@ -8,13 +8,13 @@ from time import mktime
|
||||
from django import forms
|
||||
from django.conf import settings
|
||||
|
||||
from .models import Document, Sender
|
||||
from .models import Document, Correspondent
|
||||
from .consumer import Consumer
|
||||
|
||||
|
||||
class UploadForm(forms.Form):
|
||||
|
||||
SECRET = settings.UPLOAD_SHARED_SECRET
|
||||
SECRET = settings.SHARED_SECRET
|
||||
TYPE_LOOKUP = {
|
||||
"application/pdf": Document.TYPE_PDF,
|
||||
"image/png": Document.TYPE_PNG,
|
||||
@ -23,31 +23,36 @@ class UploadForm(forms.Form):
|
||||
"image/tiff": Document.TYPE_TIF,
|
||||
}
|
||||
|
||||
sender = forms.CharField(
|
||||
max_length=Sender._meta.get_field("name").max_length, required=False)
|
||||
correspondent = forms.CharField(
|
||||
max_length=Correspondent._meta.get_field("name").max_length,
|
||||
required=False
|
||||
)
|
||||
title = forms.CharField(
|
||||
max_length=Document._meta.get_field("title").max_length, required=False)
|
||||
max_length=Document._meta.get_field("title").max_length,
|
||||
required=False
|
||||
)
|
||||
document = forms.FileField()
|
||||
signature = forms.CharField(max_length=256)
|
||||
|
||||
def clean_sender(self):
|
||||
def clean_correspondent(self):
|
||||
"""
|
||||
I suppose it might look cleaner to use .get_or_create() here, but that
|
||||
would also allow someone to fill up the db with bogus senders before all
|
||||
validation was met.
|
||||
would also allow someone to fill up the db with bogus correspondents
|
||||
before all validation was met.
|
||||
"""
|
||||
sender = self.cleaned_data.get("sender")
|
||||
if not sender:
|
||||
corresp = self.cleaned_data.get("correspondent")
|
||||
if not corresp:
|
||||
return None
|
||||
if not Sender.SAFE_REGEX.match(sender) or " - " in sender:
|
||||
raise forms.ValidationError("That sender name is suspicious.")
|
||||
return sender
|
||||
if not Correspondent.SAFE_REGEX.match(corresp) or " - " in corresp:
|
||||
raise forms.ValidationError(
|
||||
"That correspondent name is suspicious.")
|
||||
return corresp
|
||||
|
||||
def clean_title(self):
|
||||
title = self.cleaned_data.get("title")
|
||||
if not title:
|
||||
return None
|
||||
if not Sender.SAFE_REGEX.match(title) or " - " in title:
|
||||
if not Correspondent.SAFE_REGEX.match(title) or " - " in title:
|
||||
raise forms.ValidationError("That title is suspicious.")
|
||||
|
||||
def clean_document(self):
|
||||
@ -59,10 +64,10 @@ class UploadForm(forms.Form):
|
||||
return document, self.TYPE_LOOKUP[file_type]
|
||||
|
||||
def clean(self):
|
||||
sender = self.clened_data("sender")
|
||||
corresp = self.clened_data("correspondent")
|
||||
title = self.cleaned_data("title")
|
||||
signature = self.cleaned_data("signature")
|
||||
if sha256(sender + title + self.SECRET).hexdigest() == signature:
|
||||
if sha256(corresp + title + self.SECRET).hexdigest() == signature:
|
||||
return True
|
||||
return False
|
||||
|
||||
@ -73,13 +78,15 @@ class UploadForm(forms.Form):
|
||||
form do that as well. Think of it as a poor-man's queue server.
|
||||
"""
|
||||
|
||||
sender = self.clened_data("sender")
|
||||
correspondent = self.clened_data("correspondent")
|
||||
title = self.cleaned_data("title")
|
||||
document, file_type = self.cleaned_data.get("document")
|
||||
|
||||
t = int(mktime(datetime.now()))
|
||||
file_name = os.path.join(
|
||||
Consumer.CONSUME, "{} - {}.{}".format(sender, title, file_type))
|
||||
Consumer.CONSUME,
|
||||
"{} - {}.{}".format(correspondent, title, file_type)
|
||||
)
|
||||
|
||||
with open(file_name, "wb") as f:
|
||||
f.write(document)
|
||||
|
@ -185,10 +185,10 @@ ISO639 = {
|
||||
"yo": "yor",
|
||||
"za": "zha",
|
||||
|
||||
# Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I have
|
||||
# no idea which one is better, so I just picked the bigger file.
|
||||
# Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I
|
||||
# have no idea which one is better, so I just picked the bigger file.
|
||||
"zh": "chi_tra",
|
||||
|
||||
"zu": "zul"
|
||||
|
||||
}
|
||||
}
|
||||
|
30
src/documents/loggers.py
Normal file
30
src/documents/loggers.py
Normal file
@ -0,0 +1,30 @@
|
||||
import logging
|
||||
|
||||
|
||||
class PaperlessLogger(logging.StreamHandler):
    """
    A logger smart enough to know to log some kinds of messages to the database
    for later retrieval in a pretty interface.

    Every record is written to the handler's stream. Records that also carry
    a ``component`` attribute (supplied by callers through ``extra=``) are
    additionally stored as a ``Log`` row, together with the optional
    ``group`` attribute when present.
    """

    def emit(self, record):
        """
        Write ``record`` to the stream and, if it names a component, persist
        it to the database.
        """

        logging.StreamHandler.emit(self, record)

        if not hasattr(record, "component"):
            return

        # We have to do the import here or Django will barf when it tries to
        # load this because the apps aren't loaded at that point
        from .models import Log

        kwargs = {
            # getMessage() applies record.args to record.msg and always
            # returns a string, so the stored text matches what was printed
            # even for "%s"-style calls or non-string messages.
            "message": record.getMessage(),
            "component": record.component,
            "level": record.levelno,
        }

        if hasattr(record, "group"):
            kwargs["group"] = record.group

        Log.objects.create(**kwargs)
|
@ -1,8 +1,10 @@
|
||||
import datetime
|
||||
import imaplib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
|
||||
from base64 import b64decode
|
||||
from email import policy
|
||||
@ -11,10 +13,8 @@ from dateutil import parser
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from logger.models import Log
|
||||
|
||||
from .consumer import Consumer
|
||||
from .models import Sender
|
||||
from .models import Correspondent, Log
|
||||
|
||||
|
||||
class MailFetcherError(Exception):
|
||||
@ -25,21 +25,34 @@ class InvalidMessageError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class Message(object):
|
||||
class Loggable(object):
    """
    Base class giving an object a logger and a grouping id that is attached
    to every message it logs.
    """

    def __init__(self, group=None):
        # A falsy group (the default None included) is replaced by a fresh
        # random UUID.
        self.logger = logging.getLogger(__name__)
        self.logging_group = group if group else uuid.uuid4()

    def log(self, level, message):
        """
        Log ``message`` through the logger method named by ``level`` (for
        example "debug" or "info"), tagging it with this object's group and
        the mail component.
        """
        emit = getattr(self.logger, level)
        context = {
            "group": self.logging_group,
            "component": Log.COMPONENT_MAIL
        }
        emit(message, extra=context)
|
||||
|
||||
|
||||
class Message(Loggable):
|
||||
"""
|
||||
A crude, but simple email message class. We assume that there's a subject
|
||||
and n attachments, and that we don't care about the message body.
|
||||
"""
|
||||
|
||||
SECRET = settings.UPLOAD_SHARED_SECRET
|
||||
SECRET = settings.SHARED_SECRET
|
||||
|
||||
def __init__(self, data, verbosity=1):
|
||||
def __init__(self, data, group=None):
|
||||
"""
|
||||
Cribbed heavily from
|
||||
https://www.ianlewis.org/en/parsing-email-attachments-python
|
||||
"""
|
||||
|
||||
self.verbosity = verbosity
|
||||
Loggable.__init__(self, group=group)
|
||||
|
||||
self.subject = None
|
||||
self.time = None
|
||||
@ -54,8 +67,7 @@ class Message(object):
|
||||
|
||||
self._set_time(message)
|
||||
|
||||
Log.info(
|
||||
'Importing email: "{}"'.format(self.subject), Log.COMPONENT_MAIL)
|
||||
self.log("info", 'Importing email: "{}"'.format(self.subject))
|
||||
|
||||
attachments = []
|
||||
for part in message.walk():
|
||||
@ -91,7 +103,7 @@ class Message(object):
|
||||
def check_subject(self):
|
||||
if self.subject is None:
|
||||
raise InvalidMessageError("Message does not have a subject")
|
||||
if not Sender.SAFE_REGEX.match(self.subject):
|
||||
if not Correspondent.SAFE_REGEX.match(self.subject):
|
||||
raise InvalidMessageError("Message subject is unsafe: {}".format(
|
||||
self.subject))
|
||||
|
||||
@ -134,9 +146,11 @@ class Attachment(object):
|
||||
return self.data
|
||||
|
||||
|
||||
class MailFetcher(object):
|
||||
class MailFetcher(Loggable):
|
||||
|
||||
def __init__(self, verbosity=1):
|
||||
def __init__(self):
|
||||
|
||||
Loggable.__init__(self)
|
||||
|
||||
self._connection = None
|
||||
self._host = settings.MAIL_CONSUMPTION["HOST"]
|
||||
@ -148,7 +162,6 @@ class MailFetcher(object):
|
||||
self._enabled = bool(self._host)
|
||||
|
||||
self.last_checked = datetime.datetime.now()
|
||||
self.verbosity = verbosity
|
||||
|
||||
def pull(self):
|
||||
"""
|
||||
@ -159,14 +172,14 @@ class MailFetcher(object):
|
||||
|
||||
if self._enabled:
|
||||
|
||||
Log.info("Checking mail", Log.COMPONENT_MAIL)
|
||||
# Reset the grouping id for each fetch
|
||||
self.logging_group = uuid.uuid4()
|
||||
|
||||
self.log("debug", "Checking mail")
|
||||
|
||||
for message in self._get_messages():
|
||||
|
||||
Log.debug(
|
||||
'Storing email: "{}"'.format(message.subject),
|
||||
Log.COMPONENT_MAIL
|
||||
)
|
||||
self.log("info", 'Storing email: "{}"'.format(message.subject))
|
||||
|
||||
t = int(time.mktime(message.time.timetuple()))
|
||||
file_name = os.path.join(Consumer.CONSUME, message.file_name)
|
||||
@ -193,7 +206,7 @@ class MailFetcher(object):
|
||||
self._connection.logout()
|
||||
|
||||
except Exception as e:
|
||||
Log.error(e, Log.COMPONENT_MAIL)
|
||||
self.log("error", str(e))
|
||||
|
||||
return r
|
||||
|
||||
@ -218,9 +231,9 @@ class MailFetcher(object):
|
||||
|
||||
message = None
|
||||
try:
|
||||
message = Message(data[0][1], self.verbosity)
|
||||
message = Message(data[0][1], self.logging_group)
|
||||
except InvalidMessageError as e:
|
||||
Log.error(e, Log.COMPONENT_MAIL)
|
||||
self.log("error", str(e))
|
||||
else:
|
||||
self._connection.store(num, "+FLAGS", "\\Deleted")
|
||||
|
||||
|
@ -1,10 +1,12 @@
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
|
||||
from ...models import Log
|
||||
from ...consumer import Consumer, ConsumerError
|
||||
from ...mail import MailFetcher, MailFetcherError
|
||||
|
||||
@ -34,7 +36,7 @@ class Command(BaseCommand):
|
||||
self.verbosity = options["verbosity"]
|
||||
|
||||
try:
|
||||
self.file_consumer = Consumer(verbosity=self.verbosity)
|
||||
self.file_consumer = Consumer()
|
||||
self.mail_fetcher = MailFetcher()
|
||||
except (ConsumerError, MailFetcherError) as e:
|
||||
raise CommandError(e)
|
||||
@ -44,6 +46,13 @@ class Command(BaseCommand):
|
||||
except FileExistsError:
|
||||
pass
|
||||
|
||||
logging.getLogger(__name__).info(
|
||||
"Starting document consumer at {}".format(
|
||||
settings.CONSUMPTION_DIR
|
||||
),
|
||||
extra={"component": Log.COMPONENT_CONSUMER}
|
||||
)
|
||||
|
||||
try:
|
||||
while True:
|
||||
self.loop()
|
||||
|
@ -1,10 +1,12 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
from django.core import serializers
|
||||
|
||||
from documents.models import Document
|
||||
from documents.models import Document, Correspondent, Tag
|
||||
from paperless.db import GnuPG
|
||||
|
||||
from ...mixins import Renderable
|
||||
@ -14,21 +16,26 @@ class Command(Renderable, BaseCommand):
|
||||
|
||||
help = """
|
||||
Decrypt and rename all files in our collection into a given target
|
||||
directory. Note that we don't export any of the parsed data since
|
||||
that can always be re-collected via the consumer.
|
||||
directory. And include a manifest file containing document data for
|
||||
easy import.
|
||||
""".replace(" ", "")
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument("target")
|
||||
parser.add_argument(
|
||||
"--legacy",
|
||||
action="store_true",
|
||||
help="Don't try to export all of the document data, just dump the "
|
||||
"original document files out in a format that makes "
|
||||
"re-consuming them easy."
|
||||
)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.verbosity = 0
|
||||
self.target = None
|
||||
BaseCommand.__init__(self, *args, **kwargs)
|
||||
self.target = None
|
||||
|
||||
def handle(self, *args, **options):
|
||||
|
||||
self.verbosity = options["verbosity"]
|
||||
self.target = options["target"]
|
||||
|
||||
if not os.path.exists(self.target):
|
||||
@ -40,9 +47,22 @@ class Command(Renderable, BaseCommand):
|
||||
if not settings.PASSPHRASE:
|
||||
settings.PASSPHRASE = input("Please enter the passphrase: ")
|
||||
|
||||
for document in Document.objects.all():
|
||||
if options["legacy"]:
|
||||
self.dump_legacy()
|
||||
else:
|
||||
self.dump()
|
||||
|
||||
def dump(self):
|
||||
|
||||
documents = Document.objects.all()
|
||||
document_map = {d.pk: d for d in documents}
|
||||
manifest = json.loads(serializers.serialize("json", documents))
|
||||
for document_dict in manifest:
|
||||
|
||||
document = document_map[document_dict["pk"]]
|
||||
|
||||
target = os.path.join(self.target, document.file_name)
|
||||
document_dict["__exported_file_name__"] = target
|
||||
|
||||
print("Exporting: {}".format(target))
|
||||
|
||||
@ -50,3 +70,37 @@ class Command(Renderable, BaseCommand):
|
||||
f.write(GnuPG.decrypted(document.source_file))
|
||||
t = int(time.mktime(document.created.timetuple()))
|
||||
os.utime(target, times=(t, t))
|
||||
|
||||
manifest += json.loads(
|
||||
serializers.serialize("json", Correspondent.objects.all()))
|
||||
|
||||
manifest += json.loads(serializers.serialize(
|
||||
"json", Tag.objects.all()))
|
||||
|
||||
with open(os.path.join(self.target, "manifest.json"), "w") as f:
|
||||
json.dump(manifest, f, indent=2)
|
||||
|
||||
def dump_legacy(self):
    """
    Write every document, decrypted, into ``self.target`` under the name
    produced by ``_get_legacy_file_name`` and stamp each file with the
    document's ``created`` time.
    """

    for document in Document.objects.all():

        target = os.path.join(
            self.target, self._get_legacy_file_name(document))

        print("Exporting: {}".format(target))

        with open(target, "wb") as f:
            f.write(GnuPG.decrypted(document.source_file))

        # Set the timestamps only once the file is closed: data still
        # sitting in the write buffer is flushed on close, and that flush
        # would otherwise bump the modification time again.
        t = int(time.mktime(document.created.timetuple()))
        os.utime(target, times=(t, t))
|
||||
|
||||
@staticmethod
|
||||
def _get_legacy_file_name(doc):
|
||||
if doc.correspondent and doc.title:
|
||||
tags = ",".join([t.slug for t in doc.tags.all()])
|
||||
if tags:
|
||||
return "{} - {} - {}.{}".format(
|
||||
doc.correspondent, doc.title, tags, doc.file_type)
|
||||
return "{} - {}.{}".format(
|
||||
doc.correspondent, doc.title, doc.file_type)
|
||||
return os.path.basename(doc.source_path)
|
||||
|
99
src/documents/management/commands/document_importer.py
Normal file
99
src/documents/management/commands/document_importer.py
Normal file
@ -0,0 +1,99 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
from django.core.management import call_command
|
||||
|
||||
from documents.models import Document
|
||||
from paperless.db import GnuPG
|
||||
|
||||
from ...mixins import Renderable
|
||||
|
||||
|
||||
class Command(Renderable, BaseCommand):
    """
    Management command that loads a ``manifest.json`` from a source
    directory into the database and then encrypts the document files the
    manifest refers to into each document's ``source_path``.
    """

    help = """
        Using a manifest.json file, load the data from there, and import the
        documents it refers to.
    """.replace(" ", "")

    def add_arguments(self, parser):
        parser.add_argument("source")

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)
        self.source = None
        self.manifest = None

    def handle(self, *args, **options):
        """
        Validate the source directory and manifest, load the manifest with
        ``loaddata`` and import the files.

        Raises ``CommandError`` when the path is missing or unreadable, the
        manifest is missing or inconsistent, or no passphrase is configured.
        """

        self.source = options["source"]

        if not os.path.exists(self.source):
            raise CommandError("That path doesn't exist")

        if not os.access(self.source, os.R_OK):
            raise CommandError("That path doesn't appear to be readable")

        manifest_path = os.path.join(self.source, "manifest.json")
        self._check_manifest_exists(manifest_path)

        with open(manifest_path) as f:
            self.manifest = json.load(f)

        self._check_manifest()

        if not settings.PASSPHRASE:
            raise CommandError(
                "You need to define a passphrase before continuing. Please "
                "consult the documentation for setting up Paperless."
            )

        # Fill up the database with whatever is in the manifest
        call_command("loaddata", manifest_path)

        self._import_files_from_manifest()

    @staticmethod
    def _check_manifest_exists(path):
        """Raise ``CommandError`` unless ``path`` exists."""
        if not os.path.exists(path):
            raise CommandError(
                "That directory doesn't appear to contain a manifest.json "
                "file."
            )

    def _document_path(self, record):
        """
        Resolve a document record's exported file against the source
        directory. Used by both the check and the import so that they always
        look at the same file.
        """
        return os.path.join(self.source, record["__exported_file_name__"])

    def _check_manifest(self):
        """
        Make sure every document record in the manifest names an exported
        file and that the file is present; raise ``CommandError`` otherwise.
        """

        for record in self.manifest:

            if record["model"] != "documents.document":
                continue

            if "__exported_file_name__" not in record:
                raise CommandError(
                    'The manifest file contains a record which does not '
                    'refer to an actual document file.'
                )

            doc_file = record["__exported_file_name__"]
            if not os.path.exists(self._document_path(record)):
                raise CommandError(
                    'The manifest file refers to "{}" which does not '
                    'appear to be in the source directory.'.format(doc_file)
                )

    def _import_files_from_manifest(self):
        """
        Encrypt each exported document file into the ``source_path`` of the
        document that ``loaddata`` created for it.
        """

        for record in self.manifest:

            if record["model"] != "documents.document":
                continue

            # Open the very path _check_manifest() verified rather than the
            # bare manifest value, which is only valid relative to the
            # source directory.
            doc_file = self._document_path(record)
            document = Document.objects.get(pk=record["pk"])
            with open(doc_file, "rb") as unencrypted:
                with open(document.source_path, "wb") as encrypted:
                    print("Encrypting {} and saving it to {}".format(
                        doc_file, document.source_path))
                    encrypted.write(GnuPG.encrypted(unencrypted))
|
@ -10,8 +10,8 @@ class Command(Renderable, BaseCommand):
|
||||
help = """
|
||||
Using the current set of tagging rules, apply said rules to all
|
||||
documents in the database, effectively allowing you to back-tag all
|
||||
previously indexed documents with tags created (or modified) after their
|
||||
initial import.
|
||||
previously indexed documents with tags created (or modified) after
|
||||
their initial import.
|
||||
""".replace(" ", "")
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
@ -23,9 +23,10 @@ class Command(Renderable, BaseCommand):
|
||||
self.verbosity = options["verbosity"]
|
||||
|
||||
for document in Document.objects.all():
|
||||
|
||||
tags = Tag.objects.exclude(
|
||||
pk__in=document.tags.values_list("pk", flat=True))
|
||||
for tag in tags:
|
||||
if tag.matches(document.content):
|
||||
print('Tagging {} with "{}"'.format(document, tag))
|
||||
document.tags.add(tag)
|
||||
|
||||
for tag in Tag.match_all(document.content, tags):
|
||||
print('Tagging {} with "{}"'.format(document, tag))
|
||||
document.tags.add(tag)
|
||||
|
20
src/documents/management/commands/loaddata_stdin.py
Normal file
20
src/documents/management/commands/loaddata_stdin.py
Normal file
@ -0,0 +1,20 @@
|
||||
import sys
|
||||
|
||||
from django.core.management.commands.loaddata import Command as LoadDataCommand
|
||||
|
||||
|
||||
class Command(LoadDataCommand):
    """
    Allow the loading of data from standard in. Sourced originally from:
    https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 (MIT licensed)

    The fixture label "-" means "read JSON from stdin"; every other label is
    handled by the stock ``loaddata`` command.
    """

    def parse_name(self, fixture_name):
        """
        Return the ``(name, serialization format, compression format)``
        triple for ``fixture_name``.
        """
        # Register a pseudo compression format whose "opener" ignores its
        # arguments and simply hands back stdin.
        self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)
        if fixture_name == '-':
            return '-', 'json', 'stdin'
        # Anything else must be parsed by the parent class; falling off the
        # end here would return None and break regular fixtures.
        return super(Command, self).parse_name(fixture_name)

    def find_fixtures(self, fixture_label):
        """Short-circuit fixture discovery for the stdin label "-"."""
        if fixture_label == '-':
            return [('-', None, '-')]
        return super(Command, self).find_fixtures(fixture_label)
|
70
src/documents/managers.py
Normal file
70
src/documents/managers.py
Normal file
@ -0,0 +1,70 @@
|
||||
from django.conf import settings
|
||||
|
||||
from django.db import models
|
||||
from django.db.models.aggregates import Max
|
||||
|
||||
|
||||
class GroupConcat(models.Aggregate):
    """
    Aggregate that joins the values of an expression into one string, using
    whichever SQL function the configured default database engine provides.

    Theoretically, this should work in Sqlite, PostgreSQL, and MySQL, but
    it has only ever been tested in Sqlite.
    """

    ENGINE_SQLITE = 1
    ENGINE_POSTGRESQL = 2
    ENGINE_MYSQL = 3
    ENGINES = {
        "django.db.backends.sqlite3": ENGINE_SQLITE,
        "django.db.backends.postgresql_psycopg2": ENGINE_POSTGRESQL,
        "django.db.backends.postgresql": ENGINE_POSTGRESQL,
        "django.db.backends.mysql": ENGINE_MYSQL
    }

    def __init__(self, expression, separator="\n", **extra):
        # engine must be resolved first: function and template depend on it
        self.engine = self._get_engine()
        self.function = self._get_function()
        self.template = self._get_template(separator)

        super(GroupConcat, self).__init__(
            expression, output_field=models.CharField(), **extra)

    def _get_engine(self):
        """
        Map the default database's ENGINE setting onto an ENGINE_* constant,
        raising ``NotImplementedError`` for engines not listed in ENGINES.
        """
        configured = settings.DATABASES["default"]["ENGINE"]
        try:
            resolved = self.ENGINES[configured]
        except KeyError:
            raise NotImplementedError(
                "There's currently no support for {} when it comes to group "
                "concatenation in Paperless".format(configured)
            )
        return resolved

    def _get_function(self):
        """Name of the SQL aggregate function for the current engine."""
        uses_postgres = self.engine == self.ENGINE_POSTGRESQL
        return "STRING_AGG" if uses_postgres else "GROUP_CONCAT"

    def _get_template(self, separator):
        """
        SQL template for the aggregate call with ``separator`` embedded as a
        quoted literal; MySQL takes it through its SEPARATOR keyword.
        """
        if self.engine == self.ENGINE_MYSQL:
            pattern = "%(function)s(%(expressions)s, SEPARATOR '{}')"
        else:
            pattern = "%(function)s(%(expressions)s, '{}')"
        return pattern.format(separator)
|
||||
|
||||
|
||||
class LogQuerySet(models.query.QuerySet):
    """Query set with a helper for reading log entries one group at a time."""

    def by_group(self):
        """
        Return one row per ``group`` value, annotated with ``time`` (the
        latest ``modified``) and ``messages`` (the group's messages joined by
        ``GroupConcat``), ordered by ``time`` descending.
        """
        per_group = self.values("group")
        summarised = per_group.annotate(
            time=Max("modified"),
            messages=GroupConcat("message"),
        )
        return summarised.order_by("-time")
|
||||
|
||||
|
||||
class LogManager(models.Manager):
    """Manager whose query sets are ``LogQuerySet`` instances."""

    def get_queryset(self):
        """Build a ``LogQuerySet`` bound to this manager's model and database."""
        queryset = LogQuerySet(self.model, using=self._db)
        return queryset
|
@ -1,5 +1,5 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.9 on 2016-02-14 16:08
|
||||
# Generated by Django 1.9 on 2016-02-27 17:54
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
@ -7,9 +7,8 @@ from django.db import migrations, models
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
('documents', '0009_auto_20160214_0040'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
@ -17,14 +16,15 @@ class Migration(migrations.Migration):
|
||||
name='Log',
|
||||
fields=[
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('time', models.DateTimeField(auto_now_add=True)),
|
||||
('group', models.UUIDField(blank=True)),
|
||||
('message', models.TextField()),
|
||||
('level', models.PositiveIntegerField(choices=[(1, 'Error'), (2, 'Warning'), (3, 'Informational'), (4, 'Debugging')], default=3)),
|
||||
('level', models.PositiveIntegerField(choices=[(10, 'Debugging'), (20, 'Informational'), (30, 'Warning'), (40, 'Error'), (50, 'Critical')], default=20)),
|
||||
('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])),
|
||||
('created', models.DateTimeField(auto_now_add=True)),
|
||||
('modified', models.DateTimeField(auto_now=True)),
|
||||
],
|
||||
),
|
||||
migrations.AlterModelOptions(
|
||||
name='log',
|
||||
options={'ordering': ('-time',)},
|
||||
options={
|
||||
'ordering': ('-modified',),
|
||||
},
|
||||
),
|
||||
]
|
28
src/documents/migrations/0011_auto_20160303_1929.py
Normal file
28
src/documents/migrations/0011_auto_20160303_1929.py
Normal file
@ -0,0 +1,28 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.9.2 on 2016-03-03 19:29
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Renames the Sender model to Correspondent and renames the matching
    # Document field, updating Document's default ordering to use the new
    # field name.

    # Must run after the migration that introduced the Log model.
    dependencies = [
        ('documents', '0010_log'),
    ]

    operations = [
        # Rename the model itself.
        migrations.RenameModel(
            old_name='Sender',
            new_name='Correspondent',
        ),
        # Document ordering now refers to 'correspondent'.
        migrations.AlterModelOptions(
            name='document',
            options={'ordering': ('correspondent', 'title')},
        ),
        # Rename Document.sender to Document.correspondent.
        migrations.RenameField(
            model_name='document',
            old_name='sender',
            new_name='correspondent',
        ),
    ]
|
119
src/documents/migrations/0012_auto_20160305_0040.py
Normal file
119
src/documents/migrations/0012_auto_20160305_0040.py
Normal file
@ -0,0 +1,119 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.9.2 on 2016-03-05 00:40
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import gnupg
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import migrations
|
||||
from django.utils.termcolors import colorize as colourise # Spelling hurts me
|
||||
|
||||
|
||||
class GnuPG(object):
    """
    Class-level helper for encrypting and decrypting files through a single
    shared ``gnupg.GPG`` instance, using ``settings.PASSPHRASE``.
    """

    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

    @classmethod
    def decrypted(cls, file_handle):
        """Decrypt ``file_handle`` and return the result's ``data``."""
        outcome = cls.gpg.decrypt_file(
            file_handle, passphrase=settings.PASSPHRASE)
        return outcome.data

    @classmethod
    def encrypted(cls, file_handle):
        """
        Symmetrically encrypt ``file_handle`` (no recipients) and return the
        result's ``data``.
        """
        outcome = cls.gpg.encrypt_file(
            file_handle,
            recipients=None,
            passphrase=settings.PASSPHRASE,
            symmetric=True
        )
        return outcome.data
|
||||
|
||||
|
||||
def move_documents_and_create_thumbnails(apps, schema_editor):
    """
    Generate an encrypted thumbnail for every ``*gpg`` file found directly in
    MEDIA_ROOT/documents and move the file itself into the ``originals``
    sub-directory.

    Nothing is done when the documents directory already contains exactly
    ``originals`` and ``thumbnails``. ``apps`` and ``schema_editor`` are the
    arguments supplied by ``migrations.RunPython``; neither is used.
    """

    documents = os.listdir(os.path.join(settings.MEDIA_ROOT, "documents"))

    if set(documents) == {"originals", "thumbnails"}:
        return

    print(colourise(
        "\n\n"
        " This is a one-time only migration to generate thumbnails for all of your\n"
        " documents so that future UIs will have something to work with. If you have\n"
        " a lot of documents though, this may take a while, so a coffee break may be\n"
        " in order."
        "\n", opts=("bold",)
    ))

    try:
        os.makedirs(settings.SCRATCH_DIR)
    except FileExistsError:
        pass

    # The loop writes into both of these, so make sure they are there
    for name in ("originals", "thumbnails"):
        os.makedirs(
            os.path.join(settings.MEDIA_ROOT, "documents", name),
            exist_ok=True
        )

    for f in sorted(documents):

        if not f.endswith("gpg"):
            continue

        print(" {} {} {}".format(
            colourise("*", fg="green"),
            colourise("Generating a thumbnail for", fg="white"),
            colourise(f, fg="cyan")
        ))

        thumb_temp = tempfile.mkdtemp(
            prefix="paperless", dir=settings.SCRATCH_DIR)
        orig_temp = tempfile.mkdtemp(
            prefix="paperless", dir=settings.SCRATCH_DIR)

        try:

            orig_source = os.path.join(settings.MEDIA_ROOT, "documents", f)
            orig_target = os.path.join(orig_temp, f.replace(".gpg", ""))

            with open(orig_source, "rb") as encrypted:
                with open(orig_target, "wb") as unencrypted:
                    unencrypted.write(GnuPG.decrypted(encrypted))

            # One PNG per page is written; only the first is kept
            subprocess.Popen((
                settings.CONVERT_BINARY,
                "-scale", "500x5000",
                "-alpha", "remove",
                orig_target,
                os.path.join(thumb_temp, "convert-%04d.png")
            )).wait()

            thumb_source = os.path.join(thumb_temp, "convert-0000.png")
            thumb_target = os.path.join(
                settings.MEDIA_ROOT,
                "documents",
                "thumbnails",
                re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f)
            )
            with open(thumb_source, "rb") as unencrypted:
                with open(thumb_target, "wb") as encrypted:
                    encrypted.write(GnuPG.encrypted(unencrypted))

        finally:
            # Both directories hold decrypted data, so remove them even when
            # a step above fails.
            shutil.rmtree(thumb_temp)
            shutil.rmtree(orig_temp)

        shutil.move(
            os.path.join(settings.MEDIA_ROOT, "documents", f),
            os.path.join(settings.MEDIA_ROOT, "documents", "originals", f),
        )
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Data migration: runs move_documents_and_create_thumbnails once, after
    # the Sender -> Correspondent rename migration.

    dependencies = [
        ('documents', '0011_auto_20160303_1929'),
    ]

    operations = [
        # No reverse function is given to RunPython here.
        migrations.RunPython(move_documents_and_create_thumbnails),
    ]
|
@ -1,7 +1,7 @@
|
||||
class Renderable(object):
|
||||
"""
|
||||
A handy mixin to make it easier/cleaner to print output based on a verbosity
|
||||
value.
|
||||
A handy mixin to make it easier/cleaner to print output based on a
|
||||
verbosity value.
|
||||
"""
|
||||
|
||||
def _render(self, text, verbosity):
|
||||
|
@ -1,5 +1,7 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.urlresolvers import reverse
|
||||
@ -7,6 +9,8 @@ from django.db import models
|
||||
from django.template.defaultfilters import slugify
|
||||
from django.utils import timezone
|
||||
|
||||
from .managers import LogManager
|
||||
|
||||
|
||||
class SluggedModel(models.Model):
|
||||
|
||||
@ -25,7 +29,7 @@ class SluggedModel(models.Model):
|
||||
return self.name
|
||||
|
||||
|
||||
class Sender(SluggedModel):
|
||||
class Correspondent(SluggedModel):
|
||||
|
||||
# This regex is probably more restrictive than it needs to be, but it's
|
||||
# better safe than sorry.
|
||||
@ -36,7 +40,7 @@ class Sender(SluggedModel):
|
||||
|
||||
|
||||
class Tag(SluggedModel):
|
||||
|
||||
|
||||
COLOURS = (
|
||||
(1, "#a6cee3"),
|
||||
(2, "#1f78b4"),
|
||||
@ -71,9 +75,9 @@ class Tag(SluggedModel):
|
||||
default=MATCH_ANY,
|
||||
help_text=(
|
||||
"Which algorithm you want to use when matching text to the OCR'd "
|
||||
"PDF. Here, \"any\" looks for any occurrence of any word provided "
|
||||
"in the PDF, while \"all\" requires that every word provided "
|
||||
"appear in the PDF, albeit not in the order provided. A "
|
||||
"PDF. Here, \"any\" looks for any occurrence of any word "
|
||||
"provided in the PDF, while \"all\" requires that every word "
|
||||
"provided appear in the PDF, albeit not in the order provided. A "
|
||||
"\"literal\" match means that the text you enter must appear in "
|
||||
"the PDF exactly as you've entered it, and \"regular expression\" "
|
||||
"uses a regex to match the PDF. If you don't know what a regex "
|
||||
@ -86,28 +90,40 @@ class Tag(SluggedModel):
|
||||
return "{}: \"{}\" ({})".format(
|
||||
self.name, self.match, self.get_matching_algorithm_display())
|
||||
|
||||
@classmethod
|
||||
def match_all(cls, text, tags=None):
|
||||
|
||||
if tags is None:
|
||||
tags = cls.objects.all()
|
||||
|
||||
text = text.lower()
|
||||
for tag in tags:
|
||||
if tag.matches(text):
|
||||
yield tag
|
||||
|
||||
def matches(self, text):
|
||||
|
||||
# Check that match is not empty
|
||||
if self.match.strip() == "":
|
||||
return False
|
||||
|
||||
if self.matching_algorithm == self.MATCH_ALL:
|
||||
for word in self.match.split(" "):
|
||||
if word not in text:
|
||||
if not re.search(r"\b{}\b".format(word), text):
|
||||
return False
|
||||
return True
|
||||
|
||||
if self.matching_algorithm == self.MATCH_ANY:
|
||||
for word in self.match.split(" "):
|
||||
if word in text:
|
||||
if re.search(r"\b{}\b".format(word), text):
|
||||
return True
|
||||
return False
|
||||
|
||||
if self.matching_algorithm == self.MATCH_LITERAL:
|
||||
return self.match in text
|
||||
return bool(re.search(r"\b{}\b".format(self.match), text))
|
||||
|
||||
if self.matching_algorithm == self.MATCH_REGEX:
|
||||
return re.search(re.compile(self.match), text)
|
||||
return bool(re.search(re.compile(self.match), text))
|
||||
|
||||
raise NotImplementedError("Unsupported matching algorithm")
|
||||
|
||||
@ -125,8 +141,8 @@ class Document(models.Model):
|
||||
TYPE_TIF = "tiff"
|
||||
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
|
||||
|
||||
sender = models.ForeignKey(
|
||||
Sender, blank=True, null=True, related_name="documents")
|
||||
correspondent = models.ForeignKey(
|
||||
Correspondent, blank=True, null=True, related_name="documents")
|
||||
title = models.CharField(max_length=128, blank=True, db_index=True)
|
||||
content = models.TextField(db_index=True)
|
||||
file_type = models.CharField(
|
||||
@ -140,14 +156,15 @@ class Document(models.Model):
|
||||
modified = models.DateTimeField(auto_now=True, editable=False)
|
||||
|
||||
class Meta(object):
|
||||
ordering = ("sender", "title")
|
||||
ordering = ("correspondent", "title")
|
||||
|
||||
def __str__(self):
|
||||
created = self.created.strftime("%Y-%m-%d")
|
||||
if self.sender and self.title:
|
||||
return "{}: {}, {}".format(created, self.sender, self.title)
|
||||
if self.sender or self.title:
|
||||
return "{}: {}".format(created, self.sender or self.title)
|
||||
created = self.created.strftime("%Y%m%d%H%M%S")
|
||||
if self.correspondent and self.title:
|
||||
return "{}: {} - {}".format(
|
||||
created, self.correspondent, self.title)
|
||||
if self.correspondent or self.title:
|
||||
return "{}: {}".format(created, self.correspondent or self.title)
|
||||
return str(created)
|
||||
|
||||
@property
|
||||
@ -155,6 +172,7 @@ class Document(models.Model):
|
||||
return os.path.join(
|
||||
settings.MEDIA_ROOT,
|
||||
"documents",
|
||||
"originals",
|
||||
"{:07}.{}.gpg".format(self.pk, self.file_type)
|
||||
)
|
||||
|
||||
@ -164,14 +182,71 @@ class Document(models.Model):
|
||||
|
||||
@property
|
||||
def file_name(self):
|
||||
if self.sender and self.title:
|
||||
tags = ",".join([t.slug for t in self.tags.all()])
|
||||
if tags:
|
||||
return "{} - {} - {}.{}".format(
|
||||
self.sender, self.title, tags, self.file_type)
|
||||
return "{} - {}.{}".format(self.sender, self.title, self.file_type)
|
||||
return os.path.basename(self.source_path)
|
||||
return slugify(str(self)) + "." + self.file_type
|
||||
|
||||
@property
|
||||
def download_url(self):
|
||||
return reverse("fetch", kwargs={"pk": self.pk})
|
||||
return reverse("fetch", kwargs={"kind": "doc", "pk": self.pk})
|
||||
|
||||
@property
|
||||
def thumbnail_path(self):
|
||||
return os.path.join(
|
||||
settings.MEDIA_ROOT,
|
||||
"documents",
|
||||
"thumbnails",
|
||||
"{:07}.png.gpg".format(self.pk)
|
||||
)
|
||||
|
||||
@property
|
||||
def thumbnail_file(self):
|
||||
return open(self.thumbnail_path, "rb")
|
||||
|
||||
@property
|
||||
def thumbnail_url(self):
|
||||
return reverse("fetch", kwargs={"kind": "thumb", "pk": self.pk})
|
||||
|
||||
|
||||
class Log(models.Model):
|
||||
|
||||
LEVELS = (
|
||||
(logging.DEBUG, "Debugging"),
|
||||
(logging.INFO, "Informational"),
|
||||
(logging.WARNING, "Warning"),
|
||||
(logging.ERROR, "Error"),
|
||||
(logging.CRITICAL, "Critical"),
|
||||
)
|
||||
|
||||
COMPONENT_CONSUMER = 1
|
||||
COMPONENT_MAIL = 2
|
||||
COMPONENTS = (
|
||||
(COMPONENT_CONSUMER, "Consumer"),
|
||||
(COMPONENT_MAIL, "Mail Fetcher")
|
||||
)
|
||||
|
||||
group = models.UUIDField(blank=True)
|
||||
message = models.TextField()
|
||||
level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO)
|
||||
component = models.PositiveIntegerField(choices=COMPONENTS)
|
||||
created = models.DateTimeField(auto_now_add=True)
|
||||
modified = models.DateTimeField(auto_now=True)
|
||||
|
||||
objects = LogManager()
|
||||
|
||||
class Meta(object):
|
||||
ordering = ("-modified",)
|
||||
|
||||
def __str__(self):
|
||||
return self.message
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
"""
|
||||
To allow for the case where we don't want to group the message, we
|
||||
shouldn't force the caller to specify a one-time group value. However,
|
||||
allowing group=None means that the manager can't differentiate the
|
||||
different un-grouped messages, so instead we set a random one here.
|
||||
"""
|
||||
|
||||
if not self.group:
|
||||
self.group = uuid.uuid4()
|
||||
|
||||
models.Model.save(self, *args, **kwargs)
|
||||
|
55
src/documents/serialisers.py
Normal file
55
src/documents/serialisers.py
Normal file
@ -0,0 +1,55 @@
|
||||
from rest_framework import serializers
|
||||
|
||||
from .models import Correspondent, Tag, Document, Log
|
||||
|
||||
|
||||
class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):
|
||||
|
||||
class Meta(object):
|
||||
model = Correspondent
|
||||
fields = ("id", "slug", "name")
|
||||
|
||||
|
||||
class TagSerializer(serializers.HyperlinkedModelSerializer):
|
||||
|
||||
class Meta(object):
|
||||
model = Tag
|
||||
fields = (
|
||||
"id", "slug", "name", "colour", "match", "matching_algorithm")
|
||||
|
||||
|
||||
class DocumentSerializer(serializers.ModelSerializer):
|
||||
|
||||
correspondent = serializers.HyperlinkedRelatedField(
|
||||
read_only=True, view_name="drf:correspondent-detail", allow_null=True)
|
||||
tags = serializers.HyperlinkedRelatedField(
|
||||
read_only=True, view_name="drf:tag-detail", many=True)
|
||||
|
||||
class Meta(object):
|
||||
model = Document
|
||||
fields = (
|
||||
"id",
|
||||
"correspondent",
|
||||
"title",
|
||||
"content",
|
||||
"file_type",
|
||||
"tags",
|
||||
"created",
|
||||
"modified",
|
||||
"file_name",
|
||||
"download_url",
|
||||
"thumbnail_url",
|
||||
)
|
||||
|
||||
|
||||
class LogSerializer(serializers.ModelSerializer):
|
||||
|
||||
time = serializers.DateTimeField()
|
||||
messages = serializers.CharField()
|
||||
|
||||
class Meta(object):
|
||||
model = Log
|
||||
fields = (
|
||||
"time",
|
||||
"messages"
|
||||
)
|
10
src/documents/templates/documents/index.html
Normal file
10
src/documents/templates/documents/index.html
Normal file
@ -0,0 +1,10 @@
|
||||
<!DOCTYPE html>
|
||||
|
||||
<html lang="en-gb">
|
||||
<head>
|
||||
<title>Paperless</title>
|
||||
<meta charset="utf-8">
|
||||
</head>
|
||||
<body>
|
||||
</body>
|
||||
</html>
|
@ -4,18 +4,26 @@ from ..consumer import Consumer
|
||||
|
||||
|
||||
class TestAttachment(TestCase):
|
||||
|
||||
|
||||
TAGS = ("tag1", "tag2", "tag3")
|
||||
CONSUMER = Consumer()
|
||||
|
||||
SUFFIXES = (
|
||||
"pdf", "png", "jpg", "jpeg", "gif",
|
||||
"PDF", "PNG", "JPG", "JPEG", "GIF",
|
||||
"PdF", "PnG", "JpG", "JPeG", "GiF",
|
||||
)
|
||||
|
||||
def _test_guess_attributes_from_name(self, path, sender, title, tags):
|
||||
for suffix in ("pdf", "png", "jpg", "jpeg", "gif"):
|
||||
for suffix in self.SUFFIXES:
|
||||
f = path.format(suffix)
|
||||
results = self.CONSUMER._guess_attributes_from_name(f)
|
||||
self.assertEqual(results[0].name, sender, f)
|
||||
self.assertEqual(results[1], title, f)
|
||||
self.assertEqual(tuple([t.slug for t in results[2]]), tags, f)
|
||||
self.assertEqual(results[3], suffix, f)
|
||||
if suffix.lower() == "jpeg":
|
||||
self.assertEqual(results[3], "jpg", f)
|
||||
else:
|
||||
self.assertEqual(results[3], suffix.lower(), f)
|
||||
|
||||
def test_guess_attributes_from_name0(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
|
36
src/documents/tests/test_importer.py
Normal file
36
src/documents/tests/test_importer.py
Normal file
@ -0,0 +1,36 @@
|
||||
from django.core.management.base import CommandError
|
||||
from django.test import TestCase
|
||||
|
||||
from ..management.commands.document_importer import Command
|
||||
|
||||
|
||||
class TestImporter(TestCase):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
TestCase.__init__(self, *args, **kwargs)
|
||||
|
||||
def test_check_manifest_exists(self):
|
||||
cmd = Command()
|
||||
self.assertRaises(
|
||||
CommandError, cmd._check_manifest_exists, "/tmp/manifest.json")
|
||||
|
||||
def test_check_manifest(self):
|
||||
|
||||
cmd = Command()
|
||||
cmd.source = "/tmp"
|
||||
|
||||
cmd.manifest = [{"model": "documents.document"}]
|
||||
with self.assertRaises(CommandError) as cm:
|
||||
cmd._check_manifest()
|
||||
self.assertTrue(
|
||||
'The manifest file contains a record' in str(cm.exception))
|
||||
|
||||
cmd.manifest = [{
|
||||
"model": "documents.document",
|
||||
"__exported_file_name__": "noexist.pdf"
|
||||
}]
|
||||
# self.assertRaises(CommandError, cmd._check_manifest)
|
||||
with self.assertRaises(CommandError) as cm:
|
||||
cmd._check_manifest()
|
||||
self.assertTrue(
|
||||
'The manifest file refers to "noexist.pdf"' in str(cm.exception))
|
142
src/documents/tests/test_logger.py
Normal file
142
src/documents/tests/test_logger.py
Normal file
@ -0,0 +1,142 @@
|
||||
import logging
|
||||
import uuid
|
||||
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
|
||||
from ..models import Log
|
||||
|
||||
|
||||
class TestPaperlessLog(TestCase):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
TestCase.__init__(self, *args, **kwargs)
|
||||
self.logger = logging.getLogger(
|
||||
"documents.management.commands.document_consumer")
|
||||
|
||||
def test_ignored(self):
|
||||
with mock.patch("logging.StreamHandler.emit") as __:
|
||||
self.assertEqual(Log.objects.all().count(), 0)
|
||||
self.logger.info("This is an informational message")
|
||||
self.logger.warning("This is an informational message")
|
||||
self.logger.error("This is an informational message")
|
||||
self.logger.critical("This is an informational message")
|
||||
self.assertEqual(Log.objects.all().count(), 0)
|
||||
|
||||
def test_that_it_saves_at_all(self):
|
||||
|
||||
kw = {
|
||||
"group": uuid.uuid4(),
|
||||
"component": Log.COMPONENT_MAIL
|
||||
}
|
||||
|
||||
self.assertEqual(Log.objects.all().count(), 0)
|
||||
|
||||
with mock.patch("logging.StreamHandler.emit") as __:
|
||||
|
||||
# Debug messages are ignored by default
|
||||
self.logger.debug("This is a debugging message", extra=kw)
|
||||
self.assertEqual(Log.objects.all().count(), 0)
|
||||
|
||||
self.logger.info("This is an informational message", extra=kw)
|
||||
self.assertEqual(Log.objects.all().count(), 1)
|
||||
|
||||
self.logger.warning("This is an warning message", extra=kw)
|
||||
self.assertEqual(Log.objects.all().count(), 2)
|
||||
|
||||
self.logger.error("This is an error message", extra=kw)
|
||||
self.assertEqual(Log.objects.all().count(), 3)
|
||||
|
||||
self.logger.critical("This is a critical message", extra=kw)
|
||||
self.assertEqual(Log.objects.all().count(), 4)
|
||||
|
||||
def test_groups(self):
|
||||
|
||||
kw1 = {
|
||||
"group": uuid.uuid4(),
|
||||
"component": Log.COMPONENT_MAIL
|
||||
}
|
||||
kw2 = {
|
||||
"group": uuid.uuid4(),
|
||||
"component": Log.COMPONENT_MAIL
|
||||
}
|
||||
|
||||
self.assertEqual(Log.objects.all().count(), 0)
|
||||
|
||||
with mock.patch("logging.StreamHandler.emit") as __:
|
||||
|
||||
# Debug messages are ignored by default
|
||||
self.logger.debug("This is a debugging message", extra=kw1)
|
||||
self.assertEqual(Log.objects.all().count(), 0)
|
||||
|
||||
self.logger.info("This is an informational message", extra=kw2)
|
||||
self.assertEqual(Log.objects.all().count(), 1)
|
||||
self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 1)
|
||||
|
||||
self.logger.warning("This is an warning message", extra=kw1)
|
||||
self.assertEqual(Log.objects.all().count(), 2)
|
||||
self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 1)
|
||||
|
||||
self.logger.error("This is an error message", extra=kw2)
|
||||
self.assertEqual(Log.objects.all().count(), 3)
|
||||
self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 2)
|
||||
|
||||
self.logger.critical("This is a critical message", extra=kw1)
|
||||
self.assertEqual(Log.objects.all().count(), 4)
|
||||
self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 2)
|
||||
|
||||
def test_components(self):
|
||||
|
||||
c1 = Log.COMPONENT_CONSUMER
|
||||
c2 = Log.COMPONENT_MAIL
|
||||
kw1 = {
|
||||
"group": uuid.uuid4(),
|
||||
"component": c1
|
||||
}
|
||||
kw2 = {
|
||||
"group": kw1["group"],
|
||||
"component": c2
|
||||
}
|
||||
|
||||
self.assertEqual(Log.objects.all().count(), 0)
|
||||
|
||||
with mock.patch("logging.StreamHandler.emit") as __:
|
||||
|
||||
# Debug messages are ignored by default
|
||||
self.logger.debug("This is a debugging message", extra=kw1)
|
||||
self.assertEqual(Log.objects.all().count(), 0)
|
||||
|
||||
self.logger.info("This is an informational message", extra=kw2)
|
||||
self.assertEqual(Log.objects.all().count(), 1)
|
||||
self.assertEqual(Log.objects.filter(component=c2).count(), 1)
|
||||
|
||||
self.logger.warning("This is an warning message", extra=kw1)
|
||||
self.assertEqual(Log.objects.all().count(), 2)
|
||||
self.assertEqual(Log.objects.filter(component=c1).count(), 1)
|
||||
|
||||
self.logger.error("This is an error message", extra=kw2)
|
||||
self.assertEqual(Log.objects.all().count(), 3)
|
||||
self.assertEqual(Log.objects.filter(component=c2).count(), 2)
|
||||
|
||||
self.logger.critical("This is a critical message", extra=kw1)
|
||||
self.assertEqual(Log.objects.all().count(), 4)
|
||||
self.assertEqual(Log.objects.filter(component=c1).count(), 2)
|
||||
|
||||
def test_groupped_query(self):
|
||||
|
||||
kw = {
|
||||
"group": uuid.uuid4(),
|
||||
"component": Log.COMPONENT_MAIL
|
||||
}
|
||||
with mock.patch("logging.StreamHandler.emit") as __:
|
||||
self.logger.info("Message 0", extra=kw)
|
||||
self.logger.info("Message 1", extra=kw)
|
||||
self.logger.info("Message 2", extra=kw)
|
||||
self.logger.info("Message 3", extra=kw)
|
||||
|
||||
self.assertEqual(Log.objects.all().by_group().count(), 1)
|
||||
self.assertEqual(
|
||||
Log.objects.all().by_group()[0]["messages"],
|
||||
"Message 0\nMessage 1\nMessage 2\nMessage 3"
|
||||
)
|
@ -3,6 +3,7 @@ import os
|
||||
import magic
|
||||
|
||||
from hashlib import md5
|
||||
from unittest import mock
|
||||
|
||||
from django.conf import settings
|
||||
from django.test import TestCase
|
||||
@ -27,7 +28,8 @@ class TestMessage(TestCase):
|
||||
|
||||
with open(self.sample, "rb") as f:
|
||||
|
||||
message = Message(f.read(), verbosity=0)
|
||||
with mock.patch("logging.StreamHandler.emit") as __:
|
||||
message = Message(f.read())
|
||||
|
||||
self.assertTrue(message)
|
||||
self.assertEqual(message.subject, "Test 0")
|
||||
|
119
src/documents/tests/test_tags.py
Normal file
119
src/documents/tests/test_tags.py
Normal file
@ -0,0 +1,119 @@
|
||||
from django.test import TestCase
|
||||
|
||||
from ..models import Tag
|
||||
|
||||
|
||||
class TestTagMatching(TestCase):
|
||||
|
||||
def test_match_all(self):
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 0",
|
||||
match="alpha charlie gamma",
|
||||
matching_algorithm=Tag.MATCH_ALL
|
||||
)
|
||||
self.assertFalse(t.matches("I have alpha in me"))
|
||||
self.assertFalse(t.matches("I have charlie in me"))
|
||||
self.assertFalse(t.matches("I have gamma in me"))
|
||||
self.assertFalse(t.matches("I have alpha and charlie in me"))
|
||||
self.assertTrue(t.matches("I have alpha, charlie, and gamma in me"))
|
||||
self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
|
||||
self.assertFalse(t.matches("I have alphas in me"))
|
||||
self.assertFalse(t.matches("I have bravo in me"))
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 1",
|
||||
match="12 34 56",
|
||||
matching_algorithm=Tag.MATCH_ALL
|
||||
)
|
||||
self.assertFalse(t.matches("I have 12 in me"))
|
||||
self.assertFalse(t.matches("I have 34 in me"))
|
||||
self.assertFalse(t.matches("I have 56 in me"))
|
||||
self.assertFalse(t.matches("I have 12 and 34 in me"))
|
||||
self.assertTrue(t.matches("I have 12 34, and 56 in me"))
|
||||
self.assertFalse(t.matches("I have 120, 34, and 56 in me"))
|
||||
self.assertFalse(t.matches("I have 123456 in me"))
|
||||
self.assertFalse(t.matches("I have 01234567 in me"))
|
||||
|
||||
def test_match_any(self):
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 0",
|
||||
match="alpha charlie gamma",
|
||||
matching_algorithm=Tag.MATCH_ANY
|
||||
)
|
||||
|
||||
self.assertTrue(t.matches("I have alpha in me"))
|
||||
self.assertTrue(t.matches("I have charlie in me"))
|
||||
self.assertTrue(t.matches("I have gamma in me"))
|
||||
self.assertTrue(t.matches("I have alpha and charlie in me"))
|
||||
self.assertFalse(t.matches("I have alphas in me"))
|
||||
self.assertFalse(t.matches("I have bravo in me"))
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 1",
|
||||
match="12 34 56",
|
||||
matching_algorithm=Tag.MATCH_ANY
|
||||
)
|
||||
self.assertTrue(t.matches("I have 12 in me"))
|
||||
self.assertTrue(t.matches("I have 34 in me"))
|
||||
self.assertTrue(t.matches("I have 56 in me"))
|
||||
self.assertTrue(t.matches("I have 12 and 34 in me"))
|
||||
self.assertTrue(t.matches("I have 12 34, and 56 in me"))
|
||||
self.assertTrue(t.matches("I have 120, 34, and 560 in me"))
|
||||
self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
|
||||
self.assertFalse(t.matches("I have 123456 in me"))
|
||||
self.assertFalse(t.matches("I have 01234567 in me"))
|
||||
|
||||
def test_match_literal(self):
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 0",
|
||||
match="alpha charlie gamma",
|
||||
matching_algorithm=Tag.MATCH_LITERAL
|
||||
)
|
||||
|
||||
self.assertFalse(t.matches("I have alpha in me"))
|
||||
self.assertFalse(t.matches("I have charlie in me"))
|
||||
self.assertFalse(t.matches("I have gamma in me"))
|
||||
self.assertFalse(t.matches("I have alpha and charlie in me"))
|
||||
self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
|
||||
self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
|
||||
self.assertTrue(t.matches("I have 'alpha charlie gamma' in me"))
|
||||
self.assertFalse(t.matches("I have alphas in me"))
|
||||
self.assertFalse(t.matches("I have bravo in me"))
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 1",
|
||||
match="12 34 56",
|
||||
matching_algorithm=Tag.MATCH_LITERAL
|
||||
)
|
||||
self.assertFalse(t.matches("I have 12 in me"))
|
||||
self.assertFalse(t.matches("I have 34 in me"))
|
||||
self.assertFalse(t.matches("I have 56 in me"))
|
||||
self.assertFalse(t.matches("I have 12 and 34 in me"))
|
||||
self.assertFalse(t.matches("I have 12 34, and 56 in me"))
|
||||
self.assertFalse(t.matches("I have 120, 34, and 560 in me"))
|
||||
self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
|
||||
self.assertFalse(t.matches("I have 123456 in me"))
|
||||
self.assertFalse(t.matches("I have 01234567 in me"))
|
||||
self.assertTrue(t.matches("I have 12 34 56 in me"))
|
||||
|
||||
def test_match_regex(self):
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 0",
|
||||
match="alpha\w+gamma",
|
||||
matching_algorithm=Tag.MATCH_REGEX
|
||||
)
|
||||
|
||||
self.assertFalse(t.matches("I have alpha in me"))
|
||||
self.assertFalse(t.matches("I have gamma in me"))
|
||||
self.assertFalse(t.matches("I have alpha and charlie in me"))
|
||||
self.assertTrue(t.matches("I have alpha_and_gamma in me"))
|
||||
self.assertTrue(t.matches("I have alphas_and_gamma in me"))
|
||||
self.assertFalse(t.matches("I have alpha,and,gamma in me"))
|
||||
self.assertFalse(t.matches("I have alpha and gamma in me"))
|
||||
self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
|
||||
self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
|
||||
self.assertFalse(t.matches("I have alphas in me"))
|
@ -1,21 +1,41 @@
|
||||
from django.contrib.auth.mixins import LoginRequiredMixin
|
||||
from django.http import HttpResponse
|
||||
from django.template.defaultfilters import slugify
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
from django.views.generic import FormView, DetailView
|
||||
from django.views.generic import FormView, DetailView, TemplateView
|
||||
|
||||
from rest_framework.mixins import (
|
||||
RetrieveModelMixin, UpdateModelMixin, DestroyModelMixin, ListModelMixin)
|
||||
from rest_framework.pagination import PageNumberPagination
|
||||
from rest_framework.permissions import IsAuthenticated
|
||||
from rest_framework.viewsets import (
|
||||
ModelViewSet, ReadOnlyModelViewSet, GenericViewSet)
|
||||
|
||||
from paperless.db import GnuPG
|
||||
|
||||
from .models import Document
|
||||
from .forms import UploadForm
|
||||
from .models import Correspondent, Tag, Document, Log
|
||||
from .serialisers import (
|
||||
CorrespondentSerializer, TagSerializer, DocumentSerializer, LogSerializer)
|
||||
|
||||
|
||||
class PdfView(DetailView):
|
||||
class IndexView(TemplateView):
|
||||
|
||||
template_name = "documents/index.html"
|
||||
|
||||
def get_context_data(self, **kwargs):
|
||||
print(kwargs)
|
||||
print(self.request.GET)
|
||||
print(self.request.POST)
|
||||
return TemplateView.get_context_data(self, **kwargs)
|
||||
|
||||
|
||||
class FetchView(DetailView):
|
||||
|
||||
model = Document
|
||||
|
||||
def render_to_response(self, context, **response_kwargs):
|
||||
"""
|
||||
Override the default to return the unencrypted PDF as raw data.
|
||||
Override the default to return the unencrypted image/PDF as raw data.
|
||||
"""
|
||||
|
||||
content_types = {
|
||||
@ -26,19 +46,25 @@ class PdfView(DetailView):
|
||||
Document.TYPE_TIF: "image/tiff",
|
||||
}
|
||||
|
||||
if self.kwargs["kind"] == "thumb":
|
||||
return HttpResponse(
|
||||
GnuPG.decrypted(self.object.thumbnail_file),
|
||||
content_type=content_types[Document.TYPE_PNG]
|
||||
)
|
||||
|
||||
response = HttpResponse(
|
||||
GnuPG.decrypted(self.object.source_file),
|
||||
content_type=content_types[self.object.file_type]
|
||||
)
|
||||
response["Content-Disposition"] = 'attachment; filename="{}"'.format(
|
||||
slugify(str(self.object)) + "." + self.object.file_type)
|
||||
self.object.file_name)
|
||||
|
||||
return response
|
||||
|
||||
|
||||
class PushView(FormView):
|
||||
class PushView(LoginRequiredMixin, FormView):
|
||||
"""
|
||||
A crude REST API for creating documents.
|
||||
A crude REST-ish API for creating documents.
|
||||
"""
|
||||
|
||||
form_class = UploadForm
|
||||
@ -52,3 +78,45 @@ class PushView(FormView):
|
||||
|
||||
def form_invalid(self, form):
|
||||
return HttpResponse("0")
|
||||
|
||||
|
||||
class StandardPagination(PageNumberPagination):
|
||||
page_size = 25
|
||||
page_size_query_param = "page-size"
|
||||
max_page_size = 100000
|
||||
|
||||
|
||||
class CorrespondentViewSet(ModelViewSet):
|
||||
model = Correspondent
|
||||
queryset = Correspondent.objects.all()
|
||||
serializer_class = CorrespondentSerializer
|
||||
pagination_class = StandardPagination
|
||||
permission_classes = (IsAuthenticated,)
|
||||
|
||||
|
||||
class TagViewSet(ModelViewSet):
|
||||
model = Tag
|
||||
queryset = Tag.objects.all()
|
||||
serializer_class = TagSerializer
|
||||
pagination_class = StandardPagination
|
||||
permission_classes = (IsAuthenticated,)
|
||||
|
||||
|
||||
class DocumentViewSet(RetrieveModelMixin,
|
||||
UpdateModelMixin,
|
||||
DestroyModelMixin,
|
||||
ListModelMixin,
|
||||
GenericViewSet):
|
||||
model = Document
|
||||
queryset = Document.objects.all()
|
||||
serializer_class = DocumentSerializer
|
||||
pagination_class = StandardPagination
|
||||
permission_classes = (IsAuthenticated,)
|
||||
|
||||
|
||||
class LogViewSet(ReadOnlyModelViewSet):
|
||||
model = Log
|
||||
queryset = Log.objects.all().by_group()
|
||||
serializer_class = LogSerializer
|
||||
pagination_class = StandardPagination
|
||||
permission_classes = (IsAuthenticated,)
|
||||
|
@ -1,12 +0,0 @@
|
||||
from django.contrib import admin
|
||||
|
||||
from .models import Log
|
||||
|
||||
|
||||
class LogAdmin(admin.ModelAdmin):
|
||||
|
||||
list_display = ("message", "level", "component")
|
||||
list_filter = ("level", "component",)
|
||||
|
||||
|
||||
admin.site.register(Log, LogAdmin)
|
@ -1,5 +0,0 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class LoggerConfig(AppConfig):
|
||||
name = 'logger'
|
@ -1,50 +0,0 @@
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Log(models.Model):
|
||||
|
||||
LEVEL_ERROR = 1
|
||||
LEVEL_WARNING = 2
|
||||
LEVEL_INFO = 3
|
||||
LEVEL_DEBUG = 4
|
||||
LEVELS = (
|
||||
(LEVEL_ERROR, "Error"),
|
||||
(LEVEL_WARNING, "Warning"),
|
||||
(LEVEL_INFO, "Informational"),
|
||||
(LEVEL_DEBUG, "Debugging"),
|
||||
)
|
||||
|
||||
COMPONENT_CONSUMER = 1
|
||||
COMPONENT_MAIL = 2
|
||||
COMPONENTS = (
|
||||
(COMPONENT_CONSUMER, "Consumer"),
|
||||
(COMPONENT_MAIL, "Mail Fetcher")
|
||||
)
|
||||
|
||||
time = models.DateTimeField(auto_now_add=True)
|
||||
message = models.TextField()
|
||||
level = models.PositiveIntegerField(choices=LEVELS, default=LEVEL_INFO)
|
||||
component = models.PositiveIntegerField(choices=COMPONENTS)
|
||||
|
||||
class Meta(object):
|
||||
ordering = ("-time",)
|
||||
|
||||
@classmethod
|
||||
def error(cls, message, component):
|
||||
cls.objects.create(
|
||||
message=message, level=cls.LEVEL_ERROR, component=component)
|
||||
|
||||
@classmethod
|
||||
def warning(cls, message, component):
|
||||
cls.objects.create(
|
||||
message=message, level=cls.LEVEL_WARNING, component=component)
|
||||
|
||||
@classmethod
|
||||
def info(cls, message, component):
|
||||
cls.objects.create(
|
||||
message=message, level=cls.LEVEL_INFO, component=component)
|
||||
|
||||
@classmethod
|
||||
def debug(cls, message, component):
|
||||
cls.objects.create(
|
||||
message=message, level=cls.LEVEL_DEBUG, component=component)
|
@ -1,3 +0,0 @@
|
||||
from django.test import TestCase
|
||||
|
||||
# Create your tests here.
|
@ -1,3 +0,0 @@
|
||||
from django.shortcuts import render
|
||||
|
||||
# Create your views here.
|
@ -12,6 +12,8 @@ https://docs.djangoproject.com/en/1.9/ref/settings/
|
||||
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
|
||||
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
@ -42,7 +44,8 @@ INSTALLED_APPS = [
|
||||
"django_extensions",
|
||||
|
||||
"documents",
|
||||
"logger",
|
||||
|
||||
"rest_framework",
|
||||
|
||||
]
|
||||
|
||||
@ -87,12 +90,12 @@ DATABASES = {
|
||||
"NAME": os.path.join(BASE_DIR, "..", "data", "db.sqlite3"),
|
||||
}
|
||||
}
|
||||
if os.environ.get("PAPERLESS_DBUSER") and os.environ.get("PAPERLESS_DBPASS"):
|
||||
if os.getenv("PAPERLESS_DBUSER") and os.getenv("PAPERLESS_DBPASS"):
|
||||
DATABASES["default"] = {
|
||||
"ENGINE": "django.db.backends.postgresql_psycopg2",
|
||||
"NAME": os.environ.get("PAPERLESS_DBNAME", "paperless"),
|
||||
"USER": os.environ.get("PAPERLESS_DBUSER"),
|
||||
"PASSWORD": os.environ.get("PAPERLESS_DBPASS")
|
||||
"NAME": os.getenv("PAPERLESS_DBNAME", "paperless"),
|
||||
"USER": os.getenv("PAPERLESS_DBUSER"),
|
||||
"PASSWORD": os.getenv("PAPERLESS_DBPASS")
|
||||
}
|
||||
|
||||
|
||||
@ -139,55 +142,119 @@ STATIC_URL = '/static/'
|
||||
MEDIA_URL = "/media/"
|
||||
|
||||
|
||||
# Paperless-specific stuffs
|
||||
# Change these paths if yours are different
|
||||
# Paperless-specific stuff
|
||||
# You shouldn't have to edit any of these values. Rather, you can set these
|
||||
# values in /etc/paperless.conf instead.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# Tap paperless.conf if it's available
|
||||
if os.path.exists("/etc/paperless.conf"):
|
||||
load_dotenv("/etc/paperless.conf")
|
||||
|
||||
|
||||
# Logging
|
||||
|
||||
LOGGING = {
|
||||
"version": 1,
|
||||
"disable_existing_loggers": False,
|
||||
"handlers": {
|
||||
"consumer": {
|
||||
"class": "documents.loggers.PaperlessLogger",
|
||||
}
|
||||
},
|
||||
"loggers": {
|
||||
"documents": {
|
||||
"handlers": ["consumer"],
|
||||
"level": os.getenv("PAPERLESS_CONSUMER_LOG_LEVEL", "INFO"),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# The default language that tesseract will attempt to use when parsing
|
||||
# documents. It should be a 3-letter language code consistent with ISO 639.
|
||||
OCR_LANGUAGE = "eng"
|
||||
|
||||
# The amount of threads to use for OCR
|
||||
OCR_THREADS = os.environ.get("PAPERLESS_OCR_THREADS")
|
||||
OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")
|
||||
|
||||
# If this is true, any failed attempts to OCR a PDF will result in the PDF being
|
||||
# indexed anyway, with whatever we could get. If it's False, the file will
|
||||
# simply be left in the CONSUMPTION_DIR.
|
||||
FORGIVING_OCR = True
|
||||
# If this is true, any failed attempts to OCR a PDF will result in the PDF
|
||||
# being indexed anyway, with whatever we could get. If it's False, the file
|
||||
# will simply be left in the CONSUMPTION_DIR.
|
||||
FORGIVING_OCR = bool(os.getenv("PAPERLESS_FORGIVING_OCR", "YES").lower() in ("yes", "y", "1", "t", "true"))
|
||||
|
||||
# GNUPG needs a home directory for some reason
|
||||
GNUPG_HOME = os.environ.get("HOME", "/dev/null")
|
||||
GNUPG_HOME = os.getenv("HOME", "/tmp")
|
||||
|
||||
# Convert is part of the Imagemagick package
|
||||
CONVERT_BINARY = "/usr/bin/convert"
|
||||
# Convert is part of the ImageMagick package
|
||||
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY")
|
||||
|
||||
# Unpaper
|
||||
UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
|
||||
|
||||
# This will be created if it doesn't exist
|
||||
SCRATCH_DIR = "/tmp/paperless"
|
||||
SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")
|
||||
|
||||
# This is where Paperless will look for PDFs to index
|
||||
CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME")
|
||||
CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR")
|
||||
|
||||
# If you want to use IMAP mail consumption, populate this with useful values.
|
||||
# If you leave HOST set to None, we assume you're not going to use this feature.
|
||||
# If you leave HOST set to None, we assume you're not going to use this
|
||||
# feature.
|
||||
MAIL_CONSUMPTION = {
|
||||
"HOST": os.environ.get("PAPERLESS_CONSUME_MAIL_HOST"),
|
||||
"PORT": os.environ.get("PAPERLESS_CONSUME_MAIL_PORT"),
|
||||
"USERNAME": os.environ.get("PAPERLESS_CONSUME_MAIL_USER"),
|
||||
"PASSWORD": os.environ.get("PAPERLESS_CONSUME_MAIL_PASS"),
|
||||
"HOST": os.getenv("PAPERLESS_CONSUME_MAIL_HOST"),
|
||||
"PORT": os.getenv("PAPERLESS_CONSUME_MAIL_PORT"),
|
||||
"USERNAME": os.getenv("PAPERLESS_CONSUME_MAIL_USER"),
|
||||
"PASSWORD": os.getenv("PAPERLESS_CONSUME_MAIL_PASS"),
|
||||
"USE_SSL": True, # If True, use SSL/TLS to connect
|
||||
"INBOX": "INBOX" # The name of the inbox on the server
|
||||
}
|
||||
|
||||
# This is used to encrypt the original documents and decrypt them later when you
|
||||
# want to download them. Set it and change the permissions on this file to
|
||||
# This is used to encrypt the original documents and decrypt them later when
|
||||
# you want to download them. Set it and change the permissions on this file to
|
||||
# 0600, or set it to `None` and you'll be prompted for the passphrase at
|
||||
# runtime. The default looks for an environment variable.
|
||||
# DON'T FORGET TO SET THIS as leaving it blank may cause some strange things
|
||||
# with GPG, including an interesting case where it may "encrypt" zero-byte
|
||||
# files.
|
||||
PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")
|
||||
PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE")
|
||||
|
||||
# If you intend to use the "API" to push files into the consumer, you'll need to
|
||||
# provide a shared secret here. Leaving this as the default will disable the
|
||||
# API.
|
||||
UPLOAD_SHARED_SECRET = os.environ.get("PAPERLESS_SECRET", "")
|
||||
# If you intend to use the "API" to push files into the consumer, you'll need
|
||||
# to provide a shared secret here. Leaving this as the default will disable
|
||||
# the API.
|
||||
SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "")
|
||||
|
||||
#
|
||||
# TODO: Remove after 1.2
|
||||
#
|
||||
# This logic is here to address issue #44, wherein we were using inconsistent
|
||||
# constant names vs. environment variables. If you're using Paperless for the
|
||||
# first time, you can safely ignore everything from here on, so long as you're
|
||||
# correctly defining the variables as per the documentation.
|
||||
#
|
||||
|
||||
|
||||
def deprecated(before, after):
    """
    Tell the user that an environment variable has been renamed.

    :param before: The old (deprecated) environment variable name.
    :param after: The new name that should be used instead.

    The notice is written to standard error, not standard output. This
    function is called while the settings module is being imported, so
    anything written to stdout would be mixed into the output of every
    command that loads the settings, including ones whose stdout is data.
    """

    # Local import so this block does not rely on a module-level `import sys`.
    import sys

    print(
        "\n\n"
        "WARNING: {before} has been renamed to {after}.\n"
        "WARNING: Use of {before} will not work as of version 1.2."
        "\n\n".format(
            before=before,
            after=after
        ),
        file=sys.stderr
    )
|
||||
|
||||
# Backward-compatibility fallbacks for the pre-rename environment variables
# PAPERLESS_CONVERT, PAPERLESS_CONSUME and PAPERLESS_SECRET.
# CONSUMPTION_DIR and SHARED_SECRET fall back to the legacy variable only when
# the setting is still falsy and the legacy variable is set; in that case a
# deprecation notice is printed via deprecated() before the value is taken.
# NOTE(review): indentation was lost in this rendering, so whether the
# PAPERLESS_CONVERT check is nested inside `if not CONVERT_BINARY:` cannot be
# confirmed from here -- verify against the real settings.py.
if not CONVERT_BINARY:
|
||||
CONVERT_BINARY = "convert"
|
||||
if os.getenv("PAPERLESS_CONVERT"):
|
||||
deprecated("PAPERLESS_CONVERT", "PAPERLESS_CONVERT_BINARY")
|
||||
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT", CONVERT_BINARY)
|
||||
|
||||
if not CONSUMPTION_DIR and os.getenv("PAPERLESS_CONSUME"):
|
||||
deprecated("PAPERLESS_CONSUME", "PAPERLESS_CONSUMPTION_DIR")
|
||||
CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUME")
|
||||
|
||||
if not SHARED_SECRET and os.getenv("PAPERLESS_SECRET"):
|
||||
deprecated("PAPERLESS_SECRET", "PAPERLESS_SHARED_SECRET")
|
||||
SHARED_SECRET = os.getenv("PAPERLESS_SECRET", "")
|
||||
|
@ -15,15 +15,46 @@ Including another URLconf
|
||||
3. Add a URL to urlpatterns: url(r'^blog/', include(blog_urls))
|
||||
"""
|
||||
from django.conf import settings
|
||||
from django.conf.urls import url, static
|
||||
from django.conf.urls import url, static, include
|
||||
from django.contrib import admin
|
||||
|
||||
from documents.views import PdfView, PushView
|
||||
from rest_framework.routers import DefaultRouter
|
||||
|
||||
from documents.views import (
|
||||
IndexView, FetchView, PushView,
|
||||
CorrespondentViewSet, TagViewSet, DocumentViewSet, LogViewSet
|
||||
)
|
||||
|
||||
router = DefaultRouter()
|
||||
router.register(r'correspondents', CorrespondentViewSet)
|
||||
router.register(r'tags', TagViewSet)
|
||||
router.register(r'documents', DocumentViewSet)
|
||||
router.register(r'logs', LogViewSet)
|
||||
|
||||
urlpatterns = [
|
||||
url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"),
|
||||
url(r'', admin.site.urls),
|
||||
|
||||
# API
|
||||
url(
|
||||
r"^api/auth/",
|
||||
include('rest_framework.urls', namespace="rest_framework")
|
||||
),
|
||||
url(r"^api/", include(router.urls, namespace="drf")),
|
||||
|
||||
# Normal pages (coming soon)
|
||||
# url(r"^$", IndexView.as_view(), name="index"),
|
||||
|
||||
# File downloads
|
||||
url(
|
||||
r"^fetch/(?P<kind>doc|thumb)/(?P<pk>\d+)$",
|
||||
FetchView.as_view(),
|
||||
name="fetch"
|
||||
),
|
||||
|
||||
# The Django admin
|
||||
url(r"admin/", admin.site.urls),
|
||||
url(r"", admin.site.urls), # This is going away
|
||||
|
||||
] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
|
||||
|
||||
if settings.UPLOAD_SHARED_SECRET:
|
||||
if settings.SHARED_SECRET:
|
||||
urlpatterns.insert(0, url(r"^push$", PushView.as_view(), name="push"))
|
||||
|
@ -1 +1 @@
|
||||
__version__ = (0, 0, 6)
|
||||
__version__ = (0, 1, 1)
|
||||
|
23
src/tox.ini
Normal file
23
src/tox.ini
Normal file
@ -0,0 +1,23 @@
|
||||
# Tox (http://tox.testrun.org/) is a tool for running tests
|
||||
# in multiple virtualenvs. This configuration file will run the
|
||||
# test suite on all supported python versions. To use it, "pip install tox"
|
||||
# and then run "tox" from this directory.
|
||||
|
||||
[tox]
|
||||
skipsdist = True
|
||||
envlist = py34, py35, pep8
|
||||
|
||||
[testenv]
commands = {envpython} manage.py test
deps = -r{toxinidir}/../requirements.txt
# Use the current variable names. PAPERLESS_CONSUME and PAPERLESS_SECRET are
# the deprecated spellings: settings.py prints a deprecation warning when it
# has to fall back to them, and they stop working as of version 1.2.
setenv =
    PAPERLESS_CONSUMPTION_DIR=/tmp
    PAPERLESS_PASSPHRASE=THISISNOTASECRET
    PAPERLESS_SHARED_SECRET=paperless
|
||||
|
||||
[testenv:pep8]
|
||||
commands=pep8
|
||||
deps=pep8
|
||||
|
||||
[pep8]
|
||||
exclude=.tox,migrations,paperless/settings.py
|
Loading…
x
Reference in New Issue
Block a user