mirror of https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00

commit f5e0a89a3f
.gitignore (vendored, 7 changes)

@@ -57,7 +57,9 @@ docs/_build/
 target/
 
 # Stored PDFs
-media/*
+media/documents/*.gpg
+media/documents/thumbnails/*.gpg
+media/documents/originals/*.gpg
 
 # Sqlite database
 db.sqlite3
@@ -68,8 +70,9 @@ db.sqlite3
 # Other stuff that doesn't belong
 virtualenv
 .vagrant
+docker-compose.yml
+docker-compose.env
 
 # Used for development
 scripts/import-for-development
 environment
-
.travis.yml (new file, 18 lines)

@@ -0,0 +1,18 @@
+language: python
+
+sudo: false
+
+matrix:
+  include:
+    - python: 3.4
+      env: TOXENV=py34
+    - python: 3.5
+      env: TOXENV=py35
+    - python: 3.5
+      env: TOXENV=pep8
+
+install:
+  - pip install --requirement requirements.txt
+  - pip install tox
+
+script: tox -c src/tox.ini
Dockerfile (new file, 46 lines)

@@ -0,0 +1,46 @@
+FROM python:3.5.1
+MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
+
+# Install dependencies
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        sudo \
+        tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install python dependencies
+RUN mkdir -p /usr/src/paperless
+WORKDIR /usr/src/paperless
+COPY requirements.txt /usr/src/paperless/
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application
+RUN mkdir -p /usr/src/paperless/src
+RUN mkdir -p /usr/src/paperless/data
+RUN mkdir -p /usr/src/paperless/media
+COPY src/ /usr/src/paperless/src/
+COPY data/ /usr/src/paperless/data/
+COPY media/ /usr/src/paperless/media/
+
+# Set consumption directory
+ENV PAPERLESS_CONSUMPTION_DIR /consume
+RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR
+
+# Migrate database
+WORKDIR /usr/src/paperless/src
+RUN ./manage.py migrate
+
+# Create user
+RUN groupadd -g 1000 paperless \
+    && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
+    && chown -Rh paperless:paperless /usr/src/paperless
+
+# Setup entrypoint
+COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
+RUN chmod 755 /sbin/docker-entrypoint.sh
+
+# Mount volumes
+VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume"]
+
+ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
+CMD ["--help"]
README.rst

@@ -3,6 +3,7 @@ Paperless
 
 |Documentation|
 |Chat|
+|Travis|
 
 Scan, index, and archive all of your paper documents
 
@@ -55,6 +56,7 @@ powerful tools.
 
 * `ImageMagick`_ converts the images between colour and greyscale.
 * `Tesseract`_ does the character recognition.
+* `Unpaper`_ despeckles and deskews the scanned image.
 * `GNU Privacy Guard`_ is used as the encryption backend.
 * `Python 3`_ is the language of the project.
 
@@ -92,6 +94,7 @@ home.
 .. _this one: http://www.brother.ca/en-CA/Scanners/11/ProductDetail/ADS1500W?ProductDetail=productdetail
 .. _ImageMagick: http://imagemagick.org/
 .. _Tesseract: https://github.com/tesseract-ocr
+.. _Unpaper: https://www.flameeyes.eu/projects/unpaper
 .. _GNU Privacy Guard: https://gnupg.org/
 .. _Python 3: https://python.org/
 .. _Pillow: https://pypi.python.org/pypi/pillowfight/
@@ -105,4 +108,5 @@ home.
 .. |Chat| image:: https://badges.gitter.im/danielquinn/paperless.svg
    :alt: Join the chat at https://gitter.im/danielquinn/paperless
    :target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
+.. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master
+   :target: https://travis-ci.org/danielquinn/paperless
docker-compose.env.example (new file, 15 lines)

@@ -0,0 +1,15 @@
+# Environment variables to set for Paperless
+# Commented out variables will be replaced by a default within Paperless.
+
+# Passphrase Paperless uses to encrypt and decrypt your documents
+PAPERLESS_PASSPHRASE=CHANGE_ME
+
+# The number of threads to use for text recognition
+# PAPERLESS_OCR_THREADS=4
+
+# Additional languages to install for text recognition
+# PAPERLESS_OCR_LANGUAGES=deu ita
+
+# You can change the default user and group id to a custom one
+# USERMAP_UID=1000
+# USERMAP_GID=1000
docker-compose.yml.example (new file, 37 lines)

@@ -0,0 +1,37 @@
+version: '2'
+
+services:
+  webserver:
+    image: paperless
+    ports:
+      # You can adapt the port you want Paperless to listen on by
+      # modifying the part before the `:`.
+      - "8000:8000"
+    volumes:
+      - data:/usr/src/paperless/data
+      - media:/usr/src/paperless/media
+    env_file: docker-compose.env
+    environment:
+      - PAPERLESS_OCR_LANGUAGES=
+    command: ["runserver", "0.0.0.0:8000"]
+
+  consumer:
+    image: paperless
+    volumes:
+      - data:/usr/src/paperless/data
+      - media:/usr/src/paperless/media
+      # You have to adapt the local path you want the consumption
+      # directory to mount to by modifying the part before the ':'.
+      - /path/to/arbitrary/place:/consume
+      # Likewise, you can add a local path to mount a directory for
+      # exporting.  This is not strictly needed for paperless to
+      # function, only if you're exporting your files: uncomment
+      # it and fill in a local path if you know you're going to
+      # want to export your documents.
+      # - /path/to/another/arbitrary/place:/export
+    env_file: docker-compose.env
+    command: ["document_consumer"]
+
+volumes:
+  data:
+  media:
docs/Dockerfile (new file, 18 lines)

@@ -0,0 +1,18 @@
+FROM python:3.5.1
+MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
+
+# Install Sphinx and Pygments
+RUN pip install Sphinx Pygments
+
+# Setup directories, copy data
+RUN mkdir /build
+COPY . /build
+WORKDIR /build/docs
+
+# Build documentation
+RUN make html
+
+# Start webserver
+WORKDIR /build/docs/_build/html
+EXPOSE 8000/tcp
+CMD ["python3", "-m", "http.server"]
docs/api.rst (new file, 23 lines)

@@ -0,0 +1,23 @@
+.. _api:
+
+The REST API
+############
+
+Paperless makes use of the `Django REST Framework`_ standard API interface
+because of its inherent awesomeness.  Conveniently, the system is also
+self-documenting, so to learn more about the access points, schema, what's
+accepted and what isn't, you need only visit ``/api`` on your local Paperless
+installation.
+
+.. _Django REST Framework: http://django-rest-framework.org/
+
+
+.. _api-uploading:
+
+Uploading
+---------
+
+File uploads in an API are hard and so far as I've been able to tell, there's
+no standard way of accepting them, so rather than crowbar file uploads into the
+REST API and endure that headache, I've left that process to a simple HTTP
+POST, documented on the :ref:`consumption page <consumption-http>`.
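Since this is a stock Django REST Framework setup, you can also poke at the API programmatically. A minimal sketch with ``requests``: the root listing behaviour is standard DRF, but the port, credentials, and what endpoints appear are assumptions about your particular installation.

.. code-block:: python

    import requests

    # Browse the self-documenting API root; with DRF's default router this
    # returns a JSON map of the available endpoints.  The credentials are
    # whatever superuser you created -- "paperless"/"secret" is made up here.
    response = requests.get(
        "http://localhost:8000/api/",
        auth=("paperless", "secret"),
    )
    response.raise_for_status()
    print(response.json())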
docs/changelog.rst

@@ -1,10 +1,51 @@
 Changelog
 #########
 
+* 0.1.1
+
+  * Potentially **Breaking Change**: All references to "sender" in the code
+    have been renamed to "correspondent" to better reflect the nature of the
+    property (one could quite reasonably scan a document before sending it to
+    someone.)
+  * `#67`_: Rewrote the document exporter and added a new importer that allows
+    for full metadata retention without depending on the file name and
+    modification time.  A big thanks to `Tikitu de Jager`_, `Pit`_,
+    `Florian Jung`_, and `Christopher Luu`_ for their code snippets and
+    contributing conversation that led to this change.
+  * `#20`_: Added *unpaper* support to help in cleaning up the scanned image
+    before it's OCR'd.  Thanks to `Pit`_ for this one.
+  * `#71`_: Added (encrypted) thumbnails in anticipation of a proper UI.
+  * `#68`_: Added support for using a proper config file at
+    ``/etc/paperless.conf`` and modified the systemd unit files to use it.
+  * Refactored the Vagrant installation process to use environment variables
+    rather than asking the user to modify ``settings.py``.
+  * `#44`_: Harmonise environment variable names with constant names.
+  * `#60`_: Setup logging to actually use the Python native logging framework.
+  * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images
+    to be imported but made unavailable.
+
+* 0.1.0
+
+  * Docker support!  Big thanks to `Wayne Werner`_, `Brian Conn`_, and
+    `Tikitu de Jager`_ for this one, and especially to `Pit`_, who
+    spearheaded this effort.
+  * A simple REST API is in place, but it should be considered unstable.
+  * Cleaned up the consumer to use temporary directories instead of a single
+    scratch space.  (Thanks `Pit`_)
+  * Improved the efficiency of the consumer by parsing pages more intelligently
+    and introducing a threaded OCR process (thanks again `Pit`_).
+  * `#45`_: Cleaned up the logic for tag matching.  Reported by `darkmatter`_.
+  * `#47`_: Auto-rotate landscape documents.  Reported by `Paul`_ and fixed by
+    `Pit`_.
+  * `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_)
+  * `#54`_: Documented the re-tagger (`zedster`_)
+  * `#57`_: Make sure file is preserved on import failure (`darkmatter`_)
+  * Added tox with pep8 checking
+
 * 0.0.6
 
-  * Added support for parallel OCR (significant work from pitkley)
-  * Sped up the language detection (significant work from pitkley)
+  * Added support for parallel OCR (significant work from `Pit`_)
+  * Sped up the language detection (significant work from `Pit`_)
   * Added simple logging
 
 * 0.0.5
@@ -35,3 +76,26 @@ Changelog
 * 0.0.1
 
   * Initial release
+
+.. _Brian Conn: https://github.com/TheConnMan
+.. _Christopher Luu: https://github.com/nuudles
+.. _Florian Jung: https://github.com/the01
+.. _Tikitu de Jager: https://github.com/tikitu
+.. _Paul: https://github.com/polo2ro
+.. _Pit: https://github.com/pitkley
+.. _Wayne Werner: https://github.com/waynew
+.. _darkmatter: https://github.com/darkmatter
+.. _zedster: https://github.com/zedster
+
+.. _#20: https://github.com/danielquinn/paperless/issues/20
+.. _#44: https://github.com/danielquinn/paperless/issues/44
+.. _#45: https://github.com/danielquinn/paperless/issues/45
+.. _#47: https://github.com/danielquinn/paperless/issues/47
+.. _#48: https://github.com/danielquinn/paperless/issues/48
+.. _#53: https://github.com/danielquinn/paperless/issues/53
+.. _#54: https://github.com/danielquinn/paperless/issues/54
+.. _#57: https://github.com/danielquinn/paperless/issues/57
+.. _#60: https://github.com/danielquinn/paperless/issues/60
+.. _#67: https://github.com/danielquinn/paperless/issues/67
+.. _#68: https://github.com/danielquinn/paperless/issues/68
+.. _#71: https://github.com/danielquinn/paperless/issues/71
docs/consumption.rst

@@ -40,14 +40,14 @@ follow the :ref:`consumer <utilities-consumer>` instructions to get it running.
 A Note on File Naming
 ---------------------
 
-Any document you put into the consumption directory will be consumed, but if you
-name the file right, it'll automatically set some values in the database for
-you. This is is the logic the consumer follows:
+Any document you put into the consumption directory will be consumed, but if
+you name the file right, it'll automatically set some values in the database
+for you.  This is the logic the consumer follows:
 
-1. Try to find the sender, title, and tags in the file name following the
-   pattern: ``Sender - Title - tag,tag,tag.pdf``.
-2. If that doesn't work, try to find the sender and title in the file name
-   following the pattern: ``Sender - Title.pdf``.
+1. Try to find the correspondent, title, and tags in the file name following
+   the pattern: ``Correspondent - Title - tag,tag,tag.pdf``.
+2. If that doesn't work, try to find the correspondent and title in the file
+   name following the pattern: ``Correspondent - Title.pdf``.
 3. If that doesn't work, just assume that the name of the file is the title.
 
 So given the above, the following examples would work as you'd expect:
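The three-step fallback above is easy to picture as a tiny parser. A sketch of the described logic, assuming a plain ``" - "`` split rather than whatever regular expressions the consumer actually uses:

.. code-block:: python

    import os

    def parse_filename(path):
        """Illustrates the three-step fallback described above; the real
        consumer uses its own matching code, so this is only a sketch."""
        name, _ = os.path.splitext(os.path.basename(path))
        parts = name.split(" - ")
        if len(parts) == 3:  # Correspondent - Title - tag,tag,tag
            correspondent, title, tags = parts
            return correspondent, title, tags.split(",")
        if len(parts) == 2:  # Correspondent - Title
            return parts[0], parts[1], []
        return None, name, []  # fall back to using the file name as the title

    print(parse_filename("Amazon - Invoice - money,orders.pdf"))
    # ('Amazon', 'Invoice', ['money', 'orders'])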
@@ -97,9 +97,9 @@ So, with all that in mind, here's what you do to get it running:
    the configured email account every 10 minutes for something new and pull down
    whatever it finds.
 4. Send yourself an email!  Note that the subject is treated as the file name,
-   so if you set the subject to ``Sender - Title - tag,tag,tag``, you'll get
-   what you expect. Also, you must include the aforementioned secret string in
-   every email so the fetcher knows that it's safe to import.
+   so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll
+   get what you expect.  Also, you must include the aforementioned secret
+   string in every email so the fetcher knows that it's safe to import.
 5. After a few minutes, the consumer will poll your mailbox, pull down the
    message, and place the attachment in the consumption directory with the
    appropriate name.  A few minutes later, the consumer will import it like any
@@ -111,23 +111,22 @@ So, with all that in mind, here's what you do to get it running:
 HTTP POST
 =========
 
-Currently, the API is limited to only handling file uploads, it doesn't do tags
-yet, and the URL schema isn't concrete, but it's a start. It's also not much of
-a real API, it's just a URL that accepts an HTTP POST.
+You can also submit a document via HTTP POST.  It doesn't do tags yet, and the
+URL schema isn't concrete, but it's a start.
 
-To push your document to *Paperless*, send an HTTP POST to the server with the
+To push your document to Paperless, send an HTTP POST to the server with the
 following name/value pairs:
 
-* ``sender``: The name of the document's sender.  Note that there are
-  restrictions on what characters you can use here.  Specifically, alphanumeric
-  characters, `-`, `,`, `.`, and `'` are ok, everything else it out.  You also
-  can't use the sequence ` - ` (space, dash, space).
-* ``title``: The title of the document.  The rules for characters is the same
-  here as the sender.
-* ``signature``: For security reasons, we have the sender send a signature using
-  a "shared secret" method to make sure that random strangers don't start
-  uploading stuff to your server.  The means of generating this signature is
-  defined below.
+* ``correspondent``: The name of the document's correspondent.  Note that there
+  are restrictions on what characters you can use here.  Specifically,
+  alphanumeric characters, `-`, `,`, `.`, and `'` are ok; everything else is
+  out.  You also can't use the sequence ` - ` (space, dash, space).
+* ``title``: The title of the document.  The rules for characters are the same
+  here as for the correspondent.
+* ``signature``: For security reasons, we have the correspondent send a
+  signature using a "shared secret" method to make sure that random strangers
+  don't start uploading stuff to your server.  The means of generating this
+  signature is defined below.
 
 Specify ``enctype="multipart/form-data"``, and then POST your file with::
 
@@ -146,12 +145,12 @@ verification.
 
 In the case of *Paperless*, you configure the server with the secret by setting
 ``UPLOAD_SHARED_SECRET``.  Then on your client, you generate your signature by
-concatenating the sender, title, and the secret, and then using sha256 to
-generate a hexdigest.
+concatenating the correspondent, title, and the secret, and then using sha256
+to generate a hexdigest.
 
 If you're using Python, this is what that looks like:
 
 .. code:: python
 
     from hashlib import sha256
-    signature = sha256(sender + title + secret).hexdigest()
+    signature = sha256(correspondent + title + secret).hexdigest()
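One wrinkle the snippet glosses over: in Python 3, ``sha256()`` requires bytes, so the concatenated string has to be encoded first. Putting the whole exchange together with ``requests``; note the endpoint URL and the file-field name below are hypothetical, since they're elided from this page:

.. code-block:: python

    from hashlib import sha256

    import requests

    correspondent = "Amazon"
    title = "Invoice"
    secret = "your-shared-secret"  # must match UPLOAD_SHARED_SECRET on the server

    # Python 3 needs bytes here; the snippet above assumes Python 2 strings.
    signature = sha256(
        (correspondent + title + secret).encode("utf-8")
    ).hexdigest()

    with open("Invoice.pdf", "rb") as fh:
        requests.post(
            "http://localhost:8000/push",       # hypothetical URL
            data={
                "correspondent": correspondent,
                "title": title,
                "signature": signature,
            },
            files={"document": fh},             # hypothetical field name
        )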
docs/index.rst

@@ -30,6 +30,7 @@ Contents
    requirements
    setup
    consumption
+   api
    utilities
    migrating
    changelog
docs/migrating.rst

@@ -4,31 +4,10 @@ Migrating, Updates, and Backups
 ===============================
 
 As *Paperless* is still under active development, there's a lot that can change
-as software updates roll out.  The thing you just need to remember for all of
-this is that for the most part, **the database is expendable** so long as you
-have your files.  This is because the file name of the exported files includes
-the name of the sender, the title, and the tags (if any) on each file.
-
-
-.. _migrating-updates:
-
-Updates
--------
-
-For the most part, all you have to do to update *Paperless* is run ``git pull``
-on the directory containing the project files, and then use Django's ``migrate``
-command to execute any database schema updates that might have been rolled in
-as part of the update:
-
-.. code:: bash
-
-    $ cd /path/to/project
-    $ git pull
-    $ cd src
-    $ ./manage.py migrate
-
-Note that it's possible (even likely) that while ``git pull`` may update some
-files, the ``migrate`` step may not update anything.  This is totally normal.
+as software updates roll out.  You should back up often, so if anything goes
+wrong during an update, you at least have a means of restoring to something
+usable.  Thankfully, there are automated ways of backing up, restoring, and
+updating the software.
 
 
 .. _migrating-backup:
@@ -38,20 +17,8 @@ Backing Up
 
 So you're bored of this whole project, or you want to make a remote backup of
 the unencrypted files for whatever reason.  This is easy to do: simply use the
-:ref:`exporter <utilities-exporter>` to dump your documents out into an
-arbitrary directory.
-
-Additionally however, you'll need to back up the tags themselves.  The file
-names contain the tag names, but you still need to define the tags and their
-matching algorithms in the database for things to work properly.  We do this
-with Django's ``dumpdata`` command, which produces JSON output.
-
-.. code:: bash
-
-    $ cd /path/to/project
-    $ cd src
-    $ ./manage.py document_export /path/to/arbitrary/place/
-    $ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json
+:ref:`exporter <utilities-exporter>` to dump your documents and database out
+into an arbitrary directory.
 
 
 .. _migrating-restoring:
@@ -66,7 +33,7 @@ create an empty database (just follow the
 ``tags.json`` file you created as part of your backup.  Lastly, copy your
 exported documents into the consumption directory and start up the consumer.
 
-.. code:: bash
+.. code-block:: shell-session
 
     $ cd /path/to/project
     $ rm data/db.sqlite3  # Delete the database
@@ -77,3 +44,60 @@ exported documents into the consumption directory and start up the consumer.
     $ cp /path/to/exported/docs/* /path/to/consumption/dir/
     $ ./manage.py document_consumer
 
+Importing your data if you are :ref:`using Docker <setup-installation-docker>`
+is almost as simple:
+
+.. code-block:: shell-session
+
+    # Stop and remove your current containers
+    $ docker-compose stop
+    $ docker-compose rm -f
+
+    # Recreate them, add the superuser
+    $ docker-compose up -d
+    $ docker-compose run --rm webserver createsuperuser
+
+    # Load the tags
+    $ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin -
+
+    # Load your exported documents into the consumption directory
+    # (How you do this highly depends on how you have set this up)
+    $ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/
+
+After loading the documents into the consumption directory, the consumer will
+immediately start consuming the documents.
+
+
+.. _migrating-updates:
+
+Updates
+-------
+
+For the most part, all you have to do to update *Paperless* is run ``git pull``
+on the directory containing the project files, and then use Django's ``migrate``
+command to execute any database schema updates that might have been rolled in
+as part of the update:
+
+.. code-block:: shell-session
+
+    $ cd /path/to/project
+    $ git pull
+    $ cd src
+    $ ./manage.py migrate
+
+Note that it's possible (even likely) that while ``git pull`` may update some
+files, the ``migrate`` step may not update anything.  This is totally normal.
+
+If you are :ref:`using Docker <setup-installation-docker>`, the update process
+requires only one additional step:
+
+.. code-block:: shell-session
+
+    $ cd /path/to/project
+    $ git pull
+    $ docker build -t paperless .
+    $ docker-compose up -d
+    $ docker-compose run --rm webserver migrate
+
+If ``git pull`` doesn't report any changes, there is no need to continue with
+the remaining steps.
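The ``loaddata_stdin`` command used above ships with the Paperless image, but its implementation isn't part of this commit. The following is only a sketch of what such a Django management command can look like, not the shipped code:

.. code-block:: python

    import sys
    import tempfile

    from django.core.management import call_command
    from django.core.management.base import BaseCommand


    class Command(BaseCommand):
        """Illustrative only: feed a fixture from stdin to Django's loaddata."""

        help = "Load a JSON fixture piped in on stdin."

        def handle(self, *args, **options):
            # loaddata wants a file path, so buffer stdin into a temp file.
            with tempfile.NamedTemporaryFile(suffix=".json") as fixture:
                fixture.write(sys.stdin.buffer.read())
                fixture.flush()
                call_command("loaddata", fixture.name)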
docs/requirements.rst

@@ -10,11 +10,13 @@ should work) that has the following software installed on it:
 * `GNU Privacy Guard`_
 * `Tesseract`_
 * `Imagemagick`_
+* `unpaper`_
 
 .. _Python3: https://python.org/
 .. _GNU Privacy Guard: https://gnupg.org
 .. _Tesseract: https://github.com/tesseract-ocr
 .. _Imagemagick: http://imagemagick.org/
+.. _unpaper: https://www.flameeyes.eu/projects/unpaper
 
 Notably, you should confirm how you access your Python3 installation.  Many
 Linux distributions will install Python3 in parallel to Python2, using the names
@@ -101,3 +103,16 @@ you'd like to generate your own docs locally, you'll need to:
     $ pip install sphinx
 
 and then cd into the ``docs`` directory and type ``make html``.
+
+If you are using Docker, you can use the following commands to build the
+documentation and run a webserver serving it on `port 8001`_:
+
+.. code:: bash
+
+    $ pwd
+    /path/to/paperless
+
+    $ docker build -t paperless:docs -f docs/Dockerfile .
+    $ docker run --rm -it -p "8001:8000" paperless:docs
+
+.. _port 8001: http://127.0.0.1:8001
docs/setup.rst (215 changes)

@@ -37,11 +37,19 @@ or just download the tarball and go that route:
 Installation & Configuration
 ----------------------------
 
-You can go two routes with setting up and running Paperless.  The *Vagrant*
-route is quick & easy, but means you're running a VM which comes with memory
-consumption etc.  Alternatively the standard, "bare metal" approach is a little
-more complicated.
+You can go multiple routes with setting up and running Paperless.  The `Vagrant
+route`_ is quick & easy, but means you're running a VM which comes with memory
+consumption etc.  We also `support Docker`_, which you can use natively under
+Linux and in a VM with `Docker Machine`_ (this guide was written for native
+Docker usage under Linux, you might have to adapt it for Docker Machine.)
+Alternatively the standard, `bare metal`_ approach is a little more complicated,
+but worth it because it makes it easier should you want to contribute some
+code back.
+
+.. _Vagrant route: setup-installation-vagrant_
+.. _support Docker: setup-installation-docker_
+.. _bare metal: setup-installation-standard_
+.. _Docker Machine: https://docs.docker.com/machine/
 
 .. _setup-installation-standard:
 
@@ -91,33 +99,188 @@ Vagrant Method
 2. Run ``vagrant up``.  An instance will start up for you.  When it's ready and
    provisioned...
 3. Run ``vagrant ssh`` and once inside your new vagrant box, edit
-   ``/opt/paperless/src/paperless/settings.py`` and set the values for:
-   * ``CONSUMPTION_DIR``: this is where your documents will be dumped to be
-     consumed by Paperless.
-   * ``PASSPHRASE``: this is the passphrase Paperless uses to encrypt/decrypt
-     the original document.  The default value attempts to source the
-     passphrase from the environment, so if you don't set it to a static value
-     here, you must set ``PAPERLESS_PASSPHRASE=some-secret-string`` on the
-     command line whenever invoking the consumer or webserver.
-4. Initialise the database with ``/opt/paperless/src/manage.py migrate``.
-5. Still inside your vagrant box, create a user for your Paperless instance with
-   ``/opt/paperless/src/manage.py createsuperuser``.  Follow the prompts to
+   ``/etc/paperless.conf`` and set the values for:
+   * ``PAPERLESS_CONSUMPTION_DIR``: this is where your documents will be
+     dumped to be consumed by Paperless.
+   * ``PAPERLESS_PASSPHRASE``: this is the passphrase Paperless uses to
+     encrypt/decrypt the original document.
+   * ``PAPERLESS_SHARED_SECRET``: this is the "magic word" used when consuming
+     documents from mail or via the API.  If you don't use either, leaving it
+     blank is just fine.
+4. Exit the vagrant box and re-enter it with ``vagrant ssh`` again.  This
+   updates the environment to make use of the changes you made to the config
+   file.
+5. Initialise the database with ``/opt/paperless/src/manage.py migrate``.
+6. Still inside your vagrant box, create a user for your Paperless instance
+   with ``/opt/paperless/src/manage.py createsuperuser``.  Follow the prompts to
    create your user.
-6. Start the webserver with ``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``.
-   You should now be able to visit your (empty) `Paperless webserver`_ at
-   ``172.28.128.4:8000``.  You can login with the user/pass you created in #5.
-7. In a separate window, run ``vagrant ssh`` again, but this time once inside
+7. Start the webserver with
+   ``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``.  You should now be
+   able to visit your (empty) `Paperless webserver`_ at ``172.28.128.4:8000``.
+   You can login with the user/pass you created in #6.
+8. In a separate window, run ``vagrant ssh`` again, but this time once inside
    your vagrant instance, you should start the consumer script with
    ``/opt/paperless/src/manage.py document_consumer``.
-8. Scan something.  Put it in the ``CONSUMPTION_DIR``.
-9. Wait a few minutes
-10. Visit the document list on your webserver, and it should be there, indexed
-    and downloadable.
+9. Scan something.  Put it in the ``CONSUMPTION_DIR``.
+10. Wait a few minutes
+11. Visit the document list on your webserver, and it should be there, indexed
+    and downloadable.
 
 .. _Vagrant: https://vagrantup.com/
 .. _Paperless server: http://172.28.128.4:8000
 
 
+.. _setup-installation-docker:
+
+Docker Method
+.............
+
+1. Install `Docker`_.
+
+   .. caution::
+
+      As mentioned earlier, this guide assumes that you use Docker natively
+      under Linux.  If you are using `Docker Machine`_ under Mac OS X or
+      Windows, you will have to adapt IP addresses, volume-mounting, command
+      execution and maybe more.
+
+2. Install `docker-compose`_. [#compose]_
+
+   .. caution::
+
+      If you want to use the included ``docker-compose.yml.example`` file, you
+      need to have at least Docker version **1.10.0** and docker-compose
+      version **1.6.0**.
+
+      See the `Docker installation guide`_ on how to install the current
+      version of Docker for your operating system or Linux distribution of
+      choice.  To get an up-to-date version of docker-compose, follow the
+      `docker-compose installation guide`_ if your package repository doesn't
+      include it.
+
+      .. _Docker installation guide: https://docs.docker.com/engine/installation/
+      .. _docker-compose installation guide: https://docs.docker.com/compose/install/
+
+3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml`` and
+   a copy of ``docker-compose.env.example`` as ``docker-compose.env``.  You'll be
+   editing both these files: taking a copy ensures that you can ``git pull`` to
+   receive updates without risking merge conflicts with your modified versions
+   of the configuration files.
+4. Modify ``docker-compose.yml`` to your preferences, following the instructions
+   in comments in the file.  The only change that is a hard requirement is to
+   specify where the consumption directory should mount.
+5. Modify ``docker-compose.env`` and adapt the following environment variables
+   (a sketch of the likely ``PAPERLESS_OCR_THREADS`` default follows this
+   section):
+
+   ``PAPERLESS_PASSPHRASE``
+      This is the passphrase Paperless uses to encrypt/decrypt the original
+      document.
+
+   ``PAPERLESS_OCR_THREADS``
+      This is the number of threads the OCR process will spawn to process
+      document pages in parallel.  If the variable is not set, Python determines
+      the core-count of your CPU and uses that value.
+
+   ``PAPERLESS_OCR_LANGUAGES``
+      If you want the OCR to recognize other languages in addition to the
+      default English, set this parameter to a space separated list of
+      three-letter language-codes after `ISO 639-2/T`_.  For a list of
+      available languages -- including their three letter codes -- see the
+      `Debian packagelist`_.
+
+   ``USERMAP_UID`` and ``USERMAP_GID``
+      If you want to mount the consumption volume (directory ``/consume`` within
+      the containers) to a host-directory -- which you probably want to do --
+      access rights might be an issue.  The default user and group ``paperless``
+      in the containers have an id of 1000.  The containers will enforce that
+      the owning group of the consumption directory will be ``paperless`` to be
+      able to delete consumed documents.  If your host-system has a group with
+      an id of 1000 and you don't want this group to have access rights to the
+      consumption directory, you can use ``USERMAP_GID`` to change the id in the
+      container and thus the one of the consumption directory.  Furthermore, you
+      can change the id of the default user as well using ``USERMAP_UID``.
+6. Run ``docker-compose up -d``.  This will create and start the necessary
+   containers.
+7. To be able to login, you will need a super user.  To create it, execute the
+   following command:
+
+   .. code-block:: shell-session
+
+      $ docker-compose run --rm webserver createsuperuser
+
+   This will prompt you to set a username (default ``paperless``), an optional
+   e-mail address and finally a password.
+8. The default ``docker-compose.yml`` exports the webserver on your local port
+   8000.  If you haven't adapted this, you should now be able to visit your
+   `Paperless webserver`_ at ``http://127.0.0.1:8000``.  You can login with the
+   user and password you just created.
+9. Add files to the consumption directory the way you prefer to.  Following are
+   two possible options:
+
+   1. Mount the consumption directory to a local host path by modifying your
+      ``docker-compose.yml``:
+
+      .. code-block:: diff
+
+         diff --git a/docker-compose.yml b/docker-compose.yml
+         --- a/docker-compose.yml
+         +++ b/docker-compose.yml
+         @@ -17,9 +18,8 @@ services:
+              volumes:
+                - paperless-data:/usr/src/paperless/data
+                - paperless-media:/usr/src/paperless/media
+         -      - /consume
+         +      - /local/path/you/choose:/consume
+
+      .. danger::
+
+         While the consumption container will ensure at startup that it can
+         **delete** a consumed file from a host-mounted directory, it might not
+         be able to **read** the document in the first place if the access
+         rights to the file are incorrect.
+
+         Make sure that the documents you put into the consumption directory
+         will either be readable by everyone (``chmod o+r file.pdf``) or
+         readable by the default user or group id 1000 (or the one you have set
+         with ``USERMAP_UID`` or ``USERMAP_GID`` respectively).
+
+   2. Use ``docker cp`` to copy your files directly into the container:
+
+      .. code-block:: shell-session
+
+         $ # Identify your containers
+         $ docker-compose ps
+                 Name                       Command                State     Ports
+         -------------------------------------------------------------------------
+         paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0
+         paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0
+
+         $ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume
+
+      ``docker cp`` is a one-shot command, just like ``cp``.  This means that
+      every time you want to consume a new document, you will have to execute
+      ``docker cp`` again.  You can of course automate this process, but option
+      1 is generally the preferred one.
+
+      .. danger::
+
+         ``docker cp`` will change the owning user and group of a copied file
+         to the acting user at the destination, which will be ``root``.
+
+         You therefore need to ensure that the documents you want to copy into
+         the container are readable by everyone (``chmod o+r file.pdf``) before
+         copying them.
+
+
+.. _Docker: https://www.docker.com/
+.. _docker-compose: https://docs.docker.com/compose/install/
+.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
+.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr-
+
+.. [#compose] You of course don't have to use docker-compose, but it
+   simplifies deployment immensely.  If you know your way around Docker, feel
+   free to tinker around without using compose!
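About the ``PAPERLESS_OCR_THREADS`` default mentioned in step 5: "Python determines the core-count of your CPU" most likely means something equivalent to the standard-library call below. This is an assumption; the exact call Paperless uses isn't shown in this commit.

.. code-block:: python

    import multiprocessing

    # The likely fallback when PAPERLESS_OCR_THREADS is unset: one OCR
    # worker per CPU core reported by the OS.
    print(multiprocessing.cpu_count())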
 .. _making-things-a-little-more-permanent:
 
 Making Things a Little more Permanent
 
@@ -126,5 +289,9 @@ Making Things a Little more Permanent
 Once you've tested things and are happy with the work flow, you can automate
 the process of starting the webserver and consumer.  If you're running on a
 bare metal system that's using Systemd, you can use the service unit files in
-the ``scripts`` directory to set this up.  If you're on a SysV or other
-startup system (like the Vagrant box), then you're currently on your own.
+the ``scripts`` directory to set this up.  If you're on another startup system
+or are using a Vagrant box, then you're currently on your own.  If you are
+using Docker, you can set a restart-policy_ in the ``docker-compose.yml`` to
+have the containers automatically start with the Docker daemon.
+
+.. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart
docs/utilities.rst

@@ -26,7 +26,7 @@ How to Use It
 
 The webserver is started via the ``manage.py`` script:
 
-.. code:: bash
+.. code-block:: shell-session
 
     $ /path/to/paperless/src/manage.py runserver
 
@@ -64,7 +64,7 @@ How to Use It
 
 The consumer is started via the ``manage.py`` script:
 
-.. code:: bash
+.. code-block:: shell-session
 
     $ /path/to/paperless/src/manage.py document_consumer
 
@@ -95,13 +95,110 @@ How to Use It
 
 This too is done via the ``manage.py`` script:
 
+.. code-block:: shell-session
+
+    $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/
+
+This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you
+to do with as you please.  The files are accompanied by a special file,
+``manifest.json``, which can be used to
+:ref:`import the files <utilities-importer>` at a later date if you wish.
+
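The exact schema of ``manifest.json`` isn't documented in this commit, so if you want to see what an export produced, the safe move is to load it and look; nothing below assumes particular keys:

.. code-block:: python

    import json

    # Inspect the export manifest without assuming anything about its
    # schema, since the schema isn't documented in this commit.
    with open("/path/to/somewhere/manifest.json") as fh:
        manifest = json.load(fh)

    print(json.dumps(manifest, indent=2)[:500])  # peek at the first entries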
+.. _utilities-exporter-howto-docker:
+
+Docker
+______
+
+If you are :ref:`using Docker <setup-installation-docker>`, running the
+exporter is almost as easy.  To mount a volume for exports, follow the
+instructions in the ``docker-compose.yml.example`` file for the ``/export``
+volume (making the changes in your own ``docker-compose.yml`` file, of course).
+Once you have the volume mounted, the command to run an export is:
+
+.. code-block:: shell-session
+
+    $ docker-compose run --rm consumer document_exporter /export
+
+If you prefer to use ``docker run`` directly, supply the necessary command-line
+options:
+
+.. code-block:: shell-session
+
+    $ # Identify your containers
+    $ docker-compose ps
+            Name                       Command                State     Ports
+    -------------------------------------------------------------------------
+    paperless_consumer_1    /sbin/docker-entrypoint.sh ...   Exit 0
+    paperless_webserver_1   /sbin/docker-entrypoint.sh ...   Exit 0
+
+    $ # Make sure to replace your passphrase and remove or adapt the id mapping
+    $ docker run --rm \
+        --volumes-from paperless_data_1 \
+        --volume /path/to/arbitrary/place:/export \
+        -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \
+        -e USERMAP_UID=1000 -e USERMAP_GID=1000 \
+        paperless document_exporter /export
+
+
+.. _utilities-importer:
+
+The Importer
+------------
+
+Looking to transfer Paperless data from one instance to another, or just want
+to restore from a backup?  This is your go-to toy.
+
+
+.. _utilities-importer-howto:
+
+How to Use It
+.............
+
+The importer works just like the exporter.  You point it at a directory, and
+the script does the rest of the work:
+
+.. code-block:: shell-session
+
+    $ /path/to/paperless/src/manage.py document_importer /path/to/somewhere/
+
+Docker
+______
+
+Assuming that you've already gone through the steps above in the
+:ref:`export <utilities-exporter-howto-docker>` section, then the easiest thing
+to do is just re-use the ``/export`` path you already set up:
+
+.. code-block:: shell-session
+
+    $ docker-compose run --rm consumer document_importer /export
+
+Similarly, if you're not using docker-compose, you can adjust the export
+instructions above to do the import.
+
+
+.. _utilities-retagger:
+
+The Re-tagger
+-------------
+
+Say you've imported a few hundred documents and now want to introduce a tag
+and apply its matching to all of the currently-imported docs.  This problem is
+common enough that there's a tool for it.
+
+
+.. _utilities-retagger-howto:
+
+How to Use It
+.............
+
+This too is done via the ``manage.py`` script:
+
 .. code:: bash
 
-    $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere
+    $ /path/to/paperless/src/manage.py document_retagger
 
-This will dump all of your PDFs into ``/path/to/somewhere`` for you to do with
-as you please.  The naming scheme on export is identical to that used for
-import, so should you can now safely delete the entire project directly,
-database, encrypted PDFs and all, and later create it all again simply by
-running the consumer again and dumping all of these files into
-``CONSUMPTION_DIR``.
+That's it.  It'll loop over all of the documents in your database and attempt
+to match all of your tags to them.  If one matches, it'll be applied.  And
+don't worry, you can run this as often as you like, it won't double-tag
+a document.
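As for what "matching" means here: per issue `#48`_ in the changelog, tag matching is done on word boundaries. The real matching code isn't part of this commit, but the behaviour is roughly this (a sketch, not the shipped implementation):

.. code-block:: python

    import re

    # Word-boundary matching as described in #48; illustrative only.
    def tag_matches(match_word, content):
        return re.search(
            r"\b{}\b".format(re.escape(match_word)),
            content,
            re.IGNORECASE,
        ) is not None

    print(tag_matches("tax", "pre-tax income"))    # True: boundary match
    print(tag_matches("tax", "taxonomy lecture"))  # False: no word boundary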
paperless.conf.example (new file, 33 lines)

@@ -0,0 +1,33 @@
+# Sample paperless.conf
+# Copy this file to /etc/paperless.conf and modify it to suit your needs.
+
+# This is where your documents should go to be consumed.  Make sure that it
+# exists and that the user running the paperless service can read/write its
+# contents before you start Paperless.
+PAPERLESS_CONSUMPTION_DIR=""
+
+# These values are required if you want paperless to check a particular email
+# box every 10 minutes and attempt to consume documents from there.  If you
+# don't define a HOST, mail checking will just be disabled.
+PAPERLESS_CONSUME_MAIL_HOST=""
+PAPERLESS_CONSUME_MAIL_PORT=""
+PAPERLESS_CONSUME_MAIL_USER=""
+PAPERLESS_CONSUME_MAIL_PASS=""
+
+# You must have a passphrase in order for Paperless to work at all.  If you set
+# this to "", GnuPG will "encrypt" your PDF by writing it out as a zero-byte
+# file.
+#
+# The passphrase you use here will be used when storing your documents in
+# Paperless, but you can always export them in an unencrypted format by using
+# the document exporter.  See the documentation for more information.
+#
+# One final note about the passphrase.  Once you've consumed a document with
+# one passphrase, DON'T CHANGE IT.  Paperless assumes this to be a constant and
+# can't properly export documents that were encrypted with an old passphrase if
+# you've since changed it to a new one.
+PAPERLESS_PASSPHRASE="secret"
+
+# If you intend to consume documents either via HTTP POST or by email, you must
+# have a shared secret here.
+PAPERLESS_SHARED_SECRET=""
requirements.txt

@@ -1,8 +1,10 @@
-Django==1.9
+Django==1.9.2
 django-extensions==1.6.1
+djangorestframework==3.3.2
+python-dotenv==0.3.0
 filemagic==1.6
 langdetect==1.0.5
-Pillow==3.0.0
+Pillow==3.1.1
 pyocr==0.3.1
 python-dateutil==2.4.2
 python-gnupg==0.3.8
scripts/docker-entrypoint.sh (new file, 74 lines)

@@ -0,0 +1,74 @@
+#!/bin/bash
+set -e
+
+# Source: https://github.com/sameersbn/docker-gitlab/
+map_uidgid() {
+    USERMAP_ORIG_UID=$(id -u paperless)
+    USERMAP_ORIG_GID=$(id -g paperless)
+    USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}}
+    USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
+    if [[ ${USERMAP_UID} != ${USERMAP_ORIG_UID} || ${USERMAP_GID} != ${USERMAP_ORIG_GID} ]]; then
+        echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
+        groupmod -g ${USERMAP_GID} paperless
+        sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_ORIG_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
+    fi
+}
+
+set_permissions() {
+    # Set permissions for consumption directory
+    chgrp paperless "$PAPERLESS_CONSUMPTION_DIR"
+    chmod g+x "$PAPERLESS_CONSUMPTION_DIR"
+
+    # Set permissions for application directory
+    chown -Rh paperless:paperless /usr/src/paperless
+}
+
+initialize() {
+    map_uidgid
+    set_permissions
+}
+
+install_languages() {
+    local langs="$1"
+    read -ra langs <<<"$langs"
+
+    # Check that it is not empty
+    if [ ${#langs[@]} -eq 0 ]; then
+        return
+    fi
+
+    # Update apt-lists
+    apt-get update
+
+    # Loop over languages to be installed
+    for lang in "${langs[@]}"; do
+        pkg="tesseract-ocr-$lang"
+        # Skip languages that are already installed
+        if dpkg -s "$pkg" > /dev/null 2>&1; then
+            continue
+        fi
+        # Skip languages for which no package exists
+        if ! apt-cache show "$pkg" > /dev/null 2>&1; then
+            continue
+        fi
+
+        apt-get install -y "$pkg"
+    done
+
+    # Remove apt lists
+    rm -rf /var/lib/apt/lists/*
+}
+
+
+if [[ "$1" != "/"* ]]; then
+    initialize
+
+    # Install additional languages if specified
+    if [ ! -z "$PAPERLESS_OCR_LANGUAGES" ]; then
+        install_languages "$PAPERLESS_OCR_LANGUAGES"
+    fi
+
+    exec sudo -HEu paperless "/usr/src/paperless/src/manage.py" "$@"
+fi
+
+exec "$@"
scripts/paperless-consumer.service

@@ -2,10 +2,9 @@
 Description=Paperless consumer
 
 [Service]
-EnvironmentFile=/etc/conf.d/paperless
 User=paperless
 Group=paperless
-ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer -v $PAPERLESS_CONSUMPTION_VERBOSITY
+ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer
 
 [Install]
 WantedBy=multi-user.target
scripts/paperless-webserver.service

@@ -2,7 +2,6 @@
 Description=Paperless webserver
 
 [Service]
-EnvironmentFile=/etc/conf.d/paperless
 User=paperless
 Group=paperless
 ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py runserver 0.0.0.0:8000
scripts/vagrant-provision

@@ -1,13 +1,31 @@
 #!/bin/bash
 
-# install packages
-sudo apt-get update
-sudo apt-get build-dep -y python-imaging
-sudo apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
-sudo apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
-sudo apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick
+# Install packages
+apt-get update
+apt-get build-dep -y python-imaging
+apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev
+apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git
+apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick unpaper
 
-# setup python project
-pushd /opt/paperless
-sudo pip3 install -r requirements.txt
-popd
+# Python dependencies
+pip3 install -r /opt/paperless/requirements.txt
+
+# Create the environment file
+cat /opt/paperless/paperless.conf.example | sed -e 's#CONSUMPTION_DIR=""#CONSUMPTION_DIR="/home/vagrant/consumption"#' > /etc/paperless.conf
+chmod 0640 /etc/paperless.conf
+chown root:vagrant /etc/paperless.conf
+
+# Create the consumption directory
+mkdir /home/vagrant/consumption
+chown vagrant:vagrant /home/vagrant/consumption
+
+echo "
+
+
+Now follow the remaining steps in the Vagrant section of the setup
+documentation to complete the process:
+
+http://paperless.readthedocs.org/en/latest/setup.html#setup-installation-vagrant
+
+
+"
@@ -3,7 +3,7 @@ from django.contrib.auth.models import User, Group
 from django.core.urlresolvers import reverse
 from django.templatetags.static import static
 
-from .models import Sender, Tag, Document
+from .models import Correspondent, Tag, Document, Log
 
 
 class MonthListFilter(admin.SimpleListFilter):
@@ -45,39 +45,73 @@ class DocumentAdmin(admin.ModelAdmin):
             "all": ("paperless.css",)
         }
 
-    search_fields = ("sender__name", "title", "content")
-    list_display = ("created", "sender", "title", "tags_", "document")
-    list_filter = ("tags", "sender", MonthListFilter)
+    search_fields = ("correspondent__name", "title", "content")
+    list_display = ("created_", "correspondent", "title", "tags_", "document")
+    list_filter = ("tags", "correspondent", MonthListFilter)
     list_per_page = 25
 
+    def created_(self, obj):
+        return obj.created.date().strftime("%Y-%m-%d")
+
     def tags_(self, obj):
         r = ""
         for tag in obj.tags.all():
-            r += '<a class="tag" style="background-color: {};" href="{}">{}</a>'.format(
-                tag.get_colour_display(),
-                "{}?tags__id__exact={}".format(
-                    reverse("admin:documents_document_changelist"),
-                    tag.pk
-                ),
-                tag.slug
+            colour = tag.get_colour_display()
+            r += self._html_tag(
+                "a",
+                tag.slug,
+                **{
+                    "class": "tag",
+                    "style": "background-color: {};".format(colour),
+                    "href": "{}?tags__id__exact={}".format(
+                        reverse("admin:documents_document_changelist"),
+                        tag.pk
+                    )
+                }
             )
         return r
     tags_.allow_tags = True
 
     def document(self, obj):
-        return '<a href="{}">' \
-               '<img src="{}" width="22" height="22" alt="{} icon" title="{}">' \
-               '</a>'.format(
-                   obj.download_url,
-                   static("documents/img/{}.png".format(obj.file_type)),
-                   obj.file_type,
-                   obj.file_name
-               )
+        return self._html_tag(
+            "a",
+            self._html_tag(
+                "img",
+                src=static("documents/img/{}.png".format(obj.file_type)),
+                width=22,
+                height=22,
+                alt=obj.file_type,
+                title=obj.file_name
+            ),
+            href=obj.download_url
+        )
     document.allow_tags = True
 
-admin.site.register(Sender)
+    @staticmethod
+    def _html_tag(kind, inside=None, **kwargs):
+
+        attributes = []
+        for lft, rgt in kwargs.items():
+            attributes.append('{}="{}"'.format(lft, rgt))
+
+        if inside is not None:
+            return "<{kind} {attributes}>{inside}</{kind}>".format(
+                kind=kind, attributes=" ".join(attributes), inside=inside)
+
+        return "<{} {}/>".format(kind, " ".join(attributes))
+
+
+class LogAdmin(admin.ModelAdmin):
+
+    list_display = ("message", "level", "component")
+    list_filter = ("level", "component",)
+
+
+admin.site.register(Correspondent)
 admin.site.register(Tag, TagAdmin)
 admin.site.register(Document, DocumentAdmin)
+admin.site.register(Log, LogAdmin)
 
 
 # Unless we implement multi-user, these default registrations don't make sense.
 admin.site.unregister(Group)
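For reference, a quick sketch of what the new `_html_tag` helper produces, worked out from the method above. The sample attribute values are invented for illustration:

    # Hypothetical calls mirroring DocumentAdmin._html_tag above;
    # the sample values are made up.
    # _html_tag("a", "bills", **{"class": "tag", "href": "/?tags__id__exact=1"})
    #   -> '<a class="tag" href="/?tags__id__exact=1">bills</a>'
    # _html_tag("img", src="documents/img/pdf.png", width=22, height=22)
    #   -> '<img src="documents/img/pdf.png" width="22" height="22"/>'
    # Keyword order drives attribute order, which dicts only guarantee on
    # Python 3.7+; for admin display that is harmless.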
@@ -1,5 +1,8 @@
 import datetime
+import logging
 import tempfile
+import uuid
 
 from multiprocessing.pool import Pool
 
 import itertools
@@ -17,20 +20,14 @@ from PIL import Image
 from django.conf import settings
 from django.utils import timezone
 from django.template.defaultfilters import slugify
+from pyocr.tesseract import TesseractError
 
-from logger.models import Log
 from paperless.db import GnuPG
 
-from .models import Sender, Tag, Document
+from .models import Correspondent, Tag, Document, Log
 from .languages import ISO639
 
 
-def image_to_string(args):
-    self, png, lang = args
-    with Image.open(os.path.join(self.SCRATCH, png)) as f:
-        return self.OCR.image_to_string(f, lang=lang)
-
-
 class OCRError(Exception):
     pass
 
@@ -42,8 +39,8 @@ class ConsumerError(Exception):
 class Consumer(object):
     """
    Loop over every file found in CONSUMPTION_DIR and:
-      1. Convert it to a greyscale png
-      2. Use tesseract on the png
+      1. Convert it to a greyscale pnm
+      2. Use tesseract on the pnm
       3. Encrypt and store the document in the MEDIA_ROOT
       4. Store the OCR'd text in the database
       5. Delete the document and image(s)
@@ -51,28 +48,29 @@ class Consumer(object):
 
     SCRATCH = settings.SCRATCH_DIR
     CONVERT = settings.CONVERT_BINARY
+    UNPAPER = settings.UNPAPER_BINARY
     CONSUME = settings.CONSUMPTION_DIR
     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 
-    OCR = pyocr.get_available_tools()[0]
     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
 
     REGEX_TITLE = re.compile(
         r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
         flags=re.IGNORECASE
     )
-    REGEX_SENDER_TITLE = re.compile(
+    REGEX_CORRESPONDENT_TITLE = re.compile(
         r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$",
         flags=re.IGNORECASE
     )
-    REGEX_SENDER_TITLE_TAGS = re.compile(
+    REGEX_CORRESPONDENT_TITLE_TAGS = re.compile(
         r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$",
         flags=re.IGNORECASE
     )
 
-    def __init__(self, verbosity=1):
+    def __init__(self):
 
-        self.verbosity = verbosity
+        self.logger = logging.getLogger(__name__)
+        self.logging_group = None
 
         try:
             os.makedirs(self.SCRATCH)
@@ -92,6 +90,12 @@ class Consumer(object):
             raise ConsumerError(
                 "Consumption directory {} does not exist".format(self.CONSUME))
 
+    def log(self, level, message):
+        getattr(self.logger, level)(message, extra={
+            "group": self.logging_group,
+            "component": Log.COMPONENT_CONSUMER
+        })
+
     def consume(self):
 
         for doc in os.listdir(self.CONSUME):
@@ -110,122 +114,156 @@ class Consumer(object):
             if self._is_ready(doc):
                 continue
 
-            Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)
+            self.logging_group = uuid.uuid4()
+
+            self.log("info", "Consuming {}".format(doc))
 
             tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
-            pngs = self._get_greyscale(tempdir, doc)
+            imgs = self._get_greyscale(tempdir, doc)
+            thumbnail = self._get_thumbnail(tempdir, doc)
 
             try:
-                text = self._get_ocr(pngs)
-                self._store(text, doc)
-            except OCRError:
+                text = self._get_ocr(imgs)
+                self._store(text, doc, thumbnail)
+            except OCRError as e:
                 self._ignore.append(doc)
-                Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
+                self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
+                self._cleanup_tempdir(tempdir)
                 continue
-            finally:
-                self._cleanup(tempdir, doc)
+            else:
+                self._cleanup_tempdir(tempdir)
+                self._cleanup_doc(doc)
 
     def _get_greyscale(self, tempdir, doc):
+        """
+        Greyscale images are easier for Tesseract to OCR
+        """
 
-        Log.debug(
-            "Generating greyscale image from {}".format(doc),
-            Log.COMPONENT_CONSUMER
-        )
+        self.log("info", "Generating greyscale image from {}".format(doc))
 
-        png = os.path.join(tempdir, "convert-%04d.jpg")
+        # Convert PDF to multiple PNMs
+        pnm = os.path.join(tempdir, "convert-%04d.pnm")
         subprocess.Popen((
             self.CONVERT, "-density", "300", "-depth", "8",
-            "-type", "grayscale", doc, png
+            "-type", "grayscale", doc, pnm
         )).wait()
 
-        pngs = [os.path.join(tempdir, f) for f in os.listdir(tempdir) if f.startswith("convert")]
-        return sorted(filter(lambda f: os.path.isfile(f), pngs))
+        # Get a list of converted images
+        pnms = []
+        for f in os.listdir(tempdir):
+            if f.endswith(".pnm"):
+                pnms.append(os.path.join(tempdir, f))
 
-    @staticmethod
-    def _guess_language(text):
+        # Run unpaper in parallel on converted images
+        with Pool(processes=self.THREADS) as pool:
+            pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
+
+        # Return list of converted images, processed with unpaper
+        pnms = []
+        for f in os.listdir(tempdir):
+            if f.endswith(".unpaper.pnm"):
+                pnms.append(os.path.join(tempdir, f))
+
+        return sorted(filter(lambda __: os.path.isfile(__), pnms))
+
+    def _get_thumbnail(self, tempdir, doc):
+        """
+        The thumbnail of a PDF is just a 500px wide image of the first page.
+        """
+
+        self.log("info", "Generating the thumbnail")
+
+        subprocess.Popen((
+            self.CONVERT,
+            "-scale", "500x5000",
+            "-alpha", "remove",
+            doc,
+            os.path.join(tempdir, "convert-%04d.png")
+        )).wait()
+
+        return os.path.join(tempdir, "convert-0000.png")
+
+    def _guess_language(self, text):
         try:
             guess = langdetect.detect(text)
-            Log.debug(
-                "Language detected: {}".format(guess),
-                Log.COMPONENT_CONSUMER
-            )
+            self.log("debug", "Language detected: {}".format(guess))
             return guess
         except Exception as e:
-            Log.warning(
-                "Language detection error: {}".format(e), Log.COMPONENT_MAIL)
+            self.log("warning", "Language detection error: {}".format(e))
 
-    def _get_ocr(self, pngs):
+    def _get_ocr(self, imgs):
         """
         Attempts to do the best job possible OCR'ing the document based on
         simple language detection trial & error.
         """
 
-        if not pngs:
-            raise OCRError
+        if not imgs:
+            raise OCRError("No images found")
 
-        Log.debug("OCRing the document", Log.COMPONENT_CONSUMER)
+        self.log("info", "OCRing the document")
 
         # Since the division gets rounded down by int, this calculation works
         # for every edge-case, i.e. 1
-        middle = int(len(pngs) / 2)
-        raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)
+        middle = int(len(imgs) / 2)
+        raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
 
         guessed_language = self._guess_language(raw_text)
 
         if not guessed_language or guessed_language not in ISO639:
-            Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER)
+            self.log("warning", "Language detection failed!")
             if settings.FORGIVING_OCR:
-                Log.warning(
-                    "As FORGIVING_OCR is enabled, we're going to make the best "
-                    "with what we have.",
-                    Log.COMPONENT_CONSUMER
+                self.log(
+                    "warning",
+                    "As FORGIVING_OCR is enabled, we're going to make the "
+                    "best with what we have."
                 )
-                raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
+                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
                 return raw_text
-            raise OCRError
+            raise OCRError("Language detection failed")
 
         if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
-            raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
+            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
             return raw_text
 
         try:
-            return self._ocr(pngs, ISO639[guessed_language])
+            return self._ocr(imgs, ISO639[guessed_language])
         except pyocr.pyocr.tesseract.TesseractError:
             if settings.FORGIVING_OCR:
-                Log.warning(
+                self.log(
+                    "warning",
                     "OCR for {} failed, but we're going to stick with what "
                     "we've got since FORGIVING_OCR is enabled.".format(
                         guessed_language
-                    ),
-                    Log.COMPONENT_CONSUMER
+                    )
                 )
-                raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
+                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
                 return raw_text
-            raise OCRError
+            raise OCRError(
+                "The guessed language is not available in this instance of "
+                "Tesseract."
+            )
 
-    def _assemble_ocr_sections(self, pngs, middle, text):
+    def _assemble_ocr_sections(self, imgs, middle, text):
         """
         Given a `middle` value and the text that middle page represents, we OCR
         the remainder of the document and return the whole thing.
         """
-        text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
-        text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
+        text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
+        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
         return text
 
-    def _ocr(self, pngs, lang):
+    def _ocr(self, imgs, lang):
         """
         Performs a single OCR attempt.
         """
 
-        if not pngs:
+        if not imgs:
             return ""
 
-        Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER)
+        self.log("info", "Parsing for {}".format(lang))
 
         with Pool(processes=self.THREADS) as pool:
-            r = pool.map(
-                image_to_string, itertools.product([self], pngs, [lang]))
+            r = pool.map(image_to_string, itertools.product(imgs, [lang]))
             r = " ".join(r)
 
         # Strip out excess white space to allow matching to go smoother
@@ -233,16 +271,18 @@ class Consumer(object):
 
     def _guess_attributes_from_name(self, parseable):
         """
-        We use a crude naming convention to make handling the sender, title, and
-        tags easier:
-          "<sender> - <title> - <tags>.<suffix>"
-          "<sender> - <title>.<suffix>"
+        We use a crude naming convention to make handling the correspondent,
+        title, and tags easier:
+          "<correspondent> - <title> - <tags>.<suffix>"
+          "<correspondent> - <title>.<suffix>"
           "<title>.<suffix>"
         """
 
-        def get_sender(sender_name):
-            return Sender.objects.get_or_create(
-                name=sender_name, defaults={"slug": slugify(sender_name)})[0]
+        def get_correspondent(correspondent_name):
+            return Correspondent.objects.get_or_create(
                name=correspondent_name,
                defaults={"slug": slugify(correspondent_name)}
            )[0]
 
         def get_tags(tags):
             r = []
@@ -251,40 +291,47 @@ class Consumer(object):
                     Tag.objects.get_or_create(slug=t, defaults={"name": t})[0])
             return tuple(r)
 
-        # First attempt: "<sender> - <title> - <tags>.<suffix>"
-        m = re.match(self.REGEX_SENDER_TITLE_TAGS, parseable)
+        def get_suffix(suffix):
+            suffix = suffix.lower()
+            if suffix == "jpeg":
+                return "jpg"
+            return suffix
+
+        # First attempt: "<correspondent> - <title> - <tags>.<suffix>"
+        m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable)
         if m:
             return (
-                get_sender(m.group(1)),
+                get_correspondent(m.group(1)),
                 m.group(2),
                 get_tags(m.group(3)),
-                m.group(4)
+                get_suffix(m.group(4))
             )
 
-        # Second attempt: "<sender> - <title>.<suffix>"
-        m = re.match(self.REGEX_SENDER_TITLE, parseable)
+        # Second attempt: "<correspondent> - <title>.<suffix>"
+        m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable)
         if m:
-            return get_sender(m.group(1)), m.group(2), (), m.group(3)
+            return (
+                get_correspondent(m.group(1)),
+                m.group(2),
+                (),
+                get_suffix(m.group(3))
+            )
 
-        # That didn't work, so we assume sender and tags are None
+        # That didn't work, so we assume correspondent and tags are None
         m = re.match(self.REGEX_TITLE, parseable)
-        return None, m.group(1), (), m.group(2)
+        return None, m.group(1), (), get_suffix(m.group(2))
 
-    def _store(self, text, doc):
+    def _store(self, text, doc, thumbnail):
 
         sender, title, tags, file_type = self._guess_attributes_from_name(doc)
-        tags = list(tags)
-
-        lower_text = text.lower()
-        relevant_tags = set(
-            [t for t in Tag.objects.all() if t.matches(lower_text)] + tags)
+        relevant_tags = set(list(Tag.match_all(text)) + list(tags))
 
         stats = os.stat(doc)
 
-        Log.debug("Saving record to database", Log.COMPONENT_CONSUMER)
+        self.log("debug", "Saving record to database")
 
         document = Document.objects.create(
-            sender=sender,
+            correspondent=sender,
             title=title,
             content=text,
             file_type=file_type,
@@ -296,22 +343,29 @@ class Consumer(object):
 
         if relevant_tags:
             tag_names = ", ".join([t.slug for t in relevant_tags])
-            Log.debug(
-                "Tagging with {}".format(tag_names), Log.COMPONENT_CONSUMER)
+            self.log("debug", "Tagging with {}".format(tag_names))
             document.tags.add(*relevant_tags)
 
+        # Encrypt and store the actual document
         with open(doc, "rb") as unencrypted:
             with open(document.source_path, "wb") as encrypted:
-                Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
+                self.log("debug", "Encrypting the document")
                 encrypted.write(GnuPG.encrypted(unencrypted))
 
-    def _cleanup(self, tempdir, doc):
-        # Remove temporary directory recursively
-        Log.debug("Deleting directory {}".format(tempdir), Log.COMPONENT_CONSUMER)
-        shutil.rmtree(tempdir)
+        # Encrypt and store the thumbnail
+        with open(thumbnail, "rb") as unencrypted:
+            with open(document.thumbnail_path, "wb") as encrypted:
+                self.log("debug", "Encrypting the thumbnail")
+                encrypted.write(GnuPG.encrypted(unencrypted))
 
-        # Remove doc
-        Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
+        self.log("info", "Completed")
+
+    def _cleanup_tempdir(self, d):
+        self.log("debug", "Deleting directory {}".format(d))
+        shutil.rmtree(d)
+
+    def _cleanup_doc(self, doc):
+        self.log("debug", "Deleting document {}".format(doc))
         os.unlink(doc)
 
     def _is_ready(self, doc):
@@ -329,3 +383,23 @@ class Consumer(object):
             self.stats[doc] = t
 
         return False
+
+
+def image_to_string(args):
+    img, lang = args
+    ocr = pyocr.get_available_tools()[0]
+    with Image.open(os.path.join(Consumer.SCRATCH, img)) as f:
+        if ocr.can_detect_orientation():
+            try:
+                orientation = ocr.detect_orientation(f, lang=lang)
+                f = f.rotate(orientation["angle"], expand=1)
+            except TesseractError:
+                pass
+        return ocr.image_to_string(f, lang=lang)
+
+
+def run_unpaper(args):
+    unpaper, pnm = args
+    subprocess.Popen((
+        unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm")
+    )).wait()
|
|||||||
from django import forms
|
from django import forms
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from .models import Document, Sender
|
from .models import Document, Correspondent
|
||||||
from .consumer import Consumer
|
from .consumer import Consumer
|
||||||
|
|
||||||
|
|
||||||
class UploadForm(forms.Form):
|
class UploadForm(forms.Form):
|
||||||
|
|
||||||
SECRET = settings.UPLOAD_SHARED_SECRET
|
SECRET = settings.SHARED_SECRET
|
||||||
TYPE_LOOKUP = {
|
TYPE_LOOKUP = {
|
||||||
"application/pdf": Document.TYPE_PDF,
|
"application/pdf": Document.TYPE_PDF,
|
||||||
"image/png": Document.TYPE_PNG,
|
"image/png": Document.TYPE_PNG,
|
||||||
@ -23,31 +23,36 @@ class UploadForm(forms.Form):
|
|||||||
"image/tiff": Document.TYPE_TIF,
|
"image/tiff": Document.TYPE_TIF,
|
||||||
}
|
}
|
||||||
|
|
||||||
sender = forms.CharField(
|
correspondent = forms.CharField(
|
||||||
max_length=Sender._meta.get_field("name").max_length, required=False)
|
max_length=Correspondent._meta.get_field("name").max_length,
|
||||||
|
required=False
|
||||||
|
)
|
||||||
title = forms.CharField(
|
title = forms.CharField(
|
||||||
max_length=Document._meta.get_field("title").max_length, required=False)
|
max_length=Document._meta.get_field("title").max_length,
|
||||||
|
required=False
|
||||||
|
)
|
||||||
document = forms.FileField()
|
document = forms.FileField()
|
||||||
signature = forms.CharField(max_length=256)
|
signature = forms.CharField(max_length=256)
|
||||||
|
|
||||||
def clean_sender(self):
|
def clean_correspondent(self):
|
||||||
"""
|
"""
|
||||||
I suppose it might look cleaner to use .get_or_create() here, but that
|
I suppose it might look cleaner to use .get_or_create() here, but that
|
||||||
would also allow someone to fill up the db with bogus senders before all
|
would also allow someone to fill up the db with bogus correspondents
|
||||||
validation was met.
|
before all validation was met.
|
||||||
"""
|
"""
|
||||||
sender = self.cleaned_data.get("sender")
|
corresp = self.cleaned_data.get("correspondent")
|
||||||
if not sender:
|
if not corresp:
|
||||||
return None
|
return None
|
||||||
if not Sender.SAFE_REGEX.match(sender) or " - " in sender:
|
if not Correspondent.SAFE_REGEX.match(corresp) or " - " in corresp:
|
||||||
raise forms.ValidationError("That sender name is suspicious.")
|
raise forms.ValidationError(
|
||||||
return sender
|
"That correspondent name is suspicious.")
|
||||||
|
return corresp
|
||||||
|
|
||||||
def clean_title(self):
|
def clean_title(self):
|
||||||
title = self.cleaned_data.get("title")
|
title = self.cleaned_data.get("title")
|
||||||
if not title:
|
if not title:
|
||||||
return None
|
return None
|
||||||
if not Sender.SAFE_REGEX.match(title) or " - " in title:
|
if not Correspondent.SAFE_REGEX.match(title) or " - " in title:
|
||||||
raise forms.ValidationError("That title is suspicious.")
|
raise forms.ValidationError("That title is suspicious.")
|
||||||
|
|
||||||
def clean_document(self):
|
def clean_document(self):
|
||||||
@ -59,10 +64,10 @@ class UploadForm(forms.Form):
|
|||||||
return document, self.TYPE_LOOKUP[file_type]
|
return document, self.TYPE_LOOKUP[file_type]
|
||||||
|
|
||||||
def clean(self):
|
def clean(self):
|
||||||
sender = self.clened_data("sender")
|
corresp = self.clened_data("correspondent")
|
||||||
title = self.cleaned_data("title")
|
title = self.cleaned_data("title")
|
||||||
signature = self.cleaned_data("signature")
|
signature = self.cleaned_data("signature")
|
||||||
if sha256(sender + title + self.SECRET).hexdigest() == signature:
|
if sha256(corresp + title + self.SECRET).hexdigest() == signature:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -73,13 +78,15 @@ class UploadForm(forms.Form):
|
|||||||
form do that as well. Think of it as a poor-man's queue server.
|
form do that as well. Think of it as a poor-man's queue server.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
sender = self.clened_data("sender")
|
correspondent = self.clened_data("correspondent")
|
||||||
title = self.cleaned_data("title")
|
title = self.cleaned_data("title")
|
||||||
document, file_type = self.cleaned_data.get("document")
|
document, file_type = self.cleaned_data.get("document")
|
||||||
|
|
||||||
t = int(mktime(datetime.now()))
|
t = int(mktime(datetime.now()))
|
||||||
file_name = os.path.join(
|
file_name = os.path.join(
|
||||||
Consumer.CONSUME, "{} - {}.{}".format(sender, title, file_type))
|
Consumer.CONSUME,
|
||||||
|
"{} - {}.{}".format(correspondent, title, file_type)
|
||||||
|
)
|
||||||
|
|
||||||
with open(file_name, "wb") as f:
|
with open(file_name, "wb") as f:
|
||||||
f.write(document)
|
f.write(document)
|
||||||
|
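Given the `clean()` above, an uploading client has to send a signature that is the SHA-256 of correspondent + title + shared secret. A hedged sketch of the client side; the `.encode()` call is an assumption, since hashlib needs bytes on Python 3, and the sample values are invented:

    from hashlib import sha256

    correspondent = "ACME Inc"      # sample value
    title = "Invoice 0042"          # sample value
    shared_secret = "change-me"     # stands in for the configured SHARED_SECRET

    signature = sha256(
        (correspondent + title + shared_secret).encode("utf-8")).hexdigest()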
@@ -185,10 +185,10 @@ ISO639 = {
     "yo": "yor",
     "za": "zha",
 
-    # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I have
-    # no idea which one is better, so I just picked the bigger file.
+    # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I
+    # have no idea which one is better, so I just picked the bigger file.
     "zh": "chi_tra",
 
     "zu": "zul"
 
 }
30
src/documents/loggers.py
Normal file
@@ -0,0 +1,30 @@
import logging


class PaperlessLogger(logging.StreamHandler):
    """
    A logger smart enough to know to log some kinds of messages to the database
    for later retrieval in a pretty interface.
    """

    def emit(self, record):

        logging.StreamHandler.emit(self, record)

        if not hasattr(record, "component"):
            return

        # We have to do the import here or Django will barf when it tries to
        # load this because the apps aren't loaded at that point
        from .models import Log

        kwargs = {
            "message": record.msg,
            "component": record.component,
            "level": record.levelno,
        }

        if hasattr(record, "group"):
            kwargs["group"] = record.group

        Log.objects.create(**kwargs)
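PaperlessLogger is a plain `logging.StreamHandler` subclass, so it would presumably be wired up through Django's LOGGING setting. A sketch under that assumption; the logger names and levels here are not taken from the project's actual settings module:

    LOGGING = {
        "version": 1,
        "disable_existing_loggers": False,
        "handlers": {
            "db": {"class": "documents.loggers.PaperlessLogger"},
        },
        "loggers": {
            "documents": {"handlers": ["db"], "level": "DEBUG"},
        },
    }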
@@ -1,8 +1,10 @@
 import datetime
 import imaplib
+import logging
 import os
 import re
 import time
+import uuid
 
 from base64 import b64decode
 from email import policy
@@ -11,10 +13,8 @@ from dateutil import parser
 
 from django.conf import settings
 
-from logger.models import Log
-
 from .consumer import Consumer
-from .models import Sender
+from .models import Correspondent, Log
 
 
 class MailFetcherError(Exception):
@@ -25,21 +25,34 @@ class InvalidMessageError(Exception):
     pass
 
 
-class Message(object):
+class Loggable(object):
+
+    def __init__(self, group=None):
+        self.logger = logging.getLogger(__name__)
+        self.logging_group = group or uuid.uuid4()
+
+    def log(self, level, message):
+        getattr(self.logger, level)(message, extra={
+            "group": self.logging_group,
+            "component": Log.COMPONENT_MAIL
+        })
+
+
+class Message(Loggable):
     """
     A crude, but simple email message class. We assume that there's a subject
     and n attachments, and that we don't care about the message body.
     """
 
-    SECRET = settings.UPLOAD_SHARED_SECRET
+    SECRET = settings.SHARED_SECRET
 
-    def __init__(self, data, verbosity=1):
+    def __init__(self, data, group=None):
         """
         Cribbed heavily from
         https://www.ianlewis.org/en/parsing-email-attachments-python
         """
 
-        self.verbosity = verbosity
+        Loggable.__init__(self, group=group)
 
         self.subject = None
         self.time = None
@@ -54,8 +67,7 @@ class Message(object):
 
         self._set_time(message)
 
-        Log.info(
-            'Importing email: "{}"'.format(self.subject), Log.COMPONENT_MAIL)
+        self.log("info", 'Importing email: "{}"'.format(self.subject))
 
         attachments = []
         for part in message.walk():
@@ -91,7 +103,7 @@ class Message(object):
     def check_subject(self):
         if self.subject is None:
             raise InvalidMessageError("Message does not have a subject")
-        if not Sender.SAFE_REGEX.match(self.subject):
+        if not Correspondent.SAFE_REGEX.match(self.subject):
             raise InvalidMessageError("Message subject is unsafe: {}".format(
                 self.subject))
 
@@ -134,9 +146,11 @@ class Attachment(object):
         return self.data
 
 
-class MailFetcher(object):
+class MailFetcher(Loggable):
 
-    def __init__(self, verbosity=1):
+    def __init__(self):
+
+        Loggable.__init__(self)
 
         self._connection = None
         self._host = settings.MAIL_CONSUMPTION["HOST"]
@@ -148,7 +162,6 @@ class MailFetcher(object):
         self._enabled = bool(self._host)
 
         self.last_checked = datetime.datetime.now()
-        self.verbosity = verbosity
 
     def pull(self):
         """
@@ -159,14 +172,14 @@ class MailFetcher(object):
 
         if self._enabled:
 
-            Log.info("Checking mail", Log.COMPONENT_MAIL)
+            # Reset the grouping id for each fetch
+            self.logging_group = uuid.uuid4()
+
+            self.log("debug", "Checking mail")
 
             for message in self._get_messages():
 
-                Log.debug(
-                    'Storing email: "{}"'.format(message.subject),
-                    Log.COMPONENT_MAIL
-                )
+                self.log("info", 'Storing email: "{}"'.format(message.subject))
 
                 t = int(time.mktime(message.time.timetuple()))
                 file_name = os.path.join(Consumer.CONSUME, message.file_name)
@@ -193,7 +206,7 @@ class MailFetcher(object):
             self._connection.logout()
 
         except Exception as e:
-            Log.error(e, Log.COMPONENT_MAIL)
+            self.log("error", str(e))
 
         return r
 
@@ -218,9 +231,9 @@ class MailFetcher(object):
 
         message = None
         try:
-            message = Message(data[0][1], self.verbosity)
+            message = Message(data[0][1], self.logging_group)
         except InvalidMessageError as e:
-            Log.error(e, Log.COMPONENT_MAIL)
+            self.log("error", str(e))
         else:
             self._connection.store(num, "+FLAGS", "\\Deleted")
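The new Loggable base class boils down to standard-library logging with a per-run UUID passed through `extra`. In miniature (a standalone illustration, not project code):

    import logging
    import uuid

    logger = logging.getLogger("documents.mail")
    group = uuid.uuid4()
    # component 2 is "Mail Fetcher" per the Log migration further down
    logger.info("Checking mail", extra={"group": group, "component": 2})
    logger.info("Storing email", extra={"group": group, "component": 2})
    # Both records share record.group, which PaperlessLogger copies into
    # the Log table so a UI can collate one fetch run's messages.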
@@ -1,10 +1,12 @@
 import datetime
+import logging
 import os
 import time
 
 from django.conf import settings
 from django.core.management.base import BaseCommand, CommandError
 
+from ...models import Log
 from ...consumer import Consumer, ConsumerError
 from ...mail import MailFetcher, MailFetcherError
 
@@ -34,7 +36,7 @@ class Command(BaseCommand):
         self.verbosity = options["verbosity"]
 
         try:
-            self.file_consumer = Consumer(verbosity=self.verbosity)
+            self.file_consumer = Consumer()
             self.mail_fetcher = MailFetcher()
         except (ConsumerError, MailFetcherError) as e:
             raise CommandError(e)
@@ -44,6 +46,13 @@ class Command(BaseCommand):
         except FileExistsError:
             pass
 
+        logging.getLogger(__name__).info(
+            "Starting document consumer at {}".format(
+                settings.CONSUMPTION_DIR
+            ),
+            extra={"component": Log.COMPONENT_CONSUMER}
+        )
+
         try:
             while True:
                 self.loop()
@@ -1,10 +1,12 @@
+import json
 import os
 import time
 
 from django.conf import settings
 from django.core.management.base import BaseCommand, CommandError
+from django.core import serializers
 
-from documents.models import Document
+from documents.models import Document, Correspondent, Tag
 from paperless.db import GnuPG
 
 from ...mixins import Renderable
@@ -14,21 +16,26 @@ class Command(Renderable, BaseCommand):
 
     help = """
         Decrypt and rename all files in our collection into a given target
-        directory. Note that we don't export any of the parsed data since
-        that can always be re-collected via the consumer.
+        directory. And include a manifest file containing document data for
+        easy import.
    """.replace("    ", "")
 
     def add_arguments(self, parser):
         parser.add_argument("target")
+        parser.add_argument(
+            "--legacy",
+            action="store_true",
+            help="Don't try to export all of the document data, just dump the "
+                 "original document files out in a format that makes "
+                 "re-consuming them easy."
+        )
 
     def __init__(self, *args, **kwargs):
-        self.verbosity = 0
-        self.target = None
         BaseCommand.__init__(self, *args, **kwargs)
+        self.target = None
 
     def handle(self, *args, **options):
 
-        self.verbosity = options["verbosity"]
         self.target = options["target"]
 
         if not os.path.exists(self.target):
@@ -40,9 +47,22 @@ class Command(Renderable, BaseCommand):
         if not settings.PASSPHRASE:
             settings.PASSPHRASE = input("Please enter the passphrase: ")
 
-        for document in Document.objects.all():
+        if options["legacy"]:
+            self.dump_legacy()
+        else:
+            self.dump()
+
+    def dump(self):
+
+        documents = Document.objects.all()
+        document_map = {d.pk: d for d in documents}
+        manifest = json.loads(serializers.serialize("json", documents))
+        for document_dict in manifest:
+
+            document = document_map[document_dict["pk"]]
 
             target = os.path.join(self.target, document.file_name)
+            document_dict["__exported_file_name__"] = target
 
             print("Exporting: {}".format(target))
 
@@ -50,3 +70,37 @@ class Command(Renderable, BaseCommand):
                 f.write(GnuPG.decrypted(document.source_file))
             t = int(time.mktime(document.created.timetuple()))
             os.utime(target, times=(t, t))
+
+        manifest += json.loads(
+            serializers.serialize("json", Correspondent.objects.all()))
+
+        manifest += json.loads(serializers.serialize(
+            "json", Tag.objects.all()))
+
+        with open(os.path.join(self.target, "manifest.json"), "w") as f:
+            json.dump(manifest, f, indent=2)
+
+    def dump_legacy(self):
+
+        for document in Document.objects.all():
+
+            target = os.path.join(
+                self.target, self._get_legacy_file_name(document))
+
+            print("Exporting: {}".format(target))
+
+            with open(target, "wb") as f:
+                f.write(GnuPG.decrypted(document.source_file))
+            t = int(time.mktime(document.created.timetuple()))
+            os.utime(target, times=(t, t))
+
+    @staticmethod
+    def _get_legacy_file_name(doc):
+        if doc.correspondent and doc.title:
+            tags = ",".join([t.slug for t in doc.tags.all()])
+            if tags:
+                return "{} - {} - {}.{}".format(
+                    doc.correspondent, doc.title, tags, doc.file_type)
+            return "{} - {}.{}".format(
+                doc.correspondent, doc.title, doc.file_type)
+        return os.path.basename(doc.source_path)
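For orientation, the inferred shape of one document record in the resulting manifest.json: Django's serializer output plus the `__exported_file_name__` key added in `dump()`. The field values here are invented:

    {
        "model": "documents.document",
        "pk": 1,
        "fields": {
            "correspondent": 3,
            "title": "Invoice 0042",
            "file_type": "pdf"
        },
        "__exported_file_name__": "/mnt/export/ACME Inc - Invoice 0042.pdf"
    }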
99
src/documents/management/commands/document_importer.py
Normal file
@@ -0,0 +1,99 @@
import json
import os

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.core.management import call_command

from documents.models import Document
from paperless.db import GnuPG

from ...mixins import Renderable


class Command(Renderable, BaseCommand):

    help = """
        Using a manifest.json file, load the data from there, and import the
        documents it refers to.
    """.replace("    ", "")

    def add_arguments(self, parser):
        parser.add_argument("source")

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)
        self.source = None
        self.manifest = None

    def handle(self, *args, **options):

        self.source = options["source"]

        if not os.path.exists(self.source):
            raise CommandError("That path doesn't exist")

        if not os.access(self.source, os.R_OK):
            raise CommandError("That path doesn't appear to be readable")

        manifest_path = os.path.join(self.source, "manifest.json")
        self._check_manifest_exists(manifest_path)

        with open(manifest_path) as f:
            self.manifest = json.load(f)

        self._check_manifest()

        if not settings.PASSPHRASE:
            raise CommandError(
                "You need to define a passphrase before continuing. Please "
                "consult the documentation for setting up Paperless."
            )

        # Fill up the database with whatever is in the manifest
        call_command("loaddata", manifest_path)

        self._import_files_from_manifest()

    @staticmethod
    def _check_manifest_exists(path):
        if not os.path.exists(path):
            raise CommandError(
                "That directory doesn't appear to contain a manifest.json "
                "file."
            )

    def _check_manifest(self):

        for record in self.manifest:

            if not record["model"] == "documents.document":
                continue

            if "__exported_file_name__" not in record:
                raise CommandError(
                    'The manifest file contains a record which does not '
                    'refer to an actual document file.'
                )

            doc_file = record["__exported_file_name__"]
            if not os.path.exists(os.path.join(self.source, doc_file)):
                raise CommandError(
                    'The manifest file refers to "{}" which does not '
                    'appear to be in the source directory.'.format(doc_file)
                )

    def _import_files_from_manifest(self):

        for record in self.manifest:

            if not record["model"] == "documents.document":
                continue

            doc_file = record["__exported_file_name__"]
            document = Document.objects.get(pk=record["pk"])
            with open(doc_file, "rb") as unencrypted:
                with open(document.source_path, "wb") as encrypted:
                    print("Encrypting {} and saving it to {}".format(
                        doc_file, document.source_path))
                    encrypted.write(GnuPG.encrypted(unencrypted))
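A hedged usage sketch of the export/import pair; the paths are placeholders:

    from django.core.management import call_command

    # On the old machine:
    call_command("document_exporter", "/mnt/backup/paperless")
    # Copy the directory across, then on the new machine:
    call_command("document_importer", "/mnt/backup/paperless")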
@@ -10,8 +10,8 @@ class Command(Renderable, BaseCommand):
     help = """
         Using the current set of tagging rules, apply said rules to all
         documents in the database, effectively allowing you to back-tag all
-        previously indexed documents with tags created (or modified) after their
-        initial import.
+        previously indexed documents with tags created (or modified) after
+        their initial import.
    """.replace("    ", "")
 
     def __init__(self, *args, **kwargs):
@@ -23,9 +23,10 @@ class Command(Renderable, BaseCommand):
         self.verbosity = options["verbosity"]
 
         for document in Document.objects.all():
+
             tags = Tag.objects.exclude(
                 pk__in=document.tags.values_list("pk", flat=True))
-            for tag in tags:
-                if tag.matches(document.content):
-                    print('Tagging {} with "{}"'.format(document, tag))
-                    document.tags.add(tag)
+            for tag in Tag.match_all(document.content, tags):
+                print('Tagging {} with "{}"'.format(document, tag))
+                document.tags.add(tag)
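Both this command and the consumer now delegate to `Tag.match_all`, which is not part of this diff. From its two call sites it is presumably a classmethod along these lines (a guess, not project code):

    # Guessed shape of Tag.match_all, inferred from its call sites only:
    #
    #     @classmethod
    #     def match_all(cls, text, tags=None):
    #         if tags is None:
    #             tags = cls.objects.all()
    #         for tag in tags:
    #             if tag.matches(text.lower()):
    #                 yield tag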
20
src/documents/management/commands/loaddata_stdin.py
Normal file
@@ -0,0 +1,20 @@
import sys

from django.core.management.commands.loaddata import Command as LoadDataCommand


class Command(LoadDataCommand):
    """
    Allow the loading of data from standard in. Sourced originally from:
    https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 (MIT licensed)
    """

    def parse_name(self, fixture_name):
        self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)
        if fixture_name == '-':
            return '-', 'json', 'stdin'

    def find_fixtures(self, fixture_label):
        if fixture_label == '-':
            return [('-', None, '-')]
        return super(Command, self).find_fixtures(fixture_label)
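The point of the '-' fixture label is to let a fixture arrive over a pipe. The shell line in the comment below is an assumption based on the handling above:

    # cat manifest.json | ./manage.py loaddata_stdin -
    #
    # or programmatically, with sys.stdin already pointed at the fixture:
    from django.core.management import call_command

    call_command("loaddata_stdin", "-")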
70
src/documents/managers.py
Normal file
@@ -0,0 +1,70 @@
from django.conf import settings

from django.db import models
from django.db.models.aggregates import Max


class GroupConcat(models.Aggregate):
    """
    Theoretically, this should work in Sqlite, PostgreSQL, and MySQL, but I've
    only ever tested it in Sqlite.
    """

    ENGINE_SQLITE = 1
    ENGINE_POSTGRESQL = 2
    ENGINE_MYSQL = 3
    ENGINES = {
        "django.db.backends.sqlite3": ENGINE_SQLITE,
        "django.db.backends.postgresql_psycopg2": ENGINE_POSTGRESQL,
        "django.db.backends.postgresql": ENGINE_POSTGRESQL,
        "django.db.backends.mysql": ENGINE_MYSQL
    }

    def __init__(self, expression, separator="\n", **extra):

        self.engine = self._get_engine()
        self.function = self._get_function()
        self.template = self._get_template(separator)

        models.Aggregate.__init__(
            self,
            expression,
            output_field=models.CharField(),
            **extra
        )

    def _get_engine(self):
        engine = settings.DATABASES["default"]["ENGINE"]
        try:
            return self.ENGINES[engine]
        except KeyError:
            raise NotImplementedError(
                "There's currently no support for {} when it comes to group "
                "concatenation in Paperless".format(engine)
            )

    def _get_function(self):
        if self.engine == self.ENGINE_POSTGRESQL:
            return "STRING_AGG"
        return "GROUP_CONCAT"

    def _get_template(self, separator):
        if self.engine == self.ENGINE_MYSQL:
            return "%(function)s(%(expressions)s, SEPARATOR '{}')".format(
                separator)
        return "%(function)s(%(expressions)s, '{}')".format(separator)


class LogQuerySet(models.query.QuerySet):

    def by_group(self):
        return self.values("group").annotate(
            time=Max("modified"),
            messages=GroupConcat("message"),
        ).order_by("-time")


class LogManager(models.Manager):

    def get_queryset(self):
        return LogQuerySet(self.model, using=self._db)
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Generated by Django 1.9 on 2016-02-14 16:08
+# Generated by Django 1.9 on 2016-02-27 17:54
 from __future__ import unicode_literals
 
 from django.db import migrations, models
@@ -7,9 +7,8 @@ from django.db import migrations, models
 
 class Migration(migrations.Migration):
 
-    initial = True
-
     dependencies = [
+        ('documents', '0009_auto_20160214_0040'),
     ]
 
     operations = [
@@ -17,14 +16,15 @@ class Migration(migrations.Migration):
             name='Log',
             fields=[
                 ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('time', models.DateTimeField(auto_now_add=True)),
+                ('group', models.UUIDField(blank=True)),
                 ('message', models.TextField()),
-                ('level', models.PositiveIntegerField(choices=[(1, 'Error'), (2, 'Warning'), (3, 'Informational'), (4, 'Debugging')], default=3)),
+                ('level', models.PositiveIntegerField(choices=[(10, 'Debugging'), (20, 'Informational'), (30, 'Warning'), (40, 'Error'), (50, 'Critical')], default=20)),
                 ('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])),
+                ('created', models.DateTimeField(auto_now_add=True)),
+                ('modified', models.DateTimeField(auto_now=True)),
             ],
-        ),
-        migrations.AlterModelOptions(
-            name='log',
-            options={'ordering': ('-time',)},
+            options={
+                'ordering': ('-modified',),
+            },
         ),
     ]
28
src/documents/migrations/0011_auto_20160303_1929.py
Normal file
@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9.2 on 2016-03-03 19:29
from __future__ import unicode_literals

from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '0010_log'),
    ]

    operations = [
        migrations.RenameModel(
            old_name='Sender',
            new_name='Correspondent',
        ),
        migrations.AlterModelOptions(
            name='document',
            options={'ordering': ('correspondent', 'title')},
        ),
        migrations.RenameField(
            model_name='document',
            old_name='sender',
            new_name='correspondent',
        ),
    ]
119
src/documents/migrations/0012_auto_20160305_0040.py
Normal file
119
src/documents/migrations/0012_auto_20160305_0040.py
Normal file
@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9.2 on 2016-03-05 00:40
from __future__ import unicode_literals

import gnupg
import os
import re
import shutil
import subprocess
import tempfile

from django.conf import settings
from django.db import migrations
from django.utils.termcolors import colorize as colourise  # Spelling hurts me


class GnuPG(object):
    """
    A handy singleton to use when handling encrypted files.
    """

    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

    @classmethod
    def decrypted(cls, file_handle):
        return cls.gpg.decrypt_file(
            file_handle, passphrase=settings.PASSPHRASE).data

    @classmethod
    def encrypted(cls, file_handle):
        return cls.gpg.encrypt_file(
            file_handle,
            recipients=None,
            passphrase=settings.PASSPHRASE,
            symmetric=True
        ).data


def move_documents_and_create_thumbnails(apps, schema_editor):

    documents = os.listdir(os.path.join(settings.MEDIA_ROOT, "documents"))

    if set(documents) == {"originals", "thumbnails"}:
        return

    print(colourise(
        "\n\n"
        "  This is a one-time only migration to generate thumbnails for all of your\n"
        "  documents so that future UIs will have something to work with.  If you have\n"
        "  a lot of documents though, this may take a while, so a coffee break may be\n"
        "  in order."
        "\n", opts=("bold",)
    ))

    try:
        os.makedirs(settings.SCRATCH_DIR)
    except FileExistsError:
        pass

    for f in sorted(documents):

        if not f.endswith("gpg"):
            continue

        print("    {} {} {}".format(
            colourise("*", fg="green"),
            colourise("Generating a thumbnail for", fg="white"),
            colourise(f, fg="cyan")
        ))

        thumb_temp = tempfile.mkdtemp(
            prefix="paperless", dir=settings.SCRATCH_DIR)
        orig_temp = tempfile.mkdtemp(
            prefix="paperless", dir=settings.SCRATCH_DIR)

        orig_source = os.path.join(settings.MEDIA_ROOT, "documents", f)
        orig_target = os.path.join(orig_temp, f.replace(".gpg", ""))

        with open(orig_source, "rb") as encrypted:
            with open(orig_target, "wb") as unencrypted:
                unencrypted.write(GnuPG.decrypted(encrypted))

        subprocess.Popen((
            settings.CONVERT_BINARY,
            "-scale", "500x5000",
            "-alpha", "remove",
            orig_target,
            os.path.join(thumb_temp, "convert-%04d.png")
        )).wait()

        thumb_source = os.path.join(thumb_temp, "convert-0000.png")
        thumb_target = os.path.join(
            settings.MEDIA_ROOT,
            "documents",
            "thumbnails",
            re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f)
        )
        with open(thumb_source, "rb") as unencrypted:
            with open(thumb_target, "wb") as encrypted:
                encrypted.write(GnuPG.encrypted(unencrypted))

        shutil.rmtree(thumb_temp)
        shutil.rmtree(orig_temp)

        shutil.move(
            os.path.join(settings.MEDIA_ROOT, "documents", f),
            os.path.join(settings.MEDIA_ROOT, "documents", "originals", f),
        )


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '0011_auto_20160303_1929'),
    ]

    operations = [
        migrations.RunPython(move_documents_and_create_thumbnails),
    ]
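A note on the thumbnail naming above: the re.sub() call is what maps an
encrypted original's file name onto the matching thumbnail name. A minimal
sketch in a Python shell (the sample file name is made up for illustration):

    import re

    # Hypothetical stored original: zero-padded pk, original extension, .gpg suffix
    f = "0000042.pdf.gpg"
    re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f)  # -> '0000042.png.gpg'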
@ -1,7 +1,7 @@
 class Renderable(object):
     """
-    A handy mixin to make it easier/cleaner to print output based on a verbosity
-    value.
+    A handy mixin to make it easier/cleaner to print output based on a
+    verbosity value.
     """

     def _render(self, text, verbosity):
@ -1,5 +1,7 @@
+import logging
 import os
 import re
+import uuid

 from django.conf import settings
 from django.core.urlresolvers import reverse
@ -7,6 +9,8 @@ from django.db import models
 from django.template.defaultfilters import slugify
 from django.utils import timezone

+from .managers import LogManager
+
+
 class SluggedModel(models.Model):
@ -25,7 +29,7 @@ class SluggedModel(models.Model):
         return self.name


-class Sender(SluggedModel):
+class Correspondent(SluggedModel):

     # This regex is probably more restrictive than it needs to be, but it's
     # better safe than sorry.
@ -36,7 +40,7 @@ class Sender(SluggedModel):


 class Tag(SluggedModel):

     COLOURS = (
         (1, "#a6cee3"),
         (2, "#1f78b4"),
@ -71,9 +75,9 @@ class Tag(SluggedModel):
         default=MATCH_ANY,
         help_text=(
             "Which algorithm you want to use when matching text to the OCR'd "
-            "PDF. Here, \"any\" looks for any occurrence of any word provided "
-            "in the PDF, while \"all\" requires that every word provided "
-            "appear in the PDF, albeit not in the order provided. A "
+            "PDF. Here, \"any\" looks for any occurrence of any word "
+            "provided in the PDF, while \"all\" requires that every word "
+            "provided appear in the PDF, albeit not in the order provided. A "
             "\"literal\" match means that the text you enter must appear in "
             "the PDF exactly as you've entered it, and \"regular expression\" "
             "uses a regex to match the PDF. If you don't know what a regex "
@ -86,28 +90,40 @@ class Tag(SluggedModel):
         return "{}: \"{}\" ({})".format(
             self.name, self.match, self.get_matching_algorithm_display())

+    @classmethod
+    def match_all(cls, text, tags=None):
+
+        if tags is None:
+            tags = cls.objects.all()
+
+        text = text.lower()
+        for tag in tags:
+            if tag.matches(text):
+                yield tag
+
     def matches(self, text):

         # Check that match is not empty
         if self.match.strip() == "":
             return False

         if self.matching_algorithm == self.MATCH_ALL:
             for word in self.match.split(" "):
-                if word not in text:
+                if not re.search(r"\b{}\b".format(word), text):
                     return False
             return True

         if self.matching_algorithm == self.MATCH_ANY:
             for word in self.match.split(" "):
-                if word in text:
+                if re.search(r"\b{}\b".format(word), text):
                     return True
             return False

         if self.matching_algorithm == self.MATCH_LITERAL:
-            return self.match in text
+            return bool(re.search(r"\b{}\b".format(self.match), text))

         if self.matching_algorithm == self.MATCH_REGEX:
-            return re.search(re.compile(self.match), text)
+            return bool(re.search(re.compile(self.match), text))

         raise NotImplementedError("Unsupported matching algorithm")

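The change from substring checks to word-boundary regexes above is
behavioural, not cosmetic; a minimal sketch of the difference (the sample
string is illustrative):

    import re

    text = "I have alphas in me"

    "alpha" in text                      # True  -- old substring behaviour
    bool(re.search(r"\balpha\b", text))  # False -- new whole-word behaviour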
@ -125,8 +141,8 @@ class Document(models.Model):
     TYPE_TIF = "tiff"
     TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)

-    sender = models.ForeignKey(
-        Sender, blank=True, null=True, related_name="documents")
+    correspondent = models.ForeignKey(
+        Correspondent, blank=True, null=True, related_name="documents")
     title = models.CharField(max_length=128, blank=True, db_index=True)
     content = models.TextField(db_index=True)
     file_type = models.CharField(
@ -140,14 +156,15 @@ class Document(models.Model):
     modified = models.DateTimeField(auto_now=True, editable=False)

     class Meta(object):
-        ordering = ("sender", "title")
+        ordering = ("correspondent", "title")

     def __str__(self):
-        created = self.created.strftime("%Y-%m-%d")
-        if self.sender and self.title:
-            return "{}: {}, {}".format(created, self.sender, self.title)
-        if self.sender or self.title:
-            return "{}: {}".format(created, self.sender or self.title)
+        created = self.created.strftime("%Y%m%d%H%M%S")
+        if self.correspondent and self.title:
+            return "{}: {} - {}".format(
+                created, self.correspondent, self.title)
+        if self.correspondent or self.title:
+            return "{}: {}".format(created, self.correspondent or self.title)
         return str(created)

     @property
@ -155,6 +172,7 @@ class Document(models.Model):
         return os.path.join(
             settings.MEDIA_ROOT,
             "documents",
+            "originals",
             "{:07}.{}.gpg".format(self.pk, self.file_type)
         )
@ -164,14 +182,71 @@ class Document(models.Model):

     @property
     def file_name(self):
-        if self.sender and self.title:
-            tags = ",".join([t.slug for t in self.tags.all()])
-            if tags:
-                return "{} - {} - {}.{}".format(
-                    self.sender, self.title, tags, self.file_type)
-            return "{} - {}.{}".format(self.sender, self.title, self.file_type)
-        return os.path.basename(self.source_path)
+        return slugify(str(self)) + "." + self.file_type

     @property
     def download_url(self):
-        return reverse("fetch", kwargs={"pk": self.pk})
+        return reverse("fetch", kwargs={"kind": "doc", "pk": self.pk})

+    @property
+    def thumbnail_path(self):
+        return os.path.join(
+            settings.MEDIA_ROOT,
+            "documents",
+            "thumbnails",
+            "{:07}.png.gpg".format(self.pk)
+        )
+
+    @property
+    def thumbnail_file(self):
+        return open(self.thumbnail_path, "rb")
+
+    @property
+    def thumbnail_url(self):
+        return reverse("fetch", kwargs={"kind": "thumb", "pk": self.pk})
+
+
+class Log(models.Model):
+
+    LEVELS = (
+        (logging.DEBUG, "Debugging"),
+        (logging.INFO, "Informational"),
+        (logging.WARNING, "Warning"),
+        (logging.ERROR, "Error"),
+        (logging.CRITICAL, "Critical"),
+    )
+
+    COMPONENT_CONSUMER = 1
+    COMPONENT_MAIL = 2
+    COMPONENTS = (
+        (COMPONENT_CONSUMER, "Consumer"),
+        (COMPONENT_MAIL, "Mail Fetcher")
+    )
+
+    group = models.UUIDField(blank=True)
+    message = models.TextField()
+    level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO)
+    component = models.PositiveIntegerField(choices=COMPONENTS)
+    created = models.DateTimeField(auto_now_add=True)
+    modified = models.DateTimeField(auto_now=True)
+
+    objects = LogManager()
+
+    class Meta(object):
+        ordering = ("-modified",)
+
+    def __str__(self):
+        return self.message
+
+    def save(self, *args, **kwargs):
+        """
+        To allow for the case where we don't want to group the message, we
+        shouldn't force the caller to specify a one-time group value. However,
+        allowing group=None means that the manager can't differentiate the
+        different un-grouped messages, so instead we set a random one here.
+        """
+
+        if not self.group:
+            self.group = uuid.uuid4()
+
+        models.Model.save(self, *args, **kwargs)
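The save() override above means every Log row ends up with a group UUID. A
rough sketch of how a caller would feed the group and component through the
standard logging framework, assuming the "documents" logger is wired to the
custom handler configured in settings:

    import logging
    import uuid

    from documents.models import Log

    logger = logging.getLogger(
        "documents.management.commands.document_consumer")

    group = uuid.uuid4()
    logger.info("Consumption started",
                extra={"group": group, "component": Log.COMPONENT_CONSUMER})
    logger.info("Consumption finished",
                extra={"group": group, "component": Log.COMPONENT_CONSUMER})
    # Both rows share one group, so Log.objects.all().by_group() can collapse them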
55
src/documents/serialisers.py
Normal file
@ -0,0 +1,55 @@
from rest_framework import serializers

from .models import Correspondent, Tag, Document, Log


class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):

    class Meta(object):
        model = Correspondent
        fields = ("id", "slug", "name")


class TagSerializer(serializers.HyperlinkedModelSerializer):

    class Meta(object):
        model = Tag
        fields = (
            "id", "slug", "name", "colour", "match", "matching_algorithm")


class DocumentSerializer(serializers.ModelSerializer):

    correspondent = serializers.HyperlinkedRelatedField(
        read_only=True, view_name="drf:correspondent-detail", allow_null=True)
    tags = serializers.HyperlinkedRelatedField(
        read_only=True, view_name="drf:tag-detail", many=True)

    class Meta(object):
        model = Document
        fields = (
            "id",
            "correspondent",
            "title",
            "content",
            "file_type",
            "tags",
            "created",
            "modified",
            "file_name",
            "download_url",
            "thumbnail_url",
        )


class LogSerializer(serializers.ModelSerializer):

    time = serializers.DateTimeField()
    messages = serializers.CharField()

    class Meta(object):
        model = Log
        fields = (
            "time",
            "messages"
        )
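A rough sketch of exercising DocumentSerializer from a shell; the hyperlinked
fields need a request in the serializer context to build URLs, and the pk is
illustrative:

    from rest_framework.test import APIRequestFactory

    from documents.models import Document
    from documents.serialisers import DocumentSerializer

    request = APIRequestFactory().get("/api/documents/")
    doc = Document.objects.get(pk=1)
    data = DocumentSerializer(doc, context={"request": request}).data
    # data["download_url"] -> "/fetch/doc/1"; data["correspondent"] -> hyperlink or None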
10
src/documents/templates/documents/index.html
Normal file
@ -0,0 +1,10 @@
<!DOCTYPE html>

<html lang="en-gb">
  <head>
    <title>Paperless</title>
    <meta charset="utf-8">
  </head>
  <body>
  </body>
</html>
@ -4,18 +4,26 @@ from ..consumer import Consumer


 class TestAttachment(TestCase):

     TAGS = ("tag1", "tag2", "tag3")
     CONSUMER = Consumer()
+    SUFFIXES = (
+        "pdf", "png", "jpg", "jpeg", "gif",
+        "PDF", "PNG", "JPG", "JPEG", "GIF",
+        "PdF", "PnG", "JpG", "JPeG", "GiF",
+    )

     def _test_guess_attributes_from_name(self, path, sender, title, tags):
-        for suffix in ("pdf", "png", "jpg", "jpeg", "gif"):
+        for suffix in self.SUFFIXES:
             f = path.format(suffix)
             results = self.CONSUMER._guess_attributes_from_name(f)
             self.assertEqual(results[0].name, sender, f)
             self.assertEqual(results[1], title, f)
             self.assertEqual(tuple([t.slug for t in results[2]]), tags, f)
-            self.assertEqual(results[3], suffix, f)
+            if suffix.lower() == "jpeg":
+                self.assertEqual(results[3], "jpg", f)
+            else:
+                self.assertEqual(results[3], suffix.lower(), f)

     def test_guess_attributes_from_name0(self):
         self._test_guess_attributes_from_name(
36
src/documents/tests/test_importer.py
Normal file
@ -0,0 +1,36 @@
from django.core.management.base import CommandError
from django.test import TestCase

from ..management.commands.document_importer import Command


class TestImporter(TestCase):

    def __init__(self, *args, **kwargs):
        TestCase.__init__(self, *args, **kwargs)

    def test_check_manifest_exists(self):
        cmd = Command()
        self.assertRaises(
            CommandError, cmd._check_manifest_exists, "/tmp/manifest.json")

    def test_check_manifest(self):

        cmd = Command()
        cmd.source = "/tmp"

        cmd.manifest = [{"model": "documents.document"}]
        with self.assertRaises(CommandError) as cm:
            cmd._check_manifest()
        self.assertTrue(
            'The manifest file contains a record' in str(cm.exception))

        cmd.manifest = [{
            "model": "documents.document",
            "__exported_file_name__": "noexist.pdf"
        }]
        # self.assertRaises(CommandError, cmd._check_manifest)
        with self.assertRaises(CommandError) as cm:
            cmd._check_manifest()
        self.assertTrue(
            'The manifest file refers to "noexist.pdf"' in str(cm.exception))
142
src/documents/tests/test_logger.py
Normal file
@ -0,0 +1,142 @@
import logging
import uuid

from unittest import mock

from django.test import TestCase

from ..models import Log


class TestPaperlessLog(TestCase):

    def __init__(self, *args, **kwargs):
        TestCase.__init__(self, *args, **kwargs)
        self.logger = logging.getLogger(
            "documents.management.commands.document_consumer")

    def test_ignored(self):
        with mock.patch("logging.StreamHandler.emit") as __:
            self.assertEqual(Log.objects.all().count(), 0)
            self.logger.info("This is an informational message")
            self.logger.warning("This is an informational message")
            self.logger.error("This is an informational message")
            self.logger.critical("This is an informational message")
            self.assertEqual(Log.objects.all().count(), 0)

    def test_that_it_saves_at_all(self):

        kw = {
            "group": uuid.uuid4(),
            "component": Log.COMPONENT_MAIL
        }

        self.assertEqual(Log.objects.all().count(), 0)

        with mock.patch("logging.StreamHandler.emit") as __:

            # Debug messages are ignored by default
            self.logger.debug("This is a debugging message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 0)

            self.logger.info("This is an informational message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 1)

            self.logger.warning("This is an warning message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 2)

            self.logger.error("This is an error message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 3)

            self.logger.critical("This is a critical message", extra=kw)
            self.assertEqual(Log.objects.all().count(), 4)

    def test_groups(self):

        kw1 = {
            "group": uuid.uuid4(),
            "component": Log.COMPONENT_MAIL
        }
        kw2 = {
            "group": uuid.uuid4(),
            "component": Log.COMPONENT_MAIL
        }

        self.assertEqual(Log.objects.all().count(), 0)

        with mock.patch("logging.StreamHandler.emit") as __:

            # Debug messages are ignored by default
            self.logger.debug("This is a debugging message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 0)

            self.logger.info("This is an informational message", extra=kw2)
            self.assertEqual(Log.objects.all().count(), 1)
            self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 1)

            self.logger.warning("This is an warning message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 2)
            self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 1)

            self.logger.error("This is an error message", extra=kw2)
            self.assertEqual(Log.objects.all().count(), 3)
            self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 2)

            self.logger.critical("This is a critical message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 4)
            self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 2)

    def test_components(self):

        c1 = Log.COMPONENT_CONSUMER
        c2 = Log.COMPONENT_MAIL
        kw1 = {
            "group": uuid.uuid4(),
            "component": c1
        }
        kw2 = {
            "group": kw1["group"],
            "component": c2
        }

        self.assertEqual(Log.objects.all().count(), 0)

        with mock.patch("logging.StreamHandler.emit") as __:

            # Debug messages are ignored by default
            self.logger.debug("This is a debugging message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 0)

            self.logger.info("This is an informational message", extra=kw2)
            self.assertEqual(Log.objects.all().count(), 1)
            self.assertEqual(Log.objects.filter(component=c2).count(), 1)

            self.logger.warning("This is an warning message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 2)
            self.assertEqual(Log.objects.filter(component=c1).count(), 1)

            self.logger.error("This is an error message", extra=kw2)
            self.assertEqual(Log.objects.all().count(), 3)
            self.assertEqual(Log.objects.filter(component=c2).count(), 2)

            self.logger.critical("This is a critical message", extra=kw1)
            self.assertEqual(Log.objects.all().count(), 4)
            self.assertEqual(Log.objects.filter(component=c1).count(), 2)

    def test_groupped_query(self):

        kw = {
            "group": uuid.uuid4(),
            "component": Log.COMPONENT_MAIL
        }
        with mock.patch("logging.StreamHandler.emit") as __:
            self.logger.info("Message 0", extra=kw)
            self.logger.info("Message 1", extra=kw)
            self.logger.info("Message 2", extra=kw)
            self.logger.info("Message 3", extra=kw)

        self.assertEqual(Log.objects.all().by_group().count(), 1)
        self.assertEqual(
            Log.objects.all().by_group()[0]["messages"],
            "Message 0\nMessage 1\nMessage 2\nMessage 3"
        )
@ -3,6 +3,7 @@ import os
 import magic

 from hashlib import md5
+from unittest import mock

 from django.conf import settings
 from django.test import TestCase
@ -27,7 +28,8 @@ class TestMessage(TestCase):

         with open(self.sample, "rb") as f:

-            message = Message(f.read(), verbosity=0)
+            with mock.patch("logging.StreamHandler.emit") as __:
+                message = Message(f.read())

         self.assertTrue(message)
         self.assertEqual(message.subject, "Test 0")
119
src/documents/tests/test_tags.py
Normal file
@ -0,0 +1,119 @@
from django.test import TestCase

from ..models import Tag


class TestTagMatching(TestCase):

    def test_match_all(self):

        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_ALL
        )
        self.assertFalse(t.matches("I have alpha in me"))
        self.assertFalse(t.matches("I have charlie in me"))
        self.assertFalse(t.matches("I have gamma in me"))
        self.assertFalse(t.matches("I have alpha and charlie in me"))
        self.assertTrue(t.matches("I have alpha, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas in me"))
        self.assertFalse(t.matches("I have bravo in me"))

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_ALL
        )
        self.assertFalse(t.matches("I have 12 in me"))
        self.assertFalse(t.matches("I have 34 in me"))
        self.assertFalse(t.matches("I have 56 in me"))
        self.assertFalse(t.matches("I have 12 and 34 in me"))
        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
        self.assertFalse(t.matches("I have 120, 34, and 56 in me"))
        self.assertFalse(t.matches("I have 123456 in me"))
        self.assertFalse(t.matches("I have 01234567 in me"))

    def test_match_any(self):

        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_ANY
        )

        self.assertTrue(t.matches("I have alpha in me"))
        self.assertTrue(t.matches("I have charlie in me"))
        self.assertTrue(t.matches("I have gamma in me"))
        self.assertTrue(t.matches("I have alpha and charlie in me"))
        self.assertFalse(t.matches("I have alphas in me"))
        self.assertFalse(t.matches("I have bravo in me"))

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_ANY
        )
        self.assertTrue(t.matches("I have 12 in me"))
        self.assertTrue(t.matches("I have 34 in me"))
        self.assertTrue(t.matches("I have 56 in me"))
        self.assertTrue(t.matches("I have 12 and 34 in me"))
        self.assertTrue(t.matches("I have 12 34, and 56 in me"))
        self.assertTrue(t.matches("I have 120, 34, and 560 in me"))
        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
        self.assertFalse(t.matches("I have 123456 in me"))
        self.assertFalse(t.matches("I have 01234567 in me"))

    def test_match_literal(self):

        t = Tag.objects.create(
            name="Test 0",
            match="alpha charlie gamma",
            matching_algorithm=Tag.MATCH_LITERAL
        )

        self.assertFalse(t.matches("I have alpha in me"))
        self.assertFalse(t.matches("I have charlie in me"))
        self.assertFalse(t.matches("I have gamma in me"))
        self.assertFalse(t.matches("I have alpha and charlie in me"))
        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
        self.assertTrue(t.matches("I have 'alpha charlie gamma' in me"))
        self.assertFalse(t.matches("I have alphas in me"))
        self.assertFalse(t.matches("I have bravo in me"))

        t = Tag.objects.create(
            name="Test 1",
            match="12 34 56",
            matching_algorithm=Tag.MATCH_LITERAL
        )
        self.assertFalse(t.matches("I have 12 in me"))
        self.assertFalse(t.matches("I have 34 in me"))
        self.assertFalse(t.matches("I have 56 in me"))
        self.assertFalse(t.matches("I have 12 and 34 in me"))
        self.assertFalse(t.matches("I have 12 34, and 56 in me"))
        self.assertFalse(t.matches("I have 120, 34, and 560 in me"))
        self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
        self.assertFalse(t.matches("I have 123456 in me"))
        self.assertFalse(t.matches("I have 01234567 in me"))
        self.assertTrue(t.matches("I have 12 34 56 in me"))

    def test_match_regex(self):

        t = Tag.objects.create(
            name="Test 0",
            match="alpha\w+gamma",
            matching_algorithm=Tag.MATCH_REGEX
        )

        self.assertFalse(t.matches("I have alpha in me"))
        self.assertFalse(t.matches("I have gamma in me"))
        self.assertFalse(t.matches("I have alpha and charlie in me"))
        self.assertTrue(t.matches("I have alpha_and_gamma in me"))
        self.assertTrue(t.matches("I have alphas_and_gamma in me"))
        self.assertFalse(t.matches("I have alpha,and,gamma in me"))
        self.assertFalse(t.matches("I have alpha and gamma in me"))
        self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
        self.assertFalse(t.matches("I have alphas in me"))
@ -1,21 +1,41 @@
+from django.contrib.auth.mixins import LoginRequiredMixin
 from django.http import HttpResponse
-from django.template.defaultfilters import slugify
 from django.views.decorators.csrf import csrf_exempt
-from django.views.generic import FormView, DetailView
+from django.views.generic import FormView, DetailView, TemplateView

+from rest_framework.mixins import (
+    RetrieveModelMixin, UpdateModelMixin, DestroyModelMixin, ListModelMixin)
+from rest_framework.pagination import PageNumberPagination
+from rest_framework.permissions import IsAuthenticated
+from rest_framework.viewsets import (
+    ModelViewSet, ReadOnlyModelViewSet, GenericViewSet)
+
 from paperless.db import GnuPG

-from .models import Document
 from .forms import UploadForm
+from .models import Correspondent, Tag, Document, Log
+from .serialisers import (
+    CorrespondentSerializer, TagSerializer, DocumentSerializer, LogSerializer)


-class PdfView(DetailView):
+class IndexView(TemplateView):
+
+    template_name = "documents/index.html"
+
+    def get_context_data(self, **kwargs):
+        print(kwargs)
+        print(self.request.GET)
+        print(self.request.POST)
+        return TemplateView.get_context_data(self, **kwargs)
+
+
+class FetchView(DetailView):

     model = Document

     def render_to_response(self, context, **response_kwargs):
         """
-        Override the default to return the unencrypted PDF as raw data.
+        Override the default to return the unencrypted image/PDF as raw data.
         """

         content_types = {
@ -26,19 +46,25 @@ class PdfView(DetailView):
             Document.TYPE_TIF: "image/tiff",
         }

+        if self.kwargs["kind"] == "thumb":
+            return HttpResponse(
+                GnuPG.decrypted(self.object.thumbnail_file),
+                content_type=content_types[Document.TYPE_PNG]
+            )
+
         response = HttpResponse(
             GnuPG.decrypted(self.object.source_file),
             content_type=content_types[self.object.file_type]
         )
         response["Content-Disposition"] = 'attachment; filename="{}"'.format(
-            slugify(str(self.object)) + "." + self.object.file_type)
+            self.object.file_name)

         return response


-class PushView(FormView):
+class PushView(LoginRequiredMixin, FormView):
     """
-    A crude REST API for creating documents.
+    A crude REST-ish API for creating documents.
     """

     form_class = UploadForm
@ -52,3 +78,45 @@ class PushView(FormView):

     def form_invalid(self, form):
         return HttpResponse("0")
+
+
+class StandardPagination(PageNumberPagination):
+    page_size = 25
+    page_size_query_param = "page-size"
+    max_page_size = 100000
+
+
+class CorrespondentViewSet(ModelViewSet):
+    model = Correspondent
+    queryset = Correspondent.objects.all()
+    serializer_class = CorrespondentSerializer
+    pagination_class = StandardPagination
+    permission_classes = (IsAuthenticated,)
+
+
+class TagViewSet(ModelViewSet):
+    model = Tag
+    queryset = Tag.objects.all()
+    serializer_class = TagSerializer
+    pagination_class = StandardPagination
+    permission_classes = (IsAuthenticated,)
+
+
+class DocumentViewSet(RetrieveModelMixin,
+                      UpdateModelMixin,
+                      DestroyModelMixin,
+                      ListModelMixin,
+                      GenericViewSet):
+    model = Document
+    queryset = Document.objects.all()
+    serializer_class = DocumentSerializer
+    pagination_class = StandardPagination
+    permission_classes = (IsAuthenticated,)
+
+
+class LogViewSet(ReadOnlyModelViewSet):
+    model = Log
+    queryset = Log.objects.all().by_group()
+    serializer_class = LogSerializer
+    pagination_class = StandardPagination
+    permission_classes = (IsAuthenticated,)
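A minimal sketch of hitting the reworked FetchView with Django's test client
(the pk is illustrative); "doc" streams the decrypted original as an
attachment, while "thumb" returns the decrypted PNG:

    from django.test import Client

    c = Client()
    original = c.get("/fetch/doc/1")     # decrypted original, Content-Disposition: attachment
    thumbnail = c.get("/fetch/thumb/1")  # decrypted PNG thumbnail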
@ -1,12 +0,0 @@
-from django.contrib import admin
-
-from .models import Log
-
-
-class LogAdmin(admin.ModelAdmin):
-
-    list_display = ("message", "level", "component")
-    list_filter = ("level", "component",)
-
-
-admin.site.register(Log, LogAdmin)
@ -1,5 +0,0 @@
-from django.apps import AppConfig
-
-
-class LoggerConfig(AppConfig):
-    name = 'logger'
@ -1,50 +0,0 @@
-from django.db import models
-
-
-class Log(models.Model):
-
-    LEVEL_ERROR = 1
-    LEVEL_WARNING = 2
-    LEVEL_INFO = 3
-    LEVEL_DEBUG = 4
-    LEVELS = (
-        (LEVEL_ERROR, "Error"),
-        (LEVEL_WARNING, "Warning"),
-        (LEVEL_INFO, "Informational"),
-        (LEVEL_DEBUG, "Debugging"),
-    )
-
-    COMPONENT_CONSUMER = 1
-    COMPONENT_MAIL = 2
-    COMPONENTS = (
-        (COMPONENT_CONSUMER, "Consumer"),
-        (COMPONENT_MAIL, "Mail Fetcher")
-    )
-
-    time = models.DateTimeField(auto_now_add=True)
-    message = models.TextField()
-    level = models.PositiveIntegerField(choices=LEVELS, default=LEVEL_INFO)
-    component = models.PositiveIntegerField(choices=COMPONENTS)
-
-    class Meta(object):
-        ordering = ("-time",)
-
-    @classmethod
-    def error(cls, message, component):
-        cls.objects.create(
-            message=message, level=cls.LEVEL_ERROR, component=component)
-
-    @classmethod
-    def warning(cls, message, component):
-        cls.objects.create(
-            message=message, level=cls.LEVEL_WARNING, component=component)
-
-    @classmethod
-    def info(cls, message, component):
-        cls.objects.create(
-            message=message, level=cls.LEVEL_INFO, component=component)
-
-    @classmethod
-    def debug(cls, message, component):
-        cls.objects.create(
-            message=message, level=cls.LEVEL_DEBUG, component=component)
@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.
@ -1,3 +0,0 @@
-from django.shortcuts import render
-
-# Create your views here.
@ -12,6 +12,8 @@ https://docs.djangoproject.com/en/1.9/ref/settings/

 import os

+from dotenv import load_dotenv
+
 # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@ -42,7 +44,8 @@ INSTALLED_APPS = [
     "django_extensions",

     "documents",
-    "logger",
+
+    "rest_framework",

 ]
@ -87,12 +90,12 @@ DATABASES = {
         "NAME": os.path.join(BASE_DIR, "..", "data", "db.sqlite3"),
     }
 }
-if os.environ.get("PAPERLESS_DBUSER") and os.environ.get("PAPERLESS_DBPASS"):
+if os.getenv("PAPERLESS_DBUSER") and os.getenv("PAPERLESS_DBPASS"):
     DATABASES["default"] = {
         "ENGINE": "django.db.backends.postgresql_psycopg2",
-        "NAME": os.environ.get("PAPERLESS_DBNAME", "paperless"),
-        "USER": os.environ.get("PAPERLESS_DBUSER"),
-        "PASSWORD": os.environ.get("PAPERLESS_DBPASS")
+        "NAME": os.getenv("PAPERLESS_DBNAME", "paperless"),
+        "USER": os.getenv("PAPERLESS_DBUSER"),
+        "PASSWORD": os.getenv("PAPERLESS_DBPASS")
     }
@ -139,55 +142,119 @@ STATIC_URL = '/static/'
 MEDIA_URL = "/media/"


-# Paperless-specific stuffs
-# Change these paths if yours are different
+# Paperless-specific stuff
+# You shouldn't have to edit any of these values. Rather, you can set these
+# values in /etc/paperless.conf instead.
 # ----------------------------------------------------------------------------

+# Tap paperless.conf if it's available
+if os.path.exists("/etc/paperless.conf"):
+    load_dotenv("/etc/paperless.conf")
+
+
+# Logging
+
+LOGGING = {
+    "version": 1,
+    "disable_existing_loggers": False,
+    "handlers": {
+        "consumer": {
+            "class": "documents.loggers.PaperlessLogger",
+        }
+    },
+    "loggers": {
+        "documents": {
+            "handlers": ["consumer"],
+            "level": os.getenv("PAPERLESS_CONSUMER_LOG_LEVEL", "INFO"),
+        },
+    },
+}
+
+
 # The default language that tesseract will attempt to use when parsing
 # documents. It should be a 3-letter language code consistent with ISO 639.
 OCR_LANGUAGE = "eng"

 # The amount of threads to use for OCR
-OCR_THREADS = os.environ.get("PAPERLESS_OCR_THREADS")
+OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS")

-# If this is true, any failed attempts to OCR a PDF will result in the PDF being
-# indexed anyway, with whatever we could get. If it's False, the file will
-# simply be left in the CONSUMPTION_DIR.
-FORGIVING_OCR = True
+# If this is true, any failed attempts to OCR a PDF will result in the PDF
+# being indexed anyway, with whatever we could get.  If it's False, the file
+# will simply be left in the CONSUMPTION_DIR.
+FORGIVING_OCR = bool(os.getenv("PAPERLESS_FORGIVING_OCR", "YES").lower() in ("yes", "y", "1", "t", "true"))

 # GNUPG needs a home directory for some reason
-GNUPG_HOME = os.environ.get("HOME", "/dev/null")
+GNUPG_HOME = os.getenv("HOME", "/tmp")

-# Convert is part of the Imagemagick package
-CONVERT_BINARY = "/usr/bin/convert"
+# Convert is part of the ImageMagick package
+CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY")
+
+# Unpaper
+UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")

 # This will be created if it doesn't exist
-SCRATCH_DIR = "/tmp/paperless"
+SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")

 # This is where Paperless will look for PDFs to index
-CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME")
+CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR")

 # If you want to use IMAP mail consumption, populate this with useful values.
-# If you leave HOST set to None, we assume you're not going to use this feature.
+# If you leave HOST set to None, we assume you're not going to use this
+# feature.
 MAIL_CONSUMPTION = {
-    "HOST": os.environ.get("PAPERLESS_CONSUME_MAIL_HOST"),
-    "PORT": os.environ.get("PAPERLESS_CONSUME_MAIL_PORT"),
-    "USERNAME": os.environ.get("PAPERLESS_CONSUME_MAIL_USER"),
-    "PASSWORD": os.environ.get("PAPERLESS_CONSUME_MAIL_PASS"),
+    "HOST": os.getenv("PAPERLESS_CONSUME_MAIL_HOST"),
+    "PORT": os.getenv("PAPERLESS_CONSUME_MAIL_PORT"),
+    "USERNAME": os.getenv("PAPERLESS_CONSUME_MAIL_USER"),
+    "PASSWORD": os.getenv("PAPERLESS_CONSUME_MAIL_PASS"),
     "USE_SSL": True,  # If True, use SSL/TLS to connect
     "INBOX": "INBOX"  # The name of the inbox on the server
 }

-# This is used to encrypt the original documents and decrypt them later when you
-# want to download them. Set it and change the permissions on this file to
+# This is used to encrypt the original documents and decrypt them later when
+# you want to download them.  Set it and change the permissions on this file
+# to 0600, or set it to `None` and you'll be prompted for the passphrase at
-# 0600, or set it to `None` and you'll be prompted for the passphrase at
 # runtime.  The default looks for an environment variable.
 # DON'T FORGET TO SET THIS as leaving it blank may cause some strange things
 # with GPG, including an interesting case where it may "encrypt" zero-byte
 # files.
-PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")
+PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE")

-# If you intend to use the "API" to push files into the consumer, you'll need to
-# provide a shared secret here. Leaving this as the default will disable the
-# API.
-UPLOAD_SHARED_SECRET = os.environ.get("PAPERLESS_SECRET", "")
+# If you intend to use the "API" to push files into the consumer, you'll need
+# to provide a shared secret here.  Leaving this as the default will disable
+# the API.
+SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "")
+
+#
+# TODO: Remove after 1.2
+#
+# This logic is here to address issue #44, wherein we were using inconsistent
+# constant names vs. environment variables.  If you're using Paperless for the
+# first time, you can safely ignore everything from here on, so long as you're
+# correctly defining the variables as per the documentation.
+#
+
+
+def deprecated(before, after):
+    print(
+        "\n\n"
+        "WARNING: {before} has been renamed to {after}.\n"
+        "WARNING: Use of {before} will not work as of version 1.2."
+        "\n\n".format(
+            before=before,
+            after=after
+        )
+    )
+
+if not CONVERT_BINARY:
+    CONVERT_BINARY = "convert"
+if os.getenv("PAPERLESS_CONVERT"):
+    deprecated("PAPERLESS_CONVERT", "PAPERLESS_CONVERT_BINARY")
+    CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT", CONVERT_BINARY)
+
+if not CONSUMPTION_DIR and os.getenv("PAPERLESS_CONSUME"):
+    deprecated("PAPERLESS_CONSUME", "PAPERLESS_CONSUMPTION_DIR")
+    CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUME")
+
+if not SHARED_SECRET and os.getenv("PAPERLESS_SECRET"):
+    deprecated("PAPERLESS_SECRET", "PAPERLESS_SHARED_SECRET")
+    SHARED_SECRET = os.getenv("PAPERLESS_SECRET", "")
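The FORGIVING_OCR line above packs its boolean parsing into one expression;
pulled apart, the accepted truthy spellings look like this (the helper name
and sample values are just for illustration):

    import os

    def env_flag(name, default="YES"):
        # Mirrors the parsing above: case-insensitive yes/y/1/t/true
        return os.getenv(name, default).lower() in ("yes", "y", "1", "t", "true")

    os.environ["PAPERLESS_FORGIVING_OCR"] = "true"
    env_flag("PAPERLESS_FORGIVING_OCR")  # True

    os.environ["PAPERLESS_FORGIVING_OCR"] = "no"
    env_flag("PAPERLESS_FORGIVING_OCR")  # False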
@ -15,15 +15,46 @@ Including another URLconf
 3. Add a URL to urlpatterns:  url(r'^blog/', include(blog_urls))
 """
 from django.conf import settings
-from django.conf.urls import url, static
+from django.conf.urls import url, static, include
 from django.contrib import admin

-from documents.views import PdfView, PushView
+from rest_framework.routers import DefaultRouter
+
+from documents.views import (
+    IndexView, FetchView, PushView,
+    CorrespondentViewSet, TagViewSet, DocumentViewSet, LogViewSet
+)
+
+router = DefaultRouter()
+router.register(r'correspondents', CorrespondentViewSet)
+router.register(r'tags', TagViewSet)
+router.register(r'documents', DocumentViewSet)
+router.register(r'logs', LogViewSet)

 urlpatterns = [
-    url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"),
-    url(r'', admin.site.urls),
+
+    # API
+    url(
+        r"^api/auth/",
+        include('rest_framework.urls', namespace="rest_framework")
+    ),
+    url(r"^api/", include(router.urls, namespace="drf")),
+
+    # Normal pages (coming soon)
+    # url(r"^$", IndexView.as_view(), name="index"),
+
+    # File downloads
+    url(
+        r"^fetch/(?P<kind>doc|thumb)/(?P<pk>\d+)$",
+        FetchView.as_view(),
+        name="fetch"
+    ),
+
+    # The Django admin
+    url(r"admin/", admin.site.urls),
+    url(r"", admin.site.urls),  # This is going away
+
 ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)

-if settings.UPLOAD_SHARED_SECRET:
+if settings.SHARED_SECRET:
     urlpatterns.insert(0, url(r"^push$", PushView.as_view(), name="push"))
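With the kind segment in the fetch route, both download styles resolve through
one view, and the router namespace gives the API predictable names; a small
sketch of resolving them (the pk is illustrative, and the drf: names follow
the default router's conventions):

    from django.core.urlresolvers import reverse

    reverse("fetch", kwargs={"kind": "doc", "pk": 1})    # '/fetch/doc/1'
    reverse("fetch", kwargs={"kind": "thumb", "pk": 1})  # '/fetch/thumb/1'
    reverse("drf:document-list")                         # '/api/documents/'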
@ -1 +1 @@
-__version__ = (0, 0, 6)
+__version__ = (0, 1, 1)
23
src/tox.ini
Normal file
@ -0,0 +1,23 @@
# Tox (http://tox.testrun.org/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.

[tox]
skipsdist = True
envlist = py34, py35, pep8

[testenv]
commands = {envpython} manage.py test
deps = -r{toxinidir}/../requirements.txt
setenv =
    PAPERLESS_CONSUME=/tmp
    PAPERLESS_PASSPHRASE=THISISNOTASECRET
    PAPERLESS_SECRET=paperless

[testenv:pep8]
commands=pep8
deps=pep8

[pep8]
exclude=.tox,migrations,paperless/settings.py