mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge branch 'master' into feature/api
This commit is contained in:
commit
a5124cade6
2
.gitignore
vendored
2
.gitignore
vendored
@ -68,6 +68,8 @@ db.sqlite3
|
||||
# Other stuff that doesn't belong
|
||||
virtualenv
|
||||
.vagrant
|
||||
docker-compose.yml
|
||||
docker-compose.env
|
||||
|
||||
# Used for development
|
||||
scripts/import-for-development
|
||||
|
43
Dockerfile
Normal file
43
Dockerfile
Normal file
@ -0,0 +1,43 @@
|
||||
FROM python:3.5.1
|
||||
MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
|
||||
|
||||
# Install dependencies
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
sudo \
|
||||
tesseract-ocr tesseract-ocr-eng imagemagick ghostscript \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install python dependencies
|
||||
RUN mkdir -p /usr/src/paperless
|
||||
WORKDIR /usr/src/paperless
|
||||
COPY requirements.txt /usr/src/paperless/
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy application
|
||||
RUN mkdir -p /usr/src/paperless/src
|
||||
COPY src/ /usr/src/paperless/src/
|
||||
|
||||
# Set consumption directory
|
||||
ENV PAPERLESS_CONSUME /consume
|
||||
RUN mkdir -p $PAPERLESS_CONSUME
|
||||
|
||||
# Migrate database
|
||||
WORKDIR /usr/src/paperless/src
|
||||
RUN mkdir /usr/src/paperless/data
|
||||
RUN ./manage.py migrate
|
||||
|
||||
# Create user
|
||||
RUN groupadd -g 1000 paperless \
|
||||
&& useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \
|
||||
&& chown -Rh paperless:paperless /usr/src/paperless
|
||||
|
||||
# Setup entrypoint
|
||||
COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh
|
||||
RUN chmod 755 /sbin/docker-entrypoint.sh
|
||||
|
||||
# Mount volumes
|
||||
VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume"]
|
||||
|
||||
ENTRYPOINT ["/sbin/docker-entrypoint.sh"]
|
||||
CMD ["--help"]
|
15
docker-compose.env.example
Normal file
15
docker-compose.env.example
Normal file
@ -0,0 +1,15 @@
|
||||
# Environment variables to set for Paperless
|
||||
# Commented out variables will be replaced by a default within Paperless.
|
||||
|
||||
# Passphrase Paperless uses to encrypt and decrypt your documents
|
||||
PAPERLESS_PASSPHRASE=CHANGE_ME
|
||||
|
||||
# The number of threads to use for text recognition
|
||||
# PAPERLESS_OCR_THREADS=4
|
||||
|
||||
# Additional languages to install for text recognition
|
||||
# PAPERLESS_OCR_LANGUAGES=deu ita
|
||||
|
||||
# You can change the default user and group id to a custom one
|
||||
# USERMAP_UID=1000
|
||||
# USERMAP_GID=1000
|
37
docker-compose.yml.example
Normal file
37
docker-compose.yml.example
Normal file
@ -0,0 +1,37 @@
|
||||
version: '2'
|
||||
|
||||
services:
|
||||
webserver:
|
||||
image: paperless
|
||||
ports:
|
||||
# You can adapt the port you want Paperless to listen on by
|
||||
# modifying the part before the `:`.
|
||||
- "8000:8000"
|
||||
volumes:
|
||||
- paperless-data:/usr/src/paperless/data
|
||||
- paperless-media:/usr/src/paperless/media
|
||||
env_file: docker-compose.env
|
||||
environment:
|
||||
- PAPERLESS_OCR_LANGUAGES=
|
||||
command: ["runserver", "0.0.0.0:8000"]
|
||||
|
||||
consumer:
|
||||
image: paperless
|
||||
volumes:
|
||||
- paperless-data:/usr/src/paperless/data
|
||||
- paperless-media:/usr/src/paperless/media
|
||||
# You have to adapt the local path you want the consumption
|
||||
# directory to mount to by modifying the part before the ':'.
|
||||
- /path/to/arbitrary/place:/consume
|
||||
# Likewise, you can add a local path to mount a directory for
|
||||
# exporting. This is not strictly needed for paperless to
|
||||
# function, only if you're exporting your files: uncomment
|
||||
# it and fill in a local path if you know you're going to
|
||||
# want to export your documents.
|
||||
# - /path/to/another/arbitrary/place:/export
|
||||
env_file: docker-compose.env
|
||||
command: ["document_consumer"]
|
||||
|
||||
volumes:
|
||||
paperless-data:
|
||||
paperless-media:
|
18
docs/Dockerfile
Normal file
18
docs/Dockerfile
Normal file
@ -0,0 +1,18 @@
|
||||
FROM python:3.5.1
|
||||
MAINTAINER Pit Kleyersburg <pitkley@googlemail.com>
|
||||
|
||||
# Install Sphinx and Pygments
|
||||
RUN pip install Sphinx Pygments
|
||||
|
||||
# Setup directories, copy data
|
||||
RUN mkdir /build
|
||||
COPY . /build
|
||||
WORKDIR /build/docs
|
||||
|
||||
# Build documentation
|
||||
RUN make html
|
||||
|
||||
# Start webserver
|
||||
WORKDIR /build/docs/_build/html
|
||||
EXPOSE 8000/tcp
|
||||
CMD ["python3", "-m", "http.server"]
|
@ -30,6 +30,20 @@ as part of the update:
|
||||
Note that it's possible (even likely) that while ``git pull`` may update some
|
||||
files, the ``migrate`` step may not update anything. This is totally normal.
|
||||
|
||||
If you are :ref:`using Docker <setup-installation-docker>` the update process
|
||||
requires only one additional step:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ cd /path/to/project
|
||||
$ git pull
|
||||
$ docker build -t paperless .
|
||||
$ docker-compose up -d
|
||||
$ docker-compose run --rm webserver migrate
|
||||
|
||||
If ``git pull`` doesn't report any changes, there is no need to continue with
|
||||
the remaining steps.
|
||||
|
||||
|
||||
.. _migrating-backup:
|
||||
|
||||
@ -53,6 +67,45 @@ with Django's ``dumpdata`` command, which produces JSON output.
|
||||
$ ./manage.py document_export /path/to/arbitrary/place/
|
||||
$ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json
|
||||
|
||||
If you are :ref:`using Docker <setup-installation-docker>`, exporting your tags
|
||||
as JSON is almost as easy:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ docker-compose run --rm webserver dumpdata documents.Tag > /path/to/arbitrary/place/tags.json
|
||||
|
||||
To export the documents you can either use ``docker run`` directly, specifying all
|
||||
the commandline options by hand, or (more simply) mount a second volume for export.
|
||||
|
||||
To mount a volume for exports, follow the instructions in the
|
||||
``docker-compose.yml.example`` file for the ``/export`` volume (making the changes
|
||||
in your own ``docker-compose.yml`` file, of course). Once you have the
|
||||
volume mounted, the command to run an export is:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ docker-compose run --rm consumer document_exporter /export
|
||||
|
||||
If you prefer to use ``docker run`` directly, supplying the necessary commandline
|
||||
options:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ # Identify your containers
|
||||
$ docker-compose ps
|
||||
Name Command State Ports
|
||||
-------------------------------------------------------------------------
|
||||
paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0
|
||||
paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0
|
||||
|
||||
$ # Make sure to replace your passphrase and remove or adapt the id mapping
|
||||
$ docker run --rm \
|
||||
--volumes-from paperless_data_1 \
|
||||
--volume /path/to/arbitrary/place:/export \
|
||||
-e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \
|
||||
-e USERMAP_UID=1000 -e USERMAP_GID=1000 \
|
||||
paperless document_exporter /export
|
||||
|
||||
|
||||
.. _migrating-restoring:
|
||||
|
||||
@ -77,3 +130,25 @@ exported documents into the consumption directory and start up the consumer.
|
||||
$ cp /path/to/exported/docs/* /path/to/consumption/dir/
|
||||
$ ./manage.py document_consumer
|
||||
|
||||
Importing your data if you are :ref:`using Docker <setup-installation-docker>`
|
||||
is almost as simple:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ # Stop and remove your current containers
|
||||
$ docker-compose stop
|
||||
$ docker-compose rm -f
|
||||
|
||||
$ # Recreate them, add the superuser
|
||||
$ docker-compose up -d
|
||||
$ docker-compose run --rm webserver createsuperuser
|
||||
|
||||
$ # Load the tags
|
||||
$ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin -
|
||||
|
||||
$ # Load your exported documents into the consumption directory
|
||||
$ # (How you do this highly depends on how you have set this up)
|
||||
$ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/
|
||||
|
||||
After loading the documents into the consumption directory the consumer will
|
||||
immediately start consuming the documents.
|
||||
|
@ -101,3 +101,16 @@ you'd like to generate your own docs locally, you'll need to:
|
||||
$ pip install sphinx
|
||||
|
||||
and then cd into the ``docs`` directory and type ``make html``.
|
||||
|
||||
If you are using Docker, you can use the following commands to build the
|
||||
documentation and run a webserver serving it on `port 8001`_:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ pwd
|
||||
/path/to/paperless
|
||||
|
||||
$ docker build -t paperless:docs -f docs/Dockerfile .
|
||||
$ docker run --rm -it -p "8001:8000" paperless:docs
|
||||
|
||||
.. _port 8001: http://127.0.0.1:8001
|
||||
|
174
docs/setup.rst
174
docs/setup.rst
@ -37,11 +37,18 @@ or just download the tarball and go that route:
|
||||
Installation & Configuration
|
||||
----------------------------
|
||||
|
||||
You can go two routes with setting up and running Paperless. The *Vagrant*
|
||||
route is quick & easy, but means you're running a VM which comes with memory
|
||||
consumption etc. Alternatively the standard, "bare metal" approach is a little
|
||||
more complicated.
|
||||
You can go multiple routes with setting up and running Paperless. The `Vagrant
|
||||
route`_ is quick & easy, but means you're running a VM which comes with memory
|
||||
consumption etc. We also `support Docker`_, which you can use natively under
|
||||
Linux and in a VM with `Docker Machine`_ (this guide was written for native
|
||||
Docker usage under Linux, you might have to adapt it for Docker Machine.)
|
||||
Alternatively the standard, `bare metal`_ approach is a little more complicated.
|
||||
|
||||
.. _Vagrant route: setup-installation-vagrant_
|
||||
.. _support Docker: setup-installation-docker_
|
||||
.. _bare metal: setup-installation-standard_
|
||||
|
||||
.. _Docker Machine: https://docs.docker.com/machine/
|
||||
|
||||
.. _setup-installation-standard:
|
||||
|
||||
@ -118,6 +125,157 @@ Vagrant Method
|
||||
.. _Paperless server: http://172.28.128.4:8000
|
||||
|
||||
|
||||
.. _setup-installation-docker:
|
||||
|
||||
Docker Method
|
||||
.............
|
||||
|
||||
1. Install `Docker`_.
|
||||
|
||||
.. caution::
|
||||
|
||||
As mentioned earlier, this guide assumes that you use Docker natively
|
||||
under Linux. If you are using `Docker Machine`_ under Mac OS X or Windows,
|
||||
you will have to adapt IP addresses, volume-mounting, command execution
|
||||
and maybe more.
|
||||
|
||||
2. Install `docker-compose`_. [#compose]_
|
||||
|
||||
.. caution::
|
||||
|
||||
If you want to use the included ``docker-compose.yml.example`` file, you
|
||||
need to have at least Docker version **1.10.0** and docker-compose
|
||||
version **1.6.0**.
|
||||
|
||||
See the `Docker installation guide`_ on how to install the current
|
||||
version of Docker for your operating system or Linux distribution of
|
||||
choice. To get an up-to-date version of docker-compose, follow the
|
||||
`docker-compose installation guide`_ if your package repository doesn't
|
||||
include it.
|
||||
|
||||
.. _Docker installation guide: https://docs.docker.com/engine/installation/
|
||||
.. _docker-compose installation guide: https://docs.docker.com/compose/install/
|
||||
|
||||
3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml`` and
|
||||
a copy of ``docker-compose.env.example`` as ``docker-compose.env``. You'll be
|
||||
editing both these files: taking a copy ensures that you can ``git pull`` to
|
||||
receive updates without risking merge conflicts with your modified versions
|
||||
of the configuration files.
|
||||
4. Modify ``docker-compose.yml`` to your preferences, following the instructions
|
||||
in comments in the file. The only change that is a hard requirement is to
|
||||
specify where the consumption directory should mount.
|
||||
5. Modify ``docker-compose.env`` and adapt the following environment variables:
|
||||
|
||||
``PAPERLESS_PASSPHRASE``
|
||||
This is the passphrase Paperless uses to encrypt/decrypt the original
|
||||
document.
|
||||
|
||||
``PAPERLESS_OCR_THREADS``
|
||||
This is the number of threads the OCR process will spawn to process
|
||||
document pages in parallel. If the variable is not set, Python determines
|
||||
the core-count of your CPU and uses that value.
|
||||
|
||||
``PAPERLESS_OCR_LANGUAGES``
|
||||
If you want the OCR to recognize other languages in addition to the default
|
||||
English, set this parameter to a space separated list of three-letter
|
||||
language codes as defined by `ISO 639-2/T`_. For a list of available languages --
|
||||
including their three letter codes -- see the `Debian packagelist`_.
|
||||
|
||||
``USERMAP_UID`` and ``USERMAP_GID``
|
||||
If you want to mount the consumption volume (directory ``/consume`` within
|
||||
the containers) to a host-directory -- which you probably want to do --
|
||||
access rights might be an issue. The default user and group ``paperless``
|
||||
in the containers have an id of 1000. The containers will enforce that the
|
||||
owning group of the consumption directory will be ``paperless`` to be able
|
||||
to delete consumed documents. If your host-system has a group with an id of
|
||||
1000 and you don't want this group to have access rights to the consumption
|
||||
directory, you can use ``USERMAP_GID`` to change the id in the container
|
||||
and thus the one of the consumption directory. Furthermore, you can change
|
||||
the id of the default user as well using ``USERMAP_UID``.
|
||||
|
||||
6. Run ``docker-compose up -d``. This will create and start the necessary
|
||||
containers.
|
||||
7. To be able to login, you will need a super user. To create it, execute the
|
||||
following command:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ docker-compose run --rm webserver createsuperuser
|
||||
|
||||
This will prompt you to set a username (default ``paperless``), an optional
|
||||
e-mail address and finally a password.
|
||||
8. The default ``docker-compose.yml`` exports the webserver on your local port
|
||||
8000. If you haven't adapted this, you should now be able to visit your
|
||||
`Paperless webserver`_ at ``http://127.0.0.1:8000``. You can login with the
|
||||
user and password you just created.
|
||||
9. Add files to the consumption directory the way you prefer. The following are two
|
||||
possible options:
|
||||
|
||||
1. Mount the consumption directory to a local host path by modifying your
|
||||
``docker-compose.yml``:
|
||||
|
||||
.. code-block:: diff
|
||||
|
||||
diff --git a/docker-compose.yml b/docker-compose.yml
|
||||
--- a/docker-compose.yml
|
||||
+++ b/docker-compose.yml
|
||||
@@ -17,9 +18,8 @@ services:
|
||||
volumes:
|
||||
- paperless-data:/usr/src/paperless/data
|
||||
- paperless-media:/usr/src/paperless/media
|
||||
- - /consume
|
||||
+ - /local/path/you/choose:/consume
|
||||
|
||||
.. danger::
|
||||
|
||||
While the consumption container will ensure at startup that it can
|
||||
**delete** a consumed file from a host-mounted directory, it might not
|
||||
be able to **read** the document in the first place if the access
|
||||
rights to the file are incorrect.
|
||||
|
||||
Make sure that the documents you put into the consumption directory
|
||||
will either be readable by everyone (``chmod o+r file.pdf``) or
|
||||
readable by the default user or group id 1000 (or the one you have set
|
||||
with ``USERMAP_UID`` or ``USERMAP_GID`` respectively).
|
||||
|
||||
2. Use ``docker cp`` to copy your files directly into the container:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ # Identify your containers
|
||||
$ docker-compose ps
|
||||
Name Command State Ports
|
||||
-------------------------------------------------------------------------
|
||||
paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0
|
||||
paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0
|
||||
|
||||
$ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume
|
||||
|
||||
``docker cp`` is a one-shot-command, just like ``cp``. This means that
|
||||
every time you want to consume a new document, you will have to execute
|
||||
``docker cp`` again. You can of course automate this process, but option 1
|
||||
is generally the preferred one.
|
||||
|
||||
.. danger::
|
||||
|
||||
``docker cp`` will change the owning user and group of a copied file
|
||||
to the acting user at the destination, which will be ``root``.
|
||||
|
||||
You therefore need to ensure that the documents you want to copy into
|
||||
the container are readable by everyone (``chmod o+r file.pdf``) before
|
||||
copying them.
|
||||
|
||||
|
||||
.. _Docker: https://www.docker.com/
|
||||
.. _docker-compose: https://docs.docker.com/compose/install/
|
||||
.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
|
||||
.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr-
|
||||
|
||||
.. [#compose] You of course don't have to use docker-compose, but it
|
||||
simplifies deployment immensely. If you know your way around Docker, feel
|
||||
free to tinker around without using compose!
|
||||
|
||||
|
||||
.. _making-things-a-little-more-permanent:
|
||||
|
||||
Making Things a Little more Permanent
|
||||
@ -126,5 +284,9 @@ Making Things a Little more Permanent
|
||||
Once you've tested things and are happy with the work flow, you can automate the
|
||||
process of starting the webserver and consumer automatically. If you're running
|
||||
on a bare metal system that's using Systemd, you can use the service unit files
|
||||
in the ``scripts`` directory to set this up. If you're on a SysV or other
|
||||
startup system (like the Vagrant box), then you're currently on your own.
|
||||
in the ``scripts`` directory to set this up. If you're on another startup
|
||||
system or are using a Vagrant box, then you're currently on your own. If you are
|
||||
using Docker, you can set a restart-policy_ in the ``docker-compose.yml`` to
|
||||
have the containers automatically start with the Docker daemon.
|
||||
|
||||
.. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart
|
||||
|
@ -105,3 +105,30 @@ import, so should you can now safely delete the entire project directly,
|
||||
database, encrypted PDFs and all, and later create it all again simply by
|
||||
running the consumer again and dumping all of these files into
|
||||
``CONSUMPTION_DIR``.
|
||||
|
||||
|
||||
.. _utilities-retagger:
|
||||
|
||||
The Re-tagger
|
||||
-------------
|
||||
|
||||
Say you've imported a few hundred documents and now want to introduce a tag
|
||||
and apply its matching to all of the currently-imported docs. This problem is
|
||||
common enough that there's a tool for it.
|
||||
|
||||
|
||||
.. _utilities-retagger-howto:
|
||||
|
||||
How to Use It
|
||||
.............
|
||||
|
||||
This too is done via the ``manage.py`` script:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ /path/to/paperless/src/manage.py document_retagger
|
||||
|
||||
That's it. It'll loop over all of the documents in your database and attempt
|
||||
to match all of your tags to them. If one matches, it'll be applied. And
|
||||
don't worry, you can run this as often as you like, it won't double-tag
|
||||
a document.
|
||||
|
74
scripts/docker-entrypoint.sh
Normal file
74
scripts/docker-entrypoint.sh
Normal file
@ -0,0 +1,74 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Source: https://github.com/sameersbn/docker-gitlab/
|
||||
map_uidgid() {
    # Remap the "paperless" user and group to the ids requested through the
    # USERMAP_UID / USERMAP_GID environment variables. When only USERMAP_UID
    # is given, the group id defaults to the same value; when neither is
    # given, the ids created at image build time are kept.
    USERMAP_ORIG_UID=$(id -u paperless)
    # The original GID must be stored in its own variable: it is both the
    # fallback for USERMAP_GID and the reference value for the change check.
    USERMAP_ORIG_GID=$(id -g paperless)
    USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}}
    USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
    if [[ ${USERMAP_UID} != ${USERMAP_ORIG_UID} || ${USERMAP_GID} != ${USERMAP_ORIG_GID} ]]; then
        echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
        groupmod -g ${USERMAP_GID} paperless
        # The pattern uses the *new* GID on purpose -- presumably groupmod has
        # already rewritten the primary group in /etc/passwd at this point, so
        # only the UID field is left to replace (TODO confirm on the base image).
        sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
    fi
}
|
||||
|
||||
set_permissions() {
|
||||
# Set permissions for consumption directory
|
||||
chgrp paperless "$PAPERLESS_CONSUME"
|
||||
chmod g+x "$PAPERLESS_CONSUME"
|
||||
|
||||
# Set permissions for application directory
|
||||
chown -Rh paperless:paperless /usr/src/paperless
|
||||
}
|
||||
|
||||
initialize() {
|
||||
map_uidgid
|
||||
set_permissions
|
||||
}
|
||||
|
||||
install_languages() {
    # Install additional tesseract language packs.
    #
    # $1 -- whitespace separated list of language codes; each code is mapped
    #       to the package "tesseract-ocr-<code>".
    #
    # Packages that are already installed are skipped; packages unknown to
    # apt are skipped with a warning.
    local langs="$1"
    local lang pkg
    read -ra langs <<<"$langs"

    # Check that it is not empty
    if [ ${#langs[@]} -eq 0 ]; then
        return
    fi

    # Update apt-lists
    apt-get update

    # Loop over languages to be installed
    for lang in "${langs[@]}"; do
        pkg="tesseract-ocr-$lang"

        # Redirect stdout first and then duplicate stderr onto it, so that
        # both streams of the probe commands are actually silenced.
        if dpkg -s "$pkg" > /dev/null 2>&1; then
            continue
        fi

        if ! apt-cache show "$pkg" > /dev/null 2>&1; then
            echo "Skipping unknown OCR language package: $pkg" >&2
            continue
        fi

        # -y: the entrypoint runs without a terminal, so a confirmation
        # prompt could never be answered.
        apt-get install -y "$pkg"
    done

    # Remove apt lists
    rm -rf /var/lib/apt/lists/*
}
|
||||
|
||||
|
||||
if [[ "$1" != "/"* ]]; then
|
||||
initialize
|
||||
|
||||
# Install additional languages if specified
|
||||
if [ ! -z "$PAPERLESS_OCR_LANGUAGES" ]; then
|
||||
install_languages "$PAPERLESS_OCR_LANGUAGES"
|
||||
fi
|
||||
|
||||
exec sudo -HEu paperless "/usr/src/paperless/src/manage.py" "$@"
|
||||
fi
|
||||
|
||||
exec "$@"
|
||||
|
@ -1,21 +1,23 @@
|
||||
import datetime
|
||||
import glob
|
||||
import tempfile
|
||||
from multiprocessing.pool import Pool
|
||||
|
||||
import itertools
|
||||
|
||||
import langdetect
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
import pyocr
|
||||
import shutil
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.template.defaultfilters import slugify
|
||||
from pyocr.tesseract import TesseractError
|
||||
|
||||
from logger.models import Log
|
||||
from paperless.db import GnuPG
|
||||
@ -27,6 +29,12 @@ from .languages import ISO639
|
||||
def image_to_string(args):
|
||||
self, png, lang = args
|
||||
with Image.open(os.path.join(self.SCRATCH, png)) as f:
|
||||
if self.OCR.can_detect_orientation():
|
||||
try:
|
||||
orientation = self.OCR.detect_orientation(f, lang=lang)
|
||||
f = f.rotate(orientation["angle"], expand=1)
|
||||
except TesseractError:
|
||||
pass
|
||||
return self.OCR.image_to_string(f, lang=lang)
|
||||
|
||||
|
||||
@ -111,34 +119,41 @@ class Consumer(object):
|
||||
|
||||
Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)
|
||||
|
||||
pngs = self._get_greyscale(doc)
|
||||
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
|
||||
pngs = self._get_greyscale(tempdir, doc)
|
||||
|
||||
try:
|
||||
text = self._get_ocr(pngs)
|
||||
self._store(text, doc)
|
||||
except OCRError:
|
||||
self._ignore.append(doc)
|
||||
Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
|
||||
self._cleanup_tempdir(tempdir)
|
||||
continue
|
||||
else:
|
||||
self._cleanup_tempdir(tempdir)
|
||||
self._cleanup_doc(doc)
|
||||
|
||||
self._store(text, doc)
|
||||
self._cleanup(pngs, doc)
|
||||
|
||||
def _get_greyscale(self, doc):
|
||||
def _get_greyscale(self, tempdir, doc):
|
||||
|
||||
Log.debug(
|
||||
"Generating greyscale image from {}".format(doc),
|
||||
Log.COMPONENT_CONSUMER
|
||||
)
|
||||
|
||||
i = random.randint(1000000, 9999999)
|
||||
png = os.path.join(self.SCRATCH, "{}.png".format(i))
|
||||
png = os.path.join(tempdir, "convert-%04d.jpg")
|
||||
|
||||
subprocess.Popen((
|
||||
self.CONVERT, "-density", "300", "-depth", "8",
|
||||
"-type", "grayscale", doc, png
|
||||
)).wait()
|
||||
|
||||
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
|
||||
pngs = []
|
||||
for f in os.listdir(tempdir):
|
||||
if f.startswith("convert"):
|
||||
pngs.append(os.path.join(tempdir, f))
|
||||
|
||||
return sorted(filter(lambda __: os.path.isfile(__), pngs))
|
||||
|
||||
@staticmethod
|
||||
def _guess_language(text):
|
||||
@ -271,11 +286,7 @@ class Consumer(object):
|
||||
def _store(self, text, doc):
|
||||
|
||||
sender, title, tags, file_type = self._guess_attributes_from_name(doc)
|
||||
tags = list(tags)
|
||||
|
||||
lower_text = text.lower()
|
||||
relevant_tags = set(
|
||||
[t for t in Tag.objects.all() if t.matches(lower_text)] + tags)
|
||||
relevant_tags = set(list(Tag.match_all(text)) + list(tags))
|
||||
|
||||
stats = os.stat(doc)
|
||||
|
||||
@ -303,14 +314,15 @@ class Consumer(object):
|
||||
Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
|
||||
encrypted.write(GnuPG.encrypted(unencrypted))
|
||||
|
||||
def _cleanup(self, pngs, doc):
|
||||
@staticmethod
|
||||
def _cleanup_tempdir(d):
|
||||
Log.debug("Deleting directory {}".format(d), Log.COMPONENT_CONSUMER)
|
||||
shutil.rmtree(d)
|
||||
|
||||
png_glob = os.path.join(
|
||||
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
|
||||
|
||||
for f in list(glob.glob(png_glob)) + [doc]:
|
||||
Log.debug("Deleting {}".format(f), Log.COMPONENT_CONSUMER)
|
||||
os.unlink(f)
|
||||
@staticmethod
|
||||
def _cleanup_doc(doc):
|
||||
Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
|
||||
os.unlink(doc)
|
||||
|
||||
def _is_ready(self, doc):
|
||||
"""
|
||||
|
@ -23,9 +23,10 @@ class Command(Renderable, BaseCommand):
|
||||
self.verbosity = options["verbosity"]
|
||||
|
||||
for document in Document.objects.all():
|
||||
|
||||
tags = Tag.objects.exclude(
|
||||
pk__in=document.tags.values_list("pk", flat=True))
|
||||
for tag in tags:
|
||||
if tag.matches(document.content):
|
||||
print('Tagging {} with "{}"'.format(document, tag))
|
||||
document.tags.add(tag)
|
||||
|
||||
for tag in Tag.match_all(document.content, tags):
|
||||
print('Tagging {} with "{}"'.format(document, tag))
|
||||
document.tags.add(tag)
|
||||
|
23
src/documents/management/commands/loaddata_stdin.py
Normal file
23
src/documents/management/commands/loaddata_stdin.py
Normal file
@ -0,0 +1,23 @@
|
||||
"""
|
||||
Source:
|
||||
https://gist.github.com/bmispelon/ad5a2c333443b3a1d051
|
||||
|
||||
License:
|
||||
MIT
|
||||
Copyright (c) 2016 Baptiste Mispelon
|
||||
"""
|
||||
import sys
|
||||
|
||||
from django.core.management.commands.loaddata import Command as LoadDataCommand
|
||||
|
||||
|
||||
class Command(LoadDataCommand):
    """``loaddata`` variant that accepts ``-`` to read a JSON fixture from stdin.

    Any other fixture label is handled by Django's stock ``loaddata``
    implementation.
    """

    def parse_name(self, fixture_name):
        """Return the ``(name, serialization format, compression format)`` triple.

        ``-`` is mapped to a JSON fixture using the pseudo compression format
        ``stdin``; every other name is delegated to the parent class.
        """
        # Register an opener that ignores its two arguments and hands back
        # sys.stdin. NOTE(review): assumes the parent command looks openers up
        # in ``self.compression_formats`` by the compression-format key
        # returned below -- confirm against the Django version in use.
        self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None)
        if fixture_name == '-':
            return '-', 'json', 'stdin'
        # Without this fallback the method returned None for regular fixture
        # names, which cannot be unpacked into the expected triple.
        return super(Command, self).parse_name(fixture_name)

    def find_fixtures(self, fixture_label):
        """Return the fixtures for *fixture_label*, short-circuiting ``-``.

        For ``-`` no lookup is performed; a single placeholder entry is
        returned so that the stdin fixture is processed.
        """
        if fixture_label == '-':
            return [('-', None, '-')]
        return super(Command, self).find_fixtures(fixture_label)
|
@ -86,28 +86,40 @@ class Tag(SluggedModel):
|
||||
return "{}: \"{}\" ({})".format(
|
||||
self.name, self.match, self.get_matching_algorithm_display())
|
||||
|
||||
@classmethod
|
||||
def match_all(cls, text, tags=None):
|
||||
|
||||
if tags is None:
|
||||
tags = cls.objects.all()
|
||||
|
||||
text = text.lower()
|
||||
for tag in tags:
|
||||
if tag.matches(text):
|
||||
yield tag
|
||||
|
||||
def matches(self, text):
|
||||
|
||||
# Check that match is not empty
|
||||
if self.match.strip() == "":
|
||||
return False
|
||||
|
||||
if self.matching_algorithm == self.MATCH_ALL:
|
||||
for word in self.match.split(" "):
|
||||
if word not in text:
|
||||
if not re.search(r"\b{}\b".format(word), text):
|
||||
return False
|
||||
return True
|
||||
|
||||
if self.matching_algorithm == self.MATCH_ANY:
|
||||
for word in self.match.split(" "):
|
||||
if word in text:
|
||||
if re.search(r"\b{}\b".format(word), text):
|
||||
return True
|
||||
return False
|
||||
|
||||
if self.matching_algorithm == self.MATCH_LITERAL:
|
||||
return self.match in text
|
||||
return bool(re.search(r"\b{}\b".format(self.match), text))
|
||||
|
||||
if self.matching_algorithm == self.MATCH_REGEX:
|
||||
return re.search(re.compile(self.match), text)
|
||||
return bool(re.search(re.compile(self.match), text))
|
||||
|
||||
raise NotImplementedError("Unsupported matching algorithm")
|
||||
|
||||
|
120
src/documents/tests/test_tags.py
Normal file
120
src/documents/tests/test_tags.py
Normal file
@ -0,0 +1,120 @@
|
||||
from django.test import TestCase
|
||||
|
||||
from ..models import Tag
|
||||
|
||||
|
||||
class TestTagMatching(TestCase):
|
||||
|
||||
def test_match_all(self):
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 0",
|
||||
match="alpha charlie gamma",
|
||||
matching_algorithm=Tag.MATCH_ALL
|
||||
)
|
||||
self.assertFalse(t.matches("I have alpha in me"))
|
||||
self.assertFalse(t.matches("I have charlie in me"))
|
||||
self.assertFalse(t.matches("I have gamma in me"))
|
||||
self.assertFalse(t.matches("I have alpha and charlie in me"))
|
||||
self.assertTrue(t.matches("I have alpha, charlie, and gamma in me"))
|
||||
self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
|
||||
self.assertFalse(t.matches("I have alphas in me"))
|
||||
self.assertFalse(t.matches("I have bravo in me"))
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 1",
|
||||
match="12 34 56",
|
||||
matching_algorithm=Tag.MATCH_ALL
|
||||
)
|
||||
self.assertFalse(t.matches("I have 12 in me"))
|
||||
self.assertFalse(t.matches("I have 34 in me"))
|
||||
self.assertFalse(t.matches("I have 56 in me"))
|
||||
self.assertFalse(t.matches("I have 12 and 34 in me"))
|
||||
self.assertTrue(t.matches("I have 12 34, and 56 in me"))
|
||||
self.assertFalse(t.matches("I have 120, 34, and 56 in me"))
|
||||
self.assertFalse(t.matches("I have 123456 in me"))
|
||||
self.assertFalse(t.matches("I have 01234567 in me"))
|
||||
|
||||
def test_match_any(self):
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 0",
|
||||
match="alpha charlie gamma",
|
||||
matching_algorithm=Tag.MATCH_ANY
|
||||
)
|
||||
|
||||
self.assertTrue(t.matches("I have alpha in me"))
|
||||
self.assertTrue(t.matches("I have charlie in me"))
|
||||
self.assertTrue(t.matches("I have gamma in me"))
|
||||
self.assertTrue(t.matches("I have alpha and charlie in me"))
|
||||
self.assertFalse(t.matches("I have alphas in me"))
|
||||
self.assertFalse(t.matches("I have bravo in me"))
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 1",
|
||||
match="12 34 56",
|
||||
matching_algorithm=Tag.MATCH_ANY
|
||||
)
|
||||
self.assertTrue(t.matches("I have 12 in me"))
|
||||
self.assertTrue(t.matches("I have 34 in me"))
|
||||
self.assertTrue(t.matches("I have 56 in me"))
|
||||
self.assertTrue(t.matches("I have 12 and 34 in me"))
|
||||
self.assertTrue(t.matches("I have 12 34, and 56 in me"))
|
||||
self.assertTrue(t.matches("I have 120, 34, and 560 in me"))
|
||||
self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
|
||||
self.assertFalse(t.matches("I have 123456 in me"))
|
||||
self.assertFalse(t.matches("I have 01234567 in me"))
|
||||
|
||||
def test_match_literal(self):
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 0",
|
||||
match="alpha charlie gamma",
|
||||
matching_algorithm=Tag.MATCH_LITERAL
|
||||
)
|
||||
|
||||
self.assertFalse(t.matches("I have alpha in me"))
|
||||
self.assertFalse(t.matches("I have charlie in me"))
|
||||
self.assertFalse(t.matches("I have gamma in me"))
|
||||
self.assertFalse(t.matches("I have alpha and charlie in me"))
|
||||
self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
|
||||
self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
|
||||
self.assertTrue(t.matches("I have 'alpha charlie gamma' in me"))
|
||||
self.assertFalse(t.matches("I have alphas in me"))
|
||||
self.assertFalse(t.matches("I have bravo in me"))
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 1",
|
||||
match="12 34 56",
|
||||
matching_algorithm=Tag.MATCH_LITERAL
|
||||
)
|
||||
self.assertFalse(t.matches("I have 12 in me"))
|
||||
self.assertFalse(t.matches("I have 34 in me"))
|
||||
self.assertFalse(t.matches("I have 56 in me"))
|
||||
self.assertFalse(t.matches("I have 12 and 34 in me"))
|
||||
self.assertFalse(t.matches("I have 12 34, and 56 in me"))
|
||||
self.assertFalse(t.matches("I have 120, 34, and 560 in me"))
|
||||
self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
|
||||
self.assertFalse(t.matches("I have 123456 in me"))
|
||||
self.assertFalse(t.matches("I have 01234567 in me"))
|
||||
self.assertTrue(t.matches("I have 12 34 56 in me"))
|
||||
|
||||
def test_match_regex(self):
|
||||
|
||||
t = Tag.objects.create(
|
||||
name="Test 0",
|
||||
match="alpha\w+gamma",
|
||||
matching_algorithm=Tag.MATCH_REGEX
|
||||
)
|
||||
|
||||
self.assertFalse(t.matches("I have alpha in me"))
|
||||
self.assertFalse(t.matches("I have gamma in me"))
|
||||
self.assertFalse(t.matches("I have alpha and charlie in me"))
|
||||
self.assertTrue(t.matches("I have alpha_and_gamma in me"))
|
||||
self.assertTrue(t.matches("I have alphas_and_gamma in me"))
|
||||
self.assertFalse(t.matches("I have alpha,and,gamma in me"))
|
||||
self.assertFalse(t.matches("I have alpha and gamma in me"))
|
||||
self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
|
||||
self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
|
||||
self.assertFalse(t.matches("I have alphas in me"))
|
||||
|
@ -23,4 +23,8 @@ class Migration(migrations.Migration):
|
||||
('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])),
|
||||
],
|
||||
),
|
||||
migrations.AlterModelOptions(
|
||||
name='log',
|
||||
options={'ordering': ('-time',)},
|
||||
),
|
||||
]
|
||||
|
@ -27,7 +27,10 @@ class Log(models.Model):
|
||||
component = models.PositiveIntegerField(choices=COMPONENTS)
|
||||
|
||||
class Meta(object):
|
||||
ordering = ("time",)
|
||||
ordering = ("-time",)
|
||||
|
||||
def __str__(self):
|
||||
return self.message
|
||||
|
||||
@classmethod
|
||||
def error(cls, message, component):
|
||||
|
Loading…
x
Reference in New Issue
Block a user