From 46f8f492f57f1b5b43b6674197f71f87660f7c2f Mon Sep 17 00:00:00 2001 From: Pit Kleyersburg Date: Sun, 14 Feb 2016 17:40:37 +0100 Subject: [PATCH 01/14] Safely and non-randomly create scratch directory Creating the scratch-files in `_get_grayscale` using a random integer is for one inherently unsafe and can cause a collision. On the other hand, it should be unnecessary given that the files will be cleaned up after the OCR run. Since we don't know if OCR runs might be parallel in the future, this commit implements thread-safe and deterministic directory-creation. Additionally it fixes the call to `_cleanup` by `consume`. In the current implementation `_cleanup` will not be called if the last consumed document failed with an `OCRError`, this commit fixes this. --- src/documents/consumer.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index c432ee261..d7ee0e9ee 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,15 +1,16 @@ import datetime -import glob +import tempfile from multiprocessing.pool import Pool import itertools + import langdetect import os -import random import re import subprocess import pyocr +import shutil from PIL import Image @@ -111,34 +112,35 @@ class Consumer(object): Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER) - pngs = self._get_greyscale(doc) + tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) + pngs = self._get_greyscale(tempdir, doc) try: text = self._get_ocr(pngs) + self._store(text, doc) except OCRError: self._ignore.append(doc) Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER) continue + finally: + self._cleanup(tempdir, doc) - self._store(text, doc) - self._cleanup(pngs, doc) - - def _get_greyscale(self, doc): + def _get_greyscale(self, tempdir, doc): Log.debug( "Generating greyscale image from {}".format(doc), Log.COMPONENT_CONSUMER ) - i = 
random.randint(1000000, 9999999) - png = os.path.join(self.SCRATCH, "{}.png".format(i)) + png = os.path.join(tempdir, "convert.png") subprocess.Popen(( self.CONVERT, "-density", "300", "-depth", "8", "-type", "grayscale", doc, png )).wait() - return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) + pngs = [os.path.join(tempdir, f) for f in os.listdir(tempdir) if f.startswith("convert")] + return sorted(filter(lambda f: os.path.isfile(f), pngs)) @staticmethod def _guess_language(text): @@ -303,14 +305,14 @@ class Consumer(object): Log.debug("Encrypting", Log.COMPONENT_CONSUMER) encrypted.write(GnuPG.encrypted(unencrypted)) - def _cleanup(self, pngs, doc): + def _cleanup(self, tempdir, doc): + # Remove temporary directory recursively + Log.debug("Deleting directory {}".format(tempdir), Log.COMPONENT_CONSUMER) + shutil.rmtree(tempdir) - png_glob = os.path.join( - self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])) - - for f in list(glob.glob(png_glob)) + [doc]: - Log.debug("Deleting {}".format(f), Log.COMPONENT_CONSUMER) - os.unlink(f) + # Remove doc + Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER) + os.unlink(doc) def _is_ready(self, doc): """ From 6f95b052872a669b3f710d9b86783c9fd1f398db Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Wed, 17 Feb 2016 00:10:05 +0000 Subject: [PATCH 02/14] Support appropriate sorting for long documents --- src/documents/consumer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index d7ee0e9ee..5ca42813b 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -132,7 +132,7 @@ class Consumer(object): Log.COMPONENT_CONSUMER ) - png = os.path.join(tempdir, "convert.png") + png = os.path.join(tempdir, "convert-%04d.jpg") subprocess.Popen(( self.CONVERT, "-density", "300", "-depth", "8", From 550184cbae3f794992339e0871f4a1190bbc27ae Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Wed, 17 Feb 2016 
00:11:46 +0000 Subject: [PATCH 03/14] Patched sorting --- src/logger/migrations/0001_initial.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/logger/migrations/0001_initial.py b/src/logger/migrations/0001_initial.py index b9b81c296..029fe43c2 100644 --- a/src/logger/migrations/0001_initial.py +++ b/src/logger/migrations/0001_initial.py @@ -23,4 +23,8 @@ class Migration(migrations.Migration): ('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])), ], ), + migrations.AlterModelOptions( + name='log', + options={'ordering': ('-time',)}, + ), ] From 1c45ca10d4d84b48683f1f7a84aefb251aae3ed3 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Wed, 17 Feb 2016 00:11:57 +0000 Subject: [PATCH 04/14] Patched sorting --- src/logger/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/logger/models.py b/src/logger/models.py index 2e22ec931..48774c199 100644 --- a/src/logger/models.py +++ b/src/logger/models.py @@ -27,7 +27,7 @@ class Log(models.Model): component = models.PositiveIntegerField(choices=COMPONENTS) class Meta(object): - ordering = ("time",) + ordering = ("-time",) @classmethod def error(cls, message, component): From eb01bcf98b168d160fb667a7f53b2119c4c143bb Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Wed, 17 Feb 2016 23:06:35 +0000 Subject: [PATCH 05/14] The Log class needed a __str__() method --- src/logger/models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/logger/models.py b/src/logger/models.py index 48774c199..f7f2c421a 100644 --- a/src/logger/models.py +++ b/src/logger/models.py @@ -29,6 +29,9 @@ class Log(models.Model): class Meta(object): ordering = ("-time",) + def __str__(self): + return self.message + @classmethod def error(cls, message, component): cls.objects.create( From 1e7ece81ee7afe44342c3b649687550fde702e15 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Wed, 17 Feb 2016 23:07:54 +0000 Subject: [PATCH 06/14] Fixes #45 --- src/documents/consumer.py | 6 
+----- .../management/commands/document_retagger.py | 9 +++++---- src/documents/models.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 5ca42813b..98fedde09 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -273,11 +273,7 @@ class Consumer(object): def _store(self, text, doc): sender, title, tags, file_type = self._guess_attributes_from_name(doc) - tags = list(tags) - - lower_text = text.lower() - relevant_tags = set( - [t for t in Tag.objects.all() if t.matches(lower_text)] + tags) + relevant_tags = set(list(Tag.match_all(text)) + list(tags)) stats = os.stat(doc) diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py index d7519f53b..09a3fb917 100644 --- a/src/documents/management/commands/document_retagger.py +++ b/src/documents/management/commands/document_retagger.py @@ -23,9 +23,10 @@ class Command(Renderable, BaseCommand): self.verbosity = options["verbosity"] for document in Document.objects.all(): + tags = Tag.objects.exclude( pk__in=document.tags.values_list("pk", flat=True)) - for tag in tags: - if tag.matches(document.content): - print('Tagging {} with "{}"'.format(document, tag)) - document.tags.add(tag) + + for tag in Tag.match_all(document.content, tags): + print('Tagging {} with "{}"'.format(document, tag)) + document.tags.add(tag) diff --git a/src/documents/models.py b/src/documents/models.py index 447beaa66..03758eff5 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -86,7 +86,19 @@ class Tag(SluggedModel): return "{}: \"{}\" ({})".format( self.name, self.match, self.get_matching_algorithm_display()) + @classmethod + def match_all(cls, text, tags=None): + + if tags is None: + tags = cls.objects.all() + + text = text.lower() + for tag in tags: + if tag.matches(text): + yield tag + def matches(self, text): + # Check that match is not empty if 
self.match.strip() == "": return False From c34d57a872859e8f6799dceb41022b043490c6bd Mon Sep 17 00:00:00 2001 From: Pit Kleyersburg Date: Thu, 18 Feb 2016 09:37:13 +0100 Subject: [PATCH 07/14] Detect image orientation if the OCR supports it Fixes issue #47. --- src/documents/consumer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 98fedde09..12761e992 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -28,6 +28,9 @@ from .languages import ISO639 def image_to_string(args): self, png, lang = args with Image.open(os.path.join(self.SCRATCH, png)) as f: + if self.OCR.can_detect_orientation(): + orientation = self.OCR.detect_orientation(f, lang=lang) + f = f.rotate(orientation["angle"], expand=1) return self.OCR.image_to_string(f, lang=lang) From 724afa59c75853bf71e735650133e4d414558dfa Mon Sep 17 00:00:00 2001 From: Pit Kleyersburg Date: Wed, 17 Feb 2016 18:45:04 +0100 Subject: [PATCH 08/14] Add Dockerfile for application and documentation This commit adds a `Dockerfile` to the root of the project, accompanied by a `docker-compose.yml.example` for simplified deployment. The `Dockerfile` is agnostic to whether it will be the webserver, the consumer, or if it is run for a one-off command (i.e. creation of a superuser, migration of the database, document export, ...). The containers entrypoint is the `scripts/docker-entrypoint.sh` script. This script verifies that the required permissions are set, remaps the default users and/or groups id if required and installs additional languages if the user wishes to. After initialization, it analyzes the command the user supplied: - If the command starts with a slash, it is expected that the user wants to execute a binary file and the command will be executed without further intervention. (Using `exec` to effectively replace the started shell-script and not have any reaping-issues.) 
- If the command does not start with a slash, the command will be passed directly to the `manage.py` script without further modification. (Again using `exec`.) The default command is set to `--help`. If the user wants to execute a command that is not meant for `manage.py` but doesn't start with a slash, the Docker `--entrypoint` parameter can be used to circumvent the mechanics of `docker-entrypoint.sh`. Further information can be found in `docs/setup.rst` and in `docs/migrating.rst`. For additional convenience, a `Dockerfile` has been added to the `docs/` directory which allows for easy building and serving of the documentation. This is documented in `docs/requirements.rst`. --- .gitignore | 1 + Dockerfile | 43 +++++ docker-compose.env | 15 ++ docker-compose.yml.example | 31 ++++ docs/Dockerfile | 18 ++ docs/migrating.rst | 95 ++++++++++ docs/requirements.rst | 13 ++ docs/setup.rst | 167 +++++++++++++++++- scripts/docker-entrypoint.sh | 74 ++++++++ .../management/commands/loaddata_stdin.py | 23 +++ 10 files changed, 474 insertions(+), 6 deletions(-) create mode 100644 Dockerfile create mode 100644 docker-compose.env create mode 100644 docker-compose.yml.example create mode 100644 docs/Dockerfile create mode 100644 scripts/docker-entrypoint.sh create mode 100644 src/documents/management/commands/loaddata_stdin.py diff --git a/.gitignore b/.gitignore index 908fa9748..2c65f8dcd 100644 --- a/.gitignore +++ b/.gitignore @@ -68,6 +68,7 @@ db.sqlite3 # Other stuff that doesn't belong virtualenv .vagrant +docker-compose.yml # Used for development scripts/import-for-development diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..dade863ca --- /dev/null +++ b/Dockerfile @@ -0,0 +1,43 @@ +FROM python:3.5.1 +MAINTAINER Pit Kleyersburg + +# Install dependencies +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + sudo \ + tesseract-ocr tesseract-ocr-eng imagemagick ghostscript \ + && rm -rf /var/lib/apt/lists/* + +# Install python 
dependencies +RUN mkdir -p /usr/src/paperless +WORKDIR /usr/src/paperless +COPY requirements.txt /usr/src/paperless/ +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +RUN mkdir -p /usr/src/paperless/src +COPY src/ /usr/src/paperless/src/ + +# Set consumption directory +ENV PAPERLESS_CONSUME /consume +RUN mkdir -p $PAPERLESS_CONSUME + +# Migrate database +WORKDIR /usr/src/paperless/src +RUN mkdir /usr/src/paperless/data +RUN ./manage.py migrate + +# Create user +RUN groupadd -g 1000 paperless \ + && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \ + && chown -Rh paperless:paperless /usr/src/paperless + +# Setup entrypoint +COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh +RUN chmod 755 /sbin/docker-entrypoint.sh + +# Mount volumes +VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume"] + +ENTRYPOINT ["/sbin/docker-entrypoint.sh"] +CMD ["--help"] diff --git a/docker-compose.env b/docker-compose.env new file mode 100644 index 000000000..13c74b6ab --- /dev/null +++ b/docker-compose.env @@ -0,0 +1,15 @@ +# Environment variables to set for Paperless +# Commented out variables will be replaced by a default within Paperless. + +# Passphrase Paperless uses to encrypt and decrypt your documents +PAPERLESS_PASSPHRASE=CHANGE_ME + +# The amount of threads to use for text recognition +# PAPERLESS_OCR_THREADS=4 + +# Additional languages to install for text recognition +# PAPERLESS_OCR_LANGUAGES=deu ita + +# You can change the default user and group id to a custom one +# USERMAP_UID=1000 +# USERMAP_GID=1000 diff --git a/docker-compose.yml.example b/docker-compose.yml.example new file mode 100644 index 000000000..f8e9b5b93 --- /dev/null +++ b/docker-compose.yml.example @@ -0,0 +1,31 @@ +version: '2' + +services: + webserver: + image: paperless + ports: + # You can adapt the port you want Paperless to listen on by + # modifying the part before the `:`. 
+ - "8000:8000" + volumes: + - paperless-data:/usr/src/paperless/data + - paperless-media:/usr/src/paperless/media + env_file: docker-compose.env + environment: + - PAPERLESS_OCR_LANGUAGES= + command: ["runserver", "0.0.0.0:8000"] + + consumer: + image: paperless + volumes: + - paperless-data:/usr/src/paperless/data + - paperless-media:/usr/src/paperless/media + # You have to adapt the local path you want the consumption + # directory to mount to by modifying the part before the ':'. + - /path/to/arbitrary/place:/consume + env_file: docker-compose.env + command: ["document_consumer"] + +volumes: + paperless-data: + paperless-media: diff --git a/docs/Dockerfile b/docs/Dockerfile new file mode 100644 index 000000000..ee63aebb4 --- /dev/null +++ b/docs/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.5.1 +MAINTAINER Pit Kleyersburg + +# Install Sphinx and Pygments +RUN pip install Sphinx Pygments + +# Setup directories, copy data +RUN mkdir /build +COPY . /build +WORKDIR /build/docs + +# Build documentation +RUN make html + +# Start webserver +WORKDIR /build/docs/_build/html +EXPOSE 8000/tcp +CMD ["python3", "-m", "http.server"] diff --git a/docs/migrating.rst b/docs/migrating.rst index 46083533a..1e03bb3cb 100644 --- a/docs/migrating.rst +++ b/docs/migrating.rst @@ -30,6 +30,20 @@ as part of the update: Note that it's possible (even likely) that while ``git pull`` may update some files, the ``migrate`` step may not update anything. This is totally normal. +If you are :ref:`using Docker ` the update process +requires only one additional step: + +.. code-block:: shell-session + + $ cd /path/to/project + $ git pull + $ docker build -t paperless . + $ docker-compose up -d + $ docker-compose run --rm webserver migrate + +If ``git pull`` doesn't report any changes, there is no need to continue with +the remaining steps. + .. _migrating-backup: @@ -53,6 +67,65 @@ with Django's ``dumpdata`` command, which produces JSON output. 
$ ./manage.py document_export /path/to/arbitrary/place/ $ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json +If you are :ref:`using Docker `, exporting your tags +as JSON is almost as easy: + +.. code-block:: shell-session + + $ docker-compose run --rm webserver dumpdata documents.Tag > /path/to/arbitrary/place/tags.json + +Exporting the documents though is a little more involved, since docker-compose +doesn't support mounting additional volumes with the ``run`` command. You have +three general options: + +1. Use the consumption directory if you happen to already have it mounted to a + host directory. + + .. code-block:: console + + $ # Stop the consumer so that it doesn't consume the exported documents + $ docker-compose stop consumer + $ # Export into the consumption directory + $ docker-compose run --rm consumer document_exporter /consume + +2. Add another volume to ``docker-compose.yml`` for exports and use + ``docker-compose run``: + + .. code-block:: diff + + diff --git a/docker-compose.yml b/docker-compose.yml + --- a/docker-compose.yml + +++ b/docker-compose.yml + @@ -17,9 +18,8 @@ services: + volumes: + - paperless-data:/usr/src/paperless/data + - paperless-media:/usr/src/paperless/media + - /consume + + - /path/to/arbitrary/place:/export + + .. code-block:: shell-session + + $ docker-compose run --rm consumer document_exporter /export + +3. Use ``docker run`` directly, supplying the necessary commandline options: + + .. code-block:: shell-session + + $ # Identify your containers + $ docker-compose ps + Name Command State Ports + ------------------------------------------------------------------------- + paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 + paperless_webserver_1 /sbin/docker-entrypoint.sh ... 
Exit 0 + + $ # Make sure to replace your passphrase and remove or adapt the id mapping + $ docker run --rm \ + --volumes-from paperless_data_1 \ + --volume /path/to/arbitrary/place:/export \ + -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \ + -e USERMAP_UID=1000 -e USERMAP_GID=1000 \ + paperless document_exporter /export + .. _migrating-restoring: @@ -77,3 +150,25 @@ exported documents into the consumption directory and start up the consumer. $ cp /path/to/exported/docs/* /path/to/consumption/dir/ $ ./manage.py document_consumer +Importing your data if you are :ref:`using Docker ` +is almost as simple: + +.. code-block:: shell-session + + $ # Stop and remove your current containers + $ docker-compose stop + $ docker-compose rm -f + + $ # Recreate them, add the superuser + $ docker-compose up -d + $ docker-compose run --rm webserver createsuperuser + + $ # Load the tags + $ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin - + + $ # Load your exported documents into the consumption directory + $ # (How you do this highly depends on how you have set this up) + $ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/ + +After loading the documents into the consumption directory the consumer will +immediately start consuming the documents. diff --git a/docs/requirements.rst b/docs/requirements.rst index 1c4f989db..ee287d835 100644 --- a/docs/requirements.rst +++ b/docs/requirements.rst @@ -101,3 +101,16 @@ you'd like to generate your own docs locally, you'll need to: $ pip install sphinx and then cd into the ``docs`` directory and type ``make html``. + +If you are using Docker, you can use the following commands to build the +documentation and run a webserver serving it on `port 8001`_: + +.. code:: bash + + $ pwd + /path/to/paperless + + $ docker build -t paperless:docs -f docs/Dockerfile . + $ docker run --rm -it -p "8001:8000" paperless:docs + +.. 
_port 8001: http://127.0.0.1:8001 diff --git a/docs/setup.rst b/docs/setup.rst index 24a9b9fa2..796de88e6 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -37,11 +37,18 @@ or just download the tarball and go that route: Installation & Configuration ---------------------------- -You can go two routes with setting up and running Paperless. The *Vagrant* -route is quick & easy, but means you're running a VM which comes with memory -consumption etc. Alternatively the standard, "bare metal" approach is a little -more complicated. +You can go multiple routes with setting up and running Paperless. The `Vagrant +route`_ is quick & easy, but means you're running a VM which comes with memory +consumption etc. We also `support Docker`_, which you can use natively under +Linux and in a VM with `Docker Machine`_ (this guide was written for native +Docker usage under Linux, you might have to adapt it for Docker Machine.) +Alternatively the standard, `bare metal`_ approach is a little more complicated. +.. _Vagrant route: setup-installation-vagrant_ +.. _support Docker: setup-installation-docker_ +.. _bare metal: setup-installation-standard_ + +.. _Docker Machine: https://docs.docker.com/machine/ .. _setup-installation-standard: @@ -118,6 +125,150 @@ Vagrant Method .. _Paperless server: http://172.28.128.4:8000 +.. _setup-installation-docker: + +Docker Method +............. + +1. Install `Docker`_. + + .. caution:: + + As mentioned earlier, this guide assumes that you use Docker natively + under Linux. If you are using `Docker Machine`_ under Mac OS X or Windows, + you will have to adapt IP addresses, volume-mounting, command execution + and maybe more. + +2. Install `docker-compose`_. [#compose]_ + + .. caution:: + + If you want to use the included ``docker-compose.yml.example`` file, you + need to have at least Docker version **1.10.0** and docker-compose + version **1.6.0**. 
+ + See the `Docker installation guide`_ on how to install the current + version of Docker for your operating system or Linux distribution of + choice. To get an up-to-date version of docker-compose, follow the + `docker-compose installation guide`_ if your package repository doesn't + include it. + + .. _Docker installation guide: https://docs.docker.com/engine/installation/ + .. _docker-compose installation guide: https://docs.docker.com/compose/install/ + +3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml``. +4. Modify ``docker-compose.env`` and adapt the following environment variables: + + ``PAPERLESS_PASSPHRASE`` + This is the passphrase Paperless uses to encrypt/decrypt the original + document. + + ``PAPERLESS_OCR_THREADS`` + This is the number of threads the OCR process will spawn to process + document pages in parallel. If the variable is not set, Python determines + the core-count of your CPU and uses that value. + + ``PAPERLESS_OCR_LANGUAGES`` + If you want the OCR to recognize other languages in addition to the default + English, set this parameter to a space separated list of three-letter + language-codes after `ISO 639-2/T`_. For a list of available languages -- + including their three letter codes -- see the `Debian packagelist`_. + + ``USERMAP_UID`` and ``USERMAP_GID`` + If you want to mount the consumption volume (directory ``/consume`` within + the containers) to a host-directory -- which you probably want to do -- + access rights might be an issue. The default user and group ``paperless`` + in the containers have an id of 1000. The containers will enforce that the + owning group of the consumption directory will be ``paperless`` to be able + to delete consumed documents. If your host-system has a group with an id of + 1000 and you don't want this group to have access rights to the consumption + directory, you can use ``USERMAP_GID`` to change the id in the container + and thus the one of the consumption directory. 
Furthermore, you can change + the id of the default user as well using ``USERMAP_UID``. + +5. Run ``docker-compose up -d``. This will create and start the necessary + containers. +6. To be able to log in, you will need a super user. To create it, execute the + following command: + + .. code-block:: shell-session + + $ docker-compose run --rm webserver createsuperuser + + This will prompt you to set a username (default ``paperless``), an optional + e-mail address and finally a password. +7. The default ``docker-compose.yml`` exports the webserver on your local port + 8000. If you haven't adapted this, you should now be able to visit your + `Paperless webserver`_ at ``http://127.0.0.1:8000``. You can log in with the + user and password you just created. +8. Add files to the consumption directory the way you prefer to. Following are two + possible options: + + 1. Mount the consumption directory to a local host path by modifying your + ``docker-compose.yml``: + + .. code-block:: diff + + diff --git a/docker-compose.yml b/docker-compose.yml + --- a/docker-compose.yml + +++ b/docker-compose.yml + @@ -17,9 +18,8 @@ services: + volumes: + - paperless-data:/usr/src/paperless/data + - paperless-media:/usr/src/paperless/media + - - /consume + + - /local/path/you/choose:/consume + + .. danger:: + + While the consumption container will ensure at startup that it can + **delete** a consumed file from a host-mounted directory, it might not + be able to **read** the document in the first place if the access + rights to the file are incorrect. + + Make sure that the documents you put into the consumption directory + will either be readable by everyone (``chmod o+r file.pdf``) or + readable by the default user or group id 1000 (or the one you have set + with ``USERMAP_UID`` or ``USERMAP_GID`` respectively). + + 2. Use ``docker cp`` to copy your files directly into the container: + + .. 
code-block:: shell-session + + $ # Identify your containers + $ docker-compose ps + Name Command State Ports + ------------------------------------------------------------------------- + paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 + paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0 + + $ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume + + ``docker cp`` is a one-shot-command, just like ``cp``. This means that + every time you want to consume a new document, you will have to execute + ``docker cp`` again. You can of course automate this process, but option 1 + is generally the preferred one. + + .. danger:: + + ``docker cp`` will change the owning user and group of a copied file + to the acting user at the destination, which will be ``root``. + + You therefore need to ensure that the documents you want to copy into + the container are readable by everyone (``chmod o+r file.pdf``) before + copying them. + + +.. _Docker: https://www.docker.com/ +.. _docker-compose: https://docs.docker.com/compose/install/ +.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes +.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr- + +.. [#compose] You of course don't have to use docker-compose, but it + simplifies deployment immensely. If you know your way around Docker, feel + free to tinker around without using compose! + + .. _making-things-a-little-more-permanent: Making Things a Little more Permanent @@ -126,5 +277,9 @@ Making Things a Little more Permanent Once you've tested things and are happy with the work flow, you can automate the process of starting the webserver and consumer automatically. If you're running on a bare metal system that's using Systemd, you can use the service unit files -in the ``scripts`` directory to set this up. If you're on a SysV or other -startup system (like the Vagrant box), then you're currently on your own. 
+in the ``scripts`` directory to set this up. If you're on another startup +system or are using a Vagrant box, then you're currently on your own. If you are +using Docker, you can set a restart-policy_ in the ``docker-compose.yml`` to +have the containers automatically start with the Docker daemon. + +.. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart diff --git a/scripts/docker-entrypoint.sh b/scripts/docker-entrypoint.sh new file mode 100644 index 000000000..9001574a1 --- /dev/null +++ b/scripts/docker-entrypoint.sh @@ -0,0 +1,74 @@ +#!/bin/bash +set -e + +# Source: https://github.com/sameersbn/docker-gitlab/ +map_uidgid() { + USERMAP_ORIG_UID=$(id -u paperless) + USERMAP_ORIG_GID=$(id -g paperless) + USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}} + USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID} + if [[ ${USERMAP_UID} != ${USERMAP_ORIG_UID} || ${USERMAP_GID} != ${USERMAP_ORIG_GID} ]]; then + echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID" + groupmod -g ${USERMAP_GID} paperless + sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd + fi +} + +set_permissions() { + # Set permissions for consumption directory + chgrp paperless "$PAPERLESS_CONSUME" + chmod g+x "$PAPERLESS_CONSUME" + + # Set permissions for application directory + chown -Rh paperless:paperless /usr/src/paperless +} + +initialize() { + map_uidgid + set_permissions +} + +install_languages() { + local langs="$1" + read -ra langs <<<"$langs" + + # Check that it is not empty + if [ ${#langs[@]} -eq 0 ]; then + return + fi + + # Update apt-lists + apt-get update + + # Loop over languages to be installed + for lang in "${langs[@]}"; do + pkg="tesseract-ocr-$lang" + if dpkg -s "$pkg" > /dev/null 2>&1; then + continue + fi + + if !
apt-cache show "$pkg" > /dev/null 2>&1; then + continue + fi + + apt-get install -y "$pkg" + done + + # Remove apt lists + rm -rf /var/lib/apt/lists/* +} + + +if [[ "$1" != "/"* ]]; then + initialize + + # Install additional languages if specified + if [ ! -z "$PAPERLESS_OCR_LANGUAGES" ]; then + install_languages "$PAPERLESS_OCR_LANGUAGES" + fi + + exec sudo -HEu paperless "/usr/src/paperless/src/manage.py" "$@" +fi + +exec "$@" + diff --git a/src/documents/management/commands/loaddata_stdin.py b/src/documents/management/commands/loaddata_stdin.py new file mode 100644 index 000000000..b6848f1eb --- /dev/null +++ b/src/documents/management/commands/loaddata_stdin.py @@ -0,0 +1,23 @@ +""" +Source: + https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 + +License: + MIT + Copyright (c) 2016 Baptiste Mispelon +""" +import sys + +from django.core.management.commands.loaddata import Command as LoadDataCommand + + +class Command(LoadDataCommand): + def parse_name(self, fixture_name): + self.compression_formats['stdin'] = (lambda x,y: sys.stdin, None) + if fixture_name == '-': + return '-', 'json', 'stdin' + + def find_fixtures(self, fixture_label): + if fixture_label == '-': + return [('-', None, '-')] + return super(Command, self).find_fixtures(fixture_label) From ec88ea73f67e8f8b1d8f36da5d10296e75a26b4c Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Fri, 19 Feb 2016 00:45:02 +0000 Subject: [PATCH 09/14] #48: make the tag matching smarter --- src/documents/models.py | 8 +-- src/documents/tests/test_tags.py | 120 +++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 4 deletions(-) create mode 100644 src/documents/tests/test_tags.py diff --git a/src/documents/models.py b/src/documents/models.py index 03758eff5..d4d95aa38 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -105,21 +105,21 @@ class Tag(SluggedModel): if self.matching_algorithm == self.MATCH_ALL: for word in self.match.split(" "): - if word not in text: + if not 
re.search(r"\b{}\b".format(re.escape(word)), text): return False return True if self.matching_algorithm == self.MATCH_ANY: for word in self.match.split(" "): - if word in text: + if re.search(r"\b{}\b".format(re.escape(word)), text): return True return False if self.matching_algorithm == self.MATCH_LITERAL: - return self.match in text + return bool(re.search(r"\b{}\b".format(re.escape(self.match)), text)) if self.matching_algorithm == self.MATCH_REGEX: - return re.search(re.compile(self.match), text) + return bool(re.search(re.compile(self.match), text)) raise NotImplementedError("Unsupported matching algorithm") diff --git a/src/documents/tests/test_tags.py b/src/documents/tests/test_tags.py new file mode 100644 index 000000000..f3518e012 --- /dev/null +++ b/src/documents/tests/test_tags.py @@ -0,0 +1,120 @@ +from django.test import TestCase + +from ..models import Tag + + +class TestTagMatching(TestCase): + + def test_match_all(self): + + t = Tag.objects.create( + name="Test 0", + match="alpha charlie gamma", + matching_algorithm=Tag.MATCH_ALL + ) + self.assertFalse(t.matches("I have alpha in me")) + self.assertFalse(t.matches("I have charlie in me")) + self.assertFalse(t.matches("I have gamma in me")) + self.assertFalse(t.matches("I have alpha and charlie in me")) + self.assertTrue(t.matches("I have alpha, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas in me")) + self.assertFalse(t.matches("I have bravo in me")) + + t = Tag.objects.create( + name="Test 1", + match="12 34 56", + matching_algorithm=Tag.MATCH_ALL + ) + self.assertFalse(t.matches("I have 12 in me")) + self.assertFalse(t.matches("I have 34 in me")) + self.assertFalse(t.matches("I have 56 in me")) + self.assertFalse(t.matches("I have 12 and 34 in me")) + self.assertTrue(t.matches("I have 12 34, and 56 in me")) + self.assertFalse(t.matches("I have 120, 34, and 56 in me")) + self.assertFalse(t.matches("I have 123456 in me")) + 
self.assertFalse(t.matches("I have 01234567 in me")) + + def test_match_any(self): + + t = Tag.objects.create( + name="Test 0", + match="alpha charlie gamma", + matching_algorithm=Tag.MATCH_ANY + ) + + self.assertTrue(t.matches("I have alpha in me")) + self.assertTrue(t.matches("I have charlie in me")) + self.assertTrue(t.matches("I have gamma in me")) + self.assertTrue(t.matches("I have alpha and charlie in me")) + self.assertFalse(t.matches("I have alphas in me")) + self.assertFalse(t.matches("I have bravo in me")) + + t = Tag.objects.create( + name="Test 1", + match="12 34 56", + matching_algorithm=Tag.MATCH_ANY + ) + self.assertTrue(t.matches("I have 12 in me")) + self.assertTrue(t.matches("I have 34 in me")) + self.assertTrue(t.matches("I have 56 in me")) + self.assertTrue(t.matches("I have 12 and 34 in me")) + self.assertTrue(t.matches("I have 12 34, and 56 in me")) + self.assertTrue(t.matches("I have 120, 34, and 560 in me")) + self.assertFalse(t.matches("I have 120, 340, and 560 in me")) + self.assertFalse(t.matches("I have 123456 in me")) + self.assertFalse(t.matches("I have 01234567 in me")) + + def test_match_literal(self): + + t = Tag.objects.create( + name="Test 0", + match="alpha charlie gamma", + matching_algorithm=Tag.MATCH_LITERAL + ) + + self.assertFalse(t.matches("I have alpha in me")) + self.assertFalse(t.matches("I have charlie in me")) + self.assertFalse(t.matches("I have gamma in me")) + self.assertFalse(t.matches("I have alpha and charlie in me")) + self.assertFalse(t.matches("I have alpha, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas, charlie, and gamma in me")) + self.assertTrue(t.matches("I have 'alpha charlie gamma' in me")) + self.assertFalse(t.matches("I have alphas in me")) + self.assertFalse(t.matches("I have bravo in me")) + + t = Tag.objects.create( + name="Test 1", + match="12 34 56", + matching_algorithm=Tag.MATCH_LITERAL + ) + self.assertFalse(t.matches("I have 12 in me")) + 
self.assertFalse(t.matches("I have 34 in me")) + self.assertFalse(t.matches("I have 56 in me")) + self.assertFalse(t.matches("I have 12 and 34 in me")) + self.assertFalse(t.matches("I have 12 34, and 56 in me")) + self.assertFalse(t.matches("I have 120, 34, and 560 in me")) + self.assertFalse(t.matches("I have 120, 340, and 560 in me")) + self.assertFalse(t.matches("I have 123456 in me")) + self.assertFalse(t.matches("I have 01234567 in me")) + self.assertTrue(t.matches("I have 12 34 56 in me")) + + def test_match_regex(self): + + t = Tag.objects.create( + name="Test 0", + match="alpha\w+gamma", + matching_algorithm=Tag.MATCH_REGEX + ) + + self.assertFalse(t.matches("I have alpha in me")) + self.assertFalse(t.matches("I have gamma in me")) + self.assertFalse(t.matches("I have alpha and charlie in me")) + self.assertTrue(t.matches("I have alpha_and_gamma in me")) + self.assertTrue(t.matches("I have alphas_and_gamma in me")) + self.assertFalse(t.matches("I have alpha,and,gamma in me")) + self.assertFalse(t.matches("I have alpha and gamma in me")) + self.assertFalse(t.matches("I have alpha, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas in me")) + From c45f951ca017f1fa94c87d15768e6ed06d99ca15 Mon Sep 17 00:00:00 2001 From: Pit Kleyersburg Date: Fri, 19 Feb 2016 09:52:32 +0100 Subject: [PATCH 10/14] Ignore error if orientation detection fails Fixes an additional issue that came up in #48. 
--- src/documents/consumer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 12761e992..21484036b 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -17,6 +17,7 @@ from PIL import Image from django.conf import settings from django.utils import timezone from django.template.defaultfilters import slugify +from pyocr.tesseract import TesseractError from logger.models import Log from paperless.db import GnuPG @@ -29,8 +30,11 @@ def image_to_string(args): self, png, lang = args with Image.open(os.path.join(self.SCRATCH, png)) as f: if self.OCR.can_detect_orientation(): - orientation = self.OCR.detect_orientation(f, lang=lang) - f = f.rotate(orientation["angle"], expand=1) + try: + orientation = self.OCR.detect_orientation(f, lang=lang) + f = f.rotate(orientation["angle"], expand=1) + except TesseractError: + pass return self.OCR.image_to_string(f, lang=lang) From 3a8755e4c8e8ea09a091985852da6bdba5355ed3 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Fri, 19 Feb 2016 17:26:40 +0000 Subject: [PATCH 11/14] Document the retagger Fixes #54 --- docs/utilities.rst | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/docs/utilities.rst b/docs/utilities.rst index 2b795d31a..f5b452a6f 100644 --- a/docs/utilities.rst +++ b/docs/utilities.rst @@ -105,3 +105,30 @@ import, so should you can now safely delete the entire project directly, database, encrypted PDFs and all, and later create it all again simply by running the consumer again and dumping all of these files into ``CONSUMPTION_DIR``. + + +.. _utilities-retagger: + +The Re-tagger +------------- + +Say you've imported a few hundred documents and now want to introduce a tag +and apply its matching to all of the currently-imported docs. This problem is +common enough that there's a tool for it. + + +.. _utilities-retagger-howto: + +How to Use It +............. 
+ +This too is done via the ``manage.py`` script: + +.. code:: bash + + $ /path/to/paperless/src/manage.py document_retagger + +That's it. It'll loop over all of the documents in your database and attempt +to match all of your tags to them. If one matches, it'll be applied. And +don't worry, you can run this as often as you like, it won't double-tag +a document. From 147f8f72a2b76f3118d7f28fe316a2c7e49412fe Mon Sep 17 00:00:00 2001 From: Tikitu de Jager Date: Fri, 19 Feb 2016 09:48:43 +0200 Subject: [PATCH 12/14] Simplify instructions for exporting with docker The export workflow reusing the `/consume` volume is complex and error- prone, and not at all necessary if the `docker-compose.yml` file has a volume for `/export` from the beginning. --- docker-compose.yml.example | 6 ++++ docs/migrating.rst | 68 ++++++++++++++------------------------ 2 files changed, 30 insertions(+), 44 deletions(-) diff --git a/docker-compose.yml.example b/docker-compose.yml.example index f8e9b5b93..7e3557aa8 100644 --- a/docker-compose.yml.example +++ b/docker-compose.yml.example @@ -23,6 +23,12 @@ services: # You have to adapt the local path you want the consumption # directory to mount to by modifying the part before the ':'. - /path/to/arbitrary/place:/consume + # Likewise, you can add a local path to mount a directory for + # exporting. This is not strictly needed for paperless to + # function, only if you're exporting your files: uncomment + # it and fill in a local path if you know you're going to + # want to export your documents. 
+ # - /path/to/another/arbitrary/place:/export env_file: docker-compose.env command: ["document_consumer"] diff --git a/docs/migrating.rst b/docs/migrating.rst index 1e03bb3cb..491eeace4 100644 --- a/docs/migrating.rst +++ b/docs/migrating.rst @@ -74,57 +74,37 @@ as JSON is almost as easy: $ docker-compose run --rm webserver dumpdata documents.Tag > /path/to/arbitrary/place/tags.json -Exporting the documents though is a little more involved, since docker-compose -doesn't support mounting additional volumes with the ``run`` command. You have -three general options: +To export the documents you can either use ``docker run`` directly, specifying all +the commandline options by hand, or (more simply) mount a second volume for export. -1. Use the consumption directory if you happen to already have it mounted to a - host directory. +To mount a volume for exports, follow the instructions in the +``docker-compose.yml.example`` file for the ``/export`` volume (making the changes +in your own ``docker-compose.yml`` file, of course). Once you have the +volume mounted, the command to run an export is: - .. code-block:: console +.. code-block:: console - $ # Stop the consumer so that it doesn't consume the exported documents - $ docker-compose stop consumer - $ # Export into the consumption directory - $ docker-compose run --rm consumer document_exporter /consume + $ docker-compose run --rm consumer document_exporter /export -2. Add another volume to ``docker-compose.yml`` for exports and use - ``docker-compose run``: +If you prefer to use ``docker run`` directly, supplying the necessary commandline +options: - .. code-block:: diff +.. 
code-block:: shell-session - diff --git a/docker-compose.yml b/docker-compose.yml - --- a/docker-compose.yml - +++ b/docker-compose.yml - @@ -17,9 +18,8 @@ services: - volumes: - - paperless-data:/usr/src/paperless/data - - paperless-media:/usr/src/paperless/media - - /consume - + - /path/to/arbitrary/place:/export + $ # Identify your containers + $ docker-compose ps + Name Command State Ports + ------------------------------------------------------------------------- + paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 + paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0 - .. code-block:: shell-session - - $ docker-compose run --rm consumer document_exporter /export - -3. Use ``docker run`` directly, supplying the necessary commandline options: - - .. code-block:: shell-session - - $ # Identify your containers - $ docker-compose ps - Name Command State Ports - ------------------------------------------------------------------------- - paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 - paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0 - - $ # Make sure to replace your passphrase and remove or adapt the id mapping - $ docker run --rm \ - --volumes-from paperless_data_1 \ - --volume /path/to/arbitrary/place:/export \ - -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \ - -e USERMAP_UID=1000 -e USERMAP_GID=1000 \ - paperless document_exporter /export + $ # Make sure to replace your passphrase and remove or adapt the id mapping + $ docker run --rm \ + --volumes-from paperless_data_1 \ + --volume /path/to/arbitrary/place:/export \ + -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \ + -e USERMAP_UID=1000 -e USERMAP_GID=1000 \ + paperless document_exporter /export .. 
_migrating-restoring: From 438b161a25d6d26fd8c5bc0b3aa9d20ea2f6376a Mon Sep 17 00:00:00 2001 From: Tikitu de Jager Date: Fri, 19 Feb 2016 22:51:49 +0200 Subject: [PATCH 13/14] Move `docker-compose.env` to `docker-compose.env.example` & adjust docs This file, like `docker-compose.yml`, should be edited by the user. To avoid merge conflicts when pulling updates, the edited version should not be committed to the repository. --- .gitignore | 1 + ...-compose.env => docker-compose.env.example | 0 docs/setup.rst | 19 +++++++++++++------ 3 files changed, 14 insertions(+), 6 deletions(-) rename docker-compose.env => docker-compose.env.example (100%) diff --git a/.gitignore b/.gitignore index 2c65f8dcd..d4c3fe38e 100644 --- a/.gitignore +++ b/.gitignore @@ -69,6 +69,7 @@ db.sqlite3 virtualenv .vagrant docker-compose.yml +docker-compose.env # Used for development scripts/import-for-development diff --git a/docker-compose.env b/docker-compose.env.example similarity index 100% rename from docker-compose.env rename to docker-compose.env.example diff --git a/docs/setup.rst b/docs/setup.rst index 796de88e6..be8a349d8 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -156,8 +156,15 @@ Docker Method .. _Docker installation guide: https://docs.docker.com/engine/installation/ .. _docker-compose installation guide: https://docs.docker.com/compose/install/ -3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml``. -4. Modify ``docker-compose.env`` and adapt the following environment variables: +3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml`` and + a copy of ``docker-compose.env.example`` as ``docker-compose.env``. You'll be + editing both these files: taking a copy ensures that you can ``git pull`` to + receive updates without risking merge conflicts with your modified versions + of the configuration files. +4. Modify ``docker-compose.yml`` to your preferences, following the instructions + in comments in the file. 
The only change that is a hard requirement is to + specify where the consumption directory should mount. +5. Modify ``docker-compose.env`` and adapt the following environment variables: ``PAPERLESS_PASSPHRASE`` This is the passphrase Paperless uses to encrypt/decrypt the original @@ -186,9 +193,9 @@ Docker Method and thus the one of the consumption directory. Furthermore, you can change the id of the default user as well using ``USERMAP_UID``. -5. Run ``docker-compose up -d``. This will create and start the necessary +6. Run ``docker-compose up -d``. This will create and start the necessary containers. -6. To be able to login, you will need a super user. To create it, execute the +7. To be able to login, you will need a super user. To create it, execute the following command: .. code-block:: shell-session @@ -197,11 +204,11 @@ Docker Method This will prompt you to set a username (default ``paperless``), an optional e-mail address and finally a password. -7. The default ``docker-compose.yml`` exports the webserver on your local port +8. The default ``docker-compose.yml`` exports the webserver on your local port 8000. If you haven't adapted this, you should now be able to visit your `Paperless webserver`_ at ``http://127.0.0.1:8000``. You can login with the user and password you just created. -8. Add files to consumption directory the way you prefer to. Following are two +9. Add files to consumption directory the way you prefer to. Following are two possible options: 1. 
Mount the consumption directory to a local host path by modifying your From 51b19f4c19fc38e45712c12f41fa86c8a7dac75f Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 20 Feb 2016 22:30:01 +0000 Subject: [PATCH 14/14] Issue #57 --- src/documents/consumer.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 21484036b..d6818cf5d 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -128,9 +128,11 @@ class Consumer(object): except OCRError: self._ignore.append(doc) Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER) + self._cleanup_tempdir(tempdir) continue - finally: - self._cleanup(tempdir, doc) + else: + self._cleanup_tempdir(tempdir) + self._cleanup_doc(doc) def _get_greyscale(self, tempdir, doc): @@ -146,8 +148,12 @@ class Consumer(object): "-type", "grayscale", doc, png )).wait() - pngs = [os.path.join(tempdir, f) for f in os.listdir(tempdir) if f.startswith("convert")] - return sorted(filter(lambda f: os.path.isfile(f), pngs)) + pngs = [] + for f in os.listdir(tempdir): + if f.startswith("convert"): + pngs.append(os.path.join(tempdir, f)) + + return sorted(filter(lambda __: os.path.isfile(__), pngs)) @staticmethod def _guess_language(text): @@ -308,12 +314,13 @@ class Consumer(object): Log.debug("Encrypting", Log.COMPONENT_CONSUMER) encrypted.write(GnuPG.encrypted(unencrypted)) - def _cleanup(self, tempdir, doc): - # Remove temporary directory recursively - Log.debug("Deleting directory {}".format(tempdir), Log.COMPONENT_CONSUMER) - shutil.rmtree(tempdir) + @staticmethod + def _cleanup_tempdir(d): + Log.debug("Deleting directory {}".format(d), Log.COMPONENT_CONSUMER) + shutil.rmtree(d) - # Remove doc + @staticmethod + def _cleanup_doc(doc): Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER) os.unlink(doc)