From cebc44f2c98653a7977d8c675fd791934564a44e Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Tue, 16 Feb 2016 09:28:34 +0000 Subject: [PATCH 01/71] API is halfway there --- src/documents/admin.py | 5 ++++- src/documents/serialisers.py | 38 ++++++++++++++++++++++++++++++++++++ src/documents/views.py | 20 ++++++++++++++++++- src/paperless/settings.py | 2 ++ src/paperless/urls.py | 16 ++++++++++++--- 5 files changed, 76 insertions(+), 5 deletions(-) create mode 100644 src/documents/serialisers.py diff --git a/src/documents/admin.py b/src/documents/admin.py index 635b9ddf8..d3bdd3ba4 100644 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -46,10 +46,13 @@ class DocumentAdmin(admin.ModelAdmin): } search_fields = ("sender__name", "title", "content") - list_display = ("created", "sender", "title", "tags_", "document") + list_display = ("created_", "sender", "title", "tags_", "document") list_filter = ("tags", "sender", MonthListFilter) list_per_page = 25 + def created_(self, obj): + return obj.created.date().strftime("%Y-%m-%d") + def tags_(self, obj): r = "" for tag in obj.tags.all(): diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py new file mode 100644 index 000000000..209c778a1 --- /dev/null +++ b/src/documents/serialisers.py @@ -0,0 +1,38 @@ +from rest_framework import serializers + +from .models import Sender, Tag, Document + + +class SenderSerializer(serializers.ModelSerializer): + + class Meta(object): + model = Sender + fields = ("id", "slug", "name") + + +class TagSerializer(serializers.ModelSerializer): + + class Meta(object): + model = Tag + fields = ("id", "slug", "name", "colour", "match", "matching_algorithm") + + +class DocumentSerializer(serializers.ModelSerializer): + + sender = serializers.HyperlinkedModelSerializer(read_only=True) + tags = serializers.HyperlinkedModelSerializer(read_only=True) + + class Meta(object): + model = Document + fields = ( + "id", + "sender", + "title", + "content", + "file_type", + 
"tags", + "created", + "modified", + "file_name", + "download_url" + ) diff --git a/src/documents/views.py b/src/documents/views.py index c92b6af09..45caf50e9 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -3,10 +3,13 @@ from django.template.defaultfilters import slugify from django.views.decorators.csrf import csrf_exempt from django.views.generic import FormView, DetailView +from rest_framework.viewsets import ModelViewSet + from paperless.db import GnuPG -from .models import Document from .forms import UploadForm +from .models import Sender, Tag, Document +from .serialisers import SenderSerializer, TagSerializer, DocumentSerializer class PdfView(DetailView): @@ -52,3 +55,18 @@ class PushView(FormView): def form_invalid(self, form): return HttpResponse("0") + + +class SenderViewSet(ModelViewSet): + model = Sender + serializer_class = SenderSerializer + + +class TagViewSet(ModelViewSet): + model = Tag + serializer_class = TagSerializer + + +class DocumentViewSet(ModelViewSet): + model = Document + serializer_class = DocumentSerializer diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 444989990..d31879110 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -44,6 +44,8 @@ INSTALLED_APPS = [ "documents", "logger", + "rest_framework", + ] MIDDLEWARE_CLASSES = [ diff --git a/src/paperless/urls.py b/src/paperless/urls.py index 060953676..d8a48995d 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -15,14 +15,24 @@ Including another URLconf 3. 
Add a URL to urlpatterns: url(r'^blog/', include(blog_urls)) """ from django.conf import settings -from django.conf.urls import url, static +from django.conf.urls import url, static, include from django.contrib import admin -from documents.views import PdfView, PushView +from rest_framework.routers import DefaultRouter + +from documents.views import ( + PdfView, PushView, SenderViewSet, TagViewSet, DocumentViewSet) + +router = DefaultRouter() +router.register(r'senders', SenderViewSet) +router.register(r'tags', TagViewSet) +router.register(r'documents', DocumentViewSet) urlpatterns = [ + url(r"^api/auth/", include('rest_framework.urls', namespace='rest_framework')), + url(r"^api/", include(router.urls)), url(r"^fetch/(?P\d+)$", PdfView.as_view(), name="fetch"), - url(r'', admin.site.urls), + url(r"", admin.site.urls), ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) if settings.UPLOAD_SHARED_SECRET: From eb01bcf98b168d160fb667a7f53b2119c4c143bb Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Wed, 17 Feb 2016 23:06:35 +0000 Subject: [PATCH 02/71] The Log class needed a __str__() method --- src/logger/models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/logger/models.py b/src/logger/models.py index 48774c199..f7f2c421a 100644 --- a/src/logger/models.py +++ b/src/logger/models.py @@ -29,6 +29,9 @@ class Log(models.Model): class Meta(object): ordering = ("-time",) + def __str__(self): + return self.message + @classmethod def error(cls, message, component): cls.objects.create( From 1e7ece81ee7afe44342c3b649687550fde702e15 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Wed, 17 Feb 2016 23:07:54 +0000 Subject: [PATCH 03/71] Fixes #45 --- src/documents/consumer.py | 6 +----- .../management/commands/document_retagger.py | 9 +++++---- src/documents/models.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 5ca42813b..98fedde09 100644 --- 
a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -273,11 +273,7 @@ class Consumer(object): def _store(self, text, doc): sender, title, tags, file_type = self._guess_attributes_from_name(doc) - tags = list(tags) - - lower_text = text.lower() - relevant_tags = set( - [t for t in Tag.objects.all() if t.matches(lower_text)] + tags) + relevant_tags = set(list(Tag.match_all(text)) + list(tags)) stats = os.stat(doc) diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py index d7519f53b..09a3fb917 100644 --- a/src/documents/management/commands/document_retagger.py +++ b/src/documents/management/commands/document_retagger.py @@ -23,9 +23,10 @@ class Command(Renderable, BaseCommand): self.verbosity = options["verbosity"] for document in Document.objects.all(): + tags = Tag.objects.exclude( pk__in=document.tags.values_list("pk", flat=True)) - for tag in tags: - if tag.matches(document.content): - print('Tagging {} with "{}"'.format(document, tag)) - document.tags.add(tag) + + for tag in Tag.match_all(document.content, tags): + print('Tagging {} with "{}"'.format(document, tag)) + document.tags.add(tag) diff --git a/src/documents/models.py b/src/documents/models.py index 447beaa66..03758eff5 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -86,7 +86,19 @@ class Tag(SluggedModel): return "{}: \"{}\" ({})".format( self.name, self.match, self.get_matching_algorithm_display()) + @classmethod + def match_all(cls, text, tags=None): + + if tags is None: + tags = cls.objects.all() + + text = text.lower() + for tag in tags: + if tag.matches(text): + yield tag + def matches(self, text): + # Check that match is not empty if self.match.strip() == "": return False From c34d57a872859e8f6799dceb41022b043490c6bd Mon Sep 17 00:00:00 2001 From: Pit Kleyersburg Date: Thu, 18 Feb 2016 09:37:13 +0100 Subject: [PATCH 04/71] Detect image orientation if the OCR supports it Fixes issue #47. 
--- src/documents/consumer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 98fedde09..12761e992 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -28,6 +28,9 @@ from .languages import ISO639 def image_to_string(args): self, png, lang = args with Image.open(os.path.join(self.SCRATCH, png)) as f: + if self.OCR.can_detect_orientation(): + orientation = self.OCR.detect_orientation(f, lang=lang) + f = f.rotate(orientation["angle"], expand=1) return self.OCR.image_to_string(f, lang=lang) From 724afa59c75853bf71e735650133e4d414558dfa Mon Sep 17 00:00:00 2001 From: Pit Kleyersburg Date: Wed, 17 Feb 2016 18:45:04 +0100 Subject: [PATCH 05/71] Add Dockerfile for application and documentation This commit adds a `Dockerfile` to the root of the project, accompanied by a `docker-compose.yml.example` for simplified deployment. The `Dockerfile` is agnostic to whether it will be the webserver, the consumer, or if it is run for a one-off command (i.e. creation of a superuser, migration of the database, document export, ...). The containers entrypoint is the `scripts/docker-entrypoint.sh` script. This script verifies that the required permissions are set, remaps the default users and/or groups id if required and installs additional languages if the user wishes to. After initialization, it analyzes the command the user supplied: - If the command starts with a slash, it is expected that the user wants to execute a binary file and the command will be executed without further intervention. (Using `exec` to effectively replace the started shell-script and not have any reaping-issues.) - If the command does not start with a slash, the command will be passed directly to the `manage.py` script without further modification. (Again using `exec`.) The default command is set to `--help`. 
If the user wants to execute a command that is not meant for `manage.py` but doesn't start with a slash, the Docker `--entrypoint` parameter can be used to circumvent the mechanics of `docker-entrypoint.sh`. Further information can be found in `docs/setup.rst` and in `docs/migrating.rst`. For additional convenience, a `Dockerfile` has been added to the `docs/` directory which allows for easy building and serving of the documentation. This is documented in `docs/requirements.rst`. --- .gitignore | 1 + Dockerfile | 43 +++++ docker-compose.env | 15 ++ docker-compose.yml.example | 31 ++++ docs/Dockerfile | 18 ++ docs/migrating.rst | 95 ++++++++++ docs/requirements.rst | 13 ++ docs/setup.rst | 167 +++++++++++++++++- scripts/docker-entrypoint.sh | 74 ++++++++ .../management/commands/loaddata_stdin.py | 23 +++ 10 files changed, 474 insertions(+), 6 deletions(-) create mode 100644 Dockerfile create mode 100644 docker-compose.env create mode 100644 docker-compose.yml.example create mode 100644 docs/Dockerfile create mode 100644 scripts/docker-entrypoint.sh create mode 100644 src/documents/management/commands/loaddata_stdin.py diff --git a/.gitignore b/.gitignore index 908fa9748..2c65f8dcd 100644 --- a/.gitignore +++ b/.gitignore @@ -68,6 +68,7 @@ db.sqlite3 # Other stuff that doesn't belong virtualenv .vagrant +docker-compose.yml # Used for development scripts/import-for-development diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..dade863ca --- /dev/null +++ b/Dockerfile @@ -0,0 +1,43 @@ +FROM python:3.5.1 +MAINTAINER Pit Kleyersburg + +# Install dependencies +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + sudo \ + tesseract-ocr tesseract-ocr-eng imagemagick ghostscript \ + && rm -rf /var/lib/apt/lists/* + +# Install python dependencies +RUN mkdir -p /usr/src/paperless +WORKDIR /usr/src/paperless +COPY requirements.txt /usr/src/paperless/ +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +RUN mkdir 
-p /usr/src/paperless/src +COPY src/ /usr/src/paperless/src/ + +# Set consumption directory +ENV PAPERLESS_CONSUME /consume +RUN mkdir -p $PAPERLESS_CONSUME + +# Migrate database +WORKDIR /usr/src/paperless/src +RUN mkdir /usr/src/paperless/data +RUN ./manage.py migrate + +# Create user +RUN groupadd -g 1000 paperless \ + && useradd -u 1000 -g 1000 -d /usr/src/paperless paperless \ + && chown -Rh paperless:paperless /usr/src/paperless + +# Setup entrypoint +COPY scripts/docker-entrypoint.sh /sbin/docker-entrypoint.sh +RUN chmod 755 /sbin/docker-entrypoint.sh + +# Mount volumes +VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", "/consume"] + +ENTRYPOINT ["/sbin/docker-entrypoint.sh"] +CMD ["--help"] diff --git a/docker-compose.env b/docker-compose.env new file mode 100644 index 000000000..13c74b6ab --- /dev/null +++ b/docker-compose.env @@ -0,0 +1,15 @@ +# Environment variables to set for Paperless +# Commented out variables will be replaced by a default within Paperless. + +# Passphrase Paperless uses to encrypt and decrypt your documents +PAPERLESS_PASSPHRASE=CHANGE_ME + +# The number of threads to use for text recognition +# PAPERLESS_OCR_THREADS=4 + +# Additional languages to install for text recognition +# PAPERLESS_OCR_LANGUAGES=deu ita + +# You can change the default user and group id to a custom one +# USERMAP_UID=1000 +# USERMAP_GID=1000 diff --git a/docker-compose.yml.example b/docker-compose.yml.example new file mode 100644 index 000000000..f8e9b5b93 --- /dev/null +++ b/docker-compose.yml.example @@ -0,0 +1,31 @@ +version: '2' + +services: + webserver: + image: paperless + ports: + # You can adapt the port you want Paperless to listen on by + # modifying the part before the `:`.
+ - "8000:8000" + volumes: + - paperless-data:/usr/src/paperless/data + - paperless-media:/usr/src/paperless/media + env_file: docker-compose.env + environment: + - PAPERLESS_OCR_LANGUAGES= + command: ["runserver", "0.0.0.0:8000"] + + consumer: + image: paperless + volumes: + - paperless-data:/usr/src/paperless/data + - paperless-media:/usr/src/paperless/media + # You have to adapt the local path you want the consumption + # directory to mount to by modifying the part before the ':'. + - /path/to/arbitrary/place:/consume + env_file: docker-compose.env + command: ["document_consumer"] + +volumes: + paperless-data: + paperless-media: diff --git a/docs/Dockerfile b/docs/Dockerfile new file mode 100644 index 000000000..ee63aebb4 --- /dev/null +++ b/docs/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.5.1 +MAINTAINER Pit Kleyersburg + +# Install Sphinx and Pygments +RUN pip install Sphinx Pygments + +# Setup directories, copy data +RUN mkdir /build +COPY . /build +WORKDIR /build/docs + +# Build documentation +RUN make html + +# Start webserver +WORKDIR /build/docs/_build/html +EXPOSE 8000/tcp +CMD ["python3", "-m", "http.server"] diff --git a/docs/migrating.rst b/docs/migrating.rst index 46083533a..1e03bb3cb 100644 --- a/docs/migrating.rst +++ b/docs/migrating.rst @@ -30,6 +30,20 @@ as part of the update: Note that it's possible (even likely) that while ``git pull`` may update some files, the ``migrate`` step may not update anything. This is totally normal. +If you are :ref:`using Docker <setup-installation-docker>` the update process +requires only one additional step: + +.. code-block:: shell-session + + $ cd /path/to/project + $ git pull + $ docker build -t paperless . + $ docker-compose up -d + $ docker-compose run --rm webserver migrate + +If ``git pull`` doesn't report any changes, there is no need to continue with +the remaining steps. + .. _migrating-backup: @@ -53,6 +67,65 @@ with Django's ``dumpdata`` command, which produces JSON output.
$ ./manage.py document_export /path/to/arbitrary/place/ $ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json +If you are :ref:`using Docker <setup-installation-docker>`, exporting your tags +as JSON is almost as easy: + +.. code-block:: shell-session + + $ docker-compose run --rm webserver dumpdata documents.Tag > /path/to/arbitrary/place/tags.json + +Exporting the documents though is a little more involved, since docker-compose +doesn't support mounting additional volumes with the ``run`` command. You have +three general options: + +1. Use the consumption directory if you happen to already have it mounted to a + host directory. + + .. code-block:: console + + $ # Stop the consumer so that it doesn't consume the exported documents + $ docker-compose stop consumer + $ # Export into the consumption directory + $ docker-compose run --rm consumer document_exporter /consume + +2. Add another volume to ``docker-compose.yml`` for exports and use + ``docker-compose run``: + + .. code-block:: diff + + diff --git a/docker-compose.yml b/docker-compose.yml + --- a/docker-compose.yml + +++ b/docker-compose.yml + @@ -17,9 +18,8 @@ services: + volumes: + - paperless-data:/usr/src/paperless/data + - paperless-media:/usr/src/paperless/media + - /consume + + - /path/to/arbitrary/place:/export + + .. code-block:: shell-session + + $ docker-compose run --rm consumer document_exporter /export + +3. Use ``docker run`` directly, supplying the necessary commandline options: + + .. code-block:: shell-session + + $ # Identify your containers + $ docker-compose ps + Name Command State Ports + ------------------------------------------------------------------------- + paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 + paperless_webserver_1 /sbin/docker-entrypoint.sh ...
Exit 0 + + $ # Make sure to replace your passphrase and remove or adapt the id mapping + $ docker run --rm \ + --volumes-from paperless_data_1 \ + --volume /path/to/arbitrary/place:/export \ + -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \ + -e USERMAP_UID=1000 -e USERMAP_GID=1000 \ + paperless document_exporter /export + .. _migrating-restoring: @@ -77,3 +150,25 @@ exported documents into the consumption directory and start up the consumer. $ cp /path/to/exported/docs/* /path/to/consumption/dir/ $ ./manage.py document_consumer +Importing your data if you are :ref:`using Docker <setup-installation-docker>` +is almost as simple: + +.. code-block:: shell-session + + $ # Stop and remove your current containers + $ docker-compose stop + $ docker-compose rm -f + + $ # Recreate them, add the superuser + $ docker-compose up -d + $ docker-compose run --rm webserver createsuperuser + + $ # Load the tags + $ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin - + + $ # Load your exported documents into the consumption directory + $ # (How you do this highly depends on how you have set this up) + $ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/ + +After loading the documents into the consumption directory the consumer will +immediately start consuming the documents. diff --git a/docs/requirements.rst b/docs/requirements.rst index 1c4f989db..ee287d835 100644 --- a/docs/requirements.rst +++ b/docs/requirements.rst @@ -101,3 +101,16 @@ you'd like to generate your own docs locally, you'll need to: $ pip install sphinx and then cd into the ``docs`` directory and type ``make html``. + +If you are using Docker, you can use the following commands to build the +documentation and run a webserver serving it on `port 8001`_: + +.. code:: bash + + $ pwd + /path/to/paperless + + $ docker build -t paperless:docs -f docs/Dockerfile . + $ docker run --rm -it -p "8001:8000" paperless:docs + +..
_port 8001: http://127.0.0.1:8001 diff --git a/docs/setup.rst b/docs/setup.rst index 24a9b9fa2..796de88e6 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -37,11 +37,18 @@ or just download the tarball and go that route: Installation & Configuration ---------------------------- -You can go two routes with setting up and running Paperless. The *Vagrant* -route is quick & easy, but means you're running a VM which comes with memory -consumption etc. Alternatively the standard, "bare metal" approach is a little -more complicated. +You can go multiple routes with setting up and running Paperless. The `Vagrant +route`_ is quick & easy, but means you're running a VM which comes with memory +consumption etc. We also `support Docker`_, which you can use natively under +Linux and in a VM with `Docker Machine`_ (this guide was written for native +Docker usage under Linux, you might have to adapt it for Docker Machine.) +Alternatively the standard, `bare metal`_ approach is a little more complicated. +.. _Vagrant route: setup-installation-vagrant_ +.. _support Docker: setup-installation-docker_ +.. _bare metal: setup-installation-standard_ + +.. _Docker Machine: https://docs.docker.com/machine/ .. _setup-installation-standard: @@ -118,6 +125,150 @@ Vagrant Method .. _Paperless server: http://172.28.128.4:8000 +.. _setup-installation-docker: + +Docker Method +............. + +1. Install `Docker`_. + + .. caution:: + + As mentioned earlier, this guide assumes that you use Docker natively + under Linux. If you are using `Docker Machine`_ under Mac OS X or Windows, + you will have to adapt IP addresses, volume-mounting, command execution + and maybe more. + +2. Install `docker-compose`_. [#compose]_ + + .. caution:: + + If you want to use the included ``docker-compose.yml.example`` file, you + need to have at least Docker version **1.10.0** and docker-compose + version **1.6.0**. 
+ + See the `Docker installation guide`_ on how to install the current + version of Docker for your operating system or Linux distribution of + choice. To get an up-to-date version of docker-compose, follow the + `docker-compose installation guide`_ if your package repository doesn't + include it. + + .. _Docker installation guide: https://docs.docker.com/engine/installation/ + .. _docker-compose installation guide: https://docs.docker.com/compose/install/ + +3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml``. +4. Modify ``docker-compose.env`` and adapt the following environment variables: + + ``PAPERLESS_PASSPHRASE`` + This is the passphrase Paperless uses to encrypt/decrypt the original + document. + + ``PAPERLESS_OCR_THREADS`` + This is the number of threads the OCR process will spawn to process + document pages in parallel. If the variable is not set, Python determines + the core-count of your CPU and uses that value. + + ``PAPERLESS_OCR_LANGUAGES`` + If you want the OCR to recognize other languages in addition to the default + English, set this parameter to a space separated list of three-letter + language-codes after `ISO 639-2/T`_. For a list of available languages -- + including their three letter codes -- see the `Debian packagelist`_. + + ``USERMAP_UID`` and ``USERMAP_GID`` + If you want to mount the consumption volume (directory ``/consume`` within + the containers) to a host-directory -- which you probably want to do -- + access rights might be an issue. The default user and group ``paperless`` + in the containers have an id of 1000. The containers will enforce that the + owning group of the consumption directory will be ``paperless`` to be able + to delete consumed documents. If your host-system has a group with an id of + 1000 and you don't want this group to have access rights to the consumption + directory, you can use ``USERMAP_GID`` to change the id in the container + and thus the one of the consumption directory. 
Furthermore, you can change + the id of the default user as well using ``USERMAP_UID``. + +5. Run ``docker-compose up -d``. This will create and start the necessary + containers. +6. To be able to log in, you will need a super user. To create it, execute the + following command: + + .. code-block:: shell-session + + $ docker-compose run --rm webserver createsuperuser + + This will prompt you to set a username (default ``paperless``), an optional + e-mail address and finally a password. +7. The default ``docker-compose.yml`` exports the webserver on your local port + 8000. If you haven't adapted this, you should now be able to visit your + `Paperless webserver <http://127.0.0.1:8000>`_ at ``http://127.0.0.1:8000``. You can log in with the + user and password you just created. +8. Add files to the consumption directory the way you prefer. The following are two + possible options: + + 1. Mount the consumption directory to a local host path by modifying your + ``docker-compose.yml``: + + .. code-block:: diff + + diff --git a/docker-compose.yml b/docker-compose.yml + --- a/docker-compose.yml + +++ b/docker-compose.yml + @@ -17,9 +18,8 @@ services: + volumes: + - paperless-data:/usr/src/paperless/data + - paperless-media:/usr/src/paperless/media + - - /consume + + - /local/path/you/choose:/consume + + .. danger:: + + While the consumption container will ensure at startup that it can + **delete** a consumed file from a host-mounted directory, it might not + be able to **read** the document in the first place if the access + rights to the file are incorrect. + + Make sure that the documents you put into the consumption directory + will either be readable by everyone (``chmod o+r file.pdf``) or + readable by the default user or group id 1000 (or the one you have set + with ``USERMAP_UID`` or ``USERMAP_GID`` respectively). + + 2. Use ``docker cp`` to copy your files directly into the container: + + ..
code-block:: shell-session + + $ # Identify your containers + $ docker-compose ps + Name Command State Ports + ------------------------------------------------------------------------- + paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 + paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0 + + $ docker cp /path/to/your/file.pdf paperless_consumer_1:/consume + + ``docker cp`` is a one-shot-command, just like ``cp``. This means that + every time you want to consume a new document, you will have to execute + ``docker cp`` again. You can of course automate this process, but option 1 + is generally the preferred one. + + .. danger:: + + ``docker cp`` will change the owning user and group of a copied file + to the acting user at the destination, which will be ``root``. + + You therefore need to ensure that the documents you want to copy into + the container are readable by everyone (``chmod o+r file.pdf``) before + copying them. + + +.. _Docker: https://www.docker.com/ +.. _docker-compose: https://docs.docker.com/compose/install/ +.. _ISO 639-2/T: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes +.. _Debian packagelist: https://packages.debian.org/search?suite=jessie&searchon=names&keywords=tesseract-ocr- + +.. [#compose] You of course don't have to use docker-compose, but it + simplifies deployment immensely. If you know your way around Docker, feel + free to tinker around without using compose! + + .. _making-things-a-little-more-permanent: Making Things a Little more Permanent @@ -126,5 +277,9 @@ Making Things a Little more Permanent Once you've tested things and are happy with the work flow, you can automate the process of starting the webserver and consumer automatically. If you're running on a bare metal system that's using Systemd, you can use the service unit files -in the ``scripts`` directory to set this up. If you're on a SysV or other -startup system (like the Vagrant box), then you're currently on your own. 
+in the ``scripts`` directory to set this up. If you're on another startup +system or are using a Vagrant box, then you're currently on your own. If you are +using Docker, you can set a restart-policy_ in the ``docker-compose.yml`` to +have the containers automatically start with the Docker daemon. + +.. _restart-policy: https://docs.docker.com/engine/reference/commandline/run/#restart-policies-restart diff --git a/scripts/docker-entrypoint.sh b/scripts/docker-entrypoint.sh new file mode 100644 index 000000000..9001574a1 --- /dev/null +++ b/scripts/docker-entrypoint.sh @@ -0,0 +1,74 @@ +#!/bin/bash +set -e + +# Source: https://github.com/sameersbn/docker-gitlab/ +map_uidgid() { + USERMAP_ORIG_UID=$(id -u paperless) + USERMAP_ORIG_UID=$(id -g paperless) + USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}} + USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID} + if [[ ${USERMAP_UID} != ${USERMAP_ORIG_UID} || ${USERMAP_GID} != ${USERMAP_ORIG_GID} ]]; then + echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID" + groupmod -g ${USERMAP_GID} paperless + sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd + fi +} + +set_permissions() { + # Set permissions for consumption directory + chgrp paperless "$PAPERLESS_CONSUME" + chmod g+x "$PAPERLESS_CONSUME" + + # Set permissions for application directory + chown -Rh paperless:paperless /usr/src/paperless +} + +initialize() { + map_uidgid + set_permissions +} + +install_languages() { + local langs="$1" + read -ra langs <<<"$langs" + + # Check that it is not empty + if [ ${#langs[@]} -eq 0 ]; then + return + fi + + # Update apt-lists + apt-get update + + # Loop over languages to be installed + for lang in "${langs[@]}"; do + pkg="tesseract-ocr-$lang" + if dpkg -s "$pkg" 2>&1 > /dev/null; then + continue + fi + + if ! 
apt-cache show "$pkg" 2>&1 > /dev/null; then + continue + fi + + apt-get install "$pkg" + done + + # Remove apt lists + rm -rf /var/lib/apt/lists/* +} + + +if [[ "$1" != "/"* ]]; then + initialize + + # Install additional languages if specified + if [ ! -z "$PAPERLESS_OCR_LANGUAGES" ]; then + install_languages "$PAPERLESS_OCR_LANGUAGES" + fi + + exec sudo -HEu paperless "/usr/src/paperless/src/manage.py" "$@" +fi + +exec "$@" + diff --git a/src/documents/management/commands/loaddata_stdin.py b/src/documents/management/commands/loaddata_stdin.py new file mode 100644 index 000000000..b6848f1eb --- /dev/null +++ b/src/documents/management/commands/loaddata_stdin.py @@ -0,0 +1,23 @@ +""" +Source: + https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 + +License: + MIT + Copyright (c) 2016 Baptiste Mispelon +""" +import sys + +from django.core.management.commands.loaddata import Command as LoadDataCommand + + +class Command(LoadDataCommand): + def parse_name(self, fixture_name): + self.compression_formats['stdin'] = (lambda x,y: sys.stdin, None) + if fixture_name == '-': + return '-', 'json', 'stdin' + + def find_fixtures(self, fixture_label): + if fixture_label == '-': + return [('-', None, '-')] + return super(Command, self).find_fixtures(fixture_label) From ec88ea73f67e8f8b1d8f36da5d10296e75a26b4c Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Fri, 19 Feb 2016 00:45:02 +0000 Subject: [PATCH 06/71] #48: make the tag matching smarter --- src/documents/models.py | 8 +-- src/documents/tests/test_tags.py | 120 +++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 4 deletions(-) create mode 100644 src/documents/tests/test_tags.py diff --git a/src/documents/models.py b/src/documents/models.py index 03758eff5..d4d95aa38 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -105,21 +105,21 @@ class Tag(SluggedModel): if self.matching_algorithm == self.MATCH_ALL: for word in self.match.split(" "): - if word not in text: + if not 
re.search(r"\b{}\b".format(word), text): return False return True if self.matching_algorithm == self.MATCH_ANY: for word in self.match.split(" "): - if word in text: + if re.search(r"\b{}\b".format(word), text): return True return False if self.matching_algorithm == self.MATCH_LITERAL: - return self.match in text + return bool(re.search(r"\b{}\b".format(self.match), text)) if self.matching_algorithm == self.MATCH_REGEX: - return re.search(re.compile(self.match), text) + return bool(re.search(re.compile(self.match), text)) raise NotImplementedError("Unsupported matching algorithm") diff --git a/src/documents/tests/test_tags.py b/src/documents/tests/test_tags.py new file mode 100644 index 000000000..f3518e012 --- /dev/null +++ b/src/documents/tests/test_tags.py @@ -0,0 +1,120 @@ +from django.test import TestCase + +from ..models import Tag + + +class TestTagMatching(TestCase): + + def test_match_all(self): + + t = Tag.objects.create( + name="Test 0", + match="alpha charlie gamma", + matching_algorithm=Tag.MATCH_ALL + ) + self.assertFalse(t.matches("I have alpha in me")) + self.assertFalse(t.matches("I have charlie in me")) + self.assertFalse(t.matches("I have gamma in me")) + self.assertFalse(t.matches("I have alpha and charlie in me")) + self.assertTrue(t.matches("I have alpha, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas in me")) + self.assertFalse(t.matches("I have bravo in me")) + + t = Tag.objects.create( + name="Test 1", + match="12 34 56", + matching_algorithm=Tag.MATCH_ALL + ) + self.assertFalse(t.matches("I have 12 in me")) + self.assertFalse(t.matches("I have 34 in me")) + self.assertFalse(t.matches("I have 56 in me")) + self.assertFalse(t.matches("I have 12 and 34 in me")) + self.assertTrue(t.matches("I have 12 34, and 56 in me")) + self.assertFalse(t.matches("I have 120, 34, and 56 in me")) + self.assertFalse(t.matches("I have 123456 in me")) + 
self.assertFalse(t.matches("I have 01234567 in me")) + + def test_match_any(self): + + t = Tag.objects.create( + name="Test 0", + match="alpha charlie gamma", + matching_algorithm=Tag.MATCH_ANY + ) + + self.assertTrue(t.matches("I have alpha in me")) + self.assertTrue(t.matches("I have charlie in me")) + self.assertTrue(t.matches("I have gamma in me")) + self.assertTrue(t.matches("I have alpha and charlie in me")) + self.assertFalse(t.matches("I have alphas in me")) + self.assertFalse(t.matches("I have bravo in me")) + + t = Tag.objects.create( + name="Test 1", + match="12 34 56", + matching_algorithm=Tag.MATCH_ANY + ) + self.assertTrue(t.matches("I have 12 in me")) + self.assertTrue(t.matches("I have 34 in me")) + self.assertTrue(t.matches("I have 56 in me")) + self.assertTrue(t.matches("I have 12 and 34 in me")) + self.assertTrue(t.matches("I have 12 34, and 56 in me")) + self.assertTrue(t.matches("I have 120, 34, and 560 in me")) + self.assertFalse(t.matches("I have 120, 340, and 560 in me")) + self.assertFalse(t.matches("I have 123456 in me")) + self.assertFalse(t.matches("I have 01234567 in me")) + + def test_match_literal(self): + + t = Tag.objects.create( + name="Test 0", + match="alpha charlie gamma", + matching_algorithm=Tag.MATCH_LITERAL + ) + + self.assertFalse(t.matches("I have alpha in me")) + self.assertFalse(t.matches("I have charlie in me")) + self.assertFalse(t.matches("I have gamma in me")) + self.assertFalse(t.matches("I have alpha and charlie in me")) + self.assertFalse(t.matches("I have alpha, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas, charlie, and gamma in me")) + self.assertTrue(t.matches("I have 'alpha charlie gamma' in me")) + self.assertFalse(t.matches("I have alphas in me")) + self.assertFalse(t.matches("I have bravo in me")) + + t = Tag.objects.create( + name="Test 1", + match="12 34 56", + matching_algorithm=Tag.MATCH_LITERAL + ) + self.assertFalse(t.matches("I have 12 in me")) + 
self.assertFalse(t.matches("I have 34 in me")) + self.assertFalse(t.matches("I have 56 in me")) + self.assertFalse(t.matches("I have 12 and 34 in me")) + self.assertFalse(t.matches("I have 12 34, and 56 in me")) + self.assertFalse(t.matches("I have 120, 34, and 560 in me")) + self.assertFalse(t.matches("I have 120, 340, and 560 in me")) + self.assertFalse(t.matches("I have 123456 in me")) + self.assertFalse(t.matches("I have 01234567 in me")) + self.assertTrue(t.matches("I have 12 34 56 in me")) + + def test_match_regex(self): + + t = Tag.objects.create( + name="Test 0", + match="alpha\w+gamma", + matching_algorithm=Tag.MATCH_REGEX + ) + + self.assertFalse(t.matches("I have alpha in me")) + self.assertFalse(t.matches("I have gamma in me")) + self.assertFalse(t.matches("I have alpha and charlie in me")) + self.assertTrue(t.matches("I have alpha_and_gamma in me")) + self.assertTrue(t.matches("I have alphas_and_gamma in me")) + self.assertFalse(t.matches("I have alpha,and,gamma in me")) + self.assertFalse(t.matches("I have alpha and gamma in me")) + self.assertFalse(t.matches("I have alpha, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas, charlie, and gamma in me")) + self.assertFalse(t.matches("I have alphas in me")) + From c45f951ca017f1fa94c87d15768e6ed06d99ca15 Mon Sep 17 00:00:00 2001 From: Pit Kleyersburg Date: Fri, 19 Feb 2016 09:52:32 +0100 Subject: [PATCH 07/71] Ignore error if orientation detection fails Fixes an additional issue that came up in #48. 
--- src/documents/consumer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 12761e992..21484036b 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -17,6 +17,7 @@ from PIL import Image from django.conf import settings from django.utils import timezone from django.template.defaultfilters import slugify +from pyocr.tesseract import TesseractError from logger.models import Log from paperless.db import GnuPG @@ -29,8 +30,11 @@ def image_to_string(args): self, png, lang = args with Image.open(os.path.join(self.SCRATCH, png)) as f: if self.OCR.can_detect_orientation(): - orientation = self.OCR.detect_orientation(f, lang=lang) - f = f.rotate(orientation["angle"], expand=1) + try: + orientation = self.OCR.detect_orientation(f, lang=lang) + f = f.rotate(orientation["angle"], expand=1) + except TesseractError: + pass return self.OCR.image_to_string(f, lang=lang) From 3a8755e4c8e8ea09a091985852da6bdba5355ed3 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Fri, 19 Feb 2016 17:26:40 +0000 Subject: [PATCH 08/71] Document the retagger Fixes #54 --- docs/utilities.rst | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/docs/utilities.rst b/docs/utilities.rst index 2b795d31a..f5b452a6f 100644 --- a/docs/utilities.rst +++ b/docs/utilities.rst @@ -105,3 +105,30 @@ import, so should you can now safely delete the entire project directly, database, encrypted PDFs and all, and later create it all again simply by running the consumer again and dumping all of these files into ``CONSUMPTION_DIR``. + + +.. _utilities-retagger: + +The Re-tagger +------------- + +Say you've imported a few hundred documents and now want to introduce a tag +and apply its matching to all of the currently-imported docs. This problem is +common enough that there's a tool for it. + + +.. _utilities-retagger-howto: + +How to Use It +............. 
+ +This too is done via the ``manage.py`` script: + +.. code:: bash + + $ /path/to/paperless/src/manage.py document_retagger + +That's it. It'll loop over all of the documents in your database and attempt +to match all of your tags to them. If one matches, it'll be applied. And +don't worry, you can run this as often as you like, it' won't double-tag +a document. From 147f8f72a2b76f3118d7f28fe316a2c7e49412fe Mon Sep 17 00:00:00 2001 From: Tikitu de Jager Date: Fri, 19 Feb 2016 09:48:43 +0200 Subject: [PATCH 09/71] Simplify instructions for exporting with docker The export workflow reusing the `/consume` volume is complex and error- prone, and not at all necessary if the `docker-compose.yml` file has a volume for `/export` from the beginning. --- docker-compose.yml.example | 6 ++++ docs/migrating.rst | 68 ++++++++++++++------------------------ 2 files changed, 30 insertions(+), 44 deletions(-) diff --git a/docker-compose.yml.example b/docker-compose.yml.example index f8e9b5b93..7e3557aa8 100644 --- a/docker-compose.yml.example +++ b/docker-compose.yml.example @@ -23,6 +23,12 @@ services: # You have to adapt the local path you want the consumption # directory to mount to by modifying the part before the ':'. - /path/to/arbitrary/place:/consume + # Likewise, you can add a local path to mount a directory for + # exporting. This is not strictly needed for paperless to + # function, only if you're exporting your files: uncomment + # it and fill in a local path if you know you're going to + # want to export your documents. 
+ # - /path/to/another/arbitrary/place:/export env_file: docker-compose.env command: ["document_consumer"] diff --git a/docs/migrating.rst b/docs/migrating.rst index 1e03bb3cb..491eeace4 100644 --- a/docs/migrating.rst +++ b/docs/migrating.rst @@ -74,57 +74,37 @@ as JSON is almost as easy: $ docker-compose run --rm webserver dumpdata documents.Tag > /path/to/arbitrary/place/tags.json -Exporting the documents though is a little more involved, since docker-compose -doesn't support mounting additional volumes with the ``run`` command. You have -three general options: +To export the documents you can either use ``docker run`` directly, specifying all +the commandline options by hand, or (more simply) mount a second volume for export. -1. Use the consumption directory if you happen to already have it mounted to a - host directory. +To mount a volume for exports, follow the instructions in the +``docker-compose.yml.example`` file for the ``/export`` volume (making the changes +in your own ``docker-compose.yml`` file, of course). Once you have the +volume mounted, the command to run an export is: - .. code-block:: console +.. code-block:: console - $ # Stop the consumer so that it doesn't consume the exported documents - $ docker-compose stop consumer - $ # Export into the consumption directory - $ docker-compose run --rm consumer document_exporter /consume + $ docker-compose run --rm consumer document_exporter /export -2. Add another volume to ``docker-compose.yml`` for exports and use - ``docker-compose run``: +If you prefer to use ``docker run`` directly, supplying the necessary commandline +options: - .. code-block:: diff +.. 
code-block:: shell-session - diff --git a/docker-compose.yml b/docker-compose.yml - --- a/docker-compose.yml - +++ b/docker-compose.yml - @@ -17,9 +18,8 @@ services: - volumes: - - paperless-data:/usr/src/paperless/data - - paperless-media:/usr/src/paperless/media - - /consume - + - /path/to/arbitrary/place:/export + $ # Identify your containers + $ docker-compose ps + Name Command State Ports + ------------------------------------------------------------------------- + paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 + paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0 - .. code-block:: shell-session - - $ docker-compose run --rm consumer document_exporter /export - -3. Use ``docker run`` directly, supplying the necessary commandline options: - - .. code-block:: shell-session - - $ # Identify your containers - $ docker-compose ps - Name Command State Ports - ------------------------------------------------------------------------- - paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 - paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0 - - $ # Make sure to replace your passphrase and remove or adapt the id mapping - $ docker run --rm \ - --volumes-from paperless_data_1 \ - --volume /path/to/arbitrary/place:/export \ - -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \ - -e USERMAP_UID=1000 -e USERMAP_GID=1000 \ - paperless document_exporter /export + $ # Make sure to replace your passphrase and remove or adapt the id mapping + $ docker run --rm \ + --volumes-from paperless_data_1 \ + --volume /path/to/arbitrary/place:/export \ + -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \ + -e USERMAP_UID=1000 -e USERMAP_GID=1000 \ + paperless document_exporter /export .. 
_migrating-restoring: From 438b161a25d6d26fd8c5bc0b3aa9d20ea2f6376a Mon Sep 17 00:00:00 2001 From: Tikitu de Jager Date: Fri, 19 Feb 2016 22:51:49 +0200 Subject: [PATCH 10/71] Move `docker-compose.env` to `docker-compose.env.example` & adjust docs This file, like `docker-compose.yml`, should be edited by the user. To avoid merge conflicts when pulling updates, the edited version should not be committed to the repository. --- .gitignore | 1 + ...-compose.env => docker-compose.env.example | 0 docs/setup.rst | 19 +++++++++++++------ 3 files changed, 14 insertions(+), 6 deletions(-) rename docker-compose.env => docker-compose.env.example (100%) diff --git a/.gitignore b/.gitignore index 2c65f8dcd..d4c3fe38e 100644 --- a/.gitignore +++ b/.gitignore @@ -69,6 +69,7 @@ db.sqlite3 virtualenv .vagrant docker-compose.yml +docker-compose.env # Used for development scripts/import-for-development diff --git a/docker-compose.env b/docker-compose.env.example similarity index 100% rename from docker-compose.env rename to docker-compose.env.example diff --git a/docs/setup.rst b/docs/setup.rst index 796de88e6..be8a349d8 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -156,8 +156,15 @@ Docker Method .. _Docker installation guide: https://docs.docker.com/engine/installation/ .. _docker-compose installation guide: https://docs.docker.com/compose/install/ -3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml``. -4. Modify ``docker-compose.env`` and adapt the following environment variables: +3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml`` and + a copy of ``docker-compose.env.example`` as ``docker-compose.env``. You'll be + editing both these files: taking a copy ensures that you can ``git pull`` to + receive updates without risking merge conflicts with your modified versions + of the configuration files. +4. Modify ``docker-compose.yml`` to your preferences, following the instructions + in comments in the file. 
The only change that is a hard requirement is to + specify where the consumption directory should mount. +5. Modify ``docker-compose.env`` and adapt the following environment variables: ``PAPERLESS_PASSPHRASE`` This is the passphrase Paperless uses to encrypt/decrypt the original @@ -186,9 +193,9 @@ Docker Method and thus the one of the consumption directory. Furthermore, you can change the id of the default user as well using ``USERMAP_UID``. -5. Run ``docker-compose up -d``. This will create and start the necessary +6. Run ``docker-compose up -d``. This will create and start the necessary containers. -6. To be able to login, you will need a super user. To create it, execute the +7. To be able to login, you will need a super user. To create it, execute the following command: .. code-block:: shell-session @@ -197,11 +204,11 @@ Docker Method This will prompt you to set a username (default ``paperless``), an optional e-mail address and finally a password. -7. The default ``docker-compose.yml`` exports the webserver on your local port +8. The default ``docker-compose.yml`` exports the webserver on your local port 8000. If you haven't adapted this, you should now be able to visit your `Paperless webserver`_ at ``http://127.0.0.1:8000``. You can login with the user and password you just created. -8. Add files to consumption directory the way you prefer to. Following are two +9. Add files to consumption directory the way you prefer to. Following are two possible options: 1. 
Mount the consumption directory to a local host path by modifying your From 51b19f4c19fc38e45712c12f41fa86c8a7dac75f Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 20 Feb 2016 22:30:01 +0000 Subject: [PATCH 11/71] Issue #57 --- src/documents/consumer.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 21484036b..d6818cf5d 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -128,9 +128,11 @@ class Consumer(object): except OCRError: self._ignore.append(doc) Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER) + self._cleanup_tempdir(tempdir) continue - finally: - self._cleanup(tempdir, doc) + else: + self._cleanup_tempdir(tempdir) + self._cleanup_doc(doc) def _get_greyscale(self, tempdir, doc): @@ -146,8 +148,12 @@ class Consumer(object): "-type", "grayscale", doc, png )).wait() - pngs = [os.path.join(tempdir, f) for f in os.listdir(tempdir) if f.startswith("convert")] - return sorted(filter(lambda f: os.path.isfile(f), pngs)) + pngs = [] + for f in os.listdir(tempdir): + if f.startswith("convert"): + pngs.append(os.path.join(tempdir, f)) + + return sorted(filter(lambda __: os.path.isfile(__), pngs)) @staticmethod def _guess_language(text): @@ -308,12 +314,13 @@ class Consumer(object): Log.debug("Encrypting", Log.COMPONENT_CONSUMER) encrypted.write(GnuPG.encrypted(unencrypted)) - def _cleanup(self, tempdir, doc): - # Remove temporary directory recursively - Log.debug("Deleting directory {}".format(tempdir), Log.COMPONENT_CONSUMER) - shutil.rmtree(tempdir) + @staticmethod + def _cleanup_tempdir(d): + Log.debug("Deleting directory {}".format(d), Log.COMPONENT_CONSUMER) + shutil.rmtree(d) - # Remove doc + @staticmethod + def _cleanup_doc(doc): Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER) os.unlink(doc) From 422ae9303ac72dfad3fe53c598b1b53fc4d616c1 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sun, 
21 Feb 2016 00:14:50 +0000 Subject: [PATCH 12/71] pep8 --- src/documents/admin.py | 52 +++++++++++++------ src/documents/consumer.py | 11 ++-- src/documents/forms.py | 8 +-- src/documents/languages.py | 6 +-- .../management/commands/document_retagger.py | 4 +- .../management/commands/loaddata_stdin.py | 2 +- src/documents/mixins.py | 4 +- src/documents/models.py | 8 +-- src/documents/serialisers.py | 3 +- src/documents/tests/test_consumer.py | 4 +- src/documents/tests/test_tags.py | 1 - src/paperless/urls.py | 12 ++++- tox.ini | 14 +++++ 13 files changed, 89 insertions(+), 40 deletions(-) create mode 100644 tox.ini diff --git a/src/documents/admin.py b/src/documents/admin.py index d3bdd3ba4..42c3fc968 100644 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -56,26 +56,35 @@ class DocumentAdmin(admin.ModelAdmin): def tags_(self, obj): r = "" for tag in obj.tags.all(): - r += '{}'.format( - tag.get_colour_display(), - "{}?tags__id__exact={}".format( - reverse("admin:documents_document_changelist"), - tag.pk - ), - tag.slug + colour = tag.get_colour_display() + r += html_tag( + "a", + tag.slug, + **{ + "class": "tag", + "style": "background-color: {};".format(colour), + "href": "{}?tags__id__exact={}".format( + reverse("admin:documents_document_changelist"), + tag.pk + ) + } ) return r tags_.allow_tags = True def document(self, obj): - return '' \ - '{} icon' \ - ''.format( - obj.download_url, - static("documents/img/{}.png".format(obj.file_type)), - obj.file_type, - obj.file_name - ) + return html_tag( + "a", + html_tag( + "img", + src=static("documents/img/{}.png".format(obj.file_type)), + width=22, + height=22, + alt=obj.file_type, + title=obj.file_name + ), + href=obj.download_url + ) document.allow_tags = True admin.site.register(Sender) @@ -85,3 +94,16 @@ admin.site.register(Document, DocumentAdmin) # Unless we implement multi-user, these default registrations don't make sense. 
admin.site.unregister(Group) admin.site.unregister(User) + + +def html_tag(kind, inside=None, **kwargs): + + attributes = [] + for lft, rgt in kwargs.items(): + attributes.append('{}="{}"'.format(lft, rgt)) + + if inside is not None: + return "<{kind} {attributes}>{inside}".format( + kind=kind, attributes=" ".join(attributes), inside=inside) + + return "<{} {}/>".format(kind, " ".join(attributes)) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index d6818cf5d..6cf3b3d9d 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -127,7 +127,8 @@ class Consumer(object): self._store(text, doc) except OCRError: self._ignore.append(doc) - Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER) + Log.error( + "OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER) self._cleanup_tempdir(tempdir) continue else: @@ -190,8 +191,8 @@ class Consumer(object): Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER) if settings.FORGIVING_OCR: Log.warning( - "As FORGIVING_OCR is enabled, we're going to make the best " - "with what we have.", + "As FORGIVING_OCR is enabled, we're going to make the " + "best with what we have.", Log.COMPONENT_CONSUMER ) raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) @@ -246,8 +247,8 @@ class Consumer(object): def _guess_attributes_from_name(self, parseable): """ - We use a crude naming convention to make handling the sender, title, and - tags easier: + We use a crude naming convention to make handling the sender, title, + and tags easier: " - - <tags>.<suffix>" "<sender> - <title>.<suffix>" "<title>.<suffix>" diff --git a/src/documents/forms.py b/src/documents/forms.py index d544917b4..404be1763 100644 --- a/src/documents/forms.py +++ b/src/documents/forms.py @@ -26,15 +26,17 @@ class UploadForm(forms.Form): sender = forms.CharField( max_length=Sender._meta.get_field("name").max_length, required=False) title = forms.CharField( - 
max_length=Document._meta.get_field("title").max_length, required=False) + max_length=Document._meta.get_field("title").max_length, + required=False + ) document = forms.FileField() signature = forms.CharField(max_length=256) def clean_sender(self): """ I suppose it might look cleaner to use .get_or_create() here, but that - would also allow someone to fill up the db with bogus senders before all - validation was met. + would also allow someone to fill up the db with bogus senders before + all validation was met. """ sender = self.cleaned_data.get("sender") if not sender: diff --git a/src/documents/languages.py b/src/documents/languages.py index 2bfafe08a..5ea560654 100644 --- a/src/documents/languages.py +++ b/src/documents/languages.py @@ -185,10 +185,10 @@ ISO639 = { "yo": "yor", "za": "zha", - # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I have - # no idea which one is better, so I just picked the bigger file. + # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I + # have no idea which one is better, so I just picked the bigger file. "zh": "chi_tra", "zu": "zul" -} \ No newline at end of file +} diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py index 09a3fb917..8f56e1eea 100644 --- a/src/documents/management/commands/document_retagger.py +++ b/src/documents/management/commands/document_retagger.py @@ -10,8 +10,8 @@ class Command(Renderable, BaseCommand): help = """ Using the current set of tagging rules, apply said rules to all documents in the database, effectively allowing you to back-tag all - previously indexed documents with tags created (or modified) after their - initial import. + previously indexed documents with tags created (or modified) after + their initial import. 
""".replace(" ", "") def __init__(self, *args, **kwargs): diff --git a/src/documents/management/commands/loaddata_stdin.py b/src/documents/management/commands/loaddata_stdin.py index b6848f1eb..ca0b9ef7b 100644 --- a/src/documents/management/commands/loaddata_stdin.py +++ b/src/documents/management/commands/loaddata_stdin.py @@ -13,7 +13,7 @@ from django.core.management.commands.loaddata import Command as LoadDataCommand class Command(LoadDataCommand): def parse_name(self, fixture_name): - self.compression_formats['stdin'] = (lambda x,y: sys.stdin, None) + self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None) if fixture_name == '-': return '-', 'json', 'stdin' diff --git a/src/documents/mixins.py b/src/documents/mixins.py index 881589fa3..4d4e9783f 100644 --- a/src/documents/mixins.py +++ b/src/documents/mixins.py @@ -1,7 +1,7 @@ class Renderable(object): """ - A handy mixin to make it easier/cleaner to print output based on a verbosity - value. + A handy mixin to make it easier/cleaner to print output based on a + verbosity value. """ def _render(self, text, verbosity): diff --git a/src/documents/models.py b/src/documents/models.py index d4d95aa38..267bebffe 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -36,7 +36,7 @@ class Sender(SluggedModel): class Tag(SluggedModel): - + COLOURS = ( (1, "#a6cee3"), (2, "#1f78b4"), @@ -71,9 +71,9 @@ class Tag(SluggedModel): default=MATCH_ANY, help_text=( "Which algorithm you want to use when matching text to the OCR'd " - "PDF. Here, \"any\" looks for any occurrence of any word provided " - "in the PDF, while \"all\" requires that every word provided " - "appear in the PDF, albeit not in the order provided. A " + "PDF. Here, \"any\" looks for any occurrence of any word " + "provided in the PDF, while \"all\" requires that every word " + "provided appear in the PDF, albeit not in the order provided. 
A " "\"literal\" match means that the text you enter must appear in " "the PDF exactly as you've entered it, and \"regular expression\" " "uses a regex to match the PDF. If you don't know what a regex " diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index 209c778a1..f23a482c6 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -14,7 +14,8 @@ class TagSerializer(serializers.ModelSerializer): class Meta(object): model = Tag - fields = ("id", "slug", "name", "colour", "match", "matching_algorithm") + fields = ( + "id", "slug", "name", "colour", "match", "matching_algorithm") class DocumentSerializer(serializers.ModelSerializer): diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 7cee524c3..6db501e02 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -4,10 +4,10 @@ from ..consumer import Consumer class TestAttachment(TestCase): - + TAGS = ("tag1", "tag2", "tag3") CONSUMER = Consumer() - + def _test_guess_attributes_from_name(self, path, sender, title, tags): for suffix in ("pdf", "png", "jpg", "jpeg", "gif"): f = path.format(suffix) diff --git a/src/documents/tests/test_tags.py b/src/documents/tests/test_tags.py index f3518e012..e0ab43244 100644 --- a/src/documents/tests/test_tags.py +++ b/src/documents/tests/test_tags.py @@ -117,4 +117,3 @@ class TestTagMatching(TestCase): self.assertFalse(t.matches("I have alpha, charlie, and gamma in me")) self.assertFalse(t.matches("I have alphas, charlie, and gamma in me")) self.assertFalse(t.matches("I have alphas in me")) - diff --git a/src/paperless/urls.py b/src/paperless/urls.py index d8a48995d..5803a6685 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -29,10 +29,20 @@ router.register(r'tags', TagViewSet) router.register(r'documents', DocumentViewSet) urlpatterns = [ - url(r"^api/auth/", include('rest_framework.urls', namespace='rest_framework')), + + # API + url( + 
r"^api/auth/", + include('rest_framework.urls', namespace='rest_framework') + ), url(r"^api/", include(router.urls)), + + # File downloads url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"), + + # The Django admin url(r"", admin.site.urls), + ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) if settings.UPLOAD_SHARED_SECRET: diff --git a/tox.ini b/tox.ini new file mode 100644 index 000000000..360385de8 --- /dev/null +++ b/tox.ini @@ -0,0 +1,14 @@ +# Tox (http://tox.testrun.org/) is a tool for running tests +# in multiple virtualenvs. This configuration file will run the +# test suite on all supported python versions. To use it, "pip install tox" +# and then run "tox" from this directory. + +#[tox] +#envlist = py34, py35 + +#[testenv] +#commands = {envpython} src/manage.py test +#deps = + +[pep8] +exclude=migrations,src/paperless/settings.py From 440614eddc3f35db448f40426b4302f9cc218387 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 21 Feb 2016 00:29:21 +0000 Subject: [PATCH 13/71] Got tox working --- src/tox.ini | 23 +++++++++++++++++++++++ tox.ini | 14 -------------- 2 files changed, 23 insertions(+), 14 deletions(-) create mode 100644 src/tox.ini delete mode 100644 tox.ini diff --git a/src/tox.ini b/src/tox.ini new file mode 100644 index 000000000..962e39a19 --- /dev/null +++ b/src/tox.ini @@ -0,0 +1,23 @@ +# Tox (http://tox.testrun.org/) is a tool for running tests +# in multiple virtualenvs. This configuration file will run the +# test suite on all supported python versions. To use it, "pip install tox" +# and then run "tox" from this directory. 
+ +[tox] +skipsdist = True +envlist = py34, py35, pep8 + +[testenv] +commands = {envpython} manage.py test +deps = -r{toxinidir}/../requirements.txt +setenv = + PAPERLESS_CONSUME=/tmp/paperless/consume + PAPERLESS_PASSPHRASE=THISISNOTASECRET + PAPERLESS_SECRET=paperless + +[testenv:pep8] +commands=pep8 +deps=pep8 + +[pep8] +exclude=.tox,migrations,paperless/settings.py diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 360385de8..000000000 --- a/tox.ini +++ /dev/null @@ -1,14 +0,0 @@ -# Tox (http://tox.testrun.org/) is a tool for running tests -# in multiple virtualenvs. This configuration file will run the -# test suite on all supported python versions. To use it, "pip install tox" -# and then run "tox" from this directory. - -#[tox] -#envlist = py34, py35 - -#[testenv] -#commands = {envpython} src/manage.py test -#deps = - -[pep8] -exclude=migrations,src/paperless/settings.py From 809fb8fa1ff58c83e262d5470e8990a000c676a1 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 21 Feb 2016 00:29:59 +0000 Subject: [PATCH 14/71] Moved the default GNUPG home to /tmp for tox-friendliness --- requirements.txt | 1 + src/paperless/settings.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6dd8b32b5..25f4a0a40 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ Django==1.9 django-extensions==1.6.1 +djangorestframework==3.3.2 filemagic==1.6 langdetect==1.0.5 Pillow==3.0.0 diff --git a/src/paperless/settings.py b/src/paperless/settings.py index d31879110..07918882f 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -158,7 +158,7 @@ OCR_THREADS = os.environ.get("PAPERLESS_OCR_THREADS") FORGIVING_OCR = True # GNUPG needs a home directory for some reason -GNUPG_HOME = os.environ.get("HOME", "/dev/null") +GNUPG_HOME = os.environ.get("HOME", "/tmp") # Convert is part of the Imagemagick package CONVERT_BINARY = "/usr/bin/convert" From 
17d3a449525248e77327f22e1b141c76c2adfabc Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 21 Feb 2016 00:55:38 +0000 Subject: [PATCH 15/71] A crude API is in place --- src/documents/serialisers.py | 10 ++++++---- src/documents/views.py | 13 +++++++++++++ src/paperless/urls.py | 4 ++-- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index f23a482c6..345fa166d 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -3,14 +3,14 @@ from rest_framework import serializers from .models import Sender, Tag, Document -class SenderSerializer(serializers.ModelSerializer): +class SenderSerializer(serializers.HyperlinkedModelSerializer): class Meta(object): model = Sender fields = ("id", "slug", "name") -class TagSerializer(serializers.ModelSerializer): +class TagSerializer(serializers.HyperlinkedModelSerializer): class Meta(object): model = Tag @@ -20,8 +20,10 @@ class TagSerializer(serializers.ModelSerializer): class DocumentSerializer(serializers.ModelSerializer): - sender = serializers.HyperlinkedModelSerializer(read_only=True) - tags = serializers.HyperlinkedModelSerializer(read_only=True) + sender = serializers.HyperlinkedRelatedField( + read_only=True, view_name="drf:sender-detail", allow_null=True) + tags = serializers.HyperlinkedRelatedField( + read_only=True, view_name="drf:tag-detail", many=True) class Meta(object): model = Document diff --git a/src/documents/views.py b/src/documents/views.py index 45caf50e9..bcce6f677 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -3,6 +3,7 @@ from django.template.defaultfilters import slugify from django.views.decorators.csrf import csrf_exempt from django.views.generic import FormView, DetailView +from rest_framework.pagination import PageNumberPagination from rest_framework.viewsets import ModelViewSet from paperless.db import GnuPG @@ -57,16 +58,28 @@ class PushView(FormView): return 
HttpResponse("0") +class StandardPagination(PageNumberPagination): + page_size = 25 + page_size_query_param = "page-size" + max_page_size = 100000 + + class SenderViewSet(ModelViewSet): model = Sender + queryset = Sender.objects.all() serializer_class = SenderSerializer + pagination_class = StandardPagination class TagViewSet(ModelViewSet): model = Tag + queryset = Tag.objects.all() serializer_class = TagSerializer + pagination_class = StandardPagination class DocumentViewSet(ModelViewSet): model = Document + queryset = Document.objects.all() serializer_class = DocumentSerializer + pagination_class = StandardPagination diff --git a/src/paperless/urls.py b/src/paperless/urls.py index 5803a6685..b7ffe17dc 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -33,9 +33,9 @@ urlpatterns = [ # API url( r"^api/auth/", - include('rest_framework.urls', namespace='rest_framework') + include('rest_framework.urls', namespace="rest_framework") ), - url(r"^api/", include(router.urls)), + url(r"^api/", include(router.urls, namespace="drf")), # File downloads url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"), From 0d466430269d87555ab8ae7473106f11098b36a7 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 21 Feb 2016 01:24:30 +0000 Subject: [PATCH 16/71] Version bump --- docs/changelog.rst | 36 ++++++++++++++++++++++++++++++++++-- src/paperless/version.py | 2 +- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index c56e7a367..5fdd2143b 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,10 +1,28 @@ Changelog ######### +* 0.1.0 + + * Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and + `Tikitu de Jager`_ for this one, and especially to `Pit`_ + who spearheadded this effort. + * A simple REST API is in place, but it should be considered unstable. + * Cleaned up the consumer to use temporary directories instead of a single + scratch space. 
(Thanks `Pit`_) + * Improved the efficiency of the consumer by parsing pages more intelligently + and introducing a threaded OCR process (thanks again `Pit`_). + * `#45`_: Cleaned up the logic for tag matching. Reported by `darkmatter`_. + * `#47`_: Auto-rotate landscape documents. Reported by `Paul`_ and fixed by + `Pit`_. + * `#48`_: Matching algorithms should do so on a word boundary (`darkmatter`_) + * `#54`_: Documented the re-tagger (`zedster`_) + * `#57`_: Make sure file is preserved on import failure (`darkmatter`_) + * Added tox with pep8 checking + * 0.0.6 - * Added support for parallel OCR (significant work from pitkley) - * Sped up the language detection (significant work from pitkley) + * Added support for parallel OCR (significant work from `Pit`_) + * Sped up the language detection (significant work from `Pit`_) * Added simple logging * 0.0.5 @@ -35,3 +53,17 @@ Changelog * 0.0.1 * Initial release + +.. _Wayne Werner: https://github.com/waynew +.. _Brian Conn: https://github.com/TheConnMan +.. _Tikitu de Jager: https://github.com/tikitu +.. _Pit: https://github.com/pitkley +.. _Paul: https://github.com/polo2ro +.. _darkmatter: https://github.com/darkmatter +.. _zedster: https://github.com/zedster + +.. _#45: https://github.com/danielquinn/paperless/issues/45 +.. _#47: https://github.com/danielquinn/paperless/issues/47 +.. _#48: https://github.com/danielquinn/paperless/issues/48 +.. _#54: https://github.com/danielquinn/paperless/issues/54 +.. 
_#57: https://github.com/danielquinn/paperless/issues/57 diff --git a/src/paperless/version.py b/src/paperless/version.py index 7afad8b77..8e2c2d9ea 100644 --- a/src/paperless/version.py +++ b/src/paperless/version.py @@ -1 +1 @@ -__version__ = (0, 0, 6) +__version__ = (0, 1, 0) From c7787bc076b70e897f57bf137d224c80af08c840 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 21 Feb 2016 01:37:57 +0000 Subject: [PATCH 17/71] Let's see if I can get Travis CI working on the first try --- src/.travis.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 src/.travis.yml diff --git a/src/.travis.yml b/src/.travis.yml new file mode 100644 index 000000000..cd0985dd3 --- /dev/null +++ b/src/.travis.yml @@ -0,0 +1,10 @@ +language: python +sudo: false +env: + - TOXENV=py34 + - TOXENV=py35 + - TOXENV=pep8 +install: + - pip install --requirement ../requirements.txt + - pip install tox +script: tox From 55a7dc2444db87bb74aafc9d452fc03f1be58b35 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 21 Feb 2016 01:43:48 +0000 Subject: [PATCH 18/71] pep8 --- src/paperless/settings.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 07918882f..5d7cc3b2f 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -170,7 +170,8 @@ SCRATCH_DIR = "/tmp/paperless" CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME") # If you want to use IMAP mail consumption, populate this with useful values. -# If you leave HOST set to None, we assume you're not going to use this feature. +# If you leave HOST set to None, we assume you're not going to use this +# feature. 
MAIL_CONSUMPTION = { "HOST": os.environ.get("PAPERLESS_CONSUME_MAIL_HOST"), "PORT": os.environ.get("PAPERLESS_CONSUME_MAIL_PORT"), @@ -180,8 +181,8 @@ MAIL_CONSUMPTION = { "INBOX": "INBOX" # The name of the inbox on the server } -# This is used to encrypt the original documents and decrypt them later when you -# want to download them. Set it and change the permissions on this file to +# This is used to encrypt the original documents and decrypt them later when +# you want to download them. Set it and change the permissions on this file to # 0600, or set it to `None` and you'll be prompted for the passphrase at # runtime. The default looks for an environment variable. # DON'T FORGET TO SET THIS as leaving it blank may cause some strange things @@ -189,7 +190,7 @@ MAIL_CONSUMPTION = { # files. PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE") -# If you intend to use the "API" to push files into the consumer, you'll need to -# provide a shared secret here. Leaving this as the default will disable the -# API. +# If you intend to use the "API" to push files into the consumer, you'll need +# to provide a shared secret here. Leaving this as the default will disable +# the API. 
UPLOAD_SHARED_SECRET = os.environ.get("PAPERLESS_SECRET", "") From 6f7169d2d67f445637d12982a9f44c872bc7de89 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 21 Feb 2016 01:46:49 +0000 Subject: [PATCH 19/71] Travis integration: take 3 --- src/.travis.yml => .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename src/.travis.yml => .travis.yml (85%) diff --git a/src/.travis.yml b/.travis.yml similarity index 85% rename from src/.travis.yml rename to .travis.yml index cd0985dd3..1d461d255 100644 --- a/src/.travis.yml +++ b/.travis.yml @@ -7,4 +7,4 @@ env: install: - pip install --requirement ../requirements.txt - pip install tox -script: tox +script: tox -c src/tox.ini From e0b2d27e01090db5935887d9641b38411416aaeb Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 21 Feb 2016 01:50:04 +0000 Subject: [PATCH 20/71] Travis integration: take 4 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1d461d255..feba6c290 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,6 @@ env: - TOXENV=py35 - TOXENV=pep8 install: - - pip install --requirement ../requirements.txt + - pip install --requirement requirements.txt - pip install tox script: tox -c src/tox.ini From 300dc97e83adba66d330e9ed531bfa9f81a79856 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 21 Feb 2016 01:53:10 +0000 Subject: [PATCH 21/71] Travis integration: take 5 --- .travis.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.travis.yml b/.travis.yml index feba6c290..a83352daa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,8 @@ language: python sudo: false +python: + - "3.4" + - "3.5" env: - TOXENV=py34 - TOXENV=py35 From 5f0962bc3edf0987c79bb79143ca4e6e0cdfcb3c Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 21 Feb 2016 01:58:09 +0000 Subject: [PATCH 22/71] Travis integration: take 6 --- .travis.yml | 22 
+++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index a83352daa..6d3d5d217 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,13 +1,17 @@ language: python + sudo: false -python: - - "3.4" - - "3.5" -env: - - TOXENV=py34 - - TOXENV=py35 - - TOXENV=pep8 + +matrix: + include: + - python: 3.4 + env: TOXENV=py34 + - python: 3.5 + env: TOXENV=py35 + - env: TOXENV=pep8 + install: - - pip install --requirement requirements.txt - - pip install tox + - pip install --requirement requirements.txt + - pip install tox + script: tox -c src/tox.ini From 3a7923e32dba6d76949788cecd361e6f19df04d4 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 21 Feb 2016 02:24:05 +0000 Subject: [PATCH 23/71] Moved pyocr.get_available_tools() into a method --- .travis.yml | 3 ++- src/documents/consumer.py | 27 +++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6d3d5d217..dcaaeab8d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,8 @@ matrix: env: TOXENV=py34 - python: 3.5 env: TOXENV=py35 - - env: TOXENV=pep8 + - python: 3.5 + env: TOXENV=pep8 install: - pip install --requirement requirements.txt diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 6cf3b3d9d..2bd47c6da 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -26,18 +26,6 @@ from .models import Sender, Tag, Document from .languages import ISO639 -def image_to_string(args): - self, png, lang = args - with Image.open(os.path.join(self.SCRATCH, png)) as f: - if self.OCR.can_detect_orientation(): - try: - orientation = self.OCR.detect_orientation(f, lang=lang) - f = f.rotate(orientation["angle"], expand=1) - except TesseractError: - pass - return self.OCR.image_to_string(f, lang=lang) - - class OCRError(Exception): pass @@ -61,7 +49,6 @@ class Consumer(object): CONSUME = settings.CONSUMPTION_DIR THREADS = int(settings.OCR_THREADS) if 
settings.OCR_THREADS else None - OCR = pyocr.get_available_tools()[0] DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE REGEX_TITLE = re.compile( @@ -239,12 +226,24 @@ class Consumer(object): with Pool(processes=self.THREADS) as pool: r = pool.map( - image_to_string, itertools.product([self], pngs, [lang])) + self.image_to_string, itertools.product(pngs, [lang])) r = " ".join(r) # Strip out excess white space to allow matching to go smoother return re.sub(r"\s+", " ", r) + def image_to_string(self, args): + png, lang = args + ocr = pyocr.get_available_tools()[0] + with Image.open(os.path.join(self.SCRATCH, png)) as f: + if ocr.can_detect_orientation(): + try: + orientation = ocr.detect_orientation(f, lang=lang) + f = f.rotate(orientation["angle"], expand=1) + except TesseractError: + pass + return ocr.image_to_string(f, lang=lang) + def _guess_attributes_from_name(self, parseable): """ We use a crude naming convention to make handling the sender, title, From 312cb523d817da7122444dbb7dd6abbdc72a183f Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 21 Feb 2016 02:30:39 +0000 Subject: [PATCH 24/71] /tmp is probably better than /tmp/paperless/consume --- src/tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tox.ini b/src/tox.ini index 962e39a19..1840b507e 100644 --- a/src/tox.ini +++ b/src/tox.ini @@ -11,7 +11,7 @@ envlist = py34, py35, pep8 commands = {envpython} manage.py test deps = -r{toxinidir}/../requirements.txt setenv = - PAPERLESS_CONSUME=/tmp/paperless/consume + PAPERLESS_CONSUME=/tmp PAPERLESS_PASSPHRASE=THISISNOTASECRET PAPERLESS_SECRET=paperless From b400c24dc8cb233c496b14d56f8df8c739df4975 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 21 Feb 2016 02:32:47 +0000 Subject: [PATCH 25/71] Adding travis badge --- README.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index cf9d387cc..0aba0545e 100644 --- a/README.rst +++ 
b/README.rst @@ -3,6 +3,7 @@ Paperless |Documentation| |Chat| +|Travis| Scan, index, and archive all of your paper documents @@ -105,4 +106,5 @@ home. .. |Chat| image:: https://badges.gitter.im/danielquinn/paperless.svg :alt: Join the chat at https://gitter.im/danielquinn/paperless :target: https://gitter.im/danielquinn/paperless?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge - +.. |Travis| image:: https://travis-ci.org/danielquinn/paperless.svg?branch=master + :target: https://travis-ci.org/danielquinn/paperless From 1aecb1e63a9fede4998612df8024990ba940cbfe Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Tue, 23 Feb 2016 20:15:13 +0000 Subject: [PATCH 26/71] Compensate for case and format of jpg vs. jpeg --- src/documents/consumer.py | 17 ++++++++++++++--- src/documents/tests/test_consumer.py | 12 ++++++++++-- src/documents/views.py | 2 +- src/paperless/urls.py | 4 ++-- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 2bd47c6da..ddbe474a7 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -264,6 +264,12 @@ class Consumer(object): Tag.objects.get_or_create(slug=t, defaults={"name": t})[0]) return tuple(r) + def get_suffix(suffix): + suffix = suffix.lower() + if suffix == "jpeg": + return "jpg" + return suffix + # First attempt: "<sender> - <title> - <tags>.<suffix>" m = re.match(self.REGEX_SENDER_TITLE_TAGS, parseable) if m: @@ -271,17 +277,22 @@ class Consumer(object): get_sender(m.group(1)), m.group(2), get_tags(m.group(3)), - m.group(4) + get_suffix(m.group(4)) ) # Second attempt: "<sender> - <title>.<suffix>" m = re.match(self.REGEX_SENDER_TITLE, parseable) if m: - return get_sender(m.group(1)), m.group(2), (), m.group(3) + return ( + get_sender(m.group(1)), + m.group(2), + (), + get_suffix(m.group(3)) + ) # That didn't work, so we assume sender and tags are None m = re.match(self.REGEX_TITLE, parseable) - return 
None, m.group(1), (), m.group(2) + return None, m.group(1), (), get_suffix(m.group(2)) def _store(self, text, doc): diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 6db501e02..04f92f98c 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -7,15 +7,23 @@ class TestAttachment(TestCase): TAGS = ("tag1", "tag2", "tag3") CONSUMER = Consumer() + SUFFIXES = ( + "pdf", "png", "jpg", "jpeg", "gif", + "PDF", "PNG", "JPG", "JPEG", "GIF", + "PdF", "PnG", "JpG", "JPeG", "GiF", + ) def _test_guess_attributes_from_name(self, path, sender, title, tags): - for suffix in ("pdf", "png", "jpg", "jpeg", "gif"): + for suffix in self.SUFFIXES: f = path.format(suffix) results = self.CONSUMER._guess_attributes_from_name(f) self.assertEqual(results[0].name, sender, f) self.assertEqual(results[1], title, f) self.assertEqual(tuple([t.slug for t in results[2]]), tags, f) - self.assertEqual(results[3], suffix, f) + if suffix.lower() == "jpeg": + self.assertEqual(results[3], "jpg", f) + else: + self.assertEqual(results[3], suffix.lower(), f) def test_guess_attributes_from_name0(self): self._test_guess_attributes_from_name( diff --git a/src/documents/views.py b/src/documents/views.py index bcce6f677..26642c9fc 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -13,7 +13,7 @@ from .models import Sender, Tag, Document from .serialisers import SenderSerializer, TagSerializer, DocumentSerializer -class PdfView(DetailView): +class FetchView(DetailView): model = Document diff --git a/src/paperless/urls.py b/src/paperless/urls.py index b7ffe17dc..fd1af065d 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -21,7 +21,7 @@ from django.contrib import admin from rest_framework.routers import DefaultRouter from documents.views import ( - PdfView, PushView, SenderViewSet, TagViewSet, DocumentViewSet) + FetchView, PushView, SenderViewSet, TagViewSet, DocumentViewSet) router = DefaultRouter() 
router.register(r'senders', SenderViewSet) @@ -38,7 +38,7 @@ urlpatterns = [ url(r"^api/", include(router.urls, namespace="drf")), # File downloads - url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"), + url(r"^fetch/(?P<pk>\d+)$", FetchView.as_view(), name="fetch"), # The Django admin url(r"", admin.site.urls), From e149baec4ecf70f791981dd516f7eb0e212811fd Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Tue, 23 Feb 2016 20:30:33 +0000 Subject: [PATCH 27/71] Update for #53 --- docs/changelog.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/changelog.rst b/docs/changelog.rst index 5fdd2143b..86f365653 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,11 @@ Changelog ######### +* 0.1.1 (master) + + * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images + to be imported but made unavailable. + * 0.1.0 * Docker support! Big thanks to `Wayne Werner`_, `Brian Conn`_, and @@ -65,5 +70,6 @@ Changelog .. _#45: https://github.com/danielquinn/paperless/issues/45 .. _#47: https://github.com/danielquinn/paperless/issues/47 .. _#48: https://github.com/danielquinn/paperless/issues/48 +.. _#53: https://github.com/danielquinn/paperless/issues/53 .. _#54: https://github.com/danielquinn/paperless/issues/54 .. 
_#57: https://github.com/danielquinn/paperless/issues/57 From df1741e1fa1d43931ee21e0eb71ed497a97da512 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Fri, 26 Feb 2016 11:21:14 +0000 Subject: [PATCH 28/71] Added a time to the logger admin --- src/logger/admin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/logger/admin.py b/src/logger/admin.py index dc9446821..b949f1908 100644 --- a/src/logger/admin.py +++ b/src/logger/admin.py @@ -5,7 +5,7 @@ from .models import Log class LogAdmin(admin.ModelAdmin): - list_display = ("message", "level", "component") + list_display = ("message", "time", "level", "component") list_filter = ("level", "component",) From 7b9e55d2085fd2830743b72c1a38b28176679360 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Fri, 26 Feb 2016 11:21:24 +0000 Subject: [PATCH 29/71] Software updates --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 25f4a0a40..810af8ec2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ -Django==1.9 +Django==1.9.2 django-extensions==1.6.1 djangorestframework==3.3.2 filemagic==1.6 langdetect==1.0.5 -Pillow==3.0.0 +Pillow==3.1.1 pyocr==0.3.1 python-dateutil==2.4.2 python-gnupg==0.3.8 From 2fe9b0cbc129929127482771f5c4053a6ae09cc6 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sat, 27 Feb 2016 20:18:50 +0000 Subject: [PATCH 30/71] New logging appears to work --- docs/changelog.rst | 2 + src/documents/admin.py | 43 +++--- src/documents/consumer.py | 109 ++++++++------- src/documents/loggers.py | 30 +++++ src/documents/mail.py | 48 ++++--- .../migrations/0010_log.py} | 18 +-- src/documents/models.py | 32 +++++ src/documents/tests/test_logger.py | 124 ++++++++++++++++++ src/logger/__init__.py | 0 src/logger/admin.py | 12 -- src/logger/apps.py | 5 - src/logger/migrations/__init__.py | 0 src/logger/models.py | 53 -------- 
src/logger/tests.py | 3 - src/logger/views.py | 3 - src/paperless/settings.py | 52 +++++--- 16 files changed, 346 insertions(+), 188 deletions(-) create mode 100644 src/documents/loggers.py rename src/{logger/migrations/0001_initial.py => documents/migrations/0010_log.py} (57%) create mode 100644 src/documents/tests/test_logger.py delete mode 100644 src/logger/__init__.py delete mode 100644 src/logger/admin.py delete mode 100644 src/logger/apps.py delete mode 100644 src/logger/migrations/__init__.py delete mode 100644 src/logger/models.py delete mode 100644 src/logger/tests.py delete mode 100644 src/logger/views.py diff --git a/docs/changelog.rst b/docs/changelog.rst index 86f365653..cdb720926 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -3,6 +3,7 @@ Changelog * 0.1.1 (master) + * `#60`_: Setup logging to actually use the Python native logging framework. * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images to be imported but made unavailable. @@ -73,3 +74,4 @@ Changelog .. _#53: https://github.com/danielquinn/paperless/issues/53 .. _#54: https://github.com/danielquinn/paperless/issues/54 .. _#57: https://github.com/danielquinn/paperless/issues/57 +.. 
_#60: https://github.com/danielquinn/paperless/issues/60 diff --git a/src/documents/admin.py b/src/documents/admin.py index 42c3fc968..118a295eb 100644 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -3,7 +3,7 @@ from django.contrib.auth.models import User, Group from django.core.urlresolvers import reverse from django.templatetags.static import static -from .models import Sender, Tag, Document +from .models import Sender, Tag, Document, Log class MonthListFilter(admin.SimpleListFilter): @@ -57,7 +57,7 @@ class DocumentAdmin(admin.ModelAdmin): r = "" for tag in obj.tags.all(): colour = tag.get_colour_display() - r += html_tag( + r += self._html_tag( "a", tag.slug, **{ @@ -73,9 +73,9 @@ class DocumentAdmin(admin.ModelAdmin): tags_.allow_tags = True def document(self, obj): - return html_tag( + return self._html_tag( "a", - html_tag( + self._html_tag( "img", src=static("documents/img/{}.png".format(obj.file_type)), width=22, @@ -87,23 +87,32 @@ class DocumentAdmin(admin.ModelAdmin): ) document.allow_tags = True + @staticmethod + def _html_tag(kind, inside=None, **kwargs): + + attributes = [] + for lft, rgt in kwargs.items(): + attributes.append('{}="{}"'.format(lft, rgt)) + + if inside is not None: + return "<{kind} {attributes}>{inside}</{kind}>".format( + kind=kind, attributes=" ".join(attributes), inside=inside) + + return "<{} {}/>".format(kind, " ".join(attributes)) + + +class LogAdmin(admin.ModelAdmin): + + list_display = ("message", "level", "component") + list_filter = ("level", "component",) + + admin.site.register(Sender) admin.site.register(Tag, TagAdmin) admin.site.register(Document, DocumentAdmin) +admin.site.register(Log, LogAdmin) + # Unless we implement multi-user, these default registrations don't make sense. 
admin.site.unregister(Group) admin.site.unregister(User) - - -def html_tag(kind, inside=None, **kwargs): - - attributes = [] - for lft, rgt in kwargs.items(): - attributes.append('{}="{}"'.format(lft, rgt)) - - if inside is not None: - return "<{kind} {attributes}>{inside}</{kind}>".format( - kind=kind, attributes=" ".join(attributes), inside=inside) - - return "<{} {}/>".format(kind, " ".join(attributes)) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index ddbe474a7..37b348495 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,5 +1,8 @@ import datetime +import logging import tempfile +import uuid + from multiprocessing.pool import Pool import itertools @@ -19,10 +22,9 @@ from django.utils import timezone from django.template.defaultfilters import slugify from pyocr.tesseract import TesseractError -from logger.models import Log from paperless.db import GnuPG -from .models import Sender, Tag, Document +from .models import Sender, Tag, Document, Log from .languages import ISO639 @@ -67,6 +69,8 @@ class Consumer(object): def __init__(self, verbosity=1): self.verbosity = verbosity + self.logger = logging.getLogger(__name__) + self.logging_group = None try: os.makedirs(self.SCRATCH) @@ -86,6 +90,12 @@ class Consumer(object): raise ConsumerError( "Consumption directory {} does not exist".format(self.CONSUME)) + def log(self, level, message): + getattr(self.logger, level)(message, extra={ + "group": self.logging_group, + "component": Log.COMPONENT_CONSUMER + }) + def consume(self): for doc in os.listdir(self.CONSUME): @@ -104,7 +114,9 @@ class Consumer(object): if self._is_ready(doc): continue - Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER) + self.logging_group = uuid.uuid4() + + self.log("info", "Consuming {}".format(doc)) tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) pngs = self._get_greyscale(tempdir, doc) @@ -114,8 +126,7 @@ class Consumer(object): self._store(text, doc) except OCRError: 
self._ignore.append(doc) - Log.error( - "OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER) + self.log("error", "OCR FAILURE: {}".format(doc)) self._cleanup_tempdir(tempdir) continue else: @@ -124,10 +135,7 @@ class Consumer(object): def _get_greyscale(self, tempdir, doc): - Log.debug( - "Generating greyscale image from {}".format(doc), - Log.COMPONENT_CONSUMER - ) + self.log("info", "Generating greyscale image from {}".format(doc)) png = os.path.join(tempdir, "convert-%04d.jpg") @@ -143,18 +151,13 @@ class Consumer(object): return sorted(filter(lambda __: os.path.isfile(__), pngs)) - @staticmethod - def _guess_language(text): + def _guess_language(self, text): try: guess = langdetect.detect(text) - Log.debug( - "Language detected: {}".format(guess), - Log.COMPONENT_CONSUMER - ) + self.log("debug", "Language detected: {}".format(guess)) return guess except Exception as e: - Log.warning( - "Language detection error: {}".format(e), Log.COMPONENT_MAIL) + self.log("warning", "Language detection error: {}".format(e)) def _get_ocr(self, pngs): """ @@ -165,7 +168,7 @@ class Consumer(object): if not pngs: raise OCRError - Log.debug("OCRing the document", Log.COMPONENT_CONSUMER) + self.log("info", "OCRing the document") # Since the division gets rounded down by int, this calculation works # for every edge-case, i.e. 1 @@ -175,12 +178,12 @@ class Consumer(object): guessed_language = self._guess_language(raw_text) if not guessed_language or guessed_language not in ISO639: - Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER) + self.log("warning", "Language detection failed!") if settings.FORGIVING_OCR: - Log.warning( + self.log( + "warning", "As FORGIVING_OCR is enabled, we're going to make the " - "best with what we have.", - Log.COMPONENT_CONSUMER + "best with what we have." 
) raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) return raw_text @@ -194,12 +197,12 @@ class Consumer(object): return self._ocr(pngs, ISO639[guessed_language]) except pyocr.pyocr.tesseract.TesseractError: if settings.FORGIVING_OCR: - Log.warning( + self.log( + "warning", "OCR for {} failed, but we're going to stick with what " "we've got since FORGIVING_OCR is enabled.".format( guessed_language - ), - Log.COMPONENT_CONSUMER + ) ) raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) return raw_text @@ -222,28 +225,15 @@ class Consumer(object): if not pngs: return "" - Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER) + self.log("info", "Parsing for {}".format(lang)) with Pool(processes=self.THREADS) as pool: - r = pool.map( - self.image_to_string, itertools.product(pngs, [lang])) + r = pool.map(image_to_string, itertools.product(pngs, [lang])) r = " ".join(r) # Strip out excess white space to allow matching to go smoother return re.sub(r"\s+", " ", r) - def image_to_string(self, args): - png, lang = args - ocr = pyocr.get_available_tools()[0] - with Image.open(os.path.join(self.SCRATCH, png)) as f: - if ocr.can_detect_orientation(): - try: - orientation = ocr.detect_orientation(f, lang=lang) - f = f.rotate(orientation["angle"], expand=1) - except TesseractError: - pass - return ocr.image_to_string(f, lang=lang) - def _guess_attributes_from_name(self, parseable): """ We use a crude naming convention to make handling the sender, title, @@ -301,7 +291,7 @@ class Consumer(object): stats = os.stat(doc) - Log.debug("Saving record to database", Log.COMPONENT_CONSUMER) + self.log("debug", "Saving record to database") document = Document.objects.create( sender=sender, @@ -316,23 +306,22 @@ class Consumer(object): if relevant_tags: tag_names = ", ".join([t.slug for t in relevant_tags]) - Log.debug( - "Tagging with {}".format(tag_names), Log.COMPONENT_CONSUMER) + self.log("debug", "Tagging with {}".format(tag_names)) 
document.tags.add(*relevant_tags) with open(doc, "rb") as unencrypted: with open(document.source_path, "wb") as encrypted: - Log.debug("Encrypting", Log.COMPONENT_CONSUMER) + self.log("debug", "Encrypting") encrypted.write(GnuPG.encrypted(unencrypted)) - @staticmethod - def _cleanup_tempdir(d): - Log.debug("Deleting directory {}".format(d), Log.COMPONENT_CONSUMER) + self.log("info", "Completed") + + def _cleanup_tempdir(self, d): + self.log("debug", "Deleting directory {}".format(d)) shutil.rmtree(d) - @staticmethod - def _cleanup_doc(doc): - Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER) + def _cleanup_doc(self, doc): + self.log("debug", "Deleting document {}".format(doc)) os.unlink(doc) def _is_ready(self, doc): @@ -350,3 +339,23 @@ class Consumer(object): self.stats[doc] = t return False + + +def image_to_string(args): + """ + I have no idea why, but if this function were a method of Consumer, it + would explode with: + + `TypeError: cannot serialize '_io.TextIOWrapper' object`. + """ + + png, lang = args + ocr = pyocr.get_available_tools()[0] + with Image.open(os.path.join(Consumer.SCRATCH, png)) as f: + if ocr.can_detect_orientation(): + try: + orientation = ocr.detect_orientation(f, lang=lang) + f = f.rotate(orientation["angle"], expand=1) + except TesseractError: + pass + return ocr.image_to_string(f, lang=lang) diff --git a/src/documents/loggers.py b/src/documents/loggers.py new file mode 100644 index 000000000..3464478cc --- /dev/null +++ b/src/documents/loggers.py @@ -0,0 +1,30 @@ +import logging + + +class PaperlessLogger(logging.StreamHandler): + """ + A logger smart enough to know to log some kinds of messages to the database + for later retrieval in a pretty interface. 
+ """ + + def emit(self, record): + + logging.StreamHandler.emit(self, record) + + if not hasattr(record, "component"): + return + + # We have to do the import here or Django will barf when it tries to + # load this because the apps aren't loaded at that point + from .models import Log + + kwargs = { + "message": record.msg, + "component": record.component, + "level": record.levelno, + } + + if hasattr(record, "group"): + kwargs["group"] = record.group + + Log.objects.create(**kwargs) diff --git a/src/documents/mail.py b/src/documents/mail.py index 384567e60..cc987bf64 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -1,8 +1,10 @@ import datetime import imaplib +import logging import os import re import time +import uuid from base64 import b64decode from email import policy @@ -11,10 +13,8 @@ from dateutil import parser from django.conf import settings -from logger.models import Log - from .consumer import Consumer -from .models import Sender +from .models import Sender, Log class MailFetcherError(Exception): @@ -25,7 +25,20 @@ class InvalidMessageError(Exception): pass -class Message(object): +class Loggable(object): + + def __init__(self, group=None): + self.logger = logging.getLogger(__name__) + self.logging_group = group or uuid.uuid4() + + def log(self, level, message): + getattr(self.logger, level)(message, extra={ + "group": self.logging_group, + "component": Log.COMPONENT_MAIL + }) + + +class Message(Loggable): """ A crude, but simple email message class. We assume that there's a subject and n attachments, and that we don't care about the message body. 
@@ -33,13 +46,13 @@ class Message(object): SECRET = settings.UPLOAD_SHARED_SECRET - def __init__(self, data, verbosity=1): + def __init__(self, data, group=None): """ Cribbed heavily from https://www.ianlewis.org/en/parsing-email-attachments-python """ - self.verbosity = verbosity + Loggable.__init__(self, group=group) self.subject = None self.time = None @@ -54,8 +67,7 @@ class Message(object): self._set_time(message) - Log.info( - 'Importing email: "{}"'.format(self.subject), Log.COMPONENT_MAIL) + self.log("info", 'Importing email: "{}"'.format(self.subject)) attachments = [] for part in message.walk(): @@ -134,9 +146,11 @@ class Attachment(object): return self.data -class MailFetcher(object): +class MailFetcher(Loggable): - def __init__(self, verbosity=1): + def __init__(self): + + Loggable.__init__(self) self._connection = None self._host = settings.MAIL_CONSUMPTION["HOST"] @@ -148,7 +162,6 @@ class MailFetcher(object): self._enabled = bool(self._host) self.last_checked = datetime.datetime.now() - self.verbosity = verbosity def pull(self): """ @@ -159,14 +172,11 @@ class MailFetcher(object): if self._enabled: - Log.info("Checking mail", Log.COMPONENT_MAIL) + self.log("info", "Checking mail") for message in self._get_messages(): - Log.debug( - 'Storing email: "{}"'.format(message.subject), - Log.COMPONENT_MAIL - ) + self.log("info", 'Storing email: "{}"'.format(message.subject)) t = int(time.mktime(message.time.timetuple())) file_name = os.path.join(Consumer.CONSUME, message.file_name) @@ -193,7 +203,7 @@ class MailFetcher(object): self._connection.logout() except Exception as e: - Log.error(e, Log.COMPONENT_MAIL) + self.log("error", str(e)) return r @@ -218,9 +228,9 @@ class MailFetcher(object): message = None try: - message = Message(data[0][1], self.verbosity) + message = Message(data[0][1], self.logging_group) except InvalidMessageError as e: - Log.error(e, Log.COMPONENT_MAIL) + self.log("error", str(e)) else: self._connection.store(num, "+FLAGS", 
"\\Deleted") diff --git a/src/logger/migrations/0001_initial.py b/src/documents/migrations/0010_log.py similarity index 57% rename from src/logger/migrations/0001_initial.py rename to src/documents/migrations/0010_log.py index 029fe43c2..57cf804b7 100644 --- a/src/logger/migrations/0001_initial.py +++ b/src/documents/migrations/0010_log.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Generated by Django 1.9 on 2016-02-14 16:08 +# Generated by Django 1.9 on 2016-02-27 17:54 from __future__ import unicode_literals from django.db import migrations, models @@ -7,9 +7,8 @@ from django.db import migrations, models class Migration(migrations.Migration): - initial = True - dependencies = [ + ('documents', '0009_auto_20160214_0040'), ] operations = [ @@ -17,14 +16,15 @@ class Migration(migrations.Migration): name='Log', fields=[ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('time', models.DateTimeField(auto_now_add=True)), + ('group', models.UUIDField(blank=True)), ('message', models.TextField()), - ('level', models.PositiveIntegerField(choices=[(1, 'Error'), (2, 'Warning'), (3, 'Informational'), (4, 'Debugging')], default=3)), + ('level', models.PositiveIntegerField(choices=[(10, 'Debugging'), (20, 'Informational'), (30, 'Warning'), (40, 'Error'), (50, 'Critical')], default=20)), ('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])), + ('created', models.DateTimeField(auto_now_add=True)), + ('modified', models.DateTimeField(auto_now=True)), ], - ), - migrations.AlterModelOptions( - name='log', - options={'ordering': ('-time',)}, + options={ + 'ordering': ('-modified',), + }, ), ] diff --git a/src/documents/models.py b/src/documents/models.py index 267bebffe..91dd458ea 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -1,3 +1,4 @@ +import logging import os import re @@ -187,3 +188,34 @@ class Document(models.Model): @property def download_url(self): return 
reverse("fetch", kwargs={"pk": self.pk}) + + +class Log(models.Model): + + LEVELS = ( + (logging.DEBUG, "Debugging"), + (logging.INFO, "Informational"), + (logging.WARNING, "Warning"), + (logging.ERROR, "Error"), + (logging.CRITICAL, "Critical"), + ) + + COMPONENT_CONSUMER = 1 + COMPONENT_MAIL = 2 + COMPONENTS = ( + (COMPONENT_CONSUMER, "Consumer"), + (COMPONENT_MAIL, "Mail Fetcher") + ) + + group = models.UUIDField(blank=True) + message = models.TextField() + level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO) + component = models.PositiveIntegerField(choices=COMPONENTS) + created = models.DateTimeField(auto_now_add=True) + modified = models.DateTimeField(auto_now=True) + + class Meta(object): + ordering = ("-modified",) + + def __str__(self): + return self.message diff --git a/src/documents/tests/test_logger.py b/src/documents/tests/test_logger.py new file mode 100644 index 000000000..d5527d7c6 --- /dev/null +++ b/src/documents/tests/test_logger.py @@ -0,0 +1,124 @@ +import logging +import uuid + +from unittest import mock + +from django.test import TestCase + +from ..models import Log + + +class TestPaperlessLog(TestCase): + + def __init__(self, *args, **kwargs): + TestCase.__init__(self, *args, **kwargs) + self.logger = logging.getLogger( + "documents.management.commands.document_consumer") + + def test_ignored(self): + with mock.patch("logging.StreamHandler.emit") as __: + self.assertEqual(Log.objects.all().count(), 0) + self.logger.info("This is an informational message") + self.logger.warning("This is an informational message") + self.logger.error("This is an informational message") + self.logger.critical("This is an informational message") + self.assertEqual(Log.objects.all().count(), 0) + + def test_that_it_saves_at_all(self): + + kw = { + "group": uuid.uuid4(), + "component": Log.COMPONENT_MAIL + } + + self.assertEqual(Log.objects.all().count(), 0) + + with mock.patch("logging.StreamHandler.emit") as __: + + # Debug messages are 
ignored by default + self.logger.debug("This is a debugging message", extra=kw) + self.assertEqual(Log.objects.all().count(), 0) + + self.logger.info("This is an informational message", extra=kw) + self.assertEqual(Log.objects.all().count(), 1) + + self.logger.warning("This is an warning message", extra=kw) + self.assertEqual(Log.objects.all().count(), 2) + + self.logger.error("This is an error message", extra=kw) + self.assertEqual(Log.objects.all().count(), 3) + + self.logger.critical("This is a critical message", extra=kw) + self.assertEqual(Log.objects.all().count(), 4) + + def test_groups(self): + + kw1 = { + "group": uuid.uuid4(), + "component": Log.COMPONENT_MAIL + } + kw2 = { + "group": uuid.uuid4(), + "component": Log.COMPONENT_MAIL + } + + self.assertEqual(Log.objects.all().count(), 0) + + with mock.patch("logging.StreamHandler.emit") as __: + + # Debug messages are ignored by default + self.logger.debug("This is a debugging message", extra=kw1) + self.assertEqual(Log.objects.all().count(), 0) + + self.logger.info("This is an informational message", extra=kw2) + self.assertEqual(Log.objects.all().count(), 1) + self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 1) + + self.logger.warning("This is an warning message", extra=kw1) + self.assertEqual(Log.objects.all().count(), 2) + self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 1) + + self.logger.error("This is an error message", extra=kw2) + self.assertEqual(Log.objects.all().count(), 3) + self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 2) + + self.logger.critical("This is a critical message", extra=kw1) + self.assertEqual(Log.objects.all().count(), 4) + self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 2) + + def test_components(self): + + c1 = Log.COMPONENT_CONSUMER + c2 = Log.COMPONENT_MAIL + kw1 = { + "group": uuid.uuid4(), + "component": c1 + } + kw2 = { + "group": kw1["group"], + "component": c2 + } + + 
self.assertEqual(Log.objects.all().count(), 0) + + with mock.patch("logging.StreamHandler.emit") as __: + + # Debug messages are ignored by default + self.logger.debug("This is a debugging message", extra=kw1) + self.assertEqual(Log.objects.all().count(), 0) + + self.logger.info("This is an informational message", extra=kw2) + self.assertEqual(Log.objects.all().count(), 1) + self.assertEqual(Log.objects.filter(component=c2).count(), 1) + + self.logger.warning("This is an warning message", extra=kw1) + self.assertEqual(Log.objects.all().count(), 2) + self.assertEqual(Log.objects.filter(component=c1).count(), 1) + + self.logger.error("This is an error message", extra=kw2) + self.assertEqual(Log.objects.all().count(), 3) + self.assertEqual(Log.objects.filter(component=c2).count(), 2) + + self.logger.critical("This is a critical message", extra=kw1) + self.assertEqual(Log.objects.all().count(), 4) + self.assertEqual(Log.objects.filter(component=c1).count(), 2) diff --git a/src/logger/__init__.py b/src/logger/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/logger/admin.py b/src/logger/admin.py deleted file mode 100644 index dc9446821..000000000 --- a/src/logger/admin.py +++ /dev/null @@ -1,12 +0,0 @@ -from django.contrib import admin - -from .models import Log - - -class LogAdmin(admin.ModelAdmin): - - list_display = ("message", "level", "component") - list_filter = ("level", "component",) - - -admin.site.register(Log, LogAdmin) diff --git a/src/logger/apps.py b/src/logger/apps.py deleted file mode 100644 index 2c1a7d735..000000000 --- a/src/logger/apps.py +++ /dev/null @@ -1,5 +0,0 @@ -from django.apps import AppConfig - - -class LoggerConfig(AppConfig): - name = 'logger' diff --git a/src/logger/migrations/__init__.py b/src/logger/migrations/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/logger/models.py b/src/logger/models.py deleted file mode 100644 index f7f2c421a..000000000 --- 
a/src/logger/models.py +++ /dev/null @@ -1,53 +0,0 @@ -from django.db import models - - -class Log(models.Model): - - LEVEL_ERROR = 1 - LEVEL_WARNING = 2 - LEVEL_INFO = 3 - LEVEL_DEBUG = 4 - LEVELS = ( - (LEVEL_ERROR, "Error"), - (LEVEL_WARNING, "Warning"), - (LEVEL_INFO, "Informational"), - (LEVEL_DEBUG, "Debugging"), - ) - - COMPONENT_CONSUMER = 1 - COMPONENT_MAIL = 2 - COMPONENTS = ( - (COMPONENT_CONSUMER, "Consumer"), - (COMPONENT_MAIL, "Mail Fetcher") - ) - - time = models.DateTimeField(auto_now_add=True) - message = models.TextField() - level = models.PositiveIntegerField(choices=LEVELS, default=LEVEL_INFO) - component = models.PositiveIntegerField(choices=COMPONENTS) - - class Meta(object): - ordering = ("-time",) - - def __str__(self): - return self.message - - @classmethod - def error(cls, message, component): - cls.objects.create( - message=message, level=cls.LEVEL_ERROR, component=component) - - @classmethod - def warning(cls, message, component): - cls.objects.create( - message=message, level=cls.LEVEL_WARNING, component=component) - - @classmethod - def info(cls, message, component): - cls.objects.create( - message=message, level=cls.LEVEL_INFO, component=component) - - @classmethod - def debug(cls, message, component): - cls.objects.create( - message=message, level=cls.LEVEL_DEBUG, component=component) diff --git a/src/logger/tests.py b/src/logger/tests.py deleted file mode 100644 index 7ce503c2d..000000000 --- a/src/logger/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase - -# Create your tests here. diff --git a/src/logger/views.py b/src/logger/views.py deleted file mode 100644 index 91ea44a21..000000000 --- a/src/logger/views.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.shortcuts import render - -# Create your views here. 
diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 5d7cc3b2f..1f7bb6d0a 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -42,7 +42,6 @@ INSTALLED_APPS = [ "django_extensions", "documents", - "logger", "rest_framework", @@ -89,12 +88,12 @@ DATABASES = { "NAME": os.path.join(BASE_DIR, "..", "data", "db.sqlite3"), } } -if os.environ.get("PAPERLESS_DBUSER") and os.environ.get("PAPERLESS_DBPASS"): +if os.getenv("PAPERLESS_DBUSER") and os.getenv("PAPERLESS_DBPASS"): DATABASES["default"] = { "ENGINE": "django.db.backends.postgresql_psycopg2", - "NAME": os.environ.get("PAPERLESS_DBNAME", "paperless"), - "USER": os.environ.get("PAPERLESS_DBUSER"), - "PASSWORD": os.environ.get("PAPERLESS_DBPASS") + "NAME": os.getenv("PAPERLESS_DBNAME", "paperless"), + "USER": os.getenv("PAPERLESS_DBUSER"), + "PASSWORD": os.getenv("PAPERLESS_DBPASS") } @@ -141,6 +140,25 @@ STATIC_URL = '/static/' MEDIA_URL = "/media/" +# Logging + +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "handlers": { + "consumer": { + "class": "documents.loggers.PaperlessLogger", + } + }, + "loggers": { + "documents": { + "handlers": ["consumer"], + "level": os.getenv("PAPERLESS_CONSUMER_LOG_LEVEL", "INFO"), + }, + }, +} + + # Paperless-specific stuffs # Change these paths if yours are different # ---------------------------------------------------------------------------- @@ -150,15 +168,15 @@ MEDIA_URL = "/media/" OCR_LANGUAGE = "eng" # The amount of threads to use for OCR -OCR_THREADS = os.environ.get("PAPERLESS_OCR_THREADS") +OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS") -# If this is true, any failed attempts to OCR a PDF will result in the PDF being -# indexed anyway, with whatever we could get. If it's False, the file will -# simply be left in the CONSUMPTION_DIR. +# If this is true, any failed attempts to OCR a PDF will result in the PDF +# being indexed anyway, with whatever we could get. 
If it's False, the file +# will simply be left in the CONSUMPTION_DIR. FORGIVING_OCR = True # GNUPG needs a home directory for some reason -GNUPG_HOME = os.environ.get("HOME", "/tmp") +GNUPG_HOME = os.getenv("HOME", "/tmp") # Convert is part of the Imagemagick package CONVERT_BINARY = "/usr/bin/convert" @@ -167,16 +185,16 @@ CONVERT_BINARY = "/usr/bin/convert" SCRATCH_DIR = "/tmp/paperless" # This is where Paperless will look for PDFs to index -CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME") +CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUME") # If you want to use IMAP mail consumption, populate this with useful values. # If you leave HOST set to None, we assume you're not going to use this # feature. MAIL_CONSUMPTION = { - "HOST": os.environ.get("PAPERLESS_CONSUME_MAIL_HOST"), - "PORT": os.environ.get("PAPERLESS_CONSUME_MAIL_PORT"), - "USERNAME": os.environ.get("PAPERLESS_CONSUME_MAIL_USER"), - "PASSWORD": os.environ.get("PAPERLESS_CONSUME_MAIL_PASS"), + "HOST": os.getenv("PAPERLESS_CONSUME_MAIL_HOST"), + "PORT": os.getenv("PAPERLESS_CONSUME_MAIL_PORT"), + "USERNAME": os.getenv("PAPERLESS_CONSUME_MAIL_USER"), + "PASSWORD": os.getenv("PAPERLESS_CONSUME_MAIL_PASS"), "USE_SSL": True, # If True, use SSL/TLS to connect "INBOX": "INBOX" # The name of the inbox on the server } @@ -188,9 +206,9 @@ MAIL_CONSUMPTION = { # DON'T FORGET TO SET THIS as leaving it blank may cause some strange things # with GPG, including an interesting case where it may "encrypt" zero-byte # files. -PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE") +PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE") # If you intend to use the "API" to push files into the consumer, you'll need # to provide a shared secret here. Leaving this as the default will disable # the API. 
-UPLOAD_SHARED_SECRET = os.environ.get("PAPERLESS_SECRET", "") +UPLOAD_SHARED_SECRET = os.getenv("PAPERLESS_SECRET", "") From 51173d80cf1a153cab5f5ec91461960b5aacfbe9 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sat, 27 Feb 2016 20:19:09 +0000 Subject: [PATCH 31/71] License clarification --- src/documents/management/commands/loaddata_stdin.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/documents/management/commands/loaddata_stdin.py b/src/documents/management/commands/loaddata_stdin.py index ca0b9ef7b..9cce7a047 100644 --- a/src/documents/management/commands/loaddata_stdin.py +++ b/src/documents/management/commands/loaddata_stdin.py @@ -1,17 +1,14 @@ -""" -Source: - https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 - -License: - MIT - Copyright (c) 2016 Baptiste Mispelon -""" import sys from django.core.management.commands.loaddata import Command as LoadDataCommand class Command(LoadDataCommand): + """ + Allow the loading of data from standard in. 
Sourced originally from: + https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 (MIT licensed) + """ + def parse_name(self, fixture_name): self.compression_formats['stdin'] = (lambda x, y: sys.stdin, None) if fixture_name == '-': From a4d89ed1244f27b89905b7adc6ae3410aeb9c858 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sat, 27 Feb 2016 20:50:48 +0000 Subject: [PATCH 32/71] Fixt the test to ignore verbosity --- src/documents/tests/test_mail.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/documents/tests/test_mail.py b/src/documents/tests/test_mail.py index 9a9480db4..366dc97b9 100644 --- a/src/documents/tests/test_mail.py +++ b/src/documents/tests/test_mail.py @@ -27,7 +27,7 @@ class TestMessage(TestCase): with open(self.sample, "rb") as f: - message = Message(f.read(), verbosity=0) + message = Message(f.read()) self.assertTrue(message) self.assertEqual(message.subject, "Test 0") From 631aa99d9299ce17c382cfd9b206a15c7ef2f186 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 28 Feb 2016 00:39:40 +0000 Subject: [PATCH 33/71] No need to pass verbosity around anymore --- src/documents/consumer.py | 3 +-- src/documents/management/commands/document_consumer.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 37b348495..f3d5b71cb 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -66,9 +66,8 @@ class Consumer(object): flags=re.IGNORECASE ) - def __init__(self, verbosity=1): + def __init__(self): - self.verbosity = verbosity self.logger = logging.getLogger(__name__) self.logging_group = None diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index ae72381e2..0eae5c80c 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -34,7 +34,7 @@ class 
Command(BaseCommand): self.verbosity = options["verbosity"] try: - self.file_consumer = Consumer(verbosity=self.verbosity) + self.file_consumer = Consumer() self.mail_fetcher = MailFetcher() except (ConsumerError, MailFetcherError) as e: raise CommandError(e) From d686aba9ae497fd83216870a278657351c3b17d2 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 28 Feb 2016 00:40:08 +0000 Subject: [PATCH 34/71] Reset the group id for every pull --- src/documents/mail.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/documents/mail.py b/src/documents/mail.py index cc987bf64..a5f5416cb 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -172,6 +172,9 @@ class MailFetcher(Loggable): if self._enabled: + # Reset the grouping id for each fetch + self.logging_group = uuid.uuid4() + self.log("info", "Checking mail") for message in self._get_messages(): From 5a8e75112f7cc59358e85699caa6836b2f3a4451 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 28 Feb 2016 00:41:03 +0000 Subject: [PATCH 35/71] Added a custom manager for groupped logs --- src/documents/managers.py | 70 +++++++++++++++++++++++++++++++++++++++ src/documents/models.py | 4 +++ 2 files changed, 74 insertions(+) create mode 100644 src/documents/managers.py diff --git a/src/documents/managers.py b/src/documents/managers.py new file mode 100644 index 000000000..d7e7225eb --- /dev/null +++ b/src/documents/managers.py @@ -0,0 +1,70 @@ +from django.conf import settings + +from django.db import models +from django.db.models.aggregates import Max + + +class Concat(models.Aggregate): + """ + Theoretically, this should work in Sqlite, PostgreSQL, and MySQL, but I've + only ever tested it in Sqlite. 
+ """ + + ENGINE_SQLITE = 1 + ENGINE_POSTGRESQL = 2 + ENGINE_MYSQL = 3 + ENGINES = { + "django.db.backends.sqlite3": ENGINE_SQLITE, + "django.db.backends.postgresql_psycopg2": ENGINE_POSTGRESQL, + "django.db.backends.postgresql": ENGINE_POSTGRESQL, + "django.db.backends.mysql": ENGINE_MYSQL + } + + def __init__(self, expression, separator="\n", **extra): + + self.engine = self._get_engine() + self.function = self._get_function() + self.template = self._get_template(separator) + + models.Aggregate.__init__( + self, + expression, + output_field=models.CharField(), + **extra + ) + + def _get_engine(self): + engine = settings.DATABASES["default"]["ENGINE"] + try: + return self.ENGINES[engine] + except KeyError: + raise NotImplementedError( + "There's currently no support for {} when it comes to group " + "concatenation in Paperless".format(engine) + ) + + def _get_function(self): + if self.engine == self.ENGINE_POSTGRESQL: + return "STRING_AGG" + return "GROUP_CONCAT" + + def _get_template(self, separator): + if self.engine == self.ENGINE_MYSQL: + return "%(function)s(%(expressions)s, SEPARATOR '{}')".format( + separator) + return "%(function)s(%(expressions)s, '{}')".format(separator) + + +class LogQuerySet(models.query.QuerySet): + + def by_group(self): + return self.values("group").annotate( + time=Max("modified"), + messages=Concat("message"), + ).order_by("-time") + + +class LogManager(models.Manager): + + def get_queryset(self): + return LogQuerySet(self.model, using=self._db) diff --git a/src/documents/models.py b/src/documents/models.py index 91dd458ea..e5556534a 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -8,6 +8,8 @@ from django.db import models from django.template.defaultfilters import slugify from django.utils import timezone +from .managers import LogManager + class SluggedModel(models.Model): @@ -214,6 +216,8 @@ class Log(models.Model): created = models.DateTimeField(auto_now_add=True) modified = 
models.DateTimeField(auto_now=True) + objects = LogManager() + class Meta(object): ordering = ("-modified",) From 86878923322484cdbdf9f8898924d68df7575fb5 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 28 Feb 2016 00:52:44 +0000 Subject: [PATCH 36/71] Don't print to standard out during a test --- src/documents/tests/test_mail.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/documents/tests/test_mail.py b/src/documents/tests/test_mail.py index 366dc97b9..256c77231 100644 --- a/src/documents/tests/test_mail.py +++ b/src/documents/tests/test_mail.py @@ -3,6 +3,7 @@ import os import magic from hashlib import md5 +from unittest import mock from django.conf import settings from django.test import TestCase @@ -27,7 +28,8 @@ class TestMessage(TestCase): with open(self.sample, "rb") as f: - message = Message(f.read()) + with mock.patch("logging.StreamHandler.emit") as __: + message = Message(f.read()) self.assertTrue(message) self.assertEqual(message.subject, "Test 0") From 9379e95446869fb01718ef21a1e2f26a9b730b6b Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 28 Feb 2016 00:53:18 +0000 Subject: [PATCH 37/71] Added a test for the new by_group() feature --- src/documents/tests/test_logger.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/documents/tests/test_logger.py b/src/documents/tests/test_logger.py index d5527d7c6..8d31fe4ec 100644 --- a/src/documents/tests/test_logger.py +++ b/src/documents/tests/test_logger.py @@ -122,3 +122,21 @@ class TestPaperlessLog(TestCase): self.logger.critical("This is a critical message", extra=kw1) self.assertEqual(Log.objects.all().count(), 4) self.assertEqual(Log.objects.filter(component=c1).count(), 2) + + def test_groupped_query(self): + + kw = { + "group": uuid.uuid4(), + "component": Log.COMPONENT_MAIL + } + with mock.patch("logging.StreamHandler.emit") as __: + self.logger.info("Message 0", extra=kw) + 
self.logger.info("Message 1", extra=kw) + self.logger.info("Message 2", extra=kw) + self.logger.info("Message 3", extra=kw) + + self.assertEqual(Log.objects.all().by_group().count(), 1) + self.assertEqual( + Log.objects.all().by_group()[0]["Messages"], + "Message 0\nMessage 1\nMessage 2\nMessage 3" + ) From 85f59638519168b7b85a8889a24b99478acad880 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 28 Feb 2016 15:02:18 +0000 Subject: [PATCH 38/71] Fixt capitalisation --- src/documents/tests/test_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/documents/tests/test_logger.py b/src/documents/tests/test_logger.py index 8d31fe4ec..23cea13e7 100644 --- a/src/documents/tests/test_logger.py +++ b/src/documents/tests/test_logger.py @@ -137,6 +137,6 @@ class TestPaperlessLog(TestCase): self.assertEqual(Log.objects.all().by_group().count(), 1) self.assertEqual( - Log.objects.all().by_group()[0]["Messages"], + Log.objects.all().by_group()[0]["messages"], "Message 0\nMessage 1\nMessage 2\nMessage 3" ) From 26fc27da9bf21e3e3de1edb605a63ab59aaf4492 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Tue, 1 Mar 2016 18:57:12 +0000 Subject: [PATCH 39/71] Setting appropriate permissions --- src/documents/serialisers.py | 15 ++++++++++++++- src/documents/views.py | 35 ++++++++++++++++++++++++++++------- src/paperless/urls.py | 3 ++- 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index 345fa166d..f9b29f790 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -1,6 +1,6 @@ from rest_framework import serializers -from .models import Sender, Tag, Document +from .models import Sender, Tag, Document, Log class SenderSerializer(serializers.HyperlinkedModelSerializer): @@ -39,3 +39,16 @@ class DocumentSerializer(serializers.ModelSerializer): "file_name", "download_url" ) + + +class 
LogSerializer(serializers.ModelSerializer): + + time = serializers.DateTimeField() + messages = serializers.CharField() + + class Meta(object): + model = Log + fields = ( + "time", + "messages" + ) diff --git a/src/documents/views.py b/src/documents/views.py index 26642c9fc..1bfba3ee7 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1,19 +1,25 @@ +from django.contrib.auth.mixins import LoginRequiredMixin from django.http import HttpResponse from django.template.defaultfilters import slugify from django.views.decorators.csrf import csrf_exempt from django.views.generic import FormView, DetailView +from rest_framework.mixins import ( + RetrieveModelMixin, UpdateModelMixin, DestroyModelMixin, ListModelMixin) from rest_framework.pagination import PageNumberPagination -from rest_framework.viewsets import ModelViewSet +from rest_framework.permissions import IsAuthenticated +from rest_framework.viewsets import ( + ModelViewSet, ReadOnlyModelViewSet, GenericViewSet) from paperless.db import GnuPG from .forms import UploadForm -from .models import Sender, Tag, Document -from .serialisers import SenderSerializer, TagSerializer, DocumentSerializer +from .models import Sender, Tag, Document, Log +from .serialisers import ( + SenderSerializer, TagSerializer, DocumentSerializer, LogSerializer) -class FetchView(DetailView): +class FetchView(LoginRequiredMixin, DetailView): model = Document @@ -40,9 +46,9 @@ class FetchView(DetailView): return response -class PushView(FormView): +class PushView(LoginRequiredMixin, FormView): """ - A crude REST API for creating documents. + A crude REST-ish API for creating documents. 
""" form_class = UploadForm @@ -69,6 +75,7 @@ class SenderViewSet(ModelViewSet): queryset = Sender.objects.all() serializer_class = SenderSerializer pagination_class = StandardPagination + permission_classes = (IsAuthenticated,) class TagViewSet(ModelViewSet): @@ -76,10 +83,24 @@ class TagViewSet(ModelViewSet): queryset = Tag.objects.all() serializer_class = TagSerializer pagination_class = StandardPagination + permission_classes = (IsAuthenticated,) -class DocumentViewSet(ModelViewSet): +class DocumentViewSet(RetrieveModelMixin, + UpdateModelMixin, + DestroyModelMixin, + ListModelMixin, + GenericViewSet): model = Document queryset = Document.objects.all() serializer_class = DocumentSerializer pagination_class = StandardPagination + permission_classes = (IsAuthenticated,) + + +class LogViewSet(ReadOnlyModelViewSet): + model = Log + queryset = Log.objects.all().by_group() + serializer_class = LogSerializer + pagination_class = StandardPagination + permission_classes = (IsAuthenticated,) diff --git a/src/paperless/urls.py b/src/paperless/urls.py index fd1af065d..2f4c63f17 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -21,12 +21,13 @@ from django.contrib import admin from rest_framework.routers import DefaultRouter from documents.views import ( - FetchView, PushView, SenderViewSet, TagViewSet, DocumentViewSet) + FetchView, PushView, SenderViewSet, TagViewSet, DocumentViewSet, LogViewSet) router = DefaultRouter() router.register(r'senders', SenderViewSet) router.register(r'tags', TagViewSet) router.register(r'documents', DocumentViewSet) +router.register(r'logs', LogViewSet) urlpatterns = [ From 7d1aa1175f93f46b6e2b376a238f19b6f35a0a29 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Tue, 1 Mar 2016 19:03:28 +0000 Subject: [PATCH 40/71] pep8 --- src/paperless/urls.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/paperless/urls.py b/src/paperless/urls.py index 2f4c63f17..55563c6c5 100644 --- 
a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -21,7 +21,8 @@ from django.contrib import admin from rest_framework.routers import DefaultRouter from documents.views import ( - FetchView, PushView, SenderViewSet, TagViewSet, DocumentViewSet, LogViewSet) + FetchView, PushView, SenderViewSet, TagViewSet, DocumentViewSet, LogViewSet +) router = DefaultRouter() router.register(r'senders', SenderViewSet) From 26c378135079c80d716aa9d656c9e6ae6720286a Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Tue, 1 Mar 2016 22:37:42 +0000 Subject: [PATCH 41/71] #44: Harmonise environment variables with constant names --- docs/changelog.rst | 2 ++ src/documents/forms.py | 2 +- src/documents/mail.py | 4 ++-- src/paperless/settings.py | 47 +++++++++++++++++++++++++++++++++------ src/paperless/urls.py | 2 +- src/paperless/version.py | 2 +- 6 files changed, 47 insertions(+), 12 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index cdb720926..5b8029780 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -3,6 +3,7 @@ Changelog * 0.1.1 (master) + * `#44`_: Harmonise environment variable names with constant names. * `#60`_: Setup logging to actually use the Python native logging framework. * `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images to be imported but made unavailable. @@ -68,6 +69,7 @@ Changelog .. _darkmatter: https://github.com/darkmatter .. _zedster: https://github.com/zedster +.. _#44: https://github.com/danielquinn/paperless/issues/44 .. _#45: https://github.com/danielquinn/paperless/issues/45 .. _#47: https://github.com/danielquinn/paperless/issues/47 .. 
_#48: https://github.com/danielquinn/paperless/issues/48 diff --git a/src/documents/forms.py b/src/documents/forms.py index 404be1763..8eb7b8381 100644 --- a/src/documents/forms.py +++ b/src/documents/forms.py @@ -14,7 +14,7 @@ from .consumer import Consumer class UploadForm(forms.Form): - SECRET = settings.UPLOAD_SHARED_SECRET + SECRET = settings.SHARED_SECRET TYPE_LOOKUP = { "application/pdf": Document.TYPE_PDF, "image/png": Document.TYPE_PNG, diff --git a/src/documents/mail.py b/src/documents/mail.py index a5f5416cb..0bc3ce94f 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -44,7 +44,7 @@ class Message(Loggable): and n attachments, and that we don't care about the message body. """ - SECRET = settings.UPLOAD_SHARED_SECRET + SECRET = settings.SHARED_SECRET def __init__(self, data, group=None): """ @@ -175,7 +175,7 @@ class MailFetcher(Loggable): # Reset the grouping id for each fetch self.logging_group = uuid.uuid4() - self.log("info", "Checking mail") + self.log("debug", "Checking mail") for message in self._get_messages(): diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 1f7bb6d0a..67f6c4a0c 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -159,7 +159,7 @@ LOGGING = { } -# Paperless-specific stuffs +# Paperless-specific stuff # Change these paths if yours are different # ---------------------------------------------------------------------------- @@ -173,19 +173,19 @@ OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS") # If this is true, any failed attempts to OCR a PDF will result in the PDF # being indexed anyway, with whatever we could get. If it's False, the file # will simply be left in the CONSUMPTION_DIR. 
-FORGIVING_OCR = True +FORGIVING_OCR = bool(os.getenv("PAPERLESS_FORGIVING_OCR", "YES").lower() in ("yes", "y", "1", "t", "true")) # GNUPG needs a home directory for some reason GNUPG_HOME = os.getenv("HOME", "/tmp") -# Convert is part of the Imagemagick package -CONVERT_BINARY = "/usr/bin/convert" +# Convert is part of the ImageMagick package +CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY") # This will be created if it doesn't exist -SCRATCH_DIR = "/tmp/paperless" +SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless") # This is where Paperless will look for PDFs to index -CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUME") +CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUMPTION_DIR") # If you want to use IMAP mail consumption, populate this with useful values. # If you leave HOST set to None, we assume you're not going to use this @@ -211,4 +211,37 @@ PASSPHRASE = os.getenv("PAPERLESS_PASSPHRASE") # If you intend to use the "API" to push files into the consumer, you'll need # to provide a shared secret here. Leaving this as the default will disable # the API. -UPLOAD_SHARED_SECRET = os.getenv("PAPERLESS_SECRET", "") +SHARED_SECRET = os.getenv("PAPERLESS_SHARED_SECRET", "") + +# +# TODO: Remove after 1.2 +# +# This logic is here to address issue #44, wherein we were using inconsistent +# constant names vs. environment variables. If you're using Paperless for the +# first time, you can safely ignore everything from here on, so long as you're +# correctly defining the variables as per the documentation. +# + + +def deprecated(before, after): + print( + "\n\n" + "WARNING: {before} has been renamed to {after}.\n" + "WARNING: Use of {before} will not work as of version 1.2." 
+ "\n\n".format( + before=before, + after=after + ) + ) + +if not CONVERT_BINARY: + deprecated("PAPERLESS_CONVERT", "PAPERLESS_CONVERT_BINARY") + CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT", "convert") + +if not CONSUMPTION_DIR and os.getenv("PAPERLESS_CONSUME"): + deprecated("PAPERLESS_CONSUME", "PAPERLESS_CONSUMPTION_DIR") + CONSUMPTION_DIR = os.getenv("PAPERLESS_CONSUME") + +if not SHARED_SECRET and os.getenv("PAPERLESS_SECRET"): + deprecated("PAPERLESS_SECRET", "PAPERLESS_SHARED_SECRET") + SHARED_SECRET = os.getenv("PAPERLESS_SECRET", "") diff --git a/src/paperless/urls.py b/src/paperless/urls.py index 55563c6c5..eb302638f 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -47,5 +47,5 @@ urlpatterns = [ ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) -if settings.UPLOAD_SHARED_SECRET: +if settings.SHARED_SECRET: urlpatterns.insert(0, url(r"^push$", PushView.as_view(), name="push")) diff --git a/src/paperless/version.py b/src/paperless/version.py index 8e2c2d9ea..d61abb655 100644 --- a/src/paperless/version.py +++ b/src/paperless/version.py @@ -1 +1 @@ -__version__ = (0, 1, 0) +__version__ = (0, 1, 1) From 857c7ac65426907925c89e3c62d3d73455367230 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Tue, 1 Mar 2016 22:39:40 +0000 Subject: [PATCH 42/71] #44: Harmonise environment variables with constant names --- src/paperless/settings.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 67f6c4a0c..1599a08e8 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -235,8 +235,10 @@ def deprecated(before, after): ) if not CONVERT_BINARY: - deprecated("PAPERLESS_CONVERT", "PAPERLESS_CONVERT_BINARY") - CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT", "convert") + CONVERT_BINARY = "convert" + if os.getenv("PAPERLESS_CONVERT"): + deprecated("PAPERLESS_CONVERT", "PAPERLESS_CONVERT_BINARY") + CONVERT_BINARY = 
os.getenv("PAPERLESS_CONVERT", CONVERT_BINARY) if not CONSUMPTION_DIR and os.getenv("PAPERLESS_CONSUME"): deprecated("PAPERLESS_CONSUME", "PAPERLESS_CONSUMPTION_DIR") From 21cd4e9f14a7b74bec3b5d35dcc56a31ebed0372 Mon Sep 17 00:00:00 2001 From: Pit Kleyersburg <pitkley@googlemail.com> Date: Wed, 2 Mar 2016 09:05:51 +0100 Subject: [PATCH 43/71] Update env-var in Dockerfile, fix volume names --- Dockerfile | 4 ++-- docker-compose.yml.example | 14 +++++++------- scripts/docker-entrypoint.sh | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index dade863ca..fec76ee37 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,8 +19,8 @@ RUN mkdir -p /usr/src/paperless/src COPY src/ /usr/src/paperless/src/ # Set consumption directory -ENV PAPERLESS_CONSUME /consume -RUN mkdir -p $PAPERLESS_CONSUME +ENV PAPERLESS_CONSUMPTION_DIR /consume +RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR # Migrate database WORKDIR /usr/src/paperless/src diff --git a/docker-compose.yml.example b/docker-compose.yml.example index 7e3557aa8..488fc83d2 100644 --- a/docker-compose.yml.example +++ b/docker-compose.yml.example @@ -8,8 +8,8 @@ services: # modifying the part before the `:`. - "8000:8000" volumes: - - paperless-data:/usr/src/paperless/data - - paperless-media:/usr/src/paperless/media + - data:/usr/src/paperless/data + - media:/usr/src/paperless/media env_file: docker-compose.env environment: - PAPERLESS_OCR_LANGUAGES= @@ -18,20 +18,20 @@ services: consumer: image: paperless volumes: - - paperless-data:/usr/src/paperless/data - - paperless-media:/usr/src/paperless/media + - data:/usr/src/paperless/data + - media:/usr/src/paperless/media # You have to adapt the local path you want the consumption # directory to mount to by modifying the part before the ':'. - /path/to/arbitrary/place:/consume # Likewise, you can add a local path to mount a directory for # exporting. 
This is not strictly needed for paperless to # function, only if you're exporting your files: uncomment - # it and fill in a local path if you know you're going to + # it and fill in a local path if you know you're going to # want to export your documents. # - /path/to/another/arbitrary/place:/export env_file: docker-compose.env command: ["document_consumer"] volumes: - paperless-data: - paperless-media: + data: + media: diff --git a/scripts/docker-entrypoint.sh b/scripts/docker-entrypoint.sh index 9001574a1..14d385469 100644 --- a/scripts/docker-entrypoint.sh +++ b/scripts/docker-entrypoint.sh @@ -16,8 +16,8 @@ map_uidgid() { set_permissions() { # Set permissions for consumption directory - chgrp paperless "$PAPERLESS_CONSUME" - chmod g+x "$PAPERLESS_CONSUME" + chgrp paperless "$PAPERLESS_CONSUMPTION_DIR" + chmod g+x "$PAPERLESS_CONSUMPTION_DIR" # Set permissions for application directory chown -Rh paperless:paperless /usr/src/paperless From dd3bdcb9568838408296bddbac43c97659de41e2 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Thu, 3 Mar 2016 11:00:46 +0000 Subject: [PATCH 44/71] Updated the Vagrant tools to use environment variables --- docs/changelog.rst | 2 ++ docs/setup.rst | 46 ++++++++++++++++------------- scripts/vagrant-provision | 62 ++++++++++++++++++++++++++++++++------- 3 files changed, 79 insertions(+), 31 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 5b8029780..ce2a4edab 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -3,6 +3,8 @@ Changelog * 0.1.1 (master) + * Refactored the Vagrant installation process to use environment variables + rather than asking the user to modify ``settings.py``. * `#44`_: Harmonise environment variable names with constant names. * `#60`_: Setup logging to actually use the Python native logging framework. 
* `#53`_: Fixed an annoying bug that caused ``.jpeg`` and ``.JPG`` images diff --git a/docs/setup.rst b/docs/setup.rst index be8a349d8..077ce135c 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -98,27 +98,31 @@ Vagrant Method 2. Run ``vagrant up``. An instance will start up for you. When it's ready and provisioned... 3. Run ``vagrant ssh`` and once inside your new vagrant box, edit - ``/opt/paperless/src/paperless/settings.py`` and set the values for: - * ``CONSUMPTION_DIR``: this is where your documents will be dumped to be - consumed by Paperless. - * ``PASSPHRASE``: this is the passphrase Paperless uses to encrypt/decrypt - the original document. The default value attempts to source the - passphrase from the environment, so if you don't set it to a static value - here, you must set ``PAPERLESS_PASSPHRASE=some-secret-string`` on the - command line whenever invoking the consumer or webserver. -4. Initialise the database with ``/opt/paperless/src/manage.py migrate``. -5. Still inside your vagrant box, create a user for your Paperless instance with - ``/opt/paperless/src/manage.py createsuperuser``. Follow the prompts to + ``/etc/paperless.conf`` and set the values for: + * ``PAPERLESS_CONSUMPTION_DIR``: this is where your documents will be + dumped to be consumed by Paperless. + * ``PAPERLESS_PASSPHRASE``: this is the passphrase Paperless uses to + encrypt/decrypt the original document. + * ``PAPERLESS_SHARED_SECRET``: this is the "magic word" used when consuming + documents from mail or via the API. If you don't use either, leaving it + blank is just fine. +4. Exit the vagrant box and re-enter it with ``vagrant ssh`` again. This + updates the environment to make use of the changes you made to the config + file. +5. Initialise the database with ``/opt/paperless/src/manage.py migrate``. +6. Still inside your vagrant box, create a user for your Paperless instance + with ``/opt/paperless/src/manage.py createsuperuser``. Follow the prompts to create your user. -6. 
Start the webserver with ``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``. - You should now be able to visit your (empty) `Paperless webserver`_ at - ``172.28.128.4:8000``. You can login with the user/pass you created in #5. -7. In a separate window, run ``vagrant ssh`` again, but this time once inside +7. Start the webserver with + ``/opt/paperless/src/manage.py runserver 0.0.0.0:8000``. You should now be + able to visit your (empty) `Paperless webserver`_ at ``172.28.128.4:8000``. + You can login with the user/pass you created in #6. +8. In a separate window, run ``vagrant ssh`` again, but this time once inside your vagrant instance, you should start the consumer script with ``/opt/paperless/src/manage.py document_consumer``. -8. Scan something. Put it in the ``CONSUMPTION_DIR``. -9. Wait a few minutes -10. Visit the document list on your webserver, and it should be there, indexed +9. Scan something. Put it in the ``CONSUMPTION_DIR``. +10. Wait a few minutes +11. Visit the document list on your webserver, and it should be there, indexed and downloadable. .. _Vagrant: https://vagrantup.com/ @@ -158,11 +162,11 @@ Docker Method 3. Create a copy of ``docker-compose.yml.example`` as ``docker-compose.yml`` and a copy of ``docker-compose.env.example`` as ``docker-compose.env``. You'll be - editing both these files: taking a copy ensures that you can ``git pull`` to - receive updates without risking merge conflicts with your modified versions + editing both these files: taking a copy ensures that you can ``git pull`` to + receive updates without risking merge conflicts with your modified versions of the configuration files. 4. Modify ``docker-compose.yml`` to your preferences, following the instructions - in comments in the file. The only change that is a hard requirement is to + in comments in the file. The only change that is a hard requirement is to specify where the consumption directory should mount. 5. 
Modify ``docker-compose.env`` and adapt the following environment variables: diff --git a/scripts/vagrant-provision b/scripts/vagrant-provision index aa6ca5e14..c746e7fc1 100644 --- a/scripts/vagrant-provision +++ b/scripts/vagrant-provision @@ -1,13 +1,55 @@ #!/bin/bash -# install packages -sudo apt-get update -sudo apt-get build-dep -y python-imaging -sudo apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev -sudo apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git -sudo apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick +# Install packages +apt-get update +apt-get build-dep -y python-imaging +apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev +apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git +apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick -# setup python project -pushd /opt/paperless -sudo pip3 install -r requirements.txt -popd +# Python dependencies +pip3 install -r /opt/paperless/requirements.txt + +# Create the environment file +echo " +# This where your documents should go to be consumed. Make sure that it exists +# before you start Paperless. +export PAPERLESS_CONSUMPTION_DIR='/home/vagrant/consumption' + +# This is the secret passphrase used to encrypt the documents once they have +# been consumed. Change it to whatever you like, but you shouldn't change it +# after it has been used to consume a document or you won't be able to read +# that document again. +export PAPERLESS_PASSPHRASE='secret' + +# This is the secret string used to verify PDFs sent by mail or consumed via +# the API. 
If you don't plan to use either of these, you can safely leave it +# blank +export PAPERLESS_SHARED_SECRET='' +" > /tmp/paperless.conf +chmod 0640 /tmp/paperless.conf +chown root:vagrant /tmp/paperless.conf +mv /tmp/paperless.conf /etc/ + +# Create the consumption directory +mkdir /home/vagrant/consumption +chown vagrant:vagrant /home/vagrant/consumption + +# Create environment wrapper +echo " + + +# Setup the paperless environment variables +. /etc/paperless.conf +" >> /home/vagrant/.bashrc + +echo " + + +Now follow the remaining steps in the Vagrant section of the setup +documentation to complete the process: + +http://paperless.readthedocs.org/en/latest/setup.html#setup-installation-vagrant + + +" From 0aead1fbe6578240476bb36a135859b5126f4966 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Thu, 3 Mar 2016 17:59:27 +0000 Subject: [PATCH 45/71] #68: Using dotenv for a proper unix config file --- docs/changelog.rst | 3 +++ requirements.txt | 1 + scripts/paperless-consumer.service | 3 +-- scripts/paperless-webserver.service | 1 - src/paperless/settings.py | 16 ++++++++++++---- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index ce2a4edab..772e30dc0 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -3,6 +3,8 @@ Changelog * 0.1.1 (master) + * `#68`_: Added support for using a proper config file at + ``/etc/paperless.conf``. * Refactored the Vagrant installation process to use environment variables rather than asking the user to modify ``settings.py``. * `#44`_: Harmonise environment variable names with constant names. @@ -79,3 +81,4 @@ Changelog .. _#54: https://github.com/danielquinn/paperless/issues/54 .. _#57: https://github.com/danielquinn/paperless/issues/57 .. _#60: https://github.com/danielquinn/paperless/issues/60 +.. 
_#68: https://github.com/danielquinn/paperless/issues/68 diff --git a/requirements.txt b/requirements.txt index 810af8ec2..6a133327a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ Django==1.9.2 django-extensions==1.6.1 djangorestframework==3.3.2 +python-dotenv==0.3.0 filemagic==1.6 langdetect==1.0.5 Pillow==3.1.1 diff --git a/scripts/paperless-consumer.service b/scripts/paperless-consumer.service index 34d65dedb..79a27d3ce 100644 --- a/scripts/paperless-consumer.service +++ b/scripts/paperless-consumer.service @@ -2,10 +2,9 @@ Description=Paperless consumer [Service] -EnvironmentFile=/etc/conf.d/paperless User=paperless Group=paperless -ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer -v $PAPERLESS_CONSUMPTION_VERBOSITY +ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer [Install] WantedBy=multi-user.target diff --git a/scripts/paperless-webserver.service b/scripts/paperless-webserver.service index 1a2386471..9d20f5a1c 100644 --- a/scripts/paperless-webserver.service +++ b/scripts/paperless-webserver.service @@ -2,7 +2,6 @@ Description=Paperless webserver [Service] -EnvironmentFile=/etc/conf.d/paperless User=paperless Group=paperless ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py runserver 0.0.0.0:8000 diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 1599a08e8..f2fb41941 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -12,6 +12,8 @@ https://docs.djangoproject.com/en/1.9/ref/settings/ import os +from dotenv import load_dotenv + # Build paths inside the project like this: os.path.join(BASE_DIR, ...) BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -140,6 +142,16 @@ STATIC_URL = '/static/' MEDIA_URL = "/media/" +# Paperless-specific stuff +# You shouldn't have to edit any of these values. 
Rather, you can set these +# values in /etc/paperless.conf instead. +# ---------------------------------------------------------------------------- + +# Tap paperless.conf if it's available +if os.path.exists("/etc/paperless.conf"): + load_dotenv("/etc/paperless.conf") + + # Logging LOGGING = { @@ -159,10 +171,6 @@ LOGGING = { } -# Paperless-specific stuff -# Change these paths if yours are different -# ---------------------------------------------------------------------------- - # The default language that tesseract will attempt to use when parsing # documents. It should be a 3-letter language code consistent with ISO 639. OCR_LANGUAGE = "eng" From 66d4407565ce4ff431b50c9dbd01b7598625e62e Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Thu, 3 Mar 2016 18:01:02 +0000 Subject: [PATCH 46/71] #68: Using dotenv for a proper unix config file --- scripts/vagrant-provision | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/scripts/vagrant-provision b/scripts/vagrant-provision index c746e7fc1..2a744d5d3 100644 --- a/scripts/vagrant-provision +++ b/scripts/vagrant-provision @@ -14,18 +14,18 @@ pip3 install -r /opt/paperless/requirements.txt echo " # This where your documents should go to be consumed. Make sure that it exists # before you start Paperless. -export PAPERLESS_CONSUMPTION_DIR='/home/vagrant/consumption' +PAPERLESS_CONSUMPTION_DIR='/home/vagrant/consumption' # This is the secret passphrase used to encrypt the documents once they have # been consumed. Change it to whatever you like, but you shouldn't change it # after it has been used to consume a document or you won't be able to read # that document again. -export PAPERLESS_PASSPHRASE='secret' +PAPERLESS_PASSPHRASE='secret' # This is the secret string used to verify PDFs sent by mail or consumed via # the API. 
If you don't plan to use either of these, you can safely leave it # blank -export PAPERLESS_SHARED_SECRET='' +PAPERLESS_SHARED_SECRET='' " > /tmp/paperless.conf chmod 0640 /tmp/paperless.conf chown root:vagrant /tmp/paperless.conf @@ -35,14 +35,6 @@ mv /tmp/paperless.conf /etc/ mkdir /home/vagrant/consumption chown vagrant:vagrant /home/vagrant/consumption -# Create environment wrapper -echo " - - -# Setup the paperless environment variables -. /etc/paperless.conf -" >> /home/vagrant/.bashrc - echo " From b8be20b5658d428aa66e2156de588778f96e9f43 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Thu, 3 Mar 2016 18:09:10 +0000 Subject: [PATCH 47/71] Preparing for a proper UI --- src/documents/templates/documents/index.html | 10 ++++++++++ src/documents/views.py | 13 ++++++++++++- src/paperless/urls.py | 9 +++++++-- 3 files changed, 29 insertions(+), 3 deletions(-) create mode 100644 src/documents/templates/documents/index.html diff --git a/src/documents/templates/documents/index.html b/src/documents/templates/documents/index.html new file mode 100644 index 000000000..ccde2d389 --- /dev/null +++ b/src/documents/templates/documents/index.html @@ -0,0 +1,10 @@ +<!DOCTYPE html> + +<html lang="en-gb"> + <head> + <title>Paperless + + + + + diff --git a/src/documents/views.py b/src/documents/views.py index 1bfba3ee7..0b2b50926 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -2,7 +2,7 @@ from django.contrib.auth.mixins import LoginRequiredMixin from django.http import HttpResponse from django.template.defaultfilters import slugify from django.views.decorators.csrf import csrf_exempt -from django.views.generic import FormView, DetailView +from django.views.generic import FormView, DetailView, TemplateView from rest_framework.mixins import ( RetrieveModelMixin, UpdateModelMixin, DestroyModelMixin, ListModelMixin) @@ -19,6 +19,17 @@ from .serialisers import ( SenderSerializer, TagSerializer, DocumentSerializer, LogSerializer) +class 
IndexView(TemplateView): + + template_name = "documents/index.html" + + def get_context_data(self, **kwargs): + print(kwargs) + print(self.request.GET) + print(self.request.POST) + return TemplateView.get_context_data(self, **kwargs) + + class FetchView(LoginRequiredMixin, DetailView): model = Document diff --git a/src/paperless/urls.py b/src/paperless/urls.py index eb302638f..6fa7e65ef 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -21,7 +21,8 @@ from django.contrib import admin from rest_framework.routers import DefaultRouter from documents.views import ( - FetchView, PushView, SenderViewSet, TagViewSet, DocumentViewSet, LogViewSet + IndexView, FetchView, PushView, + SenderViewSet, TagViewSet, DocumentViewSet, LogViewSet ) router = DefaultRouter() @@ -39,11 +40,15 @@ urlpatterns = [ ), url(r"^api/", include(router.urls, namespace="drf")), + # Normal pages (coming soon) + # url(r"^$", IndexView.as_view(), name="index"), + # File downloads url(r"^fetch/(?P<pk>\d+)$", FetchView.as_view(), name="fetch"), # The Django admin - url(r"", admin.site.urls), + url(r"admin", admin.site.urls), + url(r"", admin.site.urls), # This is going away ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) From 55dcbcc47f944d67f5ab2c0b5c83bdf097683ce2 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Thu, 3 Mar 2016 18:18:38 +0000 Subject: [PATCH 48/71] Forgot a slash --- src/paperless/urls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paperless/urls.py b/src/paperless/urls.py index 6fa7e65ef..24a495810 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -47,7 +47,7 @@ urlpatterns = [ url(r"^fetch/(?P<pk>\d+)$", FetchView.as_view(), name="fetch"), # The Django admin - url(r"admin", admin.site.urls), + url(r"admin/", admin.site.urls), url(r"", admin.site.urls), # This is going away ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) From fad466477b3ccf5e6f433871e2fd89a840b738eb Mon Sep 17 00:00:00
2001 From: Daniel Quinn <code@danielquinn.org> Date: Thu, 3 Mar 2016 18:18:48 +0000 Subject: [PATCH 49/71] More verbose error logging --- src/documents/consumer.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index f3d5b71cb..5617ed550 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -123,9 +123,9 @@ class Consumer(object): try: text = self._get_ocr(pngs) self._store(text, doc) - except OCRError: + except OCRError as e: self._ignore.append(doc) - self.log("error", "OCR FAILURE: {}".format(doc)) + self.log("error", "OCR FAILURE for {}: {}".format(doc, e)) self._cleanup_tempdir(tempdir) continue else: @@ -165,7 +165,7 @@ class Consumer(object): """ if not pngs: - raise OCRError + raise OCRError("No images found") self.log("info", "OCRing the document") @@ -186,7 +186,7 @@ class Consumer(object): ) raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) return raw_text - raise OCRError + raise OCRError("Language detection failed") if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) @@ -205,7 +205,10 @@ class Consumer(object): ) raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) return raw_text - raise OCRError + raise OCRError( + "The guessed language is not available in this instance of " + "Tesseract." 
+ ) def _assemble_ocr_sections(self, pngs, middle, text): """ From 070463b85a396c4895e6473e63af70af6406b539 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Thu, 3 Mar 2016 20:52:42 +0000 Subject: [PATCH 50/71] s/Sender/Correspondent & reworked the (im|ex)porter --- docs/consumption.rst | 38 ++-- docs/migrating.rst | 177 +++++++----------- docs/utilities.rst | 90 ++++++++- src/documents/admin.py | 4 +- src/documents/consumer.py | 4 +- src/documents/forms.py | 10 +- src/documents/mail.py | 4 +- .../management/commands/document_exporter.py | 29 ++- .../management/commands/document_importer.py | 110 +++++++++++ .../migrations/0011_auto_20160303_1929.py | 19 ++ src/documents/models.py | 16 +- src/documents/serialisers.py | 6 +- src/documents/views.py | 15 +- src/paperless/urls.py | 4 +- 14 files changed, 342 insertions(+), 184 deletions(-) create mode 100644 src/documents/management/commands/document_importer.py create mode 100644 src/documents/migrations/0011_auto_20160303_1929.py diff --git a/docs/consumption.rst b/docs/consumption.rst index 8b9b35433..0f8ff7ca5 100644 --- a/docs/consumption.rst +++ b/docs/consumption.rst @@ -44,10 +44,10 @@ Any document you put into the consumption directory will be consumed, but if you name the file right, it'll automatically set some values in the database for you. This is is the logic the consumer follows: -1. Try to find the sender, title, and tags in the file name following the - pattern: ``Sender - Title - tag,tag,tag.pdf``. -2. If that doesn't work, try to find the sender and title in the file name - following the pattern: ``Sender - Title.pdf``. +1. Try to find the correspondent, title, and tags in the file name following + the pattern: ``Correspondent - Title - tag,tag,tag.pdf``. +2. If that doesn't work, try to find the correspondent and title in the file + name following the pattern: ``Correspondent - Title.pdf``. 3. If that doesn't work, just assume that the name of the file is the title. 
So given the above, the following examples would work as you'd expect: @@ -97,9 +97,9 @@ So, with all that in mind, here's what you do to get it running: the configured email account every 10 minutes for something new and pull down whatever it finds. 4. Send yourself an email! Note that the subject is treated as the file name, - so if you set the subject to ``Sender - Title - tag,tag,tag``, you'll get - what you expect. Also, you must include the aforementioned secret string in - every email so the fetcher knows that it's safe to import. + so if you set the subject to ``Correspondent - Title - tag,tag,tag``, you'll + get what you expect. Also, you must include the aforementioned secret + string in every email so the fetcher knows that it's safe to import. 5. After a few minutes, the consumer will poll your mailbox, pull down the message, and place the attachment in the consumption directory with the appropriate name. A few minutes later, the consumer will import it like any @@ -118,16 +118,16 @@ a real API, it's just a URL that accepts an HTTP POST. To push your document to *Paperless*, send an HTTP POST to the server with the following name/value pairs: -* ``sender``: The name of the document's sender. Note that there are - restrictions on what characters you can use here. Specifically, alphanumeric - characters, `-`, `,`, `.`, and `'` are ok, everything else it out. You also - can't use the sequence ` - ` (space, dash, space). +* ``correspondent``: The name of the document's correspondent. Note that there + are restrictions on what characters you can use here. Specifically, + alphanumeric characters, `-`, `,`, `.`, and `'` are ok, everything else is + out. You also can't use the sequence ` - ` (space, dash, space). * ``title``: The title of the document. The rules for characters is the same - here as the sender. 
-* ``signature``: For security reasons, we have the sender send a signature using - a "shared secret" method to make sure that random strangers don't start - uploading stuff to your server. The means of generating this signature is - defined below. + here as the correspondent. +* ``signature``: For security reasons, we have the correspondent send a + signature using a "shared secret" method to make sure that random strangers + don't start uploading stuff to your server. The means of generating this + signature is defined below. Specify ``enctype="multipart/form-data"``, and then POST your file with::: @@ -146,12 +146,12 @@ verification. In the case of *Paperless*, you configure the server with the secret by setting ``UPLOAD_SHARED_SECRET``. Then on your client, you generate your signature by -concatenating the sender, title, and the secret, and then using sha256 to -generate a hexdigest. +concatenating the correspondent, title, and the secret, and then using sha256 +to generate a hexdigest. If you're using Python, this is what that looks like: .. code:: python from hashlib import sha256 - signature = sha256(sender + title + secret).hexdigest() + signature = sha256(correspondent + title + secret).hexdigest() diff --git a/docs/migrating.rst b/docs/migrating.rst index 491eeace4..d659620ac 100644 --- a/docs/migrating.rst +++ b/docs/migrating.rst @@ -4,10 +4,68 @@ Migrating, Updates, and Backups =============================== As *Paperless* is still under active development, there's a lot that can change -as software updates roll out. The thing you just need to remember for all of -this is that for the most part, **the database is expendable** so long as you -have your files. This is because the file name of the exported files includes -the name of the sender, the title, and the tags (if any) on each file. +as software updates roll out. You should backup often, so if anything goes +wrong during an update, you at least have a means of restoring to something +usable. 
Thankfully, there are automated ways of backing up, restoring, and +updating the software. + + +.. _migrating-backup: + +Backing Up +---------- + +So you're bored of this whole project, or you want to make a remote backup of +the unencrypted files for whatever reason. This is easy to do, simply use the +:ref:`exporter ` to dump your documents and database out +into an arbitrary directory. + + +.. _migrating-restoring: + +Restoring +--------- + +Restoring your data is just as easy, since nearly all of your data exists either +in the file names, or in the contents of the files themselves. You just need to +create an empty database (just follow the +:ref:`installation instructions ` again) and then import the +``tags.json`` file you created as part of your backup. Lastly, copy your +exported documents into the consumption directory and start up the consumer. + +.. code-block:: shell-session + + $ cd /path/to/project + $ rm data/db.sqlite3 # Delete the database + $ cd src + $ ./manage.py migrate # Create the database + $ ./manage.py createsuperuser + $ ./manage.py loaddata /path/to/arbitrary/place/tags.json + $ cp /path/to/exported/docs/* /path/to/consumption/dir/ + $ ./manage.py document_consumer + +Importing your data if you are :ref:`using Docker ` +is almost as simple: + +.. code-block:: shell-session + + # Stop and remove your current containers + $ docker-compose stop + $ docker-compose rm -f + + # Recreate them, add the superuser + $ docker-compose up -d + $ docker-compose run --rm webserver createsuperuser + + # Load the tags + $ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin - + + # Load your exported documents into the consumption directory + # (How you do this highly depends on how you have set this up) + $ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/ + +After loading the documents into the consumption directory the consumer will +immediately start consuming the documents. .. 
_migrating-updates: @@ -20,7 +78,7 @@ on the directory containing the project files, and then use Django's ``migrate`` command to execute any database schema updates that might have been rolled in as part of the update: -.. code:: bash +.. code-block:: shell-session $ cd /path/to/project $ git pull @@ -43,112 +101,3 @@ requires only one additional step: If ``git pull`` doesn't report any changes, there is no need to continue with the remaining steps. - - -.. _migrating-backup: - -Backing Up ----------- - -So you're bored of this whole project, or you want to make a remote backup of -the unencrypted files for whatever reason. This is easy to do, simply use the -:ref:`exporter ` to dump your documents out into an -arbitrary directory. - -Additionally however, you'll need to back up the tags themselves. The file -names contain the tag names, but you still need to define the tags and their -matching algorithms in the database for things to work properly. We do this -with Django's ``dumpdata`` command, which produces JSON output. - -.. code:: bash - - $ cd /path/to/project - $ cd src - $ ./manage.py document_export /path/to/arbitrary/place/ - $ ./manage.py dumpdata documents.Tag > /path/to/arbitrary/place/tags.json - -If you are :ref:`using Docker `, exporting your tags -as JSON is almost as easy: - -.. code-block:: shell-session - - $ docker-compose run --rm webserver dumpdata documents.Tag > /path/to/arbitrary/place/tags.json - -To export the documents you can either use ``docker run`` directly, specifying all -the commandline options by hand, or (more simply) mount a second volume for export. - -To mount a volume for exports, follow the instructions in the -``docker-compose.yml.example`` file for the ``/export`` volume (making the changes -in your own ``docker-compose.yml`` file, of course). Once you have the -volume mounted, the command to run an export is: - -.. 
code-block:: console - - $ docker-compose run --rm consumer document_exporter /export - -If you prefer to use ``docker run`` directly, supplying the necessary commandline -options: - -.. code-block:: shell-session - - $ # Identify your containers - $ docker-compose ps - Name Command State Ports - ------------------------------------------------------------------------- - paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 - paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0 - - $ # Make sure to replace your passphrase and remove or adapt the id mapping - $ docker run --rm \ - --volumes-from paperless_data_1 \ - --volume /path/to/arbitrary/place:/export \ - -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \ - -e USERMAP_UID=1000 -e USERMAP_GID=1000 \ - paperless document_exporter /export - - -.. _migrating-restoring: - -Restoring ---------- - -Restoring your data is just as easy, since nearly all of your data exists either -in the file names, or in the contents of the files themselves. You just need to -create an empty database (just follow the -:ref:`installation instructions ` again) and then import the -``tags.json`` file you created as part of your backup. Lastly, copy your -exported documents into the consumption directory and start up the consumer. - -.. code:: bash - - $ cd /path/to/project - $ rm data/db.sqlite3 # Delete the database - $ cd src - $ ./manage.py migrate # Create the database - $ ./manage.py createsuperuser - $ ./manage.py loaddata /path/to/arbitrary/place/tags.json - $ cp /path/to/exported/docs/* /path/to/consumption/dir/ - $ ./manage.py document_consumer - -Importing your data if you are :ref:`using Docker ` -is almost as simple: - -.. 
code-block:: shell-session - - $ # Stop and remove your current containers - $ docker-compose stop - $ docker-compose rm -f - - $ # Recreate them, add the superuser - $ docker-compose up -d - $ docker-compose run --rm webserver createsuperuser - - $ # Load the tags - $ cat /path/to/arbitrary/place/tags.json | docker-compose run --rm webserver loaddata_stdin - - - $ # Load your exported documents into the consumption directory - $ # (How you do this highly depends on how you have set this up) - $ cp /path/to/exported/docs/* /path/to/mounted/consumption/dir/ - -After loading the documents into the consumption directory the consumer will -immediately start consuming the documents. diff --git a/docs/utilities.rst b/docs/utilities.rst index f5b452a6f..ce3555b73 100644 --- a/docs/utilities.rst +++ b/docs/utilities.rst @@ -26,7 +26,7 @@ How to Use It The webserver is started via the ``manage.py`` script: -.. code:: bash +.. code-block:: shell-session $ /path/to/paperless/src/manage.py runserver @@ -64,7 +64,7 @@ How to Use It The consumer is started via the ``manage.py`` script: -.. code:: bash +.. code-block:: shell-session $ /path/to/paperless/src/manage.py document_consumer @@ -95,16 +95,86 @@ How to Use It This too is done via the ``manage.py`` script: -.. code:: bash +.. code-block:: shell-session - $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere + $ /path/to/paperless/src/manage.py document_exporter /path/to/somewhere/ -This will dump all of your PDFs into ``/path/to/somewhere`` for you to do with -as you please. The naming scheme on export is identical to that used for -import, so should you can now safely delete the entire project directly, -database, encrypted PDFs and all, and later create it all again simply by -running the consumer again and dumping all of these files into -``CONSUMPTION_DIR``. +This will dump all of your unencrypted PDFs into ``/path/to/somewhere`` for you +to do with as you please. 
The files are accompanied with a special file, +``manifest.json`` which can be used to +:ref:`import the files ` at a later date if you wish. + + +.. _utilities-exporter-howto-docker: + +Docker +______ + +If you are :ref:`using Docker `, running the +exporter is almost as easy. To mount a volume for exports, follow the +instructions in the ``docker-compose.yml.example`` file for the ``/export`` +volume (making the changes in your own ``docker-compose.yml`` file, of course). +Once you have the volume mounted, the command to run an export is: + +.. code-block:: shell-session + + $ docker-compose run --rm consumer document_exporter /export + +If you prefer to use ``docker run`` directly, supplying the necessary commandline +options: + +.. code-block:: shell-session + + $ # Identify your containers + $ docker-compose ps + Name Command State Ports + ------------------------------------------------------------------------- + paperless_consumer_1 /sbin/docker-entrypoint.sh ... Exit 0 + paperless_webserver_1 /sbin/docker-entrypoint.sh ... Exit 0 + + $ # Make sure to replace your passphrase and remove or adapt the id mapping + $ docker run --rm \ + --volumes-from paperless_data_1 \ + --volume /path/to/arbitrary/place:/export \ + -e PAPERLESS_PASSPHRASE=YOUR_PASSPHRASE \ + -e USERMAP_UID=1000 -e USERMAP_GID=1000 \ + paperless document_exporter /export + + +.. _utilities-importer: + +The Importer +------------ + +Looking to transfer Paperless data from one instance to another, or just want +to restore from a backup? This is your go-to toy. + + +.. _utilities-importer-howto: + +How to Use It +............. + +The importer works just like the exporter. You point it at a directory, and +the script does the rest of the work: + +.. 
code-block:: shell-session + + $ /path/to/paperless/src/manage.py document_importer /path/to/somewhere/ + +Docker +______ + +Assuming that you've already gone through the steps above in the +:ref:`export ` section, then the easiest thing +to do is just re-use the ``/export`` path you already setup: + +.. code-block:: shell-session + + $ docker-compose run --rm consumer document_importer /export + +Similarly, if you're not using docker-compose, you can adjust the export +instructions above to do the import. .. _utilities-retagger: diff --git a/src/documents/admin.py b/src/documents/admin.py index 118a295eb..3baad817b 100644 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -3,7 +3,7 @@ from django.contrib.auth.models import User, Group from django.core.urlresolvers import reverse from django.templatetags.static import static -from .models import Sender, Tag, Document, Log +from .models import Correspondent, Tag, Document, Log class MonthListFilter(admin.SimpleListFilter): @@ -107,7 +107,7 @@ class LogAdmin(admin.ModelAdmin): list_filter = ("level", "component",) -admin.site.register(Sender) +admin.site.register(Correspondent) admin.site.register(Tag, TagAdmin) admin.site.register(Document, DocumentAdmin) admin.site.register(Log, LogAdmin) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 5617ed550..4233cded8 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -24,7 +24,7 @@ from pyocr.tesseract import TesseractError from paperless.db import GnuPG -from .models import Sender, Tag, Document, Log +from .models import Correspondent, Tag, Document, Log from .languages import ISO639 @@ -246,7 +246,7 @@ class Consumer(object): """ def get_sender(sender_name): - return Sender.objects.get_or_create( + return Correspondent.objects.get_or_create( name=sender_name, defaults={"slug": slugify(sender_name)})[0] def get_tags(tags): diff --git a/src/documents/forms.py b/src/documents/forms.py index 8eb7b8381..d8960f88b 100644 
--- a/src/documents/forms.py +++ b/src/documents/forms.py @@ -8,7 +8,7 @@ from time import mktime from django import forms from django.conf import settings -from .models import Document, Sender +from .models import Document, Correspondent from .consumer import Consumer @@ -24,7 +24,9 @@ class UploadForm(forms.Form): } sender = forms.CharField( - max_length=Sender._meta.get_field("name").max_length, required=False) + max_length=Correspondent._meta.get_field("name").max_length, + required=False + ) title = forms.CharField( max_length=Document._meta.get_field("title").max_length, required=False @@ -41,7 +43,7 @@ class UploadForm(forms.Form): sender = self.cleaned_data.get("sender") if not sender: return None - if not Sender.SAFE_REGEX.match(sender) or " - " in sender: + if not Correspondent.SAFE_REGEX.match(sender) or " - " in sender: raise forms.ValidationError("That sender name is suspicious.") return sender @@ -49,7 +51,7 @@ class UploadForm(forms.Form): title = self.cleaned_data.get("title") if not title: return None - if not Sender.SAFE_REGEX.match(title) or " - " in title: + if not Correspondent.SAFE_REGEX.match(title) or " - " in title: raise forms.ValidationError("That title is suspicious.") def clean_document(self): diff --git a/src/documents/mail.py b/src/documents/mail.py index 0bc3ce94f..5bacb5b5f 100644 --- a/src/documents/mail.py +++ b/src/documents/mail.py @@ -14,7 +14,7 @@ from dateutil import parser from django.conf import settings from .consumer import Consumer -from .models import Sender, Log +from .models import Correspondent, Log class MailFetcherError(Exception): @@ -103,7 +103,7 @@ class Message(Loggable): def check_subject(self): if self.subject is None: raise InvalidMessageError("Message does not have a subject") - if not Sender.SAFE_REGEX.match(self.subject): + if not Correspondent.SAFE_REGEX.match(self.subject): raise InvalidMessageError("Message subject is unsafe: {}".format( self.subject)) diff --git 
a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index ac448d8e8..87ed804a2 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -1,10 +1,12 @@ +import json import os import time from django.conf import settings from django.core.management.base import BaseCommand, CommandError +from django.core import serializers -from documents.models import Document +from documents.models import Document, Correspondent, Tag from paperless.db import GnuPG from ...mixins import Renderable @@ -14,21 +16,19 @@ class Command(Renderable, BaseCommand): help = """ Decrypt and rename all files in our collection into a given target - directory. Note that we don't export any of the parsed data since - that can always be re-collected via the consumer. + directory. And include a manifest file containing document data for + easy import. """.replace(" ", "") def add_arguments(self, parser): parser.add_argument("target") def __init__(self, *args, **kwargs): - self.verbosity = 0 - self.target = None BaseCommand.__init__(self, *args, **kwargs) + self.target = None def handle(self, *args, **options): - self.verbosity = options["verbosity"] self.target = options["target"] if not os.path.exists(self.target): @@ -40,9 +40,15 @@ class Command(Renderable, BaseCommand): if not settings.PASSPHRASE: settings.PASSPHRASE = input("Please enter the passphrase: ") - for document in Document.objects.all(): + documents = Document.objects.all() + document_map = {d.pk: d for d in documents} + manifest = json.loads(serializers.serialize("json", documents)) + for document_dict in manifest: + + document = document_map[document_dict["pk"]] target = os.path.join(self.target, document.file_name) + document_dict["__exported_file_name__"] = target print("Exporting: {}".format(target)) @@ -50,3 +56,12 @@ class Command(Renderable, BaseCommand): 
f.write(GnuPG.decrypted(document.source_file)) t = int(time.mktime(document.created.timetuple())) os.utime(target, times=(t, t)) + + manifest += json.loads( + serializers.serialize("json", Correspondent.objects.all())) + + manifest += json.loads(serializers.serialize( + "json", Tag.objects.all())) + + with open(os.path.join(self.target, "manifest.json"), "w") as f: + json.dump(manifest, f, indent=2) diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py new file mode 100644 index 000000000..213c049e4 --- /dev/null +++ b/src/documents/management/commands/document_importer.py @@ -0,0 +1,110 @@ +import json +import os + +from django.conf import settings +from django.core.management.base import BaseCommand, CommandError +from django.core.management import call_command + +from documents.models import Document +from paperless.db import GnuPG + +from ...mixins import Renderable + + +class Command(Renderable, BaseCommand): + + help = """ + Using a manifest.json file, load the data from there, and import the + documents it refers to. 
+ """.replace(" ", "") + + def add_arguments(self, parser): + parser.add_argument("source") + parser.add_argument( + '--ignore-absent', + action='store_true', + default=False, + help="If the manifest refers to a document that doesn't exist, " + "ignore it and attempt to import what it can" + ) + + def __init__(self, *args, **kwargs): + BaseCommand.__init__(self, *args, **kwargs) + self.source = None + self.manifest = None + + def handle(self, *args, **options): + + self.source = options["source"] + + if not os.path.exists(self.source): + raise CommandError("That path doesn't exist") + + if not os.access(self.source, os.R_OK): + raise CommandError("That path doesn't appear to be readable") + + manifest_path = os.path.join(self.source, "manifest.json") + self._check_manifest_exists(manifest_path) + + with open(manifest_path) as f: + self.manifest = json.load(f) + + self._check_manifest() + + if not settings.PASSPHRASE: + raise CommandError( + "You need to define a passphrase before continuing. Please " + "consult the documentation for setting up Paperless." + ) + + # Fill up the database with whatever is in the manifest + call_command("loaddata", manifest_path) + + self._import_files_from_manifest() + + @staticmethod + def _check_manifest_exists(path): + if not os.path.exists(path): + raise CommandError( + "That directory doesn't appear to contain a manifest.json " + "file." + ) + + def _check_manifest(self): + + for record in self.manifest: + + if not record["model"] == "documents.document": + continue + + if "__exported_file_name__" not in record: + raise CommandError( + 'The manifest file contains a record which does not ' + 'refer to an actual document file. 
If you want to import ' + 'the rest anyway (skipping such references) call the ' + 'importer with --ignore-absent' + ) + + doc_file = record["__exported_file_name__"] + if not os.path.exists(os.path.join(self.source, doc_file)): + raise CommandError( + 'The manifest file refers to "{}" which does not ' + 'appear to be in the source directory. If you want to ' + 'import the rest anyway (skipping such references) call ' + 'the importer with --ignore-absent'.format(doc_file) + ) + + def _import_files_from_manifest(self): + + for record in self.manifest: + + if not record["model"] == "documents.document": + continue + + doc_file = record["__exported_file_name__"] + document = Document.objects.get(pk=record["pk"]) + with open(doc_file, "rb") as unencrypted: + with open(document.source_path, "wb") as encrypted: + print("Encrypting {} and saving it to {}".format( + doc_file, document.source_path)) + encrypted.write(GnuPG.encrypted(unencrypted)) diff --git a/src/documents/migrations/0011_auto_20160303_1929.py b/src/documents/migrations/0011_auto_20160303_1929.py new file mode 100644 index 000000000..a9aefddaf --- /dev/null +++ b/src/documents/migrations/0011_auto_20160303_1929.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.2 on 2016-03-03 19:29 +from __future__ import unicode_literals + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '0010_log'), + ] + + operations = [ + migrations.RenameModel( + old_name='Sender', + new_name='Correspondent', + ), + ] diff --git a/src/documents/models.py b/src/documents/models.py index e5556534a..0fb6489c4 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -28,7 +28,7 @@ class SluggedModel(models.Model): return self.name -class Sender(SluggedModel): +class Correspondent(SluggedModel): # This regex is probably more restrictive than it needs to be, but it's # better safe than sorry. 
@@ -141,7 +141,7 @@ class Document(models.Model): TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,) sender = models.ForeignKey( - Sender, blank=True, null=True, related_name="documents") + Correspondent, blank=True, null=True, related_name="documents") title = models.CharField(max_length=128, blank=True, db_index=True) content = models.TextField(db_index=True) file_type = models.CharField( @@ -158,9 +158,9 @@ class Document(models.Model): ordering = ("sender", "title") def __str__(self): - created = self.created.strftime("%Y-%m-%d") + created = self.created.strftime("%Y%m%d%H%M%S") if self.sender and self.title: - return "{}: {}, {}".format(created, self.sender, self.title) + return "{}: {} - {}".format(created, self.sender, self.title) if self.sender or self.title: return "{}: {}".format(created, self.sender or self.title) return str(created) @@ -179,13 +179,7 @@ class Document(models.Model): @property def file_name(self): - if self.sender and self.title: - tags = ",".join([t.slug for t in self.tags.all()]) - if tags: - return "{} - {} - {}.{}".format( - self.sender, self.title, tags, self.file_type) - return "{} - {}.{}".format(self.sender, self.title, self.file_type) - return os.path.basename(self.source_path) + return slugify(str(self)) + "." 
+ self.file_type @property def download_url(self): diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index f9b29f790..340fdaa25 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -1,12 +1,12 @@ from rest_framework import serializers -from .models import Sender, Tag, Document, Log +from .models import Correspondent, Tag, Document, Log -class SenderSerializer(serializers.HyperlinkedModelSerializer): +class CorrespondentSerializer(serializers.HyperlinkedModelSerializer): class Meta(object): - model = Sender + model = Correspondent fields = ("id", "slug", "name") diff --git a/src/documents/views.py b/src/documents/views.py index 0b2b50926..ff7c4ce05 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1,6 +1,5 @@ from django.contrib.auth.mixins import LoginRequiredMixin from django.http import HttpResponse -from django.template.defaultfilters import slugify from django.views.decorators.csrf import csrf_exempt from django.views.generic import FormView, DetailView, TemplateView @@ -14,9 +13,9 @@ from rest_framework.viewsets import ( from paperless.db import GnuPG from .forms import UploadForm -from .models import Sender, Tag, Document, Log +from .models import Correspondent, Tag, Document, Log from .serialisers import ( - SenderSerializer, TagSerializer, DocumentSerializer, LogSerializer) + CorrespondentSerializer, TagSerializer, DocumentSerializer, LogSerializer) class IndexView(TemplateView): @@ -52,7 +51,7 @@ class FetchView(LoginRequiredMixin, DetailView): content_type=content_types[self.object.file_type] ) response["Content-Disposition"] = 'attachment; filename="{}"'.format( - slugify(str(self.object)) + "." 
+ self.object.file_type) + self.object.file_name) return response @@ -81,10 +80,10 @@ class StandardPagination(PageNumberPagination): max_page_size = 100000 -class SenderViewSet(ModelViewSet): - model = Sender - queryset = Sender.objects.all() - serializer_class = SenderSerializer +class CorrespondentViewSet(ModelViewSet): + model = Correspondent + queryset = Correspondent.objects.all() + serializer_class = CorrespondentSerializer pagination_class = StandardPagination permission_classes = (IsAuthenticated,) diff --git a/src/paperless/urls.py b/src/paperless/urls.py index 24a495810..e81d4dcf9 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -22,11 +22,11 @@ from rest_framework.routers import DefaultRouter from documents.views import ( IndexView, FetchView, PushView, - SenderViewSet, TagViewSet, DocumentViewSet, LogViewSet + CorrespondentViewSet, TagViewSet, DocumentViewSet, LogViewSet ) router = DefaultRouter() -router.register(r'senders', SenderViewSet) +router.register(r'senders', CorrespondentViewSet) router.register(r'tags', TagViewSet) router.register(r'documents', DocumentViewSet) router.register(r'logs', LogViewSet) From ba7878b9aa5b115ad91daddf387433a3948c7619 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Thu, 3 Mar 2016 21:25:08 +0000 Subject: [PATCH 51/71] Added some tests for the importer --- .../management/commands/document_importer.py | 15 ++------ src/documents/tests/test_importer.py | 36 +++++++++++++++++++ 2 files changed, 38 insertions(+), 13 deletions(-) create mode 100644 src/documents/tests/test_importer.py diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py index 213c049e4..63c961815 100644 --- a/src/documents/management/commands/document_importer.py +++ b/src/documents/management/commands/document_importer.py @@ -20,13 +20,6 @@ class Command(Renderable, BaseCommand): def add_arguments(self, parser): parser.add_argument("source") - parser.add_argument( 
- '--ignore-absent', - action='store_true', - default=False, - help="If the manifest refers to a document that doesn't exist, " - "ignore it and attempt to import what it can" - ) def __init__(self, *args, **kwargs): BaseCommand.__init__(self, *args, **kwargs) @@ -80,18 +73,14 @@ class Command(Renderable, BaseCommand): if "__exported_file_name__" not in record: raise CommandError( 'The manifest file contains a record which does not ' - 'refer to an actual document file. If you want to import ' - 'the rest anyway (skipping such references) call the ' - 'importer with --ignore-absent' + 'refer to an actual document file.' ) doc_file = record["__exported_file_name__"] if not os.path.exists(os.path.join(self.source, doc_file)): raise CommandError( 'The manifest file refers to "{}" which does not ' - 'appear to be in the source directory. If you want to ' - 'import the rest anyway (skipping such references) call ' - 'the importer with --ignore-absent'.format(doc_file) + 'appear to be in the source directory.'.format(doc_file) ) def _import_files_from_manifest(self): diff --git a/src/documents/tests/test_importer.py b/src/documents/tests/test_importer.py new file mode 100644 index 000000000..8880aba66 --- /dev/null +++ b/src/documents/tests/test_importer.py @@ -0,0 +1,36 @@ +from django.core.management.base import CommandError +from django.test import TestCase + +from ..management.commands.document_importer import Command + + +class TestImporter(TestCase): + + def __init__(self, *args, **kwargs): + TestCase.__init__(self, *args, **kwargs) + + def test_check_manifest_exists(self): + cmd = Command() + self.assertRaises( + CommandError, cmd._check_manifest_exists, "/tmp/manifest.json") + + def test_check_manifest(self): + + cmd = Command() + cmd.source = "/tmp" + + cmd.manifest = [{"model": "documents.document"}] + with self.assertRaises(CommandError) as cm: + cmd._check_manifest() + self.assertTrue( + 'The manifest file contains a record' in str(cm.exception)) + + 
cmd.manifest = [{ + "model": "documents.document", + "__exported_file_name__": "noexist.pdf" + }] + # self.assertRaises(CommandError, cmd._check_manifest) + with self.assertRaises(CommandError) as cm: + cmd._check_manifest() + self.assertTrue( + 'The manifest file refers to "noexist.pdf"' in str(cm.exception)) From 5d4587ef8b599fbe91c74740ded81e35d1b711f8 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Fri, 4 Mar 2016 09:14:50 +0000 Subject: [PATCH 52/71] Accounted for .sender in a few places --- src/documents/admin.py | 6 +-- src/documents/consumer.py | 34 ++++++++-------- src/documents/forms.py | 29 +++++++------- .../management/commands/document_exporter.py | 39 +++++++++++++++++++ .../migrations/0011_auto_20160303_1929.py | 9 +++++ src/documents/models.py | 13 ++++--- src/documents/serialisers.py | 6 +-- 7 files changed, 95 insertions(+), 41 deletions(-) diff --git a/src/documents/admin.py b/src/documents/admin.py index 3baad817b..a5b523492 100644 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -45,9 +45,9 @@ class DocumentAdmin(admin.ModelAdmin): "all": ("paperless.css",) } - search_fields = ("sender__name", "title", "content") - list_display = ("created_", "sender", "title", "tags_", "document") - list_filter = ("tags", "sender", MonthListFilter) + search_fields = ("correspondent__name", "title", "content") + list_display = ("created_", "correspondent", "title", "tags_", "document") + list_filter = ("tags", "correspondent", MonthListFilter) list_per_page = 25 def created_(self, obj): diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 4233cded8..eeb42cdf1 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -57,11 +57,11 @@ class Consumer(object): r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE ) - REGEX_SENDER_TITLE = re.compile( + REGEX_CORRESPONDENT_TITLE = re.compile( r"^.*/(.+) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE ) - REGEX_SENDER_TITLE_TAGS = re.compile( + 
REGEX_CORRESPONDENT_TITLE_TAGS = re.compile( r"^.*/(.*) - (.*) - ([a-z0-9\-,]*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE ) @@ -238,16 +238,18 @@ class Consumer(object): def _guess_attributes_from_name(self, parseable): """ - We use a crude naming convention to make handling the sender, title, - and tags easier: - " - - <tags>.<suffix>" - "<sender> - <title>.<suffix>" + We use a crude naming convention to make handling the correspondent, + title, and tags easier: + "<correspondent> - <title> - <tags>.<suffix>" + "<correspondent> - <title>.<suffix>" "<title>.<suffix>" """ - def get_sender(sender_name): + def get_correspondent(correspondent_name): return Correspondent.objects.get_or_create( - name=sender_name, defaults={"slug": slugify(sender_name)})[0] + name=correspondent_name, + defaults={"slug": slugify(correspondent_name)} + )[0] def get_tags(tags): r = [] @@ -262,27 +264,27 @@ class Consumer(object): return "jpg" return suffix - # First attempt: "<sender> - <title> - <tags>.<suffix>" - m = re.match(self.REGEX_SENDER_TITLE_TAGS, parseable) + # First attempt: "<correspondent> - <title> - <tags>.<suffix>" + m = re.match(self.REGEX_CORRESPONDENT_TITLE_TAGS, parseable) if m: return ( - get_sender(m.group(1)), + get_correspondent(m.group(1)), m.group(2), get_tags(m.group(3)), get_suffix(m.group(4)) ) - # Second attempt: "<sender> - <title>.<suffix>" - m = re.match(self.REGEX_SENDER_TITLE, parseable) + # Second attempt: "<correspondent> - <title>.<suffix>" + m = re.match(self.REGEX_CORRESPONDENT_TITLE, parseable) if m: return ( - get_sender(m.group(1)), + get_correspondent(m.group(1)), m.group(2), (), get_suffix(m.group(3)) ) - # That didn't work, so we assume sender and tags are None + # That didn't work, so we assume correspondent and tags are None m = re.match(self.REGEX_TITLE, parseable) return None, m.group(1), (), get_suffix(m.group(2)) @@ -296,7 +298,7 @@ class Consumer(object): self.log("debug", "Saving record to database") document = 
Document.objects.create( - sender=sender, + correspondent=sender, title=title, content=text, file_type=file_type, diff --git a/src/documents/forms.py b/src/documents/forms.py index d8960f88b..d4c01745a 100644 --- a/src/documents/forms.py +++ b/src/documents/forms.py @@ -23,7 +23,7 @@ class UploadForm(forms.Form): "image/tiff": Document.TYPE_TIF, } - sender = forms.CharField( + correspondent = forms.CharField( max_length=Correspondent._meta.get_field("name").max_length, required=False ) @@ -34,18 +34,19 @@ class UploadForm(forms.Form): document = forms.FileField() signature = forms.CharField(max_length=256) - def clean_sender(self): + def clean_correspondent(self): """ I suppose it might look cleaner to use .get_or_create() here, but that - would also allow someone to fill up the db with bogus senders before - all validation was met. + would also allow someone to fill up the db with bogus correspondents + before all validation was met. """ - sender = self.cleaned_data.get("sender") - if not sender: + corresp = self.cleaned_data.get("correspondent") + if not corresp: return None - if not Correspondent.SAFE_REGEX.match(sender) or " - " in sender: - raise forms.ValidationError("That sender name is suspicious.") - return sender + if not Correspondent.SAFE_REGEX.match(corresp) or " - " in corresp: + raise forms.ValidationError( + "That correspondent name is suspicious.") + return corresp def clean_title(self): title = self.cleaned_data.get("title") @@ -63,10 +64,10 @@ class UploadForm(forms.Form): return document, self.TYPE_LOOKUP[file_type] def clean(self): - sender = self.clened_data("sender") + corresp = self.clened_data("correspondent") title = self.cleaned_data("title") signature = self.cleaned_data("signature") - if sha256(sender + title + self.SECRET).hexdigest() == signature: + if sha256(corresp + title + self.SECRET).hexdigest() == signature: return True return False @@ -77,13 +78,15 @@ class UploadForm(forms.Form): form do that as well. 
Think of it as a poor-man's queue server. """ - sender = self.clened_data("sender") + correspondent = self.clened_data("correspondent") title = self.cleaned_data("title") document, file_type = self.cleaned_data.get("document") t = int(mktime(datetime.now())) file_name = os.path.join( - Consumer.CONSUME, "{} - {}.{}".format(sender, title, file_type)) + Consumer.CONSUME, + "{} - {}.{}".format(correspondent, title, file_type) + ) with open(file_name, "wb") as f: f.write(document) diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index 87ed804a2..913f7ae79 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -22,6 +22,13 @@ class Command(Renderable, BaseCommand): def add_arguments(self, parser): parser.add_argument("target") + parser.add_argument( + "--legacy", + action="store_true", + help="Don't try to export all of the document data, just dump the " + "original document files out in a format that makes " + "re-consuming them easy." 
+ ) def __init__(self, *args, **kwargs): BaseCommand.__init__(self, *args, **kwargs) @@ -40,6 +47,13 @@ class Command(Renderable, BaseCommand): if not settings.PASSPHRASE: settings.PASSPHRASE = input("Please enter the passphrase: ") + if options["legacy"]: + self.dump_legacy() + else: + self.dump() + + def dump(self): + documents = Document.objects.all() document_map = {d.pk: d for d in documents} manifest = json.loads(serializers.serialize("json", documents)) @@ -65,3 +79,28 @@ class Command(Renderable, BaseCommand): with open(os.path.join(self.target, "manifest.json"), "w") as f: json.dump(manifest, f, indent=2) + + def dump_legacy(self): + + for document in Document.objects.all(): + + target = os.path.join( + self.target, self._get_legacy_file_name(document)) + + print("Exporting: {}".format(target)) + + with open(target, "wb") as f: + f.write(GnuPG.decrypted(document.source_file)) + t = int(time.mktime(document.created.timetuple())) + os.utime(target, times=(t, t)) + + @staticmethod + def _get_legacy_file_name(doc): + if doc.correspondent and doc.title: + tags = ",".join([t.slug for t in doc.tags.all()]) + if tags: + return "{} - {} - {}.{}".format( + doc.correspondent, doc.title, tags, doc.file_type) + return "{} - {}.{}".format( + doc.correspondent, doc.title, doc.file_type) + return os.path.basename(doc.source_path) diff --git a/src/documents/migrations/0011_auto_20160303_1929.py b/src/documents/migrations/0011_auto_20160303_1929.py index a9aefddaf..af4ee4c66 100644 --- a/src/documents/migrations/0011_auto_20160303_1929.py +++ b/src/documents/migrations/0011_auto_20160303_1929.py @@ -16,4 +16,13 @@ class Migration(migrations.Migration): old_name='Sender', new_name='Correspondent', ), + migrations.AlterModelOptions( + name='document', + options={'ordering': ('correspondent', 'title')}, + ), + migrations.RenameField( + model_name='document', + old_name='sender', + new_name='correspondent', + ), ] diff --git a/src/documents/models.py b/src/documents/models.py 
index 0fb6489c4..a82f7643f 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -140,7 +140,7 @@ class Document(models.Model): TYPE_TIF = "tiff" TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,) - sender = models.ForeignKey( + correspondent = models.ForeignKey( Correspondent, blank=True, null=True, related_name="documents") title = models.CharField(max_length=128, blank=True, db_index=True) content = models.TextField(db_index=True) @@ -155,14 +155,15 @@ class Document(models.Model): modified = models.DateTimeField(auto_now=True, editable=False) class Meta(object): - ordering = ("sender", "title") + ordering = ("correspondent", "title") def __str__(self): created = self.created.strftime("%Y%m%d%H%M%S") - if self.sender and self.title: - return "{}: {} - {}".format(created, self.sender, self.title) - if self.sender or self.title: - return "{}: {}".format(created, self.sender or self.title) + if self.correspondent and self.title: + return "{}: {} - {}".format( + created, self.correspondent, self.title) + if self.correspondent or self.title: + return "{}: {}".format(created, self.correspondent or self.title) return str(created) @property diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index 340fdaa25..c2b2ae7fd 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -20,8 +20,8 @@ class TagSerializer(serializers.HyperlinkedModelSerializer): class DocumentSerializer(serializers.ModelSerializer): - sender = serializers.HyperlinkedRelatedField( - read_only=True, view_name="drf:sender-detail", allow_null=True) + correspondent = serializers.HyperlinkedRelatedField( + read_only=True, view_name="drf:correspondent-detail", allow_null=True) tags = serializers.HyperlinkedRelatedField( read_only=True, view_name="drf:tag-detail", many=True) @@ -29,7 +29,7 @@ class DocumentSerializer(serializers.ModelSerializer): model = Document fields = ( "id", - "sender", + "correspondent", "title", "content", 
"file_type", From 13c2ed66e13c493c25ca460f29f43aa1f0f5815d Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Fri, 4 Mar 2016 17:53:54 +0000 Subject: [PATCH 53/71] Better bare metal explanation --- docs/setup.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/setup.rst b/docs/setup.rst index 077ce135c..9992418c1 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -42,12 +42,13 @@ route`_ is quick & easy, but means you're running a VM which comes with memory consumption etc. We also `support Docker`_, which you can use natively under Linux and in a VM with `Docker Machine`_ (this guide was written for native Docker usage under Linux, you might have to adapt it for Docker Machine.) -Alternatively the standard, `bare metal`_ approach is a little more complicated. +Alternatively the standard, `bare metal`_ approach is a little more complicated, +but worth it because it makes it easier to should you want to contribute some +code back. .. _Vagrant route: setup-installation-vagrant_ .. _support Docker: setup-installation-docker_ .. _bare metal: setup-installation-standard_ - .. _Docker Machine: https://docs.docker.com/machine/ .. _setup-installation-standard: From 94a7914073f1ba449f3c23b314be87e7418e90d4 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Fri, 4 Mar 2016 23:20:22 +0000 Subject: [PATCH 54/71] More descriptive --- docs/changelog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 772e30dc0..d135d3564 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -4,7 +4,7 @@ Changelog * 0.1.1 (master) * `#68`_: Added support for using a proper config file at - ``/etc/paperless.conf``. + ``/etc/paperless.conf`` and modified the systemd unit files to use it. * Refactored the Vagrant installation process to use environment variables rather than asking the user to modify ``settings.py``. 
* `#44`_: Harmonise environment variable names with constant names. From d24cfbb24652972b6c72f70a3eca4b78f22817f7 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Fri, 4 Mar 2016 23:22:57 +0000 Subject: [PATCH 55/71] Added the bit about s/sender/correspondent/g --- docs/changelog.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/changelog.rst b/docs/changelog.rst index d135d3564..2228c9be1 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -3,6 +3,10 @@ Changelog * 0.1.1 (master) + * Potentially **Breaking Change**: All references to "sender" in the code + have been renamed to "correspondent" to better reflect the nature of the + property (one could quite reasonably scan a document before sending it to + someone.) * `#68`_: Added support for using a proper config file at ``/etc/paperless.conf`` and modified the systemd unit files to use it. * Refactored the Vagrant installation process to use environment variables From 1ffce8f52d90e27dbfcb4863c57448a6cb5c5666 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Fri, 4 Mar 2016 23:59:13 +0000 Subject: [PATCH 56/71] Documented the API and added some help for the config file --- docs/api.rst | 23 +++++++++++++++++++++++ docs/consumption.rst | 13 ++++++------- paperless.conf.example | 32 ++++++++++++++++++++++++++++++++ scripts/vagrant-provision | 22 +++------------------- 4 files changed, 64 insertions(+), 26 deletions(-) create mode 100644 docs/api.rst create mode 100644 paperless.conf.example diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 000000000..15ca9bc44 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,23 @@ +.. _api: + +The REST API +############ + +Paperless makes use of the `Django REST Framework`_ standard API interface +because of its inherent awesomeness. 
Conveniently, the system is also +self-documenting, so learn more about the access points, schema, what's +accepted and what isn't, you need only visit ``/api`` on your local Paperless +installation. + +.. _Django REST Framework: http://django-rest-framework.org/ + + +.. _api-uploading: + +Uploading +--------- + +File uploads in an API are hard and so far as I've been able to tell, there's +no standard way of accepting them, so rather than crowbar file uploads into the +REST API and endure that headache, I've left that process to a simple HTTP +POST, documented on the :ref:`consumption page <consumption-http>`. diff --git a/docs/consumption.rst b/docs/consumption.rst index 0f8ff7ca5..eadf12823 100644 --- a/docs/consumption.rst +++ b/docs/consumption.rst @@ -40,9 +40,9 @@ follow the :ref:`consumer <utilities-consumer>` instructions to get it running. A Note on File Naming --------------------- -Any document you put into the consumption directory will be consumed, but if you -name the file right, it'll automatically set some values in the database for -you. This is is the logic the consumer follows: +Any document you put into the consumption directory will be consumed, but if +you name the file right, it'll automatically set some values in the database +for you. This is is the logic the consumer follows: 1. Try to find the correspondent, title, and tags in the file name following the pattern: ``Correspondent - Title - tag,tag,tag.pdf``. @@ -111,11 +111,10 @@ So, with all that in mind, here's what you do to get it running: HTTP POST ========= -Currently, the API is limited to only handling file uploads, it doesn't do tags -yet, and the URL schema isn't concrete, but it's a start. It's also not much of -a real API, it's just a URL that accepts an HTTP POST. +You can also submit a document via HTTP POST. It doesn't do tags yet, and the +URL schema isn't concrete, but it's a start. 
-To push your document to *Paperless*, send an HTTP POST to the server with the +To push your document to Paperless, send an HTTP POST to the server with the following name/value pairs: * ``correspondent``: The name of the document's correspondent. Note that there diff --git a/paperless.conf.example b/paperless.conf.example new file mode 100644 index 000000000..fa65c35b9 --- /dev/null +++ b/paperless.conf.example @@ -0,0 +1,32 @@ +# Sample paperless.conf +# Copy this file to /etc/paperless.conf and modify it to suit your needs. + +# This where your documents should go to be consumed. Make sure that it exists +# and that the user running the paperless service can read/write its contents +# before you start Paperless. +PAPERLESS_CONSUMPTION_DIR="" + +# These values are required if you want paperless to check a particular email +# box every 10 minutes and attempt to consume documents from there. If you +# dont define a HOST, mail checking will just be disabled. +PAPERLESS_CONSUME_MAIL_HOST="" +PAPERLESS_CONSUME_MAIL_PORT="" +PAPERLESS_CONSUME_MAIL_USER="" +PAPERLESS_CONSUME_MAIL_PASS="" + +# You must have a passphrase in order for Paperless to work at all. If you set +# this to "", GNUGPG will "encrypt" your PDF by writing it out as a zero-byte +# file. +# The passphrase you use here will be used when storing your documents in +# Paperless, but you can always export them in an unencrypted format by using +# document exporter. See the documentaiton for more information. +# +# One final note about the passphrase. Once you've consumed a document with +# one passphrase, DON'T CHANGE IT. Paperless assumes this to be a constant and +# can't properly export documents that were encrypted with an old passphrase if +# you've since changed it to a new one. +PAPERLESS_PASSPHRASE="secret" + +# If you intend to consume documents either via HTTP POST or by email, you must +# have a shared secret here. 
+PAPERLESS_SHARED_SECRET="" diff --git a/scripts/vagrant-provision b/scripts/vagrant-provision index 2a744d5d3..0a09058e4 100644 --- a/scripts/vagrant-provision +++ b/scripts/vagrant-provision @@ -11,25 +11,9 @@ apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick pip3 install -r /opt/paperless/requirements.txt # Create the environment file -echo " -# This where your documents should go to be consumed. Make sure that it exists -# before you start Paperless. -PAPERLESS_CONSUMPTION_DIR='/home/vagrant/consumption' - -# This is the secret passphrase used to encrypt the documents once they have -# been consumed. Change it to whatever you like, but you shouldn't change it -# after it has been used to consume a document or you won't be able to read -# that document again. -PAPERLESS_PASSPHRASE='secret' - -# This is the secret string used to verify PDFs sent by mail or consumed via -# the API. If you don't plan to use either of these, you can safely leave it -# blank -PAPERLESS_SHARED_SECRET='' -" > /tmp/paperless.conf -chmod 0640 /tmp/paperless.conf -chown root:vagrant /tmp/paperless.conf -mv /tmp/paperless.conf /etc/ +cat /opt/paperless/paperless.conf.example | sed -e 's#CONSUMPTION_DIR=""#CONSUMPTION_DIR="/home/vagrant/consumption"#' > /etc/paperless.conf +chmod 0640 /etc/paperless.conf +chown root:vagrant /etc/paperless.conf # Create the consumption directory mkdir /home/vagrant/consumption From eb05707f2788db885d33be800fd22392a1b6692c Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sat, 5 Mar 2016 00:01:09 +0000 Subject: [PATCH 57/71] Added link to the api doc page --- docs/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.rst b/docs/index.rst index fc78f6f23..47710d376 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -30,6 +30,7 @@ Contents requirements setup consumption + api utilities migrating changelog From 8b5416896d80695a4cfbea125bba3950baf8ff57 Mon Sep 17 00:00:00 2001 From: Daniel Quinn 
<code@danielquinn.org> Date: Sat, 5 Mar 2016 00:03:45 +0000 Subject: [PATCH 58/71] Grammar & formatting --- paperless.conf.example | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paperless.conf.example b/paperless.conf.example index fa65c35b9..3ee429ea8 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -8,7 +8,7 @@ PAPERLESS_CONSUMPTION_DIR="" # These values are required if you want paperless to check a particular email # box every 10 minutes and attempt to consume documents from there. If you -# dont define a HOST, mail checking will just be disabled. +# don't define a HOST, mail checking will just be disabled. PAPERLESS_CONSUME_MAIL_HOST="" PAPERLESS_CONSUME_MAIL_PORT="" PAPERLESS_CONSUME_MAIL_USER="" @@ -17,6 +17,7 @@ PAPERLESS_CONSUME_MAIL_PASS="" # You must have a passphrase in order for Paperless to work at all. If you set # this to "", GNUGPG will "encrypt" your PDF by writing it out as a zero-byte # file. +# # The passphrase you use here will be used when storing your documents in # Paperless, but you can always export them in an unencrypted format by using # document exporter. See the documentaiton for more information. 
From 5c41e717f0cc718f86c91c6179cafbe4b3e9bd56 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sat, 5 Mar 2016 00:23:14 +0000 Subject: [PATCH 59/71] Missed on case of 'sender' --- src/paperless/urls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paperless/urls.py b/src/paperless/urls.py index e81d4dcf9..4b73dc88e 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -26,7 +26,7 @@ from documents.views import ( ) router = DefaultRouter() -router.register(r'senders', CorrespondentViewSet) +router.register(r'correspondents', CorrespondentViewSet) router.register(r'tags', TagViewSet) router.register(r'documents', DocumentViewSet) router.register(r'logs', LogViewSet) From 52f15b4de14c3dbaa4969fb6f4e5382d47c230a5 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sat, 5 Mar 2016 01:57:49 +0000 Subject: [PATCH 60/71] The first stages of getting thumbnails back --- .gitignore | 5 +- media/documents/originals/.keep | 0 media/documents/thumbnails/.keep | 0 .../migrations/0012_auto_20160305_0040.py | 101 ++++++++++++++++++ src/documents/models.py | 20 +++- src/documents/views.py | 8 +- src/paperless/urls.py | 6 +- 7 files changed, 135 insertions(+), 5 deletions(-) create mode 100644 media/documents/originals/.keep create mode 100644 media/documents/thumbnails/.keep create mode 100644 src/documents/migrations/0012_auto_20160305_0040.py diff --git a/.gitignore b/.gitignore index d4c3fe38e..3c8b8ffea 100644 --- a/.gitignore +++ b/.gitignore @@ -57,7 +57,9 @@ docs/_build/ target/ # Stored PDFs -media/* +media/documents/*.gpg +media/documents/thumbnails/*.gpg +media/documents/originals/*.gpg # Sqlite database db.sqlite3 @@ -74,4 +76,3 @@ docker-compose.env # Used for development scripts/import-for-development environment - diff --git a/media/documents/originals/.keep b/media/documents/originals/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/media/documents/thumbnails/.keep 
b/media/documents/thumbnails/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py new file mode 100644 index 000000000..e42c6cde5 --- /dev/null +++ b/src/documents/migrations/0012_auto_20160305_0040.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.2 on 2016-03-05 00:40 +from __future__ import unicode_literals + +import gnupg +import os +import re +import shutil +import subprocess +import tempfile + +from django.conf import settings +from django.db import migrations + + +class GnuPG(object): + """ + A handy singleton to use when handling encrypted files. + """ + + gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME) + + @classmethod + def decrypted(cls, file_handle): + return cls.gpg.decrypt_file( + file_handle, passphrase=settings.PASSPHRASE).data + + @classmethod + def encrypted(cls, file_handle): + return cls.gpg.encrypt_file( + file_handle, + recipients=None, + passphrase=settings.PASSPHRASE, + symmetric=True + ).data + + +def move_documents_and_create_thumbnails(apps, schema_editor): + + documents = os.listdir(os.path.join(settings.MEDIA_ROOT, "documents")) + + if not documents: + return + + print("\n") + + for f in sorted(documents): + + if not f.endswith("gpg"): + continue + + print(" * Generating a thumbnail for {}".format(f)) + + thumb_temp = tempfile.mkdtemp( + prefix="paperless", dir=settings.SCRATCH_DIR) + orig_temp = tempfile.mkdtemp( + prefix="paperless", dir=settings.SCRATCH_DIR) + + orig_source = os.path.join(settings.MEDIA_ROOT, "documents", f) + orig_target = os.path.join(orig_temp, f.replace(".gpg", "")) + + with open(orig_source, "rb") as encrypted: + with open(orig_target, "wb") as unencrypted: + unencrypted.write(GnuPG.decrypted(encrypted)) + + subprocess.Popen(( + settings.CONVERT_BINARY, + "-scale", "500x500", + orig_target, + os.path.join(thumb_temp, "convert-%04d.jpg") + )).wait() + + thumb_source = 
os.path.join(thumb_temp, "convert-0000.jpg") + thumb_target = os.path.join( + settings.MEDIA_ROOT, + "documents", + "thumbnails", + re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.jpg\\2", f) + ) + with open(thumb_source, "rb") as unencrypted: + with open(thumb_target, "wb") as encrypted: + encrypted.write(GnuPG.encrypted(unencrypted)) + + shutil.rmtree(thumb_temp) + shutil.rmtree(orig_temp) + + shutil.move( + os.path.join(settings.MEDIA_ROOT, "documents", f), + os.path.join(settings.MEDIA_ROOT, "documents", "originals", f), + ) + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '0011_auto_20160303_1929'), + ] + + operations = [ + migrations.RunPython(move_documents_and_create_thumbnails), + ] diff --git a/src/documents/models.py b/src/documents/models.py index a82f7643f..a3ffb8a74 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -171,6 +171,7 @@ class Document(models.Model): return os.path.join( settings.MEDIA_ROOT, "documents", + "originals", "{:07}.{}.gpg".format(self.pk, self.file_type) ) @@ -184,7 +185,24 @@ class Document(models.Model): @property def download_url(self): - return reverse("fetch", kwargs={"pk": self.pk}) + return reverse("fetch", kwargs={"kind": "doc", "pk": self.pk}) + + @property + def thumbnail_path(self): + return os.path.join( + settings.MEDIA_ROOT, + "documents", + "thumbnails", + "{:07}.jpg.gpg".format(self.pk) + ) + + @property + def thumbnail_file(self): + return open(self.thumbnail_path, "rb") + + @property + def thumbnail_url(self): + return reverse("fetch", kwargs={"kind": "thumb", "pk": self.pk}) class Log(models.Model): diff --git a/src/documents/views.py b/src/documents/views.py index ff7c4ce05..4a4a060bf 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -35,7 +35,7 @@ class FetchView(LoginRequiredMixin, DetailView): def render_to_response(self, context, **response_kwargs): """ - Override the default to return the unencrypted PDF as raw data. 
+ Override the default to return the unencrypted image/PDF as raw data. """ content_types = { @@ -46,6 +46,12 @@ class FetchView(LoginRequiredMixin, DetailView): Document.TYPE_TIF: "image/tiff", } + if self.kwargs["kind"] == "thumb": + return HttpResponse( + GnuPG.decrypted(self.object.thumb_file), + content_type=content_types[Document.TYPE_JPG] + ) + response = HttpResponse( GnuPG.decrypted(self.object.source_file), content_type=content_types[self.object.file_type] diff --git a/src/paperless/urls.py b/src/paperless/urls.py index 4b73dc88e..a7775a588 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -44,7 +44,11 @@ urlpatterns = [ # url(r"^$", IndexView.as_view(), name="index"), # File downloads - url(r"^fetch/(?P<pk>\d+)$", FetchView.as_view(), name="fetch"), + url( + r"^fetch/(?P<kind>doc|thumb)/(?P<pk>\d+)$", + FetchView.as_view(), + name="fetch" + ), # The Django admin url(r"admin/", admin.site.urls), From 8a9ea4664c01f104436cfc89119f7429050841dd Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sat, 5 Mar 2016 02:15:26 +0000 Subject: [PATCH 61/71] Cleaned up the thumbnails by switching to .png --- src/documents/migrations/0012_auto_20160305_0040.py | 9 +++++---- src/documents/models.py | 2 +- src/documents/views.py | 6 +++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py index e42c6cde5..876c2c68e 100644 --- a/src/documents/migrations/0012_auto_20160305_0040.py +++ b/src/documents/migrations/0012_auto_20160305_0040.py @@ -65,17 +65,18 @@ def move_documents_and_create_thumbnails(apps, schema_editor): subprocess.Popen(( settings.CONVERT_BINARY, - "-scale", "500x500", + "-scale", "500x5000", + "-alpha", "remove", orig_target, - os.path.join(thumb_temp, "convert-%04d.jpg") + os.path.join(thumb_temp, "convert-%04d.png") )).wait() - thumb_source = os.path.join(thumb_temp, "convert-0000.jpg") + thumb_source = 
os.path.join(thumb_temp, "convert-0000.png") thumb_target = os.path.join( settings.MEDIA_ROOT, "documents", "thumbnails", - re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.jpg\\2", f) + re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f) ) with open(thumb_source, "rb") as unencrypted: with open(thumb_target, "wb") as encrypted: diff --git a/src/documents/models.py b/src/documents/models.py index a3ffb8a74..b8baea7f8 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -193,7 +193,7 @@ class Document(models.Model): settings.MEDIA_ROOT, "documents", "thumbnails", - "{:07}.jpg.gpg".format(self.pk) + "{:07}.png.gpg".format(self.pk) ) @property diff --git a/src/documents/views.py b/src/documents/views.py index 4a4a060bf..1dc23aa4f 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -29,7 +29,7 @@ class IndexView(TemplateView): return TemplateView.get_context_data(self, **kwargs) -class FetchView(LoginRequiredMixin, DetailView): +class FetchView(DetailView): model = Document @@ -48,8 +48,8 @@ class FetchView(LoginRequiredMixin, DetailView): if self.kwargs["kind"] == "thumb": return HttpResponse( - GnuPG.decrypted(self.object.thumb_file), - content_type=content_types[Document.TYPE_JPG] + GnuPG.decrypted(self.object.thumbnail_file), + content_type=content_types[Document.TYPE_PNG] ) response = HttpResponse( From 495ed1c36c9c8ebb120449ad0bdac9be27255f3c Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sat, 5 Mar 2016 12:09:06 +0000 Subject: [PATCH 62/71] Added thumbnail generation to the conumer --- src/documents/consumer.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index eeb42cdf1..5cfc20852 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -119,10 +119,11 @@ class Consumer(object): tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) pngs = self._get_greyscale(tempdir, doc) + thumbnail = 
self._get_thumbnail(tempdir, doc) try: text = self._get_ocr(pngs) - self._store(text, doc) + self._store(text, doc, thumbnail) except OCRError as e: self._ignore.append(doc) self.log("error", "OCR FAILURE for {}: {}".format(doc, e)) @@ -133,6 +134,9 @@ class Consumer(object): self._cleanup_doc(doc) def _get_greyscale(self, tempdir, doc): + """ + Greyscale images are easier for Tesseract to OCR + """ self.log("info", "Generating greyscale image from {}".format(doc)) @@ -150,6 +154,23 @@ class Consumer(object): return sorted(filter(lambda __: os.path.isfile(__), pngs)) + def _get_thumbnail(self, tempdir, doc): + """ + The thumbnail of a PDF is just a 500px wide image of the first page. + """ + + self.log("info", "Generating the thumbnail") + + subprocess.Popen(( + self.CONVERT, + "-scale", "500x5000", + "-alpha", "remove", + doc, + os.path.join(tempdir, "convert-%04d.png") + )).wait() + + return os.path.join(tempdir, "convert-0000.png") + def _guess_language(self, text): try: guess = langdetect.detect(text) @@ -288,7 +309,7 @@ class Consumer(object): m = re.match(self.REGEX_TITLE, parseable) return None, m.group(1), (), get_suffix(m.group(2)) - def _store(self, text, doc): + def _store(self, text, doc, thumbnail): sender, title, tags, file_type = self._guess_attributes_from_name(doc) relevant_tags = set(list(Tag.match_all(text)) + list(tags)) @@ -313,9 +334,16 @@ class Consumer(object): self.log("debug", "Tagging with {}".format(tag_names)) document.tags.add(*relevant_tags) + # Encrypt and store the actual document with open(doc, "rb") as unencrypted: with open(document.source_path, "wb") as encrypted: - self.log("debug", "Encrypting") + self.log("debug", "Encrypting the document") + encrypted.write(GnuPG.encrypted(unencrypted)) + + # Encrypt and store the thumbnail + with open(thumbnail, "rb") as unencrypted: + with open(document.thumbnail_path, "wb") as encrypted: + self.log("debug", "Encrypting the thumbnail") encrypted.write(GnuPG.encrypted(unencrypted)) 
self.log("info", "Completed") From ac40aee805a7289a721469b50f146d5c3801cdfe Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sat, 5 Mar 2016 12:31:43 +0000 Subject: [PATCH 63/71] Added some nice output so the migration is less scary --- .../migrations/0012_auto_20160305_0040.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py index 876c2c68e..618ace5d8 100644 --- a/src/documents/migrations/0012_auto_20160305_0040.py +++ b/src/documents/migrations/0012_auto_20160305_0040.py @@ -11,6 +11,7 @@ import tempfile from django.conf import settings from django.db import migrations +from django.utils.termcolors import colorize as colourise # Spelling hurts me class GnuPG(object): @@ -42,14 +43,25 @@ def move_documents_and_create_thumbnails(apps, schema_editor): if not documents: return - print("\n") + print(colourise( + "\n\n" + " This is a one-time only migration to generate thumbnails for all of your\n" + " documents so that future UIs will have something to work with. If you have\n" + " a lot of documents though, this may take a while, so a coffee break may be\n" + " in order." 
+ "\n", opts=("bold",) + )) for f in sorted(documents): if not f.endswith("gpg"): continue - print(" * Generating a thumbnail for {}".format(f)) + print(" {} {} {}".format( + colourise("*", fg="green"), + colourise("Generating a thumbnail for", fg="white"), + colourise(f, fg="cyan") + )) thumb_temp = tempfile.mkdtemp( prefix="paperless", dir=settings.SCRATCH_DIR) From 034b96277cbb15ca178b3c60ea10a1db0da0c782 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sat, 5 Mar 2016 12:34:26 +0000 Subject: [PATCH 64/71] Added thumbnail_url to the API --- src/documents/serialisers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index c2b2ae7fd..db50d34ba 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -37,7 +37,8 @@ class DocumentSerializer(serializers.ModelSerializer): "created", "modified", "file_name", - "download_url" + "download_url", + "thumbnail_url", ) From bfad4560e139257ec81ccd284984001ee53bfce9 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sat, 5 Mar 2016 12:43:05 +0000 Subject: [PATCH 65/71] Fixed the check for empty installations --- src/documents/migrations/0012_auto_20160305_0040.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py index 618ace5d8..62a5c65bc 100644 --- a/src/documents/migrations/0012_auto_20160305_0040.py +++ b/src/documents/migrations/0012_auto_20160305_0040.py @@ -40,7 +40,7 @@ def move_documents_and_create_thumbnails(apps, schema_editor): documents = os.listdir(os.path.join(settings.MEDIA_ROOT, "documents")) - if not documents: + if set(documents) == {"originals", "thumbnails"}: return print(colourise( From 9180ad78c4a475f50d8e90d2051545c6f5ff8942 Mon Sep 17 00:00:00 2001 From: Pit Kleyersburg <pitkley@googlemail.com> Date: Sun, 6 Mar 2016 14:39:28 +0100 
Subject: [PATCH 66/71] Update Dockerfile to match latest version --- Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index fec76ee37..eb9fa90dd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,7 +16,11 @@ RUN pip install --no-cache-dir -r requirements.txt # Copy application RUN mkdir -p /usr/src/paperless/src +RUN mkdir -p /usr/src/paperless/data +RUN mkdir -p /usr/src/paperless/media COPY src/ /usr/src/paperless/src/ +COPY data/ /usr/src/paperless/data/ +COPY media/ /usr/src/paperless/media/ # Set consumption directory ENV PAPERLESS_CONSUMPTION_DIR /consume @@ -24,7 +28,6 @@ RUN mkdir -p $PAPERLESS_CONSUMPTION_DIR # Migrate database WORKDIR /usr/src/paperless/src -RUN mkdir /usr/src/paperless/data RUN ./manage.py migrate # Create user From fb36a49c2681aa5362e30f266e85c89565a310c3 Mon Sep 17 00:00:00 2001 From: Pit Kleyersburg <pitkley@googlemail.com> Date: Tue, 16 Feb 2016 10:49:55 +0100 Subject: [PATCH 67/71] Add unpaper as another pre-processing step --- Dockerfile | 2 +- docs/requirements.rst | 2 + scripts/vagrant-provision | 2 +- src/documents/consumer.py | 80 ++++++++++++++++++++++----------------- src/paperless/settings.py | 3 ++ 5 files changed, 53 insertions(+), 36 deletions(-) diff --git a/Dockerfile b/Dockerfile index eb9fa90dd..a13fa7b3f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ MAINTAINER Pit Kleyersburg <pitkley@googlemail.com> RUN apt-get update \ && apt-get install -y --no-install-recommends \ sudo \ - tesseract-ocr tesseract-ocr-eng imagemagick ghostscript \ + tesseract-ocr tesseract-ocr-eng imagemagick ghostscript unpaper \ && rm -rf /var/lib/apt/lists/* # Install python dependencies diff --git a/docs/requirements.rst b/docs/requirements.rst index ee287d835..36bc234c0 100644 --- a/docs/requirements.rst +++ b/docs/requirements.rst @@ -10,11 +10,13 @@ should work) that has the following software installed on it: * `GNU Privacy Guard`_ * `Tesseract`_ * `Imagemagick`_ +* `unpaper`_ 
.. _Python3: https://python.org/ .. _GNU Privacy Guard: https://gnupg.org .. _Tesseract: https://github.com/tesseract-ocr .. _Imagemagick: http://imagemagick.org/ +.. _unpaper: https://www.flameeyes.eu/projects/unpaper Notably, you should confirm how you access your Python3 installation. Many Linux distributions will install Python3 in parallel to Python2, using the names diff --git a/scripts/vagrant-provision b/scripts/vagrant-provision index 0a09058e4..940bf476c 100644 --- a/scripts/vagrant-provision +++ b/scripts/vagrant-provision @@ -5,7 +5,7 @@ apt-get update apt-get build-dep -y python-imaging apt-get install -y libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev apt-get install -y build-essential python3-dev python3-pip sqlite3 libsqlite3-dev git -apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick +apt-get install -y tesseract-ocr tesseract-ocr-eng imagemagick unpaper # Python dependencies pip3 install -r /opt/paperless/requirements.txt diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 5cfc20852..fbdbbc276 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -39,8 +39,8 @@ class ConsumerError(Exception): class Consumer(object): """ Loop over every file found in CONSUMPTION_DIR and: - 1. Convert it to a greyscale png - 2. Use tesseract on the png + 1. Convert it to a greyscale pnm + 2. Use tesseract on the pnm 3. Encrypt and store the document in the MEDIA_ROOT 4. Store the OCR'd text in the database 5. 
Delete the document and image(s) @@ -48,6 +48,7 @@ class Consumer(object): SCRATCH = settings.SCRATCH_DIR CONVERT = settings.CONVERT_BINARY + UNPAPER = settings.UNPAPER_BINARY CONSUME = settings.CONSUMPTION_DIR THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None @@ -118,11 +119,11 @@ class Consumer(object): self.log("info", "Consuming {}".format(doc)) tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) - pngs = self._get_greyscale(tempdir, doc) + imgs = self._get_greyscale(tempdir, doc) thumbnail = self._get_thumbnail(tempdir, doc) try: - text = self._get_ocr(pngs) + text = self._get_ocr(imgs) self._store(text, doc, thumbnail) except OCRError as e: self._ignore.append(doc) @@ -140,19 +141,30 @@ class Consumer(object): self.log("info", "Generating greyscale image from {}".format(doc)) - png = os.path.join(tempdir, "convert-%04d.jpg") - + # Convert PDF to multiple PNMs + pnm = os.path.join(tempdir, "convert-%04d.pnm") subprocess.Popen(( self.CONVERT, "-density", "300", "-depth", "8", - "-type", "grayscale", doc, png + "-type", "grayscale", doc, pnm )).wait() - pngs = [] + # Get a list of converted images + pnms = [] for f in os.listdir(tempdir): - if f.startswith("convert"): - pngs.append(os.path.join(tempdir, f)) + if f.endswith(".pnm"): + pnms.append(os.path.join(tempdir, f)) - return sorted(filter(lambda __: os.path.isfile(__), pngs)) + # Run unpaper in parallel on converted images + with Pool(processes=self.THREADS) as pool: + pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms)) + + # Return list of converted images, processed with unpaper + pnms = [] + for f in os.listdir(tempdir): + if f.endswith(".unpaper.pnm"): + pnms.append(os.path.join(tempdir, f)) + + return sorted(filter(lambda __: os.path.isfile(__), pnms)) def _get_thumbnail(self, tempdir, doc): """ @@ -179,21 +191,21 @@ class Consumer(object): except Exception as e: self.log("warning", "Language detection error: {}".format(e)) - def _get_ocr(self, pngs): + def 
_get_ocr(self, imgs): """ Attempts to do the best job possible OCR'ing the document based on simple language detection trial & error. """ - if not pngs: + if not imgs: raise OCRError("No images found") self.log("info", "OCRing the document") # Since the division gets rounded down by int, this calculation works # for every edge-case, i.e. 1 - middle = int(len(pngs) / 2) - raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE) + middle = int(len(imgs) / 2) + raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE) guessed_language = self._guess_language(raw_text) @@ -205,16 +217,16 @@ class Consumer(object): "As FORGIVING_OCR is enabled, we're going to make the " "best with what we have." ) - raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) + raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) return raw_text raise OCRError("Language detection failed") if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: - raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) + raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) return raw_text try: - return self._ocr(pngs, ISO639[guessed_language]) + return self._ocr(imgs, ISO639[guessed_language]) except pyocr.pyocr.tesseract.TesseractError: if settings.FORGIVING_OCR: self.log( @@ -224,34 +236,34 @@ class Consumer(object): guessed_language ) ) - raw_text = self._assemble_ocr_sections(pngs, middle, raw_text) + raw_text = self._assemble_ocr_sections(imgs, middle, raw_text) return raw_text raise OCRError( "The guessed language is not available in this instance of " "Tesseract." ) - def _assemble_ocr_sections(self, pngs, middle, text): + def _assemble_ocr_sections(self, imgs, middle, text): """ Given a `middle` value and the text that middle page represents, we OCR the remainder of the document and return the whole thing. 
""" - text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + text - text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE) + text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text + text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE) return text - def _ocr(self, pngs, lang): + def _ocr(self, imgs, lang): """ Performs a single OCR attempt. """ - if not pngs: + if not imgs: return "" self.log("info", "Parsing for {}".format(lang)) with Pool(processes=self.THREADS) as pool: - r = pool.map(image_to_string, itertools.product(pngs, [lang])) + r = pool.map(image_to_string, itertools.product(imgs, [lang])) r = " ".join(r) # Strip out excess white space to allow matching to go smoother @@ -374,16 +386,9 @@ class Consumer(object): def image_to_string(args): - """ - I have no idea why, but if this function were a method of Consumer, it - would explode with: - - `TypeError: cannot serialize '_io.TextIOWrapper' object`. - """ - - png, lang = args + img, lang = args ocr = pyocr.get_available_tools()[0] - with Image.open(os.path.join(Consumer.SCRATCH, png)) as f: + with Image.open(os.path.join(Consumer.SCRATCH, img)) as f: if ocr.can_detect_orientation(): try: orientation = ocr.detect_orientation(f, lang=lang) @@ -391,3 +396,10 @@ def image_to_string(args): except TesseractError: pass return ocr.image_to_string(f, lang=lang) + + +def run_unpaper(args): + unpaper, pnm = args + subprocess.Popen(( + unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm") + )).wait() diff --git a/src/paperless/settings.py b/src/paperless/settings.py index f2fb41941..b7daecaf8 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -189,6 +189,9 @@ GNUPG_HOME = os.getenv("HOME", "/tmp") # Convert is part of the ImageMagick package CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY") +# Unpaper +UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") + # This will be created if it doesn't exist SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", 
"/tmp/paperless") From 2fba41ad7530699e30fa2a0f1306c5811070665f Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 6 Mar 2016 16:03:02 +0000 Subject: [PATCH 68/71] Added the use of unpaper to the README --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index 0aba0545e..80043ff7a 100644 --- a/README.rst +++ b/README.rst @@ -56,6 +56,7 @@ powerful tools. * `ImageMagick`_ converts the images between colour and greyscale. * `Tesseract`_ does the character recognition. +* `Unpaper`_ despeckles and deskews the scanned image. * `GNU Privacy Guard`_ is used as the encryption backend. * `Python 3`_ is the language of the project. @@ -93,6 +94,7 @@ home. .. _this one: http://www.brother.ca/en-CA/Scanners/11/ProductDetail/ADS1500W?ProductDetail=productdetail .. _ImageMagick: http://imagemagick.org/ .. _Tesseract: https://github.com/tesseract-ocr +.. _Unpaper: https://www.flameeyes.eu/projects/unpaper .. _GNU Privacy Guard: https://gnupg.org/ .. _Python 3: https://python.org/ ..
_Pillow: https://pypi.python.org/pypi/pillowfight/ From 3b278c3a242752dcff3e878bdf8cf6c3b28332dc Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 6 Mar 2016 17:26:07 +0000 Subject: [PATCH 69/71] Added an informational log message for consumer start --- .../management/commands/document_consumer.py | 9 +++++++++ src/documents/managers.py | 4 ++-- src/documents/models.py | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 0eae5c80c..8116303b5 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -1,10 +1,12 @@ import datetime +import logging import os import time from django.conf import settings from django.core.management.base import BaseCommand, CommandError +from ...models import Log from ...consumer import Consumer, ConsumerError from ...mail import MailFetcher, MailFetcherError @@ -44,6 +46,13 @@ class Command(BaseCommand): except FileExistsError: pass + logging.getLogger(__name__).info( + "Starting document consumer at {}".format( + settings.CONSUMPTION_DIR + ), + extra={"component": Log.COMPONENT_CONSUMER} + ) + try: while True: self.loop() diff --git a/src/documents/managers.py b/src/documents/managers.py index d7e7225eb..e7b0751ca 100644 --- a/src/documents/managers.py +++ b/src/documents/managers.py @@ -4,7 +4,7 @@ from django.db import models from django.db.models.aggregates import Max -class Concat(models.Aggregate): +class GroupConcat(models.Aggregate): """ Theoretically, this should work in Sqlite, PostgreSQL, and MySQL, but I've only ever tested it in Sqlite. 
@@ -60,7 +60,7 @@ class LogQuerySet(models.query.QuerySet): def by_group(self): return self.values("group").annotate( time=Max("modified"), - messages=Concat("message"), + messages=GroupConcat("message"), ).order_by("-time") diff --git a/src/documents/models.py b/src/documents/models.py index b8baea7f8..0d79dba0a 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -1,6 +1,7 @@ import logging import os import re +import uuid from django.conf import settings from django.core.urlresolvers import reverse @@ -236,3 +237,16 @@ class Log(models.Model): def __str__(self): return self.message + + def save(self, *args, **kwargs): + """ + To allow for the case where we don't want to group the message, we + shouldn't force the caller to specify a one-time group value. However, + allowing group=None means that the manager can't differentiate the + different un-grouped messages, so instead we set a random one here. + """ + + if not self.group: + self.group = uuid.uuid4() + + models.Model.save(self, *args, **kwargs) From f7e96eab724d8baa147d52c165047f234ae55856 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Sun, 6 Mar 2016 17:36:39 +0000 Subject: [PATCH 70/71] Put the lid on the changelog for 0.1.1 --- docs/changelog.rst | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 2228c9be1..f2ab6cabc 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,12 +1,20 @@ Changelog ######### -* 0.1.1 (master) +* 0.1.1 * Potentially **Breaking Change**: All references to "sender" in the code have been renamed to "correspondent" to better reflect the nature of the property (one could quite reasonably scan a document before sending it to someone.) + * `#67`_: Rewrote the document exporter and added a new importer that allows + for full metadata retention without depending on the file name and + modification time. 
A big thanks to `Tikitu de Jager`_, `Pit`_, + `Florian Jung`_, and `Christopher Luu`_ for their code snippets and + contributing conversation that led to this change. + * `#20`_: Added *unpaper* support to help in cleaning up the scanned image + before it's OCR'd. Thanks to `Pit`_ for this one. + * `#71`_: Added (encrypted) thumbnails in anticipation of a proper UI. * `#68`_: Added support for using a proper config file at ``/etc/paperless.conf`` and modified the systemd unit files to use it. * Refactored the Vagrant installation process to use environment variables @@ -69,14 +77,17 @@ Changelog * Initial release -.. _Wayne Werner: https://github.com/waynew .. _Brian Conn: https://github.com/TheConnMan +.. _Christopher Luu: https://github.com/nuudles +.. _Florian Jung: https://github.com/the01 .. _Tikitu de Jager: https://github.com/tikitu -.. _Pit: https://github.com/pitkley .. _Paul: https://github.com/polo2ro +.. _Pit: https://github.com/pitkley +.. _Wayne Werner: https://github.com/waynew .. _darkmatter: https://github.com/darkmatter .. _zedster: https://github.com/zedster +.. _#20: https://github.com/danielquinn/paperless/issues/20 .. _#44: https://github.com/danielquinn/paperless/issues/44 .. _#45: https://github.com/danielquinn/paperless/issues/45 .. _#47: https://github.com/danielquinn/paperless/issues/47 @@ -85,4 +96,6 @@ Changelog .. _#54: https://github.com/danielquinn/paperless/issues/54 .. _#57: https://github.com/danielquinn/paperless/issues/57 .. _#60: https://github.com/danielquinn/paperless/issues/60 +.. _#67: https://github.com/danielquinn/paperless/issues/67 .. _#68: https://github.com/danielquinn/paperless/issues/68 +..
_#71: https://github.com/danielquinn/paperless/issues/71 From 6ca389c28a2cd51294014e3ca6c7bdb65ab144d6 Mon Sep 17 00:00:00 2001 From: Daniel Quinn <code@danielquinn.org> Date: Mon, 7 Mar 2016 10:12:55 +0000 Subject: [PATCH 71/71] #76 --- src/documents/migrations/0012_auto_20160305_0040.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py index 62a5c65bc..91d384c22 100644 --- a/src/documents/migrations/0012_auto_20160305_0040.py +++ b/src/documents/migrations/0012_auto_20160305_0040.py @@ -52,6 +52,11 @@ def move_documents_and_create_thumbnails(apps, schema_editor): "\n", opts=("bold",) )) + try: + os.makedirs(settings.SCRATCH_DIR) + except FileExistsError: + pass + for f in sorted(documents): if not f.endswith("gpg"):