From f72c515742cd79be64502de533c7fe151172e0a9 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Fri, 1 Jan 2016 16:13:59 +0000 Subject: [PATCH] Added GPG encryption for the PDFs --- src/documents/admin.py | 19 ++--- src/documents/management/commands/consume.py | 73 ++++++++++---------- src/documents/models.py | 16 +++++ src/documents/views.py | 30 +++++++- src/manage.py | 9 +++ src/paperless/requirements.txt | 4 ++ src/paperless/settings.py | 2 + src/paperless/urls.py | 3 + 8 files changed, 106 insertions(+), 50 deletions(-) create mode 100644 src/paperless/requirements.txt diff --git a/src/documents/admin.py b/src/documents/admin.py index 39a18a4a2..9e5c934b9 100644 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -1,5 +1,5 @@ -from django.conf import settings from django.contrib import admin +from django.core.urlresolvers import reverse from django.templatetags.static import static from .models import Document @@ -8,27 +8,20 @@ from .models import Document class DocumentAdmin(admin.ModelAdmin): search_fields = ("sender", "title", "content",) - list_display = ("edit", "created", "sender", "title", "thumbnail", "pdf") + list_display = ("edit", "created", "sender", "title", "pdf") list_filter = ("created", "sender") save_on_top = True def edit(self, obj): - return 'Edit icon'.format( + return 'Edit icon'.format( static("documents/img/edit.png")) edit.allow_tags = True - def thumbnail(self, obj): - return '' \ - '' \ - ''.format(media=settings.MEDIA_URL, pk=obj.pk) - thumbnail.allow_tags = True - def pdf(self, obj): - return '' \ - 'PDF icon' \ + return '' \ + 'PDF icon' \ ''.format( - settings.MEDIA_URL, - obj.pk, + reverse("fetch", kwargs={"pk": obj.pk}), static("documents/img/application-pdf.png") ) pdf.allow_tags = True diff --git a/src/documents/management/commands/consume.py b/src/documents/management/commands/consume.py index b48f5b6be..da9a1c1a3 100644 --- a/src/documents/management/commands/consume.py +++ b/src/documents/management/commands/consume.py @@ -1,9 +1,9 @@ import datetime import glob +import gnupg import os import random import re -import shutil import subprocess import time @@ -36,8 +36,6 @@ class Command(BaseCommand): CONSUME = settings.CONSUMPTION_DIR OCR = pyocr.get_available_tools()[0] - - MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img") MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf") PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$") @@ -45,6 +43,7 @@ class Command(BaseCommand): def __init__(self, *args, **kwargs): self.verbosity = 0 self.stats = {} + self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME) BaseCommand.__init__(self, *args, **kwargs) def handle(self, *args, **options): @@ -77,18 +76,16 @@ class Command(BaseCommand): if self._is_ready(pdf): continue - if self.verbosity > 1: - print("Consuming {}".format(pdf)) + self._render("Consuming {}".format(pdf), 1) pngs = self._get_greyscale(pdf) - jpgs = self._get_colour(pdf) text = self._get_ocr(pngs) - self._store(text, jpgs, pdf) - self._cleanup(pngs, jpgs) + self._store(text, pdf) + self._cleanup(pngs, pdf) def _setup(self): - for d in (self.SCRATCH, self.MEDIA_IMG, self.MEDIA_PDF): + for d in (self.SCRATCH, self.MEDIA_PDF): try: os.makedirs(d) except FileExistsError: @@ -112,7 +109,9 @@ class Command(BaseCommand): def _get_greyscale(self, pdf): - i = random.randint(1000000, 4999999) + self._render(" Generating greyscale image", 2) + + i = random.randint(1000000, 9999999) png = os.path.join(self.SCRATCH, "{}.png".format(i)) subprocess.Popen(( @@ -122,45 +121,46 @@ class Command(BaseCommand): return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) - def _get_colour(self, pdf): - - i = random.randint(5000000, 9999999) - jpg = os.path.join(self.SCRATCH, "{}.jpg".format(i)) - - subprocess.Popen((self.CONVERT, pdf, jpg)).wait() - - return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) - def _get_ocr(self, pngs): + self._render(" OCRing the PDF", 2) + r = "" for png in pngs: with Image.open(os.path.join(self.SCRATCH, png)) as f: + self._render(" {}".format(f.filename), 3) r += self.OCR.image_to_string(f) r += "\n\n\n\n\n\n\n\n" return r - def _store(self, text, jpgs, pdf): + def _store(self, text, pdf): sender, title = self._parse_file_name(pdf) stats = os.stat(pdf) + self._render(" Saving record to database", 2) + doc = Document.objects.create( - sender=sender, - title=title, - content=text, - created=timezone.make_aware( - datetime.datetime.fromtimestamp(stats.st_ctime)), - modified=timezone.make_aware( - datetime.datetime.fromtimestamp(stats.st_mtime)), + sender=sender, + title=title, + content=text, + created=timezone.make_aware( + datetime.datetime.fromtimestamp(stats.st_mtime)), + modified=timezone.make_aware( + datetime.datetime.fromtimestamp(stats.st_mtime)) ) - shutil.move(jpgs[0], os.path.join( - self.MEDIA_IMG, "{:07}.jpg".format(doc.pk))) - shutil.move(pdf, os.path.join( - self.MEDIA_PDF, "{:07}.pdf".format(doc.pk))) + with open(pdf, "rb") as unencrypted: + with open(doc.pdf_path, "wb") as encrypted: + self._render(" Encrypting", 3) + encrypted.write(self.gpg.encrypt_file( + unencrypted, + recipients=None, + passphrase=settings.PASSPHRASE, + symmetric=True + ).data) def _parse_file_name(self, pdf): """ @@ -175,12 +175,15 @@ class Command(BaseCommand): return "", "" - def _cleanup(self, pngs, jpgs): + def _cleanup(self, pngs, pdf): - jpg_glob = os.path.join( - self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.jpg$", "\\1*", jpgs[0])) png_glob = os.path.join( self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])) - for f in list(glob.glob(jpg_glob)) + list(glob.glob(png_glob)): + for f in list(glob.glob(png_glob)) + [pdf]: + self._render(" Deleting {}".format(f), 2) os.unlink(f) + + def _render(self, text, verbosity): + if self.verbosity >= verbosity: + print(text) diff --git a/src/documents/models.py b/src/documents/models.py index 16a45d97e..e39397b1b 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -1,3 +1,6 @@ +import os + +from django.conf import settings from django.db import models from django.utils import timezone @@ -20,3 +23,16 @@ class Document(models.Model): if self.sender or self.title: return "{}: {}, {}".format(created, self.sender or self.title) return str(created) + + @property + def pdf_path(self): + return os.path.join( + settings.MEDIA_ROOT, + "documents", + "pdf", + "{:07}.pdf.gpg".format(self.pk) + ) + + @property + def pdf(self): + return open(self.pdf_path, "rb") diff --git a/src/documents/views.py b/src/documents/views.py index 91ea44a21..b752167c6 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1,3 +1,29 @@ -from django.shortcuts import render +import gnupg -# Create your views here. +from django.conf import settings +from django.http import HttpResponse +from django.template.defaultfilters import slugify +from django.views.generic.detail import DetailView + +from .models import Document + + +class PdfView(DetailView): + + model = Document + + def render_to_response(self, context, **response_kwargs): + """ + Override the default to return the unencrypted PDF as raw data. + """ + + gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME) + + response = HttpResponse(gpg.decrypt_file( + self.object.pdf, + passphrase=settings.PASSPHRASE, + ).data, content_type="application/pdf") + response["Content-Disposition"] = 'attachment; filename="{}"'.format( + slugify(str(self.object)) + ".pdf") + + return response diff --git a/src/manage.py b/src/manage.py index 99d61722e..ed6281d2c 100755 --- a/src/manage.py +++ b/src/manage.py @@ -5,6 +5,15 @@ import sys if __name__ == "__main__": os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings") + from django.conf import settings from django.core.management import execute_from_command_line + # The runserver and consumer need to have access to the passphrase, so it + # must be entered at start time to keep it safe. + if "runserver" in sys.argv or "consume" in sys.argv: + settings.PASSPHRASE = "asdf" + if not settings.DEBUG: + settings.PASSPHRASE = input( + "Production environment. Input passphrase: ") + execute_from_command_line(sys.argv) diff --git a/src/paperless/requirements.txt b/src/paperless/requirements.txt new file mode 100644 index 000000000..6975a8733 --- /dev/null +++ b/src/paperless/requirements.txt @@ -0,0 +1,4 @@ +Django==1.9 +Pillow==3.0.0 +pyocr==0.3.1 +python-gnupg==0.3.8 diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 32e9ee36d..a910148c5 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -135,3 +135,5 @@ MEDIA_URL = "/media/" CONVERT_BINARY = "/usr/bin/convert" SCRATCH_DIR = "/tmp/paperless" # Will be created if it doesn't exist CONSUMPTION_DIR = "/tmp/paperless/consume" +GNUPG_HOME = os.environ.get("HOME", "/dev/null") +PASSPHRASE = None # Set via manage.py diff --git a/src/paperless/urls.py b/src/paperless/urls.py index 967eb65a4..2b311f858 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -18,6 +18,9 @@ from django.conf import settings from django.conf.urls import url, static from django.contrib import admin +from documents.views import PdfView + urlpatterns = [ + url(r"^fetch/(?P\d+)$", PdfView.as_view(), name="fetch"), url(r'', admin.site.urls), ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)