Added GPG encryption for the PDFs

This commit is contained in:
Daniel Quinn 2016-01-01 16:13:59 +00:00
parent 6956376d71
commit f72c515742
8 changed files with 106 additions and 50 deletions

View File

@ -1,5 +1,5 @@
from django.conf import settings
from django.contrib import admin from django.contrib import admin
from django.core.urlresolvers import reverse
from django.templatetags.static import static from django.templatetags.static import static
from .models import Document from .models import Document
@ -8,27 +8,20 @@ from .models import Document
class DocumentAdmin(admin.ModelAdmin): class DocumentAdmin(admin.ModelAdmin):
search_fields = ("sender", "title", "content",) search_fields = ("sender", "title", "content",)
list_display = ("edit", "created", "sender", "title", "thumbnail", "pdf") list_display = ("edit", "created", "sender", "title", "pdf")
list_filter = ("created", "sender") list_filter = ("created", "sender")
save_on_top = True save_on_top = True
def edit(self, obj): def edit(self, obj):
return '<img src="{}" width="64" height="64" alt="Edit icon" />'.format( return '<img src="{}" width="22" height="22" alt="Edit icon" />'.format(
static("documents/img/edit.png")) static("documents/img/edit.png"))
edit.allow_tags = True edit.allow_tags = True
def thumbnail(self, obj):
return '<a href="{media}documents/img/{pk:07}.jpg" target="_blank">' \
'<img src="{media}documents/img/{pk:07}.jpg" width="100" />' \
'</a>'.format(media=settings.MEDIA_URL, pk=obj.pk)
thumbnail.allow_tags = True
def pdf(self, obj): def pdf(self, obj):
return '<a href="{}documents/pdf/{:07}.pdf">' \ return '<a href="{}">' \
'<img src="{}" width="64" height="64" alt="PDF icon">' \ '<img src="{}" width="22" height="22" alt="PDF icon">' \
'</a>'.format( '</a>'.format(
settings.MEDIA_URL, reverse("fetch", kwargs={"pk": obj.pk}),
obj.pk,
static("documents/img/application-pdf.png") static("documents/img/application-pdf.png")
) )
pdf.allow_tags = True pdf.allow_tags = True

View File

@ -1,9 +1,9 @@
import datetime import datetime
import glob import glob
import gnupg
import os import os
import random import random
import re import re
import shutil
import subprocess import subprocess
import time import time
@ -36,8 +36,6 @@ class Command(BaseCommand):
CONSUME = settings.CONSUMPTION_DIR CONSUME = settings.CONSUMPTION_DIR
OCR = pyocr.get_available_tools()[0] OCR = pyocr.get_available_tools()[0]
MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img")
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf") MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$") PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$")
@ -45,6 +43,7 @@ class Command(BaseCommand):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.verbosity = 0 self.verbosity = 0
self.stats = {} self.stats = {}
self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
BaseCommand.__init__(self, *args, **kwargs) BaseCommand.__init__(self, *args, **kwargs)
def handle(self, *args, **options): def handle(self, *args, **options):
@ -77,18 +76,16 @@ class Command(BaseCommand):
if self._is_ready(pdf): if self._is_ready(pdf):
continue continue
if self.verbosity > 1: self._render("Consuming {}".format(pdf), 1)
print("Consuming {}".format(pdf))
pngs = self._get_greyscale(pdf) pngs = self._get_greyscale(pdf)
jpgs = self._get_colour(pdf)
text = self._get_ocr(pngs) text = self._get_ocr(pngs)
self._store(text, jpgs, pdf) self._store(text, pdf)
self._cleanup(pngs, jpgs) self._cleanup(pngs, pdf)
def _setup(self): def _setup(self):
for d in (self.SCRATCH, self.MEDIA_IMG, self.MEDIA_PDF): for d in (self.SCRATCH, self.MEDIA_PDF):
try: try:
os.makedirs(d) os.makedirs(d)
except FileExistsError: except FileExistsError:
@ -112,7 +109,9 @@ class Command(BaseCommand):
def _get_greyscale(self, pdf): def _get_greyscale(self, pdf):
i = random.randint(1000000, 4999999) self._render(" Generating greyscale image", 2)
i = random.randint(1000000, 9999999)
png = os.path.join(self.SCRATCH, "{}.png".format(i)) png = os.path.join(self.SCRATCH, "{}.png".format(i))
subprocess.Popen(( subprocess.Popen((
@ -122,45 +121,46 @@ class Command(BaseCommand):
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
def _get_colour(self, pdf):
i = random.randint(5000000, 9999999)
jpg = os.path.join(self.SCRATCH, "{}.jpg".format(i))
subprocess.Popen((self.CONVERT, pdf, jpg)).wait()
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
def _get_ocr(self, pngs): def _get_ocr(self, pngs):
self._render(" OCRing the PDF", 2)
r = "" r = ""
for png in pngs: for png in pngs:
with Image.open(os.path.join(self.SCRATCH, png)) as f: with Image.open(os.path.join(self.SCRATCH, png)) as f:
self._render(" {}".format(f.filename), 3)
r += self.OCR.image_to_string(f) r += self.OCR.image_to_string(f)
r += "\n\n\n\n\n\n\n\n" r += "\n\n\n\n\n\n\n\n"
return r return r
def _store(self, text, jpgs, pdf): def _store(self, text, pdf):
sender, title = self._parse_file_name(pdf) sender, title = self._parse_file_name(pdf)
stats = os.stat(pdf) stats = os.stat(pdf)
self._render(" Saving record to database", 2)
doc = Document.objects.create( doc = Document.objects.create(
sender=sender, sender=sender,
title=title, title=title,
content=text, content=text,
created=timezone.make_aware( created=timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_ctime)),
modified=timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime)), datetime.datetime.fromtimestamp(stats.st_mtime)),
modified=timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime))
) )
shutil.move(jpgs[0], os.path.join( with open(pdf, "rb") as unencrypted:
self.MEDIA_IMG, "{:07}.jpg".format(doc.pk))) with open(doc.pdf_path, "wb") as encrypted:
shutil.move(pdf, os.path.join( self._render(" Encrypting", 3)
self.MEDIA_PDF, "{:07}.pdf".format(doc.pk))) encrypted.write(self.gpg.encrypt_file(
unencrypted,
recipients=None,
passphrase=settings.PASSPHRASE,
symmetric=True
).data)
def _parse_file_name(self, pdf): def _parse_file_name(self, pdf):
""" """
@ -175,12 +175,15 @@ class Command(BaseCommand):
return "", "" return "", ""
def _cleanup(self, pngs, jpgs): def _cleanup(self, pngs, pdf):
jpg_glob = os.path.join(
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.jpg$", "\\1*", jpgs[0]))
png_glob = os.path.join( png_glob = os.path.join(
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])) self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
for f in list(glob.glob(jpg_glob)) + list(glob.glob(png_glob)): for f in list(glob.glob(png_glob)) + [pdf]:
self._render(" Deleting {}".format(f), 2)
os.unlink(f) os.unlink(f)
def _render(self, text, verbosity):
    """
    Print ``text`` if the command is running verbosely enough.

    ``verbosity`` is the minimum value ``self.verbosity`` must have for the
    message to be shown; quieter runs print nothing.
    """
    loud_enough = self.verbosity >= verbosity
    if not loud_enough:
        return
    print(text)

View File

@ -1,3 +1,6 @@
import os
from django.conf import settings
from django.db import models from django.db import models
from django.utils import timezone from django.utils import timezone
@ -20,3 +23,16 @@ class Document(models.Model):
if self.sender or self.title: if self.sender or self.title:
return "{}: {}, {}".format(created, self.sender or self.title) return "{}: {}, {}".format(created, self.sender or self.title)
return str(created) return str(created)
@property
def pdf_path(self):
    """
    Location of this document's encrypted PDF on disk.

    The path is ``<MEDIA_ROOT>/documents/pdf/<pk>.pdf.gpg`` where the
    primary key is zero-padded to seven digits.
    """
    components = (
        settings.MEDIA_ROOT,
        "documents",
        "pdf",
        "{:07}.pdf.gpg".format(self.pk),
    )
    return os.path.join(*components)
@property
def pdf(self):
    """
    Open the encrypted PDF at ``pdf_path`` for binary reading.

    Every access opens a new file object; the caller is responsible for
    closing it.
    """
    handle = open(self.pdf_path, "rb")
    return handle

View File

@ -1,3 +1,29 @@
from django.shortcuts import render import gnupg
# Create your views here. from django.conf import settings
from django.http import HttpResponse
from django.template.defaultfilters import slugify
from django.views.generic.detail import DetailView
from .models import Document
class PdfView(DetailView):
    """Serve a stored document as a decrypted PDF download."""

    model = Document

    def render_to_response(self, context, **response_kwargs):
        """
        Override the default to return the unencrypted PDF as raw data.

        The encrypted file exposed by ``Document.pdf`` is decrypted with the
        symmetric passphrase in ``settings.PASSPHRASE`` and returned as an
        ``application/pdf`` attachment whose file name is the slugified
        string form of the document.

        If GnuPG reports that decryption did not succeed, a plain-text
        500 response is returned instead of an empty "PDF".
        """
        gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

        # Document.pdf returns a freshly opened file object; close it as
        # soon as GnuPG has consumed it so handles are not leaked per request.
        with self.object.pdf as encrypted:
            decrypted = gpg.decrypt_file(
                encrypted,
                passphrase=settings.PASSPHRASE,
            )

        # A wrong passphrase or a corrupt file would otherwise be served as
        # a zero-byte attachment with a 200 status.
        if not decrypted.ok:
            return HttpResponse(
                "Unable to decrypt the requested document.",
                status=500,
                content_type="text/plain",
            )

        response = HttpResponse(
            decrypted.data, content_type="application/pdf")
        response["Content-Disposition"] = 'attachment; filename="{}"'.format(
            slugify(str(self.object)) + ".pdf")
        return response

View File

@ -5,6 +5,15 @@ import sys
if __name__ == "__main__": if __name__ == "__main__":
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings") os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
from django.conf import settings
from django.core.management import execute_from_command_line from django.core.management import execute_from_command_line
# The runserver and consumer need to have access to the passphrase, so it
# must be entered at start time to keep it safe.
if "runserver" in sys.argv or "consume" in sys.argv:
    # Imported locally: only these two commands ever prompt for input.
    import getpass

    # Fixed development passphrase; replaced below whenever DEBUG is off.
    settings.PASSPHRASE = "asdf"
    if not settings.DEBUG:
        # getpass reads the passphrase without echoing it to the terminal,
        # unlike input(), so it does not end up on screen or in scrollback.
        settings.PASSPHRASE = getpass.getpass(
            "Production environment. Input passphrase: ")
execute_from_command_line(sys.argv) execute_from_command_line(sys.argv)

View File

@ -0,0 +1,4 @@
Django==1.9
Pillow==3.0.0
pyocr==0.3.1
python-gnupg==0.3.8

View File

@ -135,3 +135,5 @@ MEDIA_URL = "/media/"
CONVERT_BINARY = "/usr/bin/convert" CONVERT_BINARY = "/usr/bin/convert"
SCRATCH_DIR = "/tmp/paperless" # Will be created if it doesn't exist SCRATCH_DIR = "/tmp/paperless" # Will be created if it doesn't exist
CONSUMPTION_DIR = "/tmp/paperless/consume" CONSUMPTION_DIR = "/tmp/paperless/consume"
GNUPG_HOME = os.environ.get("HOME", "/dev/null")
PASSPHRASE = None # Set via manage.py

View File

@ -18,6 +18,9 @@ from django.conf import settings
from django.conf.urls import url, static from django.conf.urls import url, static
from django.contrib import admin from django.contrib import admin
from documents.views import PdfView
urlpatterns = [ urlpatterns = [
url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"),
url(r'', admin.site.urls), url(r'', admin.site.urls),
] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)