diff --git a/src/documents/admin.py b/src/documents/admin.py
index 39a18a4a2..9e5c934b9 100644
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -1,5 +1,5 @@
-from django.conf import settings
from django.contrib import admin
+from django.core.urlresolvers import reverse
from django.templatetags.static import static
from .models import Document
@@ -8,27 +8,20 @@ from .models import Document
class DocumentAdmin(admin.ModelAdmin):
search_fields = ("sender", "title", "content",)
- list_display = ("edit", "created", "sender", "title", "thumbnail", "pdf")
+ list_display = ("edit", "created", "sender", "title", "pdf")
list_filter = ("created", "sender")
save_on_top = True
def edit(self, obj):
- return '
'.format(
+ return '
'.format(
static("documents/img/edit.png"))
edit.allow_tags = True
- def thumbnail(self, obj):
- return '' \
- '
' \
- ''.format(media=settings.MEDIA_URL, pk=obj.pk)
- thumbnail.allow_tags = True
-
def pdf(self, obj):
- return '' \
- '
' \
+ return '' \
+ '
' \
''.format(
- settings.MEDIA_URL,
- obj.pk,
+ reverse("fetch", kwargs={"pk": obj.pk}),
static("documents/img/application-pdf.png")
)
pdf.allow_tags = True
diff --git a/src/documents/management/commands/consume.py b/src/documents/management/commands/consume.py
index b48f5b6be..da9a1c1a3 100644
--- a/src/documents/management/commands/consume.py
+++ b/src/documents/management/commands/consume.py
@@ -1,9 +1,9 @@
import datetime
import glob
+import gnupg
import os
import random
import re
-import shutil
import subprocess
import time
@@ -36,8 +36,6 @@ class Command(BaseCommand):
CONSUME = settings.CONSUMPTION_DIR
OCR = pyocr.get_available_tools()[0]
-
- MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img")
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$")
@@ -45,6 +43,7 @@ class Command(BaseCommand):
def __init__(self, *args, **kwargs):
self.verbosity = 0
self.stats = {}
+ self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
BaseCommand.__init__(self, *args, **kwargs)
def handle(self, *args, **options):
@@ -77,18 +76,16 @@ class Command(BaseCommand):
if self._is_ready(pdf):
continue
- if self.verbosity > 1:
- print("Consuming {}".format(pdf))
+ self._render("Consuming {}".format(pdf), 1)
pngs = self._get_greyscale(pdf)
- jpgs = self._get_colour(pdf)
text = self._get_ocr(pngs)
- self._store(text, jpgs, pdf)
- self._cleanup(pngs, jpgs)
+ self._store(text, pdf)
+ self._cleanup(pngs, pdf)
def _setup(self):
- for d in (self.SCRATCH, self.MEDIA_IMG, self.MEDIA_PDF):
+ for d in (self.SCRATCH, self.MEDIA_PDF):
try:
os.makedirs(d)
except FileExistsError:
@@ -112,7 +109,9 @@ class Command(BaseCommand):
def _get_greyscale(self, pdf):
- i = random.randint(1000000, 4999999)
+ self._render(" Generating greyscale image", 2)
+
+ i = random.randint(1000000, 9999999)
png = os.path.join(self.SCRATCH, "{}.png".format(i))
subprocess.Popen((
@@ -122,45 +121,46 @@ class Command(BaseCommand):
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
- def _get_colour(self, pdf):
-
- i = random.randint(5000000, 9999999)
- jpg = os.path.join(self.SCRATCH, "{}.jpg".format(i))
-
- subprocess.Popen((self.CONVERT, pdf, jpg)).wait()
-
- return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
-
def _get_ocr(self, pngs):
+ self._render(" OCRing the PDF", 2)
+
r = ""
for png in pngs:
with Image.open(os.path.join(self.SCRATCH, png)) as f:
+ self._render(" {}".format(f.filename), 3)
r += self.OCR.image_to_string(f)
r += "\n\n\n\n\n\n\n\n"
return r
- def _store(self, text, jpgs, pdf):
+ def _store(self, text, pdf):
sender, title = self._parse_file_name(pdf)
stats = os.stat(pdf)
+ self._render(" Saving record to database", 2)
+
doc = Document.objects.create(
- sender=sender,
- title=title,
- content=text,
- created=timezone.make_aware(
- datetime.datetime.fromtimestamp(stats.st_ctime)),
- modified=timezone.make_aware(
- datetime.datetime.fromtimestamp(stats.st_mtime)),
+ sender=sender,
+ title=title,
+ content=text,
+ created=timezone.make_aware(
+ datetime.datetime.fromtimestamp(stats.st_mtime)),
+ modified=timezone.make_aware(
+ datetime.datetime.fromtimestamp(stats.st_mtime))
)
- shutil.move(jpgs[0], os.path.join(
- self.MEDIA_IMG, "{:07}.jpg".format(doc.pk)))
- shutil.move(pdf, os.path.join(
- self.MEDIA_PDF, "{:07}.pdf".format(doc.pk)))
+        # Encrypt first and only write the result once GPG reports success.
+        with open(pdf, "rb") as unencrypted:
+            self._render(" Encrypting", 3)
+            result = self.gpg.encrypt_file(
+                unencrypted,
+                recipients=None,
+                passphrase=settings.PASSPHRASE,
+                symmetric=True
+            )
+
+        # _cleanup() unlinks the source PDF after this method returns, so a
+        # failed encryption must abort here instead of leaving an empty
+        # .gpg file as the only remaining copy of the document.
+        if not result.ok:
+            doc.delete()
+            raise RuntimeError(
+                "Encrypting {} failed: {}".format(pdf, result.status))
+
+        with open(doc.pdf_path, "wb") as encrypted:
+            encrypted.write(result.data)
def _parse_file_name(self, pdf):
"""
@@ -175,12 +175,15 @@ class Command(BaseCommand):
return "", ""
- def _cleanup(self, pngs, jpgs):
+ def _cleanup(self, pngs, pdf):
- jpg_glob = os.path.join(
- self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.jpg$", "\\1*", jpgs[0]))
png_glob = os.path.join(
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
- for f in list(glob.glob(jpg_glob)) + list(glob.glob(png_glob)):
+ for f in list(glob.glob(png_glob)) + [pdf]:
+ self._render(" Deleting {}".format(f), 2)
os.unlink(f)
+
+ def _render(self, text, verbosity):
+ if self.verbosity >= verbosity:
+ print(text)
diff --git a/src/documents/models.py b/src/documents/models.py
index 16a45d97e..e39397b1b 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -1,3 +1,6 @@
+import os
+
+from django.conf import settings
from django.db import models
from django.utils import timezone
@@ -20,3 +23,16 @@ class Document(models.Model):
if self.sender or self.title:
return "{}: {}, {}".format(created, self.sender or self.title)
return str(created)
+
+ @property
+ def pdf_path(self):
+ return os.path.join(
+ settings.MEDIA_ROOT,
+ "documents",
+ "pdf",
+ "{:07}.pdf.gpg".format(self.pk)
+ )
+
+ @property
+ def pdf(self):
+ return open(self.pdf_path, "rb")
diff --git a/src/documents/views.py b/src/documents/views.py
index 91ea44a21..b752167c6 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -1,3 +1,29 @@
-from django.shortcuts import render
+import gnupg
-# Create your views here.
+from django.conf import settings
+from django.http import HttpResponse
+from django.template.defaultfilters import slugify
+from django.views.generic.detail import DetailView
+
+from .models import Document
+
+
+class PdfView(DetailView):
+    """
+    Serve a stored document as a decrypted PDF download.
+    """
+
+    model = Document
+
+    def render_to_response(self, context, **response_kwargs):
+        """
+        Override the default to return the unencrypted PDF as raw data.
+
+        The stored file is decrypted with settings.PASSPHRASE and returned
+        as an attachment named after the slugified document.
+        """
+
+        gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
+
+        # Document.pdf opens a fresh file handle on every access, so close it
+        # here once GPG has consumed it rather than leaking the descriptor.
+        with self.object.pdf as encrypted:
+            decrypted = gpg.decrypt_file(
+                encrypted,
+                passphrase=settings.PASSPHRASE,
+            )
+
+        response = HttpResponse(
+            decrypted.data, content_type="application/pdf")
+        response["Content-Disposition"] = 'attachment; filename="{}"'.format(
+            slugify(str(self.object)) + ".pdf")
+
+        return response
diff --git a/src/manage.py b/src/manage.py
index 99d61722e..ed6281d2c 100755
--- a/src/manage.py
+++ b/src/manage.py
@@ -5,6 +5,15 @@ import sys
if __name__ == "__main__":
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
+ from django.conf import settings
from django.core.management import execute_from_command_line
+    # The runserver and consumer need access to the passphrase. It is asked
+    # for at start time rather than being kept in the settings file.
+    if "runserver" in sys.argv or "consume" in sys.argv:
+        # Development default; replaced below when DEBUG is off.
+        settings.PASSPHRASE = "asdf"
+        if not settings.DEBUG:
+            # getpass() reads without echoing, so the production passphrase
+            # does not end up on screen or in terminal scrollback.
+            from getpass import getpass
+            settings.PASSPHRASE = getpass(
+                "Production environment. Input passphrase: ")
+
execute_from_command_line(sys.argv)
diff --git a/src/paperless/requirements.txt b/src/paperless/requirements.txt
new file mode 100644
index 000000000..6975a8733
--- /dev/null
+++ b/src/paperless/requirements.txt
@@ -0,0 +1,4 @@
+Django==1.9
+Pillow==3.0.0
+pyocr==0.3.1
+python-gnupg==0.3.8
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 32e9ee36d..a910148c5 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -135,3 +135,5 @@ MEDIA_URL = "/media/"
CONVERT_BINARY = "/usr/bin/convert"
SCRATCH_DIR = "/tmp/paperless" # Will be created if it doesn't exist
CONSUMPTION_DIR = "/tmp/paperless/consume"
+GNUPG_HOME = os.environ.get("HOME", "/dev/null")
+PASSPHRASE = None # Set via manage.py
diff --git a/src/paperless/urls.py b/src/paperless/urls.py
index 967eb65a4..2b311f858 100644
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -18,6 +18,9 @@ from django.conf import settings
from django.conf.urls import url, static
from django.contrib import admin
+from documents.views import PdfView
+
urlpatterns = [
+    # The group must be named "pk": the admin builds this URL with
+    # reverse("fetch", kwargs={"pk": ...}) and the view looks the document
+    # up by that keyword argument.
+    url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"),
url(r'', admin.site.urls),
] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)