Fixed a few consumer bugs and added an exporter

Rename exporter to export and fix some debugging

Account for files not matching the sender/title pattern

Added a safety note

Wrong regex on the name parser

Renamed the command to something slightly less ambiguous
This commit is contained in:
Daniel Quinn 2016-01-14 19:47:57 +00:00
parent 2e48036f92
commit 17615d43cb
9 changed files with 141 additions and 24 deletions

@ -80,3 +80,15 @@ object, so we're sort of stuck.
passphrase when prompted. passphrase when prompted.
6. Log into your new toy by visiting `http://localhost:8000/`. 6. Log into your new toy by visiting `http://localhost:8000/`.
## Important Note
Document scanners are typically used to scan sensitive documents, such as
your social insurance number, tax records, invoices, etc. While paperless
encrypts the original PDFs via the consumption script, the OCR'd text is *not*
encrypted and is therefore stored in the clear (it needs to be searchable, so
if someone has ideas on how to do that on encrypted data, I'm all ears). This
means that paperless should never be run on an untrusted host. Instead, I
recommend that if you do want to use it, run it locally on a server in your own
home.

@ -5,7 +5,7 @@ Description=Paperless consumer
EnvironmentFile=/etc/conf.d/paperless EnvironmentFile=/etc/conf.d/paperless
User=paperless User=paperless
Group=paperless Group=paperless
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py consume -v $PAPERLESS_CONSUMPTION_VERBOSITY ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer -v $PAPERLESS_CONSUMPTION_VERBOSITY
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

@ -12,11 +12,12 @@ import pyocr
from PIL import Image from PIL import Image
from django.conf import settings from django.conf import settings
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand, CommandError
from django.template.defaultfilters import slugify from django.template.defaultfilters import slugify
from django.utils import timezone from django.utils import timezone
from documents.models import Document, Sender from documents.models import Document, Sender
from paperless.db import GnuPG
class Command(BaseCommand): class Command(BaseCommand):
@ -38,7 +39,8 @@ class Command(BaseCommand):
OCR = pyocr.get_available_tools()[0] OCR = pyocr.get_available_tools()[0]
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf") MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$") PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$")
PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$")
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.verbosity = 0 self.verbosity = 0
@ -50,6 +52,10 @@ class Command(BaseCommand):
self.verbosity = options["verbosity"] self.verbosity = options["verbosity"]
if not os.path.exists(self.CONSUME):
raise CommandError("Consumption directory {} does not exist".format(
self.CONSUME))
self._setup() self._setup()
try: try:
@ -70,7 +76,7 @@ class Command(BaseCommand):
if not os.path.isfile(pdf): if not os.path.isfile(pdf):
continue continue
if not pdf.endswith(".pdf"): if not re.match(self.PARSER_REGEX_TITLE, pdf):
continue continue
if self._is_ready(pdf): if self._is_ready(pdf):
@ -155,12 +161,7 @@ class Command(BaseCommand):
with open(pdf, "rb") as unencrypted: with open(pdf, "rb") as unencrypted:
with open(doc.pdf_path, "wb") as encrypted: with open(doc.pdf_path, "wb") as encrypted:
self._render(" Encrypting", 3) self._render(" Encrypting", 3)
encrypted.write(self.gpg.encrypt_file( encrypted.write(GnuPG.encrypted(unencrypted))
unencrypted,
recipients=None,
passphrase=settings.PASSPHRASE,
symmetric=True
).data)
def _parse_file_name(self, pdf): def _parse_file_name(self, pdf):
""" """
@ -169,14 +170,17 @@ class Command(BaseCommand):
"sender - title.pdf" "sender - title.pdf"
""" """
m = re.match(self.PARSER_REGEX, pdf) # First we attempt "sender - title.pdf"
m = re.match(self.PARSER_REGEX_SENDER_TITLE, pdf)
if m: if m:
sender_name, title = m.group(1), m.group(2) sender_name, title = m.group(1), m.group(2)
sender, __ = Sender.objects.get_or_create( sender, __ = Sender.objects.get_or_create(
name=sender_name, defaults={"slug": slugify(sender_name)}) name=sender_name, defaults={"slug": slugify(sender_name)})
return sender, title return sender, title
return "", "" # That didn't work, so we assume sender is None
m = re.match(self.PARSER_REGEX_TITLE, pdf)
return None, m.group(1)
def _cleanup(self, pngs, pdf): def _cleanup(self, pngs, pdf):
@ -187,6 +191,8 @@ class Command(BaseCommand):
self._render(" Deleting {}".format(f), 2) self._render(" Deleting {}".format(f), 2)
os.unlink(f) os.unlink(f)
self._render("", 2)
def _render(self, text, verbosity): def _render(self, text, verbosity):
if self.verbosity >= verbosity: if self.verbosity >= verbosity:
print(text) print(text)

@ -0,0 +1,53 @@
import gnupg
import os
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from documents.models import Document
from paperless.db import GnuPG
class Command(BaseCommand):
    """
    Management command that decrypts every stored document into a directory.

    Takes one positional argument, ``target``, and writes one decrypted file
    per ``Document`` into it, named after ``Document.parseable_file_name``.
    """

    # Plain adjacent string literals: the previous ``.replace(" ", "")`` call
    # removed every space from the text, leaving the help output unreadable.
    help = (
        "Decrypt and rename all files in our collection into a given target "
        "directory. Note that we don't export any of the parsed data since "
        "that can always be re-collected via the consumer."
    )

    def add_arguments(self, parser):
        """Register the positional ``target`` directory argument."""
        parser.add_argument("target")

    def __init__(self, *args, **kwargs):
        """Set default state, then defer to ``BaseCommand.__init__``."""
        self.verbosity = 0
        self.target = None
        # Not used by handle(), which decrypts through the GnuPG helper; kept
        # so that the attribute remains available on the command instance.
        self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        """
        Validate the target directory and export every document into it.

        Raises:
            CommandError: if the target does not exist, is not a directory,
                or does not appear to be writable.
        """
        self.verbosity = options["verbosity"]
        self.target = options["target"]

        if not os.path.exists(self.target):
            raise CommandError("That path doesn't exist")

        # An existing regular file would pass the check above and only fail
        # later, inside open(), with a far less helpful error.
        if not os.path.isdir(self.target):
            raise CommandError("That path isn't a directory")

        if not os.access(self.target, os.W_OK):
            raise CommandError("That path doesn't appear to be writable")

        if not settings.PASSPHRASE:
            # getpass() prompts without echoing, so the passphrase does not
            # end up visible on screen or in terminal scrollback.
            from getpass import getpass
            settings.PASSPHRASE = getpass("Please enter the passphrase: ")

        for document in Document.objects.all():

            target = os.path.join(self.target, document.parseable_file_name)

            self._render("Exporting: {}".format(target), 1)

            # document.pdf is used as a context manager so the handle it
            # yields is closed after each export rather than left open.
            with document.pdf as encrypted, open(target, "wb") as f:
                f.write(GnuPG.decrypted(encrypted))

    def _render(self, text, verbosity):
        """Print ``text`` when the command's verbosity is at least ``verbosity``."""
        if self.verbosity >= verbosity:
            print(text)

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9 on 2016-01-14 18:44
from __future__ import unicode_literals
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """Alter the ``sender`` field of ``document`` so it may be blank and NULL."""

    # Must be applied after the 'documents' app's 0003_sender migration.
    dependencies = [
        ('documents', '0003_sender'),
    ]

    operations = [
        # Redefine 'sender' on the 'document' model as a foreign key to
        # 'documents.Sender' with blank=True and null=True, cascading
        # deletes, and 'documents' as the reverse accessor name.
        migrations.AlterField(
            model_name='document',
            name='sender',
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='documents', to='documents.Sender'),
        ),
    ]

@ -22,7 +22,8 @@ class Sender(models.Model):
class Document(models.Model): class Document(models.Model):
sender = models.ForeignKey(Sender, blank=True) sender = models.ForeignKey(
Sender, blank=True, null=True, related_name="documents")
title = models.CharField(max_length=128, blank=True, db_index=True) title = models.CharField(max_length=128, blank=True, db_index=True)
content = models.TextField(db_index=True) content = models.TextField(db_index=True)
created = models.DateTimeField(default=timezone.now, editable=False) created = models.DateTimeField(default=timezone.now, editable=False)
@ -36,7 +37,7 @@ class Document(models.Model):
if self.sender and self.title: if self.sender and self.title:
return "{}: {}, {}".format(created, self.sender, self.title) return "{}: {}, {}".format(created, self.sender, self.title)
if self.sender or self.title: if self.sender or self.title:
return "{}: {}, {}".format(created, self.sender or self.title) return "{}: {}".format(created, self.sender or self.title)
return str(created) return str(created)
@property @property
@ -51,3 +52,9 @@ class Document(models.Model):
@property @property
def pdf(self): def pdf(self):
return open(self.pdf_path, "rb") return open(self.pdf_path, "rb")
@property
def parseable_file_name(self):
    """
    Return a file name for this document built from its sender and title.

    The result is, in order of preference:

    * ``"<sender> - <title>.pdf"`` when both sender and title are set,
    * ``"<title>.pdf"`` when only the title is set,
    * the base name of ``pdf_path`` otherwise.
    """
    if self.sender and self.title:
        return "{} - {}.pdf".format(self.sender, self.title)
    if self.title:
        # A sender-less document still has a meaningful title; use it rather
        # than falling back to the stored file's name and losing the title.
        return "{}.pdf".format(self.title)
    return os.path.basename(self.pdf_path)

@ -1,10 +1,9 @@
import gnupg
from django.conf import settings
from django.http import HttpResponse from django.http import HttpResponse
from django.template.defaultfilters import slugify from django.template.defaultfilters import slugify
from django.views.generic.detail import DetailView from django.views.generic.detail import DetailView
from paperless.db import GnuPG
from .models import Document from .models import Document
@ -17,12 +16,8 @@ class PdfView(DetailView):
Override the default to return the unencrypted PDF as raw data. Override the default to return the unencrypted PDF as raw data.
""" """
gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME) response = HttpResponse(
GnuPG.decrypted(self.object.pdf), content_type="application/pdf")
response = HttpResponse(gpg.decrypt_file(
self.object.pdf,
passphrase=settings.PASSPHRASE,
).data, content_type="application/pdf")
response["Content-Disposition"] = 'attachment; filename="{}"'.format( response["Content-Disposition"] = 'attachment; filename="{}"'.format(
slugify(str(self.object)) + ".pdf") slugify(str(self.object)) + ".pdf")

24
src/paperless/db.py Normal file

@ -0,0 +1,24 @@
import gnupg
from django.conf import settings
class GnuPG(object):
    """
    Shared helper for symmetric GnuPG encryption and decryption.

    A single ``gnupg.GPG`` object is created when the class is defined and is
    reused by both class methods. The passphrase is read from
    ``settings.PASSPHRASE`` at call time.
    """

    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

    @classmethod
    def decrypted(cls, path):
        """
        Decrypt ``path`` and return the ``data`` attribute of the result.

        ``path`` is handed unchanged to ``gpg.decrypt_file``.
        NOTE(review): despite the name, this presumably needs to be an open
        binary file object rather than a path string -- confirm with callers.
        """
        outcome = cls.gpg.decrypt_file(path, passphrase=settings.PASSPHRASE)
        return outcome.data

    @classmethod
    def encrypted(cls, path):
        """
        Symmetrically encrypt ``path`` and return the result's ``data``.

        ``path`` is handed unchanged to ``gpg.encrypt_file``; no recipients
        are given, only the passphrase.
        NOTE(review): despite the name, this presumably needs to be an open
        binary file object rather than a path string -- confirm with callers.
        """
        options = {
            "recipients": None,
            "passphrase": settings.PASSPHRASE,
            "symmetric": True,
        }
        outcome = cls.gpg.encrypt_file(path, **options)
        return outcome.data

@ -148,4 +148,3 @@ CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME")
# `None` and you'll be prompted for the passphrase at runtime. The default # `None` and you'll be prompted for the passphrase at runtime. The default
# looks for an environment variable. # looks for an environment variable.
PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE") PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")