Fixed a few consumer bugs and added an exporter

Rename exporter to export and fix some debugging

Account for files not matching the sender/title pattern

Added a safety note

Wrong regex on the name parser

Renamed the command to something slightly less ambiguous
This commit is contained in:
Daniel Quinn 2016-01-14 19:47:57 +00:00
parent 2e48036f92
commit 17615d43cb
9 changed files with 141 additions and 24 deletions

View File

@ -80,3 +80,15 @@ object, so we're sort of stuck.
passphrase when prompted.
6. Log into your new toy by visiting `http://localhost:8000/`.
## Important Note
Document scanners are typically used to scan sensitive documents: things like
your social insurance number, tax records, invoices, etc. While paperless
encrypts the original PDFs via the consumption script, the OCR'd text is *not*
encrypted and is therefore stored in the clear (it needs to be searchable, so
if someone has ideas on how to do that on encrypted data, I'm all ears). This
means that paperless should never be run on an untrusted host. Instead, I
recommend that if you do want to use it, run it locally on a server in your own
home.

View File

@ -5,7 +5,7 @@ Description=Paperless consumer
EnvironmentFile=/etc/conf.d/paperless
User=paperless
Group=paperless
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py consume -v $PAPERLESS_CONSUMPTION_VERBOSITY
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer -v $PAPERLESS_CONSUMPTION_VERBOSITY
[Install]
WantedBy=multi-user.target

View File

@ -12,11 +12,12 @@ import pyocr
from PIL import Image
from django.conf import settings
from django.core.management.base import BaseCommand
from django.core.management.base import BaseCommand, CommandError
from django.template.defaultfilters import slugify
from django.utils import timezone
from documents.models import Document, Sender
from paperless.db import GnuPG
class Command(BaseCommand):
@ -38,7 +39,8 @@ class Command(BaseCommand):
OCR = pyocr.get_available_tools()[0]
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$")
PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$")
PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$")
def __init__(self, *args, **kwargs):
self.verbosity = 0
@ -50,6 +52,10 @@ class Command(BaseCommand):
self.verbosity = options["verbosity"]
if not os.path.exists(self.CONSUME):
raise CommandError("Consumption directory {} does not exist".format(
self.CONSUME))
self._setup()
try:
@ -70,7 +76,7 @@ class Command(BaseCommand):
if not os.path.isfile(pdf):
continue
if not pdf.endswith(".pdf"):
if not re.match(self.PARSER_REGEX_TITLE, pdf):
continue
if self._is_ready(pdf):
@ -155,12 +161,7 @@ class Command(BaseCommand):
with open(pdf, "rb") as unencrypted:
with open(doc.pdf_path, "wb") as encrypted:
self._render(" Encrypting", 3)
encrypted.write(self.gpg.encrypt_file(
unencrypted,
recipients=None,
passphrase=settings.PASSPHRASE,
symmetric=True
).data)
encrypted.write(GnuPG.encrypted(unencrypted))
def _parse_file_name(self, pdf):
"""
@ -169,14 +170,17 @@ class Command(BaseCommand):
"sender - title.pdf"
"""
m = re.match(self.PARSER_REGEX, pdf)
# First we attempt "sender - title.pdf"
m = re.match(self.PARSER_REGEX_SENDER_TITLE, pdf)
if m:
sender_name, title = m.group(1), m.group(2)
sender, __ = Sender.objects.get_or_create(
name=sender_name, defaults={"slug": slugify(sender_name)})
return sender, title
return "", ""
# That didn't work, so we assume sender is None
m = re.match(self.PARSER_REGEX_TITLE, pdf)
return None, m.group(1)
def _cleanup(self, pngs, pdf):
@ -187,6 +191,8 @@ class Command(BaseCommand):
self._render(" Deleting {}".format(f), 2)
os.unlink(f)
self._render("", 2)
def _render(self, text, verbosity):
    """
    Print ``text`` when the command is at least as verbose as ``verbosity``.

    ``self.verbosity`` is the level the command was started with; messages
    whose required level is higher than that are dropped silently.
    """
    if self.verbosity < verbosity:
        return
    print(text)

View File

@ -0,0 +1,53 @@
import gnupg
import os
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from documents.models import Document
from paperless.db import GnuPG
class Command(BaseCommand):
    """
    Management command that decrypts every stored document into a directory.

    Each ``Document`` is written, decrypted, to
    ``<target>/<document.parseable_file_name>``.  Only the PDFs themselves
    are exported; no other model data is written out.
    """

    # Strip only the source indentation (in 4-space units) from the help
    # text.  Replacing every single space would run all the words together.
    help = """
        Decrypt and rename all files in our collection into a given target
        directory. Note that we don't export any of the parsed data since
        that can always be re-collected via the consumer.
    """.replace("    ", "")

    def add_arguments(self, parser):
        """Register the single positional ``target`` directory argument."""
        parser.add_argument("target")

    def __init__(self, *args, **kwargs):
        """Set default state, then defer to ``BaseCommand.__init__``."""
        self.verbosity = 0
        self.target = None
        self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        """
        Export every document into ``options["target"]``.

        Raises:
            CommandError: if the target does not exist, is not a directory,
                or is not writable.
        """
        self.verbosity = options["verbosity"]
        self.target = options["target"]

        if not os.path.exists(self.target):
            raise CommandError("That path doesn't exist")

        # A regular file passes the existence check but cannot hold exports;
        # fail early with a clear message instead of erroring inside open().
        if not os.path.isdir(self.target):
            raise CommandError("That path isn't a directory")

        if not os.access(self.target, os.W_OK):
            raise CommandError("That path doesn't appear to be writable")

        if not settings.PASSPHRASE:
            # getpass reads the passphrase without echoing it to the
            # terminal, unlike input().
            import getpass
            settings.PASSPHRASE = getpass.getpass(
                "Please enter the passphrase: ")

        for document in Document.objects.all():

            target = os.path.join(self.target, document.parseable_file_name)

            self._render("Exporting: {}".format(target), 1)

            # document.pdf hands back a freshly opened file; close it once
            # decrypted so a large collection doesn't exhaust descriptors.
            with document.pdf as encrypted, open(target, "wb") as f:
                f.write(GnuPG.decrypted(encrypted))

    def _render(self, text, verbosity):
        """Print ``text`` if the command's verbosity is >= ``verbosity``."""
        if self.verbosity >= verbosity:
            print(text)

View File

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9 on 2016-01-14 18:44
from __future__ import unicode_literals
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """Alter ``Document.sender`` to a nullable, optional foreign key."""

    # Must run after the migration named 0003_sender in the documents app.
    dependencies = [("documents", "0003_sender")]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="sender",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name="documents",
                to="documents.Sender",
            ),
        ),
    ]

View File

@ -22,7 +22,8 @@ class Sender(models.Model):
class Document(models.Model):
sender = models.ForeignKey(Sender, blank=True)
sender = models.ForeignKey(
Sender, blank=True, null=True, related_name="documents")
title = models.CharField(max_length=128, blank=True, db_index=True)
content = models.TextField(db_index=True)
created = models.DateTimeField(default=timezone.now, editable=False)
@ -36,7 +37,7 @@ class Document(models.Model):
if self.sender and self.title:
return "{}: {}, {}".format(created, self.sender, self.title)
if self.sender or self.title:
return "{}: {}, {}".format(created, self.sender or self.title)
return "{}: {}".format(created, self.sender or self.title)
return str(created)
@property
@ -51,3 +52,9 @@ class Document(models.Model):
@property
def pdf(self):
    """
    Open the file at ``self.pdf_path`` for binary reading and return it.

    A new file object is opened on every access; closing it is left to the
    caller.
    """
    handle = open(self.pdf_path, "rb")
    return handle
@property
def parseable_file_name(self):
    """
    Build a file name the consumer's name parser can read back.

    Returns:
        ``"<sender> - <title>.pdf"`` when both sender and title are set,
        ``"<title>.pdf"`` when only the title is set, and otherwise the
        base name of ``self.pdf_path``.
    """
    if self.sender and self.title:
        return "{} - {}.pdf".format(self.sender, self.title)
    if self.title:
        # The consumer accepts a bare "title.pdf" and stores it with no
        # sender, so emit that form rather than discarding the title.
        # NOTE(review): a title that itself contains " - " would be
        # re-parsed as "sender - title" -- confirm whether that matters.
        return "{}.pdf".format(self.title)
    return os.path.basename(self.pdf_path)

View File

@ -1,10 +1,9 @@
import gnupg
from django.conf import settings
from django.http import HttpResponse
from django.template.defaultfilters import slugify
from django.views.generic.detail import DetailView
from paperless.db import GnuPG
from .models import Document
@ -17,12 +16,8 @@ class PdfView(DetailView):
Override the default to return the unencrypted PDF as raw data.
"""
gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
response = HttpResponse(gpg.decrypt_file(
self.object.pdf,
passphrase=settings.PASSPHRASE,
).data, content_type="application/pdf")
response = HttpResponse(
GnuPG.decrypted(self.object.pdf), content_type="application/pdf")
response["Content-Disposition"] = 'attachment; filename="{}"'.format(
slugify(str(self.object)) + ".pdf")

24
src/paperless/db.py Normal file
View File

@ -0,0 +1,24 @@
import gnupg
from django.conf import settings
class GnuPG(object):
    """
    Shared helper for passphrase-based encryption and decryption of files.

    One ``gnupg.GPG`` instance is created when the class body runs and is
    reused by both class methods.  The passphrase is read from
    ``settings.PASSPHRASE`` at call time.
    """

    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

    @classmethod
    def decrypted(cls, path):
        """
        Return the ``.data`` of decrypting ``path`` with the passphrase.

        ``path`` is handed straight to ``gpg.decrypt_file``; the callers in
        this project pass an open binary file object rather than a string.
        """
        result = cls.gpg.decrypt_file(path, passphrase=settings.PASSPHRASE)
        return result.data

    @classmethod
    def encrypted(cls, path):
        """
        Return the ``.data`` of symmetrically encrypting ``path``.

        No recipients are used: the call passes ``symmetric=True`` together
        with the configured passphrase.
        """
        options = {
            "recipients": None,
            "passphrase": settings.PASSPHRASE,
            "symmetric": True,
        }
        return cls.gpg.encrypt_file(path, **options).data

View File

@ -148,4 +148,3 @@ CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME")
# `None` and you'll be prompted for the passphrase at runtime. The default
# looks for an environment variable.
PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")