Fixed a few consumer bugs and added an exporter

Rename exporter to export and fix some debugging

Account for files not matching the sender/title pattern

Added a safety note

Wrong regex on the name parser

Renamed the command to something slightly less ambiguous
This commit is contained in:
Daniel Quinn
2016-01-14 19:47:57 +00:00
parent 2e48036f92
commit 17615d43cb
9 changed files with 141 additions and 24 deletions

View File

@@ -12,11 +12,12 @@ import pyocr
from PIL import Image
from django.conf import settings
from django.core.management.base import BaseCommand
from django.core.management.base import BaseCommand, CommandError
from django.template.defaultfilters import slugify
from django.utils import timezone
from documents.models import Document, Sender
from paperless.db import GnuPG
class Command(BaseCommand):
@@ -38,7 +39,8 @@ class Command(BaseCommand):
OCR = pyocr.get_available_tools()[0]
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$")
PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$")
PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$")
def __init__(self, *args, **kwargs):
self.verbosity = 0
@@ -50,6 +52,10 @@ class Command(BaseCommand):
self.verbosity = options["verbosity"]
if not os.path.exists(self.CONSUME):
raise CommandError("Consumption directory {} does not exist".format(
self.CONSUME))
self._setup()
try:
@@ -70,7 +76,7 @@ class Command(BaseCommand):
if not os.path.isfile(pdf):
continue
if not pdf.endswith(".pdf"):
if not re.match(self.PARSER_REGEX_TITLE, pdf):
continue
if self._is_ready(pdf):
@@ -155,12 +161,7 @@ class Command(BaseCommand):
with open(pdf, "rb") as unencrypted:
with open(doc.pdf_path, "wb") as encrypted:
self._render(" Encrypting", 3)
encrypted.write(self.gpg.encrypt_file(
unencrypted,
recipients=None,
passphrase=settings.PASSPHRASE,
symmetric=True
).data)
encrypted.write(GnuPG.encrypted(unencrypted))
def _parse_file_name(self, pdf):
"""
@@ -169,14 +170,17 @@ class Command(BaseCommand):
"sender - title.pdf"
"""
m = re.match(self.PARSER_REGEX, pdf)
# First we attempt "sender - title.pdf"
m = re.match(self.PARSER_REGEX_SENDER_TITLE, pdf)
if m:
sender_name, title = m.group(1), m.group(2)
sender, __ = Sender.objects.get_or_create(
name=sender_name, defaults={"slug": slugify(sender_name)})
return sender, title
return "", ""
# That didn't work, so we assume sender is None
m = re.match(self.PARSER_REGEX_TITLE, pdf)
return None, m.group(1)
def _cleanup(self, pngs, pdf):
@@ -187,6 +191,8 @@ class Command(BaseCommand):
self._render(" Deleting {}".format(f), 2)
os.unlink(f)
self._render("", 2)
def _render(self, text, verbosity):
    """
    Print ``text`` unless the command is running below ``verbosity``.

    ``verbosity`` is the minimum level at which the message is shown; it is
    compared against ``self.verbosity``.
    """
    if self.verbosity < verbosity:
        return
    print(text)

View File

@@ -0,0 +1,53 @@
import gnupg
import os
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from documents.models import Document
from paperless.db import GnuPG
class Command(BaseCommand):
    """
    Management command that writes a decrypted copy of every ``Document``
    into a target directory, named via ``Document.parseable_file_name``.
    """

    # Built from adjacent string literals so that the spaces between words
    # survive; stripping spaces from a triple-quoted block would run every
    # word together.
    help = (
        "Decrypt and rename all files in our collection into a given target "
        "directory. Note that we don't export any of the parsed data since "
        "that can always be re-collected via the consumer."
    )

    def add_arguments(self, parser):
        """Register the single positional ``target`` argument."""
        parser.add_argument("target")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        self.target = None
        self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        """
        Export every document to ``options["target"]``.

        Raises ``CommandError`` when the target does not exist, is not a
        directory, or is not writable.  If ``settings.PASSPHRASE`` is empty
        the passphrase is requested interactively and stored on ``settings``
        for the rest of the run.
        """
        self.verbosity = options["verbosity"]
        self.target = options["target"]

        if not os.path.exists(self.target):
            raise CommandError("That path doesn't exist")

        # An existing regular file would pass the checks around this one and
        # only fail later, when the first export is opened beneath it.
        if not os.path.isdir(self.target):
            raise CommandError("That path isn't a directory")

        if not os.access(self.target, os.W_OK):
            raise CommandError("That path doesn't appear to be writable")

        if not settings.PASSPHRASE:
            settings.PASSPHRASE = input("Please enter the passphrase: ")

        for document in Document.objects.all():

            target = os.path.join(self.target, document.parseable_file_name)

            self._render("Exporting: {}".format(target), 1)

            # ``document.pdf`` returns a newly opened file each time it is
            # read, so it is closed here instead of leaking one handle per
            # document.  It is opened before the destination so a missing
            # source does not leave an empty export behind.
            with document.pdf as encrypted, open(target, "wb") as f:
                f.write(GnuPG.decrypted(encrypted))

    def _render(self, text, verbosity):
        """Print ``text`` when ``self.verbosity`` is at least ``verbosity``."""
        if self.verbosity >= verbosity:
            print(text)

View File

@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9 on 2016-01-14 18:44
from __future__ import unicode_literals
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """
    Alter ``Document.sender`` so the foreign key to ``documents.Sender`` is
    nullable, may be left blank, and is reachable as ``documents`` from the
    sender side.
    """

    dependencies = [("documents", "0003_sender")]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="sender",
            field=models.ForeignKey(
                to="documents.Sender",
                related_name="documents",
                on_delete=django.db.models.deletion.CASCADE,
                null=True,
                blank=True,
            ),
        ),
    ]

View File

@@ -22,7 +22,8 @@ class Sender(models.Model):
class Document(models.Model):
sender = models.ForeignKey(Sender, blank=True)
sender = models.ForeignKey(
Sender, blank=True, null=True, related_name="documents")
title = models.CharField(max_length=128, blank=True, db_index=True)
content = models.TextField(db_index=True)
created = models.DateTimeField(default=timezone.now, editable=False)
@@ -36,7 +37,7 @@ class Document(models.Model):
if self.sender and self.title:
return "{}: {}, {}".format(created, self.sender, self.title)
if self.sender or self.title:
return "{}: {}, {}".format(created, self.sender or self.title)
return "{}: {}".format(created, self.sender or self.title)
return str(created)
@property
@@ -51,3 +52,9 @@ class Document(models.Model):
@property
def pdf(self):
    """
    Open the file at ``self.pdf_path`` for binary reading and return it.

    A new file object is opened on every access and is not closed here, so
    the caller is responsible for closing it.
    """
    return open(self.pdf_path, "rb")
@property
def parseable_file_name(self):
    """
    Return a file name for this document that the consumer can parse back.

    The consumer understands "sender - title.pdf" and, for documents
    without a sender, plain "title.pdf".  When there is no title at all the
    base name of ``self.pdf_path`` is used instead.
    """
    if self.sender and self.title:
        return "{} - {}.pdf".format(self.sender, self.title)
    # Title-only documents keep their title so it is not lost on an
    # export / re-consume round trip.
    if self.title:
        return "{}.pdf".format(self.title)
    return os.path.basename(self.pdf_path)

View File

@@ -1,10 +1,9 @@
import gnupg
from django.conf import settings
from django.http import HttpResponse
from django.template.defaultfilters import slugify
from django.views.generic.detail import DetailView
from paperless.db import GnuPG
from .models import Document
@@ -17,12 +16,8 @@ class PdfView(DetailView):
Override the default to return the unencrypted PDF as raw data.
"""
gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
response = HttpResponse(gpg.decrypt_file(
self.object.pdf,
passphrase=settings.PASSPHRASE,
).data, content_type="application/pdf")
response = HttpResponse(
GnuPG.decrypted(self.object.pdf), content_type="application/pdf")
response["Content-Disposition"] = 'attachment; filename="{}"'.format(
slugify(str(self.object)) + ".pdf")

24
src/paperless/db.py Normal file
View File

@@ -0,0 +1,24 @@
import gnupg
from django.conf import settings
class GnuPG(object):
    """
    A handy singleton to use when handling encrypted files.

    A single ``gnupg.GPG`` instance, rooted at ``settings.GNUPG_HOME``, is
    created when the class body is evaluated and shared by both helpers.
    Both helpers use ``settings.PASSPHRASE`` and raise ``RuntimeError`` when
    gpg reports that the operation did not succeed, rather than handing back
    whatever partial (usually empty) data it produced.
    """

    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

    @classmethod
    def decrypted(cls, path):
        """
        Return the decrypted contents of ``path`` as bytes.

        ``path`` is passed straight to ``gpg.decrypt_file``; the callers
        visible alongside this class pass an open binary file object rather
        than a file system path.
        """
        result = cls.gpg.decrypt_file(path, passphrase=settings.PASSPHRASE)
        return cls._data_or_raise(result, "decrypt")

    @classmethod
    def encrypted(cls, path):
        """
        Return the symmetrically encrypted contents of ``path`` as bytes.

        ``path`` is passed straight to ``gpg.encrypt_file`` with no
        recipients, so only the passphrase protects the data.
        """
        result = cls.gpg.encrypt_file(
            path,
            recipients=None,
            passphrase=settings.PASSPHRASE,
            symmetric=True
        )
        return cls._data_or_raise(result, "encrypt")

    @staticmethod
    def _data_or_raise(result, action):
        """Return ``result.data``, or raise if gpg did not report success."""
        if not result.ok:
            raise RuntimeError(
                "GnuPG failed to {} the file: {}".format(action, result.status))
        return result.data

View File

@@ -148,4 +148,3 @@ CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME")
# `None` and you'll be prompted for the passphrase at runtime. The default
# looks for an environment variable.
PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")