diff --git a/README.md b/README.md index e482f4a95..fa5d71d7f 100644 --- a/README.md +++ b/README.md @@ -80,3 +80,15 @@ object, so we're sort of stuck. passphrase when prompted. 6. Log into your new toy by visiting `http://localhost:8000/`. + + +## Important Note + +Document scanners are typically used to scan sensitive documents: things like +your social insurance number, tax records, invoices, etc. While paperless +encrypts the original PDFs via the consumption script, the OCR'd text is *not* +encrypted and is therefore stored in the clear (it needs to be searchable, so +if someone has ideas on how to do that on encrypted data, I'm all ears). This +means that paperless should never be run on an untrusted host. Instead, if you +do want to use it, I recommend running it locally on a server in your own +home. diff --git a/scripts/paperless-consumer.service b/scripts/paperless-consumer.service index d345ae35d..34d65dedb 100644 --- a/scripts/paperless-consumer.service +++ b/scripts/paperless-consumer.service @@ -5,7 +5,7 @@ Description=Paperless consumer EnvironmentFile=/etc/conf.d/paperless User=paperless Group=paperless -ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py consume -v $PAPERLESS_CONSUMPTION_VERBOSITY +ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer -v $PAPERLESS_CONSUMPTION_VERBOSITY [Install] WantedBy=multi-user.target diff --git a/src/documents/management/commands/consume.py b/src/documents/management/commands/document_consumer.py similarity index 85% rename from src/documents/management/commands/consume.py rename to src/documents/management/commands/document_consumer.py index 61a61ebdd..d8441fcf5 100644 --- a/src/documents/management/commands/consume.py +++ b/src/documents/management/commands/document_consumer.py @@ -12,11 +12,12 @@ import pyocr from PIL import Image from django.conf import settings -from django.core.management.base import
BaseCommand +from django.core.management.base import BaseCommand, CommandError from django.template.defaultfilters import slugify from django.utils import timezone from documents.models import Document, Sender +from paperless.db import GnuPG class Command(BaseCommand): @@ -38,7 +39,8 @@ class Command(BaseCommand): OCR = pyocr.get_available_tools()[0] MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf") - PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$") + PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$") + PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$") def __init__(self, *args, **kwargs): self.verbosity = 0 @@ -50,6 +52,10 @@ class Command(BaseCommand): self.verbosity = options["verbosity"] + if not os.path.exists(self.CONSUME): + raise CommandError("Consumption directory {} does not exist".format( + self.CONSUME)) + self._setup() try: @@ -70,7 +76,7 @@ class Command(BaseCommand): if not os.path.isfile(pdf): continue - if not pdf.endswith(".pdf"): + if not re.match(self.PARSER_REGEX_TITLE, pdf): continue if self._is_ready(pdf): @@ -155,12 +161,7 @@ class Command(BaseCommand): with open(pdf, "rb") as unencrypted: with open(doc.pdf_path, "wb") as encrypted: self._render(" Encrypting", 3) - encrypted.write(self.gpg.encrypt_file( - unencrypted, - recipients=None, - passphrase=settings.PASSPHRASE, - symmetric=True - ).data) + encrypted.write(GnuPG.encrypted(unencrypted)) def _parse_file_name(self, pdf): """ @@ -169,14 +170,17 @@ class Command(BaseCommand): "sender - title.pdf" """ - m = re.match(self.PARSER_REGEX, pdf) + # First we attempt "sender - title.pdf" + m = re.match(self.PARSER_REGEX_SENDER_TITLE, pdf) if m: sender_name, title = m.group(1), m.group(2) sender, __ = Sender.objects.get_or_create( name=sender_name, defaults={"slug": slugify(sender_name)}) return sender, title - return "", "" + # That didn't work, so we assume sender is None + m = re.match(self.PARSER_REGEX_TITLE, pdf) + return None, m.group(1) def _cleanup(self, 
pngs, pdf): @@ -187,6 +191,8 @@ class Command(BaseCommand): self._render(" Deleting {}".format(f), 2) os.unlink(f) + self._render("", 2) + def _render(self, text, verbosity): if self.verbosity >= verbosity: print(text) diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py new file mode 100644 index 000000000..8b350a39f --- /dev/null +++ b/src/documents/management/commands/document_exporter.py @@ -0,0 +1,53 @@ +import gnupg +import os + +from django.conf import settings +from django.core.management.base import BaseCommand, CommandError + +from documents.models import Document +from paperless.db import GnuPG + + +class Command(BaseCommand): + + help = """ + Decrypt and rename all files in our collection into a given target + directory. Note that we don't export any of the parsed data since + that can always be re-collected via the consumer. + """.replace(" ", "") + + def add_arguments(self, parser): + parser.add_argument("target") + + def __init__(self, *args, **kwargs): + self.verbosity = 0 + self.target = None + self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME) + BaseCommand.__init__(self, *args, **kwargs) + + def handle(self, *args, **options): + + self.verbosity = options["verbosity"] + self.target = options["target"] + + if not os.path.exists(self.target): + raise CommandError("That path doesn't exist") + + if not os.access(self.target, os.W_OK): + raise CommandError("That path doesn't appear to be writable") + + if not settings.PASSPHRASE: + settings.PASSPHRASE = input("Please enter the passphrase: ") + + for document in Document.objects.all(): + + target = os.path.join(self.target, document.parseable_file_name) + + self._render("Exporting: {}".format(target), 1) + + with open(target, "wb") as f: + f.write(GnuPG.decrypted(document.pdf)) + + def _render(self, text, verbosity): + if self.verbosity >= verbosity: + print(text) diff --git a/src/documents/migrations/0004_auto_20160114_1844.py 
b/src/documents/migrations/0004_auto_20160114_1844.py new file mode 100644 index 000000000..b9fa616ae --- /dev/null +++ b/src/documents/migrations/0004_auto_20160114_1844.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9 on 2016-01-14 18:44 +from __future__ import unicode_literals + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '0003_sender'), + ] + + operations = [ + migrations.AlterField( + model_name='document', + name='sender', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='documents', to='documents.Sender'), + ), + ] diff --git a/src/documents/models.py b/src/documents/models.py index 666457f6f..be00f5624 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -22,7 +22,8 @@ class Sender(models.Model): class Document(models.Model): - sender = models.ForeignKey(Sender, blank=True) + sender = models.ForeignKey( + Sender, blank=True, null=True, related_name="documents") title = models.CharField(max_length=128, blank=True, db_index=True) content = models.TextField(db_index=True) created = models.DateTimeField(default=timezone.now, editable=False) @@ -36,7 +37,7 @@ class Document(models.Model): if self.sender and self.title: return "{}: {}, {}".format(created, self.sender, self.title) if self.sender or self.title: - return "{}: {}, {}".format(created, self.sender or self.title) + return "{}: {}".format(created, self.sender or self.title) return str(created) @property @@ -51,3 +52,9 @@ class Document(models.Model): @property def pdf(self): return open(self.pdf_path, "rb") + + @property + def parseable_file_name(self): + if self.sender and self.title: + return "{} - {}.pdf".format(self.sender, self.title) + return os.path.basename(self.pdf_path) diff --git a/src/documents/views.py b/src/documents/views.py index b752167c6..74590769c 100644 --- 
a/src/documents/views.py +++ b/src/documents/views.py @@ -1,10 +1,9 @@ -import gnupg - -from django.conf import settings from django.http import HttpResponse from django.template.defaultfilters import slugify from django.views.generic.detail import DetailView +from paperless.db import GnuPG + from .models import Document @@ -17,12 +16,8 @@ class PdfView(DetailView): Override the default to return the unencrypted PDF as raw data. """ - gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME) - - response = HttpResponse(gpg.decrypt_file( - self.object.pdf, - passphrase=settings.PASSPHRASE, - ).data, content_type="application/pdf") + response = HttpResponse( + GnuPG.decrypted(self.object.pdf), content_type="application/pdf") response["Content-Disposition"] = 'attachment; filename="{}"'.format( slugify(str(self.object)) + ".pdf") diff --git a/src/paperless/db.py b/src/paperless/db.py new file mode 100644 index 000000000..ba2288e0f --- /dev/null +++ b/src/paperless/db.py @@ -0,0 +1,24 @@ +import gnupg + +from django.conf import settings + + +class GnuPG(object): + """ + A handy singleton to use when handling encrypted files. + """ + + gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME) + + @classmethod + def decrypted(cls, path): + return cls.gpg.decrypt_file(path, passphrase=settings.PASSPHRASE).data + + @classmethod + def encrypted(cls, path): + return cls.gpg.encrypt_file( + path, + recipients=None, + passphrase=settings.PASSPHRASE, + symmetric=True + ).data diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 414b82d5f..377c47b2a 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -148,4 +148,3 @@ CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME") # `None` and you'll be prompted for the passphrase at runtime. The default # looks for an environment variable. PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE") -