mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Fixed a few consumer bugs and added an exporter
Rename exporter to export and fixed some debugging. Account for files not matching the sender/title pattern. Added a safety note. Wrong regex on the name parser. Renamed the command to something slightly less ambiguous.
This commit is contained in:
parent
2e48036f92
commit
17615d43cb
12
README.md
12
README.md
@ -80,3 +80,15 @@ object, so we're sort of stuck.
|
||||
passphrase when prompted.
|
||||
|
||||
6. Log into your new toy by visiting `http://localhost:8000/`.
|
||||
|
||||
|
||||
## Important Note
|
||||
|
||||
Document scanners are typically used to scan sensitive documents: things like
|
||||
your social insurance number, tax records, invoices, etc. While paperless
|
||||
encrypts the original PDFs via the consumption script, the OCR'd text is *not*
|
||||
encrypted and is therefore stored in the clear (it needs to be searchable, so
|
||||
if someone has ideas on how to do that on encrypted data, I'm all ears). This
|
||||
means that paperless should never be run on an untrusted host. Instead, I
|
||||
recommend that if you do want to use it, you run it locally on a server in your own
|
||||
home.
|
||||
|
@ -5,7 +5,7 @@ Description=Paperless consumer
|
||||
EnvironmentFile=/etc/conf.d/paperless
|
||||
User=paperless
|
||||
Group=paperless
|
||||
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py consume -v $PAPERLESS_CONSUMPTION_VERBOSITY
|
||||
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer -v $PAPERLESS_CONSUMPTION_VERBOSITY
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
@ -12,11 +12,12 @@ import pyocr
|
||||
from PIL import Image
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
from django.template.defaultfilters import slugify
|
||||
from django.utils import timezone
|
||||
|
||||
from documents.models import Document, Sender
|
||||
from paperless.db import GnuPG
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
@ -38,7 +39,8 @@ class Command(BaseCommand):
|
||||
OCR = pyocr.get_available_tools()[0]
|
||||
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
|
||||
|
||||
PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$")
|
||||
PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$")
|
||||
PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$")
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.verbosity = 0
|
||||
@ -50,6 +52,10 @@ class Command(BaseCommand):
|
||||
|
||||
self.verbosity = options["verbosity"]
|
||||
|
||||
if not os.path.exists(self.CONSUME):
|
||||
raise CommandError("Consumption directory {} does not exist".format(
|
||||
self.CONSUME))
|
||||
|
||||
self._setup()
|
||||
|
||||
try:
|
||||
@ -70,7 +76,7 @@ class Command(BaseCommand):
|
||||
if not os.path.isfile(pdf):
|
||||
continue
|
||||
|
||||
if not pdf.endswith(".pdf"):
|
||||
if not re.match(self.PARSER_REGEX_TITLE, pdf):
|
||||
continue
|
||||
|
||||
if self._is_ready(pdf):
|
||||
@ -155,12 +161,7 @@ class Command(BaseCommand):
|
||||
with open(pdf, "rb") as unencrypted:
|
||||
with open(doc.pdf_path, "wb") as encrypted:
|
||||
self._render(" Encrypting", 3)
|
||||
encrypted.write(self.gpg.encrypt_file(
|
||||
unencrypted,
|
||||
recipients=None,
|
||||
passphrase=settings.PASSPHRASE,
|
||||
symmetric=True
|
||||
).data)
|
||||
encrypted.write(GnuPG.encrypted(unencrypted))
|
||||
|
||||
def _parse_file_name(self, pdf):
|
||||
"""
|
||||
@ -169,14 +170,17 @@ class Command(BaseCommand):
|
||||
"sender - title.pdf"
|
||||
"""
|
||||
|
||||
m = re.match(self.PARSER_REGEX, pdf)
|
||||
# First we attempt "sender - title.pdf"
|
||||
m = re.match(self.PARSER_REGEX_SENDER_TITLE, pdf)
|
||||
if m:
|
||||
sender_name, title = m.group(1), m.group(2)
|
||||
sender, __ = Sender.objects.get_or_create(
|
||||
name=sender_name, defaults={"slug": slugify(sender_name)})
|
||||
return sender, title
|
||||
|
||||
return "", ""
|
||||
# That didn't work, so we assume sender is None
|
||||
m = re.match(self.PARSER_REGEX_TITLE, pdf)
|
||||
return None, m.group(1)
|
||||
|
||||
def _cleanup(self, pngs, pdf):
|
||||
|
||||
@ -187,6 +191,8 @@ class Command(BaseCommand):
|
||||
self._render(" Deleting {}".format(f), 2)
|
||||
os.unlink(f)
|
||||
|
||||
self._render("", 2)
|
||||
|
||||
def _render(self, text, verbosity):
|
||||
if self.verbosity >= verbosity:
|
||||
print(text)
|
53
src/documents/management/commands/document_exporter.py
Normal file
53
src/documents/management/commands/document_exporter.py
Normal file
@ -0,0 +1,53 @@
|
||||
import gnupg
|
||||
import os
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
|
||||
from documents.models import Document
|
||||
from paperless.db import GnuPG
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """
    Management command that writes a decrypted copy of every stored document
    into a target directory.

    Each file is named after ``Document.parseable_file_name``.  Only the
    decrypted file contents are exported; no parsed data is written.
    """

    # Written as plain concatenated literals so the text does not depend on
    # stripping whitespace out of a triple-quoted string at import time.
    help = (
        "Decrypt and rename all files in our collection into a given target "
        "directory.  Note that we don't export any of the parsed data since "
        "that can always be re-collected via the consumer."
    )

    def add_arguments(self, parser):
        """Register the single positional ``target`` directory argument."""
        parser.add_argument("target")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        self.target = None
        # Not used by handle() (decryption goes through paperless.db.GnuPG);
        # kept so the attribute remains available to existing code.
        self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        """
        Export every ``Document`` into ``options["target"]``.

        Raises:
            CommandError: if the target does not exist, is not a directory,
                or is not writable.
        """

        self.verbosity = options["verbosity"]
        self.target = options["target"]

        if not os.path.exists(self.target):
            raise CommandError("That path doesn't exist")

        # A regular file would pass the exists/writable checks and only fail
        # later, inside the export loop, with a confusing OS error.
        if not os.path.isdir(self.target):
            raise CommandError("That path isn't a directory")

        if not os.access(self.target, os.W_OK):
            raise CommandError("That path doesn't appear to be writable")

        # GnuPG.decrypted() reads the passphrase from settings, so a missing
        # one is requested interactively and stored there for this run.
        if not settings.PASSPHRASE:
            settings.PASSPHRASE = input("Please enter the passphrase: ")

        for document in Document.objects.all():

            target = os.path.join(self.target, document.parseable_file_name)

            self._render("Exporting: {}".format(target), 1)

            # ``document.pdf`` hands back a freshly opened file; close it
            # deterministically instead of leaking one handle per document.
            with document.pdf as encrypted, open(target, "wb") as f:
                f.write(GnuPG.decrypted(encrypted))

    def _render(self, text, verbosity):
        """Print ``text`` when the command's verbosity is at least ``verbosity``."""
        if self.verbosity >= verbosity:
            print(text)
|
21
src/documents/migrations/0004_auto_20160114_1844.py
Normal file
21
src/documents/migrations/0004_auto_20160114_1844.py
Normal file
@ -0,0 +1,21 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.9 on 2016-01-14 18:44
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """
    Alter ``Document.sender`` into an optional (``blank``/``null``) foreign
    key to ``documents.Sender`` with the reverse accessor ``documents``.
    """

    # Must run after the migration that introduced the Sender model.
    dependencies = [("documents", "0003_sender")]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="sender",
            field=models.ForeignKey(
                to="documents.Sender",
                related_name="documents",
                on_delete=django.db.models.deletion.CASCADE,
                null=True,
                blank=True,
            ),
        ),
    ]
|
@ -22,7 +22,8 @@ class Sender(models.Model):
|
||||
|
||||
class Document(models.Model):
|
||||
|
||||
sender = models.ForeignKey(Sender, blank=True)
|
||||
sender = models.ForeignKey(
|
||||
Sender, blank=True, null=True, related_name="documents")
|
||||
title = models.CharField(max_length=128, blank=True, db_index=True)
|
||||
content = models.TextField(db_index=True)
|
||||
created = models.DateTimeField(default=timezone.now, editable=False)
|
||||
@ -36,7 +37,7 @@ class Document(models.Model):
|
||||
if self.sender and self.title:
|
||||
return "{}: {}, {}".format(created, self.sender, self.title)
|
||||
if self.sender or self.title:
|
||||
return "{}: {}, {}".format(created, self.sender or self.title)
|
||||
return "{}: {}".format(created, self.sender or self.title)
|
||||
return str(created)
|
||||
|
||||
@property
|
||||
@ -51,3 +52,9 @@ class Document(models.Model):
|
||||
@property
|
||||
def pdf(self):
|
||||
return open(self.pdf_path, "rb")
|
||||
|
||||
@property
|
||||
def parseable_file_name(self):
|
||||
if self.sender and self.title:
|
||||
return "{} - {}.pdf".format(self.sender, self.title)
|
||||
return os.path.basename(self.pdf_path)
|
||||
|
@ -1,10 +1,9 @@
|
||||
import gnupg
|
||||
|
||||
from django.conf import settings
|
||||
from django.http import HttpResponse
|
||||
from django.template.defaultfilters import slugify
|
||||
from django.views.generic.detail import DetailView
|
||||
|
||||
from paperless.db import GnuPG
|
||||
|
||||
from .models import Document
|
||||
|
||||
|
||||
@ -17,12 +16,8 @@ class PdfView(DetailView):
|
||||
Override the default to return the unencrypted PDF as raw data.
|
||||
"""
|
||||
|
||||
gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
|
||||
|
||||
response = HttpResponse(gpg.decrypt_file(
|
||||
self.object.pdf,
|
||||
passphrase=settings.PASSPHRASE,
|
||||
).data, content_type="application/pdf")
|
||||
response = HttpResponse(
|
||||
GnuPG.decrypted(self.object.pdf), content_type="application/pdf")
|
||||
response["Content-Disposition"] = 'attachment; filename="{}"'.format(
|
||||
slugify(str(self.object)) + ".pdf")
|
||||
|
||||
|
24
src/paperless/db.py
Normal file
24
src/paperless/db.py
Normal file
@ -0,0 +1,24 @@
|
||||
import gnupg
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
class GnuPG(object):
    """
    A handy singleton to use when handling encrypted files.

    Both helpers use symmetric encryption keyed on ``settings.PASSPHRASE``
    and return the resulting raw data.
    """

    # One shared python-gnupg handle, created when the module is imported.
    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

    @classmethod
    def decrypted(cls, path):
        """
        Return the decrypted contents of ``path``.

        ``path`` is forwarded to ``gpg.decrypt_file``; despite the name, the
        callers in this project pass an open binary file object.

        Raises:
            RuntimeError: if GnuPG reports that decryption failed.
        """
        result = cls.gpg.decrypt_file(path, passphrase=settings.PASSPHRASE)
        return cls._data_or_raise(result, "decrypt")

    @classmethod
    def encrypted(cls, path):
        """
        Return the symmetrically encrypted contents of ``path``.

        ``path`` is forwarded to ``gpg.encrypt_file``; despite the name, the
        callers in this project pass an open binary file object.

        Raises:
            RuntimeError: if GnuPG reports that encryption failed.
        """
        result = cls.gpg.encrypt_file(
            path,
            recipients=None,
            passphrase=settings.PASSPHRASE,
            symmetric=True
        )
        return cls._data_or_raise(result, "encrypt")

    @staticmethod
    def _data_or_raise(result, action):
        """Return ``result.data``, refusing to hand back data from a failed run."""
        # On failure python-gnupg leaves ``data`` empty; returning it would let
        # callers silently write empty files (e.g. on a wrong passphrase).
        if not result.ok:
            raise RuntimeError(
                "GnuPG failed to {} the file: {}".format(action, result.status))
        return result.data
|
@ -148,4 +148,3 @@ CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME")
|
||||
# `None` and you'll be prompted for the passphrase at runtime. The default
|
||||
# looks for an environment variable.
|
||||
PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user