mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Fixed a few consumer bugs and added an exporter
Rename exporter to export and fix some debugging. Account for files not matching the sender/title pattern. Added a safety note. Wrong regex on the name parser. Renamed the command to something slightly less ambiguous.
This commit is contained in:
parent
2e48036f92
commit
17615d43cb
12
README.md
12
README.md
@ -80,3 +80,15 @@ object, so we're sort of stuck.
|
|||||||
passphrase when prompted.
|
passphrase when prompted.
|
||||||
|
|
||||||
6. Log into your new toy by visiting `http://localhost:8000/`.
|
6. Log into your new toy by visiting `http://localhost:8000/`.
|
||||||
|
|
||||||
|
|
||||||
|
## Important Note
|
||||||
|
|
||||||
|
Document scanners are typically used to scan sensitive documents. Things like
|
||||||
|
your social insurance number, tax records, invoices, etc. While paperless
|
||||||
|
encrypts the original PDFs via the consumption script, the OCR'd text is *not*
|
||||||
|
encrypted and is therefore stored in the clear (it needs to be searchable, so
|
||||||
|
if someone has ideas on how to do that on encrypted data, I'm all ears). This
|
||||||
|
means that paperless should never be run on an untrusted host. Instead, I
|
||||||
|
recommend that if you do want to use it, run it locally on a server in your own
|
||||||
|
home.
|
||||||
|
@ -5,7 +5,7 @@ Description=Paperless consumer
|
|||||||
EnvironmentFile=/etc/conf.d/paperless
|
EnvironmentFile=/etc/conf.d/paperless
|
||||||
User=paperless
|
User=paperless
|
||||||
Group=paperless
|
Group=paperless
|
||||||
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py consume -v $PAPERLESS_CONSUMPTION_VERBOSITY
|
ExecStart=/home/paperless/project/virtualenv/bin/python /home/paperless/project/src/manage.py document_consumer -v $PAPERLESS_CONSUMPTION_VERBOSITY
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
@ -12,11 +12,12 @@ import pyocr
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand, CommandError
|
||||||
from django.template.defaultfilters import slugify
|
from django.template.defaultfilters import slugify
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from documents.models import Document, Sender
|
from documents.models import Document, Sender
|
||||||
|
from paperless.db import GnuPG
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
@ -38,7 +39,8 @@ class Command(BaseCommand):
|
|||||||
OCR = pyocr.get_available_tools()[0]
|
OCR = pyocr.get_available_tools()[0]
|
||||||
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
|
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
|
||||||
|
|
||||||
PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$")
|
PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$")
|
||||||
|
PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$")
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self.verbosity = 0
|
self.verbosity = 0
|
||||||
@ -50,6 +52,10 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
self.verbosity = options["verbosity"]
|
self.verbosity = options["verbosity"]
|
||||||
|
|
||||||
|
if not os.path.exists(self.CONSUME):
|
||||||
|
raise CommandError("Consumption directory {} does not exist".format(
|
||||||
|
self.CONSUME))
|
||||||
|
|
||||||
self._setup()
|
self._setup()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -70,7 +76,7 @@ class Command(BaseCommand):
|
|||||||
if not os.path.isfile(pdf):
|
if not os.path.isfile(pdf):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not pdf.endswith(".pdf"):
|
if not re.match(self.PARSER_REGEX_TITLE, pdf):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if self._is_ready(pdf):
|
if self._is_ready(pdf):
|
||||||
@ -155,12 +161,7 @@ class Command(BaseCommand):
|
|||||||
with open(pdf, "rb") as unencrypted:
|
with open(pdf, "rb") as unencrypted:
|
||||||
with open(doc.pdf_path, "wb") as encrypted:
|
with open(doc.pdf_path, "wb") as encrypted:
|
||||||
self._render(" Encrypting", 3)
|
self._render(" Encrypting", 3)
|
||||||
encrypted.write(self.gpg.encrypt_file(
|
encrypted.write(GnuPG.encrypted(unencrypted))
|
||||||
unencrypted,
|
|
||||||
recipients=None,
|
|
||||||
passphrase=settings.PASSPHRASE,
|
|
||||||
symmetric=True
|
|
||||||
).data)
|
|
||||||
|
|
||||||
def _parse_file_name(self, pdf):
|
def _parse_file_name(self, pdf):
|
||||||
"""
|
"""
|
||||||
@ -169,14 +170,17 @@ class Command(BaseCommand):
|
|||||||
"sender - title.pdf"
|
"sender - title.pdf"
|
||||||
"""
|
"""
|
||||||
|
|
||||||
m = re.match(self.PARSER_REGEX, pdf)
|
# First we attempt "sender - title.pdf"
|
||||||
|
m = re.match(self.PARSER_REGEX_SENDER_TITLE, pdf)
|
||||||
if m:
|
if m:
|
||||||
sender_name, title = m.group(1), m.group(2)
|
sender_name, title = m.group(1), m.group(2)
|
||||||
sender, __ = Sender.objects.get_or_create(
|
sender, __ = Sender.objects.get_or_create(
|
||||||
name=sender_name, defaults={"slug": slugify(sender_name)})
|
name=sender_name, defaults={"slug": slugify(sender_name)})
|
||||||
return sender, title
|
return sender, title
|
||||||
|
|
||||||
return "", ""
|
# That didn't work, so we assume sender is None
|
||||||
|
m = re.match(self.PARSER_REGEX_TITLE, pdf)
|
||||||
|
return None, m.group(1)
|
||||||
|
|
||||||
def _cleanup(self, pngs, pdf):
|
def _cleanup(self, pngs, pdf):
|
||||||
|
|
||||||
@ -187,6 +191,8 @@ class Command(BaseCommand):
|
|||||||
self._render(" Deleting {}".format(f), 2)
|
self._render(" Deleting {}".format(f), 2)
|
||||||
os.unlink(f)
|
os.unlink(f)
|
||||||
|
|
||||||
|
self._render("", 2)
|
||||||
|
|
||||||
def _render(self, text, verbosity):
|
def _render(self, text, verbosity):
|
||||||
if self.verbosity >= verbosity:
|
if self.verbosity >= verbosity:
|
||||||
print(text)
|
print(text)
|
53
src/documents/management/commands/document_exporter.py
Normal file
53
src/documents/management/commands/document_exporter.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
import gnupg
|
||||||
|
import os
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
from django.core.management.base import BaseCommand, CommandError
|
||||||
|
|
||||||
|
from documents.models import Document
|
||||||
|
from paperless.db import GnuPG
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """
    Management command that decrypts every stored document into a target
    directory, naming each exported file after the document's
    ``parseable_file_name``.
    """

    # The literal is indented to match the class body; stripping four-space
    # runs removes that indentation while keeping the spaces between words.
    help = """
        Decrypt and rename all files in our collection into a given target
        directory. Note that we don't export any of the parsed data since
        that can always be re-collected via the consumer.
    """.replace("    ", "")

    def add_arguments(self, parser):
        """Register the single positional ``target`` directory argument."""
        parser.add_argument("target")

    def __init__(self, *args, **kwargs):
        """Set default state, then defer to ``BaseCommand.__init__``."""
        self.verbosity = 0
        self.target = None
        # NOTE(review): handle() decrypts through the shared GnuPG helper, so
        # this handle looks unused here; kept so the attribute still exists.
        self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        """
        Validate the target directory, make sure a passphrase is available,
        and write a decrypted copy of every document into the target.

        Raises:
            CommandError: if the target is missing, is not a directory, or
                is not writable.
        """
        # Imported locally so the command does not depend on a file-level
        # import being added alongside it.
        import getpass

        self.verbosity = options["verbosity"]
        self.target = options["target"]

        if not os.path.exists(self.target):
            raise CommandError("That path doesn't exist")

        # An existing regular file would pass the check above and then fail
        # with an unhandled OSError on the first export.
        if not os.path.isdir(self.target):
            raise CommandError("That path isn't a directory")

        if not os.access(self.target, os.W_OK):
            raise CommandError("That path doesn't appear to be writable")

        if not settings.PASSPHRASE:
            # getpass keeps the passphrase from being echoed to the terminal.
            settings.PASSPHRASE = getpass.getpass(
                "Please enter the passphrase: ")

        for document in Document.objects.all():

            target = os.path.join(self.target, document.parseable_file_name)

            self._render("Exporting: {}".format(target), 1)

            # ``document.pdf`` returns a freshly opened file object, so close
            # it explicitly instead of leaking one handle per document.
            with document.pdf as encrypted:
                with open(target, "wb") as f:
                    f.write(GnuPG.decrypted(encrypted))

    def _render(self, text, verbosity):
        """Print ``text`` when the command's verbosity is at least ``verbosity``."""
        if self.verbosity >= verbosity:
            print(text)
|
21
src/documents/migrations/0004_auto_20160114_1844.py
Normal file
21
src/documents/migrations/0004_auto_20160114_1844.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Generated by Django 1.9 on 2016-01-14 18:44
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
import django.db.models.deletion
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """
    Alter ``Document.sender`` into an optional (blank and null) foreign key
    to ``documents.Sender`` with the reverse accessor named ``documents``.
    """

    dependencies = [
        ('documents', '0003_sender'),
    ]

    operations = [
        migrations.AlterField(
            model_name='document',
            name='sender',
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name='documents',
                to='documents.Sender',
            ),
        ),
    ]
|
@ -22,7 +22,8 @@ class Sender(models.Model):
|
|||||||
|
|
||||||
class Document(models.Model):
|
class Document(models.Model):
|
||||||
|
|
||||||
sender = models.ForeignKey(Sender, blank=True)
|
sender = models.ForeignKey(
|
||||||
|
Sender, blank=True, null=True, related_name="documents")
|
||||||
title = models.CharField(max_length=128, blank=True, db_index=True)
|
title = models.CharField(max_length=128, blank=True, db_index=True)
|
||||||
content = models.TextField(db_index=True)
|
content = models.TextField(db_index=True)
|
||||||
created = models.DateTimeField(default=timezone.now, editable=False)
|
created = models.DateTimeField(default=timezone.now, editable=False)
|
||||||
@ -36,7 +37,7 @@ class Document(models.Model):
|
|||||||
if self.sender and self.title:
|
if self.sender and self.title:
|
||||||
return "{}: {}, {}".format(created, self.sender, self.title)
|
return "{}: {}, {}".format(created, self.sender, self.title)
|
||||||
if self.sender or self.title:
|
if self.sender or self.title:
|
||||||
return "{}: {}, {}".format(created, self.sender or self.title)
|
return "{}: {}".format(created, self.sender or self.title)
|
||||||
return str(created)
|
return str(created)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@ -51,3 +52,9 @@ class Document(models.Model):
|
|||||||
@property
|
@property
|
||||||
def pdf(self):
|
def pdf(self):
|
||||||
return open(self.pdf_path, "rb")
|
return open(self.pdf_path, "rb")
|
||||||
|
|
||||||
|
@property
def parseable_file_name(self):
    """
    Return a file name for this document in a form the consumer's file
    name parser accepts.

    * sender and title set: ``"<sender> - <title>.pdf"``
    * only title set: ``"<title>.pdf"`` (the consumer reads this form as a
      document with no sender, so the title is kept on re-import)
    * otherwise: the base name of the stored file at ``pdf_path``
    """
    if self.sender and self.title:
        return "{} - {}.pdf".format(self.sender, self.title)
    if self.title:
        return "{}.pdf".format(self.title)
    return os.path.basename(self.pdf_path)
|
||||||
|
@ -1,10 +1,9 @@
|
|||||||
import gnupg
|
|
||||||
|
|
||||||
from django.conf import settings
|
|
||||||
from django.http import HttpResponse
|
from django.http import HttpResponse
|
||||||
from django.template.defaultfilters import slugify
|
from django.template.defaultfilters import slugify
|
||||||
from django.views.generic.detail import DetailView
|
from django.views.generic.detail import DetailView
|
||||||
|
|
||||||
|
from paperless.db import GnuPG
|
||||||
|
|
||||||
from .models import Document
|
from .models import Document
|
||||||
|
|
||||||
|
|
||||||
@ -17,12 +16,8 @@ class PdfView(DetailView):
|
|||||||
Override the default to return the unencrypted PDF as raw data.
|
Override the default to return the unencrypted PDF as raw data.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
|
response = HttpResponse(
|
||||||
|
GnuPG.decrypted(self.object.pdf), content_type="application/pdf")
|
||||||
response = HttpResponse(gpg.decrypt_file(
|
|
||||||
self.object.pdf,
|
|
||||||
passphrase=settings.PASSPHRASE,
|
|
||||||
).data, content_type="application/pdf")
|
|
||||||
response["Content-Disposition"] = 'attachment; filename="{}"'.format(
|
response["Content-Disposition"] = 'attachment; filename="{}"'.format(
|
||||||
slugify(str(self.object)) + ".pdf")
|
slugify(str(self.object)) + ".pdf")
|
||||||
|
|
||||||
|
24
src/paperless/db.py
Normal file
24
src/paperless/db.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
import gnupg
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
|
||||||
|
class GnuPG(object):
    """
    Shared entry point for encrypting and decrypting files with the
    passphrase from ``settings.PASSPHRASE``.

    A single ``gnupg.GPG`` handle is created when the class is defined and
    reused by both class methods.
    """

    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

    @classmethod
    def decrypted(cls, path):
        """
        Decrypt ``path`` and return the ``data`` of the result.

        Despite its name, ``path`` is handed straight to
        ``gpg.decrypt_file``; the callers visible in this change pass an
        open binary file object rather than a path string.
        """
        outcome = cls.gpg.decrypt_file(path, passphrase=settings.PASSPHRASE)
        return outcome.data

    @classmethod
    def encrypted(cls, path):
        """
        Symmetrically encrypt ``path`` (no recipients) and return the
        ``data`` of the result.

        As with ``decrypted``, ``path`` is forwarded unchanged to
        ``gpg.encrypt_file``.
        """
        options = {
            "recipients": None,
            "passphrase": settings.PASSPHRASE,
            "symmetric": True,
        }
        outcome = cls.gpg.encrypt_file(path, **options)
        return outcome.data
|
@ -148,4 +148,3 @@ CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME")
|
|||||||
# `None` and you'll be prompted for the passphrase at runtime. The default
|
# `None` and you'll be prompted for the passphrase at runtime. The default
|
||||||
# looks for an environment variable.
|
# looks for an environment variable.
|
||||||
PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")
|
PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user