mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Added GPG encryption for the PDFs
This commit is contained in:
@@ -1,9 +1,9 @@
|
||||
import datetime
|
||||
import glob
|
||||
import gnupg
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
@@ -36,8 +36,6 @@ class Command(BaseCommand):
|
||||
CONSUME = settings.CONSUMPTION_DIR
|
||||
|
||||
OCR = pyocr.get_available_tools()[0]
|
||||
|
||||
MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img")
|
||||
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
|
||||
|
||||
PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$")
|
||||
@@ -45,6 +43,7 @@ class Command(BaseCommand):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.verbosity = 0
|
||||
self.stats = {}
|
||||
self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
|
||||
BaseCommand.__init__(self, *args, **kwargs)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
@@ -77,18 +76,16 @@ class Command(BaseCommand):
|
||||
if self._is_ready(pdf):
|
||||
continue
|
||||
|
||||
if self.verbosity > 1:
|
||||
print("Consuming {}".format(pdf))
|
||||
self._render("Consuming {}".format(pdf), 1)
|
||||
|
||||
pngs = self._get_greyscale(pdf)
|
||||
jpgs = self._get_colour(pdf)
|
||||
text = self._get_ocr(pngs)
|
||||
|
||||
self._store(text, jpgs, pdf)
|
||||
self._cleanup(pngs, jpgs)
|
||||
self._store(text, pdf)
|
||||
self._cleanup(pngs, pdf)
|
||||
|
||||
def _setup(self):
|
||||
for d in (self.SCRATCH, self.MEDIA_IMG, self.MEDIA_PDF):
|
||||
for d in (self.SCRATCH, self.MEDIA_PDF):
|
||||
try:
|
||||
os.makedirs(d)
|
||||
except FileExistsError:
|
||||
@@ -112,7 +109,9 @@ class Command(BaseCommand):
|
||||
|
||||
def _get_greyscale(self, pdf):
|
||||
|
||||
i = random.randint(1000000, 4999999)
|
||||
self._render(" Generating greyscale image", 2)
|
||||
|
||||
i = random.randint(1000000, 9999999)
|
||||
png = os.path.join(self.SCRATCH, "{}.png".format(i))
|
||||
|
||||
subprocess.Popen((
|
||||
@@ -122,45 +121,46 @@ class Command(BaseCommand):
|
||||
|
||||
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
|
||||
|
||||
def _get_colour(self, pdf):
|
||||
|
||||
i = random.randint(5000000, 9999999)
|
||||
jpg = os.path.join(self.SCRATCH, "{}.jpg".format(i))
|
||||
|
||||
subprocess.Popen((self.CONVERT, pdf, jpg)).wait()
|
||||
|
||||
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
|
||||
|
||||
def _get_ocr(self, pngs):
|
||||
|
||||
self._render(" OCRing the PDF", 2)
|
||||
|
||||
r = ""
|
||||
for png in pngs:
|
||||
with Image.open(os.path.join(self.SCRATCH, png)) as f:
|
||||
self._render(" {}".format(f.filename), 3)
|
||||
r += self.OCR.image_to_string(f)
|
||||
r += "\n\n\n\n\n\n\n\n"
|
||||
|
||||
return r
|
||||
|
||||
def _store(self, text, jpgs, pdf):
|
||||
def _store(self, text, pdf):
|
||||
|
||||
sender, title = self._parse_file_name(pdf)
|
||||
|
||||
stats = os.stat(pdf)
|
||||
|
||||
self._render(" Saving record to database", 2)
|
||||
|
||||
doc = Document.objects.create(
|
||||
sender=sender,
|
||||
title=title,
|
||||
content=text,
|
||||
created=timezone.make_aware(
|
||||
datetime.datetime.fromtimestamp(stats.st_ctime)),
|
||||
modified=timezone.make_aware(
|
||||
datetime.datetime.fromtimestamp(stats.st_mtime)),
|
||||
sender=sender,
|
||||
title=title,
|
||||
content=text,
|
||||
created=timezone.make_aware(
|
||||
datetime.datetime.fromtimestamp(stats.st_mtime)),
|
||||
modified=timezone.make_aware(
|
||||
datetime.datetime.fromtimestamp(stats.st_mtime))
|
||||
)
|
||||
|
||||
shutil.move(jpgs[0], os.path.join(
|
||||
self.MEDIA_IMG, "{:07}.jpg".format(doc.pk)))
|
||||
shutil.move(pdf, os.path.join(
|
||||
self.MEDIA_PDF, "{:07}.pdf".format(doc.pk)))
|
||||
with open(pdf, "rb") as unencrypted:
|
||||
with open(doc.pdf_path, "wb") as encrypted:
|
||||
self._render(" Encrypting", 3)
|
||||
encrypted.write(self.gpg.encrypt_file(
|
||||
unencrypted,
|
||||
recipients=None,
|
||||
passphrase=settings.PASSPHRASE,
|
||||
symmetric=True
|
||||
).data)
|
||||
|
||||
def _parse_file_name(self, pdf):
|
||||
"""
|
||||
@@ -175,12 +175,15 @@ class Command(BaseCommand):
|
||||
|
||||
return "", ""
|
||||
|
||||
def _cleanup(self, pngs, jpgs):
|
||||
def _cleanup(self, pngs, pdf):
|
||||
|
||||
jpg_glob = os.path.join(
|
||||
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.jpg$", "\\1*", jpgs[0]))
|
||||
png_glob = os.path.join(
|
||||
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
|
||||
|
||||
for f in list(glob.glob(jpg_glob)) + list(glob.glob(png_glob)):
|
||||
for f in list(glob.glob(png_glob)) + [pdf]:
|
||||
self._render(" Deleting {}".format(f), 2)
|
||||
os.unlink(f)
|
||||
|
||||
def _render(self, text, verbosity):
|
||||
if self.verbosity >= verbosity:
|
||||
print(text)
|
||||
|
Reference in New Issue
Block a user