mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Added GPG encryption for the PDFs
This commit is contained in:
		| @@ -1,5 +1,5 @@ | ||||
| from django.conf import settings | ||||
| from django.contrib import admin | ||||
| from django.core.urlresolvers import reverse | ||||
| from django.templatetags.static import static | ||||
|  | ||||
| from .models import Document | ||||
| @@ -8,27 +8,20 @@ from .models import Document | ||||
| class DocumentAdmin(admin.ModelAdmin): | ||||
|  | ||||
|     search_fields = ("sender", "title", "content",) | ||||
|     list_display = ("edit", "created", "sender", "title", "thumbnail", "pdf") | ||||
|     list_display = ("edit", "created", "sender", "title", "pdf") | ||||
|     list_filter = ("created", "sender") | ||||
|     save_on_top = True | ||||
|  | ||||
|     def edit(self, obj): | ||||
|         return '<img src="{}" width="64" height="64" alt="Edit icon" />'.format( | ||||
|         return '<img src="{}" width="22" height="22" alt="Edit icon" />'.format( | ||||
|             static("documents/img/edit.png")) | ||||
|     edit.allow_tags = True | ||||
|  | ||||
|     def thumbnail(self, obj): | ||||
|         return '<a href="{media}documents/img/{pk:07}.jpg" target="_blank">' \ | ||||
|                  '<img src="{media}documents/img/{pk:07}.jpg" width="100" />' \ | ||||
|                '</a>'.format(media=settings.MEDIA_URL, pk=obj.pk) | ||||
|     thumbnail.allow_tags = True | ||||
|  | ||||
|     def pdf(self, obj): | ||||
|         return '<a href="{}documents/pdf/{:07}.pdf">' \ | ||||
|                  '<img src="{}" width="64" height="64" alt="PDF icon">' \ | ||||
|         return '<a href="{}">' \ | ||||
|                  '<img src="{}" width="22" height="22" alt="PDF icon">' \ | ||||
|                '</a>'.format( | ||||
|                     settings.MEDIA_URL, | ||||
|                     obj.pk, | ||||
|                     reverse("fetch", kwargs={"pk": obj.pk}), | ||||
|                     static("documents/img/application-pdf.png") | ||||
|                 ) | ||||
|     pdf.allow_tags = True | ||||
|   | ||||
| @@ -1,9 +1,9 @@ | ||||
| import datetime | ||||
| import glob | ||||
| import gnupg | ||||
| import os | ||||
| import random | ||||
| import re | ||||
| import shutil | ||||
| import subprocess | ||||
| import time | ||||
|  | ||||
| @@ -36,8 +36,6 @@ class Command(BaseCommand): | ||||
|     CONSUME = settings.CONSUMPTION_DIR | ||||
|  | ||||
|     OCR = pyocr.get_available_tools()[0] | ||||
|  | ||||
|     MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img") | ||||
|     MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf") | ||||
|  | ||||
|     PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$") | ||||
| @@ -45,6 +43,7 @@ class Command(BaseCommand): | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         self.verbosity = 0 | ||||
|         self.stats = {} | ||||
|         self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME) | ||||
|         BaseCommand.__init__(self, *args, **kwargs) | ||||
|  | ||||
|     def handle(self, *args, **options): | ||||
| @@ -77,18 +76,16 @@ class Command(BaseCommand): | ||||
|             if self._is_ready(pdf): | ||||
|                 continue | ||||
|  | ||||
|             if self.verbosity > 1: | ||||
|                 print("Consuming {}".format(pdf)) | ||||
|             self._render("Consuming {}".format(pdf), 1) | ||||
|  | ||||
|             pngs = self._get_greyscale(pdf) | ||||
|             jpgs = self._get_colour(pdf) | ||||
|             text = self._get_ocr(pngs) | ||||
|  | ||||
|             self._store(text, jpgs, pdf) | ||||
|             self._cleanup(pngs, jpgs) | ||||
|             self._store(text, pdf) | ||||
|             self._cleanup(pngs, pdf) | ||||
|  | ||||
|     def _setup(self): | ||||
|         for d in (self.SCRATCH, self.MEDIA_IMG, self.MEDIA_PDF): | ||||
|         for d in (self.SCRATCH, self.MEDIA_PDF): | ||||
|             try: | ||||
|                 os.makedirs(d) | ||||
|             except FileExistsError: | ||||
| @@ -112,7 +109,9 @@ class Command(BaseCommand): | ||||
|  | ||||
|     def _get_greyscale(self, pdf): | ||||
|  | ||||
|         i = random.randint(1000000, 4999999) | ||||
|         self._render("  Generating greyscale image", 2) | ||||
|  | ||||
|         i = random.randint(1000000, 9999999) | ||||
|         png = os.path.join(self.SCRATCH, "{}.png".format(i)) | ||||
|  | ||||
|         subprocess.Popen(( | ||||
| @@ -122,45 +121,46 @@ class Command(BaseCommand): | ||||
|  | ||||
|         return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) | ||||
|  | ||||
|     def _get_colour(self, pdf): | ||||
|  | ||||
|         i = random.randint(5000000, 9999999) | ||||
|         jpg = os.path.join(self.SCRATCH, "{}.jpg".format(i)) | ||||
|  | ||||
|         subprocess.Popen((self.CONVERT, pdf, jpg)).wait() | ||||
|  | ||||
|         return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) | ||||
|  | ||||
|     def _get_ocr(self, pngs): | ||||
|  | ||||
|         self._render("  OCRing the PDF", 2) | ||||
|  | ||||
|         r = "" | ||||
|         for png in pngs: | ||||
|             with Image.open(os.path.join(self.SCRATCH, png)) as f: | ||||
|                 self._render("    {}".format(f.filename), 3) | ||||
|                 r += self.OCR.image_to_string(f) | ||||
|                 r += "\n\n\n\n\n\n\n\n" | ||||
|  | ||||
|         return r | ||||
|  | ||||
|     def _store(self, text, jpgs, pdf): | ||||
|     def _store(self, text, pdf): | ||||
|  | ||||
|         sender, title = self._parse_file_name(pdf) | ||||
|  | ||||
|         stats = os.stat(pdf) | ||||
|  | ||||
|         self._render("  Saving record to database", 2) | ||||
|  | ||||
|         doc = Document.objects.create( | ||||
|             sender=sender, | ||||
|             title=title, | ||||
|             content=text, | ||||
|             created=timezone.make_aware( | ||||
|                     datetime.datetime.fromtimestamp(stats.st_ctime)), | ||||
|                 modified=timezone.make_aware( | ||||
|                 datetime.datetime.fromtimestamp(stats.st_mtime)), | ||||
|             modified=timezone.make_aware( | ||||
|                 datetime.datetime.fromtimestamp(stats.st_mtime)) | ||||
|         ) | ||||
|  | ||||
|         shutil.move(jpgs[0], os.path.join( | ||||
|             self.MEDIA_IMG, "{:07}.jpg".format(doc.pk))) | ||||
|         shutil.move(pdf, os.path.join( | ||||
|             self.MEDIA_PDF, "{:07}.pdf".format(doc.pk))) | ||||
|         with open(pdf, "rb") as unencrypted: | ||||
|             with open(doc.pdf_path, "wb") as encrypted: | ||||
|                 self._render("  Encrypting", 3) | ||||
|                 encrypted.write(self.gpg.encrypt_file( | ||||
|                     unencrypted, | ||||
|                     recipients=None, | ||||
|                     passphrase=settings.PASSPHRASE, | ||||
|                     symmetric=True | ||||
|                 ).data) | ||||
|  | ||||
|     def _parse_file_name(self, pdf): | ||||
|         """ | ||||
| @@ -175,12 +175,15 @@ class Command(BaseCommand): | ||||
|  | ||||
|         return "", "" | ||||
|  | ||||
|     def _cleanup(self, pngs, jpgs): | ||||
|     def _cleanup(self, pngs, pdf): | ||||
|  | ||||
|         jpg_glob = os.path.join( | ||||
|             self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.jpg$", "\\1*", jpgs[0])) | ||||
|         png_glob = os.path.join( | ||||
|             self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])) | ||||
|  | ||||
|         for f in list(glob.glob(jpg_glob)) + list(glob.glob(png_glob)): | ||||
|         for f in list(glob.glob(png_glob)) + [pdf]: | ||||
|             self._render("  Deleting {}".format(f), 2) | ||||
|             os.unlink(f) | ||||
|  | ||||
|     def _render(self, text, verbosity): | ||||
|         if self.verbosity >= verbosity: | ||||
|             print(text) | ||||
|   | ||||
| @@ -1,3 +1,6 @@ | ||||
| import os | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.db import models | ||||
| from django.utils import timezone | ||||
|  | ||||
| @@ -20,3 +23,16 @@ class Document(models.Model): | ||||
|         if self.sender or self.title: | ||||
|             return "{}: {}, {}".format(created, self.sender or self.title) | ||||
|         return str(created) | ||||
|  | ||||
|     @property | ||||
|     def pdf_path(self): | ||||
|         return os.path.join( | ||||
|             settings.MEDIA_ROOT, | ||||
|             "documents", | ||||
|             "pdf", | ||||
|             "{:07}.pdf.gpg".format(self.pk) | ||||
|         ) | ||||
|  | ||||
|     @property | ||||
|     def pdf(self): | ||||
|         return open(self.pdf_path, "rb") | ||||
|   | ||||
| @@ -1,3 +1,29 @@ | ||||
| from django.shortcuts import render | ||||
| import gnupg | ||||
|  | ||||
| # Create your views here. | ||||
| from django.conf import settings | ||||
| from django.http import HttpResponse | ||||
| from django.template.defaultfilters import slugify | ||||
| from django.views.generic.detail import DetailView | ||||
|  | ||||
| from .models import Document | ||||
|  | ||||
|  | ||||
| class PdfView(DetailView): | ||||
|  | ||||
|     model = Document | ||||
|  | ||||
|     def render_to_response(self, context, **response_kwargs): | ||||
|         """ | ||||
|         Override the default to return the unencrypted PDF as raw data. | ||||
|         """ | ||||
|  | ||||
|         gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME) | ||||
|  | ||||
|         response = HttpResponse(gpg.decrypt_file( | ||||
|             self.object.pdf, | ||||
|             passphrase=settings.PASSPHRASE, | ||||
|         ).data, content_type="application/pdf") | ||||
|         response["Content-Disposition"] = 'attachment; filename="{}"'.format( | ||||
|             slugify(str(self.object)) + ".pdf") | ||||
|  | ||||
|         return response | ||||
|   | ||||
| @@ -5,6 +5,15 @@ import sys | ||||
| if __name__ == "__main__": | ||||
|     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings") | ||||
|  | ||||
|     from django.conf import settings | ||||
|     from django.core.management import execute_from_command_line | ||||
|  | ||||
|     # Both runserver and the consumer need access to the passphrase.  It is | ||||
|     # entered at start time, rather than stored in settings, to keep it safe. | ||||
|     if "runserver" in sys.argv or "consume" in sys.argv: | ||||
|         settings.PASSPHRASE = "asdf" | ||||
|         if not settings.DEBUG: | ||||
|             settings.PASSPHRASE = input( | ||||
|                 "Production environment.  Input passphrase: ") | ||||
|  | ||||
|     execute_from_command_line(sys.argv) | ||||
|   | ||||
							
								
								
									
										4
									
								
								src/paperless/requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								src/paperless/requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,4 @@ | ||||
| Django==1.9 | ||||
| Pillow==3.0.0 | ||||
| pyocr==0.3.1 | ||||
| python-gnupg==0.3.8 | ||||
| @@ -135,3 +135,5 @@ MEDIA_URL = "/media/" | ||||
| CONVERT_BINARY = "/usr/bin/convert" | ||||
| SCRATCH_DIR = "/tmp/paperless"  # Will be created if it doesn't exist | ||||
| CONSUMPTION_DIR = "/tmp/paperless/consume" | ||||
| GNUPG_HOME = os.environ.get("HOME", "/dev/null") | ||||
| PASSPHRASE = None  # Set via manage.py | ||||
|   | ||||
| @@ -18,6 +18,9 @@ from django.conf import settings | ||||
| from django.conf.urls import url, static | ||||
| from django.contrib import admin | ||||
|  | ||||
| from documents.views import PdfView | ||||
|  | ||||
| urlpatterns = [ | ||||
|     url(r"^fetch/(?P<pk>\d+)$", PdfView.as_view(), name="fetch"), | ||||
|     url(r'', admin.site.urls), | ||||
| ] + static.static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn