From 52f15b4de14c3dbaa4969fb6f4e5382d47c230a5 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 5 Mar 2016 01:57:49 +0000 Subject: [PATCH 1/6] The first stages of getting thumbnails back --- .gitignore | 5 +- media/documents/originals/.keep | 0 media/documents/thumbnails/.keep | 0 .../migrations/0012_auto_20160305_0040.py | 101 ++++++++++++++++++ src/documents/models.py | 20 +++- src/documents/views.py | 8 +- src/paperless/urls.py | 6 +- 7 files changed, 135 insertions(+), 5 deletions(-) create mode 100644 media/documents/originals/.keep create mode 100644 media/documents/thumbnails/.keep create mode 100644 src/documents/migrations/0012_auto_20160305_0040.py diff --git a/.gitignore b/.gitignore index d4c3fe38e..3c8b8ffea 100644 --- a/.gitignore +++ b/.gitignore @@ -57,7 +57,9 @@ docs/_build/ target/ # Stored PDFs -media/* +media/documents/*.gpg +media/documents/thumbnails/*.gpg +media/documents/originals/*.gpg # Sqlite database db.sqlite3 @@ -74,4 +76,3 @@ docker-compose.env # Used for development scripts/import-for-development environment - diff --git a/media/documents/originals/.keep b/media/documents/originals/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/media/documents/thumbnails/.keep b/media/documents/thumbnails/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py new file mode 100644 index 000000000..e42c6cde5 --- /dev/null +++ b/src/documents/migrations/0012_auto_20160305_0040.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.2 on 2016-03-05 00:40 +from __future__ import unicode_literals + +import gnupg +import os +import re +import shutil +import subprocess +import tempfile + +from django.conf import settings +from django.db import migrations + + +class GnuPG(object): + """ + A handy singleton to use when handling encrypted files. 
+ """ + + gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME) + + @classmethod + def decrypted(cls, file_handle): + return cls.gpg.decrypt_file( + file_handle, passphrase=settings.PASSPHRASE).data + + @classmethod + def encrypted(cls, file_handle): + return cls.gpg.encrypt_file( + file_handle, + recipients=None, + passphrase=settings.PASSPHRASE, + symmetric=True + ).data + + +def move_documents_and_create_thumbnails(apps, schema_editor): + + documents = os.listdir(os.path.join(settings.MEDIA_ROOT, "documents")) + + if not documents: + return + + print("\n") + + for f in sorted(documents): + + if not f.endswith("gpg"): + continue + + print(" * Generating a thumbnail for {}".format(f)) + + thumb_temp = tempfile.mkdtemp( + prefix="paperless", dir=settings.SCRATCH_DIR) + orig_temp = tempfile.mkdtemp( + prefix="paperless", dir=settings.SCRATCH_DIR) + + orig_source = os.path.join(settings.MEDIA_ROOT, "documents", f) + orig_target = os.path.join(orig_temp, f.replace(".gpg", "")) + + with open(orig_source, "rb") as encrypted: + with open(orig_target, "wb") as unencrypted: + unencrypted.write(GnuPG.decrypted(encrypted)) + + subprocess.Popen(( + settings.CONVERT_BINARY, + "-scale", "500x500", + orig_target, + os.path.join(thumb_temp, "convert-%04d.jpg") + )).wait() + + thumb_source = os.path.join(thumb_temp, "convert-0000.jpg") + thumb_target = os.path.join( + settings.MEDIA_ROOT, + "documents", + "thumbnails", + re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.jpg\\2", f) + ) + with open(thumb_source, "rb") as unencrypted: + with open(thumb_target, "wb") as encrypted: + encrypted.write(GnuPG.encrypted(unencrypted)) + + shutil.rmtree(thumb_temp) + shutil.rmtree(orig_temp) + + shutil.move( + os.path.join(settings.MEDIA_ROOT, "documents", f), + os.path.join(settings.MEDIA_ROOT, "documents", "originals", f), + ) + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '0011_auto_20160303_1929'), + ] + + operations = [ + 
migrations.RunPython(move_documents_and_create_thumbnails), + ] diff --git a/src/documents/models.py b/src/documents/models.py index a82f7643f..a3ffb8a74 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -171,6 +171,7 @@ class Document(models.Model): return os.path.join( settings.MEDIA_ROOT, "documents", + "originals", "{:07}.{}.gpg".format(self.pk, self.file_type) ) @@ -184,7 +185,24 @@ class Document(models.Model): @property def download_url(self): - return reverse("fetch", kwargs={"pk": self.pk}) + return reverse("fetch", kwargs={"kind": "doc", "pk": self.pk}) + + @property + def thumbnail_path(self): + return os.path.join( + settings.MEDIA_ROOT, + "documents", + "thumbnails", + "{:07}.jpg.gpg".format(self.pk) + ) + + @property + def thumbnail_file(self): + return open(self.thumbnail_path, "rb") + + @property + def thumbnail_url(self): + return reverse("fetch", kwargs={"kind": "thumb", "pk": self.pk}) class Log(models.Model): diff --git a/src/documents/views.py b/src/documents/views.py index ff7c4ce05..4a4a060bf 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -35,7 +35,7 @@ class FetchView(LoginRequiredMixin, DetailView): def render_to_response(self, context, **response_kwargs): """ - Override the default to return the unencrypted PDF as raw data. + Override the default to return the unencrypted image/PDF as raw data.
""" content_types = { @@ -46,6 +46,12 @@ class FetchView(LoginRequiredMixin, DetailView): Document.TYPE_TIF: "image/tiff", } + if self.kwargs["kind"] == "thumb": + return HttpResponse( + GnuPG.decrypted(self.object.thumb_file), + content_type=content_types[Document.TYPE_JPG] + ) + response = HttpResponse( GnuPG.decrypted(self.object.source_file), content_type=content_types[self.object.file_type] diff --git a/src/paperless/urls.py b/src/paperless/urls.py index 4b73dc88e..a7775a588 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -44,7 +44,11 @@ urlpatterns = [ # url(r"^$", IndexView.as_view(), name="index"), # File downloads - url(r"^fetch/(?P<pk>\d+)$", FetchView.as_view(), name="fetch"), + url( + r"^fetch/(?P<kind>doc|thumb)/(?P<pk>\d+)$", + FetchView.as_view(), + name="fetch" + ), # The Django admin url(r"admin/", admin.site.urls), From 8a9ea4664c01f104436cfc89119f7429050841dd Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 5 Mar 2016 02:15:26 +0000 Subject: [PATCH 2/6] Cleaned up the thumbnails by switching to .png --- src/documents/migrations/0012_auto_20160305_0040.py | 9 +++++---- src/documents/models.py | 2 +- src/documents/views.py | 6 +++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py index e42c6cde5..876c2c68e 100644 --- a/src/documents/migrations/0012_auto_20160305_0040.py +++ b/src/documents/migrations/0012_auto_20160305_0040.py @@ -65,17 +65,18 @@ def move_documents_and_create_thumbnails(apps, schema_editor): subprocess.Popen(( settings.CONVERT_BINARY, - "-scale", "500x500", + "-scale", "500x5000", + "-alpha", "remove", orig_target, - os.path.join(thumb_temp, "convert-%04d.jpg") + os.path.join(thumb_temp, "convert-%04d.png") )).wait() - thumb_source = os.path.join(thumb_temp, "convert-0000.jpg") + thumb_source = os.path.join(thumb_temp, "convert-0000.png") thumb_target = os.path.join( settings.MEDIA_ROOT, "documents",
"thumbnails", - re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.jpg\\2", f) + re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f) ) with open(thumb_source, "rb") as unencrypted: with open(thumb_target, "wb") as encrypted: diff --git a/src/documents/models.py b/src/documents/models.py index a3ffb8a74..b8baea7f8 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -193,7 +193,7 @@ class Document(models.Model): settings.MEDIA_ROOT, "documents", "thumbnails", - "{:07}.jpg.gpg".format(self.pk) + "{:07}.png.gpg".format(self.pk) ) @property diff --git a/src/documents/views.py b/src/documents/views.py index 4a4a060bf..1dc23aa4f 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -29,7 +29,7 @@ class IndexView(TemplateView): return TemplateView.get_context_data(self, **kwargs) -class FetchView(LoginRequiredMixin, DetailView): +class FetchView(DetailView): model = Document @@ -48,8 +48,8 @@ class FetchView(LoginRequiredMixin, DetailView): if self.kwargs["kind"] == "thumb": return HttpResponse( - GnuPG.decrypted(self.object.thumb_file), - content_type=content_types[Document.TYPE_JPG] + GnuPG.decrypted(self.object.thumbnail_file), + content_type=content_types[Document.TYPE_PNG] ) response = HttpResponse( From 495ed1c36c9c8ebb120449ad0bdac9be27255f3c Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 5 Mar 2016 12:09:06 +0000 Subject: [PATCH 3/6] Added thumbnail generation to the consumer --- src/documents/consumer.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index eeb42cdf1..5cfc20852 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -119,10 +119,11 @@ class Consumer(object): tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) pngs = self._get_greyscale(tempdir, doc) + thumbnail = self._get_thumbnail(tempdir, doc) try: text = self._get_ocr(pngs) - self._store(text, doc) + self._store(text, doc, thumbnail) except
OCRError as e: self._ignore.append(doc) self.log("error", "OCR FAILURE for {}: {}".format(doc, e)) @@ -133,6 +134,9 @@ class Consumer(object): self._cleanup_doc(doc) def _get_greyscale(self, tempdir, doc): + """ + Greyscale images are easier for Tesseract to OCR + """ self.log("info", "Generating greyscale image from {}".format(doc)) @@ -150,6 +154,23 @@ class Consumer(object): return sorted(filter(lambda __: os.path.isfile(__), pngs)) + def _get_thumbnail(self, tempdir, doc): + """ + The thumbnail of a PDF is just a 500px wide image of the first page. + """ + + self.log("info", "Generating the thumbnail") + + subprocess.Popen(( + self.CONVERT, + "-scale", "500x5000", + "-alpha", "remove", + doc, + os.path.join(tempdir, "convert-%04d.png") + )).wait() + + return os.path.join(tempdir, "convert-0000.png") + def _guess_language(self, text): try: guess = langdetect.detect(text) @@ -288,7 +309,7 @@ class Consumer(object): m = re.match(self.REGEX_TITLE, parseable) return None, m.group(1), (), get_suffix(m.group(2)) - def _store(self, text, doc): + def _store(self, text, doc, thumbnail): sender, title, tags, file_type = self._guess_attributes_from_name(doc) relevant_tags = set(list(Tag.match_all(text)) + list(tags)) @@ -313,9 +334,16 @@ class Consumer(object): self.log("debug", "Tagging with {}".format(tag_names)) document.tags.add(*relevant_tags) + # Encrypt and store the actual document with open(doc, "rb") as unencrypted: with open(document.source_path, "wb") as encrypted: - self.log("debug", "Encrypting") + self.log("debug", "Encrypting the document") + encrypted.write(GnuPG.encrypted(unencrypted)) + + # Encrypt and store the thumbnail + with open(thumbnail, "rb") as unencrypted: + with open(document.thumbnail_path, "wb") as encrypted: + self.log("debug", "Encrypting the thumbnail") encrypted.write(GnuPG.encrypted(unencrypted)) self.log("info", "Completed") From ac40aee805a7289a721469b50f146d5c3801cdfe Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 5 Mar 2016 
12:31:43 +0000 Subject: [PATCH 4/6] Added some nice output so the migration is less scary --- .../migrations/0012_auto_20160305_0040.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py index 876c2c68e..618ace5d8 100644 --- a/src/documents/migrations/0012_auto_20160305_0040.py +++ b/src/documents/migrations/0012_auto_20160305_0040.py @@ -11,6 +11,7 @@ import tempfile from django.conf import settings from django.db import migrations +from django.utils.termcolors import colorize as colourise # Spelling hurts me class GnuPG(object): @@ -42,14 +43,25 @@ def move_documents_and_create_thumbnails(apps, schema_editor): if not documents: return - print("\n") + print(colourise( + "\n\n" + " This is a one-time only migration to generate thumbnails for all of your\n" + " documents so that future UIs will have something to work with. If you have\n" + " a lot of documents though, this may take a while, so a coffee break may be\n" + " in order." 
+ "\n", opts=("bold",) + )) for f in sorted(documents): if not f.endswith("gpg"): continue - print(" * Generating a thumbnail for {}".format(f)) + print(" {} {} {}".format( + colourise("*", fg="green"), + colourise("Generating a thumbnail for", fg="white"), + colourise(f, fg="cyan") + )) thumb_temp = tempfile.mkdtemp( prefix="paperless", dir=settings.SCRATCH_DIR) From 034b96277cbb15ca178b3c60ea10a1db0da0c782 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 5 Mar 2016 12:34:26 +0000 Subject: [PATCH 5/6] Added thumbnail_url to the API --- src/documents/serialisers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index c2b2ae7fd..db50d34ba 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -37,7 +37,8 @@ class DocumentSerializer(serializers.ModelSerializer): "created", "modified", "file_name", - "download_url" + "download_url", + "thumbnail_url", ) From bfad4560e139257ec81ccd284984001ee53bfce9 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 5 Mar 2016 12:43:05 +0000 Subject: [PATCH 6/6] Fixed the check for empty installations --- src/documents/migrations/0012_auto_20160305_0040.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/documents/migrations/0012_auto_20160305_0040.py b/src/documents/migrations/0012_auto_20160305_0040.py index 618ace5d8..62a5c65bc 100644 --- a/src/documents/migrations/0012_auto_20160305_0040.py +++ b/src/documents/migrations/0012_auto_20160305_0040.py @@ -40,7 +40,7 @@ def move_documents_and_create_thumbnails(apps, schema_editor): documents = os.listdir(os.path.join(settings.MEDIA_ROOT, "documents")) - if not documents: + if set(documents) == {"originals", "thumbnails"}: return print(colourise(