mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #72 from danielquinn/feature/thumbnails
Add thumbnails to the document. Closes #71.
This commit is contained in:
commit
fb7f826688
5
.gitignore
vendored
5
.gitignore
vendored
@ -57,7 +57,9 @@ docs/_build/
|
|||||||
target/
|
target/
|
||||||
|
|
||||||
# Stored PDFs
|
# Stored PDFs
|
||||||
media/*
|
media/documents/*.gpg
|
||||||
|
media/documents/thumbnails/*.gpg
|
||||||
|
media/documents/originals/*.gpg
|
||||||
|
|
||||||
# Sqlite database
|
# Sqlite database
|
||||||
db.sqlite3
|
db.sqlite3
|
||||||
@ -74,4 +76,3 @@ docker-compose.env
|
|||||||
# Used for development
|
# Used for development
|
||||||
scripts/import-for-development
|
scripts/import-for-development
|
||||||
environment
|
environment
|
||||||
|
|
||||||
|
0
media/documents/originals/.keep
Normal file
0
media/documents/originals/.keep
Normal file
0
media/documents/thumbnails/.keep
Normal file
0
media/documents/thumbnails/.keep
Normal file
@ -119,10 +119,11 @@ class Consumer(object):
|
|||||||
|
|
||||||
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
|
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
|
||||||
pngs = self._get_greyscale(tempdir, doc)
|
pngs = self._get_greyscale(tempdir, doc)
|
||||||
|
thumbnail = self._get_thumbnail(tempdir, doc)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
text = self._get_ocr(pngs)
|
text = self._get_ocr(pngs)
|
||||||
self._store(text, doc)
|
self._store(text, doc, thumbnail)
|
||||||
except OCRError as e:
|
except OCRError as e:
|
||||||
self._ignore.append(doc)
|
self._ignore.append(doc)
|
||||||
self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
|
self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
|
||||||
@ -133,6 +134,9 @@ class Consumer(object):
|
|||||||
self._cleanup_doc(doc)
|
self._cleanup_doc(doc)
|
||||||
|
|
||||||
def _get_greyscale(self, tempdir, doc):
|
def _get_greyscale(self, tempdir, doc):
|
||||||
|
"""
|
||||||
|
Greyscale images are easier for Tesseract to OCR
|
||||||
|
"""
|
||||||
|
|
||||||
self.log("info", "Generating greyscale image from {}".format(doc))
|
self.log("info", "Generating greyscale image from {}".format(doc))
|
||||||
|
|
||||||
@ -150,6 +154,23 @@ class Consumer(object):
|
|||||||
|
|
||||||
return sorted(filter(lambda __: os.path.isfile(__), pngs))
|
return sorted(filter(lambda __: os.path.isfile(__), pngs))
|
||||||
|
|
||||||
|
def _get_thumbnail(self, tempdir, doc):
|
||||||
|
"""
|
||||||
|
The thumbnail of a PDF is just a 500px wide image of the first page.
|
||||||
|
"""
|
||||||
|
|
||||||
|
self.log("info", "Generating the thumbnail")
|
||||||
|
|
||||||
|
subprocess.Popen((
|
||||||
|
self.CONVERT,
|
||||||
|
"-scale", "500x5000",
|
||||||
|
"-alpha", "remove",
|
||||||
|
doc,
|
||||||
|
os.path.join(tempdir, "convert-%04d.png")
|
||||||
|
)).wait()
|
||||||
|
|
||||||
|
return os.path.join(tempdir, "convert-0000.png")
|
||||||
|
|
||||||
def _guess_language(self, text):
|
def _guess_language(self, text):
|
||||||
try:
|
try:
|
||||||
guess = langdetect.detect(text)
|
guess = langdetect.detect(text)
|
||||||
@ -288,7 +309,7 @@ class Consumer(object):
|
|||||||
m = re.match(self.REGEX_TITLE, parseable)
|
m = re.match(self.REGEX_TITLE, parseable)
|
||||||
return None, m.group(1), (), get_suffix(m.group(2))
|
return None, m.group(1), (), get_suffix(m.group(2))
|
||||||
|
|
||||||
def _store(self, text, doc):
|
def _store(self, text, doc, thumbnail):
|
||||||
|
|
||||||
sender, title, tags, file_type = self._guess_attributes_from_name(doc)
|
sender, title, tags, file_type = self._guess_attributes_from_name(doc)
|
||||||
relevant_tags = set(list(Tag.match_all(text)) + list(tags))
|
relevant_tags = set(list(Tag.match_all(text)) + list(tags))
|
||||||
@ -313,9 +334,16 @@ class Consumer(object):
|
|||||||
self.log("debug", "Tagging with {}".format(tag_names))
|
self.log("debug", "Tagging with {}".format(tag_names))
|
||||||
document.tags.add(*relevant_tags)
|
document.tags.add(*relevant_tags)
|
||||||
|
|
||||||
|
# Encrypt and store the actual document
|
||||||
with open(doc, "rb") as unencrypted:
|
with open(doc, "rb") as unencrypted:
|
||||||
with open(document.source_path, "wb") as encrypted:
|
with open(document.source_path, "wb") as encrypted:
|
||||||
self.log("debug", "Encrypting")
|
self.log("debug", "Encrypting the document")
|
||||||
|
encrypted.write(GnuPG.encrypted(unencrypted))
|
||||||
|
|
||||||
|
# Encrypt and store the thumbnail
|
||||||
|
with open(thumbnail, "rb") as unencrypted:
|
||||||
|
with open(document.thumbnail_path, "wb") as encrypted:
|
||||||
|
self.log("debug", "Encrypting the thumbnail")
|
||||||
encrypted.write(GnuPG.encrypted(unencrypted))
|
encrypted.write(GnuPG.encrypted(unencrypted))
|
||||||
|
|
||||||
self.log("info", "Completed")
|
self.log("info", "Completed")
|
||||||
|
114
src/documents/migrations/0012_auto_20160305_0040.py
Normal file
114
src/documents/migrations/0012_auto_20160305_0040.py
Normal file
@ -0,0 +1,114 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Generated by Django 1.9.2 on 2016-03-05 00:40
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import gnupg
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
from django.db import migrations
|
||||||
|
from django.utils.termcolors import colorize as colourise # Spelling hurts me
|
||||||
|
|
||||||
|
|
||||||
|
class GnuPG(object):
    """
    Encrypt/decrypt helper built around one shared ``gnupg.GPG`` instance.

    The instance is created once, when the class body is evaluated, with
    its home directory taken from ``settings.GNUPG_HOME``.  Both
    operations use ``settings.PASSPHRASE``.
    """

    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

    @classmethod
    def decrypted(cls, file_handle):
        """
        Decrypt the contents of ``file_handle`` and return the ``data``
        attribute of the gnupg result.
        """
        outcome = cls.gpg.decrypt_file(
            file_handle,
            passphrase=settings.PASSPHRASE
        )
        return outcome.data

    @classmethod
    def encrypted(cls, file_handle):
        """
        Symmetrically encrypt the contents of ``file_handle`` (no
        recipients, passphrase only) and return the ``data`` attribute of
        the gnupg result.
        """
        outcome = cls.gpg.encrypt_file(
            file_handle,
            recipients=None,
            passphrase=settings.PASSPHRASE,
            symmetric=True
        )
        return outcome.data
|
||||||
|
|
||||||
|
|
||||||
|
def move_documents_and_create_thumbnails(apps, schema_editor):
    """
    Generate a thumbnail for every stored document and move the documents
    into the new ``originals`` directory.

    For each ``*gpg`` file directly inside ``MEDIA_ROOT/documents`` this:

    1. decrypts it into a scratch directory,
    2. runs ``convert`` on it, writing PNG output to a second scratch
       directory,
    3. encrypts ``convert-0000.png`` into
       ``MEDIA_ROOT/documents/thumbnails/<number>.png.gpg``,
    4. moves the encrypted original into
       ``MEDIA_ROOT/documents/originals``.

    If the documents directory contains nothing but ``originals`` and
    ``thumbnails`` there is nothing to do and the function returns
    immediately.

    ``apps`` and ``schema_editor`` are supplied by ``RunPython`` and are
    not used.
    """

    documents_dir = os.path.join(settings.MEDIA_ROOT, "documents")
    originals_dir = os.path.join(documents_dir, "originals")
    thumbnails_dir = os.path.join(documents_dir, "thumbnails")

    documents = os.listdir(documents_dir)

    if set(documents) == {"originals", "thumbnails"}:
        return

    print(colourise(
        "\n\n"
        " This is a one-time only migration to generate thumbnails for all of your\n"
        " documents so that future UIs will have something to work with. If you have\n"
        " a lot of documents though, this may take a while, so a coffee break may be\n"
        " in order."
        "\n", opts=("bold",)
    ))

    # The target directories are not guaranteed to exist (e.g. when
    # MEDIA_ROOT points outside the checkout), and writing into a missing
    # directory would abort the migration half-way through.
    for directory in (originals_dir, thumbnails_dir):
        if not os.path.isdir(directory):
            os.makedirs(directory)

    for f in sorted(documents):

        if not f.endswith("gpg"):
            continue

        print(" {} {} {}".format(
            colourise("*", fg="green"),
            colourise("Generating a thumbnail for", fg="white"),
            colourise(f, fg="cyan")
        ))

        orig_source = os.path.join(documents_dir, f)

        thumb_temp = None
        orig_temp = None

        try:

            thumb_temp = tempfile.mkdtemp(
                prefix="paperless", dir=settings.SCRATCH_DIR)
            orig_temp = tempfile.mkdtemp(
                prefix="paperless", dir=settings.SCRATCH_DIR)

            orig_target = os.path.join(orig_temp, f.replace(".gpg", ""))

            with open(orig_source, "rb") as encrypted:
                with open(orig_target, "wb") as unencrypted:
                    unencrypted.write(GnuPG.decrypted(encrypted))

            subprocess.Popen((
                settings.CONVERT_BINARY,
                "-scale", "500x5000",
                "-alpha", "remove",
                orig_target,
                os.path.join(thumb_temp, "convert-%04d.png")
            )).wait()

            thumb_source = os.path.join(thumb_temp, "convert-0000.png")
            thumb_target = os.path.join(
                thumbnails_dir,
                re.sub(r"(\d+)\.\w+(\.gpg)", "\\1.png\\2", f)
            )
            with open(thumb_source, "rb") as unencrypted:
                with open(thumb_target, "wb") as encrypted:
                    encrypted.write(GnuPG.encrypted(unencrypted))

        finally:
            # The scratch directories hold decrypted document data, so
            # they must be removed even when a step above raises.
            for scratch in (thumb_temp, orig_temp):
                if scratch is not None:
                    shutil.rmtree(scratch)

        # Only reached when the thumbnail was written successfully, so a
        # failed document stays where it was and is retried on a re-run.
        shutil.move(orig_source, os.path.join(originals_dir, f))
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """
    Data-only migration: runs ``move_documents_and_create_thumbnails``.

    ``RunPython`` is given no reverse callable here.
    """

    dependencies = [("documents", "0011_auto_20160303_1929")]

    operations = [migrations.RunPython(move_documents_and_create_thumbnails)]
|
@ -171,6 +171,7 @@ class Document(models.Model):
|
|||||||
return os.path.join(
|
return os.path.join(
|
||||||
settings.MEDIA_ROOT,
|
settings.MEDIA_ROOT,
|
||||||
"documents",
|
"documents",
|
||||||
|
"originals",
|
||||||
"{:07}.{}.gpg".format(self.pk, self.file_type)
|
"{:07}.{}.gpg".format(self.pk, self.file_type)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -184,7 +185,24 @@ class Document(models.Model):
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def download_url(self):
|
def download_url(self):
|
||||||
return reverse("fetch", kwargs={"pk": self.pk})
|
return reverse("fetch", kwargs={"kind": "doc", "pk": self.pk})
|
||||||
|
|
||||||
|
@property
def thumbnail_path(self):
    """
    Path of this document's encrypted thumbnail:
    ``MEDIA_ROOT/documents/thumbnails/<pk zero-padded to 7>.png.gpg``.
    """
    file_name = "{:07}.png.gpg".format(self.pk)
    thumbnails_dir = os.path.join(
        settings.MEDIA_ROOT, "documents", "thumbnails")
    return os.path.join(thumbnails_dir, file_name)
|
||||||
|
|
||||||
|
@property
def thumbnail_file(self):
    """
    A handle on the encrypted thumbnail, opened for binary reading.

    Each access opens the file again; nothing here closes the handle.
    """
    path = self.thumbnail_path
    return open(path, "rb")
|
||||||
|
|
||||||
|
@property
def thumbnail_url(self):
    """
    URL of the ``fetch`` route for this document with ``kind`` set to
    ``thumb``.
    """
    route_kwargs = {"kind": "thumb", "pk": self.pk}
    return reverse("fetch", kwargs=route_kwargs)
|
||||||
|
|
||||||
|
|
||||||
class Log(models.Model):
|
class Log(models.Model):
|
||||||
|
@ -37,7 +37,8 @@ class DocumentSerializer(serializers.ModelSerializer):
|
|||||||
"created",
|
"created",
|
||||||
"modified",
|
"modified",
|
||||||
"file_name",
|
"file_name",
|
||||||
"download_url"
|
"download_url",
|
||||||
|
"thumbnail_url",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -29,13 +29,13 @@ class IndexView(TemplateView):
|
|||||||
return TemplateView.get_context_data(self, **kwargs)
|
return TemplateView.get_context_data(self, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class FetchView(LoginRequiredMixin, DetailView):
|
class FetchView(DetailView):
|
||||||
|
|
||||||
model = Document
|
model = Document
|
||||||
|
|
||||||
def render_to_response(self, context, **response_kwargs):
|
def render_to_response(self, context, **response_kwargs):
|
||||||
"""
|
"""
|
||||||
Override the default to return the unencrypted PDF as raw data.
|
Override the default to return the unencrypted image/PDF as raw data.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
content_types = {
|
content_types = {
|
||||||
@ -46,6 +46,12 @@ class FetchView(LoginRequiredMixin, DetailView):
|
|||||||
Document.TYPE_TIF: "image/tiff",
|
Document.TYPE_TIF: "image/tiff",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if self.kwargs["kind"] == "thumb":
|
||||||
|
return HttpResponse(
|
||||||
|
GnuPG.decrypted(self.object.thumbnail_file),
|
||||||
|
content_type=content_types[Document.TYPE_PNG]
|
||||||
|
)
|
||||||
|
|
||||||
response = HttpResponse(
|
response = HttpResponse(
|
||||||
GnuPG.decrypted(self.object.source_file),
|
GnuPG.decrypted(self.object.source_file),
|
||||||
content_type=content_types[self.object.file_type]
|
content_type=content_types[self.object.file_type]
|
||||||
|
@ -44,7 +44,11 @@ urlpatterns = [
|
|||||||
# url(r"^$", IndexView.as_view(), name="index"),
|
# url(r"^$", IndexView.as_view(), name="index"),
|
||||||
|
|
||||||
# File downloads
|
# File downloads
|
||||||
url(r"^fetch/(?P<pk>\d+)$", FetchView.as_view(), name="fetch"),
|
url(
|
||||||
|
r"^fetch/(?P<kind>doc|thumb)/(?P<pk>\d+)$",
|
||||||
|
FetchView.as_view(),
|
||||||
|
name="fetch"
|
||||||
|
),
|
||||||
|
|
||||||
# The Django admin
|
# The Django admin
|
||||||
url(r"admin/", admin.site.urls),
|
url(r"admin/", admin.site.urls),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user