#12: Support image documents

This commit is contained in:
Daniel Quinn 2016-01-29 23:18:03 +00:00
parent d6f4ef27aa
commit ace9389e5f
5 changed files with 104 additions and 57 deletions

View File

@ -34,7 +34,7 @@ class MonthListFilter(admin.SimpleListFilter):
class DocumentAdmin(admin.ModelAdmin): class DocumentAdmin(admin.ModelAdmin):
search_fields = ("sender__name", "title", "content",) search_fields = ("sender__name", "title", "content",)
list_display = ("edit", "created", "sender", "title", "tags_", "pdf") list_display = ("edit", "created", "sender", "title", "tags_", "document")
list_filter = (MonthListFilter, "tags", "sender") list_filter = (MonthListFilter, "tags", "sender")
list_editable = ("sender", "title",) list_editable = ("sender", "title",)
list_per_page = 25 list_per_page = 25
@ -44,14 +44,14 @@ class DocumentAdmin(admin.ModelAdmin):
static("documents/img/edit.png")) static("documents/img/edit.png"))
edit.allow_tags = True edit.allow_tags = True
def pdf(self, obj): def document(self, obj):
return '<a href="{}">' \ return '<a href="{}">' \
'<img src="{}" width="22" height="22" alt="PDF icon">' \ '<img src="{}" width="22" height="22" alt="PDF icon">' \
'</a>'.format( '</a>'.format(
reverse("fetch", kwargs={"pk": obj.pk}), reverse("fetch", kwargs={"pk": obj.pk}),
static("documents/img/application-pdf.png") static("documents/img/application-pdf.png")
) )
pdf.allow_tags = True document.allow_tags = True
def tags_(self, obj): def tags_(self, obj):
r = "" r = ""

View File

@ -31,9 +31,9 @@ class Command(BaseCommand):
Loop over every file found in CONSUMPTION_DIR and: Loop over every file found in CONSUMPTION_DIR and:
1. Convert it to a greyscale png 1. Convert it to a greyscale png
2. Use tesseract on the png 2. Use tesseract on the png
3. Encrypt and store the PDF in the MEDIA_ROOT 3. Encrypt and store the document in the MEDIA_ROOT
4. Store the OCR'd text in the database 4. Store the OCR'd text in the database
5. Delete the pdf and image(s) 5. Delete the document and image(s)
""" """
LOOP_TIME = 10 # Seconds LOOP_TIME = 10 # Seconds
@ -44,10 +44,12 @@ class Command(BaseCommand):
OCR = pyocr.get_available_tools()[0] OCR = pyocr.get_available_tools()[0]
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf") MEDIA_DOCS = os.path.join(settings.MEDIA_ROOT, "documents")
PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$") PARSER_REGEX_TITLE = re.compile(
PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$") r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
PARSER_REGEX_SENDER_TITLE = re.compile(
r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)", flags=re.IGNORECASE)
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -74,35 +76,35 @@ class Command(BaseCommand):
def loop(self): def loop(self):
for pdf in os.listdir(self.CONSUME): for doc in os.listdir(self.CONSUME):
pdf = os.path.join(self.CONSUME, pdf) doc = os.path.join(self.CONSUME, doc)
if not os.path.isfile(pdf): if not os.path.isfile(doc):
continue continue
if not re.match(self.PARSER_REGEX_TITLE, pdf): if not re.match(self.PARSER_REGEX_TITLE, doc):
continue continue
if pdf in self._ignore: if doc in self._ignore:
continue continue
if self._is_ready(pdf): if self._is_ready(doc):
continue continue
self._render("Consuming {}".format(pdf), 1) self._render("Consuming {}".format(doc), 1)
pngs = self._get_greyscale(pdf) pngs = self._get_greyscale(doc)
try: try:
text = self._get_ocr(pngs) text = self._get_ocr(pngs)
except OCRError: except OCRError:
self._ignore.append(pdf) self._ignore.append(doc)
self._render("OCR FAILURE: {}".format(pdf), 0) self._render("OCR FAILURE: {}".format(doc), 0)
continue continue
self._store(text, pdf) self._store(text, doc)
self._cleanup(pngs, pdf) self._cleanup(pngs, doc)
def _setup(self): def _setup(self):
@ -116,29 +118,29 @@ class Command(BaseCommand):
raise CommandError("Consumption directory {} does not exist".format( raise CommandError("Consumption directory {} does not exist".format(
self.CONSUME)) self.CONSUME))
for d in (self.SCRATCH, self.MEDIA_PDF): for d in (self.SCRATCH, self.MEDIA_DOCS):
try: try:
os.makedirs(d) os.makedirs(d)
except FileExistsError: except FileExistsError:
pass pass
def _is_ready(self, pdf): def _is_ready(self, doc):
""" """
Detect whether `pdf` is ready to consume or if it's still being written Detect whether `doc` is ready to consume or if it's still being written
to by the scanner. to by the scanner.
""" """
t = os.stat(pdf).st_mtime t = os.stat(doc).st_mtime
if self.stats.get(pdf) == t: if self.stats.get(doc) == t:
del(self.stats[pdf]) del(self.stats[doc])
return True return True
self.stats[pdf] = t self.stats[doc] = t
return False return False
def _get_greyscale(self, pdf): def _get_greyscale(self, doc):
self._render(" Generating greyscale image", 2) self._render(" Generating greyscale image", 2)
@ -147,14 +149,14 @@ class Command(BaseCommand):
subprocess.Popen(( subprocess.Popen((
self.CONVERT, "-density", "300", "-depth", "8", self.CONVERT, "-density", "300", "-depth", "8",
"-type", "grayscale", pdf, png "-type", "grayscale", doc, png
)).wait() )).wait()
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
def _get_ocr(self, pngs): def _get_ocr(self, pngs):
self._render(" OCRing the PDF", 2) self._render(" OCRing the document", 2)
raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE) raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
@ -203,19 +205,22 @@ class Command(BaseCommand):
# Strip out excess white space to allow matching to go smoother # Strip out excess white space to allow matching to go smoother
return re.sub(r"\s+", " ", r) return re.sub(r"\s+", " ", r)
def _store(self, text, pdf): def _store(self, text, doc):
sender, title = self._parse_file_name(pdf) sender, title, file_type = self._parse_file_name(doc)
relevant_tags = [t for t in Tag.objects.all() if t.matches(text.lower())]
stats = os.stat(pdf) lower_text = text.lower()
relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]
stats = os.stat(doc)
self._render(" Saving record to database", 2) self._render(" Saving record to database", 2)
doc = Document.objects.create( document = Document.objects.create(
sender=sender, sender=sender,
title=title, title=title,
content=text, content=text,
file_type=file_type,
created=timezone.make_aware( created=timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime)), datetime.datetime.fromtimestamp(stats.st_mtime)),
modified=timezone.make_aware( modified=timezone.make_aware(
@ -225,38 +230,38 @@ class Command(BaseCommand):
if relevant_tags: if relevant_tags:
tag_names = ", ".join([t.slug for t in relevant_tags]) tag_names = ", ".join([t.slug for t in relevant_tags])
self._render(" Tagging with {}".format(tag_names), 2) self._render(" Tagging with {}".format(tag_names), 2)
doc.tags.add(*relevant_tags) document.tags.add(*relevant_tags)
with open(pdf, "rb") as unencrypted: with open(doc, "rb") as unencrypted:
with open(doc.pdf_path, "wb") as encrypted: with open(document.source_path, "wb") as encrypted:
self._render(" Encrypting", 3) self._render(" Encrypting", 3)
encrypted.write(GnuPG.encrypted(unencrypted)) encrypted.write(GnuPG.encrypted(unencrypted))
def _parse_file_name(self, pdf): def _parse_file_name(self, doc):
""" """
We use a crude naming convention to make handling the sender and title We use a crude naming convention to make handling the sender and title
easier: easier:
"sender - title.pdf" "<sender> - <title>.<suffix>"
""" """
# First we attempt "sender - title.pdf" # First we attempt "<sender> - <title>.<suffix>"
m = re.match(self.PARSER_REGEX_SENDER_TITLE, pdf) m = re.match(self.PARSER_REGEX_SENDER_TITLE, doc)
if m: if m:
sender_name, title = m.group(1), m.group(2) sender_name, title, file_type = m.group(1), m.group(2), m.group(3)
sender, __ = Sender.objects.get_or_create( sender, __ = Sender.objects.get_or_create(
name=sender_name, defaults={"slug": slugify(sender_name)}) name=sender_name, defaults={"slug": slugify(sender_name)})
return sender, title return sender, title, file_type
# That didn't work, so we assume sender is None # That didn't work, so we assume sender is None
m = re.match(self.PARSER_REGEX_TITLE, pdf) m = re.match(self.PARSER_REGEX_TITLE, doc)
return None, m.group(1) return None, m.group(1), m.group(2)
def _cleanup(self, pngs, pdf): def _cleanup(self, pngs, doc):
png_glob = os.path.join( png_glob = os.path.join(
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])) self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
for f in list(glob.glob(png_glob)) + [pdf]: for f in list(glob.glob(png_glob)) + [doc]:
self._render(" Deleting {}".format(f), 2) self._render(" Deleting {}".format(f), 2)
os.unlink(f) os.unlink(f)

View File

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9 on 2016-01-29 22:58
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the non-editable ``file_type`` column to the ``document`` model.

    The field is created with ``default='pdf'`` and
    ``preserve_default=False``; per Django's ``AddField`` documentation that
    combination means the default is only used to populate rows that already
    exist when the migration runs and is not kept on the field afterwards.
    """

    # Must run after the previous migration of the ``documents`` app.
    dependencies = [
        ('documents', '0007_auto_20160126_2114'),
    ]

    operations = [
        migrations.AddField(
            model_name='document',
            name='file_type',
            field=models.CharField(
                max_length=4,
                editable=False,
                default='pdf',
                # (stored value, human-readable label) pairs
                choices=[
                    ('pdf', 'PDF'),
                    ('png', 'PNG'),
                    ('jpg', 'JPG'),
                    ('gif', 'GIF'),
                    ('tiff', 'TIFF'),
                ],
            ),
            preserve_default=False,
        ),
    ]

View File

@ -111,10 +111,22 @@ class Tag(SluggedModel):
class Document(models.Model): class Document(models.Model):
TYPE_PDF = "pdf"
TYPE_PNG = "png"
TYPE_JPG = "jpg"
TYPE_GIF = "gif"
TYPE_TIF = "tiff"
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
sender = models.ForeignKey( sender = models.ForeignKey(
Sender, blank=True, null=True, related_name="documents") Sender, blank=True, null=True, related_name="documents")
title = models.CharField(max_length=128, blank=True, db_index=True) title = models.CharField(max_length=128, blank=True, db_index=True)
content = models.TextField(db_index=True) content = models.TextField(db_index=True)
file_type = models.CharField(
max_length=4,
editable=False,
choices=tuple([(t, t.upper()) for t in TYPES])
)
tags = models.ManyToManyField(Tag, related_name="documents") tags = models.ManyToManyField(Tag, related_name="documents")
created = models.DateTimeField(default=timezone.now, editable=False) created = models.DateTimeField(default=timezone.now, editable=False)
modified = models.DateTimeField(auto_now=True, editable=False) modified = models.DateTimeField(auto_now=True, editable=False)
@ -131,20 +143,19 @@ class Document(models.Model):
return str(created) return str(created)
@property @property
def pdf_path(self): def source_path(self):
return os.path.join( return os.path.join(
settings.MEDIA_ROOT, settings.MEDIA_ROOT,
"documents", "documents",
"pdf", "{:07}.{}.gpg".format(self.pk, self.file_type)
"{:07}.pdf.gpg".format(self.pk)
) )
@property @property
def pdf(self): def source_file(self):
return open(self.pdf_path, "rb") return open(self.source_path, "rb")
@property @property
def parseable_file_name(self): def parseable_file_name(self):
if self.sender and self.title: if self.sender and self.title:
return "{} - {}.pdf".format(self.sender, self.title) return "{} - {}.{}".format(self.sender, self.title, self.file_types)
return os.path.basename(self.pdf_path) return os.path.basename(self.source_path)

View File

@ -16,9 +16,19 @@ class PdfView(DetailView):
Override the default to return the unencrypted PDF as raw data. Override the default to return the unencrypted PDF as raw data.
""" """
content_types = {
Document.TYPE_PDF: "application/pdf",
Document.TYPE_PNG: "image/png",
Document.TYPE_JPG: "image/jpeg",
Document.TYPE_GIF: "image/gif",
Document.TYPE_TIF: "image/tiff",
}
response = HttpResponse( response = HttpResponse(
GnuPG.decrypted(self.object.pdf), content_type="application/pdf") GnuPG.decrypted(self.object.source_file),
content_type=content_types[self.object.file_type]
)
response["Content-Disposition"] = 'attachment; filename="{}"'.format( response["Content-Disposition"] = 'attachment; filename="{}"'.format(
slugify(str(self.object)) + ".pdf") slugify(str(self.object)) + "." + self.object.file_type)
return response return response