mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
#12: Support image documents
This commit is contained in:
parent
d6f4ef27aa
commit
ace9389e5f
@ -34,7 +34,7 @@ class MonthListFilter(admin.SimpleListFilter):
|
|||||||
class DocumentAdmin(admin.ModelAdmin):
|
class DocumentAdmin(admin.ModelAdmin):
|
||||||
|
|
||||||
search_fields = ("sender__name", "title", "content",)
|
search_fields = ("sender__name", "title", "content",)
|
||||||
list_display = ("edit", "created", "sender", "title", "tags_", "pdf")
|
list_display = ("edit", "created", "sender", "title", "tags_", "document")
|
||||||
list_filter = (MonthListFilter, "tags", "sender")
|
list_filter = (MonthListFilter, "tags", "sender")
|
||||||
list_editable = ("sender", "title",)
|
list_editable = ("sender", "title",)
|
||||||
list_per_page = 25
|
list_per_page = 25
|
||||||
@ -44,14 +44,14 @@ class DocumentAdmin(admin.ModelAdmin):
|
|||||||
static("documents/img/edit.png"))
|
static("documents/img/edit.png"))
|
||||||
edit.allow_tags = True
|
edit.allow_tags = True
|
||||||
|
|
||||||
def document(self, obj):
    """
    Build the admin list cell that links to the stored document.

    The anchor points at the "fetch" URL for `obj.pk` and wraps the static
    PDF icon image.
    """
    template = (
        '<a href="{}">'
        '<img src="{}" width="22" height="22" alt="PDF icon">'
        '</a>'
    )
    fetch_url = reverse("fetch", kwargs={"pk": obj.pk})
    icon_url = static("documents/img/application-pdf.png")
    return template.format(fetch_url, icon_url)
# Flag the returned markup as HTML for the admin list view.
document.allow_tags = True
|
||||||
|
|
||||||
def tags_(self, obj):
|
def tags_(self, obj):
|
||||||
r = ""
|
r = ""
|
||||||
|
@ -31,9 +31,9 @@ class Command(BaseCommand):
|
|||||||
Loop over every file found in CONSUMPTION_DIR and:
|
Loop over every file found in CONSUMPTION_DIR and:
|
||||||
1. Convert it to a greyscale png
|
1. Convert it to a greyscale png
|
||||||
2. Use tesseract on the png
|
2. Use tesseract on the png
|
||||||
3. Encrypt and store the PDF in the MEDIA_ROOT
|
3. Encrypt and store the document in the MEDIA_ROOT
|
||||||
4. Store the OCR'd text in the database
|
4. Store the OCR'd text in the database
|
||||||
5. Delete the pdf and image(s)
|
5. Delete the document and image(s)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
LOOP_TIME = 10 # Seconds
|
LOOP_TIME = 10 # Seconds
|
||||||
@ -44,10 +44,12 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
OCR = pyocr.get_available_tools()[0]
|
OCR = pyocr.get_available_tools()[0]
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||||
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
|
MEDIA_DOCS = os.path.join(settings.MEDIA_ROOT, "documents")
|
||||||
|
|
||||||
# File-name patterns, matched against the full path of a file in the
# consumption directory. Both are case-insensitive and both are anchored at
# the end, so a name that merely *contains* a supported suffix (for example
# "a - b.pdf.part") is rejected by each of them alike.
#
# Groups: (title, suffix)
PARSER_REGEX_TITLE = re.compile(
    r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
# Groups: (sender, title, suffix)
PARSER_REGEX_SENDER_TITLE = re.compile(
    r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
|
|
||||||
@ -74,35 +76,35 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
def loop(self):
    """
    Make one pass over the entries in the consumption directory.

    An entry is skipped when it is not a regular file, when its path does
    not match PARSER_REGEX_TITLE, when it is on the ignore list (a previous
    OCR attempt failed), or when `_is_ready()` does not yet report it as
    ready.  Every other file is converted to greyscale images, OCR'd,
    stored and finally cleaned up.
    """

    for name in os.listdir(self.CONSUME):

        doc = os.path.join(self.CONSUME, name)

        if not os.path.isfile(doc):
            continue

        if not re.match(self.PARSER_REGEX_TITLE, doc):
            continue

        if doc in self._ignore:
            continue

        # _is_ready() returns True only once the file's mtime is unchanged
        # between two passes.  Skip the file until then; skipping when it
        # *is* ready would consume files that are still being written.
        if not self._is_ready(doc):
            continue

        self._render("Consuming {}".format(doc), 1)

        pngs = self._get_greyscale(doc)

        try:
            text = self._get_ocr(pngs)
        except OCRError:
            # Remember the failure so the file is not retried every pass.
            self._ignore.append(doc)
            self._render("OCR FAILURE: {}".format(doc), 0)
            continue

        self._store(text, doc)
        self._cleanup(pngs, doc)
|
||||||
|
|
||||||
def _setup(self):
|
def _setup(self):
|
||||||
|
|
||||||
@ -116,29 +118,29 @@ class Command(BaseCommand):
|
|||||||
raise CommandError("Consumption directory {} does not exist".format(
|
raise CommandError("Consumption directory {} does not exist".format(
|
||||||
self.CONSUME))
|
self.CONSUME))
|
||||||
|
|
||||||
for d in (self.SCRATCH, self.MEDIA_PDF):
|
for d in (self.SCRATCH, self.MEDIA_DOCS):
|
||||||
try:
|
try:
|
||||||
os.makedirs(d)
|
os.makedirs(d)
|
||||||
except FileExistsError:
|
except FileExistsError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def _is_ready(self, pdf):
|
def _is_ready(self, doc):
|
||||||
"""
|
"""
|
||||||
Detect whether `pdf` is ready to consume or if it's still being written
|
Detect whether `doc` is ready to consume or if it's still being written
|
||||||
to by the scanner.
|
to by the scanner.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
t = os.stat(pdf).st_mtime
|
t = os.stat(doc).st_mtime
|
||||||
|
|
||||||
if self.stats.get(pdf) == t:
|
if self.stats.get(doc) == t:
|
||||||
del(self.stats[pdf])
|
del(self.stats[doc])
|
||||||
return True
|
return True
|
||||||
|
|
||||||
self.stats[pdf] = t
|
self.stats[doc] = t
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _get_greyscale(self, pdf):
|
def _get_greyscale(self, doc):
|
||||||
|
|
||||||
self._render(" Generating greyscale image", 2)
|
self._render(" Generating greyscale image", 2)
|
||||||
|
|
||||||
@ -147,14 +149,14 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
subprocess.Popen((
|
subprocess.Popen((
|
||||||
self.CONVERT, "-density", "300", "-depth", "8",
|
self.CONVERT, "-density", "300", "-depth", "8",
|
||||||
"-type", "grayscale", pdf, png
|
"-type", "grayscale", doc, png
|
||||||
)).wait()
|
)).wait()
|
||||||
|
|
||||||
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
|
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
|
||||||
|
|
||||||
def _get_ocr(self, pngs):
|
def _get_ocr(self, pngs):
|
||||||
|
|
||||||
self._render(" OCRing the PDF", 2)
|
self._render(" OCRing the document", 2)
|
||||||
|
|
||||||
raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
|
raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
|
||||||
|
|
||||||
@ -203,19 +205,22 @@ class Command(BaseCommand):
|
|||||||
# Strip out excess white space to allow matching to go smoother
|
# Strip out excess white space to allow matching to go smoother
|
||||||
return re.sub(r"\s+", " ", r)
|
return re.sub(r"\s+", " ", r)
|
||||||
|
|
||||||
def _store(self, text, pdf):
|
def _store(self, text, doc):
|
||||||
|
|
||||||
sender, title = self._parse_file_name(pdf)
|
sender, title, file_type = self._parse_file_name(doc)
|
||||||
relevant_tags = [t for t in Tag.objects.all() if t.matches(text.lower())]
|
|
||||||
|
|
||||||
stats = os.stat(pdf)
|
lower_text = text.lower()
|
||||||
|
relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]
|
||||||
|
|
||||||
|
stats = os.stat(doc)
|
||||||
|
|
||||||
self._render(" Saving record to database", 2)
|
self._render(" Saving record to database", 2)
|
||||||
|
|
||||||
doc = Document.objects.create(
|
document = Document.objects.create(
|
||||||
sender=sender,
|
sender=sender,
|
||||||
title=title,
|
title=title,
|
||||||
content=text,
|
content=text,
|
||||||
|
file_type=file_type,
|
||||||
created=timezone.make_aware(
|
created=timezone.make_aware(
|
||||||
datetime.datetime.fromtimestamp(stats.st_mtime)),
|
datetime.datetime.fromtimestamp(stats.st_mtime)),
|
||||||
modified=timezone.make_aware(
|
modified=timezone.make_aware(
|
||||||
@ -225,38 +230,38 @@ class Command(BaseCommand):
|
|||||||
if relevant_tags:
|
if relevant_tags:
|
||||||
tag_names = ", ".join([t.slug for t in relevant_tags])
|
tag_names = ", ".join([t.slug for t in relevant_tags])
|
||||||
self._render(" Tagging with {}".format(tag_names), 2)
|
self._render(" Tagging with {}".format(tag_names), 2)
|
||||||
doc.tags.add(*relevant_tags)
|
document.tags.add(*relevant_tags)
|
||||||
|
|
||||||
with open(pdf, "rb") as unencrypted:
|
with open(doc, "rb") as unencrypted:
|
||||||
with open(doc.pdf_path, "wb") as encrypted:
|
with open(document.source_path, "wb") as encrypted:
|
||||||
self._render(" Encrypting", 3)
|
self._render(" Encrypting", 3)
|
||||||
encrypted.write(GnuPG.encrypted(unencrypted))
|
encrypted.write(GnuPG.encrypted(unencrypted))
|
||||||
|
|
||||||
def _parse_file_name(self, pdf):
|
def _parse_file_name(self, doc):
|
||||||
"""
|
"""
|
||||||
We use a crude naming convention to make handling the sender and title
|
We use a crude naming convention to make handling the sender and title
|
||||||
easier:
|
easier:
|
||||||
"sender - title.pdf"
|
"<sender> - <title>.<suffix>"
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# First we attempt "sender - title.pdf"
|
# First we attempt "<sender> - <title>.<suffix>"
|
||||||
m = re.match(self.PARSER_REGEX_SENDER_TITLE, pdf)
|
m = re.match(self.PARSER_REGEX_SENDER_TITLE, doc)
|
||||||
if m:
|
if m:
|
||||||
sender_name, title = m.group(1), m.group(2)
|
sender_name, title, file_type = m.group(1), m.group(2), m.group(3)
|
||||||
sender, __ = Sender.objects.get_or_create(
|
sender, __ = Sender.objects.get_or_create(
|
||||||
name=sender_name, defaults={"slug": slugify(sender_name)})
|
name=sender_name, defaults={"slug": slugify(sender_name)})
|
||||||
return sender, title
|
return sender, title, file_type
|
||||||
|
|
||||||
# That didn't work, so we assume sender is None
|
# That didn't work, so we assume sender is None
|
||||||
m = re.match(self.PARSER_REGEX_TITLE, pdf)
|
m = re.match(self.PARSER_REGEX_TITLE, doc)
|
||||||
return None, m.group(1)
|
return None, m.group(1), m.group(2)
|
||||||
|
|
||||||
def _cleanup(self, pngs, pdf):
|
def _cleanup(self, pngs, doc):
|
||||||
|
|
||||||
png_glob = os.path.join(
|
png_glob = os.path.join(
|
||||||
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
|
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
|
||||||
|
|
||||||
for f in list(glob.glob(png_glob)) + [pdf]:
|
for f in list(glob.glob(png_glob)) + [doc]:
|
||||||
self._render(" Deleting {}".format(f), 2)
|
self._render(" Deleting {}".format(f), 2)
|
||||||
os.unlink(f)
|
os.unlink(f)
|
||||||
|
|
||||||
|
21
src/documents/migrations/0008_document_file_type.py
Normal file
21
src/documents/migrations/0008_document_file_type.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Generated by Django 1.9 on 2016-01-29 22:58
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Add the non-editable `file_type` column to the `document` model."""

    dependencies = [
        ("documents", "0007_auto_20160126_2114"),
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="file_type",
            field=models.CharField(
                choices=[
                    ("pdf", "PDF"),
                    ("png", "PNG"),
                    ("jpg", "JPG"),
                    ("gif", "GIF"),
                    ("tiff", "TIFF"),
                ],
                # With preserve_default=False below, this default is only
                # meant for this migration and is not kept on the field.
                default="pdf",
                editable=False,
                max_length=4,
            ),
            preserve_default=False,
        ),
    ]
|
@ -111,10 +111,22 @@ class Tag(SluggedModel):
|
|||||||
|
|
||||||
class Document(models.Model):
|
class Document(models.Model):
|
||||||
|
|
||||||
|
TYPE_PDF = "pdf"
|
||||||
|
TYPE_PNG = "png"
|
||||||
|
TYPE_JPG = "jpg"
|
||||||
|
TYPE_GIF = "gif"
|
||||||
|
TYPE_TIF = "tiff"
|
||||||
|
TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)
|
||||||
|
|
||||||
sender = models.ForeignKey(
|
sender = models.ForeignKey(
|
||||||
Sender, blank=True, null=True, related_name="documents")
|
Sender, blank=True, null=True, related_name="documents")
|
||||||
title = models.CharField(max_length=128, blank=True, db_index=True)
|
title = models.CharField(max_length=128, blank=True, db_index=True)
|
||||||
content = models.TextField(db_index=True)
|
content = models.TextField(db_index=True)
|
||||||
|
file_type = models.CharField(
|
||||||
|
max_length=4,
|
||||||
|
editable=False,
|
||||||
|
choices=tuple([(t, t.upper()) for t in TYPES])
|
||||||
|
)
|
||||||
tags = models.ManyToManyField(Tag, related_name="documents")
|
tags = models.ManyToManyField(Tag, related_name="documents")
|
||||||
created = models.DateTimeField(default=timezone.now, editable=False)
|
created = models.DateTimeField(default=timezone.now, editable=False)
|
||||||
modified = models.DateTimeField(auto_now=True, editable=False)
|
modified = models.DateTimeField(auto_now=True, editable=False)
|
||||||
@ -131,20 +143,19 @@ class Document(models.Model):
|
|||||||
return str(created)
|
return str(created)
|
||||||
|
|
||||||
@property
def source_path(self):
    """
    Path of this document's stored file under MEDIA_ROOT/documents.

    The file name is the zero-padded (7 digit) primary key, followed by the
    file type and a ".gpg" suffix.
    """
    file_name = "{:07}.{}.gpg".format(self.pk, self.file_type)
    return os.path.join(settings.MEDIA_ROOT, "documents", file_name)
|
||||||
|
|
||||||
@property
def source_file(self):
    """
    Open the file at `source_path` for binary reading and return the handle.

    A new handle is opened on every access; closing it is left to the
    caller.
    """
    stored_path = self.source_path
    return open(stored_path, "rb")
|
||||||
|
|
||||||
@property
def parseable_file_name(self):
    """
    File name for this document in the "<sender> - <title>.<suffix>" form.

    When either the sender or the title is missing, the base name of
    `source_path` is returned instead.
    """
    if self.sender and self.title:
        # The attribute is `file_type`; `file_types` does not exist and
        # raised AttributeError for every document with sender and title.
        return "{} - {}.{}".format(self.sender, self.title, self.file_type)
    return os.path.basename(self.source_path)
|
||||||
|
@ -16,9 +16,19 @@ class PdfView(DetailView):
|
|||||||
Override the default to return the unencrypted PDF as raw data.
|
Override the default to return the unencrypted PDF as raw data.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
content_types = {
|
||||||
|
Document.TYPE_PDF: "application/pdf",
|
||||||
|
Document.TYPE_PNG: "image/png",
|
||||||
|
Document.TYPE_JPG: "image/jpeg",
|
||||||
|
Document.TYPE_GIF: "image/gif",
|
||||||
|
Document.TYPE_TIF: "image/tiff",
|
||||||
|
}
|
||||||
|
|
||||||
response = HttpResponse(
|
response = HttpResponse(
|
||||||
GnuPG.decrypted(self.object.pdf), content_type="application/pdf")
|
GnuPG.decrypted(self.object.source_file),
|
||||||
|
content_type=content_types[self.object.file_type]
|
||||||
|
)
|
||||||
response["Content-Disposition"] = 'attachment; filename="{}"'.format(
|
response["Content-Disposition"] = 'attachment; filename="{}"'.format(
|
||||||
slugify(str(self.object)) + ".pdf")
|
slugify(str(self.object)) + "." + self.object.file_type)
|
||||||
|
|
||||||
return response
|
return response
|
||||||
|
Loading…
x
Reference in New Issue
Block a user