Image imports and consumption by mail work

This commit is contained in:
Daniel Quinn 2016-02-06 17:05:36 +00:00
parent 71075a691a
commit 48761911b3
8 changed files with 343 additions and 286 deletions

View File

@ -16,15 +16,27 @@ from django.template.defaultfilters import slugify
from paperless.db import GnuPG
from ..models import Sender, Tag, Document
from ..languages import ISO639
from .models import Sender, Tag, Document
from .languages import ISO639
class OCRError(Exception):
    """Signals that a document could not be OCR'd."""
class ConsumerError(Exception):
    """Signals that the consumer cannot run, e.g. a bad CONSUMPTION_DIR."""
class Consumer(object):
"""
Loop over every file found in CONSUMPTION_DIR and:
1. Convert it to a greyscale png
2. Use tesseract on the png
3. Encrypt and store the document in the MEDIA_ROOT
4. Store the OCR'd text in the database
5. Delete the document and image(s)
"""
SCRATCH = settings.SCRATCH_DIR
CONVERT = settings.CONVERT_BINARY
@ -34,15 +46,15 @@ class Consumer(object):
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
REGEX_TITLE = re.compile(
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
r"^.*/([^/]*)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
REGEX_SENDER_TITLE = re.compile(
r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)",
r"^[^/]*/(.+) - ([^/]+)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
REGEX_SENDER_TITLE_TAGS = re.compile(
r"^.*/(.*) - (.*) - ([a-z\-,])\.(pdf|jpe?g|png|gif|tiff)",
r"^.*/([^/]+) - ([^/]+) - ([a-z\-,]+)\.(pdf|jpe?g|png|gif|tiff)$",
flags=re.IGNORECASE
)
@ -55,6 +67,51 @@ class Consumer(object):
except FileExistsError:
pass
self.stats = {}
self._ignore = []
if not self.CONSUME:
raise ConsumerError(
"The CONSUMPTION_DIR settings variable does not appear to be "
"set."
)
if not os.path.exists(self.CONSUME):
raise ConsumerError(
"Consumption directory {} does not exist".format(self.CONSUME))
def consume(self):
    """
    Make one pass over the consumption directory and consume every
    document that is ready.

    A directory entry is processed only if it is a regular file, its path
    matches ``REGEX_TITLE``, it has not previously failed OCR, and
    ``_is_ready`` reports that it is no longer being written to.  Each
    accepted document is converted to greyscale images, OCR'd, stored and
    then cleaned up.  Documents whose OCR raises ``OCRError`` are remembered
    in ``self._ignore`` so they are not retried on later passes.
    """
    for name in os.listdir(self.CONSUME):

        doc = os.path.join(self.CONSUME, name)

        if not os.path.isfile(doc):
            continue

        if not re.match(self.REGEX_TITLE, doc):
            continue

        if doc in self._ignore:
            continue

        # Skip files that are NOT ready yet (still being uploaded).  The
        # previous code skipped the files that *were* ready, which meant
        # half-written files were consumed on first sight.
        if not self._is_ready(doc):
            continue

        self._render("Consuming {}".format(doc), 1)

        pngs = self._get_greyscale(doc)

        try:
            text = self._get_ocr(pngs)
        except OCRError:
            # Don't try this file again on the next pass
            self._ignore.append(doc)
            self._render("OCR FAILURE: {}".format(doc), 0)
            continue

        self._store(text, doc)
        self._cleanup(pngs, doc)
def _get_greyscale(self, doc):
self._render(" Generating greyscale image", 2)
@ -69,17 +126,27 @@ class Consumer(object):
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
def _guess_language(self, text):
try:
guess = langdetect.detect(text)
self._render(" Language detected: {}".format(guess), 2)
return guess
except Exception:
return None
def _get_ocr(self, pngs):
"""
Attempts to do the best job possible OCR'ing the document based on
simple language detection trial & error.
"""
self._render(" OCRing the document", 2)
raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
guessed_language = langdetect.detect(raw_text)
guessed_language = self._guess_language(raw_text)
self._render(" Language detected: {}".format(guessed_language), 2)
if guessed_language not in ISO639:
if not guessed_language or guessed_language not in ISO639:
self._render("Language detection failed!", 0)
if settings.FORGIVING_OCR:
self._render(
@ -108,6 +175,9 @@ class Consumer(object):
raise OCRError
def _ocr(self, pngs, lang):
"""
Performs a single OCR attempt.
"""
self._render(" Parsing for {}".format(lang), 2)
@ -161,10 +231,11 @@ class Consumer(object):
def _store(self, text, doc):
sender, title, file_type = self._guess_attributes_from_name(doc)
sender, title, tags, file_type = self._guess_attributes_from_name(doc)
lower_text = text.lower()
relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]
relevant_tags = set(
[t for t in Tag.objects.all() if t.matches(lower_text)] + tags)
stats = os.stat(doc)
@ -205,3 +276,19 @@ class Consumer(object):
def _render(self, text, verbosity):
if self.verbosity >= verbosity:
print(text)
def _is_ready(self, doc):
"""
Detect whether `doc` is ready to consume or if it's still being written
to by the uploader.
"""
t = os.stat(doc).st_mtime
if self.stats.get(doc) == t:
del(self.stats[doc])
return True
self.stats[doc] = t
return False

View File

@ -1,3 +0,0 @@
from .base import Consumer
from .file import FileConsumer, FileConsumerError
from .mail import MailConsumer, MailConsumerError

View File

@ -1,76 +0,0 @@
import os
import re
from .base import Consumer, OCRError
class FileConsumerError(Exception):
    """Signals that the file consumer cannot run, e.g. a bad CONSUMPTION_DIR."""
class FileConsumer(Consumer):
    """
    Consumer that picks documents up from the consumption directory.
    """

    def __init__(self, *args, **kwargs):
        """
        Set up per-run bookkeeping and validate the consumption directory.

        Raises FileConsumerError if ``CONSUME`` is unset or does not exist.
        """

        Consumer.__init__(self, *args, **kwargs)

        # Path -> modification time seen on the previous pass (see _is_ready)
        self.stats = {}

        # Paths whose OCR failed; they are skipped on later passes
        self._ignore = []

        if not self.CONSUME:
            raise FileConsumerError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set."
            )

        if not os.path.exists(self.CONSUME):
            raise FileConsumerError(
                "Consumption directory {} does not exist".format(self.CONSUME))

    def consume(self):
        """
        Make one pass over the consumption directory and consume every
        document that is ready.

        Entries are skipped unless they are regular files matching
        ``REGEX_TITLE``, have not failed OCR before, and are reported as
        stable by ``_is_ready``.
        """

        for name in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, name)

            if not os.path.isfile(doc):
                continue

            if not re.match(self.REGEX_TITLE, doc):
                continue

            if doc in self._ignore:
                continue

            # Skip files that are NOT ready yet.  The check used to be
            # inverted, so files still being uploaded were consumed at once.
            if not self._is_ready(doc):
                continue

            self._render("Consuming {}".format(doc), 1)

            pngs = self._get_greyscale(doc)

            try:
                text = self._get_ocr(pngs)
            except OCRError:
                self._ignore.append(doc)
                self._render("OCR FAILURE: {}".format(doc), 0)
                continue

            self._store(text, doc)
            self._cleanup(pngs, doc)

    def _is_ready(self, doc):
        """
        Detect whether `doc` is ready to consume or if it's still being written
        to by the uploader.

        Returns True only when the file's modification time equals the one
        recorded by the previous call for the same path.
        """

        t = os.stat(doc).st_mtime

        if self.stats.get(doc) == t:
            del self.stats[doc]
            return True

        self.stats[doc] = t

        return False

View File

@ -1,170 +0,0 @@
import datetime
import email
import imaplib
import os
import re
from base64 import b64decode
from django.conf import settings
from . import Consumer
class MailConsumerError(Exception):
    """Signals a failure while fetching or interpreting mail."""
class Message(object):
    """
    A crude, but simple email message class.  We assume that there's a subject
    and exactly one attachment, and that we don't care about the message body.
    """

    SAFE_SUBJECT_REGEX = re.compile(r"^[\w\- ,.]+$")

    # The alternation is wrapped in a group so that ^ and $ apply to both
    # branches; group 1 or 2 holds the bare file suffix.
    SAFE_SUFFIX_REGEX = re.compile(
        r"^(?:application/(pdf)|image/(png|jpe?g|gif|tiff))$")

    def __init__(self, subject, attachment):
        """
        :param subject: the mail's subject line
        :param attachment: an object exposing ``content_type``
        :raises MailConsumerError: if the attachment's content type is not
            one of the supported document/image types
        """

        self.subject = subject
        self.attachment = attachment
        self.suffix = None

        m = self.SAFE_SUFFIX_REGEX.match(attachment.content_type)
        if not m:
            raise MailConsumerError(
                "Not-awesome file type: {}".format(attachment.content_type))

        # Use the sub-type ("pdf", "png", ...) rather than the full MIME type,
        # which contains a "/" and is useless as a file suffix.
        self.suffix = m.group(1) or m.group(2)

    @property
    def file_name(self):
        """
        File name built from the subject and suffix, or None when the subject
        is missing or contains characters outside SAFE_SUBJECT_REGEX.
        """
        if self.subject and self.SAFE_SUBJECT_REGEX.match(self.subject):
            return "{}.{}".format(self.subject, self.suffix)
        return None
class Attachment(object):
    """Plain holder for an attachment's bytes and its optional metadata."""

    def __init__(self, data):
        # Raw attachment content as handed in by the caller
        self.data = data

        # Metadata starts out unknown; callers fill it in afterwards
        self.content_type = self.size = self.name = None
        self.created = self.modified = None
class MailFetcher(object):
    """
    Fetches mail over IMAP and writes each attachment into the consumption
    directory as a file.
    """

    def __init__(self):

        self._connection = None
        self._host = settings.MAIL_CONSUMPTION["HOST"]
        self._port = settings.MAIL_CONSUMPTION["PORT"]
        self._username = settings.MAIL_CONSUMPTION["USERNAME"]
        self._password = settings.MAIL_CONSUMPTION["PASSWORD"]
        self._inbox = settings.MAIL_CONSUMPTION["INBOX"]

        # Nothing is fetched when no host is configured
        self._enabled = bool(self._host)

        self.last_checked = datetime.datetime.now()

    def _connect(self):
        """Open an SSL IMAP connection to the configured host and port."""
        self._connection = imaplib.IMAP4_SSL(self._host, self._port)

    def _login(self):
        """
        Authenticate and select the configured mailbox.

        :raises MailConsumerError: if the login or the mailbox selection is
            not answered with "OK"
        """

        login = self._connection.login(self._username, self._password)
        if login[0] != "OK":
            raise MailConsumerError("Can't log into mail: {}".format(login[1]))

        # Honour the configured mailbox; it used to be read from the settings
        # and then ignored in favour of a hard-coded "INBOX".
        inbox = self._connection.select(self._inbox or "INBOX")
        if inbox[0] != "OK":
            raise MailConsumerError("Can't find the inbox: {}".format(inbox[1]))

    def _fetch(self):
        """Yield the raw RFC822 content of every message in the mailbox."""
        for num in self._connection.search(None, "ALL")[1][0].split():
            typ, data = self._connection.fetch(num, "(RFC822)")
            # self._connection.store(num, "+FLAGS", "\\Deleted")
            yield data[0][1]

    @staticmethod
    def _to_timestamp(value):
        """
        Convert an RFC 2822 date string into seconds since the epoch, falling
        back to the current time when `value` is empty or unparsable.
        """
        from email.utils import parsedate_to_datetime

        if value:
            try:
                return parsedate_to_datetime(value).timestamp()
            except (TypeError, ValueError):
                pass  # Unparsable date: "now" is good enough
        return datetime.datetime.now().timestamp()

    def consume(self):
        """
        We don't actually consume here 'cause it's much easier to do that with
        files and we already have a FileConsumer.  So instead, we simply write
        the attachment to the consumption directory as a file with the proper
        format so the FileConsumer can do its job.
        """

        if self._enabled:

            for message in self.get_messages():

                name = message.file_name
                if not name:
                    continue  # Subject isn't safe to use as a file name

                # os.utime() needs numbers, not the header's date string
                t = self._to_timestamp(
                    message.attachment.created or message.attachment.modified)

                file_name = os.path.join(Consumer.CONSUME, name)
                with open(file_name, "wb") as f:
                    f.write(message.attachment.data)
                os.utime(file_name, times=(t, t))

        self.last_checked = datetime.datetime.now()

    def get_messages(self):
        """
        Connect, log in and return a flat list of Message objects, one per
        attachment found.  The connection is logged out even if fetching or
        parsing fails.
        """

        self._connect()
        try:
            self._login()

            messages = []
            for data in self._fetch():
                # _parse_message returns a list, so extend rather than append
                messages.extend(self._parse_message(data))

            self._connection.expunge()
            self._connection.close()
        finally:
            self._connection.logout()

        return messages

    @staticmethod
    def _parse_message(data):
        """
        Cribbed heavily from
        https://www.ianlewis.org/en/parsing-email-attachments-python

        :param data: the raw message, as bytes or str
        :returns: a list with one Message per attachment part
        """

        r = []

        # imaplib hands back bytes; message_from_string would choke on them
        if isinstance(data, bytes):
            message = email.message_from_bytes(data)
        else:
            message = email.message_from_string(data)

        for part in message.walk():

            content_disposition = part.get("Content-Disposition")
            if not content_disposition:
                continue

            dispositions = content_disposition.strip().split(";")
            if dispositions[0].strip().lower() != "attachment":
                continue

            file_data = part.get_payload()
            attachment = Attachment(b64decode(file_data))
            attachment.content_type = part.get_content_type()
            attachment.size = len(file_data)

            for param in dispositions[1:]:

                # partition() copes with "=" inside the value and with
                # parameters that have no value at all
                name, sep, value = param.partition("=")
                if not sep:
                    continue

                # Parameters follow "; " and values are usually quoted
                name = name.strip().lower()
                value = value.strip().strip('"')

                if name == "filename":
                    attachment.name = value
                elif name == "create-date":
                    attachment.created = value
                elif name == "modification-date":
                    attachment.modified = value

            r.append(Message(message.get("Subject"), attachment))

        return r

208
src/documents/mail.py Normal file
View File

@ -0,0 +1,208 @@
import datetime
import email
import imaplib
import os
import random
import re
import time
from base64 import b64decode
from dateutil import parser
from django.conf import settings
from .consumer import Consumer
class MailFetcherError(Exception):
    """Signals a failure while talking to the mail server or reading mail."""
class InvalidMessageError(Exception):
    """Signals that a fetched email cannot be turned into a document."""
class Message(object):
    """
    A crude, but simple email message class.  We assume that there's a subject
    and exactly one attachment, and that we don't care about the message body.
    """

    # This regex is probably more restrictive than it needs to be, but it's
    # better safe than sorry.
    SAFE_SUBJECT_REGEX = re.compile(r"^[\w\- ,.']+$")

    def _set_time(self, message):
        """
        Set ``self.time`` from the message's Date header, falling back to the
        current time when the header is missing or cannot be parsed.
        """

        self.time = datetime.datetime.now()

        message_time = message.get("Date")
        if message_time:
            try:
                self.time = parser.parse(message_time)
            except (ValueError, AttributeError, OverflowError):
                pass  # We assume that "now" is ok

    def __init__(self, data):
        """
        Cribbed heavily from
        https://www.ianlewis.org/en/parsing-email-attachments-python

        :param data: the raw RFC822 message as bytes
        :raises InvalidMessageError: if the subject is missing or unsafe, if
            there is not exactly one attachment, or if the attachment cannot
            be decoded or has an unsupported content type
        """

        self.subject = None
        self.time = None
        self.attachment = None

        message = email.message_from_bytes(data)
        self.subject = message.get("Subject")

        self._set_time(message)

        if self.subject is None:
            raise InvalidMessageError("Message does not have a subject")
        if not self.SAFE_SUBJECT_REGEX.match(self.subject):
            raise InvalidMessageError("Message subject is unsafe")

        print('Fetching email: "{}"'.format(self.subject))

        attachments = []
        for part in message.walk():

            content_disposition = part.get("Content-Disposition")
            if not content_disposition:
                continue

            dispositions = content_disposition.strip().split(";")
            if dispositions[0].strip().lower() != "attachment":
                continue

            # decode=True honours the part's Content-Transfer-Encoding instead
            # of assuming every attachment is base64.  It yields None for
            # multipart parts, which cannot be stored as a single file.
            payload = part.get_payload(decode=True)
            if payload is None:
                raise InvalidMessageError(
                    "Attachment payload could not be decoded")

            try:
                attachments.append(
                    Attachment(payload, content_type=part.get_content_type()))
            except MailFetcherError as e:
                # An unsupported attachment makes this one message invalid;
                # it must not abort the whole fetch run.
                raise InvalidMessageError(str(e)) from e

        if not attachments:
            raise InvalidMessageError(
                "There don't appear to be any attachments to this message")

        if len(attachments) > 1:
            raise InvalidMessageError(
                "There's more than one attachment to this message. It cannot "
                "be indexed automatically."
            )

        self.attachment = attachments[0]

    def __bool__(self):
        """A message is truthy once it carries an attachment."""
        return bool(self.attachment)

    @property
    def file_name(self):
        """
        The subject plus the attachment's suffix, or a random six-digit
        prefix when the subject does not match SAFE_SUBJECT_REGEX.
        """

        prefix = str(random.randint(100000, 999999))
        if self.SAFE_SUBJECT_REGEX.match(self.subject):
            prefix = self.subject

        return "{}.{}".format(prefix, self.attachment.suffix)
class Attachment(object):
    """
    A single mail attachment: its raw bytes, its MIME type, and the file
    suffix derived from that type.
    """

    # The alternation is wrapped in a group so that ^ and $ apply to both
    # branches.  Without it, anything merely *starting* with
    # "application/pdf" was accepted.  Group 1 or 2 holds the suffix.
    SAFE_SUFFIX_REGEX = re.compile(
        r"^(?:application/(pdf)|image/(png|jpeg|gif|tiff))$")

    def __init__(self, data, content_type):
        """
        :param data: the attachment's content
        :param content_type: the attachment's MIME type, e.g. "image/png"
        :raises MailFetcherError: if `content_type` is not a supported type
        """

        self.content_type = content_type
        self.data = data
        self.suffix = None

        m = self.SAFE_SUFFIX_REGEX.match(self.content_type)
        if not m:
            raise MailFetcherError(
                "Not-awesome file type: {}".format(self.content_type))
        self.suffix = m.group(1) or m.group(2)

    def read(self):
        """Return the attachment's content."""
        return self.data
class MailFetcher(object):
    """
    Pulls mail from an IMAP mailbox and drops each message's attachment into
    the consumption directory.
    """

    def __init__(self):

        self._connection = None
        self._host = settings.MAIL_CONSUMPTION["HOST"]
        self._port = settings.MAIL_CONSUMPTION["PORT"]
        self._username = settings.MAIL_CONSUMPTION["USERNAME"]
        self._password = settings.MAIL_CONSUMPTION["PASSWORD"]
        self._inbox = settings.MAIL_CONSUMPTION["INBOX"]

        # Nothing is fetched when no host is configured
        self._enabled = bool(self._host)

        self.last_checked = datetime.datetime.now()

    def pull(self):
        """
        Fetch all available mail at the target address and store it locally in
        the consumption directory so that the file consumer can pick it up and
        do its thing.
        """

        if self._enabled:

            for message in self._get_messages():

                print("Storing email: \"{}\"".format(message.subject))

                # timestamp() is correct for both naive and timezone-aware
                # datetimes; mktime(timetuple()) treats an aware datetime's
                # wall-clock fields as local time and shifts the result.
                t = int(message.time.timestamp())
                file_name = os.path.join(Consumer.CONSUME, message.file_name)
                with open(file_name, "wb") as f:
                    f.write(message.attachment.data)
                os.utime(file_name, times=(t, t))

        self.last_checked = datetime.datetime.now()

    def _get_messages(self):
        """
        Connect, log in and return the list of valid messages in the mailbox.
        The connection is logged out even if login or fetching fails.
        """

        self._connect()
        try:
            self._login()

            r = [message for message in self._fetch() if message]

            self._connection.expunge()
            self._connection.close()
        finally:
            self._connection.logout()

        return r

    def _connect(self):
        """Open an SSL IMAP connection to the configured host and port."""
        self._connection = imaplib.IMAP4_SSL(self._host, self._port)

    def _login(self):
        """
        Authenticate and select the configured mailbox.

        :raises MailFetcherError: if the login or the mailbox selection is
            not answered with "OK"
        """

        login = self._connection.login(self._username, self._password)
        if login[0] != "OK":
            raise MailFetcherError("Can't log into mail: {}".format(login[1]))

        # Honour the configured mailbox; it used to be read from the settings
        # and then ignored in favour of a hard-coded "INBOX".
        inbox = self._connection.select(self._inbox or "INBOX")
        if inbox[0] != "OK":
            raise MailFetcherError("Can't find the inbox: {}".format(inbox[1]))

    def _fetch(self):
        """
        Yield a Message for every mail that parses cleanly.  Every mail, valid
        or not, is flagged as deleted on the server.
        """

        for num in self._connection.search(None, "ALL")[1][0].split():

            __, data = self._connection.fetch(num, "(RFC822)")

            message = None
            try:
                message = Message(data[0][1])
            except (InvalidMessageError, MailFetcherError) as e:
                # One bad message (including an unsupported attachment type)
                # must not stop the remaining mail from being fetched.
                print(e)

            self._connection.store(num, "+FLAGS", "\\Deleted")

            if message:
                yield message

View File

@ -5,18 +5,14 @@ import time
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from ...consumers import (
FileConsumer, FileConsumerError, MailConsumer, MailConsumerError)
from ...consumer import Consumer, ConsumerError
from ...mail import MailFetcher, MailFetcherError
class Command(BaseCommand):
"""
Loop over every file found in CONSUMPTION_DIR and:
1. Convert it to a greyscale png
2. Use tesseract on the png
3. Encrypt and store the document in the MEDIA_ROOT
4. Store the OCR'd text in the database
5. Delete the document and image(s)
On every iteration of an infinite loop, consume what we can from the
consumption directory, and fetch any mail available.
"""
LOOP_TIME = 10 # Seconds
@ -29,7 +25,7 @@ class Command(BaseCommand):
self.verbosity = 0
self.file_consumer = None
self.mail_consumer = None
self.mail_fetcher = None
BaseCommand.__init__(self, *args, **kwargs)
@ -38,9 +34,9 @@ class Command(BaseCommand):
self.verbosity = options["verbosity"]
try:
self.file_consumer = FileConsumer(verbosity=self.verbosity)
self.mail_consumer = MailConsumer(verbosity=self.verbosity)
except (FileConsumerError, MailConsumerError) as e:
self.file_consumer = Consumer(verbosity=self.verbosity)
self.mail_fetcher = MailFetcher()
except (ConsumerError, MailFetcherError) as e:
raise CommandError(e)
try:
@ -59,11 +55,13 @@ class Command(BaseCommand):
def loop(self):
# Consume whatever files we can
self.file_consumer.consume()
delta = self.mail_consumer.last_checked + self.MAIL_DELTA
# Occasionally fetch mail and store it to be consumed on the next loop
delta = self.mail_fetcher.last_checked + self.MAIL_DELTA
if delta > datetime.datetime.now():
self.mail_consumer.consume()
self.mail_fetcher.pull()
def _render(self, text, verbosity):
if self.verbosity >= verbosity:

View File

@ -1,3 +1,4 @@
import base64
import os
import magic
@ -6,10 +7,10 @@ from hashlib import md5
from django.conf import settings
from django.test import TestCase
from ...consumers.mail import MailConsumer
from ...mail import Message, Attachment
class TestMailConsumer(TestCase):
class TestMessage(TestCase):
def __init__(self, *args, **kwargs):
@ -23,21 +24,33 @@ class TestMailConsumer(TestCase):
"mail.txt"
)
def test_parse(self):
consumer = MailConsumer()
with open(self.sample) as f:
def test_init(self):
messages = consumer._parse_message(f.read())
with open(self.sample, "rb") as f:
self.assertTrue(len(messages), 1)
self.assertEqual(messages[0]["subject"], "Test 0")
message = Message(f.read())
attachment = messages[0]["attachment"]
data = attachment.read()
self.assertTrue(message)
self.assertEqual(message.subject, "Test 0")
data = message.attachment.read()
self.assertEqual(
md5(data).hexdigest(), "7c89655f9e9eb7dd8cde8568e8115d59")
self.assertEqual(attachment.content_type, "application/pdf")
self.assertEqual(
message.attachment.content_type, "application/pdf")
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
self.assertEqual(m.id_buffer(data), "application/pdf")
class TestAttachment(TestCase):
    """Checks Attachment's suffix derivation and that read() returns the data."""

    def test_init(self):
        data = base64.encodebytes(b"0")

        # (content type, expected suffix), checked in this order
        expectations = (
            ("application/pdf", "pdf"),
            ("image/png", "png"),
            ("image/jpeg", "jpeg"),
            ("image/gif", "gif"),
            ("image/tiff", "tiff"),
        )
        for content_type, suffix in expectations:
            self.assertEqual(Attachment(data, content_type).suffix, suffix)

        self.assertEqual(Attachment(data, "image/png").read(), data)

View File

@ -1 +1 @@
from .consumers.mail import TestMailConsumer
from .consumers.mail import TestMailFetcher