diff --git a/src/documents/consumers/base.py b/src/documents/consumer.py similarity index 66% rename from src/documents/consumers/base.py rename to src/documents/consumer.py index ec07dfc72..3294c4792 100644 --- a/src/documents/consumers/base.py +++ b/src/documents/consumer.py @@ -16,15 +16,27 @@ from django.template.defaultfilters import slugify from paperless.db import GnuPG -from ..models import Sender, Tag, Document -from ..languages import ISO639 +from .models import Sender, Tag, Document +from .languages import ISO639 class OCRError(Exception): pass +class ConsumerError(Exception): + pass + + class Consumer(object): + """ + Loop over every file found in CONSUMPTION_DIR and: + 1. Convert it to a greyscale png + 2. Use tesseract on the png + 3. Encrypt and store the document in the MEDIA_ROOT + 4. Store the OCR'd text in the database + 5. Delete the document and image(s) + """ SCRATCH = settings.SCRATCH_DIR CONVERT = settings.CONVERT_BINARY @@ -34,15 +46,15 @@ class Consumer(object): DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE REGEX_TITLE = re.compile( - r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", + r"^.*/([^/]*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE ) REGEX_SENDER_TITLE = re.compile( - r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)", + r"^[^/]*/(.+) - ([^/]+)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE ) REGEX_SENDER_TITLE_TAGS = re.compile( - r"^.*/(.*) - (.*) - ([a-z\-,])\.(pdf|jpe?g|png|gif|tiff)", + r"^.*/([^/]+) - ([^/]+) - ([a-z\-,]+)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE ) @@ -55,6 +67,51 @@ class Consumer(object): except FileExistsError: pass + self.stats = {} + self._ignore = [] + + if not self.CONSUME: + raise ConsumerError( + "The CONSUMPTION_DIR settings variable does not appear to be " + "set." 
+ ) + + if not os.path.exists(self.CONSUME): + raise ConsumerError( + "Consumption directory {} does not exist".format(self.CONSUME)) + + def consume(self): + + for doc in os.listdir(self.CONSUME): + + doc = os.path.join(self.CONSUME, doc) + + if not os.path.isfile(doc): + continue + + if not re.match(self.REGEX_TITLE, doc): + continue + + if doc in self._ignore: + continue + + if not self._is_ready(doc): + continue + + self._render("Consuming {}".format(doc), 1) + + pngs = self._get_greyscale(doc) + + try: + text = self._get_ocr(pngs) + except OCRError: + self._ignore.append(doc) + self._render("OCR FAILURE: {}".format(doc), 0) + continue + + self._store(text, doc) + self._cleanup(pngs, doc) + def _get_greyscale(self, doc): self._render(" Generating greyscale image", 2) @@ -69,17 +126,27 @@ class Consumer(object): return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) + def _guess_language(self, text): + try: + guess = langdetect.detect(text) + self._render(" Language detected: {}".format(guess), 2) + return guess + except Exception: + return None + def _get_ocr(self, pngs): + """ + Attempts to do the best job possible OCR'ing the document based on + simple language detection trial & error. + """ self._render(" OCRing the document", 2) raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE) - guessed_language = langdetect.detect(raw_text) + guessed_language = self._guess_language(raw_text) - self._render(" Language detected: {}".format(guessed_language), 2) - - if guessed_language not in ISO639: + if not guessed_language or guessed_language not in ISO639: self._render("Language detection failed!", 0) if settings.FORGIVING_OCR: self._render( @@ -108,6 +175,9 @@ class Consumer(object): raise OCRError def _ocr(self, pngs, lang): + """ + Performs a single OCR attempt.
+ """ self._render(" Parsing for {}".format(lang), 2) @@ -161,10 +231,11 @@ class Consumer(object): def _store(self, text, doc): - sender, title, file_type = self._guess_attributes_from_name(doc) + sender, title, tags, file_type = self._guess_attributes_from_name(doc) lower_text = text.lower() - relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)] + relevant_tags = set( + [t for t in Tag.objects.all() if t.matches(lower_text)] + tags) stats = os.stat(doc) @@ -205,3 +276,19 @@ class Consumer(object): def _render(self, text, verbosity): if self.verbosity >= verbosity: print(text) + + def _is_ready(self, doc): + """ + Detect whether `doc` is ready to consume or if it's still being written + to by the uploader. + """ + + t = os.stat(doc).st_mtime + + if self.stats.get(doc) == t: + del(self.stats[doc]) + return True + + self.stats[doc] = t + + return False diff --git a/src/documents/consumers/__init__.py b/src/documents/consumers/__init__.py deleted file mode 100644 index d54da1d91..000000000 --- a/src/documents/consumers/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .base import Consumer -from .file import FileConsumer, FileConsumerError -from .mail import MailConsumer, MailConsumerError diff --git a/src/documents/consumers/file.py b/src/documents/consumers/file.py deleted file mode 100644 index b889831bb..000000000 --- a/src/documents/consumers/file.py +++ /dev/null @@ -1,76 +0,0 @@ -import os -import re - -from .base import Consumer, OCRError - - -class FileConsumerError(Exception): - pass - - -class FileConsumer(Consumer): - - def __init__(self, *args, **kwargs): - - Consumer.__init__(self, *args, **kwargs) - - self.stats = {} - self._ignore = [] - - if not self.CONSUME: - raise FileConsumerError( - "The CONSUMPTION_DIR settings variable does not appear to be " - "set." 
- ) - - if not os.path.exists(self.CONSUME): - raise FileConsumerError( - "Consumption directory {} does not exist".format(self.CONSUME)) - - def consume(self): - - for doc in os.listdir(self.CONSUME): - - doc = os.path.join(self.CONSUME, doc) - - if not os.path.isfile(doc): - continue - - if not re.match(self.REGEX_TITLE, doc): - continue - - if doc in self._ignore: - continue - - if self._is_ready(doc): - continue - - self._render("Consuming {}".format(doc), 1) - - pngs = self._get_greyscale(doc) - - try: - text = self._get_ocr(pngs) - except OCRError: - self._ignore.append(doc) - self._render("OCR FAILURE: {}".format(doc), 0) - continue - - self._store(text, doc) - self._cleanup(pngs, doc) - - def _is_ready(self, doc): - """ - Detect whether `doc` is ready to consume or if it's still being written - to by the uploader. - """ - - t = os.stat(doc).st_mtime - - if self.stats.get(doc) == t: - del(self.stats[doc]) - return True - - self.stats[doc] = t - - return False diff --git a/src/documents/consumers/mail.py b/src/documents/consumers/mail.py deleted file mode 100644 index 44e7d0baa..000000000 --- a/src/documents/consumers/mail.py +++ /dev/null @@ -1,170 +0,0 @@ -import datetime -import email -import imaplib -import os -import re - -from base64 import b64decode - -from django.conf import settings - -from . import Consumer - - -class MailConsumerError(Exception): - pass - - -class Message(object): - """ - A crude, but simple email message class. We assume that there's a subject - and exactly one attachment, and that we don't care about the message body. 
- """ - - SAFE_SUBJECT_REGEX = re.compile(r"^[\w\- ,.]+$") - SAFE_SUFFIX_REGEX = re.compile( - r"^(application/(pdf))|(image/(png|jpg|gif|tiff))$") - - def __init__(self, subject, attachment): - - self.subject = subject - self.attachment = attachment - self.suffix = None - - m = self.SAFE_SUFFIX_REGEX.match(attachment.content_type) - if not m: - raise MailConsumerError( - "Not-awesome file type: {}".format(attachment.content_type)) - self.suffix = m.group(1) or m.group(3) - - @property - def file_name(self): - if self.SAFE_SUFFIX_REGEX.match(self.subject): - return "{}.{}".format(self.subject, self.suffix) - - -class Attachment(object): - - def __init__(self, data): - self.content_type = None - self.size = None - self.name = None - self.created = None - self.modified = None - self.data = data - - -class MailFetcher(object): - - def __init__(self): - - self._connection = None - self._host = settings.MAIL_CONSUMPTION["HOST"] - self._port = settings.MAIL_CONSUMPTION["PORT"] - self._username = settings.MAIL_CONSUMPTION["USERNAME"] - self._password = settings.MAIL_CONSUMPTION["PASSWORD"] - self._inbox = settings.MAIL_CONSUMPTION["INBOX"] - - self._enabled = bool(self._host) - - self.last_checked = datetime.datetime.now() - - def _connect(self): - self._connection = imaplib.IMAP4_SSL(self._host, self._port) - - def _login(self): - - login = self._connection.login(self._username, self._password) - if not login[0] == "OK": - raise MailConsumerError("Can't log into mail: {}".format(login[1])) - - inbox = self._connection.select("INBOX") - if not inbox[0] == "OK": - raise MailConsumerError("Can't find the inbox: {}".format(inbox[1])) - - def _fetch(self): - for num in self._connection.search(None, "ALL")[1][0].split(): - typ, data = self._connection.fetch(num, "(RFC822)") - # self._connection.store(num, "+FLAGS", "\\Deleted") - yield data[0][1] - - def consume(self): - """ - We don't actually consume here 'cause it's much easier to do that with - files and we already have a 
FileConsumer. So instead, we simply write - the attachment to the consumption directory as a file with the proper - format so the FileConsumer can do its job. - """ - - if self._enabled: - - for message in self.get_messages(): - - t = message.attachment.created or \ - message.attachment.modified or \ - datetime.datetime.now() - - file_name = os.path.join(Consumer.CONSUME, message.file_name) - with open(file_name, "wb") as f: - f.write(message.attachment.data) - os.utime(file_name, times=(t, t)) - - self.last_checked = datetime.datetime.now() - - def get_messages(self): - - self._connect() - self._login() - - messages = [] - for data in self._fetch(): - message = self._parse_message(data) - if message: - messages.append(message) - - self._connection.expunge() - self._connection.close() - self._connection.logout() - - return messages - - @staticmethod - def _parse_message(data): - """ - Cribbed heavily from - https://www.ianlewis.org/en/parsing-email-attachments-python - """ - - r = [] - message = email.message_from_string(data) - - for part in message.walk(): - - content_disposition = part.get("Content-Disposition") - if not content_disposition: - continue - - dispositions = content_disposition.strip().split(";") - if not dispositions[0].lower() == "attachment": - continue - - file_data = part.get_payload() - attachment = Attachment(b64decode(file_data)) - attachment.content_type = part.get_content_type() - attachment.size = len(file_data) - - for param in dispositions[1:]: - - name, value = param.split("=") - name = name.lower() - - if name == "filename": - attachment.name = value - elif name == "create-date": - attachment.created = value - elif name == "modification-date": - attachment.modified = value - - r.append(Message(message.get("Subject"), attachment)) - - return r diff --git a/src/documents/mail.py b/src/documents/mail.py new file mode 100644 index 000000000..feb370945 --- /dev/null +++ b/src/documents/mail.py @@ -0,0 +1,208 @@ +import datetime +import 
email +import imaplib +import os +import random +import re +import time + +from base64 import b64decode +from dateutil import parser + +from django.conf import settings + +from .consumer import Consumer + + +class MailFetcherError(Exception): + pass + + +class InvalidMessageError(Exception): + pass + + +class Message(object): + """ + A crude, but simple email message class. We assume that there's a subject + and n attachments, and that we don't care about the message body. + """ + + # This regex is probably more restrictive than it needs to be, but it's + # better safe than sorry. + SAFE_SUBJECT_REGEX = re.compile(r"^[\w\- ,.']+$") + + def _set_time(self, message): + self.time = datetime.datetime.now() + message_time = message.get("Date") + if message_time: + try: + self.time = parser.parse(message_time) + except (ValueError, AttributeError): + pass # We assume that "now" is ok + + def __init__(self, data): + """ + Cribbed heavily from + https://www.ianlewis.org/en/parsing-email-attachments-python + """ + + self.subject = None + self.time = None + self.attachment = None + + message = email.message_from_bytes(data) + self.subject = message.get("Subject") + + self._set_time(message) + + if self.subject is None: + raise InvalidMessageError("Message does not have a subject") + if not self.SAFE_SUBJECT_REGEX.match(self.subject): + raise InvalidMessageError("Message subject is unsafe") + + print('Fetching email: "{}"'.format(self.subject)) + + attachments = [] + for part in message.walk(): + + content_disposition = part.get("Content-Disposition") + if not content_disposition: + continue + + dispositions = content_disposition.strip().split(";") + if not dispositions[0].lower() == "attachment": + continue + + file_data = part.get_payload() + + attachments.append(Attachment( + b64decode(file_data), content_type=part.get_content_type())) + + if len(attachments) == 0: + raise InvalidMessageError( + "There don't appear to be any attachments to this message") + + if 
len(attachments) > 1: + raise InvalidMessageError( + "There's more than one attachment to this message. It cannot " + "be indexed automatically." + ) + + self.attachment = attachments[0] + + def __bool__(self): + return bool(self.attachment) + + @property + def file_name(self): + + prefix = str(random.randint(100000, 999999)) + if self.SAFE_SUBJECT_REGEX.match(self.subject): + prefix = self.subject + + return "{}.{}".format(prefix, self.attachment.suffix) + + +class Attachment(object): + + SAFE_SUFFIX_REGEX = re.compile( + r"^(application/(pdf))|(image/(png|jpeg|gif|tiff))$") + + def __init__(self, data, content_type): + + self.content_type = content_type + self.data = data + self.suffix = None + + m = self.SAFE_SUFFIX_REGEX.match(self.content_type) + if not m: + raise MailFetcherError( + "Not-awesome file type: {}".format(self.content_type)) + self.suffix = m.group(2) or m.group(4) + + def read(self): + return self.data + + +class MailFetcher(object): + + def __init__(self): + + self._connection = None + self._host = settings.MAIL_CONSUMPTION["HOST"] + self._port = settings.MAIL_CONSUMPTION["PORT"] + self._username = settings.MAIL_CONSUMPTION["USERNAME"] + self._password = settings.MAIL_CONSUMPTION["PASSWORD"] + self._inbox = settings.MAIL_CONSUMPTION["INBOX"] + + self._enabled = bool(self._host) + + self.last_checked = datetime.datetime.now() + + def pull(self): + """ + Fetch all available mail at the target address and store it locally in + the consumption directory so that the file consumer can pick it up and + do its thing. 
+ """ + + if self._enabled: + + for message in self._get_messages(): + + print("Storing email: \"{}\"".format(message.subject)) + + t = int(time.mktime(message.time.timetuple())) + file_name = os.path.join(Consumer.CONSUME, message.file_name) + with open(file_name, "wb") as f: + f.write(message.attachment.data) + os.utime(file_name, times=(t, t)) + + self.last_checked = datetime.datetime.now() + + def _get_messages(self): + + self._connect() + self._login() + + r = [] + for message in self._fetch(): + if message: + r.append(message) + + self._connection.expunge() + self._connection.close() + self._connection.logout() + + return r + + def _connect(self): + self._connection = imaplib.IMAP4_SSL(self._host, self._port) + + def _login(self): + + login = self._connection.login(self._username, self._password) + if not login[0] == "OK": + raise MailFetcherError("Can't log into mail: {}".format(login[1])) + + inbox = self._connection.select("INBOX") + if not inbox[0] == "OK": + raise MailFetcherError("Can't find the inbox: {}".format(inbox[1])) + + def _fetch(self): + + for num in self._connection.search(None, "ALL")[1][0].split(): + + __, data = self._connection.fetch(num, "(RFC822)") + + message = None + try: + message = Message(data[0][1]) + except (InvalidMessageError, MailFetcherError) as e: + print(e) + pass + + self._connection.store(num, "+FLAGS", "\\Deleted") + if message: + yield message diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index ec18b36ae..d384d1486 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -5,18 +5,14 @@ import time from django.conf import settings from django.core.management.base import BaseCommand, CommandError -from ...consumers import ( - FileConsumer, FileConsumerError, MailConsumer, MailConsumerError) +from ...consumer import Consumer, ConsumerError +from ...mail import MailFetcher, MailFetcherError
class Command(BaseCommand): """ - Loop over every file found in CONSUMPTION_DIR and: - 1. Convert it to a greyscale png - 2. Use tesseract on the png - 3. Encrypt and store the document in the MEDIA_ROOT - 4. Store the OCR'd text in the database - 5. Delete the document and image(s) + On every iteration of an infinite loop, consume what we can from the + consumption directory, and fetch any mail available. """ LOOP_TIME = 10 # Seconds @@ -29,7 +25,7 @@ class Command(BaseCommand): self.verbosity = 0 self.file_consumer = None - self.mail_consumer = None + self.mail_fetcher = None BaseCommand.__init__(self, *args, **kwargs) @@ -38,9 +34,9 @@ class Command(BaseCommand): self.verbosity = options["verbosity"] try: - self.file_consumer = FileConsumer(verbosity=self.verbosity) - self.mail_consumer = MailConsumer(verbosity=self.verbosity) - except (FileConsumerError, MailConsumerError) as e: + self.file_consumer = Consumer(verbosity=self.verbosity) + self.mail_fetcher = MailFetcher() + except (ConsumerError, MailFetcherError) as e: raise CommandError(e) try: @@ -59,11 +55,13 @@ class Command(BaseCommand): def loop(self): + # Consume whatever files we can self.file_consumer.consume() - delta = self.mail_consumer.last_checked + self.MAIL_DELTA + # Occasionally fetch mail and store it to be consumed on the next loop + delta = self.mail_fetcher.last_checked + self.MAIL_DELTA if delta > datetime.datetime.now(): - self.mail_consumer.consume() + self.mail_fetcher.pull() def _render(self, text, verbosity): if self.verbosity >= verbosity: diff --git a/src/documents/tests/consumers/mail.py b/src/documents/tests/consumers/mail.py index 5bf4d1371..9922d674e 100644 --- a/src/documents/tests/consumers/mail.py +++ b/src/documents/tests/consumers/mail.py @@ -1,3 +1,4 @@ +import base64 import os import magic @@ -6,10 +7,10 @@ from hashlib import md5 from django.conf import settings from django.test import TestCase -from ...consumers.mail import MailConsumer +from ...mail import Message, 
Attachment -class TestMailConsumer(TestCase): +class TestMessage(TestCase): def __init__(self, *args, **kwargs): @@ -23,21 +24,33 @@ class TestMailConsumer(TestCase): "mail.txt" ) - def test_parse(self): - consumer = MailConsumer() - with open(self.sample) as f: + def test_init(self): - messages = consumer._parse_message(f.read()) + with open(self.sample, "rb") as f: - self.assertTrue(len(messages), 1) - self.assertEqual(messages[0]["subject"], "Test 0") + message = Message(f.read()) - attachment = messages[0]["attachment"] - data = attachment.read() + self.assertTrue(message) + self.assertEqual(message.subject, "Test 0") + + data = message.attachment.read() self.assertEqual( md5(data).hexdigest(), "7c89655f9e9eb7dd8cde8568e8115d59") - self.assertEqual(attachment.content_type, "application/pdf") + self.assertEqual( + message.attachment.content_type, "application/pdf") with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m: self.assertEqual(m.id_buffer(data), "application/pdf") + + +class TestAttachment(TestCase): + + def test_init(self): + data = base64.encodebytes(b"0") + self.assertEqual(Attachment(data, "application/pdf").suffix, "pdf") + self.assertEqual(Attachment(data, "image/png").suffix, "png") + self.assertEqual(Attachment(data, "image/jpeg").suffix, "jpeg") + self.assertEqual(Attachment(data, "image/gif").suffix, "gif") + self.assertEqual(Attachment(data, "image/tiff").suffix, "tiff") + self.assertEqual(Attachment(data, "image/png").read(), data) diff --git a/src/documents/tests/tests.py b/src/documents/tests/tests.py index 1d8c9b53a..342d09d29 100644 --- a/src/documents/tests/tests.py +++ b/src/documents/tests/tests.py @@ -1 +1 @@ -from .consumers.mail import TestMailConsumer +from .consumers.mail import TestMessage, TestAttachment