Image imports and consumption by mail work

2026-02-11 23:59:31 -06:00 · 2016-02-06 17:05:36 +00:00
parent 71075a691a
commit 48761911b3
8 changed files with 343 additions and 286 deletions
--- a/src/documents/consumers/base.py
+++ b/src/documents/consumer.py
@@ -16,15 +16,27 @@ from django.template.defaultfilters import slugify

 from paperless.db import GnuPG

-from ..models import Sender, Tag, Document
-from ..languages import ISO639
+from .models import Sender, Tag, Document
+from .languages import ISO639


 class OCRError(Exception):
    pass


+class ConsumerError(Exception):
+    pass
+
+
 class Consumer(object):
+    """
+    Loop over every file found in CONSUMPTION_DIR and:
+      1. Convert it to a greyscale png
+      2. Use tesseract on the png
+      3. Encrypt and store the document in the MEDIA_ROOT
+      4. Store the OCR'd text in the database
+      5. Delete the document and image(s)
+    """

    SCRATCH = settings.SCRATCH_DIR
    CONVERT = settings.CONVERT_BINARY
@@ -34,15 +46,15 @@ class Consumer(object):
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    REGEX_TITLE = re.compile(
-        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$",
+        r"^.*/([^/]*)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    REGEX_SENDER_TITLE = re.compile(
-        r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)",
+        r"^.*/([^/]+) - ([^/]+)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )
    REGEX_SENDER_TITLE_TAGS = re.compile(
-        r"^.*/(.*) - (.*) - ([a-z\-,])\.(pdf|jpe?g|png|gif|tiff)",
+        r"^.*/([^/]+) - ([^/]+) - ([a-z\-,]+)\.(pdf|jpe?g|png|gif|tiff)$",
        flags=re.IGNORECASE
    )

@@ -55,6 +67,51 @@ class Consumer(object):
        except FileExistsError:
            pass

+        self.stats = {}
+        self._ignore = []
+
+        if not self.CONSUME:
+            raise ConsumerError(
+                "The CONSUMPTION_DIR settings variable does not appear to be "
+                "set."
+            )
+
+        if not os.path.exists(self.CONSUME):
+            raise ConsumerError(
+                "Consumption directory {} does not exist".format(self.CONSUME))
+
+    def consume(self):
+        """Consume every finished, supported file in the consumption dir."""
+        for doc in os.listdir(self.CONSUME):
+
+            doc = os.path.join(self.CONSUME, doc)
+
+            if not os.path.isfile(doc):
+                continue
+
+            if not re.match(self.REGEX_TITLE, doc):
+                continue
+
+            if doc in self._ignore:
+                continue
+
+            # Wait until the mtime is stable, i.e. the upload has finished
+            if not self._is_ready(doc):
+                continue
+
+            self._render("Consuming {}".format(doc), 1)
+
+            pngs = self._get_greyscale(doc)
+
+            try:
+                text = self._get_ocr(pngs)
+            except OCRError:
+                self._ignore.append(doc)
+                self._render("OCR FAILURE: {}".format(doc), 0)
+                continue
+
+            self._store(text, doc)
+            self._cleanup(pngs, doc)
+
    def _get_greyscale(self, doc):

        self._render("  Generating greyscale image", 2)
@@ -69,17 +126,27 @@ class Consumer(object):

        return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))

+    def _guess_language(self, text):
+        try:
+            guess = langdetect.detect(text)
+            self._render("    Language detected: {}".format(guess), 2)
+            return guess
+        except Exception:
+            return None
+
    def _get_ocr(self, pngs):
+        """
+        Attempts to do the best job possible OCR'ing the document based on
+        simple language detection trial & error.
+        """

        self._render("  OCRing the document", 2)

        raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)

-        guessed_language = langdetect.detect(raw_text)
+        guessed_language = self._guess_language(raw_text)

-        self._render("    Language detected: {}".format(guessed_language), 2)
-
-        if guessed_language not in ISO639:
+        if not guessed_language or guessed_language not in ISO639:
            self._render("Language detection failed!", 0)
            if settings.FORGIVING_OCR:
                self._render(
@@ -108,6 +175,9 @@ class Consumer(object):
            raise OCRError

    def _ocr(self, pngs, lang):
+        """
+        Performs a single OCR attempt.
+        """

        self._render("    Parsing for {}".format(lang), 2)

@@ -161,10 +231,11 @@ class Consumer(object):

    def _store(self, text, doc):

-        sender, title, file_type = self._guess_attributes_from_name(doc)
+        sender, title, tags, file_type = self._guess_attributes_from_name(doc)

        lower_text = text.lower()
-        relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]
+        relevant_tags = set(
+            [t for t in Tag.objects.all() if t.matches(lower_text)] + tags)

        stats = os.stat(doc)

@@ -205,3 +276,19 @@ class Consumer(object):
    def _render(self, text, verbosity):
        if self.verbosity >= verbosity:
            print(text)
+
+    def _is_ready(self, doc):
+        """
+        Detect whether `doc` is ready to consume or if it's still being written
+        to by the uploader.
+        """
+
+        t = os.stat(doc).st_mtime
+
+        if self.stats.get(doc) == t:
+            del(self.stats[doc])
+            return True
+
+        self.stats[doc] = t
+
+        return False
--- a/src/documents/consumers/__init__.py
+++ b/src/documents/consumers/__init__.py
@@ -1,3 +0,0 @@
-from .base import Consumer
-from .file import FileConsumer, FileConsumerError
-from .mail import MailConsumer, MailConsumerError
--- a/src/documents/consumers/file.py
+++ b/src/documents/consumers/file.py
@@ -1,76 +0,0 @@
-import os
-import re
-
-from .base import Consumer, OCRError
-
-
-class FileConsumerError(Exception):
-    pass
-
-
-class FileConsumer(Consumer):
-
-    def __init__(self, *args, **kwargs):
-
-        Consumer.__init__(self, *args, **kwargs)
-
-        self.stats = {}
-        self._ignore = []
-
-        if not self.CONSUME:
-            raise FileConsumerError(
-                "The CONSUMPTION_DIR settings variable does not appear to be "
-                "set."
-            )
-
-        if not os.path.exists(self.CONSUME):
-            raise FileConsumerError(
-                "Consumption directory {} does not exist".format(self.CONSUME))
-
-    def consume(self):
-
-        for doc in os.listdir(self.CONSUME):
-
-            doc = os.path.join(self.CONSUME, doc)
-
-            if not os.path.isfile(doc):
-                continue
-
-            if not re.match(self.REGEX_TITLE, doc):
-                continue
-
-            if doc in self._ignore:
-                continue
-
-            if self._is_ready(doc):
-                continue
-
-            self._render("Consuming {}".format(doc), 1)
-
-            pngs = self._get_greyscale(doc)
-
-            try:
-                text = self._get_ocr(pngs)
-            except OCRError:
-                self._ignore.append(doc)
-                self._render("OCR FAILURE: {}".format(doc), 0)
-                continue
-
-            self._store(text, doc)
-            self._cleanup(pngs, doc)
-
-    def _is_ready(self, doc):
-        """
-        Detect whether `doc` is ready to consume or if it's still being written
-        to by the uploader.
-        """
-
-        t = os.stat(doc).st_mtime
-
-        if self.stats.get(doc) == t:
-            del(self.stats[doc])
-            return True
-
-        self.stats[doc] = t
-
-        return False
--- a/src/documents/consumers/mail.py
+++ b/src/documents/consumers/mail.py
@@ -1,170 +0,0 @@
-import datetime
-import email
-import imaplib
-import os
-import re
-
-from base64 import b64decode
-
-from django.conf import settings
-
-from . import Consumer
-
-
-class MailConsumerError(Exception):
-    pass
-
-
-class Message(object):
-    """
-    A crude, but simple email message class.  We assume that there's a subject
-    and exactly one attachment, and that we don't care about the message body.
-    """
-
-    SAFE_SUBJECT_REGEX = re.compile(r"^[\w\- ,.]+$")
-    SAFE_SUFFIX_REGEX = re.compile(
-        r"^(application/(pdf))|(image/(png|jpg|gif|tiff))$")
-
-    def __init__(self, subject, attachment):
-
-        self.subject = subject
-        self.attachment = attachment
-        self.suffix = None
-
-        m = self.SAFE_SUFFIX_REGEX.match(attachment.content_type)
-        if not m:
-            raise MailConsumerError(
-                "Not-awesome file type: {}".format(attachment.content_type))
-        self.suffix = m.group(1) or m.group(3)
-
-    @property
-    def file_name(self):
-        if self.SAFE_SUFFIX_REGEX.match(self.subject):
-            return "{}.{}".format(self.subject, self.suffix)
-
-
-class Attachment(object):
-
-    def __init__(self, data):
-        self.content_type = None
-        self.size = None
-        self.name = None
-        self.created = None
-        self.modified = None
-        self.data = data
-
-
-class MailFetcher(object):
-
-    def __init__(self):
-
-        self._connection = None
-        self._host = settings.MAIL_CONSUMPTION["HOST"]
-        self._port = settings.MAIL_CONSUMPTION["PORT"]
-        self._username = settings.MAIL_CONSUMPTION["USERNAME"]
-        self._password = settings.MAIL_CONSUMPTION["PASSWORD"]
-        self._inbox = settings.MAIL_CONSUMPTION["INBOX"]
-
-        self._enabled = bool(self._host)
-
-        self.last_checked = datetime.datetime.now()
-
-    def _connect(self):
-        self._connection = imaplib.IMAP4_SSL(self._host, self._port)
-
-    def _login(self):
-
-        login = self._connection.login(self._username, self._password)
-        if not login[0] == "OK":
-            raise MailConsumerError("Can't log into mail: {}".format(login[1]))
-
-        inbox = self._connection.select("INBOX")
-        if not inbox[0] == "OK":
-            raise MailConsumerError("Can't find the inbox: {}".format(inbox[1]))
-
-    def _fetch(self):
-        for num in self._connection.search(None, "ALL")[1][0].split():
-            typ, data = self._connection.fetch(num, "(RFC822)")
-            # self._connection.store(num, "+FLAGS", "\\Deleted")
-            yield data[0][1]
-
-    def consume(self):
-        """
-        We don't actually consume here 'cause it's much easier to do that with
-        files and we already have a FileConsumer.  So instead, we simply write
-        the attachment to the consumption directory as a file with the proper
-        format so the FileConsumer can do its job.
-        """
-
-        if self._enabled:
-
-            for message in self.get_messages():
-
-                t = message.attachment.created or \
-                    message.attachment.modified or \
-                    datetime.datetime.now()
-
-                file_name = os.path.join(Consumer.CONSUME, message.file_name)
-                with open(file_name, "wb") as f:
-                    f.write(message.attachment.data)
-                    os.utime(file_name, times=(t, t))
-
-        self.last_checked = datetime.datetime.now()
-
-    def get_messages(self):
-
-        self._connect()
-        self._login()
-
-        messages = []
-        for data in self._fetch():
-            message = self._parse_message(data)
-            if message:
-                messages.append(message)
-
-        self._connection.expunge()
-        self._connection.close()
-        self._connection.logout()
-
-        return messages
-
-    @staticmethod
-    def _parse_message(data):
-        """
-        Cribbed heavily from
-        https://www.ianlewis.org/en/parsing-email-attachments-python
-        """
-
-        r = []
-        message = email.message_from_string(data)
-
-        for part in message.walk():
-
-            content_disposition = part.get("Content-Disposition")
-            if not content_disposition:
-                continue
-
-            dispositions = content_disposition.strip().split(";")
-            if not dispositions[0].lower() == "attachment":
-                continue
-
-            file_data = part.get_payload()
-            attachment = Attachment(b64decode(file_data))
-            attachment.content_type = part.get_content_type()
-            attachment.size = len(file_data)
-
-            for param in dispositions[1:]:
-
-                name, value = param.split("=")
-                name = name.lower()
-
-                if name == "filename":
-                    attachment.name = value
-                elif name == "create-date":
-                    attachment.created = value
-                elif name == "modification-date":
-                    attachment.modified = value
-
-            r.append(Message(message.get("Subject"), attachment))
-
-        return r
--- a/src/documents/mail.py
+++ b/src/documents/mail.py
@@ -0,0 +1,208 @@
+import datetime
+import email
+import imaplib
+import os
+import random
+import re
+import time
+
+from base64 import b64decode
+from dateutil import parser
+
+from django.conf import settings
+
+from .consumer import Consumer
+
+
+class MailFetcherError(Exception):
+    pass
+
+
+class InvalidMessageError(Exception):
+    pass
+
+
+class Message(object):
+    """
+    A crude, but simple email message class.  We assume that there's a subject
+    and n attachments, and that we don't care about the message body.
+    """
+
+    # This regex is probably more restrictive than it needs to be, but it's
+    # better safe than sorry.
+    SAFE_SUBJECT_REGEX = re.compile(r"^[\w\- ,.']+$")
+
+    def _set_time(self, message):
+        self.time = datetime.datetime.now()
+        message_time = message.get("Date")
+        if message_time:
+            try:
+                self.time = parser.parse(message_time)
+            except (ValueError, AttributeError):
+                pass  # We assume that "now" is ok
+
+    def __init__(self, data):
+        """
+        Cribbed heavily from
+        https://www.ianlewis.org/en/parsing-email-attachments-python
+        """
+
+        self.subject = None
+        self.time = None
+        self.attachment = None
+
+        message = email.message_from_bytes(data)
+        self.subject = message.get("Subject")
+
+        self._set_time(message)
+
+        if self.subject is None:
+            raise InvalidMessageError("Message does not have a subject")
+        if not self.SAFE_SUBJECT_REGEX.match(self.subject):
+            raise InvalidMessageError("Message subject is unsafe")
+
+        print('Fetching email: "{}"'.format(self.subject))
+
+        attachments = []
+        for part in message.walk():
+
+            content_disposition = part.get("Content-Disposition")
+            if not content_disposition:
+                continue
+
+            dispositions = content_disposition.strip().split(";")
+            if not dispositions[0].lower() == "attachment":
+                continue
+
+            file_data = part.get_payload()
+
+            attachments.append(Attachment(
+                b64decode(file_data), content_type=part.get_content_type()))
+
+        if len(attachments) == 0:
+            raise InvalidMessageError(
+                "There don't appear to be any attachments to this message")
+
+        if len(attachments) > 1:
+            raise InvalidMessageError(
+                "There's more than one attachment to this message. It cannot "
+                "be indexed automatically."
+            )
+
+        self.attachment = attachments[0]
+
+    def __bool__(self):
+        return bool(self.attachment)
+
+    @property
+    def file_name(self):
+
+        prefix = str(random.randint(100000, 999999))
+        if self.SAFE_SUBJECT_REGEX.match(self.subject):
+            prefix = self.subject
+
+        return "{}.{}".format(prefix, self.attachment.suffix)
+
+
+class Attachment(object):
+    """Attachment payload plus the file suffix implied by its MIME type."""
+    SAFE_SUFFIX_REGEX = re.compile(
+        r"^(?:(application/(pdf))|(image/(png|jpeg|gif|tiff)))$")
+
+    def __init__(self, data, content_type):
+        """Raise MailFetcherError unless content_type is whitelisted."""
+        self.content_type = content_type
+        self.data = data
+        self.suffix = None
+        # Anchored as a whole, so e.g. "application/pdfx" is rejected
+        m = self.SAFE_SUFFIX_REGEX.match(self.content_type)
+        if not m:
+            raise MailFetcherError(
+                "Not-awesome file type: {}".format(self.content_type))
+        self.suffix = m.group(2) or m.group(4)
+
+    def read(self):
+        return self.data
+
+
+class MailFetcher(object):
+
+    def __init__(self):
+
+        self._connection = None
+        self._host = settings.MAIL_CONSUMPTION["HOST"]
+        self._port = settings.MAIL_CONSUMPTION["PORT"]
+        self._username = settings.MAIL_CONSUMPTION["USERNAME"]
+        self._password = settings.MAIL_CONSUMPTION["PASSWORD"]
+        self._inbox = settings.MAIL_CONSUMPTION["INBOX"]
+
+        self._enabled = bool(self._host)
+
+        self.last_checked = datetime.datetime.now()
+
+    def pull(self):
+        """
+        Fetch all available mail at the target address and store it locally in
+        the consumption directory so that the file consumer can pick it up and
+        do its thing.  Each file's atime/mtime is set to the message time.
+        """
+
+        if self._enabled:
+
+            for message in self._get_messages():
+
+                print("Storing email: \"{}\"".format(message.subject))
+                # Stamp only after closing, or the final flush resets mtime
+                t = int(message.time.timestamp())
+                file_name = os.path.join(Consumer.CONSUME, message.file_name)
+                with open(file_name, "wb") as f:
+                    f.write(message.attachment.data)
+                os.utime(file_name, times=(t, t))
+
+        self.last_checked = datetime.datetime.now()
+
+    def _get_messages(self):
+
+        self._connect()
+        self._login()
+
+        r = []
+        for message in self._fetch():
+            if message:
+                r.append(message)
+
+        self._connection.expunge()
+        self._connection.close()
+        self._connection.logout()
+
+        return r
+
+    def _connect(self):
+        self._connection = imaplib.IMAP4_SSL(self._host, self._port)
+
+    def _login(self):
+        """Authenticate, then select the configured mailbox."""
+        login = self._connection.login(self._username, self._password)
+        if not login[0] == "OK":
+            raise MailFetcherError("Can't log into mail: {}".format(login[1]))
+
+        inbox = self._connection.select(self._inbox or "INBOX")
+        if not inbox[0] == "OK":
+            raise MailFetcherError("Can't find the inbox: {}".format(inbox[1]))
+
+    def _fetch(self):
+        """Yield a Message per usable mail; flag every mail as deleted."""
+        for num in self._connection.search(None, "ALL")[1][0].split():
+
+            __, data = self._connection.fetch(num, "(RFC822)")
+
+            message = None
+            try:
+                message = Message(data[0][1])
+            except (InvalidMessageError, MailFetcherError) as e:
+                # Attachment() raises MailFetcherError for unsupported types
+                print(e)
+
+            self._connection.store(num, "+FLAGS", "\\Deleted")
+            if message:
+                yield message
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@@ -5,18 +5,14 @@ import time
 from django.conf import settings
 from django.core.management.base import BaseCommand, CommandError

-from ...consumers import (
-    FileConsumer, FileConsumerError, MailConsumer, MailConsumerError)
+from ...consumer import Consumer, ConsumerError
+from ...mail import MailFetcher, MailFetcherError


 class Command(BaseCommand):
    """
-    Loop over every file found in CONSUMPTION_DIR and:
-      1. Convert it to a greyscale png
-      2. Use tesseract on the png
-      3. Encrypt and store the document in the MEDIA_ROOT
-      4. Store the OCR'd text in the database
-      5. Delete the document and image(s)
+    On every iteration of an infinite loop, consume what we can from the
+    consumption directory, and fetch any mail available.
    """

    LOOP_TIME = 10  # Seconds
@@ -29,7 +25,7 @@ class Command(BaseCommand):
        self.verbosity = 0

        self.file_consumer = None
-        self.mail_consumer = None
+        self.mail_fetcher = None

        BaseCommand.__init__(self, *args, **kwargs)

@@ -38,9 +34,9 @@ class Command(BaseCommand):
        self.verbosity = options["verbosity"]

        try:
-            self.file_consumer = FileConsumer(verbosity=self.verbosity)
-            self.mail_consumer = MailConsumer(verbosity=self.verbosity)
-        except (FileConsumerError, MailConsumerError) as e:
+            self.file_consumer = Consumer(verbosity=self.verbosity)
+            self.mail_fetcher = MailFetcher()
+        except (ConsumerError, MailFetcherError) as e:
            raise CommandError(e)

        try:
@@ -59,11 +55,13 @@ class Command(BaseCommand):

    def loop(self):

+        # Consume whatever files we can
        self.file_consumer.consume()

-        delta = self.mail_consumer.last_checked + self.MAIL_DELTA
+        # Occasionally fetch mail and store it to be consumed on the next loop
+        delta = self.mail_fetcher.last_checked + self.MAIL_DELTA
-        if delta > datetime.datetime.now():
+        if delta < datetime.datetime.now():
-            self.mail_consumer.consume()
+            self.mail_fetcher.pull()

    def _render(self, text, verbosity):
        if self.verbosity >= verbosity:
--- a/src/documents/tests/consumers/mail.py
+++ b/src/documents/tests/consumers/mail.py
@@ -1,3 +1,4 @@
+import base64
 import os
 import magic

@@ -6,10 +7,10 @@ from hashlib import md5
 from django.conf import settings
 from django.test import TestCase

-from ...consumers.mail import MailConsumer
+from ...mail import Message, Attachment


-class TestMailConsumer(TestCase):
+class TestMessage(TestCase):

    def __init__(self, *args, **kwargs):

@@ -23,21 +24,33 @@ class TestMailConsumer(TestCase):
            "mail.txt"
        )

-    def test_parse(self):
-        consumer = MailConsumer()
-        with open(self.sample) as f:
+    def test_init(self):

-            messages = consumer._parse_message(f.read())
+        with open(self.sample, "rb") as f:

-            self.assertTrue(len(messages), 1)
-            self.assertEqual(messages[0]["subject"], "Test 0")
+            message = Message(f.read())

-            attachment = messages[0]["attachment"]
-            data = attachment.read()
+            self.assertTrue(message)
+            self.assertEqual(message.subject, "Test 0")
+
+            data = message.attachment.read()

            self.assertEqual(
                md5(data).hexdigest(), "7c89655f9e9eb7dd8cde8568e8115d59")

-            self.assertEqual(attachment.content_type, "application/pdf")
+            self.assertEqual(
+                message.attachment.content_type, "application/pdf")
            with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
                self.assertEqual(m.id_buffer(data), "application/pdf")
+
+
+class TestAttachment(TestCase):
+
+    def test_init(self):
+        data = base64.encodebytes(b"0")
+        self.assertEqual(Attachment(data, "application/pdf").suffix, "pdf")
+        self.assertEqual(Attachment(data, "image/png").suffix, "png")
+        self.assertEqual(Attachment(data, "image/jpeg").suffix, "jpeg")
+        self.assertEqual(Attachment(data, "image/gif").suffix, "gif")
+        self.assertEqual(Attachment(data, "image/tiff").suffix, "tiff")
+        self.assertEqual(Attachment(data, "image/png").read(), data)
--- a/src/documents/tests/tests.py
+++ b/src/documents/tests/tests.py
@@ -1 +1 @@
-from .consumers.mail import TestMailConsumer
+from .consumers.mail import TestAttachment, TestMessage