Broke the consumer script into separate files and started on a mail consumer

This commit is contained in:
Daniel Quinn 2016-01-30 01:18:52 +00:00
parent 84d5f8cc5d
commit a70b40f618
8 changed files with 378 additions and 230 deletions

9
.gitignore vendored
View File

@ -67,8 +67,9 @@ db.sqlite3
# Other stuff that doesn't belong
virtualenv
scripts/import-for-development
# Vagrant
.vagrant
# Used for development
scripts/import-for-development
environment

View File

@ -0,0 +1,3 @@
from .base import Consumer
from .file import FileConsumer, FileConsumerError
from .mail import MailConsumer, MailConsumerError

View File

@ -0,0 +1,157 @@
import datetime
import glob
import langdetect
import os
import random
import re
import subprocess
import pyocr
from PIL import Image
from django.conf import settings
from django.utils import timezone
from paperless.db import GnuPG
from ..models import Tag, Document
from ..languages import ISO639
class OCRError(Exception):
    """Raised when a document's text cannot be extracted by OCR."""
class Consumer(object):
    """
    Shared machinery for document consumers.

    Provides the steps for turning a source document into a stored
    ``Document``: conversion to greyscale PNGs, OCR with language detection,
    tagging, encryption of the original, and removal of temporary files.

    Subclasses must implement ``_guess_file_attributes()``.
    """

    SCRATCH = settings.SCRATCH_DIR
    CONVERT = settings.CONVERT_BINARY
    OCR = pyocr.get_available_tools()[0]
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    def __init__(self, verbosity=1):
        """
        :param verbosity: Messages passed to ``_render()`` with a level above
                          this value are suppressed.
        """
        self.verbosity = verbosity

        # The scratch directory may already exist from a previous run.
        try:
            os.makedirs(self.SCRATCH)
        except FileExistsError:
            pass

    def _get_greyscale(self, doc):
        """
        Run the configured convert binary on ``doc`` and return the sorted
        list of files it wrote into the scratch directory.
        """
        self._render(" Generating greyscale image", 2)

        # A random numeric prefix lets us find every file written for this
        # document with a single glob afterwards.
        i = random.randint(1000000, 9999999)
        png = os.path.join(self.SCRATCH, "{}.png".format(i))

        subprocess.Popen((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", doc, png
        )).wait()

        return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))

    def _get_ocr(self, pngs):
        """
        OCR ``pngs`` with the default language, then retry with the detected
        language if it differs.

        :returns: The extracted text.
        :raises OCRError: If language detection or the second OCR pass fails
                          and ``settings.FORGIVING_OCR`` is not enabled.
        """
        self._render(" OCRing the document", 2)

        raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)

        # langdetect raises when the text has nothing to analyse (e.g. a
        # blank page).  Treat that like any other failed detection so that it
        # goes through the FORGIVING_OCR / OCRError handling below instead of
        # escaping as an unrelated exception.
        try:
            guessed_language = langdetect.detect(raw_text)
        except langdetect.lang_detect_exception.LangDetectException:
            guessed_language = None

        self._render(" Language detected: {}".format(guessed_language), 2)

        if guessed_language not in ISO639:
            self._render("Language detection failed!", 0)
            if settings.FORGIVING_OCR:
                self._render(
                    "As FORGIVING_OCR is enabled, we're going to make the best "
                    "with what we have.",
                    1
                )
                return raw_text
            raise OCRError

        # The first pass already used the right language.
        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            return raw_text

        try:
            return self._ocr(pngs, ISO639[guessed_language])
        except pyocr.pyocr.tesseract.TesseractError:
            if settings.FORGIVING_OCR:
                self._render(
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    ),
                    0
                )
                return raw_text
            raise OCRError

    def _ocr(self, pngs, lang):
        """
        OCR every image in ``pngs`` using ``lang`` and return the combined
        text with runs of whitespace collapsed to single spaces.
        """
        self._render(" Parsing for {}".format(lang), 2)

        pages = []
        for png in pngs:
            with Image.open(os.path.join(self.SCRATCH, png)) as f:
                self._render(" {}".format(f.filename), 3)
                pages.append(self.OCR.image_to_string(f, lang=lang))

        # Strip out excess white space to allow matching to go smoother
        return re.sub(r"\s+", " ", "".join(pages))

    def _guess_file_attributes(self, doc):
        """
        Return a ``(sender, title, file_type)`` tuple for ``doc``.  Must be
        implemented by subclasses.
        """
        raise NotImplementedError(
            "At the very least a consumer should determine the file type.")

    def _store(self, text, doc):
        """
        Create a ``Document`` record for ``doc`` with ``text`` as its content,
        attach every matching tag, and write the encrypted original to the
        document's ``source_path``.
        """
        sender, title, file_type = self._guess_file_attributes(doc)

        lower_text = text.lower()
        relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]

        stats = os.stat(doc)

        self._render(" Saving record to database", 2)

        # Both timestamps are taken from the file's modification time.
        document = Document.objects.create(
            sender=sender,
            title=title,
            content=text,
            file_type=file_type,
            created=timezone.make_aware(
                datetime.datetime.fromtimestamp(stats.st_mtime)),
            modified=timezone.make_aware(
                datetime.datetime.fromtimestamp(stats.st_mtime))
        )

        if relevant_tags:
            tag_names = ", ".join([t.slug for t in relevant_tags])
            self._render(" Tagging with {}".format(tag_names), 2)
            document.tags.add(*relevant_tags)

        with open(doc, "rb") as unencrypted:
            with open(document.source_path, "wb") as encrypted:
                self._render(" Encrypting", 3)
                encrypted.write(GnuPG.encrypted(unencrypted))

    def _cleanup(self, pngs, doc):
        """
        Delete the temporary images generated for ``doc`` and then ``doc``
        itself.
        """
        targets = []

        # The conversion step may have produced nothing; in that case there
        # are no images to look up and only the document is removed.
        if pngs:
            # "<n>-<page>.png" is reduced to "<n>*" so that every page is
            # matched; any other name is left unchanged by the substitution.
            png_glob = os.path.join(
                self.SCRATCH,
                re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])
            )
            targets.extend(glob.glob(png_glob))

        targets.append(doc)

        for f in targets:
            self._render(" Deleting {}".format(f), 2)
            os.unlink(f)

        self._render("", 2)

    def _render(self, text, verbosity):
        """Print ``text`` if the consumer's verbosity is at least ``verbosity``."""
        if self.verbosity >= verbosity:
            print(text)

View File

@ -0,0 +1,106 @@
import os
import re
from django.conf import settings
from django.template.defaultfilters import slugify
from ..models import Sender
from . import Consumer, OCRError
class FileConsumerError(Exception):
    """Raised when the file consumer cannot be set up."""
class FileConsumer(Consumer):
    """
    Consumer that picks up documents dropped into the consumption directory.
    """

    CONSUME = settings.CONSUMPTION_DIR

    PARSER_REGEX_TITLE = re.compile(
        r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
    PARSER_REGEX_SENDER_TITLE = re.compile(
        r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)", flags=re.IGNORECASE)

    def __init__(self, *args, **kwargs):
        """
        :raises FileConsumerError: If the consumption directory is not
                                   configured or does not exist.
        """
        Consumer.__init__(self, *args, **kwargs)

        # Last seen modification time per path, used by _is_ready()
        self.stats = {}

        # Paths that failed OCR and should not be retried
        self._ignore = []

        if not self.CONSUME:
            raise FileConsumerError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set."
            )

        if not os.path.exists(self.CONSUME):
            raise FileConsumerError(
                "Consumption directory {} does not exist".format(self.CONSUME))

    def consume(self):
        """
        Make one pass over the consumption directory and process every
        supported file that has finished being written.
        """
        for doc in os.listdir(self.CONSUME):

            doc = os.path.join(self.CONSUME, doc)

            if not os.path.isfile(doc):
                continue

            if not re.match(self.PARSER_REGEX_TITLE, doc):
                continue

            if doc in self._ignore:
                continue

            # Skip files that are NOT ready yet.  A file only counts as ready
            # once its modification time is unchanged between two passes, so
            # a new file is always deferred until a later pass.
            if not self._is_ready(doc):
                continue

            self._render("Consuming {}".format(doc), 1)

            pngs = self._get_greyscale(doc)

            try:
                text = self._get_ocr(pngs)
            except OCRError:
                self._ignore.append(doc)
                self._render("OCR FAILURE: {}".format(doc), 0)
                continue

            self._store(text, doc)
            self._cleanup(pngs, doc)

    def _is_ready(self, doc):
        """
        Detect whether `doc` is ready to consume or if it's still being written
        to by the uploader.

        Returns True when the modification time matches the one recorded on a
        previous call; otherwise records the current one and returns False.
        """
        t = os.stat(doc).st_mtime

        if self.stats.get(doc) == t:
            del self.stats[doc]
            return True

        self.stats[doc] = t
        return False

    def _guess_file_attributes(self, doc):
        """
        We use a crude naming convention to make handling the sender and title
        easier:
          "<sender> - <title>.<suffix>"

        Returns a ``(sender, title, file_type)`` tuple; ``sender`` is None when
        the name does not follow the convention.
        """

        # First we attempt "<sender> - <title>.<suffix>"
        m = re.match(self.PARSER_REGEX_SENDER_TITLE, doc)
        if m:
            sender_name, title, file_type = m.group(1), m.group(2), m.group(3)
            sender, __ = Sender.objects.get_or_create(
                name=sender_name, defaults={"slug": slugify(sender_name)})
            return sender, title, file_type

        # That didn't work, so we assume sender is None
        m = re.match(self.PARSER_REGEX_TITLE, doc)
        return None, m.group(1), m.group(2)

View File

@ -0,0 +1,69 @@
import datetime
import imaplib
from django.conf import settings
from . import Consumer
class MailConsumerError(Exception):
    """Raised when the mail consumer cannot log in or select its mailbox."""
class MailConsumer(Consumer):
    """
    Consumer that fetches messages from an IMAP mailbox configured through
    ``settings.MAIL_CONSUMPTION``.  It is disabled when no host is set.
    """

    def __init__(self, *args, **kwargs):

        Consumer.__init__(self, *args, **kwargs)

        self._connection = None
        self._host = settings.MAIL_CONSUMPTION["HOST"]
        self._port = settings.MAIL_CONSUMPTION["PORT"]
        self._username = settings.MAIL_CONSUMPTION["USERNAME"]
        self._password = settings.MAIL_CONSUMPTION["PASSWORD"]
        self._inbox = settings.MAIL_CONSUMPTION["INBOX"]

        self._enabled = bool(self._host)

        self.last_checked = datetime.datetime.now()

    def _connect(self):
        """Open an SSL connection to the configured IMAP server."""
        # The port may be unset or given as a string, so normalise it and
        # fall back to the standard IMAP-over-SSL port.
        port = int(self._port) if self._port else imaplib.IMAP4_SSL_PORT
        self._connection = imaplib.IMAP4_SSL(self._host, port)

    def _login(self):
        """
        Authenticate and select the configured mailbox.

        :raises MailConsumerError: If either step does not report "OK".
        """
        login = self._connection.login(self._username, self._password)
        if login[0] != "OK":
            raise MailConsumerError("Can't log into mail: {}".format(login[1]))

        # Select the mailbox named in the settings rather than a fixed name.
        inbox = self._connection.select(self._inbox)
        if inbox[0] != "OK":
            raise MailConsumerError("Can't find the inbox: {}".format(inbox[1]))

    def _fetch(self):
        """
        Yield ``data[0][1]`` of the "(RFC822)" fetch response for every
        message found by an "ALL" search of the selected mailbox.
        """
        for num in self._connection.search(None, "ALL")[1][0].split():
            typ, data = self._connection.fetch(num, "(RFC822)")
            # self._connection.store(num, "+FLAGS", "\\Deleted")
            yield data[0][1]

    def consume(self):
        """Fetch messages if mail consumption is enabled, then update
        ``last_checked``."""
        if self._enabled:
            self.get_messages()

        self.last_checked = datetime.datetime.now()

    def get_messages(self):
        """
        Connect, log in, and process every message in the mailbox.  The
        session is logged out even if a step in between raises.
        """
        self._connect()
        try:
            self._login()

            for message in self._fetch():
                print(message)  # Now we have to do something with the attachment

            self._connection.expunge()
            self._connection.close()
        finally:
            self._connection.logout()

    def _guess_file_attributes(self, doc):
        """Return placeholder attributes: no sender, no title, type "jpg"."""
        return None, None, "jpg"

View File

@ -1,29 +1,12 @@
import datetime
import glob
import langdetect
import os
import random
import re
import subprocess
import time
import pyocr
from PIL import Image
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.template.defaultfilters import slugify
from django.utils import timezone
from paperless.db import GnuPG
from ...languages import ISO639
from ...models import Document, Sender, Tag
class OCRError(BaseException):
pass
from ...consumers import (
FileConsumer, FileConsumerError, MailConsumer, MailConsumerError)
class Command(BaseCommand):
@ -37,25 +20,16 @@ class Command(BaseCommand):
"""
LOOP_TIME = 10 # Seconds
MAIL_DELTA = datetime.timedelta(minutes=10)
CONVERT = settings.CONVERT_BINARY
SCRATCH = settings.SCRATCH_DIR
CONSUME = settings.CONSUMPTION_DIR
OCR = pyocr.get_available_tools()[0]
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
MEDIA_DOCS = os.path.join(settings.MEDIA_ROOT, "documents")
PARSER_REGEX_TITLE = re.compile(
r"^.*/(.*)\.(pdf|jpe?g|png|gif|tiff)$", flags=re.IGNORECASE)
PARSER_REGEX_SENDER_TITLE = re.compile(
r"^.*/(.*) - (.*)\.(pdf|jpe?g|png|gif|tiff)", flags=re.IGNORECASE)
def __init__(self, *args, **kwargs):
self.verbosity = 0
self.stats = {}
self._ignore = []
self.file_consumer = None
self.mail_consumer = None
BaseCommand.__init__(self, *args, **kwargs)
@ -63,7 +37,16 @@ class Command(BaseCommand):
self.verbosity = options["verbosity"]
self._setup()
try:
self.file_consumer = FileConsumer(verbosity=self.verbosity)
self.mail_consumer = MailConsumer(verbosity=self.verbosity)
except (FileConsumerError, MailConsumerError) as e:
raise CommandError(e)
try:
os.makedirs(self.MEDIA_DOCS)
except FileExistsError:
pass
try:
while True:
@ -76,196 +59,11 @@ class Command(BaseCommand):
def loop(self):
for doc in os.listdir(self.CONSUME):
self.file_consumer.consume()
doc = os.path.join(self.CONSUME, doc)
if not os.path.isfile(doc):
continue
if not re.match(self.PARSER_REGEX_TITLE, doc):
continue
if doc in self._ignore:
continue
if self._is_ready(doc):
continue
self._render("Consuming {}".format(doc), 1)
pngs = self._get_greyscale(doc)
try:
text = self._get_ocr(pngs)
except OCRError:
self._ignore.append(doc)
self._render("OCR FAILURE: {}".format(doc), 0)
continue
self._store(text, doc)
self._cleanup(pngs, doc)
def _setup(self):
if not self.CONSUME:
raise CommandError(
"The CONSUMPTION_DIR settings variable does not appear to be "
"set."
)
if not os.path.exists(self.CONSUME):
raise CommandError("Consumption directory {} does not exist".format(
self.CONSUME))
for d in (self.SCRATCH, self.MEDIA_DOCS):
try:
os.makedirs(d)
except FileExistsError:
pass
def _is_ready(self, doc):
"""
Detect whether `doc` is ready to consume or if it's still being written
to by the scanner.
"""
t = os.stat(doc).st_mtime
if self.stats.get(doc) == t:
del(self.stats[doc])
return True
self.stats[doc] = t
return False
def _get_greyscale(self, doc):
self._render(" Generating greyscale image", 2)
i = random.randint(1000000, 9999999)
png = os.path.join(self.SCRATCH, "{}.png".format(i))
subprocess.Popen((
self.CONVERT, "-density", "300", "-depth", "8",
"-type", "grayscale", doc, png
)).wait()
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))))
def _get_ocr(self, pngs):
self._render(" OCRing the document", 2)
raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
guessed_language = langdetect.detect(raw_text)
self._render(" Language detected: {}".format(guessed_language), 2)
if guessed_language not in ISO639:
self._render("Language detection failed!", 0)
if settings.FORGIVING_OCR:
self._render(
"As FORGIVING_OCR is enabled, we're going to make the best "
"with what we have.",
1
)
return raw_text
raise OCRError
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
return raw_text
try:
return self._ocr(pngs, ISO639[guessed_language])
except pyocr.pyocr.tesseract.TesseractError:
if settings.FORGIVING_OCR:
self._render(
"OCR for {} failed, but we're going to stick with what "
"we've got since FORGIVING_OCR is enabled.".format(
guessed_language
),
0
)
return raw_text
raise OCRError
def _ocr(self, pngs, lang):
self._render(" Parsing for {}".format(lang), 2)
r = ""
for png in pngs:
with Image.open(os.path.join(self.SCRATCH, png)) as f:
self._render(" {}".format(f.filename), 3)
r += self.OCR.image_to_string(f, lang=lang)
# Strip out excess white space to allow matching to go smoother
return re.sub(r"\s+", " ", r)
def _store(self, text, doc):
sender, title, file_type = self._parse_file_name(doc)
lower_text = text.lower()
relevant_tags = [t for t in Tag.objects.all() if t.matches(lower_text)]
stats = os.stat(doc)
self._render(" Saving record to database", 2)
document = Document.objects.create(
sender=sender,
title=title,
content=text,
file_type=file_type,
created=timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime)),
modified=timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime))
)
if relevant_tags:
tag_names = ", ".join([t.slug for t in relevant_tags])
self._render(" Tagging with {}".format(tag_names), 2)
document.tags.add(*relevant_tags)
with open(doc, "rb") as unencrypted:
with open(document.source_path, "wb") as encrypted:
self._render(" Encrypting", 3)
encrypted.write(GnuPG.encrypted(unencrypted))
def _parse_file_name(self, doc):
"""
We use a crude naming convention to make handling the sender and title
easier:
"<sender> - <title>.<suffix>"
"""
# First we attempt "<sender> - <title>.<suffix>"
m = re.match(self.PARSER_REGEX_SENDER_TITLE, doc)
if m:
sender_name, title, file_type = m.group(1), m.group(2), m.group(3)
sender, __ = Sender.objects.get_or_create(
name=sender_name, defaults={"slug": slugify(sender_name)})
return sender, title, file_type
# That didn't work, so we assume sender is None
m = re.match(self.PARSER_REGEX_TITLE, doc)
return None, m.group(1), m.group(2)
def _cleanup(self, pngs, doc):
png_glob = os.path.join(
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0]))
for f in list(glob.glob(png_glob)) + [doc]:
self._render(" Deleting {}".format(f), 2)
os.unlink(f)
self._render("", 2)
now = datetime.datetime.now()
if self.mail_consumer.last_checked + self.MAIL_DELTA > now:
self.mail_consumer.consume()
def _render(self, text, verbosity):
if self.verbosity >= verbosity:

View File

@ -47,7 +47,7 @@ class Command(BaseCommand):
self._render("Exporting: {}".format(target), 1)
with open(target, "wb") as f:
f.write(GnuPG.decrypted(document.pdf))
f.write(GnuPG.decrypted(document.source_file))
t = int(time.mktime(document.created.timetuple()))
os.utime(target, times=(t, t))

View File

@ -162,7 +162,21 @@ SCRATCH_DIR = "/tmp/paperless"
# This is where Paperless will look for PDFs to index
CONSUMPTION_DIR = os.environ.get("PAPERLESS_CONSUME")
# Set this and change the permissions on this file to 0600, or set it to
# `None` and you'll be prompted for the passphrase at runtime. The default
# looks for an environment variable.
# If you want to use IMAP mail consumption, populate this with useful values.
# If you leave HOST set to None, we assume you're not going to use this feature.
# Note: values read with os.environ.get() are strings when the variable is set
# and None when it is not, so PORT is never an int here.
MAIL_CONSUMPTION = {
"HOST": os.environ.get("PAPERLESS_CONSUME_MAIL_HOST"),
"PORT": os.environ.get("PAPERLESS_CONSUME_MAIL_PORT"),
"USERNAME": os.environ.get("PAPERLESS_CONSUME_MAIL_USER"),
"PASSWORD": os.environ.get("PAPERLESS_CONSUME_MAIL_PASS"),
"USE_SSL": True, # If True, use SSL/TLS to connect
"INBOX": "INBOX" # The name of the inbox on the server
}
# This is used to encrypt the original documents and decrypt them later when you
# want to download them. Set it and change the permissions on this file to
# 0600, or set it to `None` and you'll be prompted for the passphrase at
# runtime. The default looks for an environment variable.
# DON'T FORGET TO SET THIS as leaving it blank may cause some strange things with
# GPG, including an interesting case where it may "encrypt" zero-byte files.
PASSPHRASE = os.environ.get("PAPERLESS_PASSPHRASE")