Merged new logging system

This commit is contained in:
Daniel Quinn
2016-02-28 15:01:19 +00:00
19 changed files with 446 additions and 192 deletions

View File

@@ -3,7 +3,7 @@ from django.contrib.auth.models import User, Group
from django.core.urlresolvers import reverse
from django.templatetags.static import static
from .models import Sender, Tag, Document
from .models import Sender, Tag, Document, Log
class MonthListFilter(admin.SimpleListFilter):
@@ -57,7 +57,7 @@ class DocumentAdmin(admin.ModelAdmin):
r = ""
for tag in obj.tags.all():
colour = tag.get_colour_display()
r += html_tag(
r += self._html_tag(
"a",
tag.slug,
**{
@@ -73,9 +73,9 @@ class DocumentAdmin(admin.ModelAdmin):
tags_.allow_tags = True
def document(self, obj):
return html_tag(
return self._html_tag(
"a",
html_tag(
self._html_tag(
"img",
src=static("documents/img/{}.png".format(obj.file_type)),
width=22,
@@ -87,23 +87,32 @@ class DocumentAdmin(admin.ModelAdmin):
)
document.allow_tags = True
@staticmethod
def _html_tag(kind, inside=None, **kwargs):
attributes = []
for lft, rgt in kwargs.items():
attributes.append('{}="{}"'.format(lft, rgt))
if inside is not None:
return "<{kind} {attributes}>{inside}</{kind}>".format(
kind=kind, attributes=" ".join(attributes), inside=inside)
return "<{} {}/>".format(kind, " ".join(attributes))
class LogAdmin(admin.ModelAdmin):
list_display = ("message", "level", "component")
list_filter = ("level", "component",)
admin.site.register(Sender)
admin.site.register(Tag, TagAdmin)
admin.site.register(Document, DocumentAdmin)
admin.site.register(Log, LogAdmin)
# Unless we implement multi-user, these default registrations don't make sense.
admin.site.unregister(Group)
admin.site.unregister(User)
def html_tag(kind, inside=None, **kwargs):
attributes = []
for lft, rgt in kwargs.items():
attributes.append('{}="{}"'.format(lft, rgt))
if inside is not None:
return "<{kind} {attributes}>{inside}</{kind}>".format(
kind=kind, attributes=" ".join(attributes), inside=inside)
return "<{} {}/>".format(kind, " ".join(attributes))

View File

@@ -1,5 +1,8 @@
import datetime
import logging
import tempfile
import uuid
from multiprocessing.pool import Pool
import itertools
@@ -19,10 +22,9 @@ from django.utils import timezone
from django.template.defaultfilters import slugify
from pyocr.tesseract import TesseractError
from logger.models import Log
from paperless.db import GnuPG
from .models import Sender, Tag, Document
from .models import Sender, Tag, Document, Log
from .languages import ISO639
@@ -64,9 +66,10 @@ class Consumer(object):
flags=re.IGNORECASE
)
def __init__(self, verbosity=1):
def __init__(self):
self.verbosity = verbosity
self.logger = logging.getLogger(__name__)
self.logging_group = None
try:
os.makedirs(self.SCRATCH)
@@ -86,6 +89,12 @@ class Consumer(object):
raise ConsumerError(
"Consumption directory {} does not exist".format(self.CONSUME))
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group,
"component": Log.COMPONENT_CONSUMER
})
def consume(self):
for doc in os.listdir(self.CONSUME):
@@ -104,7 +113,9 @@ class Consumer(object):
if self._is_ready(doc):
continue
Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)
self.logging_group = uuid.uuid4()
self.log("info", "Consuming {}".format(doc))
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
pngs = self._get_greyscale(tempdir, doc)
@@ -114,8 +125,7 @@ class Consumer(object):
self._store(text, doc)
except OCRError:
self._ignore.append(doc)
Log.error(
"OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
self.log("error", "OCR FAILURE: {}".format(doc))
self._cleanup_tempdir(tempdir)
continue
else:
@@ -124,10 +134,7 @@ class Consumer(object):
def _get_greyscale(self, tempdir, doc):
Log.debug(
"Generating greyscale image from {}".format(doc),
Log.COMPONENT_CONSUMER
)
self.log("info", "Generating greyscale image from {}".format(doc))
png = os.path.join(tempdir, "convert-%04d.jpg")
@@ -143,18 +150,13 @@ class Consumer(object):
return sorted(filter(lambda __: os.path.isfile(__), pngs))
@staticmethod
def _guess_language(text):
def _guess_language(self, text):
try:
guess = langdetect.detect(text)
Log.debug(
"Language detected: {}".format(guess),
Log.COMPONENT_CONSUMER
)
self.log("debug", "Language detected: {}".format(guess))
return guess
except Exception as e:
Log.warning(
"Language detection error: {}".format(e), Log.COMPONENT_MAIL)
self.log("warning", "Language detection error: {}".format(e))
def _get_ocr(self, pngs):
"""
@@ -165,7 +167,7 @@ class Consumer(object):
if not pngs:
raise OCRError
Log.debug("OCRing the document", Log.COMPONENT_CONSUMER)
self.log("info", "OCRing the document")
# Since the division gets rounded down by int, this calculation works
# for every edge-case, i.e. 1
@@ -175,12 +177,12 @@ class Consumer(object):
guessed_language = self._guess_language(raw_text)
if not guessed_language or guessed_language not in ISO639:
Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER)
self.log("warning", "Language detection failed!")
if settings.FORGIVING_OCR:
Log.warning(
self.log(
"warning",
"As FORGIVING_OCR is enabled, we're going to make the "
"best with what we have.",
Log.COMPONENT_CONSUMER
"best with what we have."
)
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
return raw_text
@@ -194,12 +196,12 @@ class Consumer(object):
return self._ocr(pngs, ISO639[guessed_language])
except pyocr.pyocr.tesseract.TesseractError:
if settings.FORGIVING_OCR:
Log.warning(
self.log(
"warning",
"OCR for {} failed, but we're going to stick with what "
"we've got since FORGIVING_OCR is enabled.".format(
guessed_language
),
Log.COMPONENT_CONSUMER
)
)
raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
return raw_text
@@ -222,28 +224,15 @@ class Consumer(object):
if not pngs:
return ""
Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER)
self.log("info", "Parsing for {}".format(lang))
with Pool(processes=self.THREADS) as pool:
r = pool.map(
self.image_to_string, itertools.product(pngs, [lang]))
r = pool.map(image_to_string, itertools.product(pngs, [lang]))
r = " ".join(r)
# Strip out excess white space to allow matching to go smoother
return re.sub(r"\s+", " ", r)
def image_to_string(self, args):
png, lang = args
ocr = pyocr.get_available_tools()[0]
with Image.open(os.path.join(self.SCRATCH, png)) as f:
if ocr.can_detect_orientation():
try:
orientation = ocr.detect_orientation(f, lang=lang)
f = f.rotate(orientation["angle"], expand=1)
except TesseractError:
pass
return ocr.image_to_string(f, lang=lang)
def _guess_attributes_from_name(self, parseable):
"""
We use a crude naming convention to make handling the sender, title,
@@ -301,7 +290,7 @@ class Consumer(object):
stats = os.stat(doc)
Log.debug("Saving record to database", Log.COMPONENT_CONSUMER)
self.log("debug", "Saving record to database")
document = Document.objects.create(
sender=sender,
@@ -316,23 +305,22 @@ class Consumer(object):
if relevant_tags:
tag_names = ", ".join([t.slug for t in relevant_tags])
Log.debug(
"Tagging with {}".format(tag_names), Log.COMPONENT_CONSUMER)
self.log("debug", "Tagging with {}".format(tag_names))
document.tags.add(*relevant_tags)
with open(doc, "rb") as unencrypted:
with open(document.source_path, "wb") as encrypted:
Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
self.log("debug", "Encrypting")
encrypted.write(GnuPG.encrypted(unencrypted))
@staticmethod
def _cleanup_tempdir(d):
Log.debug("Deleting directory {}".format(d), Log.COMPONENT_CONSUMER)
self.log("info", "Completed")
def _cleanup_tempdir(self, d):
self.log("debug", "Deleting directory {}".format(d))
shutil.rmtree(d)
@staticmethod
def _cleanup_doc(doc):
Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
def _cleanup_doc(self, doc):
self.log("debug", "Deleting document {}".format(doc))
os.unlink(doc)
def _is_ready(self, doc):
@@ -350,3 +338,23 @@ class Consumer(object):
self.stats[doc] = t
return False
def image_to_string(args):
"""
I have no idea why, but if this function were a method of Consumer, it
would explode with:
`TypeError: cannot serialize '_io.TextIOWrapper' object`.
"""
png, lang = args
ocr = pyocr.get_available_tools()[0]
with Image.open(os.path.join(Consumer.SCRATCH, png)) as f:
if ocr.can_detect_orientation():
try:
orientation = ocr.detect_orientation(f, lang=lang)
f = f.rotate(orientation["angle"], expand=1)
except TesseractError:
pass
return ocr.image_to_string(f, lang=lang)

30
src/documents/loggers.py Normal file
View File

@@ -0,0 +1,30 @@
import logging
class PaperlessLogger(logging.StreamHandler):
"""
A logger smart enough to know to log some kinds of messages to the database
for later retrieval in a pretty interface.
"""
def emit(self, record):
logging.StreamHandler.emit(self, record)
if not hasattr(record, "component"):
return
# We have to do the import here or Django will barf when it tries to
# load this because the apps aren't loaded at that point
from .models import Log
kwargs = {
"message": record.msg,
"component": record.component,
"level": record.levelno,
}
if hasattr(record, "group"):
kwargs["group"] = record.group
Log.objects.create(**kwargs)

View File

@@ -1,8 +1,10 @@
import datetime
import imaplib
import logging
import os
import re
import time
import uuid
from base64 import b64decode
from email import policy
@@ -11,10 +13,8 @@ from dateutil import parser
from django.conf import settings
from logger.models import Log
from .consumer import Consumer
from .models import Sender
from .models import Sender, Log
class MailFetcherError(Exception):
@@ -25,7 +25,20 @@ class InvalidMessageError(Exception):
pass
class Message(object):
class Loggable(object):
def __init__(self, group=None):
self.logger = logging.getLogger(__name__)
self.logging_group = group or uuid.uuid4()
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group,
"component": Log.COMPONENT_MAIL
})
class Message(Loggable):
"""
A crude, but simple email message class. We assume that there's a subject
and n attachments, and that we don't care about the message body.
@@ -33,13 +46,13 @@ class Message(object):
SECRET = settings.UPLOAD_SHARED_SECRET
def __init__(self, data, verbosity=1):
def __init__(self, data, group=None):
"""
Cribbed heavily from
https://www.ianlewis.org/en/parsing-email-attachments-python
"""
self.verbosity = verbosity
Loggable.__init__(self, group=group)
self.subject = None
self.time = None
@@ -54,8 +67,7 @@ class Message(object):
self._set_time(message)
Log.info(
'Importing email: "{}"'.format(self.subject), Log.COMPONENT_MAIL)
self.log("info", 'Importing email: "{}"'.format(self.subject))
attachments = []
for part in message.walk():
@@ -134,9 +146,11 @@ class Attachment(object):
return self.data
class MailFetcher(object):
class MailFetcher(Loggable):
def __init__(self, verbosity=1):
def __init__(self):
Loggable.__init__(self)
self._connection = None
self._host = settings.MAIL_CONSUMPTION["HOST"]
@@ -148,7 +162,6 @@ class MailFetcher(object):
self._enabled = bool(self._host)
self.last_checked = datetime.datetime.now()
self.verbosity = verbosity
def pull(self):
"""
@@ -159,14 +172,14 @@ class MailFetcher(object):
if self._enabled:
Log.info("Checking mail", Log.COMPONENT_MAIL)
# Reset the grouping id for each fetch
self.logging_group = uuid.uuid4()
self.log("info", "Checking mail")
for message in self._get_messages():
Log.debug(
'Storing email: "{}"'.format(message.subject),
Log.COMPONENT_MAIL
)
self.log("info", 'Storing email: "{}"'.format(message.subject))
t = int(time.mktime(message.time.timetuple()))
file_name = os.path.join(Consumer.CONSUME, message.file_name)
@@ -193,7 +206,7 @@ class MailFetcher(object):
self._connection.logout()
except Exception as e:
Log.error(e, Log.COMPONENT_MAIL)
self.log("error", str(e))
return r
@@ -218,9 +231,9 @@ class MailFetcher(object):
message = None
try:
message = Message(data[0][1], self.verbosity)
message = Message(data[0][1], self.logging_group)
except InvalidMessageError as e:
Log.error(e, Log.COMPONENT_MAIL)
self.log("error", str(e))
else:
self._connection.store(num, "+FLAGS", "\\Deleted")

View File

@@ -34,7 +34,7 @@ class Command(BaseCommand):
self.verbosity = options["verbosity"]
try:
self.file_consumer = Consumer(verbosity=self.verbosity)
self.file_consumer = Consumer()
self.mail_fetcher = MailFetcher()
except (ConsumerError, MailFetcherError) as e:
raise CommandError(e)

70
src/documents/managers.py Normal file
View File

@@ -0,0 +1,70 @@
from django.conf import settings
from django.db import models
from django.db.models.aggregates import Max
class Concat(models.Aggregate):
"""
Theoretically, this should work in Sqlite, PostgreSQL, and MySQL, but I've
only ever tested it in Sqlite.
"""
ENGINE_SQLITE = 1
ENGINE_POSTGRESQL = 2
ENGINE_MYSQL = 3
ENGINES = {
"django.db.backends.sqlite3": ENGINE_SQLITE,
"django.db.backends.postgresql_psycopg2": ENGINE_POSTGRESQL,
"django.db.backends.postgresql": ENGINE_POSTGRESQL,
"django.db.backends.mysql": ENGINE_MYSQL
}
def __init__(self, expression, separator="\n", **extra):
self.engine = self._get_engine()
self.function = self._get_function()
self.template = self._get_template(separator)
models.Aggregate.__init__(
self,
expression,
output_field=models.CharField(),
**extra
)
def _get_engine(self):
engine = settings.DATABASES["default"]["ENGINE"]
try:
return self.ENGINES[engine]
except KeyError:
raise NotImplementedError(
"There's currently no support for {} when it comes to group "
"concatenation in Paperless".format(engine)
)
def _get_function(self):
if self.engine == self.ENGINE_POSTGRESQL:
return "STRING_AGG"
return "GROUP_CONCAT"
def _get_template(self, separator):
if self.engine == self.ENGINE_MYSQL:
return "%(function)s(%(expressions)s, SEPARATOR '{}')".format(
separator)
return "%(function)s(%(expressions)s, '{}')".format(separator)
class LogQuerySet(models.query.QuerySet):
def by_group(self):
return self.values("group").annotate(
time=Max("modified"),
messages=Concat("message"),
).order_by("-time")
class LogManager(models.Manager):
def get_queryset(self):
return LogQuerySet(self.model, using=self._db)

View File

@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9 on 2016-02-27 17:54
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '0009_auto_20160214_0040'),
]
operations = [
migrations.CreateModel(
name='Log',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('group', models.UUIDField(blank=True)),
('message', models.TextField()),
('level', models.PositiveIntegerField(choices=[(10, 'Debugging'), (20, 'Informational'), (30, 'Warning'), (40, 'Error'), (50, 'Critical')], default=20)),
('component', models.PositiveIntegerField(choices=[(1, 'Consumer'), (2, 'Mail Fetcher')])),
('created', models.DateTimeField(auto_now_add=True)),
('modified', models.DateTimeField(auto_now=True)),
],
options={
'ordering': ('-modified',),
},
),
]

View File

@@ -1,3 +1,4 @@
import logging
import os
import re
@@ -7,6 +8,8 @@ from django.db import models
from django.template.defaultfilters import slugify
from django.utils import timezone
from .managers import LogManager
class SluggedModel(models.Model):
@@ -187,3 +190,36 @@ class Document(models.Model):
@property
def download_url(self):
return reverse("fetch", kwargs={"pk": self.pk})
class Log(models.Model):
LEVELS = (
(logging.DEBUG, "Debugging"),
(logging.INFO, "Informational"),
(logging.WARNING, "Warning"),
(logging.ERROR, "Error"),
(logging.CRITICAL, "Critical"),
)
COMPONENT_CONSUMER = 1
COMPONENT_MAIL = 2
COMPONENTS = (
(COMPONENT_CONSUMER, "Consumer"),
(COMPONENT_MAIL, "Mail Fetcher")
)
group = models.UUIDField(blank=True)
message = models.TextField()
level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO)
component = models.PositiveIntegerField(choices=COMPONENTS)
created = models.DateTimeField(auto_now_add=True)
modified = models.DateTimeField(auto_now=True)
objects = LogManager()
class Meta(object):
ordering = ("-modified",)
def __str__(self):
return self.message

View File

@@ -0,0 +1,142 @@
import logging
import uuid
from unittest import mock
from django.test import TestCase
from ..models import Log
class TestPaperlessLog(TestCase):
def __init__(self, *args, **kwargs):
TestCase.__init__(self, *args, **kwargs)
self.logger = logging.getLogger(
"documents.management.commands.document_consumer")
def test_ignored(self):
with mock.patch("logging.StreamHandler.emit") as __:
self.assertEqual(Log.objects.all().count(), 0)
self.logger.info("This is an informational message")
self.logger.warning("This is an informational message")
self.logger.error("This is an informational message")
self.logger.critical("This is an informational message")
self.assertEqual(Log.objects.all().count(), 0)
def test_that_it_saves_at_all(self):
kw = {
"group": uuid.uuid4(),
"component": Log.COMPONENT_MAIL
}
self.assertEqual(Log.objects.all().count(), 0)
with mock.patch("logging.StreamHandler.emit") as __:
# Debug messages are ignored by default
self.logger.debug("This is a debugging message", extra=kw)
self.assertEqual(Log.objects.all().count(), 0)
self.logger.info("This is an informational message", extra=kw)
self.assertEqual(Log.objects.all().count(), 1)
self.logger.warning("This is an warning message", extra=kw)
self.assertEqual(Log.objects.all().count(), 2)
self.logger.error("This is an error message", extra=kw)
self.assertEqual(Log.objects.all().count(), 3)
self.logger.critical("This is a critical message", extra=kw)
self.assertEqual(Log.objects.all().count(), 4)
def test_groups(self):
kw1 = {
"group": uuid.uuid4(),
"component": Log.COMPONENT_MAIL
}
kw2 = {
"group": uuid.uuid4(),
"component": Log.COMPONENT_MAIL
}
self.assertEqual(Log.objects.all().count(), 0)
with mock.patch("logging.StreamHandler.emit") as __:
# Debug messages are ignored by default
self.logger.debug("This is a debugging message", extra=kw1)
self.assertEqual(Log.objects.all().count(), 0)
self.logger.info("This is an informational message", extra=kw2)
self.assertEqual(Log.objects.all().count(), 1)
self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 1)
self.logger.warning("This is an warning message", extra=kw1)
self.assertEqual(Log.objects.all().count(), 2)
self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 1)
self.logger.error("This is an error message", extra=kw2)
self.assertEqual(Log.objects.all().count(), 3)
self.assertEqual(Log.objects.filter(group=kw2["group"]).count(), 2)
self.logger.critical("This is a critical message", extra=kw1)
self.assertEqual(Log.objects.all().count(), 4)
self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 2)
def test_components(self):
c1 = Log.COMPONENT_CONSUMER
c2 = Log.COMPONENT_MAIL
kw1 = {
"group": uuid.uuid4(),
"component": c1
}
kw2 = {
"group": kw1["group"],
"component": c2
}
self.assertEqual(Log.objects.all().count(), 0)
with mock.patch("logging.StreamHandler.emit") as __:
# Debug messages are ignored by default
self.logger.debug("This is a debugging message", extra=kw1)
self.assertEqual(Log.objects.all().count(), 0)
self.logger.info("This is an informational message", extra=kw2)
self.assertEqual(Log.objects.all().count(), 1)
self.assertEqual(Log.objects.filter(component=c2).count(), 1)
self.logger.warning("This is an warning message", extra=kw1)
self.assertEqual(Log.objects.all().count(), 2)
self.assertEqual(Log.objects.filter(component=c1).count(), 1)
self.logger.error("This is an error message", extra=kw2)
self.assertEqual(Log.objects.all().count(), 3)
self.assertEqual(Log.objects.filter(component=c2).count(), 2)
self.logger.critical("This is a critical message", extra=kw1)
self.assertEqual(Log.objects.all().count(), 4)
self.assertEqual(Log.objects.filter(component=c1).count(), 2)
def test_groupped_query(self):
kw = {
"group": uuid.uuid4(),
"component": Log.COMPONENT_MAIL
}
with mock.patch("logging.StreamHandler.emit") as __:
self.logger.info("Message 0", extra=kw)
self.logger.info("Message 1", extra=kw)
self.logger.info("Message 2", extra=kw)
self.logger.info("Message 3", extra=kw)
self.assertEqual(Log.objects.all().by_group().count(), 1)
self.assertEqual(
Log.objects.all().by_group()[0]["Messages"],
"Message 0\nMessage 1\nMessage 2\nMessage 3"
)

View File

@@ -3,6 +3,7 @@ import os
import magic
from hashlib import md5
from unittest import mock
from django.conf import settings
from django.test import TestCase
@@ -27,7 +28,8 @@ class TestMessage(TestCase):
with open(self.sample, "rb") as f:
message = Message(f.read(), verbosity=0)
with mock.patch("logging.StreamHandler.emit") as __:
message = Message(f.read())
self.assertTrue(message)
self.assertEqual(message.subject, "Test 0")