Merge pull request #101 from danielquinn/issue/89

Closes #89.
This commit is contained in:
Daniel Quinn 2016-03-28 14:25:56 +01:00
commit bbe691f342
14 changed files with 391 additions and 260 deletions

View File

@ -31,6 +31,13 @@ class MonthListFilter(admin.SimpleListFilter):
return queryset.filter(created__year=year, created__month=month)
class CorrespondentAdmin(admin.ModelAdmin):
list_display = ("name", "match", "matching_algorithm")
list_filter = ("matching_algorithm",)
list_editable = ("match", "matching_algorithm")
class TagAdmin(admin.ModelAdmin):
list_display = ("name", "colour", "match", "matching_algorithm")
@ -103,11 +110,11 @@ class DocumentAdmin(admin.ModelAdmin):
class LogAdmin(admin.ModelAdmin):
list_display = ("message", "level", "component")
list_filter = ("level", "component",)
list_display = ("message", "level",)
list_filter = ("level",)
admin.site.register(Correspondent)
admin.site.register(Correspondent, CorrespondentAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(Document, DocumentAdmin)
admin.site.register(Log, LogAdmin)

View File

@ -2,4 +2,15 @@ from django.apps import AppConfig
class DocumentsConfig(AppConfig):
name = 'documents'
name = "documents"
def ready(self):
from .signals import document_consumption_finished
from .signals.handlers import set_correspondent, set_tags
document_consumption_finished.connect(set_tags)
document_consumption_finished.connect(set_correspondent)
AppConfig.ready(self)

View File

@ -23,8 +23,10 @@ from pyocr.tesseract import TesseractError
from paperless.db import GnuPG
from .models import Tag, Document, Log, FileInfo
from .models import Tag, Document, FileInfo
from .languages import ISO639
from .signals import (
document_consumption_started, document_consumption_finished)
class OCRError(Exception):
@ -78,8 +80,7 @@ class Consumer(object):
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group,
"component": Log.COMPONENT_CONSUMER
"group": self.logging_group
})
def consume(self):
@ -104,22 +105,39 @@ class Consumer(object):
self.log("info", "Consuming {}".format(doc))
document_consumption_started.send(
sender=self.__class__,
filename=doc,
logging_group=self.logging_group
)
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
imgs = self._get_greyscale(tempdir, doc)
thumbnail = self._get_thumbnail(tempdir, doc)
try:
text = self._get_ocr(imgs)
self._store(text, doc, thumbnail)
document = self._store(self._get_ocr(imgs), doc, thumbnail)
except OCRError as e:
self._ignore.append(doc)
self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
self._cleanup_tempdir(tempdir)
continue
else:
self._cleanup_tempdir(tempdir)
self._cleanup_doc(doc)
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group
)
def _get_greyscale(self, tempdir, doc):
"""
Greyscale images are easier for Tesseract to OCR
@ -260,7 +278,6 @@ class Consumer(object):
def _store(self, text, doc, thumbnail):
file_info = FileInfo.from_path(doc)
relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
stats = os.stat(doc)
@ -277,6 +294,7 @@ class Consumer(object):
datetime.datetime.fromtimestamp(stats.st_mtime))
)
relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
if relevant_tags:
tag_names = ", ".join([t.slug for t in relevant_tags])
self.log("debug", "Tagging with {}".format(tag_names))
@ -296,6 +314,8 @@ class Consumer(object):
self.log("info", "Completed")
return document
def _cleanup_tempdir(self, d):
self.log("debug", "Deleting directory {}".format(d))
shutil.rmtree(d)

View File

@ -11,18 +11,11 @@ class PaperlessLogger(logging.StreamHandler):
logging.StreamHandler.emit(self, record)
if not hasattr(record, "component"):
return
# We have to do the import here or Django will barf when it tries to
# load this because the apps aren't loaded at that point
from .models import Log
kwargs = {
"message": record.msg,
"component": record.component,
"level": record.levelno,
}
kwargs = {"message": record.msg, "level": record.levelno}
if hasattr(record, "group"):
kwargs["group"] = record.group

View File

@ -33,8 +33,7 @@ class Loggable(object):
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group,
"component": Log.COMPONENT_MAIL
"group": self.logging_group
})

View File

@ -47,10 +47,7 @@ class Command(BaseCommand):
pass
logging.getLogger(__name__).info(
"Starting document consumer at {}".format(
settings.CONSUMPTION_DIR
),
extra={"component": Log.COMPONENT_CONSUMER}
"Starting document consumer at {}".format(settings.CONSUMPTION_DIR)
)
try:

View File

@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9.4 on 2016-03-25 21:11
from __future__ import unicode_literals
from django.db import migrations, models
import django.utils.timezone
class Migration(migrations.Migration):
dependencies = [
('documents', '0012_auto_20160305_0040'),
]
operations = [
migrations.AddField(
model_name='correspondent',
name='match',
field=models.CharField(blank=True, max_length=256),
),
migrations.AddField(
model_name='correspondent',
name='matching_algorithm',
field=models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. If you don\'t know what a regex is, you probably don\'t want this option.'),
),
migrations.AlterField(
model_name='document',
name='created',
field=models.DateTimeField(default=django.utils.timezone.now),
),
migrations.RemoveField(
model_name='log',
name='component',
),
]

View File

@ -15,50 +15,7 @@ from django.utils import timezone
from .managers import LogManager
class SluggedModel(models.Model):
name = models.CharField(max_length=128, unique=True)
slug = models.SlugField(blank=True)
class Meta(object):
abstract = True
def save(self, *args, **kwargs):
if not self.slug:
self.slug = slugify(self.name)
models.Model.save(self, *args, **kwargs)
def __str__(self):
return self.name
class Correspondent(SluggedModel):
# This regex is probably more restrictive than it needs to be, but it's
# better safe than sorry.
SAFE_REGEX = re.compile(r"^[\w\- ,.']+$")
class Meta(object):
ordering = ("name",)
class Tag(SluggedModel):
COLOURS = (
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc")
)
class MatchingModel(models.Model):
MATCH_ANY = 1
MATCH_ALL = 2
@ -71,7 +28,9 @@ class Tag(SluggedModel):
(MATCH_REGEX, "Regular Expression"),
)
colour = models.PositiveIntegerField(choices=COLOURS, default=1)
name = models.CharField(max_length=128, unique=True)
slug = models.SlugField(blank=True)
match = models.CharField(max_length=256, blank=True)
matching_algorithm = models.PositiveIntegerField(
choices=MATCHING_ALGORITHMS,
@ -88,6 +47,12 @@ class Tag(SluggedModel):
)
)
class Meta(object):
abstract = True
def __str__(self):
return self.name
@property
def conditions(self):
return "{}: \"{}\" ({})".format(
@ -131,8 +96,44 @@ class Tag(SluggedModel):
raise NotImplementedError("Unsupported matching algorithm")
def save(self, *args, **kwargs):
self.match = self.match.lower()
SluggedModel.save(self, *args, **kwargs)
if not self.slug:
self.slug = slugify(self.name)
models.Model.save(self, *args, **kwargs)
class Correspondent(MatchingModel):
# This regex is probably more restrictive than it needs to be, but it's
# better safe than sorry.
SAFE_REGEX = re.compile(r"^[\w\- ,.']+$")
class Meta(object):
ordering = ("name",)
class Tag(MatchingModel):
COLOURS = (
(1, "#a6cee3"),
(2, "#1f78b4"),
(3, "#b2df8a"),
(4, "#33a02c"),
(5, "#fb9a99"),
(6, "#e31a1c"),
(7, "#fdbf6f"),
(8, "#ff7f00"),
(9, "#cab2d6"),
(10, "#6a3d9a"),
(11, "#b15928"),
(12, "#000000"),
(13, "#cccccc")
)
colour = models.PositiveIntegerField(choices=COLOURS, default=1)
class Document(models.Model):
@ -219,17 +220,9 @@ class Log(models.Model):
(logging.CRITICAL, "Critical"),
)
COMPONENT_CONSUMER = 1
COMPONENT_MAIL = 2
COMPONENTS = (
(COMPONENT_CONSUMER, "Consumer"),
(COMPONENT_MAIL, "Mail Fetcher")
)
group = models.UUIDField(blank=True)
message = models.TextField()
level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO)
component = models.PositiveIntegerField(choices=COMPONENTS)
created = models.DateTimeField(auto_now_add=True)
modified = models.DateTimeField(auto_now=True)

View File

@ -0,0 +1,4 @@
from django.dispatch import Signal
document_consumption_started = Signal(providing_args=["filename"])
document_consumption_finished = Signal(providing_args=["document"])

View File

@ -0,0 +1,53 @@
import logging
from ..models import Correspondent, Tag
def logger(message, group):
logging.getLogger(__name__).debug(message, extra={"group": group})
def set_correspondent(sender, document=None, logging_group=None, **kwargs):
# No sense in assigning a correspondent when one is already set.
if document.correspondent:
return
# No matching correspondents, so no need to continue
potential_correspondents = list(Correspondent.match_all(document.content))
if not potential_correspondents:
return
potential_count = len(potential_correspondents)
selected = potential_correspondents[0]
if potential_count > 1:
message = "Detected {} potential correspondents, so we've opted for {}"
logger(
message.format(potential_count, selected),
logging_group
)
logger(
'Assigning correspondent "{}" to "{}" '.format(selected, document),
logging_group
)
document.correspondent = selected
document.save(update_fields=("correspondent",))
def set_tags(sender, document=None, logging_group=None, **kwargs):
current_tags = set(document.tags.all())
relevant_tags = set(Tag.match_all(document.content)) - current_tags
if not relevant_tags:
return
message = 'Tagging "{}" with "{}"'
logger(
message.format(document, ", ".join([t.slug for t in relevant_tags])),
logging_group
)
document.tags.add(*relevant_tags)

View File

@ -15,21 +15,9 @@ class TestPaperlessLog(TestCase):
self.logger = logging.getLogger(
"documents.management.commands.document_consumer")
def test_ignored(self):
with mock.patch("logging.StreamHandler.emit") as __:
self.assertEqual(Log.objects.all().count(), 0)
self.logger.info("This is an informational message")
self.logger.warning("This is an informational message")
self.logger.error("This is an informational message")
self.logger.critical("This is an informational message")
self.assertEqual(Log.objects.all().count(), 0)
def test_that_it_saves_at_all(self):
kw = {
"group": uuid.uuid4(),
"component": Log.COMPONENT_MAIL
}
kw = {"group": uuid.uuid4()}
self.assertEqual(Log.objects.all().count(), 0)
@ -53,14 +41,8 @@ class TestPaperlessLog(TestCase):
def test_groups(self):
kw1 = {
"group": uuid.uuid4(),
"component": Log.COMPONENT_MAIL
}
kw2 = {
"group": uuid.uuid4(),
"component": Log.COMPONENT_MAIL
}
kw1 = {"group": uuid.uuid4()}
kw2 = {"group": uuid.uuid4()}
self.assertEqual(Log.objects.all().count(), 0)
@ -86,49 +68,9 @@ class TestPaperlessLog(TestCase):
self.assertEqual(Log.objects.all().count(), 4)
self.assertEqual(Log.objects.filter(group=kw1["group"]).count(), 2)
def test_components(self):
c1 = Log.COMPONENT_CONSUMER
c2 = Log.COMPONENT_MAIL
kw1 = {
"group": uuid.uuid4(),
"component": c1
}
kw2 = {
"group": kw1["group"],
"component": c2
}
self.assertEqual(Log.objects.all().count(), 0)
with mock.patch("logging.StreamHandler.emit") as __:
# Debug messages are ignored by default
self.logger.debug("This is a debugging message", extra=kw1)
self.assertEqual(Log.objects.all().count(), 0)
self.logger.info("This is an informational message", extra=kw2)
self.assertEqual(Log.objects.all().count(), 1)
self.assertEqual(Log.objects.filter(component=c2).count(), 1)
self.logger.warning("This is an warning message", extra=kw1)
self.assertEqual(Log.objects.all().count(), 2)
self.assertEqual(Log.objects.filter(component=c1).count(), 1)
self.logger.error("This is an error message", extra=kw2)
self.assertEqual(Log.objects.all().count(), 3)
self.assertEqual(Log.objects.filter(component=c2).count(), 2)
self.logger.critical("This is a critical message", extra=kw1)
self.assertEqual(Log.objects.all().count(), 4)
self.assertEqual(Log.objects.filter(component=c1).count(), 2)
def test_groupped_query(self):
kw = {
"group": uuid.uuid4(),
"component": Log.COMPONENT_MAIL
}
kw = {"group": uuid.uuid4()}
with mock.patch("logging.StreamHandler.emit") as __:
self.logger.info("Message 0", extra=kw)
self.logger.info("Message 1", extra=kw)

View File

@ -0,0 +1,196 @@
from random import randint
from django.test import TestCase
from ..models import Correspondent, Document, Tag
from ..signals import document_consumption_finished
class TestMatching(TestCase):
def _test_matching(self, text, algorithm, true, false):
for klass in (Tag, Correspondent):
instance = klass.objects.create(
name=str(randint(10000, 99999)),
match=text,
matching_algorithm=getattr(klass, algorithm)
)
for string in true:
self.assertTrue(instance.matches(string))
for string in false:
self.assertFalse(instance.matches(string))
def test_match_all(self):
self._test_matching(
"alpha charlie gamma",
"MATCH_ALL",
("I have alpha, charlie, and gamma in me",),
(
"I have alpha in me",
"I have charlie in me",
"I have gamma in me",
"I have alpha and charlie in me",
"I have alphas, charlie, and gamma in me",
"I have alphas in me",
"I have bravo in me",
)
)
self._test_matching(
"12 34 56",
"MATCH_ALL",
(
"I have 12 34, and 56 in me",
),
(
"I have 12 in me",
"I have 34 in me",
"I have 56 in me",
"I have 12 and 34 in me",
"I have 120, 34, and 56 in me",
"I have 123456 in me",
"I have 01234567 in me",
)
)
def test_match_any(self):
self._test_matching(
"alpha charlie gamma",
"MATCH_ANY",
(
"I have alpha in me",
"I have charlie in me",
"I have gamma in me",
"I have alpha, charlie, and gamma in me",
"I have alpha and charlie in me",
),
(
"I have alphas in me",
"I have bravo in me",
)
)
self._test_matching(
"12 34 56",
"MATCH_ANY",
(
"I have 12 in me",
"I have 34 in me",
"I have 56 in me",
"I have 12 and 34 in me",
"I have 12, 34, and 56 in me",
"I have 120, 34, and 56 in me",
),
(
"I have 123456 in me",
"I have 01234567 in me",
)
)
def test_match_literal(self):
self._test_matching(
"alpha charlie gamma",
"MATCH_LITERAL",
(
"I have 'alpha charlie gamma' in me",
),
(
"I have alpha in me",
"I have charlie in me",
"I have gamma in me",
"I have alpha and charlie in me",
"I have alpha, charlie, and gamma in me",
"I have alphas, charlie, and gamma in me",
"I have alphas in me",
"I have bravo in me",
)
)
self._test_matching(
"12 34 56",
"MATCH_LITERAL",
(
"I have 12 34 56 in me",
),
(
"I have 12 in me",
"I have 34 in me",
"I have 56 in me",
"I have 12 and 34 in me",
"I have 12 34, and 56 in me",
"I have 120, 34, and 560 in me",
"I have 120, 340, and 560 in me",
"I have 123456 in me",
"I have 01234567 in me",
)
)
def test_match_regex(self):
self._test_matching(
"alpha\w+gamma",
"MATCH_REGEX",
(
"I have alpha_and_gamma in me",
"I have alphas_and_gamma in me",
),
(
"I have alpha in me",
"I have gamma in me",
"I have alpha and charlie in me",
"I have alpha,and,gamma in me",
"I have alpha and gamma in me",
"I have alpha, charlie, and gamma in me",
"I have alphas, charlie, and gamma in me",
"I have alphas in me",
)
)
class TestApplications(TestCase):
"""
We make use of document_consumption_finished, so we should test that it's
doing what we expect wrt to tag & correspondent matching.
"""
def setUp(self):
TestCase.setUp(self)
self.doc_contains = Document.objects.create(
content="I contain the keyword.", file_type="pdf")
def test_tag_applied_any(self):
t1 = Tag.objects.create(
name="test", match="keyword", matching_algorithm=Tag.MATCH_ANY)
document_consumption_finished.send(
sender=self.__class__, document=self.doc_contains)
self.assertTrue(list(self.doc_contains.tags.all()) == [t1])
def test_tag_not_applied(self):
Tag.objects.create(
name="test", match="no-match", matching_algorithm=Tag.MATCH_ANY)
document_consumption_finished.send(
sender=self.__class__, document=self.doc_contains)
self.assertTrue(list(self.doc_contains.tags.all()) == [])
def test_correspondent_applied(self):
correspondent = Correspondent.objects.create(
name="test",
match="keyword",
matching_algorithm=Correspondent.MATCH_ANY
)
document_consumption_finished.send(
sender=self.__class__, document=self.doc_contains)
self.assertTrue(self.doc_contains.correspondent == correspondent)
def test_correspondent_not_applied(self):
Tag.objects.create(
name="test",
match="no-match",
matching_algorithm=Correspondent.MATCH_ANY
)
document_consumption_finished.send(
sender=self.__class__, document=self.doc_contains)
self.assertEqual(self.doc_contains.correspondent, None)

View File

@ -1,119 +0,0 @@
from django.test import TestCase
from ..models import Tag
class TestTagMatching(TestCase):
def test_match_all(self):
t = Tag.objects.create(
name="Test 0",
match="alpha charlie gamma",
matching_algorithm=Tag.MATCH_ALL
)
self.assertFalse(t.matches("I have alpha in me"))
self.assertFalse(t.matches("I have charlie in me"))
self.assertFalse(t.matches("I have gamma in me"))
self.assertFalse(t.matches("I have alpha and charlie in me"))
self.assertTrue(t.matches("I have alpha, charlie, and gamma in me"))
self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
self.assertFalse(t.matches("I have alphas in me"))
self.assertFalse(t.matches("I have bravo in me"))
t = Tag.objects.create(
name="Test 1",
match="12 34 56",
matching_algorithm=Tag.MATCH_ALL
)
self.assertFalse(t.matches("I have 12 in me"))
self.assertFalse(t.matches("I have 34 in me"))
self.assertFalse(t.matches("I have 56 in me"))
self.assertFalse(t.matches("I have 12 and 34 in me"))
self.assertTrue(t.matches("I have 12 34, and 56 in me"))
self.assertFalse(t.matches("I have 120, 34, and 56 in me"))
self.assertFalse(t.matches("I have 123456 in me"))
self.assertFalse(t.matches("I have 01234567 in me"))
def test_match_any(self):
t = Tag.objects.create(
name="Test 0",
match="alpha charlie gamma",
matching_algorithm=Tag.MATCH_ANY
)
self.assertTrue(t.matches("I have alpha in me"))
self.assertTrue(t.matches("I have charlie in me"))
self.assertTrue(t.matches("I have gamma in me"))
self.assertTrue(t.matches("I have alpha and charlie in me"))
self.assertFalse(t.matches("I have alphas in me"))
self.assertFalse(t.matches("I have bravo in me"))
t = Tag.objects.create(
name="Test 1",
match="12 34 56",
matching_algorithm=Tag.MATCH_ANY
)
self.assertTrue(t.matches("I have 12 in me"))
self.assertTrue(t.matches("I have 34 in me"))
self.assertTrue(t.matches("I have 56 in me"))
self.assertTrue(t.matches("I have 12 and 34 in me"))
self.assertTrue(t.matches("I have 12 34, and 56 in me"))
self.assertTrue(t.matches("I have 120, 34, and 560 in me"))
self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
self.assertFalse(t.matches("I have 123456 in me"))
self.assertFalse(t.matches("I have 01234567 in me"))
def test_match_literal(self):
t = Tag.objects.create(
name="Test 0",
match="alpha charlie gamma",
matching_algorithm=Tag.MATCH_LITERAL
)
self.assertFalse(t.matches("I have alpha in me"))
self.assertFalse(t.matches("I have charlie in me"))
self.assertFalse(t.matches("I have gamma in me"))
self.assertFalse(t.matches("I have alpha and charlie in me"))
self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
self.assertTrue(t.matches("I have 'alpha charlie gamma' in me"))
self.assertFalse(t.matches("I have alphas in me"))
self.assertFalse(t.matches("I have bravo in me"))
t = Tag.objects.create(
name="Test 1",
match="12 34 56",
matching_algorithm=Tag.MATCH_LITERAL
)
self.assertFalse(t.matches("I have 12 in me"))
self.assertFalse(t.matches("I have 34 in me"))
self.assertFalse(t.matches("I have 56 in me"))
self.assertFalse(t.matches("I have 12 and 34 in me"))
self.assertFalse(t.matches("I have 12 34, and 56 in me"))
self.assertFalse(t.matches("I have 120, 34, and 560 in me"))
self.assertFalse(t.matches("I have 120, 340, and 560 in me"))
self.assertFalse(t.matches("I have 123456 in me"))
self.assertFalse(t.matches("I have 01234567 in me"))
self.assertTrue(t.matches("I have 12 34 56 in me"))
def test_match_regex(self):
t = Tag.objects.create(
name="Test 0",
match="alpha\w+gamma",
matching_algorithm=Tag.MATCH_REGEX
)
self.assertFalse(t.matches("I have alpha in me"))
self.assertFalse(t.matches("I have gamma in me"))
self.assertFalse(t.matches("I have alpha and charlie in me"))
self.assertTrue(t.matches("I have alpha_and_gamma in me"))
self.assertTrue(t.matches("I have alphas_and_gamma in me"))
self.assertFalse(t.matches("I have alpha,and,gamma in me"))
self.assertFalse(t.matches("I have alpha and gamma in me"))
self.assertFalse(t.matches("I have alpha, charlie, and gamma in me"))
self.assertFalse(t.matches("I have alphas, charlie, and gamma in me"))
self.assertFalse(t.matches("I have alphas in me"))

View File

@ -43,7 +43,7 @@ INSTALLED_APPS = [
"django_extensions",
"documents",
"documents.apps.DocumentsConfig",
"rest_framework",
"crispy_forms",