unified document matching, legacy and automatching work alongside now

This commit is contained in:
Jonas Winkler 2020-10-28 11:45:11 +01:00
parent 9e4147ac52
commit 11af74ba36
16 changed files with 629 additions and 225 deletions

View File

@ -10,9 +10,11 @@ class CorrespondentAdmin(admin.ModelAdmin):
list_display = (
"name",
"automatic_classification"
"match",
"matching_algorithm"
)
list_editable = ("automatic_classification",)
list_filter = ("matching_algorithm",)
list_editable = ("match", "matching_algorithm")
readonly_fields = ("slug",)
@ -22,11 +24,11 @@ class TagAdmin(admin.ModelAdmin):
list_display = (
"name",
"colour",
"automatic_classification"
"match",
"matching_algorithm"
)
list_filter = ("colour",)
list_editable = ("colour", "automatic_classification")
list_filter = ("colour", "matching_algorithm")
list_editable = ("colour", "match", "matching_algorithm")
readonly_fields = ("slug",)
@ -35,10 +37,11 @@ class DocumentTypeAdmin(admin.ModelAdmin):
list_display = (
"name",
"automatic_classification"
"match",
"matching_algorithm"
)
list_editable = ("automatic_classification",)
list_filter = ("matching_algorithm",)
list_editable = ("match", "matching_algorithm")
readonly_fields = ("slug",)

View File

@ -11,20 +11,25 @@ class DocumentsConfig(AppConfig):
from .signals import document_consumption_started
from .signals import document_consumption_finished
from .signals.handlers import (
classify_document,
add_inbox_tags,
run_pre_consume_script,
run_post_consume_script,
cleanup_document_deletion,
set_log_entry,
index_document
index_document,
set_correspondent,
set_document_type,
set_tags
)
document_consumption_started.connect(run_pre_consume_script)
document_consumption_finished.connect(classify_document)
document_consumption_finished.connect(index_document)
document_consumption_finished.connect(add_inbox_tags)
document_consumption_finished.connect(set_correspondent)
document_consumption_finished.connect(set_document_type)
document_consumption_finished.connect(set_tags)
document_consumption_finished.connect(set_log_entry)
document_consumption_finished.connect(run_post_consume_script)

View File

@ -6,7 +6,7 @@ from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from documents.models import Correspondent, DocumentType, Tag, Document
from documents.models import Document, MatchingModel
from paperless import settings
@ -34,6 +34,7 @@ class DocumentClassifier(object):
self.tags_classifier = None
self.correspondent_classifier = None
self.document_type_classifier = None
self.X = None
def reload(self):
if os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
@ -74,21 +75,24 @@ class DocumentClassifier(object):
y = -1
if doc.document_type:
if doc.document_type.automatic_classification:
if doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.document_type.id
labels_document_type.append(y)
y = -1
if doc.correspondent:
if doc.correspondent.automatic_classification:
if doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
y = doc.correspondent.id
labels_correspondent.append(y)
tags = [tag.id for tag in doc.tags.filter(
automatic_classification=True
matching_algorithm=MatchingModel.MATCH_AUTO
)]
labels_tags.append(tags)
if not data:
raise ValueError("No training data available.")
labels_tags_unique = set([tag for tags in labels_tags for tag in tags])
logging.getLogger(__name__).info(
"{} documents, {} tag(s), {} correspondent(s), "
@ -163,78 +167,37 @@ class DocumentClassifier(object):
"classifier."
)
def classify_document(
self, document, classify_correspondent=False,
classify_document_type=False, classify_tags=False,
replace_tags=False):
X = self.data_vectorizer.transform(
def update(self, document):
self.X = self.data_vectorizer.transform(
[preprocess_content(document.content)]
)
if classify_correspondent and self.correspondent_classifier:
self._classify_correspondent(X, document)
if classify_document_type and self.document_type_classifier:
self._classify_document_type(X, document)
if classify_tags and self.tags_classifier:
self._classify_tags(X, document, replace_tags)
document.save(update_fields=("correspondent", "document_type"))
def _classify_correspondent(self, X, document):
y = self.correspondent_classifier.predict(X)
correspondent_id = self.correspondent_binarizer.inverse_transform(y)[0]
try:
correspondent = None
def predict_correspondent(self):
if self.correspondent_classifier:
y = self.correspondent_classifier.predict(self.X)
correspondent_id = self.correspondent_binarizer.inverse_transform(y)[0]
if correspondent_id != -1:
correspondent = Correspondent.objects.get(id=correspondent_id)
logging.getLogger(__name__).info(
"Detected correspondent: {}".format(correspondent.name)
)
return correspondent_id
else:
logging.getLogger(__name__).info("Detected correspondent: -")
document.correspondent = correspondent
except Correspondent.DoesNotExist:
logging.getLogger(__name__).warning(
"Detected correspondent with id {} does not exist "
"anymore! Did you delete it?".format(correspondent_id)
)
return None
else:
return None
def _classify_document_type(self, X, document):
y = self.document_type_classifier.predict(X)
document_type_id = self.document_type_binarizer.inverse_transform(y)[0]
try:
document_type = None
def predict_document_type(self):
if self.document_type_classifier:
y = self.document_type_classifier.predict(self.X)
document_type_id = self.document_type_binarizer.inverse_transform(y)[0]
if document_type_id != -1:
document_type = DocumentType.objects.get(id=document_type_id)
logging.getLogger(__name__).info(
"Detected document type: {}".format(document_type.name)
)
return document_type_id
else:
logging.getLogger(__name__).info("Detected document type: -")
document.document_type = document_type
except DocumentType.DoesNotExist:
logging.getLogger(__name__).warning(
"Detected document type with id {} does not exist "
"anymore! Did you delete it?".format(document_type_id)
)
return None
else:
return None
def _classify_tags(self, X, document, replace_tags):
y = self.tags_classifier.predict(X)
tags_ids = self.tags_binarizer.inverse_transform(y)[0]
if replace_tags:
document.tags.clear()
for tag_id in tags_ids:
try:
tag = Tag.objects.get(id=tag_id)
logging.getLogger(__name__).info(
"Detected tag: {}".format(tag.name)
)
document.tags.add(tag)
except Tag.DoesNotExist:
logging.getLogger(__name__).warning(
"Detected tag with id {} does not exist anymore! Did "
"you delete it?".format(tag_id)
)
def predict_tags(self):
    """Return the tag ids predicted for the currently loaded features.

    The prediction is made on ``self.X``. When no tag classifier is
    available an empty list is returned instead.
    """
    if not self.tags_classifier:
        return []

    prediction = self.tags_classifier.predict(self.X)
    # inverse_transform yields one entry per input row; only a single row
    # is ever passed in here, so the first entry is the answer.
    return self.tags_binarizer.inverse_transform(prediction)[0]

View File

@ -11,6 +11,7 @@ from operator import itemgetter
from django.conf import settings
from django.utils import timezone
from paperless.db import GnuPG
from .classifier import DocumentClassifier
from .models import Document, FileInfo, Tag
from .parsers import ParseError
@ -49,6 +50,8 @@ class Consumer:
self.consume = consume
self.scratch = scratch
self.classifier = DocumentClassifier()
os.makedirs(self.scratch, exist_ok=True)
self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
@ -175,10 +178,22 @@ class Consumer:
"Document {} consumption finished".format(document)
)
classifier = None
try:
self.classifier.reload()
self.classifier.update(document)
classifier = self.classifier
except FileNotFoundError:
logging.getLogger(__name__).warning("Cannot classify documents, "
"classifier model file was not "
"found.")
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group
logging_group=self.logging_group,
classifier=classifier
)
return True

View File

@ -17,9 +17,14 @@ class Command(Renderable, BaseCommand):
BaseCommand.__init__(self, *args, **kwargs)
def handle(self, *args, **options):
clf = DocumentClassifier()
clf.train()
logging.getLogger(__name__).info(
"Saving models to {}...".format(settings.MODEL_FILE)
)
clf.save_classifier()
classifier = DocumentClassifier()
try:
classifier.train()
logging.getLogger(__name__).info(
"Saving models to {}...".format(settings.MODEL_FILE)
)
classifier.save_classifier()
except Exception as e:
logging.getLogger(__name__).error(
"Classifier error: " + str(e)
)

View File

@ -6,6 +6,7 @@ from documents.classifier import DocumentClassifier
from documents.models import Document, Tag
from ...mixins import Renderable
from ...signals.handlers import set_correspondent, set_document_type, set_tags
class Command(Renderable, BaseCommand):
@ -24,23 +25,39 @@ class Command(Renderable, BaseCommand):
def add_arguments(self, parser):
parser.add_argument(
"-c", "--correspondent",
default=False,
action="store_true"
)
parser.add_argument(
"-T", "--tags",
default=False,
action="store_true"
)
parser.add_argument(
"-t", "--type",
"-t", "--document_type",
default=False,
action="store_true"
)
parser.add_argument(
"-i", "--inbox-only",
default=False,
action="store_true"
)
parser.add_argument(
"-r", "--replace-tags",
action="store_true"
"--use-first",
default=False,
action="store_true",
help="By default this command won't try to assign a correspondent "
"if more than one matches the document. Use this flag if "
"you'd rather it just pick the first one it finds."
)
parser.add_argument(
    "-f", "--overwrite",
    default=False,
    action="store_true",
    # Adjacent string literals are joined verbatim, so every fragment
    # needs its own trailing space to keep the words apart.
    help="If set, the document retagger will overwrite any previously "
         "set correspondent, document type and remove correspondents, "
         "types and tags that do not match anymore due to changed rules."
)
def handle(self, *args, **options):
@ -53,24 +70,41 @@ class Command(Renderable, BaseCommand):
queryset = Document.objects.all()
documents = queryset.distinct()
logging.getLogger(__name__).info("Loading classifier")
clf = DocumentClassifier()
classifier = DocumentClassifier()
try:
clf.reload()
classifier.reload()
except FileNotFoundError:
logging.getLogger(__name__).fatal("Cannot classify documents, "
logging.getLogger(__name__).warning("Cannot classify documents, "
"classifier model file was not "
"found.")
return
classifier = None
for document in documents:
logging.getLogger(__name__).info(
"Processing document {}".format(document.title)
)
clf.classify_document(
document,
classify_document_type=options["type"],
classify_tags=options["tags"],
classify_correspondent=options["correspondent"],
replace_tags=options["replace_tags"]
)
if classifier:
classifier.update(document)
if options['correspondent']:
set_correspondent(
sender=None,
document=document,
classifier=classifier,
replace=options['overwrite'],
use_first=options['use_first'])
if options['document_type']:
set_document_type(sender=None,
document=document,
classifier=classifier,
replace=options['overwrite'],
use_first=options['use_first'])
if options['tags']:
set_tags(
sender=None,
document=document,
classifier=classifier,
replace=options['overwrite'])

97
src/documents/matching.py Normal file
View File

@ -0,0 +1,97 @@
import re
from fuzzywuzzy import fuzz
from documents.models import MatchingModel, Correspondent, DocumentType, Tag
def match_correspondents(document_content, classifier):
    """Collect every correspondent that applies to the given text.

    A correspondent is selected when its own matching rule accepts
    ``document_content`` or when its id equals the id predicted by
    ``classifier``. ``classifier`` may be falsy, in which case only the
    matching rules are evaluated.

    Returns a list of ``Correspondent`` instances.
    """
    if classifier:
        predicted_id = classifier.predict_correspondent()
    else:
        predicted_id = None

    selected = []
    for candidate in Correspondent.objects.all():
        if matches(candidate, document_content) \
                or candidate.id == predicted_id:
            selected.append(candidate)
    return selected
def match_document_types(document_content, classifier):
    """Collect every document type that applies to the given text.

    A document type is selected when its own matching rule accepts
    ``document_content`` or when its id equals the id predicted by
    ``classifier``. ``classifier`` may be falsy, in which case only the
    matching rules are evaluated.

    Returns a list of ``DocumentType`` instances.
    """
    if classifier:
        predicted_id = classifier.predict_document_type()
    else:
        predicted_id = None

    selected = []
    for candidate in DocumentType.objects.all():
        if matches(candidate, document_content) \
                or candidate.id == predicted_id:
            selected.append(candidate)
    return selected
def match_tags(document_content, classifier):
    """Collect every tag that applies to the given text.

    A tag is selected when its own matching rule accepts
    ``document_content`` or when its id is among the ids predicted by
    ``classifier``. ``classifier`` may be falsy, in which case only the
    matching rules are evaluated.

    Returns a list of ``Tag`` instances.
    """
    if classifier:
        predicted_ids = classifier.predict_tags()
    else:
        predicted_ids = []

    selected = []
    for candidate in Tag.objects.all():
        if matches(candidate, document_content) \
                or candidate.id in predicted_ids:
            selected.append(candidate)
    return selected
def matches(matching_model, document_content):
    """Decide whether a matching model's rule applies to a document text.

    ``matching_model`` supplies ``match`` (the pattern text),
    ``matching_algorithm`` (one of the ``MatchingModel.MATCH_*`` constants)
    and ``is_insensitive``. ``document_content`` is the text to search; it
    is lower-cased before any comparison takes place.

    Algorithms:

    * ``MATCH_ALL``: every keyword must occur as a whole word.
    * ``MATCH_ANY``: at least one keyword must occur as a whole word.
    * ``MATCH_LITERAL``: the pattern text must occur verbatim, delimited
      by word boundaries.
    * ``MATCH_REGEX``: the pattern text is searched as a regular
      expression.
    * ``MATCH_FUZZY``: punctuation is stripped from both sides and a
      partial ratio of at least 90 counts as a match.
    * ``MATCH_AUTO``: never matches here; handled by the classifier.

    Returns ``True`` or ``False``. An empty (whitespace-only) pattern never
    matches.

    Raises ``NotImplementedError`` for an unknown algorithm and
    ``re.error`` when a ``MATCH_REGEX`` pattern is not a valid expression.
    """
    search_kwargs = {}

    document_content = document_content.lower()

    # Check that match is not empty
    if matching_model.match.strip() == "":
        return False

    if matching_model.is_insensitive:
        search_kwargs = {"flags": re.IGNORECASE}

    algorithm = matching_model.matching_algorithm

    if algorithm == MatchingModel.MATCH_ALL:
        return all(
            re.search(
                r"\b{}\b".format(word), document_content, **search_kwargs)
            for word in _split_match(matching_model)
        )

    if algorithm == MatchingModel.MATCH_ANY:
        return any(
            re.search(
                r"\b{}\b".format(word), document_content, **search_kwargs)
            for word in _split_match(matching_model)
        )

    if algorithm == MatchingModel.MATCH_LITERAL:
        # The text is meant literally, so regex metacharacters in it must
        # not be interpreted (and must not be able to raise re.error).
        return bool(re.search(
            r"\b{}\b".format(re.escape(matching_model.match)),
            document_content,
            **search_kwargs))

    if algorithm == MatchingModel.MATCH_REGEX:
        return bool(re.search(
            re.compile(matching_model.match, **search_kwargs),
            document_content))

    if algorithm == MatchingModel.MATCH_FUZZY:
        match = re.sub(r'[^\w\s]', '', matching_model.match)
        text = re.sub(r'[^\w\s]', '', document_content)
        if matching_model.is_insensitive:
            # The document text was already lower-cased above.
            match = match.lower()
        return fuzz.partial_ratio(match, text) >= 90

    if algorithm == MatchingModel.MATCH_AUTO:
        # this is done elsewhere.
        return False

    raise NotImplementedError("Unsupported matching algorithm")
def _split_match(matching_model):
    r"""
    Splits the match to individual keywords, getting rid of unnecessary
    spaces and grouping quoted words together.

    Every keyword is returned as a regular expression fragment: its
    characters are escaped so they are matched literally, and whitespace
    inside a quoted group becomes ``\s+`` so any run of whitespace in the
    document is accepted.

    Example:
      ' some random words "with quotes " and spaces'
        ==>
      ["some", "random", "words", "with\s+quotes", "and", "spaces"]
    """
    find_terms = re.compile(r'"([^"]+)"|(\S+)').findall
    return [
        # Escape each word on its own so the joining \s+ stays a regex
        # token while user supplied characters such as "." or "+" do not.
        r"\s+".join(re.escape(piece) for piece in (quoted or bare).split())
        for quoted, bare in find_terms(matching_model.match)
    ]

View File

@ -12,9 +12,8 @@ def re_slug_all_the_things(apps, schema_editor):
Tag = apps.get_model("documents", "Tag")
Correspondent = apps.get_model("documents", "Correspondent")
DocumentType = apps.get_model("documents", "DocumentType")
for klass in (Tag, Correspondent, DocumentType):
for klass in (Tag, Correspondent):
for instance in klass.objects.all():
klass.objects.filter(
pk=instance.pk
@ -26,7 +25,7 @@ def re_slug_all_the_things(apps, schema_editor):
class Migration(migrations.Migration):
dependencies = [
('documents', '1003_auto_20180904_1425'),
('documents', '0021_document_storage_type'),
]
operations = [
@ -49,10 +48,5 @@ class Migration(migrations.Migration):
name='slug',
field=models.SlugField(blank=True, editable=False),
),
migrations.AlterField(
model_name='documenttype',
name='slug',
field=models.SlugField(blank=True, editable=False),
),
migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop)
]

View File

@ -6,7 +6,7 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '0021_document_storage_type'),
('documents', '0022_auto_20181007_1420'),
]
operations = [

View File

@ -16,7 +16,7 @@ class Migration(migrations.Migration):
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=128, unique=True)),
('slug', models.SlugField(blank=True)),
('slug', models.SlugField(blank=True, editable=False)),
('match', models.CharField(blank=True, max_length=256)),
('matching_algorithm', models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.')),
('is_insensitive', models.BooleanField(default=True)),

View File

@ -1,77 +0,0 @@
# Generated by Django 2.0.8 on 2018-09-04 14:25
from django.db import migrations, models
def transfer_automatic_classification(apps, schema_editor):
for model_name in ["Tag", "Correspondent", "DocumentType"]:
model_class = apps.get_model("documents", model_name)
for o in model_class.objects.all():
o.automatic_classification = o.match is not None and len(o.match) > 0
o.save()
def reverse_automatic_classification(apps, schema_editor):
pass
class Migration(migrations.Migration):
dependencies = [
('documents', '1002_auto_20180823_1155'),
]
operations = [
migrations.AddField(
model_name='correspondent',
name='automatic_classification',
field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
),
migrations.AddField(
model_name='documenttype',
name='automatic_classification',
field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
),
migrations.AddField(
model_name='tag',
name='automatic_classification',
field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
),
migrations.RunPython(transfer_automatic_classification, reverse_automatic_classification),
migrations.RemoveField(
model_name='correspondent',
name='is_insensitive',
),
migrations.RemoveField(
model_name='correspondent',
name='match',
),
migrations.RemoveField(
model_name='correspondent',
name='matching_algorithm',
),
migrations.RemoveField(
model_name='documenttype',
name='is_insensitive',
),
migrations.RemoveField(
model_name='documenttype',
name='match',
),
migrations.RemoveField(
model_name='documenttype',
name='matching_algorithm',
),
migrations.RemoveField(
model_name='tag',
name='is_insensitive',
),
migrations.RemoveField(
model_name='tag',
name='match',
),
migrations.RemoveField(
model_name='tag',
name='matching_algorithm',
),
]

View File

@ -7,13 +7,11 @@ import uuid
from collections import OrderedDict
import dateutil.parser
from django.dispatch import receiver
from django.conf import settings
from django.db import models
from django.template.defaultfilters import slugify
from django.utils import timezone
from django.utils.text import slugify
from collections import defaultdict
from .managers import LogManager
@ -25,15 +23,46 @@ except ImportError:
class MatchingModel(models.Model):
MATCH_ANY = 1
MATCH_ALL = 2
MATCH_LITERAL = 3
MATCH_REGEX = 4
MATCH_FUZZY = 5
MATCH_AUTO = 6
MATCHING_ALGORITHMS = (
(MATCH_ANY, "Any"),
(MATCH_ALL, "All"),
(MATCH_LITERAL, "Literal"),
(MATCH_REGEX, "Regular Expression"),
(MATCH_FUZZY, "Fuzzy Match"),
(MATCH_AUTO, "Automatic Classification"),
)
name = models.CharField(max_length=128, unique=True)
slug = models.SlugField(blank=True, editable=False)
automatic_classification = models.BooleanField(
default=False,
help_text="Automatically assign to newly added documents based on "
"current usage in your document collection."
match = models.CharField(max_length=256, blank=True)
matching_algorithm = models.PositiveIntegerField(
choices=MATCHING_ALGORITHMS,
default=MATCH_ANY,
help_text=(
"Which algorithm you want to use when matching text to the OCR'd "
"PDF. Here, \"any\" looks for any occurrence of any word "
"provided in the PDF, while \"all\" requires that every word "
"provided appear in the PDF, albeit not in the order provided. A "
"\"literal\" match means that the text you enter must appear in "
"the PDF exactly as you've entered it, and \"regular expression\" "
"uses a regex to match the PDF. (If you don't know what a regex "
"is, you probably don't want this option.) Finally, a \"fuzzy "
"match\" looks for words or phrases that are mostly—but not "
"exactly—the same, which can be useful for matching against "
"documents containg imperfections that foil accurate OCR."
)
)
is_insensitive = models.BooleanField(default=True)
class Meta:
abstract = True
ordering = ("name",)
@ -43,6 +72,7 @@ class MatchingModel(models.Model):
def save(self, *args, **kwargs):
self.match = self.match.lower()
self.slug = slugify(self.name)
models.Model.save(self, *args, **kwargs)

View File

@ -15,7 +15,9 @@ class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):
"id",
"slug",
"name",
"automatic_classification",
"match",
"matching_algorithm",
"is_insensitive",
"document_count",
"last_correspondence"
)
@ -31,7 +33,9 @@ class DocumentTypeSerializer(serializers.HyperlinkedModelSerializer):
"id",
"slug",
"name",
"automatic_classification",
"match",
"matching_algorithm",
"is_insensitive",
"document_count"
)
@ -47,7 +51,9 @@ class TagSerializer(serializers.HyperlinkedModelSerializer):
"slug",
"name",
"colour",
"automatic_classification",
"match",
"matching_algorithm",
"is_insensitive",
"is_inbox_tag",
"document_count"
)

View File

@ -9,7 +9,7 @@ from django.contrib.contenttypes.models import ContentType
from django.utils import timezone
from documents.classifier import DocumentClassifier
from .. import index
from .. import index, matching
from ..models import Document, Tag
@ -17,35 +17,107 @@ def logger(message, group):
logging.getLogger(__name__).debug(message, extra={"group": group})
#TODO: global? really?
classifier = DocumentClassifier()
def index_document(sender, document=None, logging_group=None, **kwargs):
index.add_document_to_index(sender, instance=document)
def classify_document(sender, document=None, logging_group=None, **kwargs):
global classifier
try:
classifier.reload()
classifier.classify_document(
document,
classify_correspondent=True,
classify_tags=True,
classify_document_type=True
)
except FileNotFoundError:
logging.getLogger(__name__).fatal(
"Cannot classify document, classifier model file was not found."
)
def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
inbox_tags = Tag.objects.filter(is_inbox_tag=True)
document.tags.add(*inbox_tags)
def set_correspondent(sender, document=None, logging_group=None,
                      classifier=None, replace=False, use_first=True,
                      **kwargs):
    """Assign a matching correspondent to ``document``.

    Candidates come from ``matching.match_correspondents`` using the
    document content and the optional ``classifier``.

    * A document that already has a correspondent is left untouched
      unless ``replace`` is true.
    * With several candidates the first one is taken when ``use_first``
      is true; otherwise nothing is assigned.
    * With no candidate the correspondent is cleared when ``replace`` is
      true; otherwise the document is not modified at all.
    """
    if document.correspondent and not replace:
        return

    potential_correspondents = matching.match_correspondents(
        document.content, classifier)
    potential_count = len(potential_correspondents)

    if potential_correspondents:
        selected = potential_correspondents[0]
    else:
        selected = None

    if potential_count > 1:
        if use_first:
            message = (
                "Detected {} potential correspondents, "
                "so we've opted for {}"
            )
            logger(
                message.format(potential_count, selected),
                logging_group
            )
        else:
            message = (
                "Detected {} potential correspondents, "
                "not assigning any correspondent"
            )
            logger(
                message.format(potential_count),
                logging_group
            )
            return

    if selected is None and not replace:
        # Nothing matched and the document has no correspondent to clear
        # (guarded above), so skip the misleading log line and the
        # redundant database write.
        return

    logger(
        'Assigning correspondent "{}" to "{}" '.format(selected, document),
        logging_group
    )

    document.correspondent = selected
    document.save(update_fields=("correspondent",))
def set_document_type(sender, document=None, logging_group=None,
                      classifier=None, replace=False, use_first=True,
                      **kwargs):
    """Assign a matching document type to ``document``.

    Candidates come from ``matching.match_document_types`` using the
    document content and the optional ``classifier``.

    * A document that already has a document type is left untouched
      unless ``replace`` is true.
    * With several candidates the first one is taken when ``use_first``
      is true; otherwise nothing is assigned.
    * With no candidate the document type is cleared when ``replace`` is
      true; otherwise the document is not modified at all.
    """
    if document.document_type and not replace:
        return

    potential_document_type = matching.match_document_types(
        document.content, classifier)
    potential_count = len(potential_document_type)

    if potential_document_type:
        selected = potential_document_type[0]
    else:
        selected = None

    if potential_count > 1:
        if use_first:
            message = (
                "Detected {} potential document types, "
                "so we've opted for {}"
            )
            logger(
                message.format(potential_count, selected),
                logging_group
            )
        else:
            message = (
                "Detected {} potential document types, "
                "not assigning any document type"
            )
            logger(
                message.format(potential_count),
                logging_group
            )
            return

    if selected is None and not replace:
        # Nothing matched and the document has no document type to clear
        # (guarded above), so skip the misleading log line and the
        # redundant database write.
        return

    logger(
        'Assigning document type "{}" to "{}" '.format(selected, document),
        logging_group
    )

    document.document_type = selected
    document.save(update_fields=("document_type",))
def set_tags(sender, document=None, logging_group=None, classifier=None,
             replace=False, **kwargs):
    """Add every matching tag to ``document``.

    Candidates come from ``matching.match_tags`` using the document
    content and the optional ``classifier``. When ``replace`` is true all
    existing tags are removed first; otherwise tags the document already
    carries are skipped. Nothing is logged or written when no new tag
    applies.
    """
    if replace:
        document.tags.clear()
        already_present = set()
    else:
        already_present = set(document.tags.all())

    matched = matching.match_tags(document.content, classifier)
    relevant_tags = set(matched) - already_present

    if relevant_tags:
        slugs = ", ".join([tag.slug for tag in relevant_tags])
        logger(
            'Tagging "{}" with "{}"'.format(document, slugs),
            logging_group
        )
        document.tags.add(*relevant_tags)
def run_pre_consume_script(sender, filename, **kwargs):
if not settings.PRE_CONSUME_SCRIPT:

View File

@ -1,17 +1,18 @@
import factory
from factory import Faker
from factory.django import DjangoModelFactory
from ..models import Document, Correspondent
class CorrespondentFactory(factory.DjangoModelFactory):
class CorrespondentFactory(DjangoModelFactory):
class Meta:
model = Correspondent
name = factory.Faker("name")
name = Faker("name")
class DocumentFactory(factory.DjangoModelFactory):
class DocumentFactory(DjangoModelFactory):
class Meta:
model = Document

View File

@ -0,0 +1,256 @@
from random import randint
from django.contrib.admin.models import LogEntry
from django.contrib.auth.models import User
from django.test import TestCase, override_settings
from .. import matching
from ..models import Correspondent, Document, Tag, DocumentType
from ..signals import document_consumption_finished
class TestMatching(TestCase):
    """Checks ``matching.matches`` for each text based matching algorithm."""

    def _test_matching(self, text, algorithm, true, false):
        """Assert which strings a rule does and does not match.

        For each of Tag, Correspondent and DocumentType an instance is
        created with ``text`` as its match and the algorithm constant named
        by ``algorithm``. Every string in ``true`` must match and every
        string in ``false`` must not.
        """
        for klass in (Tag, Correspondent, DocumentType):
            instance = klass.objects.create(
                # a random name, since several instances are created per test
                name=str(randint(10000, 99999)),
                match=text,
                matching_algorithm=getattr(klass, algorithm)
            )
            for string in true:
                self.assertTrue(
                    matching.matches(instance, string),
                    '"%s" should match "%s" but it does not' % (text, string)
                )
            for string in false:
                self.assertFalse(
                    matching.matches(instance, string),
                    '"%s" should not match "%s" but it does' % (text, string)
                )

    def test_match_all(self):
        """MATCH_ALL needs every keyword to be present as a whole word."""
        self._test_matching(
            "alpha charlie gamma",
            "MATCH_ALL",
            ("I have alpha, charlie, and gamma in me",),
            (
                "I have alpha in me",
                "I have charlie in me",
                "I have gamma in me",
                "I have alpha and charlie in me",
                "I have alphas, charlie, and gamma in me",
                "I have alphas in me",
                "I have bravo in me",
            )
        )
        self._test_matching(
            "12 34 56",
            "MATCH_ALL",
            (
                "I have 12 34, and 56 in me",
            ),
            (
                "I have 12 in me",
                "I have 34 in me",
                "I have 56 in me",
                "I have 12 and 34 in me",
                "I have 120, 34, and 56 in me",
                "I have 123456 in me",
                "I have 01234567 in me",
            )
        )
        # a quoted group counts as a single keyword
        self._test_matching(
            'brown fox "lazy dogs"',
            "MATCH_ALL",
            (
                "the quick brown fox jumped over the lazy dogs",
                "the quick brown fox jumped over the lazy dogs",
            ),
            (
                "the quick fox jumped over the lazy dogs",
                "the quick brown wolf jumped over the lazy dogs",
                "the quick brown fox jumped over the fat dogs",
                "the quick brown fox jumped over the lazy... dogs",
            )
        )

    def test_match_any(self):
        """MATCH_ANY needs at least one keyword present as a whole word."""
        self._test_matching(
            "alpha charlie gamma",
            "MATCH_ANY",
            (
                "I have alpha in me",
                "I have charlie in me",
                "I have gamma in me",
                "I have alpha, charlie, and gamma in me",
                "I have alpha and charlie in me",
            ),
            (
                "I have alphas in me",
                "I have bravo in me",
            )
        )
        self._test_matching(
            "12 34 56",
            "MATCH_ANY",
            (
                "I have 12 in me",
                "I have 34 in me",
                "I have 56 in me",
                "I have 12 and 34 in me",
                "I have 12, 34, and 56 in me",
                "I have 120, 34, and 56 in me",
            ),
            (
                "I have 123456 in me",
                "I have 01234567 in me",
            )
        )
        # quoted groups with surrounding spaces inside the quotes
        self._test_matching(
            '"brown fox" " lazy dogs "',
            "MATCH_ANY",
            (
                "the quick brown fox",
                "jumped over the lazy dogs.",
            ),
            (
                "the lazy fox jumped over the brown dogs",
            )
        )

    def test_match_literal(self):
        """MATCH_LITERAL needs the whole match text to appear verbatim."""
        self._test_matching(
            "alpha charlie gamma",
            "MATCH_LITERAL",
            (
                "I have 'alpha charlie gamma' in me",
            ),
            (
                "I have alpha in me",
                "I have charlie in me",
                "I have gamma in me",
                "I have alpha and charlie in me",
                "I have alpha, charlie, and gamma in me",
                "I have alphas, charlie, and gamma in me",
                "I have alphas in me",
                "I have bravo in me",
            )
        )
        self._test_matching(
            "12 34 56",
            "MATCH_LITERAL",
            (
                "I have 12 34 56 in me",
            ),
            (
                "I have 12 in me",
                "I have 34 in me",
                "I have 56 in me",
                "I have 12 and 34 in me",
                "I have 12 34, and 56 in me",
                "I have 120, 34, and 560 in me",
                "I have 120, 340, and 560 in me",
                "I have 123456 in me",
                "I have 01234567 in me",
            )
        )

    def test_match_regex(self):
        """MATCH_REGEX searches the match text as a regular expression."""
        self._test_matching(
            r"alpha\w+gamma",
            "MATCH_REGEX",
            (
                "I have alpha_and_gamma in me",
                "I have alphas_and_gamma in me",
            ),
            (
                "I have alpha in me",
                "I have gamma in me",
                "I have alpha and charlie in me",
                "I have alpha,and,gamma in me",
                "I have alpha and gamma in me",
                "I have alpha, charlie, and gamma in me",
                "I have alphas, charlie, and gamma in me",
                "I have alphas in me",
            )
        )

    def test_match_fuzzy(self):
        """MATCH_FUZZY tolerates small differences such as OCR errors."""
        self._test_matching(
            "Springfield, Miss.",
            "MATCH_FUZZY",
            (
                "1220 Main Street, Springf eld, Miss.",
                "1220 Main Street, Spring field, Miss.",
                "1220 Main Street, Springfeld, Miss.",
                "1220 Main Street Springfield Miss",
            ),
            (
                "1220 Main Street, Springfield, Mich.",
            )
        )
@override_settings(POST_CONSUME_SCRIPT=None)
class TestDocumentConsumptionFinishedSignal(TestCase):
    """
    We make use of document_consumption_finished, so we should test that it's
    doing what we expect with regard to tag & correspondent matching.
    """

    def setUp(self):
        """Create a consumer user and a document containing "keyword"."""
        TestCase.setUp(self)
        User.objects.create_user(username='test_consumer', password='12345')
        self.doc_contains = Document.objects.create(
            content="I contain the keyword.", file_type="pdf")

    def test_tag_applied_any(self):
        """A tag whose keyword occurs in the document gets applied."""
        t1 = Tag.objects.create(
            name="test", match="keyword", matching_algorithm=Tag.MATCH_ANY)
        document_consumption_finished.send(
            sender=self.__class__, document=self.doc_contains)
        self.assertEqual(list(self.doc_contains.tags.all()), [t1])

    def test_tag_not_applied(self):
        """A tag whose keyword is absent from the document is not applied."""
        Tag.objects.create(
            name="test", match="no-match", matching_algorithm=Tag.MATCH_ANY)
        document_consumption_finished.send(
            sender=self.__class__, document=self.doc_contains)
        self.assertEqual(list(self.doc_contains.tags.all()), [])

    def test_correspondent_applied(self):
        """A correspondent whose keyword occurs gets assigned."""
        correspondent = Correspondent.objects.create(
            name="test",
            match="keyword",
            matching_algorithm=Correspondent.MATCH_ANY
        )
        document_consumption_finished.send(
            sender=self.__class__, document=self.doc_contains)
        self.assertEqual(self.doc_contains.correspondent, correspondent)

    def test_correspondent_not_applied(self):
        """A correspondent whose keyword is absent is not assigned."""
        # This has to be a Correspondent: creating a Tag here would leave
        # the test without any correspondent that could wrongly match.
        Correspondent.objects.create(
            name="test",
            match="no-match",
            matching_algorithm=Correspondent.MATCH_ANY
        )
        document_consumption_finished.send(
            sender=self.__class__, document=self.doc_contains)
        self.assertIsNone(self.doc_contains.correspondent)

    def test_logentry_created(self):
        """Sending the signal results in exactly one admin log entry."""
        document_consumption_finished.send(
            sender=self.__class__, document=self.doc_contains)
        self.assertEqual(LogEntry.objects.count(), 1)