unified document matching, legacy and automatching work alongside now
commit 11af74ba36
parent 9e4147ac52
@@ -10,9 +10,11 @@ class CorrespondentAdmin(admin.ModelAdmin):

     list_display = (
         "name",
-        "automatic_classification"
+        "match",
+        "matching_algorithm"
     )

-    list_editable = ("automatic_classification",)
+    list_filter = ("matching_algorithm",)
+    list_editable = ("match", "matching_algorithm")

     readonly_fields = ("slug",)

@@ -22,11 +24,11 @@ class TagAdmin(admin.ModelAdmin):

     list_display = (
         "name",
         "colour",
-        "automatic_classification"
+        "match",
+        "matching_algorithm"
     )

-    list_filter = ("colour",)
-    list_editable = ("colour", "automatic_classification")
+    list_filter = ("colour", "matching_algorithm")
+    list_editable = ("colour", "match", "matching_algorithm")

     readonly_fields = ("slug",)

@@ -35,10 +37,11 @@ class DocumentTypeAdmin(admin.ModelAdmin):

     list_display = (
         "name",
-        "automatic_classification"
+        "match",
+        "matching_algorithm"
     )

-    list_editable = ("automatic_classification",)
+    list_filter = ("matching_algorithm",)
+    list_editable = ("match", "matching_algorithm")

     readonly_fields = ("slug",)

@@ -11,20 +11,25 @@ class DocumentsConfig(AppConfig):

         from .signals import document_consumption_started
         from .signals import document_consumption_finished
         from .signals.handlers import (
-            classify_document,
             add_inbox_tags,
             run_pre_consume_script,
             run_post_consume_script,
             cleanup_document_deletion,
             set_log_entry,
-            index_document
+            index_document,
+            set_correspondent,
+            set_document_type,
+            set_tags
+
         )

         document_consumption_started.connect(run_pre_consume_script)

-        document_consumption_finished.connect(classify_document)
         document_consumption_finished.connect(index_document)
         document_consumption_finished.connect(add_inbox_tags)
+        document_consumption_finished.connect(set_correspondent)
+        document_consumption_finished.connect(set_document_type)
+        document_consumption_finished.connect(set_tags)
         document_consumption_finished.connect(set_log_entry)
         document_consumption_finished.connect(run_post_consume_script)

@@ -6,7 +6,7 @@ from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.neural_network import MLPClassifier
 from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer

-from documents.models import Correspondent, DocumentType, Tag, Document
+from documents.models import Document, MatchingModel
 from paperless import settings

@@ -34,6 +34,7 @@ class DocumentClassifier(object):
         self.tags_classifier = None
         self.correspondent_classifier = None
         self.document_type_classifier = None
+        self.X = None

     def reload(self):
         if os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
@@ -74,21 +75,24 @@ class DocumentClassifier(object):

             y = -1
             if doc.document_type:
-                if doc.document_type.automatic_classification:
+                if doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
                     y = doc.document_type.id
             labels_document_type.append(y)

             y = -1
             if doc.correspondent:
-                if doc.correspondent.automatic_classification:
+                if doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
                     y = doc.correspondent.id
             labels_correspondent.append(y)

             tags = [tag.id for tag in doc.tags.filter(
-                automatic_classification=True
+                matching_algorithm=MatchingModel.MATCH_AUTO
             )]
             labels_tags.append(tags)

+        if not data:
+            raise ValueError("No training data available.")
+
         labels_tags_unique = set([tag for tags in labels_tags for tag in tags])
         logging.getLogger(__name__).info(
             "{} documents, {} tag(s), {} correspondent(s), "
@@ -163,78 +167,37 @@ class DocumentClassifier(object):
             "classifier."
         )

-    def classify_document(
-            self, document, classify_correspondent=False,
-            classify_document_type=False, classify_tags=False,
-            replace_tags=False):
-
-        X = self.data_vectorizer.transform(
+    def update(self, document):
+        self.X = self.data_vectorizer.transform(
             [preprocess_content(document.content)]
         )

-        if classify_correspondent and self.correspondent_classifier:
-            self._classify_correspondent(X, document)
-
-        if classify_document_type and self.document_type_classifier:
-            self._classify_document_type(X, document)
-
-        if classify_tags and self.tags_classifier:
-            self._classify_tags(X, document, replace_tags)
-
-        document.save(update_fields=("correspondent", "document_type"))
-
-    def _classify_correspondent(self, X, document):
-        y = self.correspondent_classifier.predict(X)
-        correspondent_id = self.correspondent_binarizer.inverse_transform(y)[0]
-        try:
-            correspondent = None
+    def predict_correspondent(self):
+        if self.correspondent_classifier:
+            y = self.correspondent_classifier.predict(self.X)
+            correspondent_id = self.correspondent_binarizer.inverse_transform(y)[0]
             if correspondent_id != -1:
-                correspondent = Correspondent.objects.get(id=correspondent_id)
-                logging.getLogger(__name__).info(
-                    "Detected correspondent: {}".format(correspondent.name)
-                )
+                return correspondent_id
             else:
-                logging.getLogger(__name__).info("Detected correspondent: -")
-            document.correspondent = correspondent
-        except Correspondent.DoesNotExist:
-            logging.getLogger(__name__).warning(
-                "Detected correspondent with id {} does not exist "
-                "anymore! Did you delete it?".format(correspondent_id)
-            )
+                return None
+        else:
+            return None

-    def _classify_document_type(self, X, document):
-        y = self.document_type_classifier.predict(X)
-        document_type_id = self.document_type_binarizer.inverse_transform(y)[0]
-        try:
-            document_type = None
+    def predict_document_type(self):
+        if self.document_type_classifier:
+            y = self.document_type_classifier.predict(self.X)
+            document_type_id = self.document_type_binarizer.inverse_transform(y)[0]
             if document_type_id != -1:
-                document_type = DocumentType.objects.get(id=document_type_id)
-                logging.getLogger(__name__).info(
-                    "Detected document type: {}".format(document_type.name)
-                )
+                return document_type_id
             else:
-                logging.getLogger(__name__).info("Detected document type: -")
-            document.document_type = document_type
-        except DocumentType.DoesNotExist:
-            logging.getLogger(__name__).warning(
-                "Detected document type with id {} does not exist "
-                "anymore! Did you delete it?".format(document_type_id)
-            )
+                return None
+        else:
+            return None

-    def _classify_tags(self, X, document, replace_tags):
-        y = self.tags_classifier.predict(X)
-        tags_ids = self.tags_binarizer.inverse_transform(y)[0]
-        if replace_tags:
-            document.tags.clear()
-        for tag_id in tags_ids:
-            try:
-                tag = Tag.objects.get(id=tag_id)
-                logging.getLogger(__name__).info(
-                    "Detected tag: {}".format(tag.name)
-                )
-                document.tags.add(tag)
-            except Tag.DoesNotExist:
-                logging.getLogger(__name__).warning(
-                    "Detected tag with id {} does not exist anymore! Did "
-                    "you delete it?".format(tag_id)
-                )
+    def predict_tags(self):
+        if self.tags_classifier:
+            y = self.tags_classifier.predict(self.X)
+            tags_ids = self.tags_binarizer.inverse_transform(y)[0]
+            return tags_ids
+        else:
+            return []

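For orientation, here is how the reworked classifier API above fits together; a minimal sketch, assuming a trained model file exists and `document` is a Document with OCR'd content (both outside this hunk):

    from documents.classifier import DocumentClassifier

    classifier = DocumentClassifier()
    try:
        classifier.reload()          # load (or re-load) the trained model file
        classifier.update(document)  # vectorize this document's content into self.X
        correspondent_id = classifier.predict_correspondent()  # an id, or None
        document_type_id = classifier.predict_document_type()  # an id, or None
        tag_ids = classifier.predict_tags()                    # possibly empty
    except FileNotFoundError:
        pass  # no trained model yet; rule-based matching still works without one
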
@@ -11,6 +11,7 @@ from operator import itemgetter
 from django.conf import settings
 from django.utils import timezone
 from paperless.db import GnuPG
+from .classifier import DocumentClassifier

 from .models import Document, FileInfo, Tag
 from .parsers import ParseError
@@ -49,6 +50,8 @@ class Consumer:
         self.consume = consume
         self.scratch = scratch

+        self.classifier = DocumentClassifier()
+
         os.makedirs(self.scratch, exist_ok=True)

         self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
@@ -175,10 +178,22 @@ class Consumer:
             "Document {} consumption finished".format(document)
         )

+        classifier = None
+
+        try:
+            self.classifier.reload()
+            self.classifier.update(document)
+            classifier = self.classifier
+        except FileNotFoundError:
+            logging.getLogger(__name__).warning("Cannot classify documents, "
+                                                "classifier model file was not "
+                                                "found.")
+
         document_consumption_finished.send(
             sender=self.__class__,
             document=document,
-            logging_group=self.logging_group
+            logging_group=self.logging_group,
+            classifier=classifier
         )
         return True

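The consumer now resolves the classifier once and hands it (or None) to every receiver through the signal's keyword arguments. A receiver on the other end sees something like this; a hypothetical sketch mirroring the signature of set_correspondent in the handlers hunk further down:

    def my_receiver(sender, document=None, logging_group=None, classifier=None, **kwargs):
        if classifier is None:
            return  # model file missing: fall back to rule-based matching only
        suggested_id = classifier.predict_correspondent()
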
@@ -17,9 +17,14 @@ class Command(Renderable, BaseCommand):
         BaseCommand.__init__(self, *args, **kwargs)

     def handle(self, *args, **options):
-        clf = DocumentClassifier()
-        clf.train()
-        logging.getLogger(__name__).info(
-            "Saving models to {}...".format(settings.MODEL_FILE)
-        )
-        clf.save_classifier()
+        classifier = DocumentClassifier()
+        try:
+            classifier.train()
+            logging.getLogger(__name__).info(
+                "Saving models to {}...".format(settings.MODEL_FILE)
+            )
+            classifier.save_classifier()
+        except Exception as e:
+            logging.getLogger(__name__).error(
+                "Classifier error: " + str(e)
+            )

@@ -6,6 +6,7 @@ from documents.classifier import DocumentClassifier
 from documents.models import Document, Tag

 from ...mixins import Renderable
+from ...signals.handlers import set_correspondent, set_document_type, set_tags


 class Command(Renderable, BaseCommand):
@@ -24,23 +25,39 @@ class Command(Renderable, BaseCommand):
     def add_arguments(self, parser):
         parser.add_argument(
             "-c", "--correspondent",
             default=False,
             action="store_true"
         )
         parser.add_argument(
             "-T", "--tags",
             default=False,
             action="store_true"
         )
         parser.add_argument(
-            "-t", "--type",
+            "-t", "--document_type",
             default=False,
             action="store_true"
         )
         parser.add_argument(
             "-i", "--inbox-only",
             default=False,
             action="store_true"
         )
         parser.add_argument(
-            "-r", "--replace-tags",
-            action="store_true"
+            "--use-first",
+            default=False,
+            action="store_true",
+            help="By default this command won't try to assign a correspondent "
+                 "if more than one matches the document. Use this flag if "
+                 "you'd rather it just pick the first one it finds."
+        )
+        parser.add_argument(
+            "-f", "--overwrite",
+            default=False,
+            action="store_true",
+            help="If set, the document retagger will overwrite any previously"
+                 "set correspondent, document and remove correspondents, types"
+                 "and tags that do not match anymore due to changed rules."
         )

     def handle(self, *args, **options):
@@ -53,24 +70,41 @@ class Command(Renderable, BaseCommand):
         queryset = Document.objects.all()
         documents = queryset.distinct()

         logging.getLogger(__name__).info("Loading classifier")
-        clf = DocumentClassifier()
+        classifier = DocumentClassifier()
         try:
-            clf.reload()
+            classifier.reload()
         except FileNotFoundError:
-            logging.getLogger(__name__).fatal("Cannot classify documents, "
-                                              "classifier model file was not "
-                                              "found.")
-            return
+            logging.getLogger(__name__).warning("Cannot classify documents, "
+                                                "classifier model file was not "
+                                                "found.")
+            classifier = None

         for document in documents:
             logging.getLogger(__name__).info(
                 "Processing document {}".format(document.title)
             )
-            clf.classify_document(
-                document,
-                classify_document_type=options["type"],
-                classify_tags=options["tags"],
-                classify_correspondent=options["correspondent"],
-                replace_tags=options["replace_tags"]
-            )
+
+            if classifier:
+                classifier.update(document)
+
+            if options['correspondent']:
+                set_correspondent(
+                    sender=None,
+                    document=document,
+                    classifier=classifier,
+                    replace=options['overwrite'],
+                    use_first=options['use_first'])
+
+            if options['document_type']:
+                set_document_type(sender=None,
+                                  document=document,
+                                  classifier=classifier,
+                                  replace=options['overwrite'],
+                                  use_first=options['use_first'])
+
+            if options['tags']:
+                set_tags(
+                    sender=None,
+                    document=document,
+                    classifier=classifier,
+                    replace=options['overwrite'])

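Driven from code, the retagger above could be invoked roughly like this; a sketch only, since the command's module name is not visible in this hunk ("document_retagger" is an assumption based on its help text):

    from django.core.management import call_command

    # Option names follow the add_arguments() definitions above.
    call_command(
        "document_retagger",  # assumed name, not shown in the hunk
        correspondent=True,   # -c
        document_type=True,   # -t
        tags=True,            # -T
        use_first=True,       # --use-first
        overwrite=False,      # -f / --overwrite
    )
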
src/documents/matching.py (new file, 97 lines)
@@ -0,0 +1,97 @@
+import re
+
+from fuzzywuzzy import fuzz
+
+from documents.models import MatchingModel, Correspondent, DocumentType, Tag
+
+
+def match_correspondents(document_content, classifier):
+    correspondents = Correspondent.objects.all()
+    predicted_correspondent_id = classifier.predict_correspondent() if classifier else None
+
+    matched_correspondents = [o for o in correspondents if matches(o, document_content) or o.id == predicted_correspondent_id]
+    return matched_correspondents
+
+
+def match_document_types(document_content, classifier):
+    document_types = DocumentType.objects.all()
+    predicted_document_type_id = classifier.predict_document_type() if classifier else None
+
+    matched_document_types = [o for o in document_types if matches(o, document_content) or o.id == predicted_document_type_id]
+    return matched_document_types
+
+
+def match_tags(document_content, classifier):
+    objects = Tag.objects.all()
+    predicted_tag_ids = classifier.predict_tags() if classifier else []
+
+    matched_tags = [o for o in objects if matches(o, document_content) or o.id in predicted_tag_ids]
+    return matched_tags
+
+
+def matches(matching_model, document_content):
+    search_kwargs = {}
+
+    document_content = document_content.lower()
+
+    # Check that match is not empty
+    if matching_model.match.strip() == "":
+        return False
+
+    if matching_model.is_insensitive:
+        search_kwargs = {"flags": re.IGNORECASE}
+
+    if matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
+        for word in _split_match(matching_model):
+            search_result = re.search(
+                r"\b{}\b".format(word), document_content, **search_kwargs)
+            if not search_result:
+                return False
+        return True
+
+    if matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
+        for word in _split_match(matching_model):
+            if re.search(r"\b{}\b".format(word), document_content, **search_kwargs):
+                return True
+        return False
+
+    if matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
+        return bool(re.search(
+            r"\b{}\b".format(matching_model.match), document_content, **search_kwargs))
+
+    if matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
+        return bool(re.search(
+            re.compile(matching_model.match, **search_kwargs), document_content))
+
+    if matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
+        match = re.sub(r'[^\w\s]', '', matching_model.match)
+        text = re.sub(r'[^\w\s]', '', document_content)
+        if matching_model.is_insensitive:
+            match = match.lower()
+            text = text.lower()
+
+        return True if fuzz.partial_ratio(match, text) >= 90 else False
+
+    if matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
+        # this is done elsewhere.
+        return False
+
+    raise NotImplementedError("Unsupported matching algorithm")
+
+
+def _split_match(matching_model):
+    """
+    Splits the match to individual keywords, getting rid of unnecessary
+    spaces and grouping quoted words together.
+
+    Example:
+      '  some random  words "with   quotes  " and  spaces'
+        ==>
+      ["some", "random", "words", "with+quotes", "and", "spaces"]
+    """
+    findterms = re.compile(r'"([^"]+)"|(\S+)').findall
+    normspace = re.compile(r"\s+").sub
+    return [
+        normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
+        for t in findterms(matching_model.match)
+    ]

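A worked example of the keyword splitting that _split_match() performs, following its docstring; a sketch, not part of the diff:

    import re

    findterms = re.compile(r'"([^"]+)"|(\S+)').findall
    normspace = re.compile(r"\s+").sub
    terms = [
        normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
        for t in findterms('  some random  words "with   quotes  " and  spaces')
    ]
    # terms == ['some', 'random', 'words', 'with\\s+quotes', 'and', 'spaces']
    # matches() then tests each term with re.search(r"\b{}\b".format(term), content)
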
@@ -12,9 +12,8 @@ def re_slug_all_the_things(apps, schema_editor):

     Tag = apps.get_model("documents", "Tag")
     Correspondent = apps.get_model("documents", "Correspondent")
-    DocumentType = apps.get_model("documents", "DocumentType")

-    for klass in (Tag, Correspondent, DocumentType):
+    for klass in (Tag, Correspondent):
         for instance in klass.objects.all():
             klass.objects.filter(
                 pk=instance.pk
@@ -26,7 +25,7 @@ class Migration(migrations.Migration):

     dependencies = [
-        ('documents', '1003_auto_20180904_1425'),
+        ('documents', '0021_document_storage_type'),
     ]

     operations = [
@@ -49,10 +48,5 @@ class Migration(migrations.Migration):
             name='slug',
             field=models.SlugField(blank=True, editable=False),
         ),
-        migrations.AlterField(
-            model_name='documenttype',
-            name='slug',
-            field=models.SlugField(blank=True, editable=False),
-        ),
         migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop)
     ]

@@ -6,7 +6,7 @@ from django.db import migrations, models

 class Migration(migrations.Migration):

     dependencies = [
-        ('documents', '0021_document_storage_type'),
+        ('documents', '0022_auto_20181007_1420'),
     ]

     operations = [
@@ -16,7 +16,7 @@ class Migration(migrations.Migration):
         fields=[
             ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
             ('name', models.CharField(max_length=128, unique=True)),
-            ('slug', models.SlugField(blank=True)),
+            ('slug', models.SlugField(blank=True, editable=False)),
             ('match', models.CharField(blank=True, max_length=256)),
             ('matching_algorithm', models.PositiveIntegerField(choices=[(1, 'Any'), (2, 'All'), (3, 'Literal'), (4, 'Regular Expression'), (5, 'Fuzzy Match')], default=1, help_text='Which algorithm you want to use when matching text to the OCR\'d PDF. Here, "any" looks for any occurrence of any word provided in the PDF, while "all" requires that every word provided appear in the PDF, albeit not in the order provided. A "literal" match means that the text you enter must appear in the PDF exactly as you\'ve entered it, and "regular expression" uses a regex to match the PDF. (If you don\'t know what a regex is, you probably don\'t want this option.) Finally, a "fuzzy match" looks for words or phrases that are mostly—but not exactly—the same, which can be useful for matching against documents containg imperfections that foil accurate OCR.')),
             ('is_insensitive', models.BooleanField(default=True)),

@@ -1,77 +0,0 @@
-# Generated by Django 2.0.8 on 2018-09-04 14:25
-
-from django.db import migrations, models
-
-
-def transfer_automatic_classification(apps, schema_editor):
-    for model_name in ["Tag", "Correspondent", "DocumentType"]:
-        model_class = apps.get_model("documents", model_name)
-        for o in model_class.objects.all():
-            o.automatic_classification = o.match is not None and len(o.match) > 0
-            o.save()
-
-
-def reverse_automatic_classification(apps, schema_editor):
-    pass
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('documents', '1002_auto_20180823_1155'),
-    ]
-
-    operations = [
-        migrations.AddField(
-            model_name='correspondent',
-            name='automatic_classification',
-            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
-        ),
-        migrations.AddField(
-            model_name='documenttype',
-            name='automatic_classification',
-            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
-        ),
-        migrations.AddField(
-            model_name='tag',
-            name='automatic_classification',
-            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
-        ),
-        migrations.RunPython(transfer_automatic_classification, reverse_automatic_classification),
-        migrations.RemoveField(
-            model_name='correspondent',
-            name='is_insensitive',
-        ),
-        migrations.RemoveField(
-            model_name='correspondent',
-            name='match',
-        ),
-        migrations.RemoveField(
-            model_name='correspondent',
-            name='matching_algorithm',
-        ),
-        migrations.RemoveField(
-            model_name='documenttype',
-            name='is_insensitive',
-        ),
-        migrations.RemoveField(
-            model_name='documenttype',
-            name='match',
-        ),
-        migrations.RemoveField(
-            model_name='documenttype',
-            name='matching_algorithm',
-        ),
-        migrations.RemoveField(
-            model_name='tag',
-            name='is_insensitive',
-        ),
-        migrations.RemoveField(
-            model_name='tag',
-            name='match',
-        ),
-        migrations.RemoveField(
-            model_name='tag',
-            name='matching_algorithm',
-        ),
-    ]

@@ -7,13 +7,11 @@ import uuid

 from collections import OrderedDict

 import dateutil.parser
-from django.dispatch import receiver
 from django.conf import settings
 from django.db import models
-from django.template.defaultfilters import slugify
 from django.utils import timezone
+from django.utils.text import slugify
-from collections import defaultdict

 from .managers import LogManager

@@ -25,15 +23,46 @@ except ImportError:

 class MatchingModel(models.Model):

+    MATCH_ANY = 1
+    MATCH_ALL = 2
+    MATCH_LITERAL = 3
+    MATCH_REGEX = 4
+    MATCH_FUZZY = 5
+    MATCH_AUTO = 6
+
+    MATCHING_ALGORITHMS = (
+        (MATCH_ANY, "Any"),
+        (MATCH_ALL, "All"),
+        (MATCH_LITERAL, "Literal"),
+        (MATCH_REGEX, "Regular Expression"),
+        (MATCH_FUZZY, "Fuzzy Match"),
+        (MATCH_AUTO, "Automatic Classification"),
+    )
+
     name = models.CharField(max_length=128, unique=True)
     slug = models.SlugField(blank=True, editable=False)

-    automatic_classification = models.BooleanField(
-        default=False,
-        help_text="Automatically assign to newly added documents based on "
-                  "current usage in your document collection."
-    )
+    match = models.CharField(max_length=256, blank=True)
+    matching_algorithm = models.PositiveIntegerField(
+        choices=MATCHING_ALGORITHMS,
+        default=MATCH_ANY,
+        help_text=(
+            "Which algorithm you want to use when matching text to the OCR'd "
+            "PDF. Here, \"any\" looks for any occurrence of any word "
+            "provided in the PDF, while \"all\" requires that every word "
+            "provided appear in the PDF, albeit not in the order provided. A "
+            "\"literal\" match means that the text you enter must appear in "
+            "the PDF exactly as you've entered it, and \"regular expression\" "
+            "uses a regex to match the PDF. (If you don't know what a regex "
+            "is, you probably don't want this option.) Finally, a \"fuzzy "
+            "match\" looks for words or phrases that are mostly—but not "
+            "exactly—the same, which can be useful for matching against "
+            "documents containg imperfections that foil accurate OCR."
+        )
+    )
+
+    is_insensitive = models.BooleanField(default=True)

     class Meta:
         abstract = True
         ordering = ("name",)
@@ -43,6 +72,7 @@ class MatchingModel(models.Model):

     def save(self, *args, **kwargs):

+        self.match = self.match.lower()
         self.slug = slugify(self.name)

         models.Model.save(self, *args, **kwargs)

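In use, the unified fields look like this; a sketch mirroring the tests at the end of this diff, with illustrative values:

    from documents.models import Tag

    tag = Tag.objects.create(
        name="Invoices",
        match="invoice",
        matching_algorithm=Tag.MATCH_ANY,  # MATCH_ANY .. MATCH_AUTO, per the choices above
    )
    # The overridden save() lower-cases `match` and slugifies `name`.
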
@@ -15,7 +15,9 @@ class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):
             "id",
             "slug",
             "name",
-            "automatic_classification",
+            "match",
+            "matching_algorithm",
+            "is_insensitive",
             "document_count",
             "last_correspondence"
         )
@@ -31,7 +33,9 @@ class DocumentTypeSerializer(serializers.HyperlinkedModelSerializer):
             "id",
             "slug",
             "name",
-            "automatic_classification",
+            "match",
+            "matching_algorithm",
+            "is_insensitive",
             "document_count"
         )

@@ -47,7 +51,9 @@ class TagSerializer(serializers.HyperlinkedModelSerializer):
             "slug",
             "name",
             "colour",
-            "automatic_classification",
+            "match",
+            "matching_algorithm",
+            "is_insensitive",
             "is_inbox_tag",
             "document_count"
         )

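For reference, a correspondent serialized with the field list above would come out roughly like this; the shape is inferred from the serializer and every value is made up:

    example = {
        "id": 1,
        "slug": "acme-corp",
        "name": "ACME Corp",
        "match": "acme",
        "matching_algorithm": 1,  # MATCH_ANY
        "is_insensitive": True,
        "document_count": 12,
        "last_correspondence": "2018-10-07",
    }
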
@@ -9,7 +9,7 @@ from django.contrib.contenttypes.models import ContentType
 from django.utils import timezone

-from documents.classifier import DocumentClassifier
-from .. import index
+from .. import index, matching
 from ..models import Document, Tag


@@ -17,35 +17,107 @@ def logger(message, group):
     logging.getLogger(__name__).debug(message, extra={"group": group})


-#TODO: global? really?
-classifier = DocumentClassifier()
-
-
 def index_document(sender, document=None, logging_group=None, **kwargs):
     index.add_document_to_index(sender, instance=document)


-def classify_document(sender, document=None, logging_group=None, **kwargs):
-    global classifier
-    try:
-        classifier.reload()
-        classifier.classify_document(
-            document,
-            classify_correspondent=True,
-            classify_tags=True,
-            classify_document_type=True
-        )
-    except FileNotFoundError:
-        logging.getLogger(__name__).fatal(
-            "Cannot classify document, classifier model file was not found."
-        )
-
-
 def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
     inbox_tags = Tag.objects.filter(is_inbox_tag=True)
     document.tags.add(*inbox_tags)


+def set_correspondent(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
+    if document.correspondent and not replace:
+        return
+
+    potential_correspondents = matching.match_correspondents(document.content, classifier)
+
+    potential_count = len(potential_correspondents)
+    if potential_correspondents:
+        selected = potential_correspondents[0]
+    else:
+        selected = None
+    if potential_count > 1:
+        if use_first:
+            message = "Detected {} potential correspondents, so we've opted for {}"
+            logger(
+                message.format(potential_count, selected),
+                logging_group
+            )
+        else:
+            message = "Detected {} potential correspondents, not assigning any correspondent"
+            logger(
+                message.format(potential_count),
+                logging_group
+            )
+            return
+
+    logger(
+        'Assigning correspondent "{}" to "{}" '.format(selected, document),
+        logging_group
+    )
+
+    document.correspondent = selected
+    document.save(update_fields=("correspondent",))
+
+
+def set_document_type(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
+    if document.document_type and not replace:
+        return
+
+    potential_document_type = matching.match_document_types(document.content, classifier)
+
+    potential_count = len(potential_document_type)
+    if potential_document_type:
+        selected = potential_document_type[0]
+    else:
+        selected = None
+
+    if potential_count > 1:
+        if use_first:
+            message = "Detected {} potential document types, so we've opted for {}"
+            logger(
+                message.format(potential_count, selected),
+                logging_group
+            )
+        else:
+            message = "Detected {} potential document types, not assigning any document type"
+            logger(
+                message.format(potential_count),
+                logging_group
+            )
+            return
+
+    logger(
+        'Assigning document type "{}" to "{}" '.format(selected, document),
+        logging_group
+    )
+
+    document.document_type = selected
+    document.save(update_fields=("document_type",))
+
+
+def set_tags(sender, document=None, logging_group=None, classifier=None, replace=False, **kwargs):
+    if replace:
+        document.tags.clear()
+        current_tags = set([])
+    else:
+        current_tags = set(document.tags.all())
+
+    relevant_tags = set(matching.match_tags(document.content, classifier)) - current_tags
+
+    if not relevant_tags:
+        return
+
+    message = 'Tagging "{}" with "{}"'
+    logger(
+        message.format(document, ", ".join([t.slug for t in relevant_tags])),
+        logging_group
+    )
+
+    document.tags.add(*relevant_tags)
+
+
 def run_pre_consume_script(sender, filename, **kwargs):

     if not settings.PRE_CONSUME_SCRIPT:

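The handlers above can also be fired by hand through the signal, exactly as the new tests further below do; a minimal sketch, assuming `document` is an existing Document:

    from documents.signals import document_consumption_finished

    # classifier is optional (it defaults to None), in which case only the
    # rule-based MATCH_* algorithms apply.
    document_consumption_finished.send(
        sender=None,
        document=document,
        classifier=None,
    )
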
@@ -1,17 +1,18 @@
-import factory
+from factory import Faker
+from factory.django import DjangoModelFactory

 from ..models import Document, Correspondent


-class CorrespondentFactory(factory.DjangoModelFactory):
+class CorrespondentFactory(DjangoModelFactory):

     class Meta:
         model = Correspondent

-    name = factory.Faker("name")
+    name = Faker("name")


-class DocumentFactory(factory.DjangoModelFactory):
+class DocumentFactory(DjangoModelFactory):

     class Meta:
         model = Document

src/documents/tests/test_matchables.py (new file, 256 lines)
@@ -0,0 +1,256 @@
+from random import randint
+
+from django.contrib.admin.models import LogEntry
+from django.contrib.auth.models import User
+from django.test import TestCase, override_settings
+
+from .. import matching
+from ..models import Correspondent, Document, Tag, DocumentType
+from ..signals import document_consumption_finished
+
+
+class TestMatching(TestCase):
+
+    def _test_matching(self, text, algorithm, true, false):
+        for klass in (Tag, Correspondent, DocumentType):
+            instance = klass.objects.create(
+                name=str(randint(10000, 99999)),
+                match=text,
+                matching_algorithm=getattr(klass, algorithm)
+            )
+            for string in true:
+                self.assertTrue(
+                    matching.matches(instance, string),
+                    '"%s" should match "%s" but it does not' % (text, string)
+                )
+            for string in false:
+                self.assertFalse(
+                    matching.matches(instance, string),
+                    '"%s" should not match "%s" but it does' % (text, string)
+                )
+
+    def test_match_all(self):
+
+        self._test_matching(
+            "alpha charlie gamma",
+            "MATCH_ALL",
+            ("I have alpha, charlie, and gamma in me",),
+            (
+                "I have alpha in me",
+                "I have charlie in me",
+                "I have gamma in me",
+                "I have alpha and charlie in me",
+                "I have alphas, charlie, and gamma in me",
+                "I have alphas in me",
+                "I have bravo in me",
+            )
+        )
+
+        self._test_matching(
+            "12 34 56",
+            "MATCH_ALL",
+            (
+                "I have 12 34, and 56 in me",
+            ),
+            (
+                "I have 12 in me",
+                "I have 34 in me",
+                "I have 56 in me",
+                "I have 12 and 34 in me",
+                "I have 120, 34, and 56 in me",
+                "I have 123456 in me",
+                "I have 01234567 in me",
+            )
+        )
+
+        self._test_matching(
+            'brown fox "lazy dogs"',
+            "MATCH_ALL",
+            (
+                "the quick brown fox jumped over the lazy dogs",
+                "the quick brown fox jumped over the lazy  dogs",
+            ),
+            (
+                "the quick fox jumped over the lazy dogs",
+                "the quick brown wolf jumped over the lazy dogs",
+                "the quick brown fox jumped over the fat dogs",
+                "the quick brown fox jumped over the lazy... dogs",
+            )
+        )
+
+    def test_match_any(self):
+
+        self._test_matching(
+            "alpha charlie gamma",
+            "MATCH_ANY",
+            (
+                "I have alpha in me",
+                "I have charlie in me",
+                "I have gamma in me",
+                "I have alpha, charlie, and gamma in me",
+                "I have alpha and charlie in me",
+            ),
+            (
+                "I have alphas in me",
+                "I have bravo in me",
+            )
+        )
+
+        self._test_matching(
+            "12 34 56",
+            "MATCH_ANY",
+            (
+                "I have 12 in me",
+                "I have 34 in me",
+                "I have 56 in me",
+                "I have 12 and 34 in me",
+                "I have 12, 34, and 56 in me",
+                "I have 120, 34, and 56 in me",
+            ),
+            (
+                "I have 123456 in me",
+                "I have 01234567 in me",
+            )
+        )
+
+        self._test_matching(
+            '"brown fox" " lazy dogs "',
+            "MATCH_ANY",
+            (
+                "the quick brown fox",
+                "jumped over the lazy dogs.",
+            ),
+            (
+                "the lazy fox jumped over the brown dogs",
+            )
+        )
+
+    def test_match_literal(self):
+
+        self._test_matching(
+            "alpha charlie gamma",
+            "MATCH_LITERAL",
+            (
+                "I have 'alpha charlie gamma' in me",
+            ),
+            (
+                "I have alpha in me",
+                "I have charlie in me",
+                "I have gamma in me",
+                "I have alpha and charlie in me",
+                "I have alpha, charlie, and gamma in me",
+                "I have alphas, charlie, and gamma in me",
+                "I have alphas in me",
+                "I have bravo in me",
+            )
+        )
+
+        self._test_matching(
+            "12 34 56",
+            "MATCH_LITERAL",
+            (
+                "I have 12 34 56 in me",
+            ),
+            (
+                "I have 12 in me",
+                "I have 34 in me",
+                "I have 56 in me",
+                "I have 12 and 34 in me",
+                "I have 12 34, and 56 in me",
+                "I have 120, 34, and 560 in me",
+                "I have 120, 340, and 560 in me",
+                "I have 123456 in me",
+                "I have 01234567 in me",
+            )
+        )
+
+    def test_match_regex(self):
+
+        self._test_matching(
+            r"alpha\w+gamma",
+            "MATCH_REGEX",
+            (
+                "I have alpha_and_gamma in me",
+                "I have alphas_and_gamma in me",
+            ),
+            (
+                "I have alpha in me",
+                "I have gamma in me",
+                "I have alpha and charlie in me",
+                "I have alpha,and,gamma in me",
+                "I have alpha and gamma in me",
+                "I have alpha, charlie, and gamma in me",
+                "I have alphas, charlie, and gamma in me",
+                "I have alphas in me",
+            )
+        )
+
+    def test_match_fuzzy(self):
+
+        self._test_matching(
+            "Springfield, Miss.",
+            "MATCH_FUZZY",
+            (
+                "1220 Main Street, Springf eld, Miss.",
+                "1220 Main Street, Spring field, Miss.",
+                "1220 Main Street, Springfeld, Miss.",
+                "1220 Main Street Springfield Miss",
+            ),
+            (
+                "1220 Main Street, Springfield, Mich.",
+            )
+        )
+
+
+@override_settings(POST_CONSUME_SCRIPT=None)
+class TestDocumentConsumptionFinishedSignal(TestCase):
+    """
+    We make use of document_consumption_finished, so we should test that it's
+    doing what we expect wrt to tag & correspondent matching.
+    """
+
+    def setUp(self):
+        TestCase.setUp(self)
+        User.objects.create_user(username='test_consumer', password='12345')
+        self.doc_contains = Document.objects.create(
+            content="I contain the keyword.", file_type="pdf")
+
+    def test_tag_applied_any(self):
+        t1 = Tag.objects.create(
+            name="test", match="keyword", matching_algorithm=Tag.MATCH_ANY)
+        document_consumption_finished.send(
+            sender=self.__class__, document=self.doc_contains)
+        self.assertTrue(list(self.doc_contains.tags.all()) == [t1])
+
+    def test_tag_not_applied(self):
+        Tag.objects.create(
+            name="test", match="no-match", matching_algorithm=Tag.MATCH_ANY)
+        document_consumption_finished.send(
+            sender=self.__class__, document=self.doc_contains)
+        self.assertTrue(list(self.doc_contains.tags.all()) == [])
+
+    def test_correspondent_applied(self):
+        correspondent = Correspondent.objects.create(
+            name="test",
+            match="keyword",
+            matching_algorithm=Correspondent.MATCH_ANY
+        )
+        document_consumption_finished.send(
+            sender=self.__class__, document=self.doc_contains)
+        self.assertTrue(self.doc_contains.correspondent == correspondent)
+
+    def test_correspondent_not_applied(self):
+        Tag.objects.create(
+            name="test",
+            match="no-match",
+            matching_algorithm=Correspondent.MATCH_ANY
+        )
+        document_consumption_finished.send(
+            sender=self.__class__, document=self.doc_contains)
+        self.assertEqual(self.doc_contains.correspondent, None)
+
+    def test_logentry_created(self):
+        document_consumption_finished.send(
+            sender=self.__class__, document=self.doc_contains)
+
+        self.assertEqual(LogEntry.objects.count(), 1)