removed matching model fields, automatic classifier reloading, added automatic_classification field to matching model

This commit is contained in:
Jonas Winkler 2018-09-04 18:40:26 +02:00
parent 30134034e2
commit 70bd05450a
8 changed files with 126 additions and 143 deletions

View File

@ -102,9 +102,8 @@ class CommonAdmin(admin.ModelAdmin):
class CorrespondentAdmin(CommonAdmin): class CorrespondentAdmin(CommonAdmin):
list_display = ("name", "match", "matching_algorithm", "document_count", "last_correspondence") list_display = ("name", "automatic_classification", "document_count", "last_correspondence")
list_filter = ("matching_algorithm",) list_editable = ("automatic_classification",)
list_editable = ("match", "matching_algorithm")
def get_queryset(self, request): def get_queryset(self, request):
qs = super(CorrespondentAdmin, self).get_queryset(request) qs = super(CorrespondentAdmin, self).get_queryset(request)
@ -122,10 +121,9 @@ class CorrespondentAdmin(CommonAdmin):
class TagAdmin(CommonAdmin): class TagAdmin(CommonAdmin):
list_display = ("name", "colour", "match", "matching_algorithm", list_display = ("name", "colour", "automatic_classification", "document_count")
"document_count") list_filter = ("colour",)
list_filter = ("colour", "matching_algorithm") list_editable = ("colour", "automatic_classification")
list_editable = ("colour", "match", "matching_algorithm")
def get_queryset(self, request): def get_queryset(self, request):
qs = super(TagAdmin, self).get_queryset(request) qs = super(TagAdmin, self).get_queryset(request)
@ -139,9 +137,8 @@ class TagAdmin(CommonAdmin):
class DocumentTypeAdmin(CommonAdmin): class DocumentTypeAdmin(CommonAdmin):
list_display = ("name", "match", "matching_algorithm", "document_count") list_display = ("name", "automatic_classification", "document_count")
list_filter = ("matching_algorithm",) list_editable = ("automatic_classification",)
list_editable = ("match", "matching_algorithm")
def get_queryset(self, request): def get_queryset(self, request):
qs = super(DocumentTypeAdmin, self).get_queryset(request) qs = super(DocumentTypeAdmin, self).get_queryset(request)

View File

@ -1,3 +1,4 @@
import os
import pickle import pickle
from documents.models import Correspondent, DocumentType, Tag from documents.models import Correspondent, DocumentType, Tag
@ -16,6 +17,18 @@ def preprocess_content(content):
class DocumentClassifier(object): class DocumentClassifier(object):
classifier_version = None
data_vectorizer = None
tags_binarizer = None
correspondent_binarizer = None
type_binarizer = None
tags_classifier = None
correspondent_classifier = None
type_classifier = None
@staticmethod @staticmethod
def load_classifier(): def load_classifier():
clf = DocumentClassifier() clf = DocumentClassifier()
@ -23,15 +36,18 @@ class DocumentClassifier(object):
return clf return clf
def reload(self): def reload(self):
with open(settings.MODEL_FILE, "rb") as f: if self.classifier_version is None or os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
self.data_vectorizer = pickle.load(f) print("reloading classifier")
self.tags_binarizer = pickle.load(f) with open(settings.MODEL_FILE, "rb") as f:
self.correspondent_binarizer = pickle.load(f) self.data_vectorizer = pickle.load(f)
self.type_binarizer = pickle.load(f) self.tags_binarizer = pickle.load(f)
self.correspondent_binarizer = pickle.load(f)
self.type_binarizer = pickle.load(f)
self.tags_classifier = pickle.load(f) self.tags_classifier = pickle.load(f)
self.correspondent_classifier = pickle.load(f) self.correspondent_classifier = pickle.load(f)
self.type_classifier = pickle.load(f) self.type_classifier = pickle.load(f)
self.classifier_version = os.path.getmtime(settings.MODEL_FILE)
def save_classifier(self): def save_classifier(self):
with open(settings.MODEL_FILE, "wb") as f: with open(settings.MODEL_FILE, "wb") as f:

6
src/documents/consumer.py Normal file → Executable file
View File

@ -221,12 +221,6 @@ class Consumer:
storage_type=self.storage_type storage_type=self.storage_type
) )
relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
if relevant_tags:
tag_names = ", ".join([t.slug for t in relevant_tags])
self.log("debug", "Tagging with {}".format(tag_names))
document.tags.add(*relevant_tags)
self._write(document, doc, document.source_path) self._write(document, doc, document.source_path)
self._write(document, thumbnail, document.thumbnail_path) self._write(document, thumbnail, document.thumbnail_path)

View File

@ -42,9 +42,14 @@ class Command(Renderable, BaseCommand):
# Step 2: vectorize data # Step 2: vectorize data
logging.getLogger(__name__).info("Vectorizing data...") logging.getLogger(__name__).info("Vectorizing data...")
clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), min_df=0.05) clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6), min_df=0.1)
data_vectorized = clf.data_vectorizer.fit_transform(data) data_vectorized = clf.data_vectorizer.fit_transform(data)
print(clf.data_vectorizer.vocabulary_)
logging.getLogger(__name__).info("Shape of vectorized data: {}".format(data_vectorized.shape))
clf.tags_binarizer = MultiLabelBinarizer() clf.tags_binarizer = MultiLabelBinarizer()
labels_tags_vectorized = clf.tags_binarizer.fit_transform(labels_tags) labels_tags_vectorized = clf.tags_binarizer.fit_transform(labels_tags)

View File

@ -46,7 +46,11 @@ class Command(Renderable, BaseCommand):
documents = Document.objects.all().exclude(tags__is_archived_tag=True).distinct() documents = Document.objects.all().exclude(tags__is_archived_tag=True).distinct()
logging.getLogger(__name__).info("Loading classifier") logging.getLogger(__name__).info("Loading classifier")
clf = DocumentClassifier.load_classifier() try:
clf = DocumentClassifier.load_classifier()
except FileNotFoundError:
logging.getLogger(__name__).fatal("Cannot classify documents, classifier model file was not found.")
return
for document in documents: for document in documents:

View File

@ -0,0 +1,77 @@
# Generated by Django 2.0.8 on 2018-09-04 14:25
from django.db import migrations, models
def transfer_automatic_classification(apps, schema_editor):
    """Data migration: carry the old matching behaviour over to the new flag.

    Any Tag, Correspondent or DocumentType that previously had a non-empty
    ``match`` pattern was participating in matching, so its new
    ``automatic_classification`` boolean is set to True; everything else
    gets False.
    """
    for model_name in ["Tag", "Correspondent", "DocumentType"]:
        model_class = apps.get_model("documents", model_name)
        for o in model_class.objects.all():
            # bool("") and bool(None) are both False, so this is equivalent
            # to the explicit "is not None and len(...) > 0" check.
            o.automatic_classification = bool(o.match)
            o.save()
def reverse_automatic_classification(apps, schema_editor):
    """No-op reverse for the data migration.

    When the migration is unapplied, the ``automatic_classification``
    field is removed again by reversing the AddField operations, so there
    is no data to restore here.
    """
    pass
class Migration(migrations.Migration):
    """Replace the pattern-based matching fields (``match``,
    ``matching_algorithm``, ``is_insensitive``) on Correspondent,
    DocumentType and Tag with a single ``automatic_classification``
    boolean used by the new document classifier.
    """

    dependencies = [
        ('documents', '0023_auto_20180823_1155'),
    ]

    operations = [
        # 1) Add the new flag to all three models (default False).
        migrations.AddField(
            model_name='correspondent',
            name='automatic_classification',
            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
        ),
        migrations.AddField(
            model_name='documenttype',
            name='automatic_classification',
            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
        ),
        migrations.AddField(
            model_name='tag',
            name='automatic_classification',
            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
        ),
        # 2) Populate the flag from the old match field BEFORE it is dropped.
        migrations.RunPython(transfer_automatic_classification, reverse_automatic_classification),
        # 3) Drop the superseded matching fields.
        migrations.RemoveField(
            model_name='correspondent',
            name='is_insensitive',
        ),
        migrations.RemoveField(
            model_name='correspondent',
            name='match',
        ),
        migrations.RemoveField(
            model_name='correspondent',
            name='matching_algorithm',
        ),
        migrations.RemoveField(
            model_name='documenttype',
            name='is_insensitive',
        ),
        migrations.RemoveField(
            model_name='documenttype',
            name='match',
        ),
        migrations.RemoveField(
            model_name='documenttype',
            name='matching_algorithm',
        ),
        migrations.RemoveField(
            model_name='tag',
            name='is_insensitive',
        ),
        migrations.RemoveField(
            model_name='tag',
            name='match',
        ),
        migrations.RemoveField(
            model_name='tag',
            name='matching_algorithm',
        ),
    ]

View File

@ -15,48 +15,15 @@ from django.db import models
from django.template.defaultfilters import slugify from django.template.defaultfilters import slugify
from django.utils import timezone from django.utils import timezone
from reminders.models import Reminder
from .managers import LogManager from .managers import LogManager
class MatchingModel(models.Model): class MatchingModel(models.Model):
MATCH_ANY = 1
MATCH_ALL = 2
MATCH_LITERAL = 3
MATCH_REGEX = 4
MATCH_FUZZY = 5
MATCHING_ALGORITHMS = (
(MATCH_ANY, "Any"),
(MATCH_ALL, "All"),
(MATCH_LITERAL, "Literal"),
(MATCH_REGEX, "Regular Expression"),
(MATCH_FUZZY, "Fuzzy Match"),
)
name = models.CharField(max_length=128, unique=True) name = models.CharField(max_length=128, unique=True)
slug = models.SlugField(blank=True) slug = models.SlugField(blank=True)
match = models.CharField(max_length=256, blank=True) automatic_classification = models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.')
matching_algorithm = models.PositiveIntegerField(
choices=MATCHING_ALGORITHMS,
default=MATCH_ANY,
help_text=(
"Which algorithm you want to use when matching text to the OCR'd "
"PDF. Here, \"any\" looks for any occurrence of any word "
"provided in the PDF, while \"all\" requires that every word "
"provided appear in the PDF, albeit not in the order provided. A "
"\"literal\" match means that the text you enter must appear in "
"the PDF exactly as you've entered it, and \"regular expression\" "
"uses a regex to match the PDF. (If you don't know what a regex "
"is, you probably don't want this option.) Finally, a \"fuzzy "
"match\" looks for words or phrases that are mostly—but not "
"exactly—the same, which can be useful for matching against "
"documents containg imperfections that foil accurate OCR."
)
)
is_insensitive = models.BooleanField(default=True)
class Meta: class Meta:
abstract = True abstract = True
@ -64,87 +31,8 @@ class MatchingModel(models.Model):
def __str__(self): def __str__(self):
return self.name return self.name
@property
def conditions(self):
return "{}: \"{}\" ({})".format(
self.name, self.match, self.get_matching_algorithm_display())
@classmethod
def match_all(cls, text, tags=None):
if tags is None:
tags = cls.objects.all()
text = text.lower()
for tag in tags:
if tag.matches(text):
yield tag
def matches(self, text):
search_kwargs = {}
# Check that match is not empty
if self.match.strip() == "":
return False
if self.is_insensitive:
search_kwargs = {"flags": re.IGNORECASE}
if self.matching_algorithm == self.MATCH_ALL:
for word in self._split_match():
search_result = re.search(
r"\b{}\b".format(word), text, **search_kwargs)
if not search_result:
return False
return True
if self.matching_algorithm == self.MATCH_ANY:
for word in self._split_match():
if re.search(r"\b{}\b".format(word), text, **search_kwargs):
return True
return False
if self.matching_algorithm == self.MATCH_LITERAL:
return bool(re.search(
r"\b{}\b".format(self.match), text, **search_kwargs))
if self.matching_algorithm == self.MATCH_REGEX:
return bool(re.search(
re.compile(self.match, **search_kwargs), text))
if self.matching_algorithm == self.MATCH_FUZZY:
match = re.sub(r'[^\w\s]', '', self.match)
text = re.sub(r'[^\w\s]', '', text)
if self.is_insensitive:
match = match.lower()
text = text.lower()
return True if fuzz.partial_ratio(match, text) >= 90 else False
raise NotImplementedError("Unsupported matching algorithm")
def _split_match(self):
"""
Splits the match to individual keywords, getting rid of unnecessary
spaces and grouping quoted words together.
Example:
' some random words "with quotes " and spaces'
==>
["some", "random", "words", "with\s+quotes", "and", "spaces"]
"""
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub
return [
normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
for t in findterms(self.match)
]
def save(self, *args, **kwargs): def save(self, *args, **kwargs):
self.match = self.match.lower()
if not self.slug: if not self.slug:
self.slug = slugify(self.name) self.slug = slugify(self.name)

View File

@ -16,15 +16,17 @@ def logger(message, group):
logging.getLogger(__name__).debug(message, extra={"group": group}) logging.getLogger(__name__).debug(message, extra={"group": group})
classifier = None classifier = DocumentClassifier()
def classify_document(sender, document=None, logging_group=None, **kwargs): def classify_document(sender, document=None, logging_group=None, **kwargs):
global classifier global classifier
if classifier is None: try:
classifier = DocumentClassifier.load_classifier() classifier.reload()
classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True)
except FileNotFoundError:
logging.getLogger(__name__).fatal("Cannot classify document, classifier model file was not found.")
classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True)