removed matching model fields, automatic classifier reloading, added automatic_classification field to matching model

This commit is contained in:
Jonas Winkler 2018-09-04 18:40:26 +02:00
parent 30134034e2
commit 70bd05450a
8 changed files with 126 additions and 143 deletions

View File

@ -102,9 +102,8 @@ class CommonAdmin(admin.ModelAdmin):
class CorrespondentAdmin(CommonAdmin):
list_display = ("name", "match", "matching_algorithm", "document_count", "last_correspondence")
list_filter = ("matching_algorithm",)
list_editable = ("match", "matching_algorithm")
list_display = ("name", "automatic_classification", "document_count", "last_correspondence")
list_editable = ("automatic_classification",)
def get_queryset(self, request):
qs = super(CorrespondentAdmin, self).get_queryset(request)
@ -122,10 +121,9 @@ class CorrespondentAdmin(CommonAdmin):
class TagAdmin(CommonAdmin):
list_display = ("name", "colour", "match", "matching_algorithm",
"document_count")
list_filter = ("colour", "matching_algorithm")
list_editable = ("colour", "match", "matching_algorithm")
list_display = ("name", "colour", "automatic_classification", "document_count")
list_filter = ("colour",)
list_editable = ("colour", "automatic_classification")
def get_queryset(self, request):
qs = super(TagAdmin, self).get_queryset(request)
@ -139,9 +137,8 @@ class TagAdmin(CommonAdmin):
class DocumentTypeAdmin(CommonAdmin):
list_display = ("name", "match", "matching_algorithm", "document_count")
list_filter = ("matching_algorithm",)
list_editable = ("match", "matching_algorithm")
list_display = ("name", "automatic_classification", "document_count")
list_editable = ("automatic_classification",)
def get_queryset(self, request):
qs = super(DocumentTypeAdmin, self).get_queryset(request)

View File

@ -1,3 +1,4 @@
import os
import pickle
from documents.models import Correspondent, DocumentType, Tag
@ -16,6 +17,18 @@ def preprocess_content(content):
class DocumentClassifier(object):
classifier_version = None
data_vectorizer = None
tags_binarizer = None
correspondent_binarizer = None
type_binarizer = None
tags_classifier = None
correspondent_classifier = None
type_classifier = None
@staticmethod
def load_classifier():
clf = DocumentClassifier()
@ -23,15 +36,18 @@ class DocumentClassifier(object):
return clf
def reload(self):
with open(settings.MODEL_FILE, "rb") as f:
self.data_vectorizer = pickle.load(f)
self.tags_binarizer = pickle.load(f)
self.correspondent_binarizer = pickle.load(f)
self.type_binarizer = pickle.load(f)
if self.classifier_version is None or os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
print("reloading classifier")
with open(settings.MODEL_FILE, "rb") as f:
self.data_vectorizer = pickle.load(f)
self.tags_binarizer = pickle.load(f)
self.correspondent_binarizer = pickle.load(f)
self.type_binarizer = pickle.load(f)
self.tags_classifier = pickle.load(f)
self.correspondent_classifier = pickle.load(f)
self.type_classifier = pickle.load(f)
self.tags_classifier = pickle.load(f)
self.correspondent_classifier = pickle.load(f)
self.type_classifier = pickle.load(f)
self.classifier_version = os.path.getmtime(settings.MODEL_FILE)
def save_classifier(self):
with open(settings.MODEL_FILE, "wb") as f:

6
src/documents/consumer.py Normal file → Executable file
View File

@ -221,12 +221,6 @@ class Consumer:
storage_type=self.storage_type
)
relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
if relevant_tags:
tag_names = ", ".join([t.slug for t in relevant_tags])
self.log("debug", "Tagging with {}".format(tag_names))
document.tags.add(*relevant_tags)
self._write(document, doc, document.source_path)
self._write(document, thumbnail, document.thumbnail_path)

View File

@ -42,9 +42,14 @@ class Command(Renderable, BaseCommand):
# Step 2: vectorize data
logging.getLogger(__name__).info("Vectorizing data...")
clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), min_df=0.05)
clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6), min_df=0.1)
data_vectorized = clf.data_vectorizer.fit_transform(data)
print(clf.data_vectorizer.vocabulary_)
logging.getLogger(__name__).info("Shape of vectorized data: {}".format(data_vectorized.shape))
clf.tags_binarizer = MultiLabelBinarizer()
labels_tags_vectorized = clf.tags_binarizer.fit_transform(labels_tags)

View File

@ -46,7 +46,11 @@ class Command(Renderable, BaseCommand):
documents = Document.objects.all().exclude(tags__is_archived_tag=True).distinct()
logging.getLogger(__name__).info("Loading classifier")
clf = DocumentClassifier.load_classifier()
try:
clf = DocumentClassifier.load_classifier()
except FileNotFoundError:
logging.getLogger(__name__).fatal("Cannot classify documents, classifier model file was not found.")
return
for document in documents:

View File

@ -0,0 +1,77 @@
# Generated by Django 2.0.8 on 2018-09-04 14:25
from django.db import migrations, models
def transfer_automatic_classification(apps, schema_editor):
    """Backfill the new ``automatic_classification`` flag.

    A record used automatic matching whenever its legacy ``match`` field
    held a non-empty pattern, so set the flag True exactly in that case.
    Uses the historical model classes supplied by ``apps``, as required
    inside a data migration.
    """
    for model_name in ("Tag", "Correspondent", "DocumentType"):
        model_class = apps.get_model("documents", model_name)
        for instance in model_class.objects.all():
            # bool() is False for both None and the empty string,
            # matching the "is not None and len(...) > 0" contract.
            instance.automatic_classification = bool(instance.match)
            instance.save()
def reverse_automatic_classification(apps, schema_editor):
    """Deliberate no-op for reversing the data migration.

    Rolling back cannot reconstruct the original ``match`` patterns from a
    boolean, so there is nothing to restore here; providing this callable
    merely keeps the RunPython operation reversible.
    """
class Migration(migrations.Migration):
    """Swap the legacy matching fields for one ``automatic_classification``
    boolean on Correspondent, DocumentType and Tag.

    Order matters: the boolean columns are added first, the data migration
    derives their values from the old ``match`` field, and only then are
    the legacy ``is_insensitive``/``match``/``matching_algorithm`` columns
    dropped.
    """

    dependencies = [
        ('documents', '0023_auto_20180823_1155'),
    ]

    # NOTE: the help_text literal is inlined in the comprehension because
    # class-level names are not visible inside comprehension scopes.
    operations = (
        [
            migrations.AddField(
                model_name=model,
                name='automatic_classification',
                field=models.BooleanField(
                    default=False,
                    help_text='Automatically assign to newly added documents '
                              'based on current usage in your document '
                              'collection.',
                ),
            )
            for model in ('correspondent', 'documenttype', 'tag')
        ]
        + [
            migrations.RunPython(
                transfer_automatic_classification,
                reverse_automatic_classification,
            )
        ]
        + [
            migrations.RemoveField(model_name=model, name=field)
            for model in ('correspondent', 'documenttype', 'tag')
            for field in ('is_insensitive', 'match', 'matching_algorithm')
        ]
    )

View File

@ -15,48 +15,15 @@ from django.db import models
from django.template.defaultfilters import slugify
from django.utils import timezone
from reminders.models import Reminder
from .managers import LogManager
class MatchingModel(models.Model):
MATCH_ANY = 1
MATCH_ALL = 2
MATCH_LITERAL = 3
MATCH_REGEX = 4
MATCH_FUZZY = 5
MATCHING_ALGORITHMS = (
(MATCH_ANY, "Any"),
(MATCH_ALL, "All"),
(MATCH_LITERAL, "Literal"),
(MATCH_REGEX, "Regular Expression"),
(MATCH_FUZZY, "Fuzzy Match"),
)
name = models.CharField(max_length=128, unique=True)
slug = models.SlugField(blank=True)
match = models.CharField(max_length=256, blank=True)
matching_algorithm = models.PositiveIntegerField(
choices=MATCHING_ALGORITHMS,
default=MATCH_ANY,
help_text=(
"Which algorithm you want to use when matching text to the OCR'd "
"PDF. Here, \"any\" looks for any occurrence of any word "
"provided in the PDF, while \"all\" requires that every word "
"provided appear in the PDF, albeit not in the order provided. A "
"\"literal\" match means that the text you enter must appear in "
"the PDF exactly as you've entered it, and \"regular expression\" "
"uses a regex to match the PDF. (If you don't know what a regex "
"is, you probably don't want this option.) Finally, a \"fuzzy "
"match\" looks for words or phrases that are mostly—but not "
"exactly—the same, which can be useful for matching against "
"documents containg imperfections that foil accurate OCR."
)
)
is_insensitive = models.BooleanField(default=True)
automatic_classification = models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.')
class Meta:
abstract = True
@ -64,87 +31,8 @@ class MatchingModel(models.Model):
def __str__(self):
return self.name
@property
def conditions(self):
return "{}: \"{}\" ({})".format(
self.name, self.match, self.get_matching_algorithm_display())
@classmethod
def match_all(cls, text, tags=None):
if tags is None:
tags = cls.objects.all()
text = text.lower()
for tag in tags:
if tag.matches(text):
yield tag
def matches(self, text):
search_kwargs = {}
# Check that match is not empty
if self.match.strip() == "":
return False
if self.is_insensitive:
search_kwargs = {"flags": re.IGNORECASE}
if self.matching_algorithm == self.MATCH_ALL:
for word in self._split_match():
search_result = re.search(
r"\b{}\b".format(word), text, **search_kwargs)
if not search_result:
return False
return True
if self.matching_algorithm == self.MATCH_ANY:
for word in self._split_match():
if re.search(r"\b{}\b".format(word), text, **search_kwargs):
return True
return False
if self.matching_algorithm == self.MATCH_LITERAL:
return bool(re.search(
r"\b{}\b".format(self.match), text, **search_kwargs))
if self.matching_algorithm == self.MATCH_REGEX:
return bool(re.search(
re.compile(self.match, **search_kwargs), text))
if self.matching_algorithm == self.MATCH_FUZZY:
match = re.sub(r'[^\w\s]', '', self.match)
text = re.sub(r'[^\w\s]', '', text)
if self.is_insensitive:
match = match.lower()
text = text.lower()
return True if fuzz.partial_ratio(match, text) >= 90 else False
raise NotImplementedError("Unsupported matching algorithm")
def _split_match(self):
"""
Splits the match to individual keywords, getting rid of unnecessary
spaces and grouping quoted words together.
Example:
' some random words "with quotes " and spaces'
==>
["some", "random", "words", "with\s+quotes", "and", "spaces"]
"""
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub
return [
normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
for t in findterms(self.match)
]
def save(self, *args, **kwargs):
self.match = self.match.lower()
if not self.slug:
self.slug = slugify(self.name)

View File

@ -16,15 +16,17 @@ def logger(message, group):
logging.getLogger(__name__).debug(message, extra={"group": group})
classifier = None
classifier = DocumentClassifier()
def classify_document(sender, document=None, logging_group=None, **kwargs):
global classifier
if classifier is None:
classifier = DocumentClassifier.load_classifier()
try:
classifier.reload()
classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True)
except FileNotFoundError:
logging.getLogger(__name__).fatal("Cannot classify document, classifier model file was not found.")
classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True)