Implemented the classifier model, including automatic tagging of new documents

This commit is contained in:
Jonas Winkler
2018-09-04 14:39:55 +02:00
parent 3eecd67fc1
commit c50c517928
10 changed files with 240 additions and 339 deletions

View File

@@ -8,6 +8,7 @@ from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.utils import timezone
from documents.classifier import DocumentClassifier
from ..models import Correspondent, Document, Tag, DocumentType
@@ -15,79 +16,16 @@ def logger(message, group):
logging.getLogger(__name__).debug(message, extra={"group": group})
def set_correspondent(sender, document=None, logging_group=None, **kwargs):
# No sense in assigning a correspondent when one is already set.
if document.correspondent:
return
# No matching correspondents, so no need to continue
potential_correspondents = list(Correspondent.match_all(document.content))
if not potential_correspondents:
return
potential_count = len(potential_correspondents)
selected = potential_correspondents[0]
if potential_count > 1:
message = "Detected {} potential correspondents, so we've opted for {}"
logger(
message.format(potential_count, selected),
logging_group
)
logger(
'Assigning correspondent "{}" to "{}" '.format(selected, document),
logging_group
)
document.correspondent = selected
document.save(update_fields=("correspondent",))
classifier = None
def set_document_type(sender, document=None, logging_group=None, **kwargs):
def classify_document(sender, document=None, logging_group=None, **kwargs):
global classifier
if classifier is None:
classifier = DocumentClassifier.load_classifier()
# No sense in assigning a correspondent when one is already set.
if document.document_type:
return
classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True)
# No matching document types, so no need to continue
potential_document_types = list(DocumentType.match_all(document.content))
if not potential_document_types:
return
potential_count = len(potential_document_types)
selected = potential_document_types[0]
if potential_count > 1:
message = "Detected {} potential document types, so we've opted for {}"
logger(
message.format(potential_count, selected),
logging_group
)
logger(
'Assigning document type "{}" to "{}" '.format(selected, document),
logging_group
)
document.document_type = selected
document.save(update_fields=("document_type",))
def set_tags(sender, document=None, logging_group=None, **kwargs):
current_tags = set(document.tags.all())
relevant_tags = (set(Tag.match_all(document.content)) | set(Tag.objects.filter(is_inbox_tag=True))) - current_tags
if not relevant_tags:
return
message = 'Tagging "{}" with "{}"'
logger(
message.format(document, ", ".join([t.slug for t in relevant_tags])),
logging_group
)
document.tags.add(*relevant_tags)
def run_pre_consume_script(sender, filename, **kwargs):