Unified document matching: legacy matching and auto-matching now work side by side

This commit is contained in:
Jonas Winkler
2020-10-28 11:45:11 +01:00
parent 368b6d0512
commit dd16b7262e
16 changed files with 629 additions and 225 deletions

View File

@@ -9,7 +9,7 @@ from django.contrib.contenttypes.models import ContentType
from django.utils import timezone
from documents.classifier import DocumentClassifier
from .. import index
from .. import index, matching
from ..models import Document, Tag
@@ -17,35 +17,107 @@ def logger(message, group):
logging.getLogger(__name__).debug(message, extra={"group": group})
# TODO: global? really?
# Module-level DocumentClassifier instance. classify_document() reloads it and
# uses it to classify the document it is called with. Note that the
# ``classifier`` keyword parameter of set_correspondent(), set_document_type()
# and set_tags() shadows this name inside those functions.
classifier = DocumentClassifier()
def index_document(sender, document=None, logging_group=None, **kwargs):
    """Forward ``document`` to ``index.add_document_to_index``.

    ``sender`` is passed through positionally and ``document`` is passed as
    the ``instance`` keyword. ``logging_group`` and any extra keyword
    arguments are accepted but not used.
    """
    index.add_document_to_index(sender, instance=document)
def classify_document(sender, document=None, logging_group=None, **kwargs):
    """Reload the module-level classifier and run it on ``document``.

    Correspondent, tag and document-type classification are all enabled.
    If reloading or classifying raises ``FileNotFoundError`` the error is
    logged at CRITICAL level and swallowed, so the caller is not interrupted.

    ``sender``, ``logging_group`` and extra keyword arguments are unused.
    """
    # The module-level ``classifier`` is only read here, never rebound, so no
    # ``global`` declaration is required.
    enabled_targets = {
        "classify_correspondent": True,
        "classify_tags": True,
        "classify_document_type": True,
    }
    try:
        classifier.reload()
        classifier.classify_document(document, **enabled_targets)
    except FileNotFoundError:
        module_log = logging.getLogger(__name__)
        # ``critical`` is the documented spelling of the ``fatal`` alias.
        module_log.critical(
            "Cannot classify document, classifier model file was not found."
        )
def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
    """Attach every tag flagged with ``is_inbox_tag=True`` to ``document``.

    ``sender``, ``logging_group`` and extra keyword arguments are unused.
    """
    document.tags.add(*Tag.objects.filter(is_inbox_tag=True))
def set_correspondent(sender,
                      document=None,
                      logging_group=None,
                      classifier=None,
                      replace=False,
                      use_first=True,
                      **kwargs):
    """Assign a matching correspondent to ``document``.

    Candidates are obtained from ``matching.match_correspondents`` using the
    document's ``content`` and the optional ``classifier``.

    Args:
        sender: Signal sender; unused.
        document: Document to update.
        logging_group: Passed to ``logger`` with every message.
        classifier: Handed through to ``matching.match_correspondents``.
        replace: If false, a document that already has a correspondent is
            left untouched. If true, the correspondent is overwritten, and
            cleared when nothing matches.
        use_first: With more than one candidate, pick the first one when
            true; when false, assign nothing.
        **kwargs: Extra signal arguments; ignored.
    """
    if document.correspondent and not replace:
        return

    potential_correspondents = matching.match_correspondents(
        document.content, classifier
    )
    potential_count = len(potential_correspondents)
    selected = potential_correspondents[0] if potential_correspondents else None

    if potential_count > 1:
        if not use_first:
            message = "Detected {} potential correspondents, not assigning any correspondent"
            logger(
                message.format(potential_count),
                logging_group
            )
            return
        message = "Detected {} potential correspondents, so we've opted for {}"
        logger(
            message.format(potential_count, selected),
            logging_group
        )

    # Nothing matched and we are not replacing: the document has no
    # correspondent at this point, so there is nothing to assign. Return
    # instead of logging 'Assigning correspondent "None"' and issuing a
    # database write that changes nothing.
    if selected is None and not replace:
        return

    logger(
        'Assigning correspondent "{}" to "{}" '.format(selected, document),
        logging_group
    )

    document.correspondent = selected
    # Restrict the write to the one column that changed.
    document.save(update_fields=("correspondent",))
def set_document_type(sender,
                      document=None,
                      logging_group=None,
                      classifier=None,
                      replace=False,
                      use_first=True,
                      **kwargs):
    """Assign a matching document type to ``document``.

    Candidates are obtained from ``matching.match_document_types`` using the
    document's ``content`` and the optional ``classifier``.

    Args:
        sender: Signal sender; unused.
        document: Document to update.
        logging_group: Passed to ``logger`` with every message.
        classifier: Handed through to ``matching.match_document_types``.
        replace: If false, a document that already has a document type is
            left untouched. If true, the document type is overwritten, and
            cleared when nothing matches.
        use_first: With more than one candidate, pick the first one when
            true; when false, assign nothing.
        **kwargs: Extra signal arguments; ignored.
    """
    if document.document_type and not replace:
        return

    potential_document_type = matching.match_document_types(
        document.content, classifier
    )
    potential_count = len(potential_document_type)
    selected = potential_document_type[0] if potential_document_type else None

    if potential_count > 1:
        if not use_first:
            message = "Detected {} potential document types, not assigning any document type"
            logger(
                message.format(potential_count),
                logging_group
            )
            return
        message = "Detected {} potential document types, so we've opted for {}"
        logger(
            message.format(potential_count, selected),
            logging_group
        )

    # Nothing matched and we are not replacing: the document has no document
    # type at this point, so there is nothing to assign. Return instead of
    # logging 'Assigning document type "None"' and issuing a database write
    # that changes nothing.
    if selected is None and not replace:
        return

    logger(
        'Assigning document type "{}" to "{}" '.format(selected, document),
        logging_group
    )

    document.document_type = selected
    # Restrict the write to the one column that changed.
    document.save(update_fields=("document_type",))
def set_tags(sender,
             document=None,
             logging_group=None,
             classifier=None,
             replace=False,
             **kwargs):
    """Add the tags matched for ``document`` that it does not yet carry.

    Matches come from ``matching.match_tags`` using the document's
    ``content`` and the optional ``classifier``. With ``replace`` set, all
    existing tags are removed first, so every match counts as new. Nothing
    is logged or added when there are no new tags.

    ``sender`` and extra keyword arguments are unused.
    """
    if replace:
        # Start from a clean slate: drop whatever is currently assigned.
        document.tags.clear()
        already_assigned = set()
    else:
        already_assigned = set(document.tags.all())

    matched = set(matching.match_tags(document.content, classifier))
    new_tags = matched.difference(already_assigned)

    if new_tags:
        slugs = ", ".join(tag.slug for tag in new_tags)
        logger('Tagging "{}" with "{}"'.format(document, slugs), logging_group)
        document.tags.add(*new_tags)
def run_pre_consume_script(sender, filename, **kwargs):
if not settings.PRE_CONSUME_SCRIPT: