jonaswinkler 2021-01-13 17:17:23 +01:00
parent ceb9426fd4
commit d093c004fb
3 changed files with 83 additions and 24 deletions

View File

@@ -1,3 +1,4 @@
+import logging
 import re
 
 from fuzzywuzzy import fuzz
@@ -5,49 +6,59 @@ from fuzzywuzzy import fuzz
 from documents.models import MatchingModel, Correspondent, DocumentType, Tag
 
 
-def match_correspondents(document_content, classifier):
+logger = logging.getLogger(__name__)
+
+
+def log_reason(matching_model, document, reason):
+    class_name = type(matching_model).__name__
+    logger.debug(
+        f"Assigning {class_name} {matching_model.name} to document "
+        f"{document} because {reason}")
+
+
+def match_correspondents(document, classifier):
     if classifier:
-        pred_id = classifier.predict_correspondent(document_content)
+        pred_id = classifier.predict_correspondent(document.content)
     else:
        pred_id = None
 
     correspondents = Correspondent.objects.all()
 
     return list(filter(
-        lambda o: matches(o, document_content) or o.pk == pred_id,
+        lambda o: matches(o, document) or o.pk == pred_id,
         correspondents))
 
 
-def match_document_types(document_content, classifier):
+def match_document_types(document, classifier):
     if classifier:
-        pred_id = classifier.predict_document_type(document_content)
+        pred_id = classifier.predict_document_type(document.content)
     else:
         pred_id = None
 
     document_types = DocumentType.objects.all()
 
     return list(filter(
-        lambda o: matches(o, document_content) or o.pk == pred_id,
+        lambda o: matches(o, document) or o.pk == pred_id,
         document_types))
 
 
-def match_tags(document_content, classifier):
+def match_tags(document, classifier):
     if classifier:
-        predicted_tag_ids = classifier.predict_tags(document_content)
+        predicted_tag_ids = classifier.predict_tags(document.content)
     else:
         predicted_tag_ids = []
 
     tags = Tag.objects.all()
 
     return list(filter(
-        lambda o: matches(o, document_content) or o.pk in predicted_tag_ids,
+        lambda o: matches(o, document) or o.pk in predicted_tag_ids,
         tags))
 
 
-def matches(matching_model, document_content):
+def matches(matching_model, document):
     search_kwargs = {}
 
-    document_content = document_content.lower()
+    document_content = document.content.lower()
 
     # Check that match is not empty
     if matching_model.match.strip() == "":
@@ -62,26 +73,54 @@ def matches(matching_model, document_content):
                     rf"\b{word}\b", document_content, **search_kwargs)
             if not search_result:
                 return False
+        log_reason(
+            matching_model, document,
+            f"it contains all of these words: {matching_model.match}"
+        )
         return True
     elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
         for word in _split_match(matching_model):
             if re.search(rf"\b{word}\b", document_content, **search_kwargs):
+                log_reason(
+                    matching_model, document,
+                    f"it contains this word: {word}"
+                )
                 return True
         return False
     elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
-        return bool(re.search(
+        result = bool(re.search(
             rf"\b{matching_model.match}\b",
             document_content,
             **search_kwargs
         ))
+        if result:
+            log_reason(
+                matching_model, document,
+                f"it contains this string: \"{matching_model.match}\""
+            )
+        return result
     elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
-        return bool(re.search(
-            re.compile(matching_model.match, **search_kwargs),
-            document_content
-        ))
+        try:
+            match = re.search(
+                re.compile(matching_model.match, **search_kwargs),
+                document_content
+            )
+        except re.error:
+            logger.error(
+                f"Error while processing regular expression "
+                f"{matching_model.match}"
+            )
+            return False
+        if match:
+            log_reason(
+                matching_model, document,
+                f"the string {match.group()} matches the regular expression "
+                f"{matching_model.match}"
+            )
+        return bool(match)
     elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
         match = re.sub(r'[^\w\s]', '', matching_model.match)
@@ -89,8 +128,16 @@ def matches(matching_model, document_content):
         if matching_model.is_insensitive:
             match = match.lower()
             text = text.lower()
-        # TODO: make this better
-        return fuzz.partial_ratio(match, text) >= 90
+        if fuzz.partial_ratio(match, text) >= 90:
+            # TODO: make this better
+            log_reason(
+                matching_model, document,
+                f"parts of the document content somehow match the string "
+                f"{matching_model.match}"
+            )
+            return True
+        else:
+            return False
 
     elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
         # this is done elsewhere.
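
For context, a minimal sketch of how the reworked API above can be exercised from a Django shell (not part of the commit; it assumes the module shown here is importable as documents.matching, so that its logger is named "documents.matching", and the tag and document contents are invented):

import logging

from documents import matching
from documents.models import Document, MatchingModel, Tag

# Surface the new log_reason() output, which is emitted at DEBUG level.
# The logger name "documents.matching" is an assumption based on getLogger(__name__).
logger = logging.getLogger("documents.matching")
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

# matches() now receives the Document itself rather than a bare content string.
doc = Document(content="Your invoice from ACME is attached.")  # unsaved is fine here
tag = Tag(name="Invoice", match="invoice",
          matching_algorithm=MatchingModel.MATCH_ANY)

if matching.matches(tag, doc):
    print(f"{tag} would be assigned; the DEBUG line above explains why")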

View File

@@ -38,7 +38,7 @@ def set_correspondent(sender,
     if document.correspondent and not replace:
         return
 
-    potential_correspondents = matching.match_correspondents(document.content,
+    potential_correspondents = matching.match_correspondents(document,
                                                               classifier)
 
     potential_count = len(potential_correspondents)
@@ -81,7 +81,7 @@ def set_document_type(sender,
     if document.document_type and not replace:
         return
 
-    potential_document_type = matching.match_document_types(document.content,
+    potential_document_type = matching.match_document_types(document,
                                                             classifier)
 
     potential_count = len(potential_document_type)
@@ -130,7 +130,7 @@ def set_tags(sender,
     current_tags = set(document.tags.all())
 
-    matched_tags = matching.match_tags(document.content, classifier)
+    matched_tags = matching.match_tags(document, classifier)
 
     relevant_tags = set(matched_tags) - current_tags
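
A similar sketch for the handler side (again not part of the commit; import paths and the sample tag and content are assumptions). As in def set_tags(sender, ...) above, the Document itself is passed rather than document.content, and classifier may be None, in which case only the rule-based matchers run:

from documents import matching
from documents.models import Document, MatchingModel, Tag

# A rule-based tag, saved so that Tag.objects.all() inside match_tags() finds it.
tag = Tag.objects.create(name="invoice", match="invoice",
                         matching_algorithm=MatchingModel.MATCH_ANY)

document = Document(content="Your invoice from ACME is attached.")  # unsaved is fine
matched = matching.match_tags(document, classifier=None)
print([t.name for t in matched])  # expected: ['invoice']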

View File

@@ -21,13 +21,15 @@ class TestMatching(TestCase):
                 matching_algorithm=getattr(klass, algorithm)
             )
             for string in true:
+                doc = Document(content=string)
                 self.assertTrue(
-                    matching.matches(instance, string),
+                    matching.matches(instance, doc),
                     '"%s" should match "%s" but it does not' % (text, string)
                 )
             for string in false:
+                doc = Document(content=string)
                 self.assertFalse(
-                    matching.matches(instance, string),
+                    matching.matches(instance, doc),
                     '"%s" should not match "%s" but it does' % (text, string)
                 )
@@ -169,7 +171,7 @@ class TestMatching(TestCase):
     def test_match_regex(self):
         self._test_matching(
-            r"alpha\w+gamma",
+            "alpha\w+gamma",
             "MATCH_REGEX",
             (
                 "I have alpha_and_gamma in me",
@@ -187,6 +189,16 @@ class TestMatching(TestCase):
             )
         )
 
+    def test_tach_invalid_regex(self):
+        self._test_matching(
+            "[[",
+            "MATCH_REGEX",
+            [],
+            [
+                "Don't match this"
+            ]
+        )
+
     def test_match_fuzzy(self):
         self._test_matching(
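
The new test above exercises the re.error branch added to matches(): an invalid pattern such as "[[" is now logged as an error and treated as a non-match instead of raising from re.compile(). A hypothetical standalone illustration (assumed import paths, invented content):

from documents import matching
from documents.models import Document, MatchingModel, Tag

broken = Tag(name="broken", match="[[",  # "[[" is not a valid regular expression
             matching_algorithm=MatchingModel.MATCH_REGEX)
doc = Document(content="Don't match this")

# Before this change, re.compile("[[") raised re.error here; now matches()
# logs "Error while processing regular expression [[" and returns False.
assert matching.matches(broken, doc) is False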