From d093c004fb072edb808a7d28c6f1d5d24ae04165 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Wed, 13 Jan 2021 17:17:23 +0100 Subject: [PATCH] fixes #161 --- src/documents/matching.py | 83 ++++++++++++++++++++------ src/documents/signals/handlers.py | 6 +- src/documents/tests/test_matchables.py | 18 +++++- 3 files changed, 83 insertions(+), 24 deletions(-) diff --git a/src/documents/matching.py b/src/documents/matching.py index 212698ad3..fa9ce6af3 100644 --- a/src/documents/matching.py +++ b/src/documents/matching.py @@ -1,3 +1,4 @@ +import logging import re from fuzzywuzzy import fuzz @@ -5,49 +6,59 @@ from fuzzywuzzy import fuzz from documents.models import MatchingModel, Correspondent, DocumentType, Tag -def match_correspondents(document_content, classifier): +logger = logging.getLogger(__name__) + + +def log_reason(matching_model, document, reason): + class_name = type(matching_model).__name__ + logger.debug( + f"Assigning {class_name} {matching_model.name} to document " + f"{document} because {reason}") + + +def match_correspondents(document, classifier): if classifier: - pred_id = classifier.predict_correspondent(document_content) + pred_id = classifier.predict_correspondent(document.content) else: pred_id = None correspondents = Correspondent.objects.all() return list(filter( - lambda o: matches(o, document_content) or o.pk == pred_id, + lambda o: matches(o, document) or o.pk == pred_id, correspondents)) -def match_document_types(document_content, classifier): +def match_document_types(document, classifier): if classifier: - pred_id = classifier.predict_document_type(document_content) + pred_id = classifier.predict_document_type(document.content) else: pred_id = None document_types = DocumentType.objects.all() return list(filter( - lambda o: matches(o, document_content) or o.pk == pred_id, + lambda o: matches(o, document) or o.pk == pred_id, document_types)) -def match_tags(document_content, classifier): +def match_tags(document, classifier): if classifier: - predicted_tag_ids = classifier.predict_tags(document_content) + predicted_tag_ids = classifier.predict_tags(document.content) else: predicted_tag_ids = [] tags = Tag.objects.all() return list(filter( - lambda o: matches(o, document_content) or o.pk in predicted_tag_ids, + lambda o: matches(o, document) or o.pk in predicted_tag_ids, tags)) -def matches(matching_model, document_content): +def matches(matching_model, document): search_kwargs = {} - document_content = document_content.lower() + document_content = document.content.lower() # Check that match is not empty if matching_model.match.strip() == "": @@ -62,26 +73,54 @@ def matches(matching_model, document_content): rf"\b{word}\b", document_content, **search_kwargs) if not search_result: return False + log_reason( + matching_model, document, + f"it contains all of these words: {matching_model.match}" + ) return True elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY: for word in _split_match(matching_model): if re.search(rf"\b{word}\b", document_content, **search_kwargs): + log_reason( + matching_model, document, + f"it contains this word: {word}" + ) return True return False elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL: - return bool(re.search( + result = bool(re.search( rf"\b{matching_model.match}\b", document_content, **search_kwargs )) + if result: + log_reason( + matching_model, document, + f"it contains this string: \"{matching_model.match}\"" + ) + return result elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX: - return bool(re.search( - re.compile(matching_model.match, **search_kwargs), - document_content - )) + try: + match = re.search( + re.compile(matching_model.match, **search_kwargs), + document_content + ) + except re.error: + logger.error( + f"Error while processing regular expression " + f"{matching_model.match}" + ) + return False + if match: + log_reason( + matching_model, document, + f"the string {match.group()} matches the regular expression " + f"{matching_model.match}" + ) + return bool(match) elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY: match = re.sub(r'[^\w\s]', '', matching_model.match) @@ -89,8 +128,16 @@ def matches(matching_model, document_content): if matching_model.is_insensitive: match = match.lower() text = text.lower() - - return fuzz.partial_ratio(match, text) >= 90 + if fuzz.partial_ratio(match, text) >= 90: + # TODO: make this better + log_reason( + matching_model, document, + f"parts of the document content somehow match the string " + f"{matching_model.match}" + ) + return True + else: + return False elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO: # this is done elsewhere. diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index be101b3af..76c1e76ed 100755 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -38,7 +38,7 @@ def set_correspondent(sender, if document.correspondent and not replace: return - potential_correspondents = matching.match_correspondents(document.content, + potential_correspondents = matching.match_correspondents(document, classifier) potential_count = len(potential_correspondents) @@ -81,7 +81,7 @@ def set_document_type(sender, if document.document_type and not replace: return - potential_document_type = matching.match_document_types(document.content, + potential_document_type = matching.match_document_types(document, classifier) potential_count = len(potential_document_type) @@ -130,7 +130,7 @@ def set_tags(sender, current_tags = set(document.tags.all()) - matched_tags = matching.match_tags(document.content, classifier) + matched_tags = matching.match_tags(document, classifier) relevant_tags = set(matched_tags) - current_tags diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py index 4e4a3e7dc..da0fa66ea 100644 --- a/src/documents/tests/test_matchables.py +++ b/src/documents/tests/test_matchables.py @@ -21,13 +21,15 @@ class TestMatching(TestCase): matching_algorithm=getattr(klass, algorithm) ) for string in true: + doc = Document(content=string) self.assertTrue( - matching.matches(instance, string), + matching.matches(instance, doc), '"%s" should match "%s" but it does not' % (text, string) ) for string in false: + doc = Document(content=string) self.assertFalse( - matching.matches(instance, string), + matching.matches(instance, doc), '"%s" should not match "%s" but it does' % (text, string) ) @@ -169,7 +171,7 @@ class TestMatching(TestCase): def test_match_regex(self): self._test_matching( - r"alpha\w+gamma", + "alpha\w+gamma", "MATCH_REGEX", ( "I have alpha_and_gamma in me", @@ -187,6 +189,16 @@ class TestMatching(TestCase): ) ) + def test_tach_invalid_regex(self): + self._test_matching( + "[[", + "MATCH_REGEX", + [], + [ + "Don't match this" + ] + ) + def test_match_fuzzy(self): self._test_matching(