From d093c004fb072edb808a7d28c6f1d5d24ae04165 Mon Sep 17 00:00:00 2001
From: jonaswinkler <jonas.winkler@jpwinkler.de>
Date: Wed, 13 Jan 2021 17:17:23 +0100
Subject: [PATCH] fixes #161

---
 src/documents/matching.py              | 83 ++++++++++++++++++++------
 src/documents/signals/handlers.py      |  6 +-
 src/documents/tests/test_matchables.py | 18 +++++-
 3 files changed, 83 insertions(+), 24 deletions(-)

diff --git a/src/documents/matching.py b/src/documents/matching.py
index 212698ad3..fa9ce6af3 100644
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -1,3 +1,4 @@
+import logging
 import re
 
 from fuzzywuzzy import fuzz
@@ -5,49 +6,59 @@ from fuzzywuzzy import fuzz
 from documents.models import MatchingModel, Correspondent, DocumentType, Tag
 
 
-def match_correspondents(document_content, classifier):
+logger = logging.getLogger(__name__)
+
+
+def log_reason(matching_model, document, reason):
+    class_name = type(matching_model).__name__
+    logger.debug(
+        f"Assigning {class_name} {matching_model.name} to document "
+        f"{document} because {reason}")
+
+
+def match_correspondents(document, classifier):
     if classifier:
-        pred_id = classifier.predict_correspondent(document_content)
+        pred_id = classifier.predict_correspondent(document.content)
     else:
         pred_id = None
 
     correspondents = Correspondent.objects.all()
 
     return list(filter(
-        lambda o: matches(o, document_content) or o.pk == pred_id,
+        lambda o: matches(o, document) or o.pk == pred_id,
         correspondents))
 
 
-def match_document_types(document_content, classifier):
+def match_document_types(document, classifier):
     if classifier:
-        pred_id = classifier.predict_document_type(document_content)
+        pred_id = classifier.predict_document_type(document.content)
     else:
         pred_id = None
 
     document_types = DocumentType.objects.all()
 
     return list(filter(
-        lambda o: matches(o, document_content) or o.pk == pred_id,
+        lambda o: matches(o, document) or o.pk == pred_id,
         document_types))
 
 
-def match_tags(document_content, classifier):
+def match_tags(document, classifier):
     if classifier:
-        predicted_tag_ids = classifier.predict_tags(document_content)
+        predicted_tag_ids = classifier.predict_tags(document.content)
     else:
         predicted_tag_ids = []
 
     tags = Tag.objects.all()
 
     return list(filter(
-        lambda o: matches(o, document_content) or o.pk in predicted_tag_ids,
+        lambda o: matches(o, document) or o.pk in predicted_tag_ids,
         tags))
 
 
-def matches(matching_model, document_content):
+def matches(matching_model, document):
     search_kwargs = {}
 
-    document_content = document_content.lower()
+    document_content = document.content.lower()
 
     # Check that match is not empty
     if matching_model.match.strip() == "":
@@ -62,26 +73,54 @@ def matches(matching_model, document_content):
                 rf"\b{word}\b", document_content, **search_kwargs)
             if not search_result:
                 return False
+        log_reason(
+            matching_model, document,
+            f"it contains all of these words: {matching_model.match}"
+        )
         return True
 
     elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
         for word in _split_match(matching_model):
             if re.search(rf"\b{word}\b", document_content, **search_kwargs):
+                log_reason(
+                    matching_model, document,
+                    f"it contains this word: {word}"
+                )
                 return True
         return False
 
     elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
-        return bool(re.search(
+        result = bool(re.search(
             rf"\b{matching_model.match}\b",
             document_content,
             **search_kwargs
         ))
+        if result:
+            log_reason(
+                matching_model, document,
+                f"it contains this string: \"{matching_model.match}\""
+            )
+        return result
 
     elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
-        return bool(re.search(
-            re.compile(matching_model.match, **search_kwargs),
-            document_content
-        ))
+        try:
+            match = re.search(
+                re.compile(matching_model.match, **search_kwargs),
+                document_content
+            )
+        except re.error:
+            logger.error(
+                f"Error while processing regular expression "
+                f"{matching_model.match}"
+            )
+            return False
+        if match:
+            log_reason(
+                matching_model, document,
+                f"the string {match.group()} matches the regular expression "
+                f"{matching_model.match}"
+            )
+        return bool(match)
 
     elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
         match = re.sub(r'[^\w\s]', '', matching_model.match)
@@ -89,8 +128,16 @@ def matches(matching_model, document_content):
         if matching_model.is_insensitive:
             match = match.lower()
             text = text.lower()
-
-        return fuzz.partial_ratio(match, text) >= 90
+        if fuzz.partial_ratio(match, text) >= 90:
+            # TODO: make this better
+            log_reason(
+                matching_model, document,
+                f"parts of the document content somehow match the string "
+                f"{matching_model.match}"
+            )
+            return True
+        else:
+            return False
 
     elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
         # this is done elsewhere.
diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py
index be101b3af..76c1e76ed 100755
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@@ -38,7 +38,7 @@ def set_correspondent(sender,
     if document.correspondent and not replace:
         return
 
-    potential_correspondents = matching.match_correspondents(document.content,
+    potential_correspondents = matching.match_correspondents(document,
                                                              classifier)
 
     potential_count = len(potential_correspondents)
@@ -81,7 +81,7 @@ def set_document_type(sender,
     if document.document_type and not replace:
         return
 
-    potential_document_type = matching.match_document_types(document.content,
+    potential_document_type = matching.match_document_types(document,
                                                             classifier)
 
     potential_count = len(potential_document_type)
@@ -130,7 +130,7 @@ def set_tags(sender,
 
     current_tags = set(document.tags.all())
 
-    matched_tags = matching.match_tags(document.content, classifier)
+    matched_tags = matching.match_tags(document, classifier)
 
     relevant_tags = set(matched_tags) - current_tags
 
diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py
index 4e4a3e7dc..da0fa66ea 100644
--- a/src/documents/tests/test_matchables.py
+++ b/src/documents/tests/test_matchables.py
@@ -21,13 +21,15 @@ class TestMatching(TestCase):
                 matching_algorithm=getattr(klass, algorithm)
             )
             for string in true:
+                doc = Document(content=string)
                 self.assertTrue(
-                    matching.matches(instance, string),
+                    matching.matches(instance, doc),
                     '"%s" should match "%s" but it does not' % (text, string)
                 )
             for string in false:
+                doc = Document(content=string)
                 self.assertFalse(
-                    matching.matches(instance, string),
+                    matching.matches(instance, doc),
                     '"%s" should not match "%s" but it does' % (text, string)
                 )
 
@@ -169,7 +171,7 @@ class TestMatching(TestCase):
     def test_match_regex(self):
 
         self._test_matching(
-            r"alpha\w+gamma",
+            "alpha\w+gamma",
             "MATCH_REGEX",
             (
                 "I have alpha_and_gamma in me",
@@ -187,6 +189,16 @@ class TestMatching(TestCase):
             )
         )
 
+    def test_tach_invalid_regex(self):
+        self._test_matching(
+            "[[",
+            "MATCH_REGEX",
+            [],
+            [
+                "Don't match this"
+            ]
+        )
+
     def test_match_fuzzy(self):
 
         self._test_matching(