fixes #161

2026-01-12 21:44:21 -06:00 · 2021-01-13 17:17:23 +01:00
parent ceb9426fd4
commit d093c004fb
3 changed files with 83 additions and 24 deletions
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -1,3 +1,4 @@
+import logging
 import re

 from fuzzywuzzy import fuzz
@@ -5,49 +6,59 @@ from fuzzywuzzy import fuzz
 from documents.models import MatchingModel, Correspondent, DocumentType, Tag


-def match_correspondents(document_content, classifier):
+logger = logging.getLogger(__name__)
+
+
+def log_reason(matching_model, document, reason):
+    class_name = type(matching_model).__name__
+    logger.debug(
+        f"Assigning {class_name} {matching_model.name} to document "
+        f"{document} because {reason}")
+
+
+def match_correspondents(document, classifier):
    if classifier:
-        pred_id = classifier.predict_correspondent(document_content)
+        pred_id = classifier.predict_correspondent(document.content)
    else:
        pred_id = None

    correspondents = Correspondent.objects.all()

    return list(filter(
-        lambda o: matches(o, document_content) or o.pk == pred_id,
+        lambda o: matches(o, document) or o.pk == pred_id,
        correspondents))


-def match_document_types(document_content, classifier):
+def match_document_types(document, classifier):
    if classifier:
-        pred_id = classifier.predict_document_type(document_content)
+        pred_id = classifier.predict_document_type(document.content)
    else:
        pred_id = None

    document_types = DocumentType.objects.all()

    return list(filter(
-        lambda o: matches(o, document_content) or o.pk == pred_id,
+        lambda o: matches(o, document) or o.pk == pred_id,
        document_types))


-def match_tags(document_content, classifier):
+def match_tags(document, classifier):
    if classifier:
-        predicted_tag_ids = classifier.predict_tags(document_content)
+        predicted_tag_ids = classifier.predict_tags(document.content)
    else:
        predicted_tag_ids = []

    tags = Tag.objects.all()

    return list(filter(
-        lambda o: matches(o, document_content) or o.pk in predicted_tag_ids,
+        lambda o: matches(o, document) or o.pk in predicted_tag_ids,
        tags))


-def matches(matching_model, document_content):
+def matches(matching_model, document):
    search_kwargs = {}

-    document_content = document_content.lower()
+    document_content = document.content.lower()

    # Check that match is not empty
    if matching_model.match.strip() == "":
@@ -62,26 +73,54 @@ def matches(matching_model, document_content):
                rf"\b{word}\b", document_content, **search_kwargs)
            if not search_result:
                return False
+        log_reason(
+            matching_model, document,
+            f"it contains all of these words: {matching_model.match}"
+        )
        return True

    elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
        for word in _split_match(matching_model):
            if re.search(rf"\b{word}\b", document_content, **search_kwargs):
+                log_reason(
+                    matching_model, document,
+                    f"it contains this word: {word}"
+                )
                return True
        return False

    elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
-        return bool(re.search(
+        result = bool(re.search(
            rf"\b{matching_model.match}\b",
            document_content,
            **search_kwargs
        ))
+        if result:
+            log_reason(
+                matching_model, document,
+                f"it contains this string: \"{matching_model.match}\""
+            )
+        return result

    elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
-        return bool(re.search(
-            re.compile(matching_model.match, **search_kwargs),
-            document_content
-        ))
+        try:
+            match = re.search(
+                re.compile(matching_model.match, **search_kwargs),
+                document_content
+            )
+        except re.error:
+            logger.error(
+                f"Error while processing regular expression "
+                f"{matching_model.match}"
+            )
+            return False
+        if match:
+            log_reason(
+                matching_model, document,
+                f"the string {match.group()} matches the regular expression "
+                f"{matching_model.match}"
+            )
+        return bool(match)

    elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
        match = re.sub(r'[^\w\s]', '', matching_model.match)
@@ -89,8 +128,16 @@ def matches(matching_model, document_content):
        if matching_model.is_insensitive:
            match = match.lower()
            text = text.lower()
-
-        return fuzz.partial_ratio(match, text) >= 90
+        if fuzz.partial_ratio(match, text) >= 90:
+            # TODO: make this better
+            log_reason(
+                matching_model, document,
+                f"parts of the document content somehow match the string "
+                f"{matching_model.match}"
+            )
+            return True
+        else:
+            return False

    elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
        # this is done elsewhere.