add fuzzy matching + tests

2025-12-18 01:41:14 -06:00 · 2017-04-29 17:13:04 -04:00
parent 819a0e1f57
commit 6ce27d225d
3 changed files with 36 additions and 2 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,7 @@ django-filter>=1.0
 django-flat-responsive>=1.2.0
 djangorestframework>=3.5.3
 filemagic>=1.6
 fuzzywuzzy[speedup]==0.15.0
 langdetect>=1.0.7
 pyocr>=0.4.6
 python-dateutil>=2.6.0
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -5,6 +5,7 @@ import re
 import uuid
 from collections import OrderedDict
 from fuzzywuzzy import fuzz
 from django.conf import settings
 from django.core.urlresolvers import reverse
@@ -21,11 +22,13 @@ class MatchingModel(models.Model):
    MATCH_ALL = 2
    MATCH_LITERAL = 3
    MATCH_REGEX = 4
    MATCH_FUZZY = 5
    MATCHING_ALGORITHMS = (
        (MATCH_ANY, "Any"),
        (MATCH_ALL, "All"),
        (MATCH_LITERAL, "Literal"),
        (MATCH_REGEX, "Regular Expression"),
        (MATCH_FUZZY, "Fuzzy Match"),
    )
    name = models.CharField(max_length=128, unique=True)
@@ -42,8 +45,13 @@ class MatchingModel(models.Model):
            "provided appear in the PDF, albeit not in the order provided.  A "
            "\"literal\" match means that the text you enter must appear in "
            "the PDF exactly as you've entered it, and \"regular expression\" "
-            "uses a regex to match the PDF.  If you don't know what a regex "
+            "uses a regex to match the PDF.  (If you don't know what a regex "
-            "is, you probably don't want this option."
+            "is, you probably don't want this option.)  Finally, a \"fuzzy "
            "match\" strips all punctuation from both the match candidate "
            "and the OCR'd text and looks for a Levenshtein \"partial ratio\" "
            "(as implemented in the Python package \"FuzzyWuzzy\") of >= 90, "
            "which can be useful for matching against documents with "
            "imperfections that foil accurate OCR."
        )
    )
@@ -104,6 +112,15 @@ class MatchingModel(models.Model):
            return bool(re.search(
                re.compile(self.match, **search_kwargs), text))
        if self.matching_algorithm == self.MATCH_FUZZY:
            match = re.sub(r'[^\w\s]', '', self.match)
            text = re.sub(r'[^\w\s]', '', text)
            if self.is_insensitive:
                match = match.lower()
                text = text.lower()
            return True if fuzz.partial_ratio(match, text) >= 90 else False
        raise NotImplementedError("Unsupported matching algorithm")
    def save(self, *args, **kwargs):
--- a/src/documents/tests/test_matchables.py
+++ b/src/documents/tests/test_matchables.py
@@ -149,6 +149,22 @@ class TestMatching(TestCase):
            )
        )
    def test_match_fuzzy(self):
        self._test_matching(
            "Springfield, Miss.",
            "MATCH_FUZZY",
            (
                "1220 Main Street, Springf eld, Miss.",
                "1220 Main Street, Spring field, Miss.",
                "1220 Main Street, Springfeld, Miss.",
                "1220 Main Street Springfield Miss",
            ),
            (
                "1220 Main Street, Springfield, Mich.",
            )
        )
 class TestApplications(TestCase):
    """