add fuzzy matching + tests

This commit is contained in:
jgysland 2017-04-29 17:13:04 -04:00
parent 819a0e1f57
commit 6ce27d225d
3 changed files with 36 additions and 2 deletions

View File

@ -6,6 +6,7 @@ django-filter>=1.0
django-flat-responsive>=1.2.0 django-flat-responsive>=1.2.0
djangorestframework>=3.5.3 djangorestframework>=3.5.3
filemagic>=1.6 filemagic>=1.6
fuzzywuzzy[speedup]==0.15.0
langdetect>=1.0.7 langdetect>=1.0.7
pyocr>=0.4.6 pyocr>=0.4.6
python-dateutil>=2.6.0 python-dateutil>=2.6.0

View File

@ -5,6 +5,7 @@ import re
import uuid import uuid
from collections import OrderedDict from collections import OrderedDict
from fuzzywuzzy import fuzz
from django.conf import settings from django.conf import settings
from django.core.urlresolvers import reverse from django.core.urlresolvers import reverse
@ -21,11 +22,13 @@ class MatchingModel(models.Model):
MATCH_ALL = 2 MATCH_ALL = 2
MATCH_LITERAL = 3 MATCH_LITERAL = 3
MATCH_REGEX = 4 MATCH_REGEX = 4
MATCH_FUZZY = 5
MATCHING_ALGORITHMS = ( MATCHING_ALGORITHMS = (
(MATCH_ANY, "Any"), (MATCH_ANY, "Any"),
(MATCH_ALL, "All"), (MATCH_ALL, "All"),
(MATCH_LITERAL, "Literal"), (MATCH_LITERAL, "Literal"),
(MATCH_REGEX, "Regular Expression"), (MATCH_REGEX, "Regular Expression"),
(MATCH_FUZZY, "Fuzzy Match"),
) )
name = models.CharField(max_length=128, unique=True) name = models.CharField(max_length=128, unique=True)
@ -42,8 +45,13 @@ class MatchingModel(models.Model):
"provided appear in the PDF, albeit not in the order provided. A " "provided appear in the PDF, albeit not in the order provided. A "
"\"literal\" match means that the text you enter must appear in " "\"literal\" match means that the text you enter must appear in "
"the PDF exactly as you've entered it, and \"regular expression\" " "the PDF exactly as you've entered it, and \"regular expression\" "
"uses a regex to match the PDF. If you don't know what a regex " "uses a regex to match the PDF. (If you don't know what a regex "
"is, you probably don't want this option." "is, you probably don't want this option.) Finally, a \"fuzzy "
"match\" strips all punctuation from both the match candidate "
"and the OCR'd text and looks for a Levenshtein \"partial ratio\" "
"(as implemented in the Python package \"FuzzyWuzzy\") of >= 90, "
"which can be useful for matching against documents with "
"imperfections that foil accurate OCR."
) )
) )
@ -104,6 +112,15 @@ class MatchingModel(models.Model):
return bool(re.search( return bool(re.search(
re.compile(self.match, **search_kwargs), text)) re.compile(self.match, **search_kwargs), text))
if self.matching_algorithm == self.MATCH_FUZZY:
match = re.sub(r'[^\w\s]', '', self.match)
text = re.sub(r'[^\w\s]', '', text)
if self.is_insensitive:
match = match.lower()
text = text.lower()
return True if fuzz.partial_ratio(match, text) >= 90 else False
raise NotImplementedError("Unsupported matching algorithm") raise NotImplementedError("Unsupported matching algorithm")
def save(self, *args, **kwargs): def save(self, *args, **kwargs):

View File

@ -149,6 +149,22 @@ class TestMatching(TestCase):
) )
) )
def test_match_fuzzy(self):
    """Exercise the MATCH_FUZZY algorithm against OCR-style variations.

    Delegates to ``self._test_matching`` with the match string, the
    algorithm name, and two tuples of candidate texts.
    """
    pattern = "Springfield, Miss."

    # NOTE(review): presumably the first tuple holds texts expected to
    # match and the second holds texts expected NOT to match -- confirm
    # against the signature of _test_matching.
    first_group = (
        "1220 Main Street, Springf eld, Miss.",
        "1220 Main Street, Spring field, Miss.",
        "1220 Main Street, Springfeld, Miss.",
        "1220 Main Street Springfield Miss",
    )
    second_group = (
        "1220 Main Street, Springfield, Mich.",
    )

    self._test_matching(pattern, "MATCH_FUZZY", first_group, second_group)
class TestApplications(TestCase): class TestApplications(TestCase):
""" """