Merge pull request #220 from jgysland/add-fuzzy-matching

fuzzy matching
This commit is contained in:
Daniel Quinn 2017-04-30 19:37:03 -07:00 committed by GitHub
commit 5eb26102d4
3 changed files with 34 additions and 2 deletions

View File

@ -6,6 +6,7 @@ django-filter>=1.0
django-flat-responsive>=1.2.0 django-flat-responsive>=1.2.0
djangorestframework>=3.5.3 djangorestframework>=3.5.3
filemagic>=1.6 filemagic>=1.6
fuzzywuzzy[speedup]==0.15.0
langdetect>=1.0.7 langdetect>=1.0.7
pyocr>=0.4.6 pyocr>=0.4.6
python-dateutil>=2.6.0 python-dateutil>=2.6.0

View File

@ -5,6 +5,7 @@ import re
import uuid import uuid
from collections import OrderedDict from collections import OrderedDict
from fuzzywuzzy import fuzz
from django.conf import settings from django.conf import settings
from django.core.urlresolvers import reverse from django.core.urlresolvers import reverse
@ -21,11 +22,13 @@ class MatchingModel(models.Model):
MATCH_ALL = 2 MATCH_ALL = 2
MATCH_LITERAL = 3 MATCH_LITERAL = 3
MATCH_REGEX = 4 MATCH_REGEX = 4
MATCH_FUZZY = 5
MATCHING_ALGORITHMS = ( MATCHING_ALGORITHMS = (
(MATCH_ANY, "Any"), (MATCH_ANY, "Any"),
(MATCH_ALL, "All"), (MATCH_ALL, "All"),
(MATCH_LITERAL, "Literal"), (MATCH_LITERAL, "Literal"),
(MATCH_REGEX, "Regular Expression"), (MATCH_REGEX, "Regular Expression"),
(MATCH_FUZZY, "Fuzzy Match"),
) )
name = models.CharField(max_length=128, unique=True) name = models.CharField(max_length=128, unique=True)
@ -42,8 +45,11 @@ class MatchingModel(models.Model):
"provided appear in the PDF, albeit not in the order provided. A " "provided appear in the PDF, albeit not in the order provided. A "
"\"literal\" match means that the text you enter must appear in " "\"literal\" match means that the text you enter must appear in "
"the PDF exactly as you've entered it, and \"regular expression\" " "the PDF exactly as you've entered it, and \"regular expression\" "
"uses a regex to match the PDF. If you don't know what a regex " "uses a regex to match the PDF. (If you don't know what a regex "
"is, you probably don't want this option." "is, you probably don't want this option.) Finally, a \"fuzzy "
"match\" looks for words or phrases that are mostly—but not "
"exactly—the same, which can be useful for matching against "
"documents containg imperfections that foil accurate OCR."
) )
) )
@ -104,6 +110,15 @@ class MatchingModel(models.Model):
return bool(re.search( return bool(re.search(
re.compile(self.match, **search_kwargs), text)) re.compile(self.match, **search_kwargs), text))
if self.matching_algorithm == self.MATCH_FUZZY:
match = re.sub(r'[^\w\s]', '', self.match)
text = re.sub(r'[^\w\s]', '', text)
if self.is_insensitive:
match = match.lower()
text = text.lower()
return True if fuzz.partial_ratio(match, text) >= 90 else False
raise NotImplementedError("Unsupported matching algorithm") raise NotImplementedError("Unsupported matching algorithm")
def save(self, *args, **kwargs): def save(self, *args, **kwargs):

View File

@ -149,6 +149,22 @@ class TestMatching(TestCase):
) )
) )
def test_match_fuzzy(self):
    """
    Exercise the "MATCH_FUZZY" algorithm with a single pattern against
    several address strings containing OCR-like noise (dropped letters,
    stray spaces, missing punctuation) and one near-miss string.
    """
    pattern = "Springfield, Miss."

    # NOTE(review): the third argument is presumably the texts expected to
    # match and the fourth those expected not to -- confirm against
    # _test_matching, which is defined outside this hunk.
    noisy_variants = (
        "1220 Main Street, Springf eld, Miss.",
        "1220 Main Street, Spring field, Miss.",
        "1220 Main Street, Springfeld, Miss.",
        "1220 Main Street Springfield Miss",
    )
    near_misses = (
        "1220 Main Street, Springfield, Mich.",
    )

    self._test_matching(
        pattern,
        "MATCH_FUZZY",
        noisy_variants,
        near_misses
    )
class TestApplications(TestCase): class TestApplications(TestCase):
""" """