Merge pull request #220 from jgysland/add-fuzzy-matching

fuzzy matching
This commit is contained in:
Daniel Quinn 2017-04-30 19:37:03 -07:00 committed by GitHub
commit 5eb26102d4
3 changed files with 34 additions and 2 deletions

View File

@ -6,6 +6,7 @@ django-filter>=1.0
django-flat-responsive>=1.2.0
djangorestframework>=3.5.3
filemagic>=1.6
fuzzywuzzy[speedup]==0.15.0
langdetect>=1.0.7
pyocr>=0.4.6
python-dateutil>=2.6.0

View File

@ -5,6 +5,7 @@ import re
import uuid
from collections import OrderedDict
from fuzzywuzzy import fuzz
from django.conf import settings
from django.core.urlresolvers import reverse
@ -21,11 +22,13 @@ class MatchingModel(models.Model):
MATCH_ALL = 2
MATCH_LITERAL = 3
MATCH_REGEX = 4
MATCH_FUZZY = 5
MATCHING_ALGORITHMS = (
(MATCH_ANY, "Any"),
(MATCH_ALL, "All"),
(MATCH_LITERAL, "Literal"),
(MATCH_REGEX, "Regular Expression"),
(MATCH_FUZZY, "Fuzzy Match"),
)
name = models.CharField(max_length=128, unique=True)
@ -42,8 +45,11 @@ class MatchingModel(models.Model):
"provided appear in the PDF, albeit not in the order provided. A "
"\"literal\" match means that the text you enter must appear in "
"the PDF exactly as you've entered it, and \"regular expression\" "
"uses a regex to match the PDF. If you don't know what a regex "
"is, you probably don't want this option."
"uses a regex to match the PDF. (If you don't know what a regex "
"is, you probably don't want this option.) Finally, a \"fuzzy "
"match\" looks for words or phrases that are mostly—but not "
"exactly—the same, which can be useful for matching against "
"documents containg imperfections that foil accurate OCR."
)
)
@ -104,6 +110,15 @@ class MatchingModel(models.Model):
return bool(re.search(
re.compile(self.match, **search_kwargs), text))
if self.matching_algorithm == self.MATCH_FUZZY:
match = re.sub(r'[^\w\s]', '', self.match)
text = re.sub(r'[^\w\s]', '', text)
if self.is_insensitive:
match = match.lower()
text = text.lower()
return True if fuzz.partial_ratio(match, text) >= 90 else False
raise NotImplementedError("Unsupported matching algorithm")
def save(self, *args, **kwargs):

View File

@ -149,6 +149,22 @@ class TestMatching(TestCase):
)
)
def test_match_fuzzy(self):
self._test_matching(
"Springfield, Miss.",
"MATCH_FUZZY",
(
"1220 Main Street, Springf eld, Miss.",
"1220 Main Street, Spring field, Miss.",
"1220 Main Street, Springfeld, Miss.",
"1220 Main Street Springfield Miss",
),
(
"1220 Main Street, Springfield, Mich.",
)
)
class TestApplications(TestCase):
"""