From 6ce27d225d8ab2b344d5b6df69d7a84b07707310 Mon Sep 17 00:00:00 2001 From: jgysland Date: Sat, 29 Apr 2017 17:13:04 -0400 Subject: [PATCH 1/2] add fuzzy matching + tests --- requirements.txt | 1 + src/documents/models.py | 21 +++++++++++++++++++-- src/documents/tests/test_matchables.py | 16 ++++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 23d8f0b63..5f261fc10 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ django-filter>=1.0 django-flat-responsive>=1.2.0 djangorestframework>=3.5.3 filemagic>=1.6 +fuzzywuzzy[speedup]==0.15.0 langdetect>=1.0.7 pyocr>=0.4.6 python-dateutil>=2.6.0 diff --git a/src/documents/models.py b/src/documents/models.py index 0ee896ec3..707b0fcca 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -5,6 +5,7 @@ import re import uuid from collections import OrderedDict +from fuzzywuzzy import fuzz from django.conf import settings from django.core.urlresolvers import reverse @@ -21,11 +22,13 @@ class MatchingModel(models.Model): MATCH_ALL = 2 MATCH_LITERAL = 3 MATCH_REGEX = 4 + MATCH_FUZZY = 5 MATCHING_ALGORITHMS = ( (MATCH_ANY, "Any"), (MATCH_ALL, "All"), (MATCH_LITERAL, "Literal"), (MATCH_REGEX, "Regular Expression"), + (MATCH_FUZZY, "Fuzzy Match"), ) name = models.CharField(max_length=128, unique=True) @@ -42,8 +45,13 @@ class MatchingModel(models.Model): "provided appear in the PDF, albeit not in the order provided. A " "\"literal\" match means that the text you enter must appear in " "the PDF exactly as you've entered it, and \"regular expression\" " - "uses a regex to match the PDF. If you don't know what a regex " - "is, you probably don't want this option." + "uses a regex to match the PDF. (If you don't know what a regex " + "is, you probably don't want this option.) 
Finally, a \"fuzzy " + "match\" strips all punctuation from both the match candidate " + "and the OCR'd text and looks for a Levenshtein \"partial ratio\" " + "(as implemented in the Python package \"FuzzyWuzzy\") of >= 90, " + "which can be useful for matching against documents with " + "imperfections that foil accurate OCR." ) ) @@ -104,6 +112,15 @@ class MatchingModel(models.Model): return bool(re.search( re.compile(self.match, **search_kwargs), text)) + if self.matching_algorithm == self.MATCH_FUZZY: + match = re.sub(r'[^\w\s]', '', self.match) + text = re.sub(r'[^\w\s]', '', text) + if self.is_insensitive: + match = match.lower() + text = text.lower() + + return True if fuzz.partial_ratio(match, text) >= 90 else False + raise NotImplementedError("Unsupported matching algorithm") def save(self, *args, **kwargs): diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py index bcd377cf0..80b5bdb54 100644 --- a/src/documents/tests/test_matchables.py +++ b/src/documents/tests/test_matchables.py @@ -149,6 +149,22 @@ class TestMatching(TestCase): ) ) + def test_match_fuzzy(self): + + self._test_matching( + "Springfield, Miss.", + "MATCH_FUZZY", + ( + "1220 Main Street, Springf eld, Miss.", + "1220 Main Street, Spring field, Miss.", + "1220 Main Street, Springfeld, Miss.", + "1220 Main Street Springfield Miss", + ), + ( + "1220 Main Street, Springfield, Mich.", + ) + ) + class TestApplications(TestCase): """ From a7fa82a83f6c49170317922c92bbf28ee624e80d Mon Sep 17 00:00:00 2001 From: jgysland Date: Sun, 30 Apr 2017 16:56:50 -0400 Subject: [PATCH 2/2] KISS fuzzy match help text --- src/documents/models.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index 707b0fcca..54f45ae01 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -47,11 +47,9 @@ class MatchingModel(models.Model): "the PDF exactly as you've entered it, and \"regular 
expression\" " "uses a regex to match the PDF. (If you don't know what a regex " "is, you probably don't want this option.) Finally, a \"fuzzy " - "match\" strips all punctuation from both the match candidate " - "and the OCR'd text and looks for a Levenshtein \"partial ratio\" " - "(as implemented in the Python package \"FuzzyWuzzy\") of >= 90, " - "which can be useful for matching against documents with " - "imperfections that foil accurate OCR." + "match\" looks for words or phrases that are mostly—but not " + "exactly—the same, which can be useful for matching against " + "documents containing imperfections that foil accurate OCR." ) )