Merge pull request #277 from ishirav/multi-word-match

Add multi-word match
This commit is contained in:
Daniel Quinn 2017-12-27 11:21:27 +01:00 committed by GitHub
commit 06117929bb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 57 additions and 4 deletions

View File

@ -80,6 +80,12 @@ text and matching algorithm. From the help info there:
uses a regex to match the PDF. If you don't know what a regex is, you
probably don't want this option.
When using the "any" or "all" matching algorithms, you can search for terms that
consist of multiple words by enclosing them in double quotes. For example, defining
a match text of ``"Bank of America" BofA`` using the "any" algorithm will match
documents that contain either "Bank of America" or "BofA", but will not match
documents containing "Bank of South America".
Then just save your tag/correspondent and run another document through the
consumer. Once complete, you should see the newly-created document,
automatically tagged with the appropriate data.

View File

@ -89,7 +89,7 @@ class MatchingModel(models.Model):
search_kwargs = {"flags": re.IGNORECASE}
if self.matching_algorithm == self.MATCH_ALL:
for word in self.match.split(" "):
for word in self._split_match():
search_result = re.search(
r"\b{}\b".format(word), text, **search_kwargs)
if not search_result:
@ -97,7 +97,7 @@ class MatchingModel(models.Model):
return True
if self.matching_algorithm == self.MATCH_ANY:
for word in self.match.split(" "):
for word in self._split_match():
if re.search(r"\b{}\b".format(word), text, **search_kwargs):
return True
return False
@ -121,6 +121,20 @@ class MatchingModel(models.Model):
raise NotImplementedError("Unsupported matching algorithm")
def _split_match(self):
    r'''
    Split the match text into individual keywords, getting rid of
    unnecessary spaces and grouping quoted words together.

    Each returned keyword is a regex fragment: inside a double-quoted
    phrase, every run of whitespace is replaced by the pattern ``\s+`` so
    the phrase matches however its words are spaced in the searched text.
    Leading and trailing whitespace inside the quotes is dropped. Keywords
    are not otherwise escaped.

    Example:
      '  some random  words "with   quotes  " and   spaces'
        ==>
      ['some', 'random', 'words', 'with\s+quotes', 'and', 'spaces']
    '''
    # Group 1 captures the content of a quoted phrase, group 2 a bare word;
    # exactly one of them is non-empty for every hit.
    findterms = re.compile(r'"([^"]+)"|(\S+)').findall
    normspace = re.compile(r'\s+').sub
    # The backslash must be doubled in the replacement template: a bare
    # ``\s`` is an unknown template escape, which re.sub rejects with
    # "bad escape \s" on Python 3.7+. ``\\s+`` yields the literal text \s+.
    return [normspace(r'\\s+', (t[0] or t[1]).strip())
            for t in findterms(self.match)]
def save(self, *args, **kwargs):
self.match = self.match.lower()

View File

@ -16,9 +16,15 @@ class TestMatching(TestCase):
matching_algorithm=getattr(klass, algorithm)
)
for string in true:
self.assertTrue(instance.matches(string))
self.assertTrue(
instance.matches(string),
'"%s" should match "%s" but it does not' % (text, string)
)
for string in false:
self.assertFalse(instance.matches(string))
self.assertFalse(
instance.matches(string),
'"%s" should not match "%s" but it does' % (text, string)
)
def test_match_all(self):
@ -54,6 +60,21 @@ class TestMatching(TestCase):
)
)
self._test_matching(
'brown fox "lazy dogs"',
"MATCH_ALL",
(
"the quick brown fox jumped over the lazy dogs",
"the quick brown fox jumped over the lazy dogs",
),
(
"the quick fox jumped over the lazy dogs",
"the quick brown wolf jumped over the lazy dogs",
"the quick brown fox jumped over the fat dogs",
"the quick brown fox jumped over the lazy... dogs",
)
)
def test_match_any(self):
self._test_matching(
@ -89,6 +110,18 @@ class TestMatching(TestCase):
)
)
self._test_matching(
'"brown fox" " lazy dogs "',
"MATCH_ANY",
(
"the quick brown fox",
"jumped over the lazy dogs.",
),
(
"the lazy fox jumped over the brown dogs",
)
)
def test_match_literal(self):
self._test_matching(