diff --git a/docs/guesswork.rst b/docs/guesswork.rst index 20407b265..59bdd0ad5 100644 --- a/docs/guesswork.rst +++ b/docs/guesswork.rst @@ -80,6 +80,12 @@ text and matching algorithm. From the help info there: uses a regex to match the PDF. If you don't know what a regex is, you probably don't want this option. +When using the "any" or "all" matching algorithms, you can search for terms that +consist of multiple words by enclosing them in double quotes. For example, defining +a match text of ``"Bank of America" BofA`` using the "any" algorithm will match +documents that contain either "Bank of America" or "BofA", but will not match +documents containing "Bank of South America". + Then just save your tag/correspondent and run another document through the consumer. Once complete, you should see the newly-created document, automatically tagged with the appropriate data. diff --git a/src/documents/models.py b/src/documents/models.py index 2fa2dca0b..8021e03c1 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -89,7 +89,7 @@ class MatchingModel(models.Model): search_kwargs = {"flags": re.IGNORECASE} if self.matching_algorithm == self.MATCH_ALL: - for word in self.match.split(" "): + for word in self._split_match(): search_result = re.search( r"\b{}\b".format(word), text, **search_kwargs) if not search_result: @@ -97,7 +97,7 @@ class MatchingModel(models.Model): return True if self.matching_algorithm == self.MATCH_ANY: - for word in self.match.split(" "): + for word in self._split_match(): if re.search(r"\b{}\b".format(word), text, **search_kwargs): return True return False @@ -121,6 +121,20 @@ class MatchingModel(models.Model): raise NotImplementedError("Unsupported matching algorithm") + def _split_match(self): + ''' + Splits the match to individual keywords, getting rid of unnecessary spaces + and grouping quoted words together. 
+ Example: + ' some random words "with quotes " and spaces' + ==> + ['some', 'random', 'words', 'with\s+quotes', 'and', 'spaces'] + ''' + findterms = re.compile(r'"([^"]+)"|(\S+)').findall + normspace = re.compile(r'\s+').sub + return [normspace(r'\s+', (t[0] or t[1]).strip()) + for t in findterms(self.match)] + def save(self, *args, **kwargs): self.match = self.match.lower() diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py index 80b5bdb54..6a84ab06d 100644 --- a/src/documents/tests/test_matchables.py +++ b/src/documents/tests/test_matchables.py @@ -16,9 +16,15 @@ class TestMatching(TestCase): matching_algorithm=getattr(klass, algorithm) ) for string in true: - self.assertTrue(instance.matches(string)) + self.assertTrue( + instance.matches(string), + '"%s" should match "%s" but it does not' % (text, string) + ) for string in false: - self.assertFalse(instance.matches(string)) + self.assertFalse( + instance.matches(string), + '"%s" should not match "%s" but it does' % (text, string) + ) def test_match_all(self): @@ -54,6 +60,21 @@ class TestMatching(TestCase): ) ) + self._test_matching( + 'brown fox "lazy dogs"', + "MATCH_ALL", + ( + "the quick brown fox jumped over the lazy dogs", + "the quick brown fox jumped over the lazy dogs", + ), + ( + "the quick fox jumped over the lazy dogs", + "the quick brown wolf jumped over the lazy dogs", + "the quick brown fox jumped over the fat dogs", + "the quick brown fox jumped over the lazy... dogs", + ) + ) + def test_match_any(self): self._test_matching( @@ -89,6 +110,18 @@ class TestMatching(TestCase): ) ) + self._test_matching( + '"brown fox" " lazy dogs "', + "MATCH_ANY", + ( + "the quick brown fox", + "jumped over the lazy dogs.", + ), + ( + "the lazy fox jumped over the brown dogs", + ) + ) + def test_match_literal(self): self._test_matching(