Merge pull request #277 from ishirav/multi-word-match

Add multi-word match
2025-12-24 02:05:48 -06:00 · 2017-12-27 11:21:27 +01:00
parent af4623e605 d1c8241947
commit 06117929bb
3 changed files with 57 additions and 4 deletions
--- a/docs/guesswork.rst
+++ b/docs/guesswork.rst
@@ -80,6 +80,12 @@ text and matching algorithm.  From the help info there:
    uses a regex to match the PDF.  If you don't know what a regex is, you
    probably don't want this option.
 When using the "any" or "all" matching algorithms, you can search for terms that
 consist of multiple words by enclosing them in double quotes. For example, defining
 a match text of ``"Bank of America" BofA`` using the "any" algorithm will match
 documents that contain either "Bank of America" or "BofA", but will not match
 documents containing "Bank of South America".
 Then just save your tag/correspondent and run another document through the
 consumer.  Once complete, you should see the newly-created document,
 automatically tagged with the appropriate data.
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -89,7 +89,7 @@ class MatchingModel(models.Model):
            search_kwargs = {"flags": re.IGNORECASE}
        if self.matching_algorithm == self.MATCH_ALL:
-            for word in self.match.split(" "):
+            for word in self._split_match():
                search_result = re.search(
                    r"\b{}\b".format(word), text, **search_kwargs)
                if not search_result:
@@ -97,7 +97,7 @@ class MatchingModel(models.Model):
            return True
        if self.matching_algorithm == self.MATCH_ANY:
-            for word in self.match.split(" "):
+            for word in self._split_match():
                if re.search(r"\b{}\b".format(word), text, **search_kwargs):
                    return True
            return False
@@ -121,6 +121,20 @@ class MatchingModel(models.Model):
        raise NotImplementedError("Unsupported matching algorithm")
    def _split_match(self):
        r"""
        Split ``self.match`` into individual search terms.

        Double-quoted text is kept together as a single term; everything
        else is split on whitespace.  Each term is returned as a regular
        expression fragment, ready to be embedded in a larger pattern:

        * regex metacharacters typed by the user are escaped, so every term
          is matched literally;
        * any run of whitespace inside a quoted term becomes ``\s+``;
        * terms that contain nothing but whitespace (e.g. ``" "``) are
          dropped, since an empty fragment would match almost any text.

        Example:
        '  some random  words "with   quotes  " and   spaces'
            ==>
        ['some', 'random', 'words', 'with\s+quotes', 'and', 'spaces']

        Returns a list of str, empty when the match text has no terms.
        """
        terms = []
        # Each findall() hit is a (quoted, bare) tuple; exactly one of the
        # two groups is non-empty.
        for quoted, bare in re.findall(r'"([^"]+)"|(\S+)', self.match):
            words = (quoted or bare).split()
            if not words:
                continue
            # Join with str.join rather than re.sub: "\s+" is not a valid
            # *replacement* template and raises re.error on Python 3.7+.
            terms.append(r'\s+'.join(re.escape(word) for word in words))
        return terms
    def save(self, *args, **kwargs):
        self.match = self.match.lower()
--- a/src/documents/tests/test_matchables.py
+++ b/src/documents/tests/test_matchables.py
@@ -16,9 +16,15 @@ class TestMatching(TestCase):
                matching_algorithm=getattr(klass, algorithm)
            )
            for string in true:
-                self.assertTrue(instance.matches(string))
+                self.assertTrue(
                    instance.matches(string),
                    '"%s" should match "%s" but it does not' % (text, string)
                )
            for string in false:
-                self.assertFalse(instance.matches(string))
+                self.assertFalse(
                    instance.matches(string),
                    '"%s" should not match "%s" but it does' % (text, string)
                )
    def test_match_all(self):
@@ -54,6 +60,21 @@ class TestMatching(TestCase):
            )
        )
        self._test_matching(
            'brown fox "lazy dogs"',
            "MATCH_ALL",
            (
                "the quick brown fox jumped over the lazy dogs",
                "the quick brown fox jumped over the lazy  dogs",
            ),
            (
                "the quick fox jumped over the lazy dogs",
                "the quick brown wolf jumped over the lazy dogs",
                "the quick brown fox jumped over the fat dogs",
                "the quick brown fox jumped over the lazy... dogs",
            )
        )
    def test_match_any(self):
        self._test_matching(
@@ -89,6 +110,18 @@ class TestMatching(TestCase):
            )
        )
        self._test_matching(
            '"brown fox" " lazy  dogs "',
            "MATCH_ANY",
            (
                "the quick brown fox",
                "jumped over the lazy  dogs.",
            ),
            (
                "the lazy fox jumped over the brown dogs",
            )
        )
    def test_match_literal(self):
        self._test_matching(