From 6a36a4ec972bf7750f1fb0f62eeffbc33068cf48 Mon Sep 17 00:00:00 2001 From: ishirav Date: Sat, 23 Dec 2017 06:05:48 +0200 Subject: [PATCH 1/5] Support search terms that contain multiple words in ANY/ALL matching modes, by surrounding the terms with double quotes. --- src/documents/models.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index 2fa2dca0b..a9f17c2a2 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -89,7 +89,7 @@ class MatchingModel(models.Model): search_kwargs = {"flags": re.IGNORECASE} if self.matching_algorithm == self.MATCH_ALL: - for word in self.match.split(" "): + for word in self._split_match(): search_result = re.search( r"\b{}\b".format(word), text, **search_kwargs) if not search_result: @@ -97,7 +97,7 @@ class MatchingModel(models.Model): return True if self.matching_algorithm == self.MATCH_ANY: - for word in self.match.split(" "): + for word in self._split_match(): if re.search(r"\b{}\b".format(word), text, **search_kwargs): return True return False @@ -121,6 +121,19 @@ class MatchingModel(models.Model): raise NotImplementedError("Unsupported matching algorithm") + def _split_match(self): + ''' + Splits the match into individual keywords, getting rid of unnecessary spaces + and grouping quoted words together. 
+ Example: + ' some random words "with quotes " and spaces' + ==> + ['some', 'random', 'words', 'with quotes', 'and', 'spaces'] + ''' + findterms = re.compile(r'"([^"]+)"|(\S+)').findall + normspace = re.compile(r'\s{2,}').sub + return [normspace(' ', (t[0] or t[1]).strip()) for t in findterms(self.match)] + def save(self, *args, **kwargs): self.match = self.match.lower() From 83746a9aeb52bdc1dc1022d025c95030b54235a1 Mon Sep 17 00:00:00 2001 From: ishirav Date: Sat, 23 Dec 2017 06:37:00 +0200 Subject: [PATCH 2/5] Add tests and improve whitespace handling --- src/documents/models.py | 6 ++--- src/documents/tests/test_matchables.py | 31 ++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index a9f17c2a2..ee6ee8d31 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -128,11 +128,11 @@ class MatchingModel(models.Model): Example: ' some random words "with quotes " and spaces' ==> - ['some', 'random', 'words', 'with quotes', 'and', 'spaces'] + ['some', 'random', 'words', 'with\s+quotes', 'and', 'spaces'] ''' findterms = re.compile(r'"([^"]+)"|(\S+)').findall - normspace = re.compile(r'\s{2,}').sub - return [normspace(' ', (t[0] or t[1]).strip()) for t in findterms(self.match)] + normspace = re.compile(r'\s+').sub + return [normspace(r'\s+', (t[0] or t[1]).strip()) for t in findterms(self.match)] def save(self, *args, **kwargs): diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py index 80b5bdb54..6e64fc5bf 100644 --- a/src/documents/tests/test_matchables.py +++ b/src/documents/tests/test_matchables.py @@ -16,9 +16,9 @@ class TestMatching(TestCase): matching_algorithm=getattr(klass, algorithm) ) for string in true: - self.assertTrue(instance.matches(string)) + self.assertTrue(instance.matches(string), '"%s" should match "%s" but it does not' % (text, string)) for string in false: - self.assertFalse(instance.matches(string)) + 
+ self.assertFalse(instance.matches(string), '"%s" should not match "%s" but it does' % (text, string)) def test_match_all(self): @@ -54,6 +54,21 @@ class TestMatching(TestCase): ) ) + self._test_matching( + 'brown fox "lazy dogs"', + "MATCH_ALL", + ( + "the quick brown fox jumped over the lazy dogs", + "the quick brown fox jumped over the lazy dogs", + ), + ( + "the quick fox jumped over the lazy dogs", + "the quick brown wolf jumped over the lazy dogs", + "the quick brown fox jumped over the fat dogs", + "the quick brown fox jumped over the lazy... dogs", + ) + ) + def test_match_any(self): self._test_matching( @@ -89,6 +104,18 @@ class TestMatching(TestCase): ) ) + self._test_matching( + '"brown fox" " lazy dogs "', + "MATCH_ANY", + ( + "the quick brown fox", + "jumped over the lazy dogs.", + ), + ( + "the lazy fox jumped over the brown dogs", + ) + ) + def test_match_literal(self): self._test_matching( From ad0f0a0b5d505283ca5c6f548e455eddf77e4228 Mon Sep 17 00:00:00 2001 From: ishirav Date: Sat, 23 Dec 2017 06:44:06 +0200 Subject: [PATCH 3/5] Add documentation about multi-word search terms --- docs/guesswork.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/guesswork.rst b/docs/guesswork.rst index 20407b265..59bdd0ad5 100644 --- a/docs/guesswork.rst +++ b/docs/guesswork.rst @@ -80,6 +80,12 @@ text and matching algorithm. From the help info there: uses a regex to match the PDF. If you don't know what a regex is, you probably don't want this option. +When using the "any" or "all" matching algorithms, you can search for terms that +consist of multiple words by enclosing them in double quotes. For example, defining +a match text of ``"Bank of America" BofA`` using the "any" algorithm will match +documents that contain either "Bank of America" or "BofA", but will not match +documents containing "Bank of South America". + Then just save your tag/correspondent and run another document through the consumer. 
Once complete, you should see the newly-created document, automatically tagged with the appropriate data. From 4c38b28469c8f6185459dc5c588464e877b7c77a Mon Sep 17 00:00:00 2001 From: ishirav Date: Sat, 23 Dec 2017 06:59:48 +0200 Subject: [PATCH 4/5] break long lines (pep8) --- src/documents/models.py | 3 ++- src/documents/tests/test_matchables.py | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/documents/models.py b/src/documents/models.py index ee6ee8d31..8021e03c1 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -132,7 +132,8 @@ class MatchingModel(models.Model): ''' findterms = re.compile(r'"([^"]+)"|(\S+)').findall normspace = re.compile(r'\s+').sub - return [normspace(r'\s+', (t[0] or t[1]).strip()) for t in findterms(self.match)] + return [normspace(r'\s+', (t[0] or t[1]).strip()) + for t in findterms(self.match)] def save(self, *args, **kwargs): diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py index 6e64fc5bf..b1b8eb46f 100644 --- a/src/documents/tests/test_matchables.py +++ b/src/documents/tests/test_matchables.py @@ -16,9 +16,11 @@ class TestMatching(TestCase): matching_algorithm=getattr(klass, algorithm) ) for string in true: - self.assertTrue(instance.matches(string), '"%s" should match "%s" but it does not' % (text, string)) + self.assertTrue(instance.matches(string), + '"%s" should match "%s" but it does not' % (text, string)) for string in false: - self.assertFalse(instance.matches(string), '"%s" should not match "%s" but it does' % (text, string)) + self.assertFalse(instance.matches(string), + '"%s" should not match "%s" but it does' % (text, string)) def test_match_all(self): From d1c824194743d67b92296dd483e1ff302b5fded3 Mon Sep 17 00:00:00 2001 From: ishirav Date: Sat, 23 Dec 2017 07:39:40 +0200 Subject: [PATCH 5/5] break long lines (pep8) --- src/documents/tests/test_matchables.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git 
a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py index b1b8eb46f..6a84ab06d 100644 --- a/src/documents/tests/test_matchables.py +++ b/src/documents/tests/test_matchables.py @@ -16,11 +16,15 @@ class TestMatching(TestCase): matching_algorithm=getattr(klass, algorithm) ) for string in true: - self.assertTrue(instance.matches(string), - '"%s" should match "%s" but it does not' % (text, string)) + self.assertTrue( + instance.matches(string), + '"%s" should match "%s" but it does not' % (text, string) + ) for string in false: - self.assertFalse(instance.matches(string), - '"%s" should not match "%s" but it does' % (text, string)) + self.assertFalse( + instance.matches(string), + '"%s" should not match "%s" but it does' % (text, string) + ) def test_match_all(self):