mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #277 from ishirav/multi-word-match
Add multi-word match
This commit is contained in:
commit
06117929bb
@ -80,6 +80,12 @@ text and matching algorithm. From the help info there:
|
|||||||
uses a regex to match the PDF. If you don't know what a regex is, you
|
uses a regex to match the PDF. If you don't know what a regex is, you
|
||||||
probably don't want this option.
|
probably don't want this option.
|
||||||
|
|
||||||
|
When using the "any" or "all" matching algorithms, you can search for terms that
|
||||||
|
consist of multiple words by enclosing them in double quotes. For example, defining
|
||||||
|
a match text of ``"Bank of America" BofA`` using the "any" algorithm will match
|
||||||
|
documents that contain either "Bank of America" or "BofA", but will not match
|
||||||
|
documents containing "Bank of South America".
|
||||||
|
|
||||||
Then just save your tag/correspondent and run another document through the
|
Then just save your tag/correspondent and run another document through the
|
||||||
consumer. Once complete, you should see the newly-created document,
|
consumer. Once complete, you should see the newly-created document,
|
||||||
automatically tagged with the appropriate data.
|
automatically tagged with the appropriate data.
|
||||||
|
@ -89,7 +89,7 @@ class MatchingModel(models.Model):
|
|||||||
search_kwargs = {"flags": re.IGNORECASE}
|
search_kwargs = {"flags": re.IGNORECASE}
|
||||||
|
|
||||||
if self.matching_algorithm == self.MATCH_ALL:
|
if self.matching_algorithm == self.MATCH_ALL:
|
||||||
for word in self.match.split(" "):
|
for word in self._split_match():
|
||||||
search_result = re.search(
|
search_result = re.search(
|
||||||
r"\b{}\b".format(word), text, **search_kwargs)
|
r"\b{}\b".format(word), text, **search_kwargs)
|
||||||
if not search_result:
|
if not search_result:
|
||||||
@ -97,7 +97,7 @@ class MatchingModel(models.Model):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
if self.matching_algorithm == self.MATCH_ANY:
|
if self.matching_algorithm == self.MATCH_ANY:
|
||||||
for word in self.match.split(" "):
|
for word in self._split_match():
|
||||||
if re.search(r"\b{}\b".format(word), text, **search_kwargs):
|
if re.search(r"\b{}\b".format(word), text, **search_kwargs):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
@ -121,6 +121,20 @@ class MatchingModel(models.Model):
|
|||||||
|
|
||||||
raise NotImplementedError("Unsupported matching algorithm")
|
raise NotImplementedError("Unsupported matching algorithm")
|
||||||
|
|
||||||
|
def _split_match(self):
|
||||||
|
'''
|
||||||
|
Splits the match to invidual keywords, getting rid of unecessary spaces
|
||||||
|
and grouping quoted words together.
|
||||||
|
Example:
|
||||||
|
' some random words "with quotes " and spaces'
|
||||||
|
==>
|
||||||
|
['some', 'random', 'words', 'with\s+quotes', 'and', 'spaces']
|
||||||
|
'''
|
||||||
|
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
|
||||||
|
normspace = re.compile(r'\s+').sub
|
||||||
|
return [normspace(r'\s+', (t[0] or t[1]).strip())
|
||||||
|
for t in findterms(self.match)]
|
||||||
|
|
||||||
def save(self, *args, **kwargs):
|
def save(self, *args, **kwargs):
|
||||||
|
|
||||||
self.match = self.match.lower()
|
self.match = self.match.lower()
|
||||||
|
@ -16,9 +16,15 @@ class TestMatching(TestCase):
|
|||||||
matching_algorithm=getattr(klass, algorithm)
|
matching_algorithm=getattr(klass, algorithm)
|
||||||
)
|
)
|
||||||
for string in true:
|
for string in true:
|
||||||
self.assertTrue(instance.matches(string))
|
self.assertTrue(
|
||||||
|
instance.matches(string),
|
||||||
|
'"%s" should match "%s" but it does not' % (text, string)
|
||||||
|
)
|
||||||
for string in false:
|
for string in false:
|
||||||
self.assertFalse(instance.matches(string))
|
self.assertFalse(
|
||||||
|
instance.matches(string),
|
||||||
|
'"%s" should not match "%s" but it does' % (text, string)
|
||||||
|
)
|
||||||
|
|
||||||
def test_match_all(self):
|
def test_match_all(self):
|
||||||
|
|
||||||
@ -54,6 +60,21 @@ class TestMatching(TestCase):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self._test_matching(
|
||||||
|
'brown fox "lazy dogs"',
|
||||||
|
"MATCH_ALL",
|
||||||
|
(
|
||||||
|
"the quick brown fox jumped over the lazy dogs",
|
||||||
|
"the quick brown fox jumped over the lazy dogs",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"the quick fox jumped over the lazy dogs",
|
||||||
|
"the quick brown wolf jumped over the lazy dogs",
|
||||||
|
"the quick brown fox jumped over the fat dogs",
|
||||||
|
"the quick brown fox jumped over the lazy... dogs",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
def test_match_any(self):
|
def test_match_any(self):
|
||||||
|
|
||||||
self._test_matching(
|
self._test_matching(
|
||||||
@ -89,6 +110,18 @@ class TestMatching(TestCase):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self._test_matching(
|
||||||
|
'"brown fox" " lazy dogs "',
|
||||||
|
"MATCH_ANY",
|
||||||
|
(
|
||||||
|
"the quick brown fox",
|
||||||
|
"jumped over the lazy dogs.",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"the lazy fox jumped over the brown dogs",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
def test_match_literal(self):
|
def test_match_literal(self):
|
||||||
|
|
||||||
self._test_matching(
|
self._test_matching(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user