mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
add fuzzy matching + tests
This commit is contained in:
parent
819a0e1f57
commit
6ce27d225d
@ -6,6 +6,7 @@ django-filter>=1.0
|
|||||||
django-flat-responsive>=1.2.0
|
django-flat-responsive>=1.2.0
|
||||||
djangorestframework>=3.5.3
|
djangorestframework>=3.5.3
|
||||||
filemagic>=1.6
|
filemagic>=1.6
|
||||||
|
fuzzywuzzy[speedup]==0.15.0
|
||||||
langdetect>=1.0.7
|
langdetect>=1.0.7
|
||||||
pyocr>=0.4.6
|
pyocr>=0.4.6
|
||||||
python-dateutil>=2.6.0
|
python-dateutil>=2.6.0
|
||||||
|
@ -5,6 +5,7 @@ import re
|
|||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
from fuzzywuzzy import fuzz
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.urlresolvers import reverse
|
from django.core.urlresolvers import reverse
|
||||||
@ -21,11 +22,13 @@ class MatchingModel(models.Model):
|
|||||||
MATCH_ALL = 2
|
MATCH_ALL = 2
|
||||||
MATCH_LITERAL = 3
|
MATCH_LITERAL = 3
|
||||||
MATCH_REGEX = 4
|
MATCH_REGEX = 4
|
||||||
|
MATCH_FUZZY = 5
|
||||||
MATCHING_ALGORITHMS = (
|
MATCHING_ALGORITHMS = (
|
||||||
(MATCH_ANY, "Any"),
|
(MATCH_ANY, "Any"),
|
||||||
(MATCH_ALL, "All"),
|
(MATCH_ALL, "All"),
|
||||||
(MATCH_LITERAL, "Literal"),
|
(MATCH_LITERAL, "Literal"),
|
||||||
(MATCH_REGEX, "Regular Expression"),
|
(MATCH_REGEX, "Regular Expression"),
|
||||||
|
(MATCH_FUZZY, "Fuzzy Match"),
|
||||||
)
|
)
|
||||||
|
|
||||||
name = models.CharField(max_length=128, unique=True)
|
name = models.CharField(max_length=128, unique=True)
|
||||||
@ -42,8 +45,13 @@ class MatchingModel(models.Model):
|
|||||||
"provided appear in the PDF, albeit not in the order provided. A "
|
"provided appear in the PDF, albeit not in the order provided. A "
|
||||||
"\"literal\" match means that the text you enter must appear in "
|
"\"literal\" match means that the text you enter must appear in "
|
||||||
"the PDF exactly as you've entered it, and \"regular expression\" "
|
"the PDF exactly as you've entered it, and \"regular expression\" "
|
||||||
"uses a regex to match the PDF. If you don't know what a regex "
|
"uses a regex to match the PDF. (If you don't know what a regex "
|
||||||
"is, you probably don't want this option."
|
"is, you probably don't want this option.) Finally, a \"fuzzy "
|
||||||
|
"match\" strips all punctuation from both the match candidate "
|
||||||
|
"and the OCR'd text and looks for a Levenshtein \"partial ratio\" "
|
||||||
|
"(as implemented in the Python package \"FuzzyWuzzy\") of >= 90, "
|
||||||
|
"which can be useful for matching against documents with "
|
||||||
|
"imperfections that foil accurate OCR."
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -104,6 +112,15 @@ class MatchingModel(models.Model):
|
|||||||
return bool(re.search(
|
return bool(re.search(
|
||||||
re.compile(self.match, **search_kwargs), text))
|
re.compile(self.match, **search_kwargs), text))
|
||||||
|
|
||||||
|
if self.matching_algorithm == self.MATCH_FUZZY:
|
||||||
|
match = re.sub(r'[^\w\s]', '', self.match)
|
||||||
|
text = re.sub(r'[^\w\s]', '', text)
|
||||||
|
if self.is_insensitive:
|
||||||
|
match = match.lower()
|
||||||
|
text = text.lower()
|
||||||
|
|
||||||
|
return True if fuzz.partial_ratio(match, text) >= 90 else False
|
||||||
|
|
||||||
raise NotImplementedError("Unsupported matching algorithm")
|
raise NotImplementedError("Unsupported matching algorithm")
|
||||||
|
|
||||||
def save(self, *args, **kwargs):
|
def save(self, *args, **kwargs):
|
||||||
|
@ -149,6 +149,22 @@ class TestMatching(TestCase):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_match_fuzzy(self):
    """Run the MATCH_FUZZY algorithm with the pattern 'Springfield, Miss.'."""
    # Address variants with a dropped letter, a stray space, or no
    # punctuation at all.
    # NOTE(review): presumably _test_matching treats the first tuple as
    # texts that must match and the second as texts that must not --
    # confirm against the helper.
    garbled_variants = (
        "1220 Main Street, Springf eld, Miss.",
        "1220 Main Street, Spring field, Miss.",
        "1220 Main Street, Springfeld, Miss.",
        "1220 Main Street Springfield Miss",
    )
    other_state = (
        "1220 Main Street, Springfield, Mich.",
    )

    self._test_matching(
        "Springfield, Miss.",
        "MATCH_FUZZY",
        garbled_variants,
        other_state,
    )
|
||||||
|
|
||||||
|
|
||||||
class TestApplications(TestCase):
|
class TestApplications(TestCase):
|
||||||
"""
|
"""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user