mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	add fuzzy matching + tests
This commit is contained in:
		| @@ -5,6 +5,7 @@ import re | ||||
| import uuid | ||||
|  | ||||
| from collections import OrderedDict | ||||
| from fuzzywuzzy import fuzz | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.core.urlresolvers import reverse | ||||
| @@ -21,11 +22,13 @@ class MatchingModel(models.Model): | ||||
|     MATCH_ALL = 2 | ||||
|     MATCH_LITERAL = 3 | ||||
|     MATCH_REGEX = 4 | ||||
|     MATCH_FUZZY = 5 | ||||
|     MATCHING_ALGORITHMS = ( | ||||
|         (MATCH_ANY, "Any"), | ||||
|         (MATCH_ALL, "All"), | ||||
|         (MATCH_LITERAL, "Literal"), | ||||
|         (MATCH_REGEX, "Regular Expression"), | ||||
|         (MATCH_FUZZY, "Fuzzy Match"), | ||||
|     ) | ||||
|  | ||||
|     name = models.CharField(max_length=128, unique=True) | ||||
| @@ -42,8 +45,13 @@ class MatchingModel(models.Model): | ||||
|             "provided appear in the PDF, albeit not in the order provided.  A " | ||||
|             "\"literal\" match means that the text you enter must appear in " | ||||
|             "the PDF exactly as you've entered it, and \"regular expression\" " | ||||
|             "uses a regex to match the PDF.  If you don't know what a regex " | ||||
|             "is, you probably don't want this option." | ||||
|             "uses a regex to match the PDF.  (If you don't know what a regex " | ||||
|             "is, you probably don't want this option.)  Finally, a \"fuzzy " | ||||
|             "match\" strips all punctuation from both the match candidate " | ||||
|             "and the OCR'd text and looks for a Levenshtein \"partial ratio\" " | ||||
|             "(as implemented in the Python package \"FuzzyWuzzy\") of >= 90, " | ||||
|             "which can be useful for matching against documents with " | ||||
|             "imperfections that foil accurate OCR." | ||||
|         ) | ||||
|     ) | ||||
|  | ||||
| @@ -104,6 +112,15 @@ class MatchingModel(models.Model): | ||||
|             return bool(re.search( | ||||
|                 re.compile(self.match, **search_kwargs), text)) | ||||
|  | ||||
|         if self.matching_algorithm == self.MATCH_FUZZY: | ||||
|             match = re.sub(r'[^\w\s]', '', self.match) | ||||
|             text = re.sub(r'[^\w\s]', '', text) | ||||
|             if self.is_insensitive: | ||||
|                 match = match.lower() | ||||
|                 text = text.lower() | ||||
|  | ||||
|             return True if fuzz.partial_ratio(match, text) >= 90 else False | ||||
|  | ||||
|         raise NotImplementedError("Unsupported matching algorithm") | ||||
|  | ||||
|     def save(self, *args, **kwargs): | ||||
|   | ||||
| @@ -149,6 +149,22 @@ class TestMatching(TestCase): | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     def test_match_fuzzy(self): | ||||
|  | ||||
|         self._test_matching( | ||||
|             "Springfield, Miss.", | ||||
|             "MATCH_FUZZY", | ||||
|             ( | ||||
|                 "1220 Main Street, Springf eld, Miss.", | ||||
|                 "1220 Main Street, Spring field, Miss.", | ||||
|                 "1220 Main Street, Springfeld, Miss.", | ||||
|                 "1220 Main Street Springfield Miss", | ||||
|             ), | ||||
|             ( | ||||
|                 "1220 Main Street, Springfield, Mich.", | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class TestApplications(TestCase): | ||||
|     """ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 jgysland
					jgysland