mirror of https://github.com/paperless-ngx/paperless-ngx.git

fixes #161

@@ -1,3 +1,4 @@
+import logging
 import re

 from fuzzywuzzy import fuzz
@@ -5,49 +6,59 @@ from fuzzywuzzy import fuzz
 from documents.models import MatchingModel, Correspondent, DocumentType, Tag


-def match_correspondents(document_content, classifier):
+logger = logging.getLogger(__name__)
+
+
+def log_reason(matching_model, document, reason):
+    class_name = type(matching_model).__name__
+    logger.debug(
+        f"Assigning {class_name} {matching_model.name} to document "
+        f"{document} because {reason}")
+
+
+def match_correspondents(document, classifier):
     if classifier:
-        pred_id = classifier.predict_correspondent(document_content)
+        pred_id = classifier.predict_correspondent(document.content)
     else:
         pred_id = None

     correspondents = Correspondent.objects.all()

     return list(filter(
-        lambda o: matches(o, document_content) or o.pk == pred_id,
+        lambda o: matches(o, document) or o.pk == pred_id,
         correspondents))


-def match_document_types(document_content, classifier):
+def match_document_types(document, classifier):
     if classifier:
-        pred_id = classifier.predict_document_type(document_content)
+        pred_id = classifier.predict_document_type(document.content)
     else:
         pred_id = None

     document_types = DocumentType.objects.all()

     return list(filter(
-        lambda o: matches(o, document_content) or o.pk == pred_id,
+        lambda o: matches(o, document) or o.pk == pred_id,
         document_types))


-def match_tags(document_content, classifier):
+def match_tags(document, classifier):
     if classifier:
-        predicted_tag_ids = classifier.predict_tags(document_content)
+        predicted_tag_ids = classifier.predict_tags(document.content)
     else:
         predicted_tag_ids = []

     tags = Tag.objects.all()

     return list(filter(
-        lambda o: matches(o, document_content) or o.pk in predicted_tag_ids,
+        lambda o: matches(o, document) or o.pk in predicted_tag_ids,
         tags))


-def matches(matching_model, document_content):
+def matches(matching_model, document):
     search_kwargs = {}

-    document_content = document_content.lower()
+    document_content = document.content.lower()

     # Check that match is not empty
     if matching_model.match.strip() == "":
@@ -62,26 +73,54 @@ def matches(matching_model, document_content):
                 rf"\b{word}\b", document_content, **search_kwargs)
             if not search_result:
                 return False
+        log_reason(
+            matching_model, document,
+            f"it contains all of these words: {matching_model.match}"
+        )
         return True

     elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
         for word in _split_match(matching_model):
             if re.search(rf"\b{word}\b", document_content, **search_kwargs):
+                log_reason(
+                    matching_model, document,
+                    f"it contains this word: {word}"
+                )
                 return True
         return False

     elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
-        return bool(re.search(
+        result = bool(re.search(
             rf"\b{matching_model.match}\b",
             document_content,
             **search_kwargs
         ))
+        if result:
+            log_reason(
+                matching_model, document,
+                f"it contains this string: \"{matching_model.match}\""
+            )
+        return result

     elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
-        return bool(re.search(
-            re.compile(matching_model.match, **search_kwargs),
-            document_content
-        ))
+        try:
+            match = re.search(
+                re.compile(matching_model.match, **search_kwargs),
+                document_content
+            )
+        except re.error:
+            logger.error(
+                f"Error while processing regular expression "
+                f"{matching_model.match}"
+            )
+            return False
+        if match:
+            log_reason(
+                matching_model, document,
+                f"the string {match.group()} matches the regular expression "
+                f"{matching_model.match}"
+            )
+        return bool(match)

     elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
         match = re.sub(r'[^\w\s]', '', matching_model.match)
@@ -89,8 +128,16 @@ def matches(matching_model, document_content):
         if matching_model.is_insensitive:
             match = match.lower()
             text = text.lower()
-
-        return fuzz.partial_ratio(match, text) >= 90
+        if fuzz.partial_ratio(match, text) >= 90:
+            # TODO: make this better
+            log_reason(
+                matching_model, document,
+                f"parts of the document content somehow match the string "
+                f"{matching_model.match}"
+            )
+            return True
+        else:
+            return False

     elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
         # this is done elsewhere.

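The behavioral change in the matching module above is in the MATCH_REGEX branch: a user-supplied pattern that does not compile is now logged and treated as a non-match instead of raising re.error. Below is a minimal standalone sketch of that guard, using only the standard library; regex_matches is a hypothetical helper for illustration, not code from the repository.

import logging
import re

logger = logging.getLogger(__name__)


def regex_matches(pattern, text, insensitive=True):
    # Mirror of the try/except re.error guard added above: compile the
    # user-supplied pattern, and on failure log the problem and report
    # "no match" instead of letting the exception propagate.
    flags = re.IGNORECASE if insensitive else 0
    try:
        compiled = re.compile(pattern, flags)
    except re.error:
        logger.error(f"Error while processing regular expression {pattern}")
        return False
    return bool(compiled.search(text))


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    print(regex_matches(r"alpha\w+gamma", "I have alpha_and_gamma in me"))  # True
    print(regex_matches("[[", "Don't match this"))  # False, error is logged
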
@@ -38,7 +38,7 @@ def set_correspondent(sender,
     if document.correspondent and not replace:
         return

-    potential_correspondents = matching.match_correspondents(document.content,
+    potential_correspondents = matching.match_correspondents(document,
                                                              classifier)

     potential_count = len(potential_correspondents)
@@ -81,7 +81,7 @@ def set_document_type(sender,
     if document.document_type and not replace:
         return

-    potential_document_type = matching.match_document_types(document.content,
+    potential_document_type = matching.match_document_types(document,
                                                             classifier)

     potential_count = len(potential_document_type)
@@ -130,7 +130,7 @@ def set_tags(sender,

     current_tags = set(document.tags.all())

-    matched_tags = matching.match_tags(document.content, classifier)
+    matched_tags = matching.match_tags(document, classifier)

     relevant_tags = set(matched_tags) - current_tags

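The hunks above change only the call sites in the signal handlers (presumably documents/signals/handlers.py): the matchers now receive the Document instance itself, so log_reason can name the document, rather than receiving document.content. A hedged usage sketch, assuming a configured paperless/Django environment; the variable names are illustrative only:

from documents import matching
from documents.models import Document

# After this commit the matchers take the Document object, not its text;
# passing None as the classifier skips the machine-learning suggestions
# and leaves only the rule-based matching shown above.
doc = Document.objects.first()
print([c.name for c in matching.match_correspondents(doc, None)])
print([t.name for t in matching.match_tags(doc, None)])
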
@@ -21,13 +21,15 @@ class TestMatching(TestCase):
                 matching_algorithm=getattr(klass, algorithm)
             )
             for string in true:
+                doc = Document(content=string)
                 self.assertTrue(
-                    matching.matches(instance, string),
+                    matching.matches(instance, doc),
                     '"%s" should match "%s" but it does not' % (text, string)
                 )
             for string in false:
+                doc = Document(content=string)
                 self.assertFalse(
-                    matching.matches(instance, string),
+                    matching.matches(instance, doc),
                     '"%s" should not match "%s" but it does' % (text, string)
                 )

@@ -169,7 +171,7 @@ class TestMatching(TestCase):
     def test_match_regex(self):

         self._test_matching(
-            r"alpha\w+gamma",
+            "alpha\w+gamma",
             "MATCH_REGEX",
             (
                 "I have alpha_and_gamma in me",
@@ -187,6 +189,16 @@ class TestMatching(TestCase):
             )
         )

+    def test_tach_invalid_regex(self):
+        self._test_matching(
+            "[[",
+            "MATCH_REGEX",
+            [],
+            [
+                "Don't match this"
+            ]
+        )
+
     def test_match_fuzzy(self):

         self._test_matching(

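The new test feeds the invalid pattern "[[" through the existing _test_matching helper with an empty list of expected matches. For readers without that helper at hand, here is a hedged sketch of the same assertion written directly against matching.matches(); the test class name is illustrative and the Tag fields mirror the MatchingModel fields used above:

from django.test import TestCase

from documents import matching
from documents.models import Document, MatchingModel, Tag


class TestInvalidRegexSketch(TestCase):

    def test_invalid_regex_is_treated_as_no_match(self):
        tag = Tag.objects.create(
            name="broken regex",
            match="[[",
            matching_algorithm=MatchingModel.MATCH_REGEX,
        )
        doc = Document(content="Don't match this")
        # Before this commit, matches() raised re.error on the invalid
        # pattern; now it logs the problem and simply reports no match.
        self.assertFalse(matching.matches(tag, doc))
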
Author: jonaswinkler