mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
fixes #161
This commit is contained in:
parent
ceb9426fd4
commit
d093c004fb
@ -1,3 +1,4 @@
|
|||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from fuzzywuzzy import fuzz
|
from fuzzywuzzy import fuzz
|
||||||
@ -5,49 +6,59 @@ from fuzzywuzzy import fuzz
|
|||||||
from documents.models import MatchingModel, Correspondent, DocumentType, Tag
|
from documents.models import MatchingModel, Correspondent, DocumentType, Tag
|
||||||
|
|
||||||
|
|
||||||
def match_correspondents(document_content, classifier):
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def log_reason(matching_model, document, reason):
    """Emit a debug record explaining why a matching model was assigned.

    Args:
        matching_model: The object being assigned; its class name and its
            ``name`` attribute are included in the message.
        document: The document the assignment applies to; rendered with
            its string representation.
        reason: Human-readable explanation placed after "because".
    """
    # Hand the values to the logger as arguments instead of pre-formatting
    # them: the message text is unchanged, but it (and str(document)) is
    # only built when a DEBUG record is actually emitted.
    logger.debug(
        "Assigning %s %s to document %s because %s",
        type(matching_model).__name__,
        matching_model.name,
        document,
        reason,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def match_correspondents(document, classifier):
|
||||||
if classifier:
|
if classifier:
|
||||||
pred_id = classifier.predict_correspondent(document_content)
|
pred_id = classifier.predict_correspondent(document.content)
|
||||||
else:
|
else:
|
||||||
pred_id = None
|
pred_id = None
|
||||||
|
|
||||||
correspondents = Correspondent.objects.all()
|
correspondents = Correspondent.objects.all()
|
||||||
|
|
||||||
return list(filter(
|
return list(filter(
|
||||||
lambda o: matches(o, document_content) or o.pk == pred_id,
|
lambda o: matches(o, document) or o.pk == pred_id,
|
||||||
correspondents))
|
correspondents))
|
||||||
|
|
||||||
|
|
||||||
def match_document_types(document_content, classifier):
|
def match_document_types(document, classifier):
|
||||||
if classifier:
|
if classifier:
|
||||||
pred_id = classifier.predict_document_type(document_content)
|
pred_id = classifier.predict_document_type(document.content)
|
||||||
else:
|
else:
|
||||||
pred_id = None
|
pred_id = None
|
||||||
|
|
||||||
document_types = DocumentType.objects.all()
|
document_types = DocumentType.objects.all()
|
||||||
|
|
||||||
return list(filter(
|
return list(filter(
|
||||||
lambda o: matches(o, document_content) or o.pk == pred_id,
|
lambda o: matches(o, document) or o.pk == pred_id,
|
||||||
document_types))
|
document_types))
|
||||||
|
|
||||||
|
|
||||||
def match_tags(document_content, classifier):
|
def match_tags(document, classifier):
|
||||||
if classifier:
|
if classifier:
|
||||||
predicted_tag_ids = classifier.predict_tags(document_content)
|
predicted_tag_ids = classifier.predict_tags(document.content)
|
||||||
else:
|
else:
|
||||||
predicted_tag_ids = []
|
predicted_tag_ids = []
|
||||||
|
|
||||||
tags = Tag.objects.all()
|
tags = Tag.objects.all()
|
||||||
|
|
||||||
return list(filter(
|
return list(filter(
|
||||||
lambda o: matches(o, document_content) or o.pk in predicted_tag_ids,
|
lambda o: matches(o, document) or o.pk in predicted_tag_ids,
|
||||||
tags))
|
tags))
|
||||||
|
|
||||||
|
|
||||||
def matches(matching_model, document_content):
|
def matches(matching_model, document):
|
||||||
search_kwargs = {}
|
search_kwargs = {}
|
||||||
|
|
||||||
document_content = document_content.lower()
|
document_content = document.content.lower()
|
||||||
|
|
||||||
# Check that match is not empty
|
# Check that match is not empty
|
||||||
if matching_model.match.strip() == "":
|
if matching_model.match.strip() == "":
|
||||||
@ -62,26 +73,54 @@ def matches(matching_model, document_content):
|
|||||||
rf"\b{word}\b", document_content, **search_kwargs)
|
rf"\b{word}\b", document_content, **search_kwargs)
|
||||||
if not search_result:
|
if not search_result:
|
||||||
return False
|
return False
|
||||||
|
log_reason(
|
||||||
|
matching_model, document,
|
||||||
|
f"it contains all of these words: {matching_model.match}"
|
||||||
|
)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
|
elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
|
||||||
for word in _split_match(matching_model):
|
for word in _split_match(matching_model):
|
||||||
if re.search(rf"\b{word}\b", document_content, **search_kwargs):
|
if re.search(rf"\b{word}\b", document_content, **search_kwargs):
|
||||||
|
log_reason(
|
||||||
|
matching_model, document,
|
||||||
|
f"it contains this word: {word}"
|
||||||
|
)
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
|
elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
|
||||||
return bool(re.search(
|
result = bool(re.search(
|
||||||
rf"\b{matching_model.match}\b",
|
rf"\b{matching_model.match}\b",
|
||||||
document_content,
|
document_content,
|
||||||
**search_kwargs
|
**search_kwargs
|
||||||
))
|
))
|
||||||
|
if result:
|
||||||
|
log_reason(
|
||||||
|
matching_model, document,
|
||||||
|
f"it contains this string: \"{matching_model.match}\""
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
|
||||||
elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
|
elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
|
||||||
return bool(re.search(
|
try:
|
||||||
re.compile(matching_model.match, **search_kwargs),
|
match = re.search(
|
||||||
document_content
|
re.compile(matching_model.match, **search_kwargs),
|
||||||
))
|
document_content
|
||||||
|
)
|
||||||
|
except re.error:
|
||||||
|
logger.error(
|
||||||
|
f"Error while processing regular expression "
|
||||||
|
f"{matching_model.match}"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
if match:
|
||||||
|
log_reason(
|
||||||
|
matching_model, document,
|
||||||
|
f"the string {match.group()} matches the regular expression "
|
||||||
|
f"{matching_model.match}"
|
||||||
|
)
|
||||||
|
return bool(match)
|
||||||
|
|
||||||
elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
|
elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
|
||||||
match = re.sub(r'[^\w\s]', '', matching_model.match)
|
match = re.sub(r'[^\w\s]', '', matching_model.match)
|
||||||
@ -89,8 +128,16 @@ def matches(matching_model, document_content):
|
|||||||
if matching_model.is_insensitive:
|
if matching_model.is_insensitive:
|
||||||
match = match.lower()
|
match = match.lower()
|
||||||
text = text.lower()
|
text = text.lower()
|
||||||
|
if fuzz.partial_ratio(match, text) >= 90:
|
||||||
return fuzz.partial_ratio(match, text) >= 90
|
# TODO: make this better
|
||||||
|
log_reason(
|
||||||
|
matching_model, document,
|
||||||
|
f"parts of the document content somehow match the string "
|
||||||
|
f"{matching_model.match}"
|
||||||
|
)
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
|
elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
|
||||||
# this is done elsewhere.
|
# this is done elsewhere.
|
||||||
|
@ -38,7 +38,7 @@ def set_correspondent(sender,
|
|||||||
if document.correspondent and not replace:
|
if document.correspondent and not replace:
|
||||||
return
|
return
|
||||||
|
|
||||||
potential_correspondents = matching.match_correspondents(document.content,
|
potential_correspondents = matching.match_correspondents(document,
|
||||||
classifier)
|
classifier)
|
||||||
|
|
||||||
potential_count = len(potential_correspondents)
|
potential_count = len(potential_correspondents)
|
||||||
@ -81,7 +81,7 @@ def set_document_type(sender,
|
|||||||
if document.document_type and not replace:
|
if document.document_type and not replace:
|
||||||
return
|
return
|
||||||
|
|
||||||
potential_document_type = matching.match_document_types(document.content,
|
potential_document_type = matching.match_document_types(document,
|
||||||
classifier)
|
classifier)
|
||||||
|
|
||||||
potential_count = len(potential_document_type)
|
potential_count = len(potential_document_type)
|
||||||
@ -130,7 +130,7 @@ def set_tags(sender,
|
|||||||
|
|
||||||
current_tags = set(document.tags.all())
|
current_tags = set(document.tags.all())
|
||||||
|
|
||||||
matched_tags = matching.match_tags(document.content, classifier)
|
matched_tags = matching.match_tags(document, classifier)
|
||||||
|
|
||||||
relevant_tags = set(matched_tags) - current_tags
|
relevant_tags = set(matched_tags) - current_tags
|
||||||
|
|
||||||
|
@ -21,13 +21,15 @@ class TestMatching(TestCase):
|
|||||||
matching_algorithm=getattr(klass, algorithm)
|
matching_algorithm=getattr(klass, algorithm)
|
||||||
)
|
)
|
||||||
for string in true:
|
for string in true:
|
||||||
|
doc = Document(content=string)
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
matching.matches(instance, string),
|
matching.matches(instance, doc),
|
||||||
'"%s" should match "%s" but it does not' % (text, string)
|
'"%s" should match "%s" but it does not' % (text, string)
|
||||||
)
|
)
|
||||||
for string in false:
|
for string in false:
|
||||||
|
doc = Document(content=string)
|
||||||
self.assertFalse(
|
self.assertFalse(
|
||||||
matching.matches(instance, string),
|
matching.matches(instance, doc),
|
||||||
'"%s" should not match "%s" but it does' % (text, string)
|
'"%s" should not match "%s" but it does' % (text, string)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -169,7 +171,7 @@ class TestMatching(TestCase):
|
|||||||
def test_match_regex(self):
|
def test_match_regex(self):
|
||||||
|
|
||||||
self._test_matching(
|
self._test_matching(
|
||||||
r"alpha\w+gamma",
|
"alpha\w+gamma",
|
||||||
"MATCH_REGEX",
|
"MATCH_REGEX",
|
||||||
(
|
(
|
||||||
"I have alpha_and_gamma in me",
|
"I have alpha_and_gamma in me",
|
||||||
@ -187,6 +189,16 @@ class TestMatching(TestCase):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_tach_invalid_regex(self):
    # "[[" is not a valid regular expression. With no strings expected to
    # match and one string expected not to match, this checks that an
    # unparsable pattern is treated as a non-match.
    # NOTE(review): "tach" in the method name looks like a typo for
    # "match" -- confirm before renaming.
    invalid_pattern = "[["
    expected_matches = []
    expected_misses = ["Don't match this"]
    self._test_matching(
        invalid_pattern, "MATCH_REGEX", expected_matches, expected_misses)
|
||||||
|
|
||||||
def test_match_fuzzy(self):
|
def test_match_fuzzy(self):
|
||||||
|
|
||||||
self._test_matching(
|
self._test_matching(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user