Cleans up the docs, adds validation of the process count, and includes the test descriptions

This commit is contained in:
Trenton H 2023-09-11 17:10:09 -07:00
parent a03a745295
commit 81b9f2d4e0
3 changed files with 73 additions and 4 deletions

View File

@ -582,8 +582,11 @@ duplicate. But the content should be exact or close, allowing detection.
This tool does a fuzzy match over document content, looking for
those which look close according to a given ratio.
At this time, other metadata (such as correspondent or type) is not
taken into account by the detection.
```
document_fuzzy_match [--ratio]
document_fuzzy_match [--ratio] [--processes N]
```
| Option | Required | Default | Description |

View File

@ -27,6 +27,10 @@ class _WorkResult:
def _process_and_match(work: _WorkPackage) -> _WorkResult:
"""
Does basic processing of document content, gets the basic ratio
and returns the result package
"""
# Normalize the string some, lower case, whitespace, etc
first_string = rapidfuzz.utils.default_process(work.first_doc.content)
second_string = rapidfuzz.utils.default_process(work.second_doc.content)
@ -72,6 +76,9 @@ class Command(BaseCommand):
if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
raise CommandError("The ratio must be between 0 and 100")
if options["processes"] < 1:
raise CommandError("There must be at least 1 process")
all_docs = Document.objects.all().order_by("id")
# Build work packages for processing

View File

@ -22,15 +22,54 @@ class TestFuzzyMatchCommand(TestCase):
return stdout.getvalue(), stderr.getvalue()
def test_invalid_ratio_lower_limit(self):
with self.assertRaises(CommandError):
"""
GIVEN:
- Invalid ratio below lower limit
WHEN:
- Command is called
THEN:
- Error is raised indicating issue
"""
with self.assertRaises(CommandError) as e:
self.call_command("--ratio", "-1")
self.assertIn("The ratio must be between 0 and 100", str(e))
def test_invalid_ratio_upper_limit(self):
with self.assertRaises(CommandError):
"""
GIVEN:
- Invalid ratio above upper limit
WHEN:
- Command is called
THEN:
- Error is raised indicating issue
"""
with self.assertRaises(CommandError) as e:
self.call_command("--ratio", "101")
self.assertIn("The ratio must be between 0 and 100", str(e))
def test_invalid_process_count(self):
    """
    GIVEN:
        - Invalid process count (less than 1)
    WHEN:
        - Command is called
    THEN:
        - Error is raised indicating issue
    """
    with self.assertRaises(CommandError) as e:
        self.call_command("--processes", "0")
    # `e` is the assertRaises context manager; the raised error is available
    # as `e.exception` once the with-block has exited, so the message check
    # must run here (after the block) and against the exception itself.
    self.assertIn("There must be at least 1 process", str(e.exception))
def test_no_matches(self):
# Content similarity is 82.35
"""
GIVEN:
- 2 documents exist
- Similarity between content is 82.32
WHEN:
- Command is called
THEN:
- No matches are found
"""
Document.objects.create(
checksum="BEEFCAFE",
title="A",
@ -49,6 +88,16 @@ class TestFuzzyMatchCommand(TestCase):
self.assertEqual(stdout, "No matches found\n")
def test_with_matches(self):
"""
GIVEN:
- 2 documents exist
- Similarity between content is 86.667
WHEN:
- Command is called
THEN:
- 1 match is returned from doc 1 to doc 2
- No match from doc 2 to doc 1 reported
"""
# Content similarity is 86.667
Document.objects.create(
checksum="BEEFCAFE",
@ -68,6 +117,16 @@ class TestFuzzyMatchCommand(TestCase):
self.assertEqual(stdout, "Document 1 fuzzy match to 2 (confidence 86.667)\n")
def test_with_3_matches(self):
"""
GIVEN:
- 3 documents exist
- All documents have similarity over 85.0
WHEN:
- Command is called
THEN:
- 3 matches are returned from each document to the others
- No duplication of matches returned
"""
# Content similarity is 86.667
Document.objects.create(
checksum="BEEFCAFE",