mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Cleans up the docs, adds validation of the process count, includes the test descriptions
This commit is contained in:
parent
a03a745295
commit
81b9f2d4e0
@ -582,8 +582,11 @@ duplicate. But the content should be exact or close, allowing detection.
|
||||
This tool does a fuzzy match over document content, looking for
|
||||
those which look close according to a given ratio.
|
||||
|
||||
At this time, other metadata (such as correspondent or type) is not
|
||||
taken into account by the detection.
|
||||
|
||||
```
|
||||
document_fuzzy_match [--ratio]
|
||||
document_fuzzy_match [--ratio] [--processes N]
|
||||
```
|
||||
|
||||
| Option | Required | Default | Description |
|
||||
|
@ -27,6 +27,10 @@ class _WorkResult:
|
||||
|
||||
|
||||
def _process_and_match(work: _WorkPackage) -> _WorkResult:
|
||||
"""
|
||||
Does basic processing of document content, gets the basic ratio
|
||||
and returns the result package
|
||||
"""
|
||||
# Normalize the string some, lower case, whitespace, etc
|
||||
first_string = rapidfuzz.utils.default_process(work.first_doc.content)
|
||||
second_string = rapidfuzz.utils.default_process(work.second_doc.content)
|
||||
@ -72,6 +76,9 @@ class Command(BaseCommand):
|
||||
if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
|
||||
raise CommandError("The ratio must be between 0 and 100")
|
||||
|
||||
if options["processes"] < 1:
|
||||
raise CommandError("There must be at least 1 process")
|
||||
|
||||
all_docs = Document.objects.all().order_by("id")
|
||||
|
||||
# Build work packages for processing
|
||||
|
@ -22,15 +22,54 @@ class TestFuzzyMatchCommand(TestCase):
|
||||
return stdout.getvalue(), stderr.getvalue()
|
||||
|
||||
def test_invalid_ratio_lower_limit(self):
|
||||
with self.assertRaises(CommandError):
|
||||
"""
|
||||
GIVEN:
|
||||
- Invalid ratio below lower limit
|
||||
WHEN:
|
||||
- Command is called
|
||||
THEN:
|
||||
- Error is raised indicating issue
|
||||
"""
|
||||
with self.assertRaises(CommandError) as e:
|
||||
self.call_command("--ratio", "-1")
|
||||
self.assertIn("The ratio must be between 0 and 100", str(e))
|
||||
|
||||
def test_invalid_ratio_upper_limit(self):
|
||||
with self.assertRaises(CommandError):
|
||||
"""
|
||||
GIVEN:
|
||||
- Invalid ratio above upper limit
|
||||
WHEN:
|
||||
- Command is called
|
||||
THEN:
|
||||
- Error is raised indicating issue
|
||||
"""
|
||||
with self.assertRaises(CommandError) as e:
|
||||
self.call_command("--ratio", "101")
|
||||
self.assertIn("The ratio must be between 0 and 100", str(e))
|
||||
|
||||
def test_invalid_process_count(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Invalid process count less than 1
|
||||
WHEN:
|
||||
- Command is called
|
||||
THEN:
|
||||
- Error is raised indicating issue
|
||||
"""
|
||||
with self.assertRaises(CommandError) as e:
|
||||
self.call_command("--processes", "0")
|
||||
self.assertIn("There must be at least 1 process", str(e))
|
||||
|
||||
def test_no_matches(self):
|
||||
# Content similarity is 82.35
|
||||
"""
|
||||
GIVEN:
|
||||
- 2 documents exist
|
||||
- Similarity between content is 82.35
|
||||
WHEN:
|
||||
- Command is called
|
||||
THEN:
|
||||
- No matches are found
|
||||
"""
|
||||
Document.objects.create(
|
||||
checksum="BEEFCAFE",
|
||||
title="A",
|
||||
@ -49,6 +88,16 @@ class TestFuzzyMatchCommand(TestCase):
|
||||
self.assertEqual(stdout, "No matches found\n")
|
||||
|
||||
def test_with_matches(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- 2 documents exist
|
||||
- Similarity between content is 86.667
|
||||
WHEN:
|
||||
- Command is called
|
||||
THEN:
|
||||
- 1 match is returned from doc 1 to doc 2
|
||||
- No match from doc 2 to doc 1 reported
|
||||
"""
|
||||
# Content similarity is 86.667
|
||||
Document.objects.create(
|
||||
checksum="BEEFCAFE",
|
||||
@ -68,6 +117,16 @@ class TestFuzzyMatchCommand(TestCase):
|
||||
self.assertEqual(stdout, "Document 1 fuzzy match to 2 (confidence 86.667)\n")
|
||||
|
||||
def test_with_3_matches(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- 3 documents exist
|
||||
- All documents have similarity over 85.0
|
||||
WHEN:
|
||||
- Command is called
|
||||
THEN:
|
||||
- 3 matches are returned from each document to the others
|
||||
- No duplication of matches returned
|
||||
"""
|
||||
# Content similarity is 86.667
|
||||
Document.objects.create(
|
||||
checksum="BEEFCAFE",
|
||||
|
Loading…
x
Reference in New Issue
Block a user