mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Cleans up the docs, adds validation of the process count, include the test descriptions
This commit is contained in:
parent
a03a745295
commit
81b9f2d4e0
@ -582,8 +582,11 @@ duplicate. But the content should be exact or close, allowing detection.
|
|||||||
This tool does a fuzzy match over document content, looking for
|
This tool does a fuzzy match over document content, looking for
|
||||||
those which look close according to a given ratio.
|
those which look close according to a given ratio.
|
||||||
|
|
||||||
|
At this time, other metadata (such as correspondent or type) is not
|
||||||
|
taken into account by the detection.
|
||||||
|
|
||||||
```
|
```
|
||||||
document_fuzzy_match [--ratio]
|
document_fuzzy_match [--ratio] [--processes N]
|
||||||
```
|
```
|
||||||
|
|
||||||
| Option | Required | Default | Description |
|
| Option | Required | Default | Description |
|
||||||
|
@ -27,6 +27,10 @@ class _WorkResult:
|
|||||||
|
|
||||||
|
|
||||||
def _process_and_match(work: _WorkPackage) -> _WorkResult:
|
def _process_and_match(work: _WorkPackage) -> _WorkResult:
|
||||||
|
"""
|
||||||
|
Does basic processing of document content, gets the basic ratio
|
||||||
|
and returns the result package
|
||||||
|
"""
|
||||||
# Normalize the string some, lower case, whitespace, etc
|
# Normalize the string some, lower case, whitespace, etc
|
||||||
first_string = rapidfuzz.utils.default_process(work.first_doc.content)
|
first_string = rapidfuzz.utils.default_process(work.first_doc.content)
|
||||||
second_string = rapidfuzz.utils.default_process(work.second_doc.content)
|
second_string = rapidfuzz.utils.default_process(work.second_doc.content)
|
||||||
@ -72,6 +76,9 @@ class Command(BaseCommand):
|
|||||||
if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
|
if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
|
||||||
raise CommandError("The ratio must be between 0 and 100")
|
raise CommandError("The ratio must be between 0 and 100")
|
||||||
|
|
||||||
|
if options["processes"] < 1:
|
||||||
|
raise CommandError("There must be at least 1 process")
|
||||||
|
|
||||||
all_docs = Document.objects.all().order_by("id")
|
all_docs = Document.objects.all().order_by("id")
|
||||||
|
|
||||||
# Build work packages for processing
|
# Build work packages for processing
|
||||||
|
@ -22,15 +22,54 @@ class TestFuzzyMatchCommand(TestCase):
|
|||||||
return stdout.getvalue(), stderr.getvalue()
|
return stdout.getvalue(), stderr.getvalue()
|
||||||
|
|
||||||
def test_invalid_ratio_lower_limit(self):
|
def test_invalid_ratio_lower_limit(self):
|
||||||
with self.assertRaises(CommandError):
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- Invalid ratio below lower limit
|
||||||
|
WHEN:
|
||||||
|
- Command is called
|
||||||
|
THEN:
|
||||||
|
- Error is raised indicating issue
|
||||||
|
"""
|
||||||
|
with self.assertRaises(CommandError) as e:
|
||||||
self.call_command("--ratio", "-1")
|
self.call_command("--ratio", "-1")
|
||||||
|
self.assertIn("The ratio must be between 0 and 100", str(e))
|
||||||
|
|
||||||
def test_invalid_ratio_upper_limit(self):
|
def test_invalid_ratio_upper_limit(self):
|
||||||
with self.assertRaises(CommandError):
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- Invalid ratio above upper limit
|
||||||
|
WHEN:
|
||||||
|
- Command is called
|
||||||
|
THEN:
|
||||||
|
- Error is raised indicating issue
|
||||||
|
"""
|
||||||
|
with self.assertRaises(CommandError) as e:
|
||||||
self.call_command("--ratio", "101")
|
self.call_command("--ratio", "101")
|
||||||
|
self.assertIn("The ratio must be between 0 and 100", str(e))
|
||||||
|
|
||||||
|
def test_invalid_process_count(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- Invalid process count below the minimum of 1
|
||||||
|
WHEN:
|
||||||
|
- Command is called
|
||||||
|
THEN:
|
||||||
|
- Error is raised indicating issue
|
||||||
|
"""
|
||||||
|
with self.assertRaises(CommandError) as e:
|
||||||
|
self.call_command("--processes", "0")
|
||||||
|
self.assertIn("There must be at least 1 process", str(e))
|
||||||
|
|
||||||
def test_no_matches(self):
|
def test_no_matches(self):
|
||||||
# Content similarity is 82.35
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- 2 documents exist
|
||||||
|
- Similarity between content is 82.32
|
||||||
|
WHEN:
|
||||||
|
- Command is called
|
||||||
|
THEN:
|
||||||
|
- No matches are found
|
||||||
|
"""
|
||||||
Document.objects.create(
|
Document.objects.create(
|
||||||
checksum="BEEFCAFE",
|
checksum="BEEFCAFE",
|
||||||
title="A",
|
title="A",
|
||||||
@ -49,6 +88,16 @@ class TestFuzzyMatchCommand(TestCase):
|
|||||||
self.assertEqual(stdout, "No matches found\n")
|
self.assertEqual(stdout, "No matches found\n")
|
||||||
|
|
||||||
def test_with_matches(self):
|
def test_with_matches(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- 2 documents exist
|
||||||
|
- Similarity between content is 86.667
|
||||||
|
WHEN:
|
||||||
|
- Command is called
|
||||||
|
THEN:
|
||||||
|
- 1 match is returned from doc 1 to doc 2
|
||||||
|
- No match from doc 2 to doc 1 reported
|
||||||
|
"""
|
||||||
# Content similarity is 86.667
|
# Content similarity is 86.667
|
||||||
Document.objects.create(
|
Document.objects.create(
|
||||||
checksum="BEEFCAFE",
|
checksum="BEEFCAFE",
|
||||||
@ -68,6 +117,16 @@ class TestFuzzyMatchCommand(TestCase):
|
|||||||
self.assertEqual(stdout, "Document 1 fuzzy match to 2 (confidence 86.667)\n")
|
self.assertEqual(stdout, "Document 1 fuzzy match to 2 (confidence 86.667)\n")
|
||||||
|
|
||||||
def test_with_3_matches(self):
|
def test_with_3_matches(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- 3 documents exist
|
||||||
|
- All documents have similarity over 85.0
|
||||||
|
WHEN:
|
||||||
|
- Command is called
|
||||||
|
THEN:
|
||||||
|
- 3 matches are returned from each document to the others
|
||||||
|
- No duplication of matches returned
|
||||||
|
"""
|
||||||
# Content similarity is 86.667
|
# Content similarity is 86.667
|
||||||
Document.objects.create(
|
Document.objects.create(
|
||||||
checksum="BEEFCAFE",
|
checksum="BEEFCAFE",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user