mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-08-14 00:26:21 +00:00
Implements a new command for fuzzy matching document content and reporting potential duplicates
This commit is contained in:

committed by
Trenton H

parent
9a0e44a731
commit
e2ae919a84
97
src/documents/tests/test_management_fuzzy.py
Normal file
97
src/documents/tests/test_management_fuzzy.py
Normal file
@@ -0,0 +1,97 @@
|
||||
from io import StringIO
|
||||
|
||||
from django.core.management import CommandError
|
||||
from django.core.management import call_command
|
||||
from django.test import TestCase
|
||||
|
||||
from documents.models import Document
|
||||
|
||||
|
||||
class TestFuzzyMatchCommand(TestCase):
|
||||
def call_command(self, *args, **kwargs):
|
||||
stdout = StringIO()
|
||||
stderr = StringIO()
|
||||
call_command(
|
||||
"document_fuzzy_match",
|
||||
*args,
|
||||
stdout=stdout,
|
||||
stderr=stderr,
|
||||
**kwargs,
|
||||
)
|
||||
return stdout.getvalue(), stderr.getvalue()
|
||||
|
||||
def test_invalid_ratio_lower_limit(self):
|
||||
with self.assertRaises(CommandError):
|
||||
self.call_command("--ratio", "-1")
|
||||
|
||||
def test_invalid_ratio_upper_limit(self):
|
||||
with self.assertRaises(CommandError):
|
||||
self.call_command("--ratio", "101")
|
||||
|
||||
def test_no_matches(self):
|
||||
# Content similarity is 82.35
|
||||
Document.objects.create(
|
||||
checksum="BEEFCAFE",
|
||||
title="A",
|
||||
content="first document",
|
||||
mime_type="application/pdf",
|
||||
filename="test.pdf",
|
||||
)
|
||||
Document.objects.create(
|
||||
checksum="DEADBEAF",
|
||||
title="A",
|
||||
content="other first document",
|
||||
mime_type="application/pdf",
|
||||
filename="other_test.pdf",
|
||||
)
|
||||
stdout, _ = self.call_command()
|
||||
self.assertEqual(stdout, "")
|
||||
|
||||
def test_with_matches(self):
|
||||
# Content similarity is 86.667
|
||||
Document.objects.create(
|
||||
checksum="BEEFCAFE",
|
||||
title="A",
|
||||
content="first document scanned by bob",
|
||||
mime_type="application/pdf",
|
||||
filename="test.pdf",
|
||||
)
|
||||
Document.objects.create(
|
||||
checksum="DEADBEAF",
|
||||
title="A",
|
||||
content="first document scanned by alice",
|
||||
mime_type="application/pdf",
|
||||
filename="other_test.pdf",
|
||||
)
|
||||
stdout, _ = self.call_command()
|
||||
self.assertEqual(stdout, "Document 1 fuzzy match to 2 (confidence 86.667)\n")
|
||||
|
||||
def test_with_3_matches(self):
|
||||
# Content similarity is 86.667
|
||||
Document.objects.create(
|
||||
checksum="BEEFCAFE",
|
||||
title="A",
|
||||
content="first document scanned by bob",
|
||||
mime_type="application/pdf",
|
||||
filename="test.pdf",
|
||||
)
|
||||
Document.objects.create(
|
||||
checksum="DEADBEAF",
|
||||
title="A",
|
||||
content="first document scanned by alice",
|
||||
mime_type="application/pdf",
|
||||
filename="other_test.pdf",
|
||||
)
|
||||
Document.objects.create(
|
||||
checksum="CATTLE",
|
||||
title="A",
|
||||
content="first document scanned by pete",
|
||||
mime_type="application/pdf",
|
||||
filename="final_test.pdf",
|
||||
)
|
||||
stdout, _ = self.call_command()
|
||||
lines = [x.strip() for x in stdout.split("\n") if len(x.strip())]
|
||||
self.assertEqual(len(lines), 3)
|
||||
self.assertEqual(lines[0], "Document 1 fuzzy match to 2 (confidence 86.667)")
|
||||
self.assertEqual(lines[1], "Document 1 fuzzy match to 3 (confidence 88.136)")
|
||||
self.assertEqual(lines[2], "Document 2 fuzzy match to 3 (confidence 88.525)")
|
Reference in New Issue
Block a user