Implements a new command for fuzzy matching document content and reporting potential duplicates

This commit is contained in:
Trenton Holmes
2023-09-10 16:32:10 -07:00
committed by Trenton H
parent 9a0e44a731
commit e2ae919a84
4 changed files with 178 additions and 0 deletions

View File

@@ -0,0 +1,63 @@
from itertools import combinations
from typing import Final

import rapidfuzz
from django.core.management import BaseCommand
from django.core.management import CommandError

from documents.models import Document
class Command(BaseCommand):
    """Management command that reports documents with near-identical content.

    Every unordered pair of documents is scored with ``rapidfuzz.fuzz.ratio``
    on their normalized content, and each pair scoring at or above the
    ``--ratio`` threshold is written to stdout.
    """

    help = "Searches for documents where the content almost matches"

    def add_arguments(self, parser):
        """Register the ``--ratio`` option (a float, 85.0 by default)."""
        parser.add_argument(
            "--ratio",
            default=85.0,
            type=float,
            help="Ratio to consider documents a match",
        )

    def handle(self, *args, **options):
        """Compare all documents pairwise and print the likely duplicates.

        Reads ``options["ratio"]`` as the minimum score for a reported match.

        Raises:
            CommandError: if the ratio is not within 0 to 100 inclusive.
        """
        RATIO_MIN: Final[float] = 0.0
        RATIO_MAX: Final[float] = 100.0

        opt_ratio = options["ratio"]

        # Ratio is a float from 0.0 to 100.0.  The chained comparison is
        # also False for NaN, so "--ratio nan" is rejected rather than
        # silently matching nothing.
        if not (RATIO_MIN <= opt_ratio <= RATIO_MAX):
            raise CommandError("The ratio must be between 0 and 100")

        # Normalize the content (lower case, whitespace, etc) once per
        # document instead of once per comparison.
        normalized_docs = [
            (doc.pk, rapidfuzz.utils.default_process(doc.content))
            for doc in Document.objects.all().order_by("id")
        ]

        # A match of doc 1 to doc 2 is the same as doc 2 to doc 1, so only
        # unordered pairs are visited.  combinations() yields them in input
        # order, so the lower id is always reported first.
        for first, second in combinations(normalized_docs, 2):
            first_pk, first_string = first
            second_pk, second_string = second

            # Basic matching ratio
            match = rapidfuzz.fuzz.ratio(first_string, second_string)

            if match >= opt_ratio:
                self.stdout.write(
                    self.style.NOTICE(
                        f"Document {first_pk} fuzzy match"
                        f" to {second_pk} (confidence {match:.3f})",
                    ),
                )

View File

@@ -0,0 +1,97 @@
from io import StringIO
from django.core.management import CommandError
from django.core.management import call_command
from django.test import TestCase
from documents.models import Document
class TestFuzzyMatchCommand(TestCase):
    """Tests for the ``document_fuzzy_match`` management command."""

    def call_command(self, *args, **kwargs):
        """Run the command and return its captured ``(stdout, stderr)`` text."""
        stdout = StringIO()
        stderr = StringIO()
        call_command(
            "document_fuzzy_match",
            *args,
            stdout=stdout,
            stderr=stderr,
            **kwargs,
        )
        return stdout.getvalue(), stderr.getvalue()

    def _create_document(self, checksum, content, filename):
        """Create and return a PDF document titled "A" with the given content."""
        return Document.objects.create(
            checksum=checksum,
            title="A",
            content=content,
            mime_type="application/pdf",
            filename=filename,
        )

    def test_invalid_ratio_lower_limit(self):
        """A ratio below 0 is rejected with a CommandError."""
        with self.assertRaises(CommandError):
            self.call_command("--ratio", "-1")

    def test_invalid_ratio_upper_limit(self):
        """A ratio above 100 is rejected with a CommandError."""
        with self.assertRaises(CommandError):
            self.call_command("--ratio", "101")

    def test_no_matches(self):
        """Documents scoring below the default ratio produce no output."""
        # Content similarity is 82.35
        self._create_document("BEEFCAFE", "first document", "test.pdf")
        self._create_document("DEADBEAF", "other first document", "other_test.pdf")

        stdout, _ = self.call_command()

        self.assertEqual(stdout, "")

    def test_with_matches(self):
        """A pair scoring above the default ratio is reported exactly once."""
        # Content similarity is 86.667
        doc_bob = self._create_document(
            "BEEFCAFE",
            "first document scanned by bob",
            "test.pdf",
        )
        doc_alice = self._create_document(
            "DEADBEAF",
            "first document scanned by alice",
            "other_test.pdf",
        )

        stdout, _ = self.call_command()

        # Expected ids come from the created rows rather than literals:
        # the database is not guaranteed to restart its id sequence for
        # each test.
        self.assertEqual(
            stdout,
            f"Document {doc_bob.pk} fuzzy match"
            f" to {doc_alice.pk} (confidence 86.667)\n",
        )

    def test_with_3_matches(self):
        """Three mutually similar documents yield one line per unordered pair."""
        # Content similarity is 86.667
        doc_bob = self._create_document(
            "BEEFCAFE",
            "first document scanned by bob",
            "test.pdf",
        )
        doc_alice = self._create_document(
            "DEADBEAF",
            "first document scanned by alice",
            "other_test.pdf",
        )
        doc_pete = self._create_document(
            "CATTLE",
            "first document scanned by pete",
            "final_test.pdf",
        )

        stdout, _ = self.call_command()
        lines = [x.strip() for x in stdout.split("\n") if len(x.strip())]

        self.assertEqual(len(lines), 3)
        self.assertEqual(
            lines[0],
            f"Document {doc_bob.pk} fuzzy match"
            f" to {doc_alice.pk} (confidence 86.667)",
        )
        self.assertEqual(
            lines[1],
            f"Document {doc_bob.pk} fuzzy match"
            f" to {doc_pete.pk} (confidence 88.136)",
        )
        self.assertEqual(
            lines[2],
            f"Document {doc_alice.pk} fuzzy match"
            f" to {doc_pete.pk} (confidence 88.525)",
        )