From e2ae919a846006f6dd25c3bba9aa58adbaab8364 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Sun, 10 Sep 2023 16:32:10 -0700 Subject: [PATCH] Implements a new command for fuzzy matching document content and reporting potential duplicates --- docker/install_management_commands.sh | 1 + docs/administration.md | 17 ++++ .../commands/document_fuzzy_match.py | 63 ++++++++++++ src/documents/tests/test_management_fuzzy.py | 97 +++++++++++++++++++ 4 files changed, 178 insertions(+) create mode 100644 src/documents/management/commands/document_fuzzy_match.py create mode 100644 src/documents/tests/test_management_fuzzy.py diff --git a/docker/install_management_commands.sh b/docker/install_management_commands.sh index e5c8b30a0..38604af9d 100755 --- a/docker/install_management_commands.sh +++ b/docker/install_management_commands.sh @@ -13,6 +13,7 @@ for command in decrypt_documents \ document_retagger \ document_thumbnails \ document_sanity_checker \ + document_fuzzy_match \ manage_superuser; do echo "installing $command..." diff --git a/docs/administration.md b/docs/administration.md index 2003edec9..7ecdb76a6 100644 --- a/docs/administration.md +++ b/docs/administration.md @@ -572,3 +572,20 @@ it here) ``` decrypt_documents [--passphrase SECR3TP4SSPHRA$E] ``` + +### Detecting duplicates {#fuzzy_duplicate} + +Paperless already catches and prevents upload of exactly matching documents, +however a new scan of an existing document may not produce an exact bit for bit +duplicate. But the content should be exact or close, allowing detection. + +This tool does a fuzzy match over document content, looking for +those which look close according to a given ratio. + +``` +document_fuzzy_match [--ratio] +``` + +Optional arguments: +--ratio - a number between 0 and 100, setting how similar a document must be for it to be reported. +Higher numbers mean more similarity. 
# src/documents/management/commands/document_fuzzy_match.py
from itertools import combinations
from typing import Final

import rapidfuzz
from django.core.management import BaseCommand
from django.core.management import CommandError

from documents.models import Document


class Command(BaseCommand):
    """Report pairs of documents whose text content is nearly identical.

    Every document's content is compared against every other document's
    content with ``rapidfuzz.fuzz.ratio``; pairs scoring at or above the
    ``--ratio`` threshold are written to stdout, one line per pair.
    """

    help = "Searches for documents where the content almost matches"

    def add_arguments(self, parser):
        """Register the optional ``--ratio`` threshold (float, default 85.0)."""
        parser.add_argument(
            "--ratio",
            default=85.0,
            type=float,
            help="Ratio to consider documents a match",
        )

    def handle(self, *args, **options):
        """Compare all documents pairwise and print the likely duplicates.

        Args:
            *args: Unused positional arguments supplied by Django.
            **options: Parsed command options; ``options["ratio"]`` is the
                minimum similarity (0 to 100) for a pair to be reported.

        Raises:
            CommandError: If the ratio is outside 0..100 (or is NaN).
        """
        RATIO_MIN: Final[float] = 0.0
        RATIO_MAX: Final[float] = 100.0

        opt_ratio = options["ratio"]

        # Written as a chained comparison so that NaN, for which every
        # comparison is False, is rejected instead of silently accepted.
        if not (RATIO_MIN <= opt_ratio <= RATIO_MAX):
            raise CommandError("The ratio must be between 0 and 100")

        # Normalize each document exactly once (lower case, whitespace, etc)
        # rather than on every comparison it takes part in.
        processed = [
            (doc.pk, rapidfuzz.utils.default_process(doc.content))
            for doc in Document.objects.all().order_by("id")
        ]

        # combinations() yields each unordered pair once, in id order, so
        # "doc 1 to doc 2" is never repeated as "doc 2 to doc 1" and no
        # bookkeeping of already-reported pairs is needed.
        for (first_pk, first_string), (second_pk, second_string) in combinations(
            processed,
            2,
        ):
            # Basic matching ratio
            match = rapidfuzz.fuzz.ratio(first_string, second_string)

            if match >= opt_ratio:
                self.stdout.write(
                    self.style.NOTICE(
                        f"Document {first_pk} fuzzy match"
                        f" to {second_pk} (confidence {match:.3f})",
                    ),
                )
# src/documents/tests/test_management_fuzzy.py
from io import StringIO

from django.core.management import CommandError
from django.core.management import call_command
from django.test import TestCase

from documents.models import Document


class TestFuzzyMatchCommand(TestCase):
    """Exercises the ``document_fuzzy_match`` management command."""

    @staticmethod
    def _store(checksum, content, filename):
        """Create a PDF document titled "A" with the given distinguishing fields."""
        return Document.objects.create(
            checksum=checksum,
            title="A",
            content=content,
            mime_type="application/pdf",
            filename=filename,
        )

    def call_command(self, *args, **kwargs):
        """Run the command and return its captured ``(stdout, stderr)`` text."""
        out_buffer, err_buffer = StringIO(), StringIO()
        call_command(
            "document_fuzzy_match",
            *args,
            stdout=out_buffer,
            stderr=err_buffer,
            **kwargs,
        )
        return out_buffer.getvalue(), err_buffer.getvalue()

    def test_invalid_ratio_lower_limit(self):
        """A ratio below 0 is rejected with ``CommandError``."""
        with self.assertRaises(CommandError):
            self.call_command("--ratio", "-1")

    def test_invalid_ratio_upper_limit(self):
        """A ratio above 100 is rejected with ``CommandError``."""
        with self.assertRaises(CommandError):
            self.call_command("--ratio", "101")

    def test_no_matches(self):
        """Two documents below the default threshold produce no output."""
        # Content similarity is 82.35
        self._store("BEEFCAFE", "first document", "test.pdf")
        self._store("DEADBEAF", "other first document", "other_test.pdf")

        captured_out, _ = self.call_command()

        self.assertEqual(captured_out, "")

    def test_with_matches(self):
        """Two documents above the default threshold are reported once."""
        # Content similarity is 86.667
        self._store("BEEFCAFE", "first document scanned by bob", "test.pdf")
        self._store("DEADBEAF", "first document scanned by alice", "other_test.pdf")

        captured_out, _ = self.call_command()

        self.assertEqual(
            captured_out,
            "Document 1 fuzzy match to 2 (confidence 86.667)\n",
        )

    def test_with_3_matches(self):
        """Three mutually similar documents yield one line per unordered pair."""
        self._store("BEEFCAFE", "first document scanned by bob", "test.pdf")
        self._store("DEADBEAF", "first document scanned by alice", "other_test.pdf")
        self._store("CATTLE", "first document scanned by pete", "final_test.pdf")

        captured_out, _ = self.call_command()

        # Drop blank lines so only the reported matches remain
        reported = []
        for raw_line in captured_out.split("\n"):
            stripped = raw_line.strip()
            if len(stripped):
                reported.append(stripped)

        expected = (
            "Document 1 fuzzy match to 2 (confidence 86.667)",
            "Document 1 fuzzy match to 3 (confidence 88.136)",
            "Document 2 fuzzy match to 3 (confidence 88.525)",
        )
        self.assertEqual(len(reported), 3)
        for position, wanted in enumerate(expected):
            self.assertEqual(reported[position], wanted)