diff --git a/docs/administration.md b/docs/administration.md index 808d6afaf..cf8a24294 100644 --- a/docs/administration.md +++ b/docs/administration.md @@ -607,3 +607,10 @@ document_fuzzy_match [--ratio] [--processes N] | ----------- | -------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------ | | --ratio | No | 85.0 | a number between 0 and 100, setting how similar a document must be for it to be reported. Higher numbers mean more similarity. | | --processes | No | 1/4 of system cores | Number of processes to use for matching. Setting 1 disables multiple processes | +| --delete | No | False | If provided, one document of a matched pair above the ratio will be deleted. | + +!!! warning + + If providing the `--delete` option, it is highly recommended to have a backup. + While every effort has been taken to ensure proper operation, there is always the + chance of deletion of a file you want to keep. diff --git a/src/documents/management/commands/document_fuzzy_match.py b/src/documents/management/commands/document_fuzzy_match.py index 597a9d2c1..9e01ff1b0 100644 --- a/src/documents/management/commands/document_fuzzy_match.py +++ b/src/documents/management/commands/document_fuzzy_match.py @@ -53,6 +53,12 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): type=float, help="Ratio to consider documents a match", ) + parser.add_argument( + "--delete", + default=False, + action="store_true", + help="If set, one document of matches above the ratio WILL BE DELETED", + ) self.add_argument_progress_bar_mixin(parser) self.add_argument_processes_mixin(parser) @@ -63,6 +69,13 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): self.handle_processes_mixin(**options) self.handle_progress_bar_mixin(**options) + if options["delete"]: + self.stdout.write( + self.style.WARNING( + "The command is configured to delete documents. 
Use with caution", + ), + ) + opt_ratio = options["ratio"] checked_pairs: set[tuple[int, int]] = set() work_pkgs: list[_WorkPackage] = [] @@ -81,15 +94,12 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): continue # Skip matching which have already been matched together # doc 1 to doc 2 is the same as doc 2 to doc 1 - if (first_doc.pk, second_doc.pk) in checked_pairs or ( - second_doc.pk, - first_doc.pk, - ) in checked_pairs: + doc_1_to_doc_2 = (first_doc.pk, second_doc.pk) + doc_2_to_doc_1 = doc_1_to_doc_2[::-1] + if doc_1_to_doc_2 in checked_pairs or doc_2_to_doc_1 in checked_pairs: continue - checked_pairs.update( - [(first_doc.pk, second_doc.pk), (second_doc.pk, first_doc.pk)], - ) - + checked_pairs.update([doc_1_to_doc_2, doc_2_to_doc_1]) + # Actually something useful to work on now work_pkgs.append(_WorkPackage(first_doc, second_doc)) # Don't spin up a pool of 1 process @@ -109,6 +119,8 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): # Check results messages = [] + # A set, because one document can be the second half of several matched pairs + maybe_delete_ids: set[int] = set() for result in sorted(results): if result.ratio >= opt_ratio: messages.append( @@ -117,6 +129,7 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): f" to {result.doc_two_pk} (confidence {result.ratio:.3f})", ), ) + maybe_delete_ids.add(result.doc_two_pk) if len(messages) == 0: messages.append( @@ -125,3 +138,10 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): self.stdout.writelines( messages, ) + if options["delete"]: + self.stdout.write( + self.style.NOTICE( + f"Deleting {len(maybe_delete_ids)} documents based on ratio matches", + ), + ) + Document.objects.filter(pk__in=maybe_delete_ids).delete() diff --git a/src/documents/tests/test_management_fuzzy.py b/src/documents/tests/test_management_fuzzy.py index abbf3c921..c215c43ca 100644 --- a/src/documents/tests/test_management_fuzzy.py +++ b/src/documents/tests/test_management_fuzzy.py @@ -157,3 +157,55 @@ class TestFuzzyMatchCommand(TestCase): 
self.assertRegex(lines[0], self.MSG_REGEX) self.assertRegex(lines[1], self.MSG_REGEX) self.assertRegex(lines[2], self.MSG_REGEX) + + def test_document_deletion(self): + """ + GIVEN: + - 3 documents exist + - Document 1 to document 3 has a similarity over 85.0 + WHEN: + - Command is called with the --delete option + THEN: + - User is warned about the deletion flag + - Document 3 is deleted + - Documents 1 and 2 remain + """ + # Content similarity is 86.667 + Document.objects.create( + checksum="BEEFCAFE", + title="A", + content="first document scanned by bob", + mime_type="application/pdf", + filename="test.pdf", + ) + Document.objects.create( + checksum="DEADBEAF", + title="A", + content="second document scanned by alice", + mime_type="application/pdf", + filename="other_test.pdf", + ) + Document.objects.create( + checksum="CATTLE", + title="A", + content="first document scanned by pete", + mime_type="application/pdf", + filename="final_test.pdf", + ) + + self.assertEqual(Document.objects.count(), 3) + + stdout, _ = self.call_command("--delete") + print(stdout) + lines = [x.strip() for x in stdout.split("\n") if len(x.strip())] + self.assertEqual(len(lines), 3) + self.assertEqual( + lines[0], + "The command is configured to delete documents. Use with caution", + ) + self.assertRegex(lines[1], self.MSG_REGEX) + self.assertEqual(lines[2], "Deleting 1 documents based on ratio matches") + + self.assertEqual(Document.objects.count(), 2) + self.assertIsNotNone(Document.objects.get(pk=1)) + self.assertIsNotNone(Document.objects.get(pk=2))