mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Feature: Allow deletion of documents via the fuzzy matching command (#4957)
* Adds a new flag allowing deletion of one document from each pair whose similarity is at or above the match ratio
* Documents the new command option
This commit is contained in:
		| @@ -53,6 +53,12 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): | ||||
|             type=float, | ||||
|             help="Ratio to consider documents a match", | ||||
|         ) | ||||
|         parser.add_argument( | ||||
|             "--delete", | ||||
|             default=False, | ||||
|             action="store_true", | ||||
|             help="If set, one document of matches above the ratio WILL BE DELETED", | ||||
|         ) | ||||
|         self.add_argument_progress_bar_mixin(parser) | ||||
|         self.add_argument_processes_mixin(parser) | ||||
|  | ||||
| @@ -63,6 +69,13 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): | ||||
|         self.handle_processes_mixin(**options) | ||||
|         self.handle_progress_bar_mixin(**options) | ||||
|  | ||||
|         if options["delete"]: | ||||
|             self.stdout.write( | ||||
|                 self.style.WARNING( | ||||
|                     "The command is configured to delete documents.  Use with caution", | ||||
|                 ), | ||||
|             ) | ||||
|  | ||||
|         opt_ratio = options["ratio"] | ||||
|         checked_pairs: set[tuple[int, int]] = set() | ||||
|         work_pkgs: list[_WorkPackage] = [] | ||||
| @@ -81,15 +94,12 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): | ||||
|                     continue | ||||
|                 # Skip matching which have already been matched together | ||||
|                 # doc 1 to doc 2 is the same as doc 2 to doc 1 | ||||
|                 if (first_doc.pk, second_doc.pk) in checked_pairs or ( | ||||
|                     second_doc.pk, | ||||
|                     first_doc.pk, | ||||
|                 ) in checked_pairs: | ||||
|                 doc_1_to_doc_2 = (first_doc.pk, second_doc.pk) | ||||
|                 doc_2_to_doc_1 = doc_1_to_doc_2[::-1] | ||||
|                 if doc_1_to_doc_2 in checked_pairs or doc_2_to_doc_1 in checked_pairs: | ||||
|                     continue | ||||
|                 checked_pairs.update( | ||||
|                     [(first_doc.pk, second_doc.pk), (second_doc.pk, first_doc.pk)], | ||||
|                 ) | ||||
|  | ||||
|                 checked_pairs.update([doc_1_to_doc_2, doc_2_to_doc_1]) | ||||
|                 # Actually something useful to work on now | ||||
|                 work_pkgs.append(_WorkPackage(first_doc, second_doc)) | ||||
|  | ||||
|         # Don't spin up a pool of 1 process | ||||
| @@ -109,6 +119,7 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): | ||||
|  | ||||
|         # Check results | ||||
|         messages = [] | ||||
|         maybe_delete_ids = [] | ||||
|         for result in sorted(results): | ||||
|             if result.ratio >= opt_ratio: | ||||
|                 messages.append( | ||||
| @@ -117,6 +128,7 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): | ||||
|                         f" to {result.doc_two_pk} (confidence {result.ratio:.3f})", | ||||
|                     ), | ||||
|                 ) | ||||
|                 maybe_delete_ids.append(result.doc_two_pk) | ||||
|  | ||||
|         if len(messages) == 0: | ||||
|             messages.append( | ||||
| @@ -125,3 +137,10 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): | ||||
|         self.stdout.writelines( | ||||
|             messages, | ||||
|         ) | ||||
|         if options["delete"]: | ||||
|             self.stdout.write( | ||||
|                 self.style.NOTICE( | ||||
|                     f"Deleting {len(maybe_delete_ids)} documents based on ratio matches", | ||||
|                 ), | ||||
|             ) | ||||
|             Document.objects.filter(pk__in=maybe_delete_ids).delete() | ||||
|   | ||||
| @@ -157,3 +157,55 @@ class TestFuzzyMatchCommand(TestCase): | ||||
|         self.assertRegex(lines[0], self.MSG_REGEX) | ||||
|         self.assertRegex(lines[1], self.MSG_REGEX) | ||||
|         self.assertRegex(lines[2], self.MSG_REGEX) | ||||
|  | ||||
|     def test_document_deletion(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - 3 documents exist | ||||
|             - Document 1 to document 3 has a similarity over 85.0 | ||||
|         WHEN: | ||||
|             - Command is called with the --delete option | ||||
|         THEN: | ||||
|             - User is warned about the deletion flag | ||||
|             - Document 3 is deleted | ||||
|             - Documents 1 and 2 remain | ||||
|         """ | ||||
|         # Content similarity is 86.667 | ||||
|         Document.objects.create( | ||||
|             checksum="BEEFCAFE", | ||||
|             title="A", | ||||
|             content="first document scanned by bob", | ||||
|             mime_type="application/pdf", | ||||
|             filename="test.pdf", | ||||
|         ) | ||||
|         Document.objects.create( | ||||
|             checksum="DEADBEAF", | ||||
|             title="A", | ||||
|             content="second document scanned by alice", | ||||
|             mime_type="application/pdf", | ||||
|             filename="other_test.pdf", | ||||
|         ) | ||||
|         Document.objects.create( | ||||
|             checksum="CATTLE", | ||||
|             title="A", | ||||
|             content="first document scanned by pete", | ||||
|             mime_type="application/pdf", | ||||
|             filename="final_test.pdf", | ||||
|         ) | ||||
|  | ||||
|         self.assertEqual(Document.objects.count(), 3) | ||||
|  | ||||
|         stdout, _ = self.call_command("--delete") | ||||
|         print(stdout) | ||||
|         lines = [x.strip() for x in stdout.split("\n") if len(x.strip())] | ||||
|         self.assertEqual(len(lines), 3) | ||||
|         self.assertEqual( | ||||
|             lines[0], | ||||
|             "The command is configured to delete documents.  Use with caution", | ||||
|         ) | ||||
|         self.assertRegex(lines[1], self.MSG_REGEX) | ||||
|         self.assertEqual(lines[2], "Deleting 1 documents based on ratio matches") | ||||
|  | ||||
|         self.assertEqual(Document.objects.count(), 2) | ||||
|         self.assertIsNotNone(Document.objects.get(pk=1)) | ||||
|         self.assertIsNotNone(Document.objects.get(pk=2)) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Trenton H
					Trenton H