mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Feature: Allow deletion of documents via the fuzzy matching command (#4957)
* Adds new flag allowing deletion of one of a document pair which is over the match ratio * Documents the new command option
This commit is contained in:
parent
55dadf0b00
commit
7289c4ea56
@ -607,3 +607,10 @@ document_fuzzy_match [--ratio] [--processes N]
|
|||||||
| ----------- | -------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
|
| ----------- | -------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| --ratio | No | 85.0 | a number between 0 and 100, setting how similar a document must be for it to be reported. Higher numbers mean more similarity. |
|
| --ratio | No | 85.0 | a number between 0 and 100, setting how similar a document must be for it to be reported. Higher numbers mean more similarity. |
|
||||||
| --processes | No | 1/4 of system cores | Number of processes to use for matching. Setting 1 disables multiple processes |
|
| --processes | No | 1/4 of system cores | Number of processes to use for matching. Setting 1 disables multiple processes |
|
||||||
|
| --delete | No | False | If provided, one document of a matched pair above the ratio will be deleted. |
|
||||||
|
|
||||||
|
!!! warning
|
||||||
|
|
||||||
|
If providing the `--delete` option, it is highly recommended to have a backup.
|
||||||
|
While every effort has been made to ensure proper operation, there is always the
|
||||||
|
chance of deletion of a file you want to keep.
|
||||||
|
@ -53,6 +53,12 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
|
|||||||
type=float,
|
type=float,
|
||||||
help="Ratio to consider documents a match",
|
help="Ratio to consider documents a match",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--delete",
|
||||||
|
default=False,
|
||||||
|
action="store_true",
|
||||||
|
help="If set, one document of matches above the ratio WILL BE DELETED",
|
||||||
|
)
|
||||||
self.add_argument_progress_bar_mixin(parser)
|
self.add_argument_progress_bar_mixin(parser)
|
||||||
self.add_argument_processes_mixin(parser)
|
self.add_argument_processes_mixin(parser)
|
||||||
|
|
||||||
@ -63,6 +69,13 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
|
|||||||
self.handle_processes_mixin(**options)
|
self.handle_processes_mixin(**options)
|
||||||
self.handle_progress_bar_mixin(**options)
|
self.handle_progress_bar_mixin(**options)
|
||||||
|
|
||||||
|
if options["delete"]:
|
||||||
|
self.stdout.write(
|
||||||
|
self.style.WARNING(
|
||||||
|
"The command is configured to delete documents. Use with caution",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
opt_ratio = options["ratio"]
|
opt_ratio = options["ratio"]
|
||||||
checked_pairs: set[tuple[int, int]] = set()
|
checked_pairs: set[tuple[int, int]] = set()
|
||||||
work_pkgs: list[_WorkPackage] = []
|
work_pkgs: list[_WorkPackage] = []
|
||||||
@ -81,15 +94,12 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
|
|||||||
continue
|
continue
|
||||||
# Skip matching which have already been matched together
|
# Skip matching which have already been matched together
|
||||||
# doc 1 to doc 2 is the same as doc 2 to doc 1
|
# doc 1 to doc 2 is the same as doc 2 to doc 1
|
||||||
if (first_doc.pk, second_doc.pk) in checked_pairs or (
|
doc_1_to_doc_2 = (first_doc.pk, second_doc.pk)
|
||||||
second_doc.pk,
|
doc_2_to_doc_1 = doc_1_to_doc_2[::-1]
|
||||||
first_doc.pk,
|
if doc_1_to_doc_2 in checked_pairs or doc_2_to_doc_1 in checked_pairs:
|
||||||
) in checked_pairs:
|
|
||||||
continue
|
continue
|
||||||
checked_pairs.update(
|
checked_pairs.update([doc_1_to_doc_2, doc_2_to_doc_1])
|
||||||
[(first_doc.pk, second_doc.pk), (second_doc.pk, first_doc.pk)],
|
# Actually something useful to work on now
|
||||||
)
|
|
||||||
|
|
||||||
work_pkgs.append(_WorkPackage(first_doc, second_doc))
|
work_pkgs.append(_WorkPackage(first_doc, second_doc))
|
||||||
|
|
||||||
# Don't spin up a pool of 1 process
|
# Don't spin up a pool of 1 process
|
||||||
@ -109,6 +119,7 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
|
|||||||
|
|
||||||
# Check results
|
# Check results
|
||||||
messages = []
|
messages = []
|
||||||
|
maybe_delete_ids = []
|
||||||
for result in sorted(results):
|
for result in sorted(results):
|
||||||
if result.ratio >= opt_ratio:
|
if result.ratio >= opt_ratio:
|
||||||
messages.append(
|
messages.append(
|
||||||
@ -117,6 +128,7 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
|
|||||||
f" to {result.doc_two_pk} (confidence {result.ratio:.3f})",
|
f" to {result.doc_two_pk} (confidence {result.ratio:.3f})",
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
maybe_delete_ids.append(result.doc_two_pk)
|
||||||
|
|
||||||
if len(messages) == 0:
|
if len(messages) == 0:
|
||||||
messages.append(
|
messages.append(
|
||||||
@ -125,3 +137,10 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
|
|||||||
self.stdout.writelines(
|
self.stdout.writelines(
|
||||||
messages,
|
messages,
|
||||||
)
|
)
|
||||||
|
if options["delete"]:
|
||||||
|
self.stdout.write(
|
||||||
|
self.style.NOTICE(
|
||||||
|
f"Deleting {len(maybe_delete_ids)} documents based on ratio matches",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
Document.objects.filter(pk__in=maybe_delete_ids).delete()
|
||||||
|
@ -157,3 +157,55 @@ class TestFuzzyMatchCommand(TestCase):
|
|||||||
self.assertRegex(lines[0], self.MSG_REGEX)
|
self.assertRegex(lines[0], self.MSG_REGEX)
|
||||||
self.assertRegex(lines[1], self.MSG_REGEX)
|
self.assertRegex(lines[1], self.MSG_REGEX)
|
||||||
self.assertRegex(lines[2], self.MSG_REGEX)
|
self.assertRegex(lines[2], self.MSG_REGEX)
|
||||||
|
|
||||||
|
def test_document_deletion(self):
    """
    GIVEN:
        - 3 documents exist
        - Document 1 to document 3 has a similarity over 85.0
    WHEN:
        - Command is called with the --delete option
    THEN:
        - User is warned about the deletion flag
        - Document 3 is deleted
        - Documents 1 and 2 remain
    """
    # Content similarity is 86.667
    # Keep references to the created rows so the final assertions use the
    # real primary keys instead of assuming the sequence starts at 1.
    doc_1 = Document.objects.create(
        checksum="BEEFCAFE",
        title="A",
        content="first document scanned by bob",
        mime_type="application/pdf",
        filename="test.pdf",
    )
    doc_2 = Document.objects.create(
        checksum="DEADBEAF",
        title="A",
        content="second document scanned by alice",
        mime_type="application/pdf",
        filename="other_test.pdf",
    )
    doc_3 = Document.objects.create(
        checksum="CATTLE",
        title="A",
        content="first document scanned by pete",
        mime_type="application/pdf",
        filename="final_test.pdf",
    )

    self.assertEqual(Document.objects.count(), 3)

    stdout, _ = self.call_command("--delete")

    # Expected output: deletion warning, one match report, deletion summary
    lines = [x.strip() for x in stdout.split("\n") if len(x.strip())]
    self.assertEqual(len(lines), 3)
    self.assertEqual(
        lines[0],
        "The command is configured to delete documents. Use with caution",
    )
    self.assertRegex(lines[1], self.MSG_REGEX)
    self.assertEqual(lines[2], "Deleting 1 documents based on ratio matches")

    # Only the second document of the matched pair (1, 3) may be removed
    self.assertEqual(Document.objects.count(), 2)
    self.assertTrue(Document.objects.filter(pk=doc_1.pk).exists())
    self.assertTrue(Document.objects.filter(pk=doc_2.pk).exists())
    self.assertFalse(Document.objects.filter(pk=doc_3.pk).exists())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user