mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Implements a new command for fuzzy matching document content and reporting potential duplicates
This commit is contained in:
parent
9a0e44a731
commit
e2ae919a84
@ -13,6 +13,7 @@ for command in decrypt_documents \
|
|||||||
document_retagger \
|
document_retagger \
|
||||||
document_thumbnails \
|
document_thumbnails \
|
||||||
document_sanity_checker \
|
document_sanity_checker \
|
||||||
|
document_fuzzy_match \
|
||||||
manage_superuser;
|
manage_superuser;
|
||||||
do
|
do
|
||||||
echo "installing $command..."
|
echo "installing $command..."
|
||||||
|
@ -572,3 +572,20 @@ it here)
|
|||||||
```
|
```
|
||||||
decrypt_documents [--passphrase SECR3TP4SSPHRA$E]
|
decrypt_documents [--passphrase SECR3TP4SSPHRA$E]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Detecting duplicates {#fuzzy_duplicate}
|
||||||
|
|
||||||
|
Paperless already catches and prevents upload of exactly matching documents,
|
||||||
|
however, a new scan of an existing document may not produce an exact bit-for-bit
|
||||||
|
duplicate. But the content should be exact or close, allowing detection.
|
||||||
|
|
||||||
|
This tool does a fuzzy match over document content, looking for
|
||||||
|
those which look close according to a given ratio.
|
||||||
|
|
||||||
|
```
|
||||||
|
document_fuzzy_match [--ratio RATIO]
|
||||||
|
```
|
||||||
|
|
||||||
|
Optional arguments:
|
||||||
|
--ratio - a number between 0 and 100 (default 85), setting how similar a document must be for it to be reported.
|
||||||
|
Higher numbers mean more similarity.
|
||||||
|
63
src/documents/management/commands/document_fuzzy_match.py
Normal file
63
src/documents/management/commands/document_fuzzy_match.py
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
from typing import Final
|
||||||
|
|
||||||
|
import rapidfuzz
|
||||||
|
from django.core.management import BaseCommand
|
||||||
|
from django.core.management import CommandError
|
||||||
|
|
||||||
|
from documents.models import Document
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """
    Management command which fuzzy matches the content of every document
    against every other document and reports the pairs whose similarity
    ratio meets or exceeds a configurable threshold.
    """

    help = "Searches for documents where the content almost matches"

    def add_arguments(self, parser):
        """
        Registers the optional ``--ratio`` argument, a float defaulting
        to 85.0.
        """
        parser.add_argument(
            "--ratio",
            default=85.0,
            type=float,
            help="Ratio to consider documents a match",
        )

    def handle(self, *args, **options):
        """
        Compares the normalized content of each pair of documents and writes
        one line to stdout for every pair scoring at or above the ratio.

        Raises:
            CommandError: if the given ratio is outside of 0 to 100.
        """
        RATIO_MIN: Final[float] = 0.0
        RATIO_MAX: Final[float] = 100.0

        opt_ratio = options["ratio"]

        # Ratio is a float from 0.0 to 100.0
        if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
            raise CommandError("The ratio must be between 0 and 100")

        # Normalize the strings some, lower case, whitespace, etc
        # Done once per document here, instead of once per comparison
        normalized_docs = [
            (doc.pk, rapidfuzz.utils.default_process(doc.content))
            for doc in Document.objects.all().order_by("id")
        ]

        for index, (first_pk, first_string) in enumerate(normalized_docs):
            # doc 1 to doc 2 is the same as doc 2 to doc 1, so only compare
            # against the documents which come later in the ordering.  This
            # never compares a document to itself and never scores a pair
            # twice, so no bookkeeping of already matched pairs is needed
            for second_pk, second_string in normalized_docs[index + 1 :]:
                # Basic matching ratio
                match = rapidfuzz.fuzz.ratio(first_string, second_string)

                if match >= opt_ratio:
                    self.stdout.write(
                        self.style.NOTICE(
                            f"Document {first_pk} fuzzy match"
                            f" to {second_pk} (confidence {match:.3f})",
                        ),
                    )
|
97
src/documents/tests/test_management_fuzzy.py
Normal file
97
src/documents/tests/test_management_fuzzy.py
Normal file
@ -0,0 +1,97 @@
|
|||||||
|
from io import StringIO
|
||||||
|
|
||||||
|
from django.core.management import CommandError
|
||||||
|
from django.core.management import call_command
|
||||||
|
from django.test import TestCase
|
||||||
|
|
||||||
|
from documents.models import Document
|
||||||
|
|
||||||
|
|
||||||
|
class TestFuzzyMatchCommand(TestCase):
    """
    Tests for the document_fuzzy_match management command: argument
    validation and the lines reported for matching documents.
    """

    def call_command(self, *args, **kwargs):
        """
        Runs document_fuzzy_match with the given arguments, returning the
        captured (stdout, stderr) text.
        """
        stdout = StringIO()
        stderr = StringIO()
        call_command(
            "document_fuzzy_match",
            *args,
            stdout=stdout,
            stderr=stderr,
            **kwargs,
        )
        return stdout.getvalue(), stderr.getvalue()

    def _create_document(self, checksum, content, filename):
        """
        Creates and returns a PDF document titled "A" with the given
        checksum, content and filename.
        """
        return Document.objects.create(
            checksum=checksum,
            title="A",
            content=content,
            mime_type="application/pdf",
            filename=filename,
        )

    def test_invalid_ratio_lower_limit(self):
        """A ratio below 0 is rejected with a CommandError."""
        with self.assertRaises(CommandError):
            self.call_command("--ratio", "-1")

    def test_invalid_ratio_upper_limit(self):
        """A ratio above 100 is rejected with a CommandError."""
        with self.assertRaises(CommandError):
            self.call_command("--ratio", "101")

    def test_no_matches(self):
        """Documents below the default ratio produce no output."""
        # Content similarity is 82.35
        self._create_document("BEEFCAFE", "first document", "test.pdf")
        self._create_document("DEADBEAF", "other first document", "other_test.pdf")
        stdout, _ = self.call_command()
        self.assertEqual(stdout, "")

    def test_with_matches(self):
        """Two similar documents are reported once, as a single pair."""
        # Content similarity is 86.667
        first = self._create_document(
            "BEEFCAFE",
            "first document scanned by bob",
            "test.pdf",
        )
        second = self._create_document(
            "DEADBEAF",
            "first document scanned by alice",
            "other_test.pdf",
        )
        stdout, _ = self.call_command()
        # Use the real primary keys, sequences are not reset between tests,
        # so the documents are not guaranteed to be 1 and 2
        self.assertEqual(
            stdout,
            f"Document {first.pk} fuzzy match to {second.pk} (confidence 86.667)\n",
        )

    def test_with_3_matches(self):
        """Three mutually similar documents are reported as three pairs."""
        # Content similarity is 86.667
        first = self._create_document(
            "BEEFCAFE",
            "first document scanned by bob",
            "test.pdf",
        )
        second = self._create_document(
            "DEADBEAF",
            "first document scanned by alice",
            "other_test.pdf",
        )
        third = self._create_document(
            "CATTLE",
            "first document scanned by pete",
            "final_test.pdf",
        )
        stdout, _ = self.call_command()
        lines = [x.strip() for x in stdout.split("\n") if len(x.strip())]
        self.assertEqual(len(lines), 3)
        # Expected lines are built from the real primary keys, see above
        self.assertEqual(
            lines[0],
            f"Document {first.pk} fuzzy match to {second.pk} (confidence 86.667)",
        )
        self.assertEqual(
            lines[1],
            f"Document {first.pk} fuzzy match to {third.pk} (confidence 88.136)",
        )
        self.assertEqual(
            lines[2],
            f"Document {second.pk} fuzzy match to {third.pk} (confidence 88.525)",
        )
|
Loading…
x
Reference in New Issue
Block a user