Feature: allow duplicates with warnings, UI for discovery (#11815)

This commit is contained in:
shamoon
2026-01-26 10:55:08 -08:00
committed by GitHub
parent df1aa13551
commit 4428354150
14 changed files with 316 additions and 49 deletions

View File

@@ -779,19 +779,45 @@ class ConsumerPreflightPlugin(
Q(checksum=checksum) | Q(archive_checksum=checksum),
)
if existing_doc.exists():
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."
existing_doc = existing_doc.order_by("-created")
duplicates_in_trash = existing_doc.filter(deleted_at__isnull=False)
log_msg = (
f"Consuming duplicate {self.filename}: "
f"{existing_doc.count()} existing document(s) share the same content."
)
if existing_doc.first().deleted_at is not None:
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
log_msg += " Note: existing document is in the trash."
if duplicates_in_trash.exists():
log_msg += " Note: at least one existing document is in the trash."
self.log.warning(log_msg)
if settings.CONSUMER_DELETE_DUPLICATES:
duplicate = existing_doc.first()
duplicate_label = (
duplicate.title
or duplicate.original_filename
or (Path(duplicate.filename).name if duplicate.filename else None)
or str(duplicate.pk)
)
Path(self.input_doc.original_file).unlink()
self._fail(
msg,
log_msg,
)
failure_msg = (
f"Not consuming {self.filename}: "
f"It is a duplicate of {duplicate_label} (#{duplicate.pk})"
)
status_msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
if duplicates_in_trash.exists():
status_msg = (
ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
)
failure_msg += " Note: existing document is in the trash."
self._fail(
status_msg,
failure_msg,
)
def pre_check_directories(self):
"""

View File

@@ -0,0 +1,23 @@
# Generated by Django 5.2.7 on 2026-01-14 17:45
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "0005_workflowtrigger_filter_has_any_correspondents_and_more"),
]
operations = [
migrations.AlterField(
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
max_length=32,
verbose_name="checksum",
help_text="The checksum of the original document.",
),
),
]

View File

@@ -205,7 +205,6 @@ class Document(SoftDeleteModel, ModelWithOwner):
_("checksum"),
max_length=32,
editable=False,
unique=True,
help_text=_("The checksum of the original document."),
)

View File

@@ -148,13 +148,29 @@ def get_document_count_filter_for_user(user):
)
def get_objects_for_user_owner_aware(user, perms, Model) -> QuerySet:
objects_owned = Model.objects.filter(owner=user)
objects_unowned = Model.objects.filter(owner__isnull=True)
def get_objects_for_user_owner_aware(
user,
perms,
Model,
*,
include_deleted=False,
) -> QuerySet:
"""
Returns objects the user owns, are unowned, or has explicit perms.
When include_deleted is True, soft-deleted items are also included.
"""
manager = (
Model.global_objects
if include_deleted and hasattr(Model, "global_objects")
else Model.objects
)
objects_owned = manager.filter(owner=user)
objects_unowned = manager.filter(owner__isnull=True)
objects_with_perms = get_objects_for_user(
user=user,
perms=perms,
klass=Model,
klass=manager.all(),
accept_global_perms=False,
)
return objects_owned | objects_unowned | objects_with_perms

View File

@@ -23,6 +23,7 @@ from django.core.validators import MinValueValidator
from django.core.validators import RegexValidator
from django.core.validators import integer_validator
from django.db.models import Count
from django.db.models import Q
from django.db.models.functions import Lower
from django.utils.crypto import get_random_string
from django.utils.dateparse import parse_datetime
@@ -72,6 +73,7 @@ from documents.models import WorkflowTrigger
from documents.parsers import is_mime_type_supported
from documents.permissions import get_document_count_filter_for_user
from documents.permissions import get_groups_with_only_permission
from documents.permissions import get_objects_for_user_owner_aware
from documents.permissions import set_permissions_for_object
from documents.regex import validate_regex_pattern
from documents.templating.filepath import validate_filepath_template_and_render
@@ -82,6 +84,9 @@ from documents.validators import url_validator
if TYPE_CHECKING:
from collections.abc import Iterable
from django.db.models.query import QuerySet
logger = logging.getLogger("paperless.serializers")
@@ -1014,6 +1019,32 @@ class NotesSerializer(serializers.ModelSerializer):
return ret
def _get_viewable_duplicates(
document: Document,
user: User | None,
) -> QuerySet[Document]:
checksums = {document.checksum}
if document.archive_checksum:
checksums.add(document.archive_checksum)
duplicates = Document.global_objects.filter(
Q(checksum__in=checksums) | Q(archive_checksum__in=checksums),
).exclude(pk=document.pk)
duplicates = duplicates.order_by("-created")
allowed = get_objects_for_user_owner_aware(
user,
"documents.view_document",
Document,
include_deleted=True,
)
return duplicates.filter(id__in=allowed)
class DuplicateDocumentSummarySerializer(serializers.Serializer):
id = serializers.IntegerField()
title = serializers.CharField()
deleted_at = serializers.DateTimeField(allow_null=True)
@extend_schema_serializer(
deprecate_fields=["created_date"],
)
@@ -1031,6 +1062,7 @@ class DocumentSerializer(
archived_file_name = SerializerMethodField()
created_date = serializers.DateField(required=False)
page_count = SerializerMethodField()
duplicate_documents = SerializerMethodField()
notes = NotesSerializer(many=True, required=False, read_only=True)
@@ -1056,6 +1088,16 @@ class DocumentSerializer(
def get_page_count(self, obj) -> int | None:
return obj.page_count
@extend_schema_field(DuplicateDocumentSummarySerializer(many=True))
def get_duplicate_documents(self, obj):
view = self.context.get("view")
if view and getattr(view, "action", None) != "retrieve":
return []
request = self.context.get("request")
user = request.user if request else None
duplicates = _get_viewable_duplicates(obj, user)
return list(duplicates.values("id", "title", "deleted_at"))
def get_original_file_name(self, obj) -> str | None:
return obj.original_filename
@@ -1233,6 +1275,7 @@ class DocumentSerializer(
"archive_serial_number",
"original_file_name",
"archived_file_name",
"duplicate_documents",
"owner",
"permissions",
"user_can_change",
@@ -2094,10 +2137,12 @@ class TasksViewSerializer(OwnedObjectSerializer):
"result",
"acknowledged",
"related_document",
"duplicate_documents",
"owner",
)
related_document = serializers.SerializerMethodField()
duplicate_documents = serializers.SerializerMethodField()
created_doc_re = re.compile(r"New document id (\d+) created")
duplicate_doc_re = re.compile(r"It is a duplicate of .* \(#(\d+)\)")
@@ -2122,6 +2167,17 @@ class TasksViewSerializer(OwnedObjectSerializer):
return result
@extend_schema_field(DuplicateDocumentSummarySerializer(many=True))
def get_duplicate_documents(self, obj):
related_document = self.get_related_document(obj)
request = self.context.get("request")
user = request.user if request else None
document = Document.global_objects.filter(pk=related_document).first()
if not related_document or not user or not document:
return []
duplicates = _get_viewable_duplicates(document, user)
return list(duplicates.values("id", "title", "deleted_at"))
class RunTaskViewSerializer(serializers.Serializer):
task_name = serializers.ChoiceField(

View File

@@ -7,6 +7,7 @@ from django.contrib.auth.models import User
from rest_framework import status
from rest_framework.test import APITestCase
from documents.models import Document
from documents.models import PaperlessTask
from documents.tests.utils import DirectoriesMixin
from documents.views import TasksViewSet
@@ -258,7 +259,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
task_id=str(uuid.uuid4()),
task_file_name="task_one.pdf",
status=celery.states.FAILURE,
result="test.pdf: Not consuming test.pdf: It is a duplicate.",
result="test.pdf: Unexpected error during ingestion.",
)
response = self.client.get(self.ENDPOINT)
@@ -270,7 +271,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
self.assertEqual(
returned_data["result"],
"test.pdf: Not consuming test.pdf: It is a duplicate.",
"test.pdf: Unexpected error during ingestion.",
)
def test_task_name_webui(self):
@@ -325,20 +326,34 @@ class TestTasks(DirectoriesMixin, APITestCase):
self.assertEqual(returned_data["task_file_name"], "anothertest.pdf")
def test_task_result_failed_duplicate_includes_related_doc(self):
def test_task_result_duplicate_warning_includes_count(self):
"""
GIVEN:
- A celery task failed with a duplicate error
- A celery task succeeds, but a duplicate exists
WHEN:
- API call is made to get tasks
THEN:
- The returned data includes a related document link
- The returned data includes duplicate warning metadata
"""
checksum = "duplicate-checksum"
Document.objects.create(
title="Existing",
content="",
mime_type="application/pdf",
checksum=checksum,
)
created_doc = Document.objects.create(
title="Created",
content="",
mime_type="application/pdf",
checksum=checksum,
archive_checksum="another-checksum",
)
PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="task_one.pdf",
status=celery.states.FAILURE,
result="Not consuming task_one.pdf: It is a duplicate of task_one_existing.pdf (#1234).",
status=celery.states.SUCCESS,
result=f"Success. New document id {created_doc.pk} created",
)
response = self.client.get(self.ENDPOINT)
@@ -348,7 +363,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
returned_data = response.data[0]
self.assertEqual(returned_data["related_document"], "1234")
self.assertEqual(returned_data["related_document"], str(created_doc.pk))
def test_run_train_classifier_task(self):
"""

View File

@@ -485,21 +485,21 @@ class TestConsumer(
with self.get_consumer(self.get_test_file()) as consumer:
consumer.run()
with self.assertRaisesMessage(ConsumerError, "It is a duplicate"):
with self.get_consumer(self.get_test_file()) as consumer:
consumer.run()
with self.get_consumer(self.get_test_file()) as consumer:
consumer.run()
self._assert_first_last_send_progress(last_status="FAILED")
self.assertEqual(Document.objects.count(), 2)
self._assert_first_last_send_progress()
def testDuplicates2(self):
with self.get_consumer(self.get_test_file()) as consumer:
consumer.run()
with self.assertRaisesMessage(ConsumerError, "It is a duplicate"):
with self.get_consumer(self.get_test_archive_file()) as consumer:
consumer.run()
with self.get_consumer(self.get_test_archive_file()) as consumer:
consumer.run()
self._assert_first_last_send_progress(last_status="FAILED")
self.assertEqual(Document.objects.count(), 2)
self._assert_first_last_send_progress()
def testDuplicates3(self):
with self.get_consumer(self.get_test_archive_file()) as consumer:
@@ -513,9 +513,10 @@ class TestConsumer(
Document.objects.all().delete()
with self.assertRaisesMessage(ConsumerError, "document is in the trash"):
with self.get_consumer(self.get_test_file()) as consumer:
consumer.run()
with self.get_consumer(self.get_test_file()) as consumer:
consumer.run()
self.assertEqual(Document.objects.count(), 1)
def testAsnExists(self):
with self.get_consumer(
@@ -718,12 +719,45 @@ class TestConsumer(
dst = self.get_test_file()
self.assertIsFile(dst)
with self.assertRaises(ConsumerError):
expected_message = (
f"{dst.name}: Not consuming {dst.name}: "
f"It is a duplicate of {document.title} (#{document.pk})"
)
with self.assertRaisesMessage(ConsumerError, expected_message):
with self.get_consumer(dst) as consumer:
consumer.run()
self.assertIsNotFile(dst)
self._assert_first_last_send_progress(last_status="FAILED")
self.assertEqual(Document.objects.count(), 1)
self._assert_first_last_send_progress(last_status=ProgressStatusOptions.FAILED)
@override_settings(CONSUMER_DELETE_DUPLICATES=True)
def test_delete_duplicate_in_trash(self):
dst = self.get_test_file()
with self.get_consumer(dst) as consumer:
consumer.run()
# Move the existing document to trash
document = Document.objects.first()
document.delete()
dst = self.get_test_file()
self.assertIsFile(dst)
expected_message = (
f"{dst.name}: Not consuming {dst.name}: "
f"It is a duplicate of {document.title} (#{document.pk})"
f" Note: existing document is in the trash."
)
with self.assertRaisesMessage(ConsumerError, expected_message):
with self.get_consumer(dst) as consumer:
consumer.run()
self.assertIsNotFile(dst)
self.assertEqual(Document.global_objects.count(), 1)
self.assertEqual(Document.objects.count(), 0)
@override_settings(CONSUMER_DELETE_DUPLICATES=False)
def test_no_delete_duplicate(self):
@@ -743,15 +777,12 @@ class TestConsumer(
dst = self.get_test_file()
self.assertIsFile(dst)
with self.assertRaisesRegex(
ConsumerError,
r"sample\.pdf: Not consuming sample\.pdf: It is a duplicate of sample \(#\d+\)",
):
with self.get_consumer(dst) as consumer:
consumer.run()
with self.get_consumer(dst) as consumer:
consumer.run()
self.assertIsFile(dst)
self._assert_first_last_send_progress(last_status="FAILED")
self.assertIsNotFile(dst)
self.assertEqual(Document.objects.count(), 2)
self._assert_first_last_send_progress()
@override_settings(FILENAME_FORMAT="{title}")
@mock.patch("documents.parsers.document_consumer_declaration.send")