diff --git a/src-ui/src/app/components/admin/tasks/tasks.component.html b/src-ui/src/app/components/admin/tasks/tasks.component.html
index 084195221..ad625789c 100644
--- a/src-ui/src/app/components/admin/tasks/tasks.component.html
+++ b/src-ui/src/app/components/admin/tasks/tasks.component.html
@@ -97,6 +97,12 @@
(click for full output)
}
+ @if (task.duplicate_documents?.length > 0) {
+
+
+ Duplicate(s) detected
+
+ }
}
diff --git a/src-ui/src/app/components/document-detail/document-detail.component.html b/src-ui/src/app/components/document-detail/document-detail.component.html
index 44304c942..baba74d25 100644
--- a/src-ui/src/app/components/document-detail/document-detail.component.html
+++ b/src-ui/src/app/components/document-detail/document-detail.component.html
@@ -145,6 +145,24 @@
Details
+ @if (document?.duplicate_documents?.length) {
+
+ Duplicate content detected.
+
+ @for (duplicate of document.duplicate_documents; track duplicate.id) {
+ -
+
+
+ }
+
+
+ }
int | None:
return obj.page_count
+    def get_duplicate_documents(self, obj):
+        """
+        Build the value of the ``duplicate_documents`` field for ``obj``.
+
+        If the serializer context carries a view whose ``action`` is anything
+        other than ``"retrieve"``, an empty list is returned without looking
+        anything up. Otherwise the duplicates are obtained from
+        ``_get_viewable_duplicates`` for the requesting user (``None`` when
+        the context holds no request) and returned as a list of dicts with
+        the keys ``id`` and ``title``.
+        """
+        context = self.context
+        view = context.get("view")
+        # Proceed when there is no view at all, or when the view is serving
+        # a "retrieve" action; every other action short-circuits.
+        lookup_allowed = not view or getattr(view, "action", None) == "retrieve"
+        if not lookup_allowed:
+            return []
+
+        request = context.get("request")
+        if request:
+            user = request.user
+        else:
+            user = None
+
+        viewable = _get_viewable_duplicates(obj, user)
+        return list(viewable.values("id", "title"))
+
def get_original_file_name(self, obj) -> str | None:
return obj.original_filename
@@ -1233,6 +1268,7 @@ class DocumentSerializer(
"archive_serial_number",
"original_file_name",
"archived_file_name",
+ "duplicate_documents",
"owner",
"permissions",
"user_can_change",
@@ -2094,10 +2130,12 @@ class TasksViewSerializer(OwnedObjectSerializer):
"result",
"acknowledged",
"related_document",
+ "duplicate_documents",
"owner",
)
related_document = serializers.SerializerMethodField()
+ duplicate_documents = serializers.SerializerMethodField()
created_doc_re = re.compile(r"New document id (\d+) created")
duplicate_doc_re = re.compile(r"It is a duplicate of .* \(#(\d+)\)")
@@ -2122,6 +2160,30 @@ class TasksViewSerializer(OwnedObjectSerializer):
return result
+    def _get_duplicate_documents(self, obj):
+        """
+        Return the duplicates of the document related to the task ``obj``.
+
+        The result is memoized per task primary key in
+        ``self._duplicate_documents_cache``, which is created on first use.
+        An empty list is produced when ``get_related_document`` returns
+        nothing or when no ``Document`` with that primary key exists.
+        Otherwise the duplicates come from ``_get_viewable_duplicates`` for
+        the requesting user (``None`` when the context holds no request) and
+        are returned as a list of dicts with the keys ``id`` and ``title``.
+        """
+        try:
+            memo = self._duplicate_documents_cache
+        except AttributeError:
+            # First call on this serializer instance: create the memo lazily.
+            memo = self._duplicate_documents_cache = {}
+
+        task_pk = obj.pk
+        if task_pk in memo:
+            return memo[task_pk]
+
+        document = None
+        document_id = self.get_related_document(obj)
+        if document_id:
+            try:
+                document = Document.objects.get(pk=document_id)
+            except Document.DoesNotExist:
+                # The referenced document is gone; treated as "no duplicates".
+                pass
+
+        if document is None:
+            results = []
+        else:
+            request = self.context.get("request")
+            viewer = request.user if request else None
+            viewable = _get_viewable_duplicates(document, viewer)
+            results = list(viewable.values("id", "title"))
+
+        # Store only after every lookup succeeded, so an exception raised
+        # above leaves no entry behind for this task.
+        memo[task_pk] = results
+        return results
+
+    def get_duplicate_documents(self, obj):
+        """Return the ``duplicate_documents`` field value for the task ``obj``.
+
+        Delegates entirely to ``_get_duplicate_documents``.
+        """
+        duplicate_documents = self._get_duplicate_documents(obj)
+        return duplicate_documents
+
class RunTaskViewSerializer(serializers.Serializer):
task_name = serializers.ChoiceField(
diff --git a/src/documents/tests/test_api_tasks.py b/src/documents/tests/test_api_tasks.py
index aa42577c4..295747979 100644
--- a/src/documents/tests/test_api_tasks.py
+++ b/src/documents/tests/test_api_tasks.py
@@ -7,6 +7,7 @@ from django.contrib.auth.models import User
from rest_framework import status
from rest_framework.test import APITestCase
+from documents.models import Document
from documents.models import PaperlessTask
from documents.tests.utils import DirectoriesMixin
from documents.views import TasksViewSet
@@ -258,7 +259,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
task_id=str(uuid.uuid4()),
task_file_name="task_one.pdf",
status=celery.states.FAILURE,
- result="test.pdf: Not consuming test.pdf: It is a duplicate.",
+ result="test.pdf: Unexpected error during ingestion.",
)
response = self.client.get(self.ENDPOINT)
@@ -270,7 +271,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
self.assertEqual(
returned_data["result"],
- "test.pdf: Not consuming test.pdf: It is a duplicate.",
+ "test.pdf: Unexpected error during ingestion.",
)
def test_task_name_webui(self):
@@ -325,20 +326,33 @@ class TestTasks(DirectoriesMixin, APITestCase):
self.assertEqual(returned_data["task_file_name"], "anothertest.pdf")
- def test_task_result_failed_duplicate_includes_related_doc(self):
+ def test_task_result_duplicate_warning_includes_count(self):
"""
GIVEN:
- - A celery task failed with a duplicate error
+ - A celery task succeeds, but a duplicate exists
WHEN:
- API call is made to get tasks
THEN:
- - The returned data includes a related document link
+ - The returned data includes duplicate warning metadata
"""
+ checksum = "duplicate-checksum"
+ Document.objects.create(
+ title="Existing",
+ content="",
+ mime_type="application/pdf",
+ checksum=checksum,
+ )
+ created_doc = Document.objects.create(
+ title="Created",
+ content="",
+ mime_type="application/pdf",
+ checksum=checksum,
+ )
PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="task_one.pdf",
- status=celery.states.FAILURE,
- result="Not consuming task_one.pdf: It is a duplicate of task_one_existing.pdf (#1234).",
+ status=celery.states.SUCCESS,
+ result=f"Success. New document id {created_doc.pk} created",
)
response = self.client.get(self.ENDPOINT)
@@ -348,7 +362,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
returned_data = response.data[0]
- self.assertEqual(returned_data["related_document"], "1234")
+ self.assertEqual(returned_data["related_document"], str(created_doc.pk))
def test_run_train_classifier_task(self):
"""
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
index 63d6f8f5b..5b3b32fad 100644
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -485,21 +485,21 @@ class TestConsumer(
with self.get_consumer(self.get_test_file()) as consumer:
consumer.run()
- with self.assertRaisesMessage(ConsumerError, "It is a duplicate"):
- with self.get_consumer(self.get_test_file()) as consumer:
- consumer.run()
+ with self.get_consumer(self.get_test_file()) as consumer:
+ consumer.run()
- self._assert_first_last_send_progress(last_status="FAILED")
+ self.assertEqual(Document.objects.count(), 2)
+ self._assert_first_last_send_progress()
def testDuplicates2(self):
with self.get_consumer(self.get_test_file()) as consumer:
consumer.run()
- with self.assertRaisesMessage(ConsumerError, "It is a duplicate"):
- with self.get_consumer(self.get_test_archive_file()) as consumer:
- consumer.run()
+ with self.get_consumer(self.get_test_archive_file()) as consumer:
+ consumer.run()
- self._assert_first_last_send_progress(last_status="FAILED")
+ self.assertEqual(Document.objects.count(), 2)
+ self._assert_first_last_send_progress()
def testDuplicates3(self):
with self.get_consumer(self.get_test_archive_file()) as consumer:
@@ -513,9 +513,10 @@ class TestConsumer(
Document.objects.all().delete()
- with self.assertRaisesMessage(ConsumerError, "document is in the trash"):
- with self.get_consumer(self.get_test_file()) as consumer:
- consumer.run()
+ with self.get_consumer(self.get_test_file()) as consumer:
+ consumer.run()
+
+ self.assertEqual(Document.objects.count(), 1)
def testAsnExists(self):
with self.get_consumer(
@@ -718,12 +719,12 @@ class TestConsumer(
dst = self.get_test_file()
self.assertIsFile(dst)
- with self.assertRaises(ConsumerError):
- with self.get_consumer(dst) as consumer:
- consumer.run()
+ with self.get_consumer(dst) as consumer:
+ consumer.run()
self.assertIsNotFile(dst)
- self._assert_first_last_send_progress(last_status="FAILED")
+ self.assertEqual(Document.objects.count(), 2)
+ self._assert_first_last_send_progress()
@override_settings(CONSUMER_DELETE_DUPLICATES=False)
def test_no_delete_duplicate(self):
@@ -743,15 +744,12 @@ class TestConsumer(
dst = self.get_test_file()
self.assertIsFile(dst)
- with self.assertRaisesRegex(
- ConsumerError,
- r"sample\.pdf: Not consuming sample\.pdf: It is a duplicate of sample \(#\d+\)",
- ):
- with self.get_consumer(dst) as consumer:
- consumer.run()
+ with self.get_consumer(dst) as consumer:
+ consumer.run()
- self.assertIsFile(dst)
- self._assert_first_last_send_progress(last_status="FAILED")
+ self.assertIsNotFile(dst)
+ self.assertEqual(Document.objects.count(), 2)
+ self._assert_first_last_send_progress()
@override_settings(FILENAME_FORMAT="{title}")
@mock.patch("documents.parsers.document_consumer_declaration.send")
|