From 1e595a5aab1406a516406151272a279cb8ef7b27 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Sun, 18 Jan 2026 11:40:43 -0800 Subject: [PATCH] Core elements, migration, consumer modifications --- .../admin/tasks/tasks.component.html | 6 ++ .../document-detail.component.html | 18 ++++++ .../document-detail.component.ts | 4 ++ src-ui/src/app/data/document.ts | 7 +++ src-ui/src/app/data/paperless-task.ts | 3 + src/documents/consumer.py | 20 +++--- .../1077_alter_document_checksum_unique.py | 23 +++++++ src/documents/models.py | 1 - src/documents/serialisers.py | 62 +++++++++++++++++++ src/documents/tests/test_api_tasks.py | 30 ++++++--- src/documents/tests/test_consumer.py | 44 +++++++------ 11 files changed, 174 insertions(+), 44 deletions(-) create mode 100644 src/documents/migrations/1077_alter_document_checksum_unique.py diff --git a/src-ui/src/app/components/admin/tasks/tasks.component.html b/src-ui/src/app/components/admin/tasks/tasks.component.html index 084195221..ad625789c 100644 --- a/src-ui/src/app/components/admin/tasks/tasks.component.html +++ b/src-ui/src/app/components/admin/tasks/tasks.component.html @@ -97,6 +97,12 @@
(click for full output) } + @if (task.duplicate_documents?.length > 0) { +
+ + Duplicate(s) detected +
+ } } diff --git a/src-ui/src/app/components/document-detail/document-detail.component.html b/src-ui/src/app/components/document-detail/document-detail.component.html index 44304c942..baba74d25 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.html +++ b/src-ui/src/app/components/document-detail/document-detail.component.html @@ -145,6 +145,24 @@ Details
+ @if (document?.duplicate_documents?.length) { +
+
Duplicate content detected.
+
    + @for (duplicate of document.duplicate_documents; track duplicate.id) { +
  • + +
  • + } +
+
+ } int | None: return obj.page_count + def get_duplicate_documents(self, obj): + view = self.context.get("view") + if view and getattr(view, "action", None) != "retrieve": + return [] + request = self.context.get("request") + user = request.user if request else None + duplicates = _get_viewable_duplicates(obj, user) + return list(duplicates.values("id", "title")) + def get_original_file_name(self, obj) -> str | None: return obj.original_filename @@ -1233,6 +1268,7 @@ class DocumentSerializer( "archive_serial_number", "original_file_name", "archived_file_name", + "duplicate_documents", "owner", "permissions", "user_can_change", @@ -2094,10 +2130,12 @@ class TasksViewSerializer(OwnedObjectSerializer): "result", "acknowledged", "related_document", + "duplicate_documents", "owner", ) related_document = serializers.SerializerMethodField() + duplicate_documents = serializers.SerializerMethodField() created_doc_re = re.compile(r"New document id (\d+) created") duplicate_doc_re = re.compile(r"It is a duplicate of .* \(#(\d+)\)") @@ -2122,6 +2160,30 @@ class TasksViewSerializer(OwnedObjectSerializer): return result + def _get_duplicate_documents(self, obj): + if not hasattr(self, "_duplicate_documents_cache"): + self._duplicate_documents_cache = {} + cache = self._duplicate_documents_cache + if obj.pk in cache: + return cache[obj.pk] + related_document = self.get_related_document(obj) + if not related_document: + cache[obj.pk] = [] + return cache[obj.pk] + try: + document = Document.objects.get(pk=related_document) + except Document.DoesNotExist: + cache[obj.pk] = [] + return cache[obj.pk] + request = self.context.get("request") + user = request.user if request else None + duplicates = _get_viewable_duplicates(document, user) + cache[obj.pk] = list(duplicates.values("id", "title")) + return cache[obj.pk] + + def get_duplicate_documents(self, obj): + return self._get_duplicate_documents(obj) + class RunTaskViewSerializer(serializers.Serializer): task_name = serializers.ChoiceField( diff --git a/src/documents/tests/test_api_tasks.py b/src/documents/tests/test_api_tasks.py index aa42577c4..295747979 100644 --- a/src/documents/tests/test_api_tasks.py +++ b/src/documents/tests/test_api_tasks.py @@ -7,6 +7,7 @@ from django.contrib.auth.models import User from rest_framework import status from rest_framework.test import APITestCase +from documents.models import Document from documents.models import PaperlessTask from documents.tests.utils import DirectoriesMixin from documents.views import TasksViewSet @@ -258,7 +259,7 @@ class TestTasks(DirectoriesMixin, APITestCase): task_id=str(uuid.uuid4()), task_file_name="task_one.pdf", status=celery.states.FAILURE, - result="test.pdf: Not consuming test.pdf: It is a duplicate.", + result="test.pdf: Unexpected error during ingestion.", ) response = self.client.get(self.ENDPOINT) @@ -270,7 +271,7 @@ class TestTasks(DirectoriesMixin, APITestCase): self.assertEqual( returned_data["result"], - "test.pdf: Not consuming test.pdf: It is a duplicate.", + "test.pdf: Unexpected error during ingestion.", ) def test_task_name_webui(self): @@ -325,20 +326,33 @@ class TestTasks(DirectoriesMixin, APITestCase): self.assertEqual(returned_data["task_file_name"], "anothertest.pdf") - def test_task_result_failed_duplicate_includes_related_doc(self): + def test_task_result_duplicate_warning_includes_count(self): """ GIVEN: - - A celery task failed with a duplicate error + - A celery task succeeds, but a duplicate exists WHEN: - API call is made to get tasks THEN: - - The returned data includes a related document link + - The returned data includes duplicate warning metadata """ + checksum = "duplicate-checksum" + Document.objects.create( + title="Existing", + content="", + mime_type="application/pdf", + checksum=checksum, + ) + created_doc = Document.objects.create( + title="Created", + content="", + mime_type="application/pdf", + checksum=checksum, + ) PaperlessTask.objects.create( task_id=str(uuid.uuid4()), task_file_name="task_one.pdf", - status=celery.states.FAILURE, - result="Not consuming task_one.pdf: It is a duplicate of task_one_existing.pdf (#1234).", + status=celery.states.SUCCESS, + result=f"Success. New document id {created_doc.pk} created", ) response = self.client.get(self.ENDPOINT) @@ -348,7 +362,7 @@ class TestTasks(DirectoriesMixin, APITestCase): returned_data = response.data[0] - self.assertEqual(returned_data["related_document"], "1234") + self.assertEqual(returned_data["related_document"], str(created_doc.pk)) def test_run_train_classifier_task(self): """ diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 63d6f8f5b..5b3b32fad 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -485,21 +485,21 @@ class TestConsumer( with self.get_consumer(self.get_test_file()) as consumer: consumer.run() - with self.assertRaisesMessage(ConsumerError, "It is a duplicate"): - with self.get_consumer(self.get_test_file()) as consumer: - consumer.run() + with self.get_consumer(self.get_test_file()) as consumer: + consumer.run() - self._assert_first_last_send_progress(last_status="FAILED") + self.assertEqual(Document.objects.count(), 2) + self._assert_first_last_send_progress() def testDuplicates2(self): with self.get_consumer(self.get_test_file()) as consumer: consumer.run() - with self.assertRaisesMessage(ConsumerError, "It is a duplicate"): - with self.get_consumer(self.get_test_archive_file()) as consumer: - consumer.run() + with self.get_consumer(self.get_test_archive_file()) as consumer: + consumer.run() - self._assert_first_last_send_progress(last_status="FAILED") + self.assertEqual(Document.objects.count(), 2) + self._assert_first_last_send_progress() def testDuplicates3(self): with self.get_consumer(self.get_test_archive_file()) as consumer: @@ -513,9 +513,10 @@ class TestConsumer( Document.objects.all().delete() - with self.assertRaisesMessage(ConsumerError, "document is in the trash"): - with self.get_consumer(self.get_test_file()) as consumer: - consumer.run() + with self.get_consumer(self.get_test_file()) as consumer: + consumer.run() + + self.assertEqual(Document.objects.count(), 1) def testAsnExists(self): with self.get_consumer( @@ -718,12 +719,12 @@ class TestConsumer( dst = self.get_test_file() self.assertIsFile(dst) - with self.assertRaises(ConsumerError): - with self.get_consumer(dst) as consumer: - consumer.run() + with self.get_consumer(dst) as consumer: + consumer.run() self.assertIsNotFile(dst) - self._assert_first_last_send_progress(last_status="FAILED") + self.assertEqual(Document.objects.count(), 2) + self._assert_first_last_send_progress() @override_settings(CONSUMER_DELETE_DUPLICATES=False) def test_no_delete_duplicate(self): @@ -743,15 +744,12 @@ class TestConsumer( dst = self.get_test_file() self.assertIsFile(dst) - with self.assertRaisesRegex( - ConsumerError, - r"sample\.pdf: Not consuming sample\.pdf: It is a duplicate of sample \(#\d+\)", - ): - with self.get_consumer(dst) as consumer: - consumer.run() + with self.get_consumer(dst) as consumer: + consumer.run() - self.assertIsFile(dst) - self._assert_first_last_send_progress(last_status="FAILED") + self.assertIsNotFile(dst) + self.assertEqual(Document.objects.count(), 2) + self._assert_first_last_send_progress() @override_settings(FILENAME_FORMAT="{title}") @mock.patch("documents.parsers.document_consumer_declaration.send")