From 4428354150dfaeacd65287529a59c7f0e5594068 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Mon, 26 Jan 2026 10:55:08 -0800 Subject: [PATCH] Feature: allow duplicates with warnings, UI for discovery (#11815) --- docs/configuration.md | 5 +- .../admin/tasks/tasks.component.html | 6 ++ .../document-detail.component.html | 31 ++++++++ .../document-detail.component.spec.ts | 50 ++++++++++++- .../document-detail.component.ts | 16 +++- src-ui/src/app/data/document.ts | 2 + src-ui/src/app/data/paperless-task.ts | 3 + src/documents/consumer.py | 44 ++++++++--- .../0006_alter_document_checksum_unique.py | 23 ++++++ src/documents/models.py | 1 - src/documents/permissions.py | 24 +++++- src/documents/serialisers.py | 56 ++++++++++++++ src/documents/tests/test_api_tasks.py | 31 ++++++-- src/documents/tests/test_consumer.py | 73 +++++++++++++------ 14 files changed, 316 insertions(+), 49 deletions(-) create mode 100644 src/documents/migrations/0006_alter_document_checksum_unique.py diff --git a/docs/configuration.md b/docs/configuration.md index 872c93e44..41d43d424 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1152,8 +1152,9 @@ via the consumption directory, you can disable the consumer to save resources. #### [`PAPERLESS_CONSUMER_DELETE_DUPLICATES=`](#PAPERLESS_CONSUMER_DELETE_DUPLICATES) {#PAPERLESS_CONSUMER_DELETE_DUPLICATES} -: When the consumer detects a duplicate document, it will not touch -the original document. This default behavior can be changed here. +: As of version 3.0 Paperless-ngx allows duplicate documents to be consumed by default, _except_ when +this setting is enabled. When enabled, Paperless will check if a document with the same hash already +exists in the system and delete the duplicate file from the consumption directory without consuming it. Defaults to false. diff --git a/src-ui/src/app/components/admin/tasks/tasks.component.html b/src-ui/src/app/components/admin/tasks/tasks.component.html index 084195221..ad625789c 100644 --- a/src-ui/src/app/components/admin/tasks/tasks.component.html +++ b/src-ui/src/app/components/admin/tasks/tasks.component.html @@ -97,6 +97,12 @@
(click for full output) } + @if (task.duplicate_documents?.length > 0) { +
+ + Duplicate(s) detected +
+ } } diff --git a/src-ui/src/app/components/document-detail/document-detail.component.html b/src-ui/src/app/components/document-detail/document-detail.component.html index 34e1f7980..5ca002479 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.html +++ b/src-ui/src/app/components/document-detail/document-detail.component.html @@ -370,6 +370,37 @@ } + + @if (document?.duplicate_documents?.length) { +
  • + + Duplicates + {{ document.duplicate_documents.length }} + + +
    +
    Duplicate documents detected:
    + +
    +
    +
  • + }
    diff --git a/src-ui/src/app/components/document-detail/document-detail.component.spec.ts b/src-ui/src/app/components/document-detail/document-detail.component.spec.ts index 198e7a7a4..d1d10c985 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.spec.ts +++ b/src-ui/src/app/components/document-detail/document-detail.component.spec.ts @@ -301,16 +301,16 @@ describe('DocumentDetailComponent', () => { .spyOn(openDocumentsService, 'openDocument') .mockReturnValueOnce(of(true)) fixture.detectChanges() - expect(component.activeNavID).toEqual(5) // DocumentDetailNavIDs.Notes + expect(component.activeNavID).toEqual(component.DocumentDetailNavIDs.Notes) }) it('should change url on tab switch', () => { initNormally() const navigateSpy = jest.spyOn(router, 'navigate') - component.nav.select(5) + component.nav.select(component.DocumentDetailNavIDs.Notes) component.nav.navChange.next({ activeId: 1, - nextId: 5, + nextId: component.DocumentDetailNavIDs.Notes, preventDefault: () => {}, }) fixture.detectChanges() @@ -352,6 +352,18 @@ describe('DocumentDetailComponent', () => { expect(component.document).toEqual(doc) }) + it('should fall back to details tab when duplicates tab is active but no duplicates', () => { + initNormally() + component.activeNavID = component.DocumentDetailNavIDs.Duplicates + const noDupDoc = { ...doc, duplicate_documents: [] } + + component.updateComponent(noDupDoc) + + expect(component.activeNavID).toEqual( + component.DocumentDetailNavIDs.Details + ) + }) + it('should load already-opened document via param', () => { initNormally() jest.spyOn(documentService, 'get').mockReturnValueOnce(of(doc)) @@ -367,6 +379,38 @@ describe('DocumentDetailComponent', () => { expect(component.document).toEqual(doc) }) + it('should update cached open document duplicates when reloading an open doc', () => { + const openDoc = { ...doc, duplicate_documents: [{ id: 1, title: 'Old' }] } + const updatedDuplicates = [ + { id: 2, title: 'Newer duplicate', deleted_at: null }, + ] + jest + .spyOn(activatedRoute, 'paramMap', 'get') + .mockReturnValue(of(convertToParamMap({ id: 3, section: 'details' }))) + jest.spyOn(documentService, 'get').mockReturnValue( + of({ + ...doc, + modified: new Date('2024-01-02T00:00:00Z'), + duplicate_documents: updatedDuplicates, + }) + ) + jest.spyOn(openDocumentsService, 'getOpenDocument').mockReturnValue(openDoc) + const saveSpy = jest.spyOn(openDocumentsService, 'save') + jest.spyOn(openDocumentsService, 'openDocument').mockReturnValue(of(true)) + jest.spyOn(customFieldsService, 'listAll').mockReturnValue( + of({ + count: customFields.length, + all: customFields.map((f) => f.id), + results: customFields, + }) + ) + + fixture.detectChanges() + + expect(openDoc.duplicate_documents).toEqual(updatedDuplicates) + expect(saveSpy).toHaveBeenCalled() + }) + it('should disable form if user cannot edit', () => { currentUserHasObjectPermissions = false initNormally() diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts index 5bac6fe72..917597ef6 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.ts +++ b/src-ui/src/app/components/document-detail/document-detail.component.ts @@ -8,7 +8,7 @@ import { FormsModule, ReactiveFormsModule, } from '@angular/forms' -import { ActivatedRoute, Router } from '@angular/router' +import { ActivatedRoute, Router, RouterModule } from '@angular/router' import { NgbDateStruct, NgbDropdownModule, @@ -124,6 +124,7 @@ enum DocumentDetailNavIDs { Notes = 5, Permissions = 6, History = 7, + Duplicates = 8, } enum ContentRenderType { @@ -181,6 +182,7 @@ export enum ZoomSetting { NgxBootstrapIconsModule, PdfViewerModule, TextAreaComponent, + RouterModule, ], }) export class DocumentDetailComponent @@ -454,6 +456,11 @@ export class DocumentDetailComponent const openDocument = this.openDocumentService.getOpenDocument( this.documentId ) + // update duplicate documents if present + if (openDocument && doc?.duplicate_documents) { + openDocument.duplicate_documents = doc.duplicate_documents + this.openDocumentService.save() + } const useDoc = openDocument || doc if (openDocument) { if ( @@ -704,6 +711,13 @@ export class DocumentDetailComponent } this.title = this.documentTitlePipe.transform(doc.title) this.prepareForm(doc) + + if ( + this.activeNavID === DocumentDetailNavIDs.Duplicates && + !doc?.duplicate_documents?.length + ) { + this.activeNavID = DocumentDetailNavIDs.Details + } } get customFieldFormFields(): FormArray { diff --git a/src-ui/src/app/data/document.ts b/src-ui/src/app/data/document.ts index 8aae31945..03d3bf09b 100644 --- a/src-ui/src/app/data/document.ts +++ b/src-ui/src/app/data/document.ts @@ -159,6 +159,8 @@ export interface Document extends ObjectWithPermissions { page_count?: number + duplicate_documents?: Document[] + // Frontend only __changedFields?: string[] } diff --git a/src-ui/src/app/data/paperless-task.ts b/src-ui/src/app/data/paperless-task.ts index b30af7cdd..19dd3921e 100644 --- a/src-ui/src/app/data/paperless-task.ts +++ b/src-ui/src/app/data/paperless-task.ts @@ -1,3 +1,4 @@ +import { Document } from './document' import { ObjectWithId } from './object-with-id' export enum PaperlessTaskType { @@ -42,5 +43,7 @@ export interface PaperlessTask extends ObjectWithId { related_document?: number + duplicate_documents?: Document[] + owner?: number } diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 4c8c4dd28..1ff60220b 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -779,19 +779,45 @@ class ConsumerPreflightPlugin( Q(checksum=checksum) | Q(archive_checksum=checksum), ) if existing_doc.exists(): - msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS - log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})." + existing_doc = existing_doc.order_by("-created") + duplicates_in_trash = existing_doc.filter(deleted_at__isnull=False) + log_msg = ( + f"Consuming duplicate {self.filename}: " + f"{existing_doc.count()} existing document(s) share the same content." + ) - if existing_doc.first().deleted_at is not None: - msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH - log_msg += " Note: existing document is in the trash." + if duplicates_in_trash.exists(): + log_msg += " Note: at least one existing document is in the trash." + + self.log.warning(log_msg) if settings.CONSUMER_DELETE_DUPLICATES: + duplicate = existing_doc.first() + duplicate_label = ( + duplicate.title + or duplicate.original_filename + or (Path(duplicate.filename).name if duplicate.filename else None) + or str(duplicate.pk) + ) + Path(self.input_doc.original_file).unlink() - self._fail( - msg, - log_msg, - ) + + failure_msg = ( + f"Not consuming {self.filename}: " + f"It is a duplicate of {duplicate_label} (#{duplicate.pk})" + ) + status_msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS + + if duplicates_in_trash.exists(): + status_msg = ( + ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH + ) + failure_msg += " Note: existing document is in the trash." + + self._fail( + status_msg, + failure_msg, + ) def pre_check_directories(self): """ diff --git a/src/documents/migrations/0006_alter_document_checksum_unique.py b/src/documents/migrations/0006_alter_document_checksum_unique.py new file mode 100644 index 000000000..f86799494 --- /dev/null +++ b/src/documents/migrations/0006_alter_document_checksum_unique.py @@ -0,0 +1,23 @@ +# Generated by Django 5.2.7 on 2026-01-14 17:45 + +from django.db import migrations +from django.db import models + + +class Migration(migrations.Migration): + dependencies = [ + ("documents", "0005_workflowtrigger_filter_has_any_correspondents_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="document", + name="checksum", + field=models.CharField( + editable=False, + max_length=32, + verbose_name="checksum", + help_text="The checksum of the original document.", + ), + ), + ] diff --git a/src/documents/models.py b/src/documents/models.py index 0ea525d49..fe41796bd 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -205,7 +205,6 @@ class Document(SoftDeleteModel, ModelWithOwner): _("checksum"), max_length=32, editable=False, - unique=True, help_text=_("The checksum of the original document."), ) diff --git a/src/documents/permissions.py b/src/documents/permissions.py index ac6d3f9ca..9d5c9eb68 100644 --- a/src/documents/permissions.py +++ b/src/documents/permissions.py @@ -148,13 +148,29 @@ def get_document_count_filter_for_user(user): ) -def get_objects_for_user_owner_aware(user, perms, Model) -> QuerySet: - objects_owned = Model.objects.filter(owner=user) - objects_unowned = Model.objects.filter(owner__isnull=True) +def get_objects_for_user_owner_aware( + user, + perms, + Model, + *, + include_deleted=False, +) -> QuerySet: + """ + Returns objects the user owns, are unowned, or has explicit perms. + When include_deleted is True, soft-deleted items are also included. + """ + manager = ( + Model.global_objects + if include_deleted and hasattr(Model, "global_objects") + else Model.objects + ) + + objects_owned = manager.filter(owner=user) + objects_unowned = manager.filter(owner__isnull=True) objects_with_perms = get_objects_for_user( user=user, perms=perms, - klass=Model, + klass=manager.all(), accept_global_perms=False, ) return objects_owned | objects_unowned | objects_with_perms diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index d9e5c22e0..4dc23d740 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -23,6 +23,7 @@ from django.core.validators import MinValueValidator from django.core.validators import RegexValidator from django.core.validators import integer_validator from django.db.models import Count +from django.db.models import Q from django.db.models.functions import Lower from django.utils.crypto import get_random_string from django.utils.dateparse import parse_datetime @@ -72,6 +73,7 @@ from documents.models import WorkflowTrigger from documents.parsers import is_mime_type_supported from documents.permissions import get_document_count_filter_for_user from documents.permissions import get_groups_with_only_permission +from documents.permissions import get_objects_for_user_owner_aware from documents.permissions import set_permissions_for_object from documents.regex import validate_regex_pattern from documents.templating.filepath import validate_filepath_template_and_render @@ -82,6 +84,9 @@ from documents.validators import url_validator if TYPE_CHECKING: from collections.abc import Iterable + from django.db.models.query import QuerySet + + logger = logging.getLogger("paperless.serializers") @@ -1014,6 +1019,32 @@ class NotesSerializer(serializers.ModelSerializer): return ret +def _get_viewable_duplicates( + document: Document, + user: User | None, +) -> QuerySet[Document]: + checksums = {document.checksum} + if document.archive_checksum: + checksums.add(document.archive_checksum) + duplicates = Document.global_objects.filter( + Q(checksum__in=checksums) | Q(archive_checksum__in=checksums), + ).exclude(pk=document.pk) + duplicates = duplicates.order_by("-created") + allowed = get_objects_for_user_owner_aware( + user, + "documents.view_document", + Document, + include_deleted=True, + ) + return duplicates.filter(id__in=allowed) + + +class DuplicateDocumentSummarySerializer(serializers.Serializer): + id = serializers.IntegerField() + title = serializers.CharField() + deleted_at = serializers.DateTimeField(allow_null=True) + + @extend_schema_serializer( deprecate_fields=["created_date"], ) @@ -1031,6 +1062,7 @@ class DocumentSerializer( archived_file_name = SerializerMethodField() created_date = serializers.DateField(required=False) page_count = SerializerMethodField() + duplicate_documents = SerializerMethodField() notes = NotesSerializer(many=True, required=False, read_only=True) @@ -1056,6 +1088,16 @@ class DocumentSerializer( def get_page_count(self, obj) -> int | None: return obj.page_count + @extend_schema_field(DuplicateDocumentSummarySerializer(many=True)) + def get_duplicate_documents(self, obj): + view = self.context.get("view") + if view and getattr(view, "action", None) != "retrieve": + return [] + request = self.context.get("request") + user = request.user if request else None + duplicates = _get_viewable_duplicates(obj, user) + return list(duplicates.values("id", "title", "deleted_at")) + def get_original_file_name(self, obj) -> str | None: return obj.original_filename @@ -1233,6 +1275,7 @@ class DocumentSerializer( "archive_serial_number", "original_file_name", "archived_file_name", + "duplicate_documents", "owner", "permissions", "user_can_change", @@ -2094,10 +2137,12 @@ class TasksViewSerializer(OwnedObjectSerializer): "result", "acknowledged", "related_document", + "duplicate_documents", "owner", ) related_document = serializers.SerializerMethodField() + duplicate_documents = serializers.SerializerMethodField() created_doc_re = re.compile(r"New document id (\d+) created") duplicate_doc_re = re.compile(r"It is a duplicate of .* \(#(\d+)\)") @@ -2122,6 +2167,17 @@ class TasksViewSerializer(OwnedObjectSerializer): return result + @extend_schema_field(DuplicateDocumentSummarySerializer(many=True)) + def get_duplicate_documents(self, obj): + related_document = self.get_related_document(obj) + request = self.context.get("request") + user = request.user if request else None + document = Document.global_objects.filter(pk=related_document).first() + if not related_document or not user or not document: + return [] + duplicates = _get_viewable_duplicates(document, user) + return list(duplicates.values("id", "title", "deleted_at")) + class RunTaskViewSerializer(serializers.Serializer): task_name = serializers.ChoiceField( diff --git a/src/documents/tests/test_api_tasks.py b/src/documents/tests/test_api_tasks.py index aa42577c4..6429ef44f 100644 --- a/src/documents/tests/test_api_tasks.py +++ b/src/documents/tests/test_api_tasks.py @@ -7,6 +7,7 @@ from django.contrib.auth.models import User from rest_framework import status from rest_framework.test import APITestCase +from documents.models import Document from documents.models import PaperlessTask from documents.tests.utils import DirectoriesMixin from documents.views import TasksViewSet @@ -258,7 +259,7 @@ class TestTasks(DirectoriesMixin, APITestCase): task_id=str(uuid.uuid4()), task_file_name="task_one.pdf", status=celery.states.FAILURE, - result="test.pdf: Not consuming test.pdf: It is a duplicate.", + result="test.pdf: Unexpected error during ingestion.", ) response = self.client.get(self.ENDPOINT) @@ -270,7 +271,7 @@ class TestTasks(DirectoriesMixin, APITestCase): self.assertEqual( returned_data["result"], - "test.pdf: Not consuming test.pdf: It is a duplicate.", + "test.pdf: Unexpected error during ingestion.", ) def test_task_name_webui(self): @@ -325,20 +326,34 @@ class TestTasks(DirectoriesMixin, APITestCase): self.assertEqual(returned_data["task_file_name"], "anothertest.pdf") - def test_task_result_failed_duplicate_includes_related_doc(self): + def test_task_result_duplicate_warning_includes_count(self): """ GIVEN: - - A celery task failed with a duplicate error + - A celery task succeeds, but a duplicate exists WHEN: - API call is made to get tasks THEN: - - The returned data includes a related document link + - The returned data includes duplicate warning metadata """ + checksum = "duplicate-checksum" + Document.objects.create( + title="Existing", + content="", + mime_type="application/pdf", + checksum=checksum, + ) + created_doc = Document.objects.create( + title="Created", + content="", + mime_type="application/pdf", + checksum=checksum, + archive_checksum="another-checksum", + ) PaperlessTask.objects.create( task_id=str(uuid.uuid4()), task_file_name="task_one.pdf", - status=celery.states.FAILURE, - result="Not consuming task_one.pdf: It is a duplicate of task_one_existing.pdf (#1234).", + status=celery.states.SUCCESS, + result=f"Success. New document id {created_doc.pk} created", ) response = self.client.get(self.ENDPOINT) @@ -348,7 +363,7 @@ class TestTasks(DirectoriesMixin, APITestCase): returned_data = response.data[0] - self.assertEqual(returned_data["related_document"], "1234") + self.assertEqual(returned_data["related_document"], str(created_doc.pk)) def test_run_train_classifier_task(self): """ diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 63d6f8f5b..16fa2bf70 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -485,21 +485,21 @@ class TestConsumer( with self.get_consumer(self.get_test_file()) as consumer: consumer.run() - with self.assertRaisesMessage(ConsumerError, "It is a duplicate"): - with self.get_consumer(self.get_test_file()) as consumer: - consumer.run() + with self.get_consumer(self.get_test_file()) as consumer: + consumer.run() - self._assert_first_last_send_progress(last_status="FAILED") + self.assertEqual(Document.objects.count(), 2) + self._assert_first_last_send_progress() def testDuplicates2(self): with self.get_consumer(self.get_test_file()) as consumer: consumer.run() - with self.assertRaisesMessage(ConsumerError, "It is a duplicate"): - with self.get_consumer(self.get_test_archive_file()) as consumer: - consumer.run() + with self.get_consumer(self.get_test_archive_file()) as consumer: + consumer.run() - self._assert_first_last_send_progress(last_status="FAILED") + self.assertEqual(Document.objects.count(), 2) + self._assert_first_last_send_progress() def testDuplicates3(self): with self.get_consumer(self.get_test_archive_file()) as consumer: @@ -513,9 +513,10 @@ class TestConsumer( Document.objects.all().delete() - with self.assertRaisesMessage(ConsumerError, "document is in the trash"): - with self.get_consumer(self.get_test_file()) as consumer: - consumer.run() + with self.get_consumer(self.get_test_file()) as consumer: + consumer.run() + + self.assertEqual(Document.objects.count(), 1) def testAsnExists(self): with self.get_consumer( @@ -718,12 +719,45 @@ class TestConsumer( dst = self.get_test_file() self.assertIsFile(dst) - with self.assertRaises(ConsumerError): + expected_message = ( + f"{dst.name}: Not consuming {dst.name}: " + f"It is a duplicate of {document.title} (#{document.pk})" + ) + + with self.assertRaisesMessage(ConsumerError, expected_message): with self.get_consumer(dst) as consumer: consumer.run() self.assertIsNotFile(dst) - self._assert_first_last_send_progress(last_status="FAILED") + self.assertEqual(Document.objects.count(), 1) + self._assert_first_last_send_progress(last_status=ProgressStatusOptions.FAILED) + + @override_settings(CONSUMER_DELETE_DUPLICATES=True) + def test_delete_duplicate_in_trash(self): + dst = self.get_test_file() + with self.get_consumer(dst) as consumer: + consumer.run() + + # Move the existing document to trash + document = Document.objects.first() + document.delete() + + dst = self.get_test_file() + self.assertIsFile(dst) + + expected_message = ( + f"{dst.name}: Not consuming {dst.name}: " + f"It is a duplicate of {document.title} (#{document.pk})" + f" Note: existing document is in the trash." + ) + + with self.assertRaisesMessage(ConsumerError, expected_message): + with self.get_consumer(dst) as consumer: + consumer.run() + + self.assertIsNotFile(dst) + self.assertEqual(Document.global_objects.count(), 1) + self.assertEqual(Document.objects.count(), 0) @override_settings(CONSUMER_DELETE_DUPLICATES=False) def test_no_delete_duplicate(self): @@ -743,15 +777,12 @@ class TestConsumer( dst = self.get_test_file() self.assertIsFile(dst) - with self.assertRaisesRegex( - ConsumerError, - r"sample\.pdf: Not consuming sample\.pdf: It is a duplicate of sample \(#\d+\)", - ): - with self.get_consumer(dst) as consumer: - consumer.run() + with self.get_consumer(dst) as consumer: + consumer.run() - self.assertIsFile(dst) - self._assert_first_last_send_progress(last_status="FAILED") + self.assertIsNotFile(dst) + self.assertEqual(Document.objects.count(), 2) + self._assert_first_last_send_progress() @override_settings(FILENAME_FORMAT="{title}") @mock.patch("documents.parsers.document_consumer_declaration.send")