From ef661ae1017c7d34536dd9928503f061671a16c3 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Sun, 18 Jan 2026 12:27:31 -0800 Subject: [PATCH] Treat CONSUMER_DELETE_DUPLICATES as a hard no --- docs/configuration.md | 5 ++-- src/documents/consumer.py | 39 +++++++++++++++++++++++++++- src/documents/tests/test_consumer.py | 14 +++++++--- 3 files changed, 51 insertions(+), 7 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index b7b24d313..01efe049e 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1146,8 +1146,9 @@ via the consumption directory, you can disable the consumer to save resources. #### [`PAPERLESS_CONSUMER_DELETE_DUPLICATES=`](#PAPERLESS_CONSUMER_DELETE_DUPLICATES) {#PAPERLESS_CONSUMER_DELETE_DUPLICATES} -: When the consumer detects a duplicate document, it will not touch -the original document. This default behavior can be changed here. +: As of version 3.0, Paperless-ngx allows duplicate documents to be consumed by default, _except_ when +this setting is enabled. When enabled, Paperless will check if a document with the same hash already +exists in the system and, if so, delete the duplicate file from the consumption directory without consuming it. Defaults to false. diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 122977bdf..2dc7fd7b7 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -785,16 +785,53 @@ class ConsumerPreflightPlugin( Q(checksum=checksum) | Q(archive_checksum=checksum), ) if existing_doc.exists(): + existing_doc = existing_doc.order_by("-created") + duplicates_in_trash = existing_doc.filter(deleted_at__isnull=False) log_msg = ( f"Consuming duplicate {self.filename}: " f"{existing_doc.count()} existing document(s) share the same content." ) - if existing_doc.filter(deleted_at__isnull=False).exists(): + if duplicates_in_trash.exists(): log_msg += " Note: at least one existing document is in the trash." 
self.log.warning(log_msg) + if settings.CONSUMER_DELETE_DUPLICATES: + duplicate = existing_doc.first() + duplicate_label = ( + duplicate.title + or duplicate.original_filename + or (Path(duplicate.filename).name if duplicate.filename else None) + or str(duplicate.pk) + ) + + try: + Path(self.input_doc.original_file).unlink() + except FileNotFoundError: + pass + except Exception as exc: # pragma: no cover + self.log.warning( + f"Could not delete duplicate file {self.input_doc.original_file}: {exc}", + ) + + failure_msg = ( + f"Not consuming {self.filename}: " + f"It is a duplicate of {duplicate_label} (#{duplicate.pk})" + ) + status_msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS + + if duplicates_in_trash.exists(): + status_msg = ( + ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH + ) + failure_msg += " Note: existing document is in the trash." + + self._fail( + status_msg, + failure_msg, + ) + def pre_check_directories(self): """ Ensure all required directories exist before attempting to use them diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 5b3b32fad..3048dd46d 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -719,12 +719,18 @@ class TestConsumer( dst = self.get_test_file() self.assertIsFile(dst) - with self.get_consumer(dst) as consumer: - consumer.run() + expected_message = ( + f"{dst.name}: Not consuming {dst.name}: " + f"It is a duplicate of {document.title} (#{document.pk})" + ) + + with self.assertRaisesMessage(ConsumerError, expected_message): + with self.get_consumer(dst) as consumer: + consumer.run() self.assertIsNotFile(dst) - self.assertEqual(Document.objects.count(), 2) - self._assert_first_last_send_progress() + self.assertEqual(Document.objects.count(), 1) + self._assert_first_last_send_progress(last_status=ProgressStatusOptions.FAILED) @override_settings(CONSUMER_DELETE_DUPLICATES=False) def test_no_delete_duplicate(self):