Core elements, migration, consumer modifications

This commit is contained in:
shamoon
2026-01-18 11:40:43 -08:00
parent 62248f5702
commit 1e595a5aab
11 changed files with 174 additions and 44 deletions

View File

@@ -97,6 +97,12 @@
<br/><em>(<ng-container i18n>click for full output</ng-container>)</em> <br/><em>(<ng-container i18n>click for full output</ng-container>)</em>
} }
</ng-template> </ng-template>
@if (task.duplicate_documents?.length > 0) {
<div class="small text-warning-emphasis d-flex align-items-center gap-1">
<i-bs class="lh-1" width="1em" height="1em" name="exclamation-triangle"></i-bs>
<span i18n>Duplicate(s) detected</span>
</div>
}
</td> </td>
} }
<td class="d-lg-none"> <td class="d-lg-none">

View File

@@ -145,6 +145,24 @@
<a ngbNavLink i18n>Details</a> <a ngbNavLink i18n>Details</a>
<ng-template ngbNavContent> <ng-template ngbNavContent>
<div> <div>
@if (document?.duplicate_documents?.length) {
<div class="alert alert-warning">
<div class="fw-semibold" i18n>Duplicate content detected.</div>
<ul class="mb-0 mt-2">
@for (duplicate of document.duplicate_documents; track duplicate.id) {
<li>
<button
type="button"
class="btn btn-link p-0 align-baseline"
(click)="openDuplicateDocument(duplicate.id)"
>
{{ duplicate.title }}
</button>
</li>
}
</ul>
</div>
}
<pngx-input-text #inputTitle i18n-title title="Title" formControlName="title" [horizontal]="true" [suggestion]="suggestions?.title" (keyup)="titleKeyUp($event)" [error]="error?.title"></pngx-input-text> <pngx-input-text #inputTitle i18n-title title="Title" formControlName="title" [horizontal]="true" [suggestion]="suggestions?.title" (keyup)="titleKeyUp($event)" [error]="error?.title"></pngx-input-text>
<pngx-input-number i18n-title title="Archive serial number" [error]="error?.archive_serial_number" [horizontal]="true" formControlName='archive_serial_number'></pngx-input-number> <pngx-input-number i18n-title title="Archive serial number" [error]="error?.archive_serial_number" [horizontal]="true" formControlName='archive_serial_number'></pngx-input-number>
<pngx-input-date i18n-title title="Date created" formControlName="created" [suggestions]="suggestions?.dates" [showFilter]="true" [horizontal]="true" (filterDocuments)="filterDocuments($event)" <pngx-input-date i18n-title title="Date created" formControlName="created" [suggestions]="suggestions?.dates" [showFilter]="true" [horizontal]="true" (filterDocuments)="filterDocuments($event)"

View File

@@ -706,6 +706,10 @@ export class DocumentDetailComponent
this.prepareForm(doc) this.prepareForm(doc)
} }
/**
 * Navigates to the details route of another document.
 *
 * @param documentId id of the document to open
 */
openDuplicateDocument(documentId: number) {
  const detailsRoute = ['documents', documentId, 'details']
  this.router.navigate(detailsRoute)
}
get customFieldFormFields(): FormArray { get customFieldFormFields(): FormArray {
return this.documentForm.get('custom_fields') as FormArray return this.documentForm.get('custom_fields') as FormArray
} }

View File

@@ -112,6 +112,11 @@ export interface SearchHit {
note_highlights?: string note_highlights?: string
} }
/**
 * Minimal reference to another document, carrying only the fields needed to
 * list it and link to it.
 */
export interface DuplicateDocument {
// Identifier of the referenced document
id: number
// Title of the referenced document
title: string
}
export interface Document extends ObjectWithPermissions { export interface Document extends ObjectWithPermissions {
correspondent?: number correspondent?: number
@@ -159,6 +164,8 @@ export interface Document extends ObjectWithPermissions {
page_count?: number page_count?: number
duplicate_documents?: DuplicateDocument[]
// Frontend only // Frontend only
__changedFields?: string[] __changedFields?: string[]
} }

View File

@@ -1,3 +1,4 @@
import { DuplicateDocument } from './document'
import { ObjectWithId } from './object-with-id' import { ObjectWithId } from './object-with-id'
export enum PaperlessTaskType { export enum PaperlessTaskType {
@@ -42,5 +43,7 @@ export interface PaperlessTask extends ObjectWithId {
related_document?: number related_document?: number
duplicate_documents?: DuplicateDocument[]
owner?: number owner?: number
} }

View File

@@ -785,20 +785,16 @@ class ConsumerPreflightPlugin(
Q(checksum=checksum) | Q(archive_checksum=checksum), Q(checksum=checksum) | Q(archive_checksum=checksum),
) )
if existing_doc.exists(): if existing_doc.exists():
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS log_msg = (
log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})." f"Consuming duplicate {self.filename}: "
f"{existing_doc.count()} existing document(s) share the same content."
if existing_doc.first().deleted_at is not None:
msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
log_msg += " Note: existing document is in the trash."
if settings.CONSUMER_DELETE_DUPLICATES:
Path(self.input_doc.original_file).unlink()
self._fail(
msg,
log_msg,
) )
if existing_doc.filter(deleted_at__isnull=False).exists():
log_msg += " Note: at least one existing document is in the trash."
self.log.warning(log_msg)
def pre_check_directories(self): def pre_check_directories(self):
""" """
Ensure all required directories exist before attempting to use them Ensure all required directories exist before attempting to use them

View File

@@ -0,0 +1,23 @@
# Generated by Django 5.2.7 on 2026-01-14 17:45
from django.db import migrations
from django.db import models
# Schema migration for the "documents" app: redefines the `checksum` field of
# the `document` model.
class Migration(migrations.Migration):
# Must be applied after migration 1076 of the same app.
dependencies = [
("documents", "1076_alter_paperlesstask_task_name"),
]
operations = [
# The field is declared as a non-editable CharField of at most 32
# characters; no `unique` option is passed in this definition.
migrations.AlterField(
model_name="document",
name="checksum",
field=models.CharField(
editable=False,
max_length=32,
verbose_name="checksum",
help_text="The checksum of the original document.",
),
),
]

View File

@@ -212,7 +212,6 @@ class Document(SoftDeleteModel, ModelWithOwner):
_("checksum"), _("checksum"),
max_length=32, max_length=32,
editable=False, editable=False,
unique=True,
help_text=_("The checksum of the original document."), help_text=_("The checksum of the original document."),
) )

View File

@@ -23,6 +23,7 @@ from django.core.validators import MinValueValidator
from django.core.validators import RegexValidator from django.core.validators import RegexValidator
from django.core.validators import integer_validator from django.core.validators import integer_validator
from django.db.models import Count from django.db.models import Count
from django.db.models import Q
from django.db.models.functions import Lower from django.db.models.functions import Lower
from django.utils.crypto import get_random_string from django.utils.crypto import get_random_string
from django.utils.dateparse import parse_datetime from django.utils.dateparse import parse_datetime
@@ -72,6 +73,7 @@ from documents.models import WorkflowTrigger
from documents.parsers import is_mime_type_supported from documents.parsers import is_mime_type_supported
from documents.permissions import get_document_count_filter_for_user from documents.permissions import get_document_count_filter_for_user
from documents.permissions import get_groups_with_only_permission from documents.permissions import get_groups_with_only_permission
from documents.permissions import get_objects_for_user_owner_aware
from documents.permissions import set_permissions_for_object from documents.permissions import set_permissions_for_object
from documents.regex import validate_regex_pattern from documents.regex import validate_regex_pattern
from documents.templating.filepath import validate_filepath_template_and_render from documents.templating.filepath import validate_filepath_template_and_render
@@ -1014,6 +1016,29 @@ class NotesSerializer(serializers.ModelSerializer):
return ret return ret
def _get_viewable_duplicates(document: Document, user: User | None):
    """
    Return the non-deleted documents that share content with ``document`` and
    that ``user`` is allowed to view.

    A document counts as a duplicate when its ``checksum`` or
    ``archive_checksum`` equals either checksum of ``document``. The given
    document itself is excluded and results are ordered newest-created first.

    Args:
        document: The document whose duplicates are looked up.
        user: The requesting user, or ``None`` when no request/user is
            available (both serializer callers pass ``None`` in that case).

    Returns:
        A ``Document`` queryset. It is empty when ``user`` is ``None``,
        unfiltered for superusers, and otherwise restricted to documents the
        user holds ``documents.view_document`` on.
    """
    checksums = {document.checksum}
    if document.archive_checksum:
        checksums.add(document.archive_checksum)
    duplicates = (
        Document.global_objects.filter(
            Q(checksum__in=checksums) | Q(archive_checksum__in=checksums),
            deleted_at__isnull=True,
        )
        .exclude(pk=document.pk)
        .order_by("-created")
    )
    if user is None:
        # Without a user no view permission can be established. Return an
        # empty queryset rather than crashing on ``None.is_superuser`` or
        # exposing documents to an unknown caller.
        return duplicates.none()
    if user.is_superuser:
        return duplicates
    return duplicates.filter(
        id__in=get_objects_for_user_owner_aware(
            user,
            "documents.view_document",
            Document,
        ).values_list("id", flat=True),
    )
@extend_schema_serializer( @extend_schema_serializer(
deprecate_fields=["created_date"], deprecate_fields=["created_date"],
) )
@@ -1031,6 +1056,7 @@ class DocumentSerializer(
archived_file_name = SerializerMethodField() archived_file_name = SerializerMethodField()
created_date = serializers.DateField(required=False) created_date = serializers.DateField(required=False)
page_count = SerializerMethodField() page_count = SerializerMethodField()
duplicate_documents = SerializerMethodField()
notes = NotesSerializer(many=True, required=False, read_only=True) notes = NotesSerializer(many=True, required=False, read_only=True)
@@ -1056,6 +1082,15 @@ class DocumentSerializer(
def get_page_count(self, obj) -> int | None: def get_page_count(self, obj) -> int | None:
return obj.page_count return obj.page_count
def get_duplicate_documents(self, obj):
    """
    Serializer method for the ``duplicate_documents`` field.

    When a view is present in the serializer context and its action is
    anything other than ``retrieve``, an empty list is returned without
    looking anything up. Otherwise the duplicates visible to the requesting
    user (``None`` when the context has no request) are returned as a list
    of ``{"id": ..., "title": ...}`` dicts.
    """
    view = self.context.get("view")
    if view:
        action = getattr(view, "action", None)
        if action != "retrieve":
            return []

    request = self.context.get("request")
    requesting_user = request.user if request else None
    matches = _get_viewable_duplicates(obj, requesting_user)
    return [row for row in matches.values("id", "title")]
def get_original_file_name(self, obj) -> str | None: def get_original_file_name(self, obj) -> str | None:
return obj.original_filename return obj.original_filename
@@ -1233,6 +1268,7 @@ class DocumentSerializer(
"archive_serial_number", "archive_serial_number",
"original_file_name", "original_file_name",
"archived_file_name", "archived_file_name",
"duplicate_documents",
"owner", "owner",
"permissions", "permissions",
"user_can_change", "user_can_change",
@@ -2094,10 +2130,12 @@ class TasksViewSerializer(OwnedObjectSerializer):
"result", "result",
"acknowledged", "acknowledged",
"related_document", "related_document",
"duplicate_documents",
"owner", "owner",
) )
related_document = serializers.SerializerMethodField() related_document = serializers.SerializerMethodField()
duplicate_documents = serializers.SerializerMethodField()
created_doc_re = re.compile(r"New document id (\d+) created") created_doc_re = re.compile(r"New document id (\d+) created")
duplicate_doc_re = re.compile(r"It is a duplicate of .* \(#(\d+)\)") duplicate_doc_re = re.compile(r"It is a duplicate of .* \(#(\d+)\)")
@@ -2122,6 +2160,30 @@ class TasksViewSerializer(OwnedObjectSerializer):
return result return result
def _get_duplicate_documents(self, obj):
    """
    Return the duplicates of the document related to task ``obj``.

    Results are memoised per task primary key on the serializer instance
    (``_duplicate_documents_cache``). An empty list is stored and returned
    when the task has no related document or that document cannot be found.
    Otherwise the result is a list of ``{"id": ..., "title": ...}`` dicts
    limited to what the requesting user may view.
    """
    try:
        memo = self._duplicate_documents_cache
    except AttributeError:
        # First use on this serializer instance: create the cache lazily.
        memo = self._duplicate_documents_cache = {}

    task_key = obj.pk
    if task_key in memo:
        return memo[task_key]

    found = []
    doc_id = self.get_related_document(obj)
    if doc_id:
        try:
            document = Document.objects.get(pk=doc_id)
        except Document.DoesNotExist:
            document = None
        if document is not None:
            request = self.context.get("request")
            user = request.user if request else None
            found = list(
                _get_viewable_duplicates(document, user).values("id", "title"),
            )

    memo[task_key] = found
    return found


def get_duplicate_documents(self, obj):
    """Serializer method for ``duplicate_documents``; delegates to the cached lookup."""
    return self._get_duplicate_documents(obj)
class RunTaskViewSerializer(serializers.Serializer): class RunTaskViewSerializer(serializers.Serializer):
task_name = serializers.ChoiceField( task_name = serializers.ChoiceField(

View File

@@ -7,6 +7,7 @@ from django.contrib.auth.models import User
from rest_framework import status from rest_framework import status
from rest_framework.test import APITestCase from rest_framework.test import APITestCase
from documents.models import Document
from documents.models import PaperlessTask from documents.models import PaperlessTask
from documents.tests.utils import DirectoriesMixin from documents.tests.utils import DirectoriesMixin
from documents.views import TasksViewSet from documents.views import TasksViewSet
@@ -258,7 +259,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
task_id=str(uuid.uuid4()), task_id=str(uuid.uuid4()),
task_file_name="task_one.pdf", task_file_name="task_one.pdf",
status=celery.states.FAILURE, status=celery.states.FAILURE,
result="test.pdf: Not consuming test.pdf: It is a duplicate.", result="test.pdf: Unexpected error during ingestion.",
) )
response = self.client.get(self.ENDPOINT) response = self.client.get(self.ENDPOINT)
@@ -270,7 +271,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
self.assertEqual( self.assertEqual(
returned_data["result"], returned_data["result"],
"test.pdf: Not consuming test.pdf: It is a duplicate.", "test.pdf: Unexpected error during ingestion.",
) )
def test_task_name_webui(self): def test_task_name_webui(self):
@@ -325,20 +326,33 @@ class TestTasks(DirectoriesMixin, APITestCase):
self.assertEqual(returned_data["task_file_name"], "anothertest.pdf") self.assertEqual(returned_data["task_file_name"], "anothertest.pdf")
def test_task_result_failed_duplicate_includes_related_doc(self): def test_task_result_duplicate_warning_includes_count(self):
""" """
GIVEN: GIVEN:
- A celery task failed with a duplicate error - A celery task succeeds, but a duplicate exists
WHEN: WHEN:
- API call is made to get tasks - API call is made to get tasks
THEN: THEN:
- The returned data includes a related document link - The returned data includes duplicate warning metadata
""" """
checksum = "duplicate-checksum"
Document.objects.create(
title="Existing",
content="",
mime_type="application/pdf",
checksum=checksum,
)
created_doc = Document.objects.create(
title="Created",
content="",
mime_type="application/pdf",
checksum=checksum,
)
PaperlessTask.objects.create( PaperlessTask.objects.create(
task_id=str(uuid.uuid4()), task_id=str(uuid.uuid4()),
task_file_name="task_one.pdf", task_file_name="task_one.pdf",
status=celery.states.FAILURE, status=celery.states.SUCCESS,
result="Not consuming task_one.pdf: It is a duplicate of task_one_existing.pdf (#1234).", result=f"Success. New document id {created_doc.pk} created",
) )
response = self.client.get(self.ENDPOINT) response = self.client.get(self.ENDPOINT)
@@ -348,7 +362,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
returned_data = response.data[0] returned_data = response.data[0]
self.assertEqual(returned_data["related_document"], "1234") self.assertEqual(returned_data["related_document"], str(created_doc.pk))
def test_run_train_classifier_task(self): def test_run_train_classifier_task(self):
""" """

View File

@@ -485,21 +485,21 @@ class TestConsumer(
with self.get_consumer(self.get_test_file()) as consumer: with self.get_consumer(self.get_test_file()) as consumer:
consumer.run() consumer.run()
with self.assertRaisesMessage(ConsumerError, "It is a duplicate"): with self.get_consumer(self.get_test_file()) as consumer:
with self.get_consumer(self.get_test_file()) as consumer: consumer.run()
consumer.run()
self._assert_first_last_send_progress(last_status="FAILED") self.assertEqual(Document.objects.count(), 2)
self._assert_first_last_send_progress()
def testDuplicates2(self): def testDuplicates2(self):
with self.get_consumer(self.get_test_file()) as consumer: with self.get_consumer(self.get_test_file()) as consumer:
consumer.run() consumer.run()
with self.assertRaisesMessage(ConsumerError, "It is a duplicate"): with self.get_consumer(self.get_test_archive_file()) as consumer:
with self.get_consumer(self.get_test_archive_file()) as consumer: consumer.run()
consumer.run()
self._assert_first_last_send_progress(last_status="FAILED") self.assertEqual(Document.objects.count(), 2)
self._assert_first_last_send_progress()
def testDuplicates3(self): def testDuplicates3(self):
with self.get_consumer(self.get_test_archive_file()) as consumer: with self.get_consumer(self.get_test_archive_file()) as consumer:
@@ -513,9 +513,10 @@ class TestConsumer(
Document.objects.all().delete() Document.objects.all().delete()
with self.assertRaisesMessage(ConsumerError, "document is in the trash"): with self.get_consumer(self.get_test_file()) as consumer:
with self.get_consumer(self.get_test_file()) as consumer: consumer.run()
consumer.run()
self.assertEqual(Document.objects.count(), 1)
def testAsnExists(self): def testAsnExists(self):
with self.get_consumer( with self.get_consumer(
@@ -718,12 +719,12 @@ class TestConsumer(
dst = self.get_test_file() dst = self.get_test_file()
self.assertIsFile(dst) self.assertIsFile(dst)
with self.assertRaises(ConsumerError): with self.get_consumer(dst) as consumer:
with self.get_consumer(dst) as consumer: consumer.run()
consumer.run()
self.assertIsNotFile(dst) self.assertIsNotFile(dst)
self._assert_first_last_send_progress(last_status="FAILED") self.assertEqual(Document.objects.count(), 2)
self._assert_first_last_send_progress()
@override_settings(CONSUMER_DELETE_DUPLICATES=False) @override_settings(CONSUMER_DELETE_DUPLICATES=False)
def test_no_delete_duplicate(self): def test_no_delete_duplicate(self):
@@ -743,15 +744,12 @@ class TestConsumer(
dst = self.get_test_file() dst = self.get_test_file()
self.assertIsFile(dst) self.assertIsFile(dst)
with self.assertRaisesRegex( with self.get_consumer(dst) as consumer:
ConsumerError, consumer.run()
r"sample\.pdf: Not consuming sample\.pdf: It is a duplicate of sample \(#\d+\)",
):
with self.get_consumer(dst) as consumer:
consumer.run()
self.assertIsFile(dst) self.assertIsNotFile(dst)
self._assert_first_last_send_progress(last_status="FAILED") self.assertEqual(Document.objects.count(), 2)
self._assert_first_last_send_progress()
@override_settings(FILENAME_FORMAT="{title}") @override_settings(FILENAME_FORMAT="{title}")
@mock.patch("documents.parsers.document_consumer_declaration.send") @mock.patch("documents.parsers.document_consumer_declaration.send")