Core elements, migration, consumer modifications
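In short: the consumer no longer rejects a file whose checksum matches an existing document. The unique constraint on Document.checksum is dropped (model change plus migration below), the preflight check only logs a warning, and DocumentSerializer and TasksViewSerializer gain a duplicate_documents field listing other documents that share a checksum. As a rough, illustrative sketch of what that field should look like on a document retrieve call (not part of the commit; `client`, `doc` and `other_doc` are assumed test fixtures, with `other_doc` sharing `doc`'s checksum):

# Hypothetical sketch: get_duplicate_documents() below returns
# list(duplicates.values("id", "title")), so the field should serialize roughly as:
response = client.get(f"/api/documents/{doc.pk}/", format="json")
assert response.data["duplicate_documents"] == [
    {"id": other_doc.pk, "title": other_doc.title},
]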
@@ -785,20 +785,16 @@ class ConsumerPreflightPlugin(
            Q(checksum=checksum) | Q(archive_checksum=checksum),
        )
        if existing_doc.exists():
            msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS
            log_msg = f"Not consuming {self.filename}: It is a duplicate of {existing_doc.get().title} (#{existing_doc.get().pk})."

            if existing_doc.first().deleted_at is not None:
                msg = ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS_IN_TRASH
                log_msg += " Note: existing document is in the trash."

            if settings.CONSUMER_DELETE_DUPLICATES:
                Path(self.input_doc.original_file).unlink()
            self._fail(
                msg,
                log_msg,
            log_msg = (
                f"Consuming duplicate {self.filename}: "
                f"{existing_doc.count()} existing document(s) share the same content."
            )

            if existing_doc.filter(deleted_at__isnull=False).exists():
                log_msg += " Note: at least one existing document is in the trash."

            self.log.warning(log_msg)

    def pre_check_directories(self):
        """
        Ensure all required directories exist before attempting to use them
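For reference, a small illustration of the warning the new branch produces (the file name and count here are made up):

# Illustrative only: how the warning string above is assembled.
filename = "sample.pdf"   # hypothetical input file name
existing_count = 1        # hypothetical number of documents sharing the checksum
log_msg = (
    f"Consuming duplicate {filename}: "
    f"{existing_count} existing document(s) share the same content."
)
# -> "Consuming duplicate sample.pdf: 1 existing document(s) share the same content."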
@@ -0,0 +1,23 @@
# Generated by Django 5.2.7 on 2026-01-14 17:45

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1076_alter_paperlesstask_task_name"),
    ]

    operations = [
        migrations.AlterField(
            model_name="document",
            name="checksum",
            field=models.CharField(
                editable=False,
                max_length=32,
                verbose_name="checksum",
                help_text="The checksum of the original document.",
            ),
        ),
    ]

@@ -212,7 +212,6 @@ class Document(SoftDeleteModel, ModelWithOwner):
        _("checksum"),
        max_length=32,
        editable=False,
        unique=True,
        help_text=_("The checksum of the original document."),
    )

@@ -23,6 +23,7 @@ from django.core.validators import MinValueValidator
from django.core.validators import RegexValidator
from django.core.validators import integer_validator
from django.db.models import Count
from django.db.models import Q
from django.db.models.functions import Lower
from django.utils.crypto import get_random_string
from django.utils.dateparse import parse_datetime

@@ -72,6 +73,7 @@ from documents.models import WorkflowTrigger
from documents.parsers import is_mime_type_supported
from documents.permissions import get_document_count_filter_for_user
from documents.permissions import get_groups_with_only_permission
from documents.permissions import get_objects_for_user_owner_aware
from documents.permissions import set_permissions_for_object
from documents.regex import validate_regex_pattern
from documents.templating.filepath import validate_filepath_template_and_render

@@ -1014,6 +1016,29 @@ class NotesSerializer(serializers.ModelSerializer):
        return ret


def _get_viewable_duplicates(document: Document, user: User | None):
    checksums = {document.checksum}
    if document.archive_checksum:
        checksums.add(document.archive_checksum)
    duplicates = (
        Document.global_objects.filter(
            Q(checksum__in=checksums) | Q(archive_checksum__in=checksums),
            deleted_at__isnull=True,
        )
        .exclude(pk=document.pk)
        .order_by("-created")
    )
    if user.is_superuser:
        return duplicates
    return duplicates.filter(
        id__in=get_objects_for_user_owner_aware(
            user,
            "documents.view_document",
            Document,
        ).values_list("id", flat=True),
    )


@extend_schema_serializer(
    deprecate_fields=["created_date"],
)
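A minimal usage sketch of the helper above (assuming an existing Document `doc` and the requesting `user`); the serializer methods in the hunks below call it in exactly this way:

# Illustrative only: superusers see every match, other users are narrowed to
# documents they are permitted to view.
duplicates = _get_viewable_duplicates(doc, user)
duplicate_info = list(duplicates.values("id", "title"))  # e.g. [{"id": 42, "title": "Invoice"}]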
@@ -1031,6 +1056,7 @@ class DocumentSerializer(
    archived_file_name = SerializerMethodField()
    created_date = serializers.DateField(required=False)
    page_count = SerializerMethodField()
    duplicate_documents = SerializerMethodField()

    notes = NotesSerializer(many=True, required=False, read_only=True)

@@ -1056,6 +1082,15 @@ class DocumentSerializer(
    def get_page_count(self, obj) -> int | None:
        return obj.page_count

    def get_duplicate_documents(self, obj):
        view = self.context.get("view")
        if view and getattr(view, "action", None) != "retrieve":
            return []
        request = self.context.get("request")
        user = request.user if request else None
        duplicates = _get_viewable_duplicates(obj, user)
        return list(duplicates.values("id", "title"))

    def get_original_file_name(self, obj) -> str | None:
        return obj.original_filename

@@ -1233,6 +1268,7 @@ class DocumentSerializer(
            "archive_serial_number",
            "original_file_name",
            "archived_file_name",
            "duplicate_documents",
            "owner",
            "permissions",
            "user_can_change",

@@ -2094,10 +2130,12 @@ class TasksViewSerializer(OwnedObjectSerializer):
            "result",
            "acknowledged",
            "related_document",
            "duplicate_documents",
            "owner",
        )

    related_document = serializers.SerializerMethodField()
    duplicate_documents = serializers.SerializerMethodField()
    created_doc_re = re.compile(r"New document id (\d+) created")
    duplicate_doc_re = re.compile(r"It is a duplicate of .* \(#(\d+)\)")
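For orientation, an illustrative sketch of the strings these two patterns are meant to match (the result strings are made up, but shaped like the consumer output used in the tests below):

# Illustrative only: extracting document ids from task result strings.
assert created_doc_re.search("Success. New document id 123 created").group(1) == "123"
assert duplicate_doc_re.search("It is a duplicate of scan.pdf (#42)").group(1) == "42"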
@@ -2122,6 +2160,30 @@ class TasksViewSerializer(OwnedObjectSerializer):

        return result

    def _get_duplicate_documents(self, obj):
        if not hasattr(self, "_duplicate_documents_cache"):
            self._duplicate_documents_cache = {}
        cache = self._duplicate_documents_cache
        if obj.pk in cache:
            return cache[obj.pk]
        related_document = self.get_related_document(obj)
        if not related_document:
            cache[obj.pk] = []
            return cache[obj.pk]
        try:
            document = Document.objects.get(pk=related_document)
        except Document.DoesNotExist:
            cache[obj.pk] = []
            return cache[obj.pk]
        request = self.context.get("request")
        user = request.user if request else None
        duplicates = _get_viewable_duplicates(document, user)
        cache[obj.pk] = list(duplicates.values("id", "title"))
        return cache[obj.pk]

    def get_duplicate_documents(self, obj):
        return self._get_duplicate_documents(obj)


class RunTaskViewSerializer(serializers.Serializer):
    task_name = serializers.ChoiceField(

@@ -7,6 +7,7 @@ from django.contrib.auth.models import User
from rest_framework import status
from rest_framework.test import APITestCase

from documents.models import Document
from documents.models import PaperlessTask
from documents.tests.utils import DirectoriesMixin
from documents.views import TasksViewSet

@@ -258,7 +259,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
            task_id=str(uuid.uuid4()),
            task_file_name="task_one.pdf",
            status=celery.states.FAILURE,
            result="test.pdf: Not consuming test.pdf: It is a duplicate.",
            result="test.pdf: Unexpected error during ingestion.",
        )

        response = self.client.get(self.ENDPOINT)

@@ -270,7 +271,7 @@ class TestTasks(DirectoriesMixin, APITestCase):

        self.assertEqual(
            returned_data["result"],
            "test.pdf: Not consuming test.pdf: It is a duplicate.",
            "test.pdf: Unexpected error during ingestion.",
        )

    def test_task_name_webui(self):

@@ -325,20 +326,33 @@ class TestTasks(DirectoriesMixin, APITestCase):

        self.assertEqual(returned_data["task_file_name"], "anothertest.pdf")

    def test_task_result_failed_duplicate_includes_related_doc(self):
    def test_task_result_duplicate_warning_includes_count(self):
        """
        GIVEN:
            - A celery task failed with a duplicate error
            - A celery task succeeds, but a duplicate exists
        WHEN:
            - API call is made to get tasks
        THEN:
            - The returned data includes a related document link
            - The returned data includes duplicate warning metadata
        """
        checksum = "duplicate-checksum"
        Document.objects.create(
            title="Existing",
            content="",
            mime_type="application/pdf",
            checksum=checksum,
        )
        created_doc = Document.objects.create(
            title="Created",
            content="",
            mime_type="application/pdf",
            checksum=checksum,
        )
        PaperlessTask.objects.create(
            task_id=str(uuid.uuid4()),
            task_file_name="task_one.pdf",
            status=celery.states.FAILURE,
            result="Not consuming task_one.pdf: It is a duplicate of task_one_existing.pdf (#1234).",
            status=celery.states.SUCCESS,
            result=f"Success. New document id {created_doc.pk} created",
        )

        response = self.client.get(self.ENDPOINT)

@@ -348,7 +362,7 @@ class TestTasks(DirectoriesMixin, APITestCase):

        returned_data = response.data[0]

        self.assertEqual(returned_data["related_document"], "1234")
        self.assertEqual(returned_data["related_document"], str(created_doc.pk))

    def test_run_train_classifier_task(self):
        """

@@ -485,21 +485,21 @@ class TestConsumer(
        with self.get_consumer(self.get_test_file()) as consumer:
            consumer.run()

        with self.assertRaisesMessage(ConsumerError, "It is a duplicate"):
            with self.get_consumer(self.get_test_file()) as consumer:
                consumer.run()
        with self.get_consumer(self.get_test_file()) as consumer:
            consumer.run()

        self._assert_first_last_send_progress(last_status="FAILED")
        self.assertEqual(Document.objects.count(), 2)
        self._assert_first_last_send_progress()

    def testDuplicates2(self):
        with self.get_consumer(self.get_test_file()) as consumer:
            consumer.run()

        with self.assertRaisesMessage(ConsumerError, "It is a duplicate"):
            with self.get_consumer(self.get_test_archive_file()) as consumer:
                consumer.run()
        with self.get_consumer(self.get_test_archive_file()) as consumer:
            consumer.run()

        self._assert_first_last_send_progress(last_status="FAILED")
        self.assertEqual(Document.objects.count(), 2)
        self._assert_first_last_send_progress()

    def testDuplicates3(self):
        with self.get_consumer(self.get_test_archive_file()) as consumer:

@@ -513,9 +513,10 @@ class TestConsumer(

        Document.objects.all().delete()

        with self.assertRaisesMessage(ConsumerError, "document is in the trash"):
            with self.get_consumer(self.get_test_file()) as consumer:
                consumer.run()
        with self.get_consumer(self.get_test_file()) as consumer:
            consumer.run()

        self.assertEqual(Document.objects.count(), 1)

    def testAsnExists(self):
        with self.get_consumer(

@@ -718,12 +719,12 @@ class TestConsumer(
        dst = self.get_test_file()
        self.assertIsFile(dst)

        with self.assertRaises(ConsumerError):
            with self.get_consumer(dst) as consumer:
                consumer.run()
        with self.get_consumer(dst) as consumer:
            consumer.run()

        self.assertIsNotFile(dst)
        self._assert_first_last_send_progress(last_status="FAILED")
        self.assertEqual(Document.objects.count(), 2)
        self._assert_first_last_send_progress()

    @override_settings(CONSUMER_DELETE_DUPLICATES=False)
    def test_no_delete_duplicate(self):

@@ -743,15 +744,12 @@ class TestConsumer(
        dst = self.get_test_file()
        self.assertIsFile(dst)

        with self.assertRaisesRegex(
            ConsumerError,
            r"sample\.pdf: Not consuming sample\.pdf: It is a duplicate of sample \(#\d+\)",
        ):
            with self.get_consumer(dst) as consumer:
                consumer.run()
        with self.get_consumer(dst) as consumer:
            consumer.run()

        self.assertIsFile(dst)
        self._assert_first_last_send_progress(last_status="FAILED")
        self.assertIsNotFile(dst)
        self.assertEqual(Document.objects.count(), 2)
        self._assert_first_last_send_progress()

    @override_settings(FILENAME_FORMAT="{title}")
    @mock.patch("documents.parsers.document_consumer_declaration.send")