Feature: documents trash aka soft delete (#6944)

This commit is contained in:
shamoon
2024-06-17 08:07:08 -07:00
committed by GitHub
parent 9d4e2d4652
commit a796e58a94
38 changed files with 1283 additions and 191 deletions

View File

@@ -0,0 +1,23 @@
# Generated by Django 4.2.11 on 2024-04-23 07:56
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
dependencies = [
("documents", "1048_alter_savedviewfilterrule_rule_type"),
]
operations = [
migrations.AddField(
model_name="document",
name="deleted_at",
field=models.DateTimeField(blank=True, null=True),
),
migrations.AddField(
model_name="document",
name="restored_at",
field=models.DateTimeField(blank=True, null=True),
),
]

View File

@@ -23,6 +23,8 @@ from multiselectfield import MultiSelectField
if settings.AUDIT_LOG_ENABLED:
from auditlog.registry import auditlog
from django_softdelete.models import SoftDeleteModel
from documents.data_models import DocumentSource
from documents.parsers import get_default_file_extension
@@ -130,7 +132,7 @@ class StoragePath(MatchingModel):
verbose_name_plural = _("storage paths")
class Document(ModelWithOwner):
class Document(SoftDeleteModel, ModelWithOwner):
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"
STORAGE_TYPES = (

View File

@@ -786,6 +786,7 @@ class DocumentSerializer(
"created_date",
"modified",
"added",
"deleted_at",
"archive_serial_number",
"original_file_name",
"archived_file_name",
@@ -1863,3 +1864,26 @@ class WorkflowSerializer(serializers.ModelSerializer):
self.prune_triggers_and_actions()
return instance
class TrashSerializer(SerializerWithPerms):
documents = serializers.ListField(
required=False,
label="Documents",
write_only=True,
child=serializers.IntegerField(),
)
action = serializers.ChoiceField(
choices=["restore", "empty"],
label="Action",
write_only=True,
)
def validate_documents(self, documents):
count = Document.deleted_objects.filter(id__in=documents).count()
if not count == len(documents):
raise serializers.ValidationError(
"Some documents in the list have not yet been deleted.",
)
return documents

View File

@@ -301,10 +301,10 @@ def set_storage_path(
document.save(update_fields=("storage_path",))
@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):
# see empty_trash in documents/tasks.py for signal handling
def cleanup_document_deletion(sender, instance, **kwargs):
with FileLock(settings.MEDIA_LOCK):
if settings.TRASH_DIR:
if settings.EMPTY_TRASH_DIR:
# Find a non-conflicting filename in case a document with the same
# name was moved to trash earlier
counter = 0
@@ -313,7 +313,7 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
while True:
new_file_path = os.path.join(
settings.TRASH_DIR,
settings.EMPTY_TRASH_DIR,
old_filebase + (f"_{counter:02}" if counter else "") + old_fileext,
)

View File

@@ -2,6 +2,7 @@ import hashlib
import logging
import shutil
import uuid
from datetime import timedelta
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional
@@ -10,8 +11,10 @@ import tqdm
from celery import Task
from celery import shared_task
from django.conf import settings
from django.db import models
from django.db import transaction
from django.db.models.signals import post_save
from django.utils import timezone
from filelock import FileLock
from whoosh.writing import AsyncWriter
@@ -41,6 +44,7 @@ from documents.plugins.base import StopConsumeTaskError
from documents.plugins.helpers import ProgressStatusOptions
from documents.sanity_checker import SanityCheckFailedException
from documents.signals import document_updated
from documents.signals.handlers import cleanup_document_deletion
if settings.AUDIT_LOG_ENABLED:
import json
@@ -292,3 +296,29 @@ def update_document_archive_file(document_id):
)
finally:
parser.cleanup()
@shared_task
def empty_trash(doc_ids=None):
documents = (
Document.deleted_objects.filter(id__in=doc_ids)
if doc_ids is not None
else Document.deleted_objects.filter(
deleted_at__lt=timezone.localtime(timezone.now())
- timedelta(
days=settings.EMPTY_TRASH_DELAY,
),
)
)
try:
# Temporarily connect the cleanup handler
models.signals.post_delete.connect(cleanup_document_deletion, sender=Document)
documents.delete() # this is effectively a hard delete
except Exception as e: # pragma: no cover
logger.exception(f"Error while emptying trash: {e}")
finally:
models.signals.post_delete.disconnect(
cleanup_document_deletion,
sender=Document,
)

View File

@@ -0,0 +1,155 @@
from django.contrib.auth.models import User
from django.core.cache import cache
from rest_framework import status
from rest_framework.test import APITestCase
from documents.models import Document
class TestTrashAPI(APITestCase):
def setUp(self):
super().setUp()
self.user = User.objects.create_superuser(username="temp_admin")
self.client.force_authenticate(user=self.user)
cache.clear()
def test_api_trash(self):
"""
GIVEN:
- Existing document
WHEN:
- API request to delete document
- API request to restore document
- API request to empty trash
THEN:
- Document is moved to trash
- Document is restored from trash
- Trash is emptied
"""
document = Document.objects.create(
title="Title",
content="content",
checksum="checksum",
mime_type="application/pdf",
)
self.client.force_login(user=self.user)
self.client.delete(f"/api/documents/{document.pk}/")
self.assertEqual(Document.objects.count(), 0)
self.assertEqual(Document.global_objects.count(), 1)
self.assertEqual(Document.deleted_objects.count(), 1)
resp = self.client.get("/api/trash/")
self.assertEqual(resp.status_code, status.HTTP_200_OK)
self.assertEqual(resp.data["count"], 1)
resp = self.client.post(
"/api/trash/",
{"action": "restore", "documents": [document.pk]},
)
self.assertEqual(resp.status_code, status.HTTP_200_OK)
self.assertEqual(Document.objects.count(), 1)
resp = self.client.get("/api/trash/")
self.assertEqual(resp.status_code, status.HTTP_200_OK)
self.assertEqual(resp.data["count"], 0)
self.client.delete(f"/api/documents/{document.pk}/")
resp = self.client.post(
"/api/trash/",
{"action": "empty", "documents": [document.pk]},
)
self.assertEqual(resp.status_code, status.HTTP_200_OK)
self.assertEqual(Document.global_objects.count(), 0)
def test_trash_api_empty_all(self):
"""
GIVEN:
- Existing documents in trash
WHEN:
- API request to empty trash
THEN:
- Trash is emptied
"""
document = Document.objects.create(
title="Title",
content="content",
checksum="checksum",
mime_type="application/pdf",
)
document.delete()
document2 = Document.objects.create(
title="Title2",
content="content2",
checksum="checksum2",
mime_type="application/pdf",
)
document2.delete()
self.client.force_login(user=self.user)
resp = self.client.post(
"/api/trash/",
{"action": "empty", "documents": []},
)
self.assertEqual(resp.status_code, status.HTTP_200_OK)
self.assertEqual(Document.global_objects.count(), 0)
def test_api_trash_insufficient_permissions(self):
"""
GIVEN:
- Existing document with owner = user2 in trash
WHEN:
- user 1 makes API request to empty document from trash
THEN:
- 403 Forbidden
"""
user1 = User.objects.create_user(username="user1")
self.client.force_authenticate(user=user1)
self.client.force_login(user=user1)
user2 = User.objects.create_user(username="user2")
document = Document.objects.create(
title="Title",
content="content",
checksum="checksum",
mime_type="application/pdf",
owner=user2,
)
document.delete()
resp = self.client.post(
"/api/trash/",
{"action": "empty", "documents": [document.pk]},
)
self.assertEqual(resp.status_code, status.HTTP_403_FORBIDDEN)
self.assertEqual(Document.global_objects.count(), 1)
def test_api_trash_invalid_params(self):
"""
GIVEN:
- Existing documents
WHEN:
- API request to trash with invalid params
THEN:
- 400 Bad Request
"""
document = Document.objects.create(
title="Title",
content="content",
checksum="checksum",
mime_type="application/pdf",
)
self.client.force_login(user=self.user)
# document isn't in trash
resp = self.client.post(
"/api/trash/",
{"action": "restore", "documents": [document.pk]},
)
self.assertEqual(resp.status_code, status.HTTP_400_BAD_REQUEST)
self.assertIn("have not yet been deleted", resp.data["documents"][0])

View File

@@ -40,6 +40,7 @@ class TestApiUiSettings(DirectoriesMixin, APITestCase):
"app_title": None,
"app_logo": None,
"auditlog_enabled": True,
"trash_delay": 30,
"update_checking": {
"backend_setting": "default",
},

View File

@@ -10,6 +10,7 @@ from django.utils import timezone
from documents.models import Correspondent
from documents.models import Document
from documents.tasks import empty_trash
class TestDocument(TestCase):
@@ -43,10 +44,39 @@ class TestDocument(TestCase):
with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
document.delete()
empty_trash([document.pk])
mock_unlink.assert_any_call(file_path)
mock_unlink.assert_any_call(thumb_path)
self.assertEqual(mock_unlink.call_count, 2)
def test_document_soft_delete(self):
document = Document.objects.create(
correspondent=Correspondent.objects.create(name="Test0"),
title="Title",
content="content",
checksum="checksum",
mime_type="application/pdf",
)
file_path = document.source_path
thumb_path = document.thumbnail_path
Path(file_path).touch()
Path(thumb_path).touch()
with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
document.delete()
self.assertEqual(mock_unlink.call_count, 0)
self.assertEqual(Document.objects.count(), 0)
document.restore(strict=False)
self.assertEqual(Document.objects.count(), 1)
document.delete()
empty_trash([document.pk])
self.assertEqual(mock_unlink.call_count, 2)
def test_file_name(self):
doc = Document(
mime_type="application/pdf",

View File

@@ -19,6 +19,7 @@ from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import StoragePath
from documents.tasks import empty_trash
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
@@ -169,6 +170,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
# Ensure that filename is properly generated
document.filename = generate_filename(document)
document.save()
self.assertEqual(document.filename, "none/none.pdf")
create_source_path_directory(document.source_path)
@@ -176,6 +178,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
# Ensure file deletion after delete
document.delete()
empty_trash([document.pk])
self.assertIsNotFile(
os.path.join(settings.ORIGINALS_DIR, "none", "none.pdf"),
)
@@ -183,9 +186,9 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
@override_settings(
FILENAME_FORMAT="{correspondent}/{correspondent}",
TRASH_DIR=tempfile.mkdtemp(),
EMPTY_TRASH_DIR=tempfile.mkdtemp(),
)
def test_document_delete_trash(self):
def test_document_delete_trash_dir(self):
document = Document()
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
@@ -193,20 +196,22 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
# Ensure that filename is properly generated
document.filename = generate_filename(document)
document.save()
self.assertEqual(document.filename, "none/none.pdf")
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
# Ensure file was moved to trash after delete
self.assertIsNotFile(os.path.join(settings.TRASH_DIR, "none", "none.pdf"))
self.assertIsNotFile(os.path.join(settings.EMPTY_TRASH_DIR, "none", "none.pdf"))
document.delete()
empty_trash([document.pk])
self.assertIsNotFile(
os.path.join(settings.ORIGINALS_DIR, "none", "none.pdf"),
)
self.assertIsNotDir(os.path.join(settings.ORIGINALS_DIR, "none"))
self.assertIsFile(os.path.join(settings.TRASH_DIR, "none.pdf"))
self.assertIsNotFile(os.path.join(settings.TRASH_DIR, "none_01.pdf"))
self.assertIsFile(os.path.join(settings.EMPTY_TRASH_DIR, "none.pdf"))
self.assertIsNotFile(os.path.join(settings.EMPTY_TRASH_DIR, "none_01.pdf"))
# Create an identical document and ensure it is trashed under a new name
document = Document()
@@ -214,10 +219,12 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
document.filename = generate_filename(document)
document.save()
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
document.delete()
self.assertIsFile(os.path.join(settings.TRASH_DIR, "none_01.pdf"))
empty_trash([document.pk])
self.assertIsFile(os.path.join(settings.EMPTY_TRASH_DIR, "none_01.pdf"))
@override_settings(FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_document_delete_nofile(self):
@@ -227,6 +234,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
document.save()
document.delete()
empty_trash([document.pk])
@override_settings(FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_directory_not_empty(self):
@@ -436,6 +444,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
# Ensure that filename is properly generated
document.filename = generate_filename(document)
document.save()
self.assertEqual(document.filename, "none/none/none.pdf")
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
@@ -444,6 +453,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsDir(os.path.join(settings.ORIGINALS_DIR, "none/none"))
document.delete()
empty_trash([document.pk])
self.assertIsNotFile(
os.path.join(settings.ORIGINALS_DIR, "none/none/none.pdf"),
@@ -550,6 +560,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(document2.filename, "qwe_01.pdf")
document.delete()
empty_trash([document.pk])
self.assertIsNotFile(document.source_path)
@@ -819,6 +830,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, FileSystemAssertsMixin, Test
self.assertIsFile(doc.archive_path)
doc.delete()
empty_trash([doc.pk])
self.assertIsNotFile(original)
self.assertIsNotFile(archive)
@@ -854,6 +866,7 @@ class TestFileHandlingWithArchive(DirectoriesMixin, FileSystemAssertsMixin, Test
self.assertIsFile(doc2.source_path)
doc2.delete()
empty_trash([doc2.pk])
self.assertIsFile(doc1.source_path)
self.assertIsFile(doc1.archive_path)

View File

@@ -1,4 +1,5 @@
import os
from datetime import timedelta
from unittest import mock
from django.conf import settings
@@ -150,3 +151,36 @@ class TestBulkUpdate(DirectoriesMixin, TestCase):
)
tasks.bulk_update_documents([doc1.pk])
class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
GIVEN:
- Existing document in trash
WHEN:
- Empty trash task is called without doc_ids
THEN:
- Document is only deleted if it has been in trash for more than delay (default 30 days)
"""
def test_empty_trash(self):
doc = Document.objects.create(
title="test",
content="my document",
checksum="wow",
added=timezone.now(),
created=timezone.now(),
modified=timezone.now(),
)
doc.delete()
self.assertEqual(Document.global_objects.count(), 1)
self.assertEqual(Document.objects.count(), 0)
tasks.empty_trash()
self.assertEqual(Document.global_objects.count(), 1)
doc.deleted_at = timezone.now() - timedelta(days=31)
doc.save()
tasks.empty_trash()
self.assertEqual(Document.global_objects.count(), 0)

View File

@@ -142,12 +142,14 @@ from documents.serialisers import StoragePathSerializer
from documents.serialisers import TagSerializer
from documents.serialisers import TagSerializerVersion1
from documents.serialisers import TasksViewSerializer
from documents.serialisers import TrashSerializer
from documents.serialisers import UiSettingsViewSerializer
from documents.serialisers import WorkflowActionSerializer
from documents.serialisers import WorkflowSerializer
from documents.serialisers import WorkflowTriggerSerializer
from documents.signals import document_updated
from documents.tasks import consume_file
from documents.tasks import empty_trash
from paperless import version
from paperless.celery import app as celery_app
from paperless.config import GeneralConfig
@@ -1557,6 +1559,8 @@ class UiSettingsView(GenericAPIView):
"backend_setting": settings.ENABLE_UPDATE_CHECK,
}
ui_settings["trash_delay"] = settings.EMPTY_TRASH_DELAY
general_config = GeneralConfig()
ui_settings["app_title"] = settings.APP_TITLE
@@ -2050,3 +2054,41 @@ class SystemStatusView(PassUserMixin):
},
},
)
class TrashView(ListModelMixin, PassUserMixin):
permission_classes = (IsAuthenticated,)
serializer_class = TrashSerializer
filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,)
pagination_class = StandardPagination
model = Document
queryset = Document.deleted_objects.all()
def get(self, request, format=None):
self.serializer_class = DocumentSerializer
return self.list(request, format)
def post(self, request, *args, **kwargs):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
doc_ids = serializer.validated_data.get("documents")
docs = (
Document.global_objects.filter(id__in=doc_ids)
if doc_ids is not None
else Document.deleted_objects.all()
)
for doc in docs:
if not has_perms_owner_aware(request.user, "delete_document", doc):
return HttpResponseForbidden("Insufficient permissions")
action = serializer.validated_data.get("action")
if action == "restore":
for doc in Document.deleted_objects.filter(id__in=doc_ids).all():
doc.restore(strict=False)
elif action == "empty":
if doc_ids is None:
doc_ids = [doc.id for doc in docs]
empty_trash(doc_ids=doc_ids)
return Response({"result": "OK", "doc_ids": doc_ids})

View File

@@ -62,7 +62,7 @@ def paths_check(app_configs, **kwargs):
return (
path_check("PAPERLESS_DATA_DIR", settings.DATA_DIR)
+ path_check("PAPERLESS_TRASH_DIR", settings.TRASH_DIR)
+ path_check("PAPERLESS_EMPTY_TRASH_DIR", settings.EMPTY_TRASH_DIR)
+ path_check("PAPERLESS_MEDIA_ROOT", settings.MEDIA_ROOT)
+ path_check("PAPERLESS_CONSUMPTION_DIR", settings.CONSUMPTION_DIR)
)

View File

@@ -207,6 +207,17 @@ def _parse_beat_schedule() -> dict:
"expires": ((7.0 * 24.0) - 1.0) * 60.0 * 60.0,
},
},
{
"name": "Empty trash",
"env_key": "PAPERLESS_EMPTY_TRASH_TASK_CRON",
# Default daily at 01:00
"env_default": "0 1 * * *",
"task": "documents.tasks.empty_trash",
"options": {
# 1 hour before default schedule sends again
"expires": 23.0 * 60.0 * 60.0,
},
},
]
for task in tasks:
# Either get the environment setting or use the default
@@ -250,7 +261,11 @@ DATA_DIR = __get_path("PAPERLESS_DATA_DIR", BASE_DIR.parent / "data")
NLTK_DIR = __get_path("PAPERLESS_NLTK_DIR", "/usr/share/nltk_data")
TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")
# Check deprecated setting first
EMPTY_TRASH_DIR = os.getenv(
"PAPERLESS_TRASH_DIR",
os.getenv("PAPERLESS_EMPTY_TRASH_DIR"),
)
# Lock file for synchronizing changes to the MEDIA directory across multiple
# threads.
@@ -1148,3 +1163,9 @@ EMAIL_SUBJECT_PREFIX: Final[str] = "[Paperless-ngx] "
if DEBUG: # pragma: no cover
EMAIL_BACKEND = "django.core.mail.backends.filebased.EmailBackend"
EMAIL_FILE_PATH = BASE_DIR / "sent_emails"
###############################################################################
# Soft Delete
###############################################################################
EMPTY_TRASH_DELAY = max(__get_int("PAPERLESS_EMPTY_TRASH_DELAY", 30), 1)

View File

@@ -156,6 +156,7 @@ class TestCeleryScheduleParsing(TestCase):
CLASSIFIER_EXPIRE_TIME = 59.0 * 60.0
INDEX_EXPIRE_TIME = 23.0 * 60.0 * 60.0
SANITY_EXPIRE_TIME = ((7.0 * 24.0) - 1.0) * 60.0 * 60.0
EMPTY_TRASH_EXPIRE_TIME = 23.0 * 60.0 * 60.0
def test_schedule_configuration_default(self):
"""
@@ -190,6 +191,11 @@ class TestCeleryScheduleParsing(TestCase):
"schedule": crontab(minute=30, hour=0, day_of_week="sun"),
"options": {"expires": self.SANITY_EXPIRE_TIME},
},
"Empty trash": {
"task": "documents.tasks.empty_trash",
"schedule": crontab(minute=0, hour="1"),
"options": {"expires": self.EMPTY_TRASH_EXPIRE_TIME},
},
},
schedule,
)
@@ -232,6 +238,11 @@ class TestCeleryScheduleParsing(TestCase):
"schedule": crontab(minute=30, hour=0, day_of_week="sun"),
"options": {"expires": self.SANITY_EXPIRE_TIME},
},
"Empty trash": {
"task": "documents.tasks.empty_trash",
"schedule": crontab(minute=0, hour="1"),
"options": {"expires": self.EMPTY_TRASH_EXPIRE_TIME},
},
},
schedule,
)
@@ -266,6 +277,11 @@ class TestCeleryScheduleParsing(TestCase):
"schedule": crontab(minute=30, hour=0, day_of_week="sun"),
"options": {"expires": self.SANITY_EXPIRE_TIME},
},
"Empty trash": {
"task": "documents.tasks.empty_trash",
"schedule": crontab(minute=0, hour="1"),
"options": {"expires": self.EMPTY_TRASH_EXPIRE_TIME},
},
},
schedule,
)
@@ -286,6 +302,7 @@ class TestCeleryScheduleParsing(TestCase):
"PAPERLESS_TRAIN_TASK_CRON": "disable",
"PAPERLESS_SANITY_TASK_CRON": "disable",
"PAPERLESS_INDEX_TASK_CRON": "disable",
"PAPERLESS_EMPTY_TRASH_TASK_CRON": "disable",
},
):
schedule = _parse_beat_schedule()

View File

@@ -36,6 +36,7 @@ from documents.views import StoragePathViewSet
from documents.views import SystemStatusView
from documents.views import TagViewSet
from documents.views import TasksViewSet
from documents.views import TrashView
from documents.views import UiSettingsView
from documents.views import UnifiedSearchViewSet
from documents.views import WorkflowActionViewSet
@@ -159,6 +160,11 @@ urlpatterns = [
SystemStatusView.as_view(),
name="system_status",
),
re_path(
"^trash/",
TrashView.as_view(),
name="trash",
),
*api_router.urls,
],
),