Merge branch 'dev' into dev

This commit is contained in:
lufi
2024-11-21 21:09:15 +01:00
committed by GitHub
24 changed files with 293 additions and 84 deletions

View File

@@ -24,7 +24,7 @@ from documents.models import StoragePath
from documents.permissions import set_permissions_for_object
from documents.tasks import bulk_update_documents
from documents.tasks import consume_file
from documents.tasks import update_document_archive_file
from documents.tasks import update_document_content_maybe_archive_file
logger: logging.Logger = logging.getLogger("paperless.bulk_edit")
@@ -191,7 +191,7 @@ def delete(doc_ids: list[int]) -> Literal["OK"]:
def reprocess(doc_ids: list[int]) -> Literal["OK"]:
for document_id in doc_ids:
update_document_archive_file.delay(
update_document_content_maybe_archive_file.delay(
document_id=document_id,
)
@@ -245,7 +245,7 @@ def rotate(doc_ids: list[int], degrees: int) -> Literal["OK"]:
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
doc.save()
rotate_tasks.append(
update_document_archive_file.s(
update_document_content_maybe_archive_file.s(
document_id=doc.id,
),
)
@@ -423,7 +423,7 @@ def delete_pages(doc_ids: list[int], pages: list[int]) -> Literal["OK"]:
if doc.page_count is not None:
doc.page_count = doc.page_count - len(pages)
doc.save()
update_document_archive_file.delay(document_id=doc.id)
update_document_content_maybe_archive_file.delay(document_id=doc.id)
logger.info(f"Deleted pages {pages} from document {doc.id}")
except Exception as e:
logger.exception(f"Error deleting pages from document {doc.id}: {e}")

View File

@@ -9,7 +9,7 @@ from django.core.management.base import BaseCommand
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.tasks import update_document_archive_file
from documents.tasks import update_document_content_maybe_archive_file
logger = logging.getLogger("paperless.management.archiver")
@@ -77,13 +77,13 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
if self.process_count == 1:
for doc_id in document_ids:
update_document_archive_file(doc_id)
update_document_content_maybe_archive_file(doc_id)
else: # pragma: no cover
with multiprocessing.Pool(self.process_count) as pool:
list(
tqdm.tqdm(
pool.imap_unordered(
update_document_archive_file,
update_document_content_maybe_archive_file,
document_ids,
),
total=len(document_ids),

View File

@@ -0,0 +1,28 @@
# Generated by Django 5.1.1 on 2024-11-04 21:56
import django.db.models.deletion
from django.conf import settings
from django.db import migrations
from django.db import models
class Migration(migrations.Migration):
    """Add a nullable ``owner`` foreign key to the ``paperlesstask`` model."""

    # The new column points at the configured user model, so that model's
    # app must be migrated first, along with the previous documents migration.
    dependencies = [
        ("documents", "1056_customfieldinstance_deleted_at_and_more"),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.AddField(
            model_name="paperlesstask",
            name="owner",
            field=models.ForeignKey(
                to=settings.AUTH_USER_MODEL,
                # Deleting the user keeps the task and nulls the reference.
                on_delete=models.SET_NULL,
                verbose_name="owner",
                # Existing rows (and tasks without an owner) stay NULL.
                null=True,
                blank=True,
                default=None,
            ),
        ),
    ]

View File

@@ -641,7 +641,7 @@ class UiSettings(models.Model):
return self.user.username
class PaperlessTask(models.Model):
class PaperlessTask(ModelWithOwner):
ALL_STATES = sorted(states.ALL_STATES)
TASK_STATE_CHOICES = sorted(zip(ALL_STATES, ALL_STATES))

View File

@@ -1567,7 +1567,7 @@ class UiSettingsViewSerializer(serializers.ModelSerializer):
return ui_settings
class TasksViewSerializer(serializers.ModelSerializer):
class TasksViewSerializer(OwnedObjectSerializer):
class Meta:
model = PaperlessTask
depth = 1
@@ -1582,6 +1582,7 @@ class TasksViewSerializer(serializers.ModelSerializer):
"result",
"acknowledged",
"related_document",
"owner",
)
type = serializers.SerializerMethodField()

View File

@@ -940,9 +940,10 @@ def before_task_publish_handler(sender=None, headers=None, body=None, **kwargs):
close_old_connections()
task_args = body[0]
input_doc, _ = task_args
input_doc, overrides = task_args
task_file_name = input_doc.original_file.name
user_id = overrides.owner_id if overrides else None
PaperlessTask.objects.create(
task_id=headers["id"],
@@ -953,6 +954,7 @@ def before_task_publish_handler(sender=None, headers=None, body=None, **kwargs):
date_created=timezone.now(),
date_started=None,
date_done=None,
owner_id=user_id,
)
except Exception: # pragma: no cover
# Don't let an exception in the signal handlers prevent

View File

@@ -206,9 +206,10 @@ def bulk_update_documents(document_ids):
@shared_task
def update_document_archive_file(document_id):
def update_document_content_maybe_archive_file(document_id):
"""
Re-creates the archive file of a document, including new OCR content and thumbnail
Re-creates OCR content and thumbnail for a document, and archive file if
it exists.
"""
document = Document.objects.get(id=document_id)
@@ -234,8 +235,9 @@ def update_document_archive_file(document_id):
document.get_public_filename(),
)
if parser.get_archive_path():
with transaction.atomic():
with transaction.atomic():
oldDocument = Document.objects.get(pk=document.pk)
if parser.get_archive_path():
with open(parser.get_archive_path(), "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
# I'm going to save first so that in case the file move
@@ -246,7 +248,6 @@ def update_document_archive_file(document_id):
document,
archive_filename=True,
)
oldDocument = Document.objects.get(pk=document.pk)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text(),
@@ -268,24 +269,41 @@ def update_document_archive_file(document_id):
],
},
additional_data={
"reason": "Update document archive file",
"reason": "Update document content",
},
action=LogEntry.Action.UPDATE,
)
else:
Document.objects.filter(pk=document.pk).update(
content=parser.get_text(),
)
if settings.AUDIT_LOG_ENABLED:
LogEntry.objects.log_create(
instance=oldDocument,
changes={
"content": [oldDocument.content, parser.get_text()],
},
additional_data={
"reason": "Update document content",
},
action=LogEntry.Action.UPDATE,
)
with FileLock(settings.MEDIA_LOCK):
with FileLock(settings.MEDIA_LOCK):
if parser.get_archive_path():
create_source_path_directory(document.archive_path)
shutil.move(parser.get_archive_path(), document.archive_path)
shutil.move(thumbnail, document.thumbnail_path)
shutil.move(thumbnail, document.thumbnail_path)
document.refresh_from_db()
logger.info(
f"Updating index for document {document_id} ({document.archive_checksum})",
)
with index.open_index_writer() as writer:
index.update_document(writer, document)
document.refresh_from_db()
logger.info(
f"Updating index for document {document_id} ({document.archive_checksum})",
)
with index.open_index_writer() as writer:
index.update_document(writer, document)
clear_document_caches(document.pk)
clear_document_caches(document.pk)
except Exception:
logger.exception(

View File

@@ -1,6 +1,7 @@
import uuid
import celery
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from rest_framework import status
from rest_framework.test import APITestCase
@@ -11,7 +12,6 @@ from documents.tests.utils import DirectoriesMixin
class TestTasks(DirectoriesMixin, APITestCase):
ENDPOINT = "/api/tasks/"
ENDPOINT_ACKNOWLEDGE = "/api/acknowledge_tasks/"
def setUp(self):
super().setUp()
@@ -125,7 +125,7 @@ class TestTasks(DirectoriesMixin, APITestCase):
self.assertEqual(len(response.data), 1)
response = self.client.post(
self.ENDPOINT_ACKNOWLEDGE,
self.ENDPOINT + "acknowledge/",
{"tasks": [task.id]},
)
self.assertEqual(response.status_code, status.HTTP_200_OK)
@@ -133,6 +133,52 @@ class TestTasks(DirectoriesMixin, APITestCase):
response = self.client.get(self.ENDPOINT)
self.assertEqual(len(response.data), 0)
def test_tasks_owner_aware(self):
    """
    GIVEN:
        - A task owned by self.user, a task without owner and a task
          owned by a second, newly created user
    WHEN:
        - The second user lists the tasks and then tries to acknowledge
          all three of them
    THEN:
        - Only the unowned task and the user's own task are listed
        - Only those two tasks are reported as acknowledged
    """
    requester = User.objects.create_user(username="test")
    requester.user_permissions.add(*Permission.objects.all())
    self.client.logout()
    self.client.force_authenticate(user=requester)

    def make_task(file_name, **extra):
        # Every task gets a fresh random task_id; only name/owner vary.
        return PaperlessTask.objects.create(
            task_id=str(uuid.uuid4()),
            task_file_name=file_name,
            **extra,
        )

    foreign_task = make_task("task_one.pdf", owner=self.user)
    unowned_task = make_task("task_two.pdf")
    own_task = make_task("task_three.pdf", owner=requester)

    listing = self.client.get(self.ENDPOINT)

    self.assertEqual(listing.status_code, status.HTTP_200_OK)
    self.assertEqual(len(listing.data), 2)
    # The endpoint is expected to return the requester's task first,
    # followed by the unowned one.
    listed_ids = [entry["task_id"] for entry in listing.data]
    self.assertEqual(listed_ids, [own_task.task_id, unowned_task.task_id])

    ack = self.client.post(
        self.ENDPOINT + "acknowledge/",
        {"tasks": [foreign_task.id, unowned_task.id, own_task.id]},
    )

    self.assertEqual(ack.status_code, status.HTTP_200_OK)
    # The task owned by self.user must not be touched by this request.
    self.assertEqual(ack.data, {"result": 2})
def test_task_result_no_error(self):
"""
GIVEN:

View File

@@ -607,7 +607,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
mock_consume_file.assert_not_called()
@mock.patch("documents.tasks.bulk_update_documents.si")
@mock.patch("documents.tasks.update_document_archive_file.s")
@mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
@mock.patch("celery.chord.delay")
def test_rotate(self, mock_chord, mock_update_document, mock_update_documents):
"""
@@ -626,7 +626,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
self.assertEqual(result, "OK")
@mock.patch("documents.tasks.bulk_update_documents.si")
@mock.patch("documents.tasks.update_document_archive_file.s")
@mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
@mock.patch("pikepdf.Pdf.save")
def test_rotate_with_error(
self,
@@ -654,7 +654,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
mock_update_archive_file.assert_not_called()
@mock.patch("documents.tasks.bulk_update_documents.si")
@mock.patch("documents.tasks.update_document_archive_file.s")
@mock.patch("documents.tasks.update_document_content_maybe_archive_file.s")
@mock.patch("celery.chord.delay")
def test_rotate_non_pdf(
self,
@@ -680,7 +680,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
mock_chord.assert_called_once()
self.assertEqual(result, "OK")
@mock.patch("documents.tasks.update_document_archive_file.delay")
@mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
@mock.patch("pikepdf.Pdf.save")
def test_delete_pages(self, mock_pdf_save, mock_update_archive_file):
"""
@@ -705,7 +705,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
self.doc2.refresh_from_db()
self.assertEqual(self.doc2.page_count, expected_page_count)
@mock.patch("documents.tasks.update_document_archive_file.delay")
@mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
@mock.patch("pikepdf.Pdf.save")
def test_delete_pages_with_error(self, mock_pdf_save, mock_update_archive_file):
"""

View File

@@ -13,7 +13,7 @@ from django.test import override_settings
from documents.file_handling import generate_filename
from documents.models import Document
from documents.tasks import update_document_archive_file
from documents.tasks import update_document_content_maybe_archive_file
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
@@ -46,7 +46,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
)
update_document_archive_file(doc.pk)
update_document_content_maybe_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id)
@@ -63,7 +63,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
doc.save()
shutil.copy(sample_file, doc.source_path)
update_document_archive_file(doc.pk)
update_document_content_maybe_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id)
@@ -94,8 +94,8 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
os.path.join(self.dirs.originals_dir, "document_01.pdf"),
)
update_document_archive_file(doc2.pk)
update_document_archive_file(doc1.pk)
update_document_content_maybe_archive_file(doc2.pk)
update_document_content_maybe_archive_file(doc1.pk)
doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)

View File

@@ -5,6 +5,7 @@ import celery
from django.test import TestCase
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.models import PaperlessTask
from documents.signals.handlers import before_task_publish_handler
@@ -48,7 +49,10 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase):
source=DocumentSource.ConsumeFolder,
original_file="/consume/hello-999.pdf",
),
None,
DocumentMetadataOverrides(
title="Hello world",
owner_id=1,
),
),
# kwargs
{},
@@ -65,6 +69,7 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase):
self.assertEqual(headers["id"], task.task_id)
self.assertEqual("hello-999.pdf", task.task_file_name)
self.assertEqual("documents.tasks.consume_file", task.task_name)
self.assertEqual(1, task.owner_id)
self.assertEqual(celery.states.PENDING, task.status)
def test_task_prerun_handler(self):

View File

@@ -1,5 +1,7 @@
import os
import shutil
from datetime import timedelta
from pathlib import Path
from unittest import mock
from django.conf import settings
@@ -184,3 +186,75 @@ class TestEmptyTrashTask(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
tasks.empty_trash()
self.assertEqual(Document.global_objects.count(), 0)
class TestUpdateContent(DirectoriesMixin, TestCase):
    """Tests for ``tasks.update_document_content_maybe_archive_file``."""

    # Values stored on the document before the task runs. The assertions
    # compare against these so they only pass if the task really changed
    # the stored data.
    INITIAL_CONTENT = "my document"
    INITIAL_ARCHIVE_CHECKSUM = "wow"

    def _copy_sample_pdf(self, destination):
        """Copy the bundled ``0000001.pdf`` sample to ``destination``; return it."""
        sample = (
            Path(__file__).parent
            / "samples"
            / "documents"
            / "originals"
            / "0000001.pdf"
        )
        shutil.copy(sample, destination)
        return destination

    def test_update_content_maybe_archive_file(self):
        """
        GIVEN:
            - Existing document with archive file
        WHEN:
            - Update content task is called
        THEN:
            - Document is reprocessed, content and checksum are updated
        """
        sample1 = self._copy_sample_pdf(self.dirs.scratch_dir / "sample.pdf")
        sample1_archive = self._copy_sample_pdf(
            self.dirs.archive_dir / "sample_archive.pdf",
        )
        doc = Document.objects.create(
            title="test",
            content=self.INITIAL_CONTENT,
            checksum="wow",
            archive_checksum=self.INITIAL_ARCHIVE_CHECKSUM,
            filename=sample1,
            mime_type="application/pdf",
            archive_filename=sample1_archive,
        )

        tasks.update_document_content_maybe_archive_file(doc.pk)

        updated = Document.objects.get(pk=doc.pk)
        # Compare with the value the document started with; comparing with
        # any other string would pass even if nothing had been updated.
        self.assertNotEqual(updated.content, self.INITIAL_CONTENT)
        self.assertNotEqual(
            updated.archive_checksum,
            self.INITIAL_ARCHIVE_CHECKSUM,
        )

    def test_update_content_maybe_archive_file_no_archive(self):
        """
        GIVEN:
            - Existing document without archive file
        WHEN:
            - Update content task is called
        THEN:
            - Document is reprocessed, content is updated
        """
        sample1 = self._copy_sample_pdf(self.dirs.scratch_dir / "sample.pdf")
        doc = Document.objects.create(
            title="test",
            content=self.INITIAL_CONTENT,
            checksum="wow",
            filename=sample1,
            mime_type="application/pdf",
        )

        tasks.update_document_content_maybe_archive_file(doc.pk)

        updated = Document.objects.get(pk=doc.pk)
        # Must differ from the initial content to prove the task wrote it.
        self.assertNotEqual(updated.content, self.INITIAL_CONTENT)

View File

@@ -1705,6 +1705,7 @@ class RemoteVersionView(GenericAPIView):
class TasksViewSet(ReadOnlyModelViewSet):
permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
serializer_class = TasksViewSerializer
filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,)
def get_queryset(self):
queryset = (
@@ -1719,19 +1720,17 @@ class TasksViewSet(ReadOnlyModelViewSet):
queryset = PaperlessTask.objects.filter(task_id=task_id)
return queryset
class AcknowledgeTasksView(GenericAPIView):
permission_classes = (IsAuthenticated,)
serializer_class = AcknowledgeTasksViewSerializer
def post(self, request, *args, **kwargs):
serializer = self.get_serializer(data=request.data)
@action(methods=["post"], detail=False)
def acknowledge(self, request):
serializer = AcknowledgeTasksViewSerializer(data=request.data)
serializer.is_valid(raise_exception=True)
tasks = serializer.validated_data.get("tasks")
task_ids = serializer.validated_data.get("tasks")
try:
result = PaperlessTask.objects.filter(id__in=tasks).update(
tasks = PaperlessTask.objects.filter(id__in=task_ids)
if request.user is not None and not request.user.is_superuser:
tasks = tasks.filter(owner=request.user) | tasks.filter(owner=None)
result = tasks.update(
acknowledged=True,
)
return Response({"result": result})