Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-09-03 01:56:16 +00:00)

Commit: Merge bulk edit
@@ -1,575 +0,0 @@
from __future__ import annotations

import hashlib
import itertools
import logging
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Literal

from celery import chain
from celery import chord
from celery import group
from celery import shared_task
from django.conf import settings
from django.db.models import Q
from django.utils import timezone

from documents.permissions import set_permissions_for_object
from documents.plugins.helpers import DocumentsStatusManager
from documents.tasks import bulk_update_documents
from documents.tasks import consume_file
from documents.tasks import update_document_content_maybe_archive_file
from paperless.data_models import ConsumableDocument
from paperless.data_models import DocumentMetadataOverrides
from paperless.data_models import DocumentSource
from paperless.models import Correspondent
from paperless.models import CustomField
from paperless.models import CustomFieldInstance
from paperless.models import Document
from paperless.models import DocumentType
from paperless.models import StoragePath

if TYPE_CHECKING:
    from django.contrib.auth.models import User

logger: logging.Logger = logging.getLogger("paperless.bulk_edit")
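
# Editorial note (not part of the original file): each helper below receives a list
# of document primary keys, applies one bulk change, queues any follow-up work
# through Celery, and returns the literal string "OK".
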
def set_correspondent(
    doc_ids: list[int],
    correspondent: Correspondent,
) -> Literal["OK"]:
    if correspondent:
        correspondent = Correspondent.objects.only("pk").get(id=correspondent)

    qs = (
        Document.objects.filter(Q(id__in=doc_ids) & ~Q(correspondent=correspondent))
        .select_related("correspondent")
        .only("pk", "correspondent__id")
    )
    affected_docs = list(qs.values_list("pk", flat=True))
    qs.update(correspondent=correspondent)

    bulk_update_documents.delay(document_ids=affected_docs)

    return "OK"
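
# Editorial note: a minimal usage sketch, assuming documents 1-3 and a
# correspondent with pk=5 exist. Despite the type hint, the function is handed the
# correspondent's primary key and resolves the instance itself; a falsy value
# clears the field instead:
#
#   set_correspondent([1, 2, 3], correspondent=5)     # assign correspondent 5
#   set_correspondent([1, 2, 3], correspondent=None)  # clear the correspondent
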
def set_storage_path(doc_ids: list[int], storage_path: StoragePath) -> Literal["OK"]:
    if storage_path:
        storage_path = StoragePath.objects.only("pk").get(id=storage_path)

    qs = (
        Document.objects.filter(
            Q(id__in=doc_ids) & ~Q(storage_path=storage_path),
        )
        .select_related("storage_path")
        .only("pk", "storage_path__id")
    )
    affected_docs = list(qs.values_list("pk", flat=True))
    qs.update(storage_path=storage_path)

    bulk_update_documents.delay(
        document_ids=affected_docs,
    )

    return "OK"
def set_document_type(doc_ids: list[int], document_type: DocumentType) -> Literal["OK"]:
    if document_type:
        document_type = DocumentType.objects.only("pk").get(id=document_type)

    qs = (
        Document.objects.filter(Q(id__in=doc_ids) & ~Q(document_type=document_type))
        .select_related("document_type")
        .only("pk", "document_type__id")
    )
    affected_docs = list(qs.values_list("pk", flat=True))
    qs.update(document_type=document_type)

    bulk_update_documents.delay(document_ids=affected_docs)

    return "OK"
def add_tag(doc_ids: list[int], tag: int) -> Literal["OK"]:
    qs = Document.objects.filter(Q(id__in=doc_ids) & ~Q(tags__id=tag)).only("pk")
    affected_docs = list(qs.values_list("pk", flat=True))

    DocumentTagRelationship = Document.tags.through

    DocumentTagRelationship.objects.bulk_create(
        [DocumentTagRelationship(document_id=doc, tag_id=tag) for doc in affected_docs],
    )

    bulk_update_documents.delay(document_ids=affected_docs)

    return "OK"
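
# Editorial note: a hypothetical call, assuming tag 7 exists. Both add_tag and
# remove_tag below take a single tag pk, and their Q filters restrict the queryset
# to documents whose tag set would actually change:
#
#   add_tag([1, 2, 3], tag=7)
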
def remove_tag(doc_ids: list[int], tag: int) -> Literal["OK"]:
    qs = Document.objects.filter(Q(id__in=doc_ids) & Q(tags__id=tag)).only("pk")
    affected_docs = list(qs.values_list("pk", flat=True))

    DocumentTagRelationship = Document.tags.through

    DocumentTagRelationship.objects.filter(
        Q(document_id__in=affected_docs) & Q(tag_id=tag),
    ).delete()

    bulk_update_documents.delay(document_ids=affected_docs)

    return "OK"
def modify_tags(
    doc_ids: list[int],
    add_tags: list[int],
    remove_tags: list[int],
) -> Literal["OK"]:
    qs = Document.objects.filter(id__in=doc_ids).only("pk")
    affected_docs = list(qs.values_list("pk", flat=True))

    DocumentTagRelationship = Document.tags.through

    DocumentTagRelationship.objects.filter(
        document_id__in=affected_docs,
        tag_id__in=remove_tags,
    ).delete()

    DocumentTagRelationship.objects.bulk_create(
        [
            DocumentTagRelationship(document_id=doc, tag_id=tag)
            for (doc, tag) in itertools.product(affected_docs, add_tags)
        ],
        ignore_conflicts=True,
    )

    bulk_update_documents.delay(document_ids=affected_docs)

    return "OK"
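
# Editorial note: modify_tags creates one through-table row per (document, tag)
# pair via itertools.product; ignore_conflicts=True turns re-adding an existing
# tag into a database-level no-op instead of an IntegrityError.
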
def modify_custom_fields(
    doc_ids: list[int],
    add_custom_fields: list[int] | dict,
    remove_custom_fields: list[int],
) -> Literal["OK"]:
    qs = Document.objects.filter(id__in=doc_ids).only("pk")
    affected_docs = list(qs.values_list("pk", flat=True))
    # Ensure add_custom_fields is a list of tuples, supports old API
    add_custom_fields = (
        add_custom_fields.items()
        if isinstance(add_custom_fields, dict)
        else [(field, None) for field in add_custom_fields]
    )

    custom_fields = CustomField.objects.filter(
        id__in=[int(field) for field, _ in add_custom_fields],
    ).distinct()
    for field_id, value in add_custom_fields:
        for doc_id in affected_docs:
            defaults = {}
            custom_field = custom_fields.get(id=field_id)
            if custom_field:
                value_field = CustomFieldInstance.TYPE_TO_DATA_STORE_NAME_MAP[
                    custom_field.data_type
                ]
                defaults[value_field] = value
            CustomFieldInstance.objects.update_or_create(
                document_id=doc_id,
                field_id=field_id,
                defaults=defaults,
            )
            if custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK:
                doc = Document.objects.get(id=doc_id)
                reflect_doclinks(doc, custom_field, value)

    # For doc link fields that are being removed, remove symmetrical links
    for doclink_being_removed_instance in CustomFieldInstance.objects.filter(
        document_id__in=affected_docs,
        field__id__in=remove_custom_fields,
        field__data_type=CustomField.FieldDataType.DOCUMENTLINK,
        value_document_ids__isnull=False,
    ):
        for target_doc_id in doclink_being_removed_instance.value:
            remove_doclink(
                document=Document.objects.get(
                    id=doclink_being_removed_instance.document.id,
                ),
                field=doclink_being_removed_instance.field,
                target_doc_id=target_doc_id,
            )

    # Finally, remove the custom fields
    CustomFieldInstance.objects.filter(
        document_id__in=affected_docs,
        field_id__in=remove_custom_fields,
    ).hard_delete()

    bulk_update_documents.delay(document_ids=affected_docs)

    return "OK"
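
# Editorial note: a hedged sketch of the two accepted shapes for add_custom_fields
# (a plain list of field ids, the old API, stored with None values; or a
# {field_id: value} mapping), assuming fields 4 and 9 exist:
#
#   modify_custom_fields([1, 2], add_custom_fields=[4], remove_custom_fields=[])
#   modify_custom_fields([1, 2], add_custom_fields={4: "paid"}, remove_custom_fields=[9])
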
@shared_task
def delete(doc_ids: list[int]) -> Literal["OK"]:
    try:
        Document.objects.filter(id__in=doc_ids).delete()

        from documents import index

        with index.open_index_writer() as writer:
            for id in doc_ids:
                index.remove_document_by_id(writer, id)

        status_mgr = DocumentsStatusManager()
        status_mgr.send_documents_deleted(doc_ids)
    except Exception as e:
        if "Data too long for column" in str(e):
            logger.warning(
                "Detected a possible incompatible database column. See https://docs.paperless-ngx.com/troubleshooting/#convert-uuid-field",
            )
        logger.error(f"Error deleting documents: {e!s}")

    return "OK"
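
# Editorial note: delete is the one @shared_task in this module, which is what lets
# merge() and split() below chain it after consumption with delete.si(...), an
# immutable Celery signature that ignores the preceding task's result.
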
def reprocess(doc_ids: list[int]) -> Literal["OK"]:
    for document_id in doc_ids:
        update_document_content_maybe_archive_file.delay(
            document_id=document_id,
        )

    return "OK"
def set_permissions(
    doc_ids: list[int],
    set_permissions,
    *,
    owner=None,
    merge=False,
) -> Literal["OK"]:
    qs = Document.objects.filter(id__in=doc_ids).select_related("owner")

    if merge:
        # If merging, only set owner for documents that don't have an owner
        qs.filter(owner__isnull=True).update(owner=owner)
    else:
        qs.update(owner=owner)

    for doc in qs:
        set_permissions_for_object(permissions=set_permissions, object=doc, merge=merge)

    affected_docs = list(qs.values_list("pk", flat=True))

    bulk_update_documents.delay(document_ids=affected_docs)

    return "OK"
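
# Editorial note: with merge=True, only ownerless documents receive the new owner,
# and the same merge flag is forwarded to set_permissions_for_object, which
# (presumably) merges rather than replaces each document's permission set.
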
def rotate(doc_ids: list[int], degrees: int) -> Literal["OK"]:
    logger.info(
        f"Attempting to rotate {len(doc_ids)} documents by {degrees} degrees.",
    )
    qs = Document.objects.filter(id__in=doc_ids)
    affected_docs: list[int] = []
    import pikepdf

    rotate_tasks = []
    for doc in qs:
        if doc.mime_type != "application/pdf":
            logger.warning(
                f"Document {doc.id} is not a PDF, skipping rotation.",
            )
            continue
        try:
            with pikepdf.open(doc.source_path, allow_overwriting_input=True) as pdf:
                for page in pdf.pages:
                    page.rotate(degrees, relative=True)
                pdf.save()
                doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
                doc.save()
                rotate_tasks.append(
                    update_document_content_maybe_archive_file.s(
                        document_id=doc.id,
                    ),
                )
                logger.info(
                    f"Rotated document {doc.id} by {degrees} degrees",
                )
                affected_docs.append(doc.id)
        except Exception as e:
            logger.exception(f"Error rotating document {doc.id}: {e}")

    if len(affected_docs) > 0:
        bulk_update_task = bulk_update_documents.si(document_ids=affected_docs)
        chord(header=rotate_tasks, body=bulk_update_task).delay()

    return "OK"
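
# Editorial note: rotate fans out one content/archive refresh per rotated PDF and
# wires them into a Celery chord, so bulk_update_documents runs only after every
# per-document task has finished. A hypothetical clockwise rotation of two docs:
#
#   rotate([1, 2], degrees=90)
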
def merge(
    doc_ids: list[int],
    *,
    metadata_document_id: int | None = None,
    delete_originals: bool = False,
    archive_fallback: bool = False,
    user: User | None = None,
) -> Literal["OK"]:
    logger.info(
        f"Attempting to merge {len(doc_ids)} documents into a single document.",
    )
    qs = Document.objects.filter(id__in=doc_ids)
    affected_docs: list[int] = []
    import pikepdf

    merged_pdf = pikepdf.new()
    version: str = merged_pdf.pdf_version
    # use doc_ids to preserve order
    for doc_id in doc_ids:
        doc = qs.get(id=doc_id)
        try:
            doc_path = (
                doc.archive_path
                if archive_fallback
                and doc.mime_type != "application/pdf"
                and doc.has_archive_version
                else doc.source_path
            )
            with pikepdf.open(str(doc_path)) as pdf:
                version = max(version, pdf.pdf_version)
                merged_pdf.pages.extend(pdf.pages)
            affected_docs.append(doc.id)
        except Exception as e:
            logger.exception(
                f"Error merging document {doc.id}, it will not be included in the merge: {e}",
            )
    if len(affected_docs) == 0:
        logger.warning("No documents were merged")
        return "OK"

    filepath = (
        Path(
            tempfile.mkdtemp(dir=settings.SCRATCH_DIR),
        )
        / f"{'_'.join([str(doc_id) for doc_id in affected_docs])[:100]}_merged.pdf"
    )
    merged_pdf.remove_unreferenced_resources()
    merged_pdf.save(filepath, min_version=version)
    merged_pdf.close()

    if metadata_document_id:
        metadata_document = qs.get(id=metadata_document_id)
        if metadata_document is not None:
            overrides: DocumentMetadataOverrides = (
                DocumentMetadataOverrides.from_document(metadata_document)
            )
            overrides.title = metadata_document.title + " (merged)"
        else:
            overrides = DocumentMetadataOverrides()
    else:
        overrides = DocumentMetadataOverrides()

    if user is not None:
        overrides.owner_id = user.id

    logger.info("Adding merged document to the task queue.")

    consume_task = consume_file.s(
        ConsumableDocument(
            source=DocumentSource.ConsumeFolder,
            original_file=filepath,
        ),
        overrides,
    )

    if delete_originals:
        logger.info(
            "Queueing removal of original documents after consumption of merged document",
        )
        chain(consume_task, delete.si(affected_docs)).delay()
    else:
        consume_task.delay()

    return "OK"
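
# Editorial note: a hypothetical merge call. metadata_document_id selects which
# source document donates its metadata (and title) to the merged result, and
# delete_originals chains the delete task behind consumption of the merged file:
#
#   merge([3, 1, 2], metadata_document_id=3, delete_originals=True)
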
def split(
    doc_ids: list[int],
    pages: list[list[int]],
    *,
    delete_originals: bool = False,
    user: User | None = None,
) -> Literal["OK"]:
    logger.info(
        f"Attempting to split document {doc_ids[0]} into {len(pages)} documents",
    )
    doc = Document.objects.get(id=doc_ids[0])
    import pikepdf

    consume_tasks = []

    try:
        with pikepdf.open(doc.source_path) as pdf:
            for idx, split_doc in enumerate(pages):
                dst: pikepdf.Pdf = pikepdf.new()
                for page in split_doc:
                    dst.pages.append(pdf.pages[page - 1])
                filepath: Path = (
                    Path(
                        tempfile.mkdtemp(dir=settings.SCRATCH_DIR),
                    )
                    / f"{doc.id}_{split_doc[0]}-{split_doc[-1]}.pdf"
                )
                dst.remove_unreferenced_resources()
                dst.save(filepath)
                dst.close()

                overrides: DocumentMetadataOverrides = (
                    DocumentMetadataOverrides().from_document(doc)
                )
                overrides.title = f"{doc.title} (split {idx + 1})"
                if user is not None:
                    overrides.owner_id = user.id
                logger.info(
                    f"Adding split document with pages {split_doc} to the task queue.",
                )
                consume_tasks.append(
                    consume_file.s(
                        ConsumableDocument(
                            source=DocumentSource.ConsumeFolder,
                            original_file=filepath,
                        ),
                        overrides,
                    ),
                )

        if delete_originals:
            logger.info(
                "Queueing removal of original document after consumption of the split documents",
            )
            chord(header=consume_tasks, body=delete.si([doc.id])).delay()
        else:
            group(consume_tasks).delay()

    except Exception as e:
        logger.exception(f"Error splitting document {doc.id}: {e}")

    return "OK"
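
# Editorial note: split reads only doc_ids[0]; pages is a list of 1-indexed page
# groups, one output document per group. A hypothetical call splitting a five-page
# PDF into pages 1-2 and 3-5:
#
#   split([1], pages=[[1, 2], [3, 4, 5]])
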
def delete_pages(doc_ids: list[int], pages: list[int]) -> Literal["OK"]:
    logger.info(
        f"Attempting to delete pages {pages} from {len(doc_ids)} documents",
    )
    doc = Document.objects.get(id=doc_ids[0])
    pages = sorted(pages)  # sort pages to avoid index issues
    import pikepdf

    try:
        with pikepdf.open(doc.source_path, allow_overwriting_input=True) as pdf:
            offset = 1  # pages are 1-indexed
            for page_num in pages:
                pdf.pages.remove(pdf.pages[page_num - offset])
                offset += 1  # remove() changes the index of the pages
            pdf.remove_unreferenced_resources()
            pdf.save()
        doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
        if doc.page_count is not None:
            doc.page_count = doc.page_count - len(pages)
        doc.save()
        update_document_content_maybe_archive_file.delay(document_id=doc.id)
        logger.info(f"Deleted pages {pages} from document {doc.id}")
    except Exception as e:
        logger.exception(f"Error deleting pages from document {doc.id}: {e}")

    return "OK"
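
# Editorial note: a worked example of the offset arithmetic above. Deleting sorted
# pages [2, 4] from a five-page PDF: the first pass removes index 2 - 1 = 1
# (page 2); the remaining pages shift left, so the second pass removes
# index 4 - 2 = 2, which is the original page 4. Sorting first keeps this correct.
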
def reflect_doclinks(
    document: Document,
    field: CustomField,
    target_doc_ids: list[int],
):
    """
    Add or remove 'symmetrical' links to `document` on all `target_doc_ids`
    """

    if target_doc_ids is None:
        target_doc_ids = []

    # Check if any documents are going to be removed from the current list of links and remove the symmetrical links
    current_field_instance = CustomFieldInstance.objects.filter(
        field=field,
        document=document,
    ).first()
    if current_field_instance is not None and current_field_instance.value is not None:
        for doc_id in current_field_instance.value:
            if doc_id not in target_doc_ids:
                remove_doclink(
                    document=document,
                    field=field,
                    target_doc_id=doc_id,
                )

    # Create an instance if target doc doesn't have this field or append it to an existing one
    existing_custom_field_instances = {
        custom_field.document_id: custom_field
        for custom_field in CustomFieldInstance.objects.filter(
            field=field,
            document_id__in=target_doc_ids,
        )
    }
    custom_field_instances_to_create = []
    custom_field_instances_to_update = []
    for target_doc_id in target_doc_ids:
        target_doc_field_instance = existing_custom_field_instances.get(
            target_doc_id,
        )
        if target_doc_field_instance is None:
            custom_field_instances_to_create.append(
                CustomFieldInstance(
                    document_id=target_doc_id,
                    field=field,
                    value_document_ids=[document.id],
                ),
            )
        elif target_doc_field_instance.value is None:
            target_doc_field_instance.value_document_ids = [document.id]
            custom_field_instances_to_update.append(target_doc_field_instance)
        elif document.id not in target_doc_field_instance.value:
            target_doc_field_instance.value_document_ids.append(document.id)
            custom_field_instances_to_update.append(target_doc_field_instance)

    CustomFieldInstance.objects.bulk_create(custom_field_instances_to_create)
    CustomFieldInstance.objects.bulk_update(
        custom_field_instances_to_update,
        ["value_document_ids"],
    )
    Document.objects.filter(id__in=target_doc_ids).update(modified=timezone.now())
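
# Editorial note: document-link fields are kept symmetrical. If document 1 gains a
# link to document 2 through such a field, reflect_doclinks gives document 2 a
# matching link back to document 1 (creating a CustomFieldInstance if needed), and
# remove_doclink below deletes the reverse entry when a link is dropped.
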
def remove_doclink(
    document: Document,
    field: CustomField,
    target_doc_id: int,
):
    """
    Removes a 'symmetrical' link to `document` from the target document's existing custom field instance
    """
    target_doc_field_instance = CustomFieldInstance.objects.filter(
        document_id=target_doc_id,
        field=field,
    ).first()
    if (
        target_doc_field_instance is not None
        and document.id in target_doc_field_instance.value
    ):
        target_doc_field_instance.value.remove(document.id)
        target_doc_field_instance.save()
        Document.objects.filter(id=target_doc_id).update(modified=timezone.now())
@@ -15,8 +15,8 @@ from rest_framework.test import APITestCase
 from whoosh.writing import AsyncWriter
 
 from documents import index
-from documents.bulk_edit import set_permissions
 from documents.tests.utils import DirectoriesMixin
+from paperless.bulk_edit import set_permissions
 from paperless.models import Correspondent
 from paperless.models import CustomField
 from paperless.models import CustomFieldInstance
@@ -9,8 +9,8 @@ from guardian.shortcuts import assign_perm
 from guardian.shortcuts import get_groups_with_perms
 from guardian.shortcuts import get_users_with_perms
 
-from documents import bulk_edit
 from documents.tests.utils import DirectoriesMixin
+from paperless import bulk_edit
 from paperless.models import Correspondent
 from paperless.models import CustomField
 from paperless.models import CustomFieldInstance