Merge branch 'dev' into feature-ai

shamoon
2025-08-06 16:04:06 -04:00
20 changed files with 410 additions and 319 deletions

@@ -1,8 +1,8 @@
 from __future__ import annotations

 import logging
-import os
 import shutil
+from pathlib import Path
 from typing import TYPE_CHECKING

 import httpx
@@ -12,11 +12,13 @@ from celery.signals import before_task_publish
 from celery.signals import task_failure
 from celery.signals import task_postrun
 from celery.signals import task_prerun
+from celery.signals import worker_process_init
 from django.conf import settings
 from django.contrib.auth.models import Group
 from django.contrib.auth.models import User
 from django.db import DatabaseError
 from django.db import close_old_connections
+from django.db import connections
 from django.db import models
 from django.db.models import Q
 from django.dispatch import receiver
@@ -51,8 +53,6 @@ from documents.templating.workflows import parse_w_workflow_placeholders
 from paperless.config import AIConfig

 if TYPE_CHECKING:
-    from pathlib import Path
-
     from documents.classifier import DocumentClassifier
     from documents.data_models import ConsumableDocument
     from documents.data_models import DocumentMetadataOverrides
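Moving `from pathlib import Path` out of the `if TYPE_CHECKING:` block is required by the rest of this commit: names imported under TYPE_CHECKING exist for the type checker but are never bound at runtime, and the handlers below now call Path(...) in live code. A minimal illustration (hypothetical module):

    from __future__ import annotations
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        from pathlib import Path  # visible to mypy/pyright only

    def stem_of(p: Path) -> str:  # fine: the annotation is never evaluated
        return p.stem

    # Calling Path("x") anywhere in this module would raise NameError at
    # runtime; runtime use needs a real top-level import, as this hunk makes.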
@@ -329,15 +329,16 @@ def cleanup_document_deletion(sender, instance, **kwargs):
         # Find a non-conflicting filename in case a document with the same
         # name was moved to trash earlier
         counter = 0
-        old_filename = os.path.split(instance.source_path)[1]
-        (old_filebase, old_fileext) = os.path.splitext(old_filename)
+        old_filename = Path(instance.source_path).name
+        old_filebase = Path(old_filename).stem
+        old_fileext = Path(old_filename).suffix

         while True:
             new_file_path = settings.EMPTY_TRASH_DIR / (
                 old_filebase + (f"_{counter:02}" if counter else "") + old_fileext
             )
-            if os.path.exists(new_file_path):
+            if new_file_path.exists():
                 counter += 1
             else:
                 break
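Read in isolation, the loop this hunk rewrites implements a simple collision-avoidance scheme: keep appending _01, _02, ... to the stem until the name is free in the trash directory. A minimal standalone sketch (the function name and trash_dir parameter are illustrative; the real code uses settings.EMPTY_TRASH_DIR):

    from pathlib import Path

    def non_conflicting_path(trash_dir: Path, source_path: Path) -> Path:
        """Return trash_dir/<name>, appending _01, _02, ... while the name is taken."""
        counter = 0
        while True:
            candidate = trash_dir / (
                source_path.stem
                + (f"_{counter:02}" if counter else "")
                + source_path.suffix
            )
            if not candidate.exists():
                return candidate
            counter += 1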
@@ -361,26 +362,26 @@ def cleanup_document_deletion(sender, instance, **kwargs):
             files += (instance.source_path,)

         for filename in files:
-            if filename and os.path.isfile(filename):
+            if filename and filename.is_file():
                 try:
-                    os.unlink(filename)
+                    filename.unlink()
                     logger.debug(f"Deleted file {filename}.")
                 except OSError as e:
                     logger.warning(
                         f"While deleting document {instance!s}, the file "
                         f"{filename} could not be deleted: {e}",
                     )
-            elif filename and not os.path.isfile(filename):
+            elif filename and not filename.is_file():
                 logger.warning(f"Expected {filename} to exist, but it did not")

         delete_empty_directories(
-            os.path.dirname(instance.source_path),
+            Path(instance.source_path).parent,
             root=settings.ORIGINALS_DIR,
         )

         if instance.has_archive_version:
             delete_empty_directories(
-                os.path.dirname(instance.archive_path),
+                Path(instance.archive_path).parent,
                 root=settings.ARCHIVE_DIR,
             )
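All of the conversions in this hunk follow the same one-to-one mapping from os.path functions to pathlib methods; summarized as a sketch (the path value is hypothetical):

    from pathlib import Path

    p = Path("/data/originals/doc.pdf")  # hypothetical path, purely illustrative

    assert p.name == "doc.pdf"                   # os.path.split(p)[1]
    assert p.stem == "doc"                       # os.path.splitext(p.name)[0]
    assert p.suffix == ".pdf"                    # os.path.splitext(p.name)[1]
    assert p.parent == Path("/data/originals")   # os.path.dirname(p)
    # Filesystem-touching equivalents used above:
    # p.is_file()  <- os.path.isfile(p)
    # p.exists()   <- os.path.exists(p)
    # p.unlink()   <- os.unlink(p)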
@@ -401,14 +402,14 @@ def update_filename_and_move_files(
     if isinstance(instance, CustomFieldInstance):
         instance = instance.document

-    def validate_move(instance, old_path, new_path):
-        if not os.path.isfile(old_path):
+    def validate_move(instance, old_path: Path, new_path: Path):
+        if not old_path.is_file():
             # Can't do anything if the old file does not exist anymore.
             msg = f"Document {instance!s}: File {old_path} doesn't exist."
             logger.fatal(msg)
             raise CannotMoveFilesException(msg)

-        if os.path.isfile(new_path):
+        if new_path.is_file():
             # Can't do anything if the new file already exists. Skip updating file.
             msg = f"Document {instance!s}: Cannot rename file since target path {new_path} already exists."
             logger.warning(msg)
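The new type hints make the contract of validate_move explicit: the source must still exist and the target must be free, otherwise the rename is aborted. A hypothetical standalone restatement of those two guards (the exception class stub stands in for the one defined in this module):

    from pathlib import Path

    class CannotMoveFilesException(Exception):  # stub for the module's own class
        pass

    def can_move(old_path: Path, new_path: Path) -> bool:
        """Preconditions for a rename: source must exist, target must be free."""
        if not old_path.is_file():
            raise CannotMoveFilesException(f"File {old_path} doesn't exist.")
        if new_path.is_file():
            return False  # skip the rename rather than overwrite
        return True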
@@ -436,16 +437,20 @@ def update_filename_and_move_files(
         old_filename = instance.filename
         old_source_path = instance.source_path

-        instance.filename = generate_unique_filename(instance)
+        # Need to convert to string to be able to save it to the db
+        instance.filename = str(generate_unique_filename(instance))
         move_original = old_filename != instance.filename

         old_archive_filename = instance.archive_filename
         old_archive_path = instance.archive_path

         if instance.has_archive_version:
-            instance.archive_filename = generate_unique_filename(
-                instance,
-                archive_filename=True,
+            # Need to convert to string to be able to save it to the db
+            instance.archive_filename = str(
+                generate_unique_filename(
+                    instance,
+                    archive_filename=True,
+                ),
             )
             move_archive = old_archive_filename != instance.archive_filename
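The new str() wrapping suggests generate_unique_filename now returns a Path, while Document.filename is a text column. Beyond database storage, the conversion also keeps the move_original/move_archive comparisons meaningful, since a str never compares equal to a Path. A small illustration (the filename value is made up):

    from pathlib import Path

    generated = Path("inbox/scan_0001.pdf")  # hypothetical generator output
    stored = str(generated)                  # plain text, safe for a CharField

    assert stored == "inbox/scan_0001.pdf"
    assert generated != "inbox/scan_0001.pdf"  # Path vs str: never equal, so an
                                               # unwrapped value would always
                                               # look like a pending rename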
@@ -487,11 +492,11 @@ def update_filename_and_move_files(
             # Try to move files to their original location.
             try:
-                if move_original and os.path.isfile(instance.source_path):
+                if move_original and instance.source_path.is_file():
                     logger.info("Restoring previous original path")
                     shutil.move(instance.source_path, old_source_path)

-                if move_archive and os.path.isfile(instance.archive_path):
+                if move_archive and instance.archive_path.is_file():
                     logger.info("Restoring previous archive path")
                     shutil.move(instance.archive_path, old_archive_path)
@@ -512,17 +517,15 @@ def update_filename_and_move_files(
         # finally, remove any empty sub folders. This will do nothing if
         # something has failed above.
-        if not os.path.isfile(old_source_path):
+        if not old_source_path.is_file():
             delete_empty_directories(
-                os.path.dirname(old_source_path),
+                Path(old_source_path).parent,
                 root=settings.ORIGINALS_DIR,
             )

-        if instance.has_archive_version and not os.path.isfile(
-            old_archive_path,
-        ):
+        if instance.has_archive_version and not old_archive_path.is_file():
             delete_empty_directories(
-                os.path.dirname(old_archive_path),
+                Path(old_archive_path).parent,
                 root=settings.ARCHIVE_DIR,
             )
@@ -1228,10 +1231,7 @@ def run_workflows(
             )

             files = None
             if action.webhook.include_document:
-                with open(
-                    original_file,
-                    "rb",
-                ) as f:
+                with original_file.open("rb") as f:
                     files = {
                         "file": (
                             filename,
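Path.open("rb") is a drop-in replacement for the built-in open() call it removes, and the handle feeds the multipart files mapping that the webhook posts (httpx is imported at the top of this file). A hedged sketch of the surrounding pattern, with placeholder path, URL, and content type:

    from pathlib import Path
    import httpx

    original_file = Path("/data/originals/doc.pdf")  # placeholder
    with original_file.open("rb") as f:
        files = {"file": ("doc.pdf", f, "application/pdf")}
        # Stream the document as multipart/form-data, as the webhook action does.
        httpx.post("https://example.com/hook", files=files)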
@@ -1452,6 +1452,21 @@ def task_failure_handler(
         logger.exception("Updating PaperlessTask failed")


+@worker_process_init.connect
+def close_connection_pool_on_worker_init(**kwargs):
+    """
+    Close the DB connection pool for each Celery child process after it starts.
+
+    This is necessary because the parent process parses the Django configuration,
+    initializes connection pools, and then forks.
+
+    Closing these pools after forking ensures child processes have a valid
+    connection of their own.
+    """
+    for conn in connections.all(initialized_only=True):
+        if conn.alias == "default" and hasattr(conn, "pool") and conn.pool:
+            conn.close_pool()
+
+
 def add_or_update_document_in_llm_index(sender, document, **kwargs):
     """
     Add or update a document in the LLM index when it is created or updated.
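For context on close_connection_pool_on_worker_init above: it only has an effect when Django's native connection pooling (Django 5.1+ with psycopg 3) is enabled, since conn.pool is otherwise absent or None. A sketch of the kind of settings that create the pool the handler closes; all values are illustrative:

    # settings.py (illustrative)
    DATABASES = {
        "default": {
            "ENGINE": "django.db.backends.postgresql",
            "NAME": "paperless",
            "OPTIONS": {
                "pool": True,  # per-process psycopg connection pool (Django >= 5.1)
            },
        },
    }
    # A forked Celery worker inherits the parent's pool object; closing it on
    # worker_process_init forces each child to open fresh connections of its own.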