Chore: Standardize subprocess running and logging (#6275)

2025-06-28 15:54:41 -05:00 · 2024-04-04 13:11:43 -07:00 · 2024-04-04 13:11:43 -07:00 · 2c43b06910
commit 2c43b06910
parent 0f8b2e69c9
6 changed files with 94 additions and 76 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@ -5,8 +5,6 @@ import tempfile
 import uuid
 from enum import Enum
 from pathlib import Path
 from subprocess import CompletedProcess
 from subprocess import run
 from typing import TYPE_CHECKING
 from typing import Optional
@ -51,6 +49,7 @@ from documents.signals import document_consumption_finished
 from documents.signals import document_consumption_started
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
 class WorkflowTriggerPlugin(
@ -397,20 +396,15 @@ class Consumer(LoggingMixin):
        script_env["TASK_ID"] = self.task_id or ""
        try:
-            completed_proc = run(
+            run_subprocess(
-                args=[
+                [
                    settings.PRE_CONSUME_SCRIPT,
                    original_file_path,
                ],
-                env=script_env,
+                script_env,
-                capture_output=True,
+                self.log,
            )
            self._log_script_outputs(completed_proc)
            # Raises exception on non-zero output
            completed_proc.check_returncode()
        except Exception as e:
            self._fail(
                ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_ERROR,
@ -468,8 +462,8 @@ class Consumer(LoggingMixin):
        script_env["TASK_ID"] = self.task_id or ""
        try:
-            completed_proc = run(
+            run_subprocess(
-                args=[
+                [
                    settings.POST_CONSUME_SCRIPT,
                    str(document.pk),
                    document.get_public_filename(),
@ -480,15 +474,10 @@ class Consumer(LoggingMixin):
                    str(document.correspondent),
                    str(",".join(document.tags.all().values_list("name", flat=True))),
                ],
-                env=script_env,
+                script_env,
-                capture_output=True,
+                self.log,
            )
            self._log_script_outputs(completed_proc)
            # Raises exception on non-zero output
            completed_proc.check_returncode()
        except Exception as e:
            self._fail(
                ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_ERROR,
@ -929,41 +918,6 @@ class Consumer(LoggingMixin):
        except Exception:  # pragma: no cover
            pass
    def _log_script_outputs(self, completed_process: CompletedProcess):
        """
        Decodes a process stdout and stderr streams and logs them to the main log
        """
        # Log what the script exited as
        self.log.info(
            f"{completed_process.args[0]} exited {completed_process.returncode}",
        )
        # Decode the output (if any)
        if len(completed_process.stdout):
            stdout_str = (
                completed_process.stdout.decode("utf8", errors="ignore")
                .strip()
                .split(
                    "\n",
                )
            )
            self.log.info("Script stdout:")
            for line in stdout_str:
                self.log.info(line)
        if len(completed_process.stderr):
            stderr_str = (
                completed_process.stderr.decode("utf8", errors="ignore")
                .strip()
                .split(
                    "\n",
                )
            )
            self.log.warning("Script stderr:")
            for line in stderr_str:
                self.log.warning(line)
 def parse_doc_title_w_placeholders(
    title: str,
--- a/src/documents/converters.py
+++ b/src/documents/converters.py
@ -1,5 +1,4 @@
 from pathlib import Path
 from subprocess import run
 import img2pdf
 from django.conf import settings
@ -7,6 +6,7 @@ from PIL import Image
 from documents.utils import copy_basic_file_stats
 from documents.utils import maybe_override_pixel_limit
 from documents.utils import run_subprocess
 def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path:
@ -27,7 +27,7 @@ def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path:
        # Note the save into the temp folder, so as not to trigger a new
        # consume
        scratch_image = target_directory / tiff_path.name
-        run(
+        run_subprocess(
            [
                settings.CONVERT_BINARY,
                "-alpha",
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@ -18,6 +18,7 @@ from django.utils import timezone
 from documents.loggers import LoggingMixin
 from documents.signals import document_consumer_declaration
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
 # This regular expression will try to find dates in the document at
 # hand and will match the following formats:
@ -164,8 +165,12 @@ def run_convert(
    logger.debug("Execute: " + " ".join(args), extra={"group": logging_group})
-    if not subprocess.Popen(args, env=environment).wait() == 0:
+    try:
-        raise ParseError(f"Convert failed at {args}")
+        run_subprocess(args, environment, logger)
    except subprocess.CalledProcessError as e:
        raise ParseError(f"Convert failed at {args}") from e
    except Exception as e:  # pragma: no cover
        raise ParseError("Unknown error running convert") from e
 def get_default_thumbnail() -> Path:
@ -188,9 +193,12 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -
    # Ghostscript doesn't handle WebP outputs
    gs_out_path = os.path.join(temp_dir, "gs_out.png")
    cmd = [settings.GS_BINARY, "-q", "-sDEVICE=pngalpha", "-o", gs_out_path, in_path]
    try:
-        if not subprocess.Popen(cmd).wait() == 0:
+        try:
-            raise ParseError(f"Thumbnail (gs) failed at {cmd}")
+            run_subprocess(cmd, logger=logger)
        except subprocess.CalledProcessError as e:
            raise ParseError(f"Thumbnail (gs) failed at {cmd}") from e
        # then run convert on the output from gs to make WebP
        run_convert(
            density=300,
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@ -934,7 +934,7 @@ class PreConsumeTestCase(TestCase):
        return super().setUp()
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
    @override_settings(PRE_CONSUME_SCRIPT=None)
    def test_no_pre_consume_script(self, m):
        c = Consumer()
@ -942,7 +942,7 @@ class PreConsumeTestCase(TestCase):
        c.run_pre_consume_script()
        m.assert_not_called()
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
    @mock.patch("documents.consumer.Consumer._send_progress")
    @override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
    def test_pre_consume_script_not_found(self, m, m2):
@ -951,7 +951,7 @@ class PreConsumeTestCase(TestCase):
        c.working_copy = "path-to-file"
        self.assertRaises(ConsumerError, c.run_pre_consume_script)
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
    def test_pre_consume_script(self, m):
        with tempfile.NamedTemporaryFile() as script:
            with override_settings(PRE_CONSUME_SCRIPT=script.name):
@ -963,10 +963,10 @@ class PreConsumeTestCase(TestCase):
                m.assert_called_once()
-                args, kwargs = m.call_args
+                args, _ = m.call_args
-                command = kwargs["args"]
+                command = args[0]
-                environment = kwargs["env"]
+                environment = args[1]
                self.assertEqual(command[0], script.name)
                self.assertEqual(command[1], "path-to-file")
@ -1050,7 +1050,7 @@ class PostConsumeTestCase(TestCase):
        return super().setUp()
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
    @override_settings(POST_CONSUME_SCRIPT=None)
    def test_no_post_consume_script(self, m):
        doc = Document.objects.create(title="Test", mime_type="application/pdf")
@ -1075,7 +1075,7 @@ class PostConsumeTestCase(TestCase):
            doc,
        )
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
    def test_post_consume_script_simple(self, m):
        with tempfile.NamedTemporaryFile() as script:
            with override_settings(POST_CONSUME_SCRIPT=script.name):
@ -1085,7 +1085,7 @@ class PostConsumeTestCase(TestCase):
                m.assert_called_once()
-    @mock.patch("documents.consumer.run")
+    @mock.patch("documents.consumer.run_subprocess")
    def test_post_consume_script_with_correspondent(self, m):
        with tempfile.NamedTemporaryFile() as script:
            with override_settings(POST_CONSUME_SCRIPT=script.name):
@ -1106,10 +1106,10 @@ class PostConsumeTestCase(TestCase):
                m.assert_called_once()
-                _, kwargs = m.call_args
+                args, _ = m.call_args
-                command = kwargs["args"]
+                command = args[0]
-                environment = kwargs["env"]
+                environment = args[1]
                self.assertEqual(command[0], script.name)
                self.assertEqual(command[1], str(doc.pk))
--- a/src/documents/utils.py
+++ b/src/documents/utils.py
@ -1,6 +1,9 @@
 import logging
 import shutil
 from os import utime
 from pathlib import Path
 from subprocess import CompletedProcess
 from subprocess import run
 from typing import Optional
 from typing import Union
@ -56,3 +59,54 @@ def maybe_override_pixel_limit() -> None:
        if pixel_count == 0:
            pixel_count = None
        Image.MAX_IMAGE_PIXELS = pixel_count
 def run_subprocess(
    arguments: list[str],
    env: Optional[dict[str, str]] = None,
    logger: Optional[logging.Logger] = None,
    *,
    check_exit_code: bool = True,
    log_stdout: bool = True,
    log_stderr: bool = True,
 ) -> CompletedProcess:
    """
    Runs a subprocess and logs its output, checking return code if requested
    """
    proc_name = arguments[0]
    completed_proc = run(args=arguments, env=env, capture_output=True, check=False)
    if logger:
        logger.info(f"{proc_name} exited {completed_proc.returncode}")
    if log_stdout and logger and completed_proc.stdout:
        stdout_str = (
            completed_proc.stdout.decode("utf8", errors="ignore")
            .strip()
            .split(
                "\n",
            )
        )
        logger.info(f"{proc_name} stdout:")
        for line in stdout_str:
            logger.info(line)
    if log_stderr and logger and completed_proc.stderr:
        stderr_str = (
            completed_proc.stderr.decode("utf8", errors="ignore")
            .strip()
            .split(
                "\n",
            )
        )
        logger.info(f"{proc_name} stderr:")
        for line in stderr_str:
            logger.warning(line)
    # Last, if requested, after logging outputs
    if check_exit_code:
        completed_proc.check_returncode()
    return completed_proc
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@ -1,6 +1,5 @@
 import os
 import re
 import subprocess
 import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
@ -13,6 +12,7 @@ from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from documents.utils import maybe_override_pixel_limit
 from documents.utils import run_subprocess
 from paperless.config import OcrConfig
 from paperless.models import ArchiveFileChoices
 from paperless.models import CleanChoices
@ -103,7 +103,7 @@ class RasterisedDocumentParser(DocumentParser):
    def remove_alpha(self, image_path: str) -> Path:
        no_alpha_image = Path(self.tempdir) / "image-no-alpha"
-        subprocess.run(
+        run_subprocess(
            [
                settings.CONVERT_BINARY,
                "-alpha",
@ -111,6 +111,7 @@ class RasterisedDocumentParser(DocumentParser):
                image_path,
                no_alpha_image,
            ],
            logger=self.log,
        )
        return no_alpha_image
@ -169,7 +170,7 @@ class RasterisedDocumentParser(DocumentParser):
                mode="w+",
                dir=self.tempdir,
            ) as tmp:
-                subprocess.run(
+                run_subprocess(
                    [
                        "pdftotext",
                        "-q",
@ -179,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
                        pdf_file,
                        tmp.name,
                    ],
                    logger=self.log,
                )
                text = self.read_file_handle_unicode_errors(Path(tmp.name))