diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 2a0e5dce2..c735ed4c8 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -5,8 +5,6 @@ import tempfile import uuid from enum import Enum from pathlib import Path -from subprocess import CompletedProcess -from subprocess import run from typing import TYPE_CHECKING from typing import Optional @@ -51,6 +49,7 @@ from documents.signals import document_consumption_finished from documents.signals import document_consumption_started from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats +from documents.utils import run_subprocess class WorkflowTriggerPlugin( @@ -397,20 +396,15 @@ class Consumer(LoggingMixin): script_env["TASK_ID"] = self.task_id or "" try: - completed_proc = run( - args=[ + run_subprocess( + [ settings.PRE_CONSUME_SCRIPT, original_file_path, ], - env=script_env, - capture_output=True, + script_env, + self.log, ) - self._log_script_outputs(completed_proc) - - # Raises exception on non-zero output - completed_proc.check_returncode() - except Exception as e: self._fail( ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_ERROR, @@ -468,8 +462,8 @@ class Consumer(LoggingMixin): script_env["TASK_ID"] = self.task_id or "" try: - completed_proc = run( - args=[ + run_subprocess( + [ settings.POST_CONSUME_SCRIPT, str(document.pk), document.get_public_filename(), @@ -480,15 +474,10 @@ class Consumer(LoggingMixin): str(document.correspondent), str(",".join(document.tags.all().values_list("name", flat=True))), ], - env=script_env, - capture_output=True, + script_env, + self.log, ) - self._log_script_outputs(completed_proc) - - # Raises exception on non-zero output - completed_proc.check_returncode() - except Exception as e: self._fail( ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_ERROR, @@ -929,41 +918,6 @@ class Consumer(LoggingMixin): except Exception: # pragma: no cover pass - def _log_script_outputs(self, completed_process: CompletedProcess): - """ - Decodes a process stdout and stderr streams and logs them to the main log - """ - # Log what the script exited as - self.log.info( - f"{completed_process.args[0]} exited {completed_process.returncode}", - ) - - # Decode the output (if any) - if len(completed_process.stdout): - stdout_str = ( - completed_process.stdout.decode("utf8", errors="ignore") - .strip() - .split( - "\n", - ) - ) - self.log.info("Script stdout:") - for line in stdout_str: - self.log.info(line) - - if len(completed_process.stderr): - stderr_str = ( - completed_process.stderr.decode("utf8", errors="ignore") - .strip() - .split( - "\n", - ) - ) - - self.log.warning("Script stderr:") - for line in stderr_str: - self.log.warning(line) - def parse_doc_title_w_placeholders( title: str, diff --git a/src/documents/converters.py b/src/documents/converters.py index 5c5ba1e07..fcd0a8857 100644 --- a/src/documents/converters.py +++ b/src/documents/converters.py @@ -1,5 +1,4 @@ from pathlib import Path -from subprocess import run import img2pdf from django.conf import settings @@ -7,6 +6,7 @@ from PIL import Image from documents.utils import copy_basic_file_stats from documents.utils import maybe_override_pixel_limit +from documents.utils import run_subprocess def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path: @@ -27,7 +27,7 @@ def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path: # Note the save into the temp folder, so as not to trigger a new # consume scratch_image = target_directory / tiff_path.name - run( + run_subprocess( [ settings.CONVERT_BINARY, "-alpha", diff --git a/src/documents/parsers.py b/src/documents/parsers.py index a2b4677e6..d781ddb9f 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -18,6 +18,7 @@ from django.utils import timezone from documents.loggers import LoggingMixin from documents.signals import document_consumer_declaration from documents.utils import copy_file_with_basic_stats +from documents.utils import run_subprocess # This regular expression will try to find dates in the document at # hand and will match the following formats: @@ -164,8 +165,12 @@ def run_convert( logger.debug("Execute: " + " ".join(args), extra={"group": logging_group}) - if not subprocess.Popen(args, env=environment).wait() == 0: - raise ParseError(f"Convert failed at {args}") + try: + run_subprocess(args, environment, logger) + except subprocess.CalledProcessError as e: + raise ParseError(f"Convert failed at {args}") from e + except Exception as e: # pragma: no cover + raise ParseError("Unknown error running convert") from e def get_default_thumbnail() -> Path: @@ -188,9 +193,12 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) - # Ghostscript doesn't handle WebP outputs gs_out_path = os.path.join(temp_dir, "gs_out.png") cmd = [settings.GS_BINARY, "-q", "-sDEVICE=pngalpha", "-o", gs_out_path, in_path] + try: - if not subprocess.Popen(cmd).wait() == 0: - raise ParseError(f"Thumbnail (gs) failed at {cmd}") + try: + run_subprocess(cmd, logger=logger) + except subprocess.CalledProcessError as e: + raise ParseError(f"Thumbnail (gs) failed at {cmd}") from e # then run convert on the output from gs to make WebP run_convert( density=300, diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index f77265970..6c23df8aa 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -934,7 +934,7 @@ class PreConsumeTestCase(TestCase): return super().setUp() - @mock.patch("documents.consumer.run") + @mock.patch("documents.consumer.run_subprocess") @override_settings(PRE_CONSUME_SCRIPT=None) def test_no_pre_consume_script(self, m): c = Consumer() @@ -942,7 +942,7 @@ class PreConsumeTestCase(TestCase): c.run_pre_consume_script() m.assert_not_called() - @mock.patch("documents.consumer.run") + @mock.patch("documents.consumer.run_subprocess") @mock.patch("documents.consumer.Consumer._send_progress") @override_settings(PRE_CONSUME_SCRIPT="does-not-exist") def test_pre_consume_script_not_found(self, m, m2): @@ -951,7 +951,7 @@ class PreConsumeTestCase(TestCase): c.working_copy = "path-to-file" self.assertRaises(ConsumerError, c.run_pre_consume_script) - @mock.patch("documents.consumer.run") + @mock.patch("documents.consumer.run_subprocess") def test_pre_consume_script(self, m): with tempfile.NamedTemporaryFile() as script: with override_settings(PRE_CONSUME_SCRIPT=script.name): @@ -963,10 +963,10 @@ class PreConsumeTestCase(TestCase): m.assert_called_once() - args, kwargs = m.call_args + args, _ = m.call_args - command = kwargs["args"] - environment = kwargs["env"] + command = args[0] + environment = args[1] self.assertEqual(command[0], script.name) self.assertEqual(command[1], "path-to-file") @@ -1050,7 +1050,7 @@ class PostConsumeTestCase(TestCase): return super().setUp() - @mock.patch("documents.consumer.run") + @mock.patch("documents.consumer.run_subprocess") @override_settings(POST_CONSUME_SCRIPT=None) def test_no_post_consume_script(self, m): doc = Document.objects.create(title="Test", mime_type="application/pdf") @@ -1075,7 +1075,7 @@ class PostConsumeTestCase(TestCase): doc, ) - @mock.patch("documents.consumer.run") + @mock.patch("documents.consumer.run_subprocess") def test_post_consume_script_simple(self, m): with tempfile.NamedTemporaryFile() as script: with override_settings(POST_CONSUME_SCRIPT=script.name): @@ -1085,7 +1085,7 @@ class PostConsumeTestCase(TestCase): m.assert_called_once() - @mock.patch("documents.consumer.run") + @mock.patch("documents.consumer.run_subprocess") def test_post_consume_script_with_correspondent(self, m): with tempfile.NamedTemporaryFile() as script: with override_settings(POST_CONSUME_SCRIPT=script.name): @@ -1106,10 +1106,10 @@ class PostConsumeTestCase(TestCase): m.assert_called_once() - _, kwargs = m.call_args + args, _ = m.call_args - command = kwargs["args"] - environment = kwargs["env"] + command = args[0] + environment = args[1] self.assertEqual(command[0], script.name) self.assertEqual(command[1], str(doc.pk)) diff --git a/src/documents/utils.py b/src/documents/utils.py index 29f4de14d..0af1f54e3 100644 --- a/src/documents/utils.py +++ b/src/documents/utils.py @@ -1,6 +1,9 @@ +import logging import shutil from os import utime from pathlib import Path +from subprocess import CompletedProcess +from subprocess import run from typing import Optional from typing import Union @@ -56,3 +59,54 @@ def maybe_override_pixel_limit() -> None: if pixel_count == 0: pixel_count = None Image.MAX_IMAGE_PIXELS = pixel_count + + +def run_subprocess( + arguments: list[str], + env: Optional[dict[str, str]] = None, + logger: Optional[logging.Logger] = None, + *, + check_exit_code: bool = True, + log_stdout: bool = True, + log_stderr: bool = True, +) -> CompletedProcess: + """ + Runs a subprocess and logs its output, checking return code if requested + """ + + proc_name = arguments[0] + + completed_proc = run(args=arguments, env=env, capture_output=True, check=False) + + if logger: + logger.info(f"{proc_name} exited {completed_proc.returncode}") + + if log_stdout and logger and completed_proc.stdout: + stdout_str = ( + completed_proc.stdout.decode("utf8", errors="ignore") + .strip() + .split( + "\n", + ) + ) + logger.info(f"{proc_name} stdout:") + for line in stdout_str: + logger.info(line) + + if log_stderr and logger and completed_proc.stderr: + stderr_str = ( + completed_proc.stderr.decode("utf8", errors="ignore") + .strip() + .split( + "\n", + ) + ) + logger.info(f"{proc_name} stderr:") + for line in stderr_str: + logger.warning(line) + + # Last, if requested, after logging outputs + if check_exit_code: + completed_proc.check_returncode() + + return completed_proc diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index c483a3da4..44d421edc 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -1,6 +1,5 @@ import os import re -import subprocess import tempfile from pathlib import Path from typing import TYPE_CHECKING @@ -13,6 +12,7 @@ from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.parsers import make_thumbnail_from_pdf from documents.utils import maybe_override_pixel_limit +from documents.utils import run_subprocess from paperless.config import OcrConfig from paperless.models import ArchiveFileChoices from paperless.models import CleanChoices @@ -103,7 +103,7 @@ class RasterisedDocumentParser(DocumentParser): def remove_alpha(self, image_path: str) -> Path: no_alpha_image = Path(self.tempdir) / "image-no-alpha" - subprocess.run( + run_subprocess( [ settings.CONVERT_BINARY, "-alpha", @@ -111,6 +111,7 @@ class RasterisedDocumentParser(DocumentParser): image_path, no_alpha_image, ], + logger=self.log, ) return no_alpha_image @@ -169,7 +170,7 @@ class RasterisedDocumentParser(DocumentParser): mode="w+", dir=self.tempdir, ) as tmp: - subprocess.run( + run_subprocess( [ "pdftotext", "-q", @@ -179,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser): pdf_file, tmp.name, ], + logger=self.log, ) text = self.read_file_handle_unicode_errors(Path(tmp.name))