Chore: Standardize subprocess running and logging (#6275)

This commit is contained in:
Trenton H 2024-04-04 13:11:43 -07:00 committed by GitHub
parent 0f8b2e69c9
commit 2c43b06910
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 94 additions and 76 deletions

View File

@ -5,8 +5,6 @@ import tempfile
import uuid import uuid
from enum import Enum from enum import Enum
from pathlib import Path from pathlib import Path
from subprocess import CompletedProcess
from subprocess import run
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from typing import Optional from typing import Optional
@ -51,6 +49,7 @@ from documents.signals import document_consumption_finished
from documents.signals import document_consumption_started from documents.signals import document_consumption_started
from documents.utils import copy_basic_file_stats from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
class WorkflowTriggerPlugin( class WorkflowTriggerPlugin(
@ -397,20 +396,15 @@ class Consumer(LoggingMixin):
script_env["TASK_ID"] = self.task_id or "" script_env["TASK_ID"] = self.task_id or ""
try: try:
completed_proc = run( run_subprocess(
args=[ [
settings.PRE_CONSUME_SCRIPT, settings.PRE_CONSUME_SCRIPT,
original_file_path, original_file_path,
], ],
env=script_env, script_env,
capture_output=True, self.log,
) )
self._log_script_outputs(completed_proc)
# Raises exception on non-zero output
completed_proc.check_returncode()
except Exception as e: except Exception as e:
self._fail( self._fail(
ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_ERROR, ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_ERROR,
@ -468,8 +462,8 @@ class Consumer(LoggingMixin):
script_env["TASK_ID"] = self.task_id or "" script_env["TASK_ID"] = self.task_id or ""
try: try:
completed_proc = run( run_subprocess(
args=[ [
settings.POST_CONSUME_SCRIPT, settings.POST_CONSUME_SCRIPT,
str(document.pk), str(document.pk),
document.get_public_filename(), document.get_public_filename(),
@ -480,15 +474,10 @@ class Consumer(LoggingMixin):
str(document.correspondent), str(document.correspondent),
str(",".join(document.tags.all().values_list("name", flat=True))), str(",".join(document.tags.all().values_list("name", flat=True))),
], ],
env=script_env, script_env,
capture_output=True, self.log,
) )
self._log_script_outputs(completed_proc)
# Raises exception on non-zero output
completed_proc.check_returncode()
except Exception as e: except Exception as e:
self._fail( self._fail(
ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_ERROR, ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_ERROR,
@ -929,41 +918,6 @@ class Consumer(LoggingMixin):
except Exception: # pragma: no cover except Exception: # pragma: no cover
pass pass
def _log_script_outputs(self, completed_process: CompletedProcess):
"""
Decodes a process stdout and stderr streams and logs them to the main log
"""
# Log what the script exited as
self.log.info(
f"{completed_process.args[0]} exited {completed_process.returncode}",
)
# Decode the output (if any)
if len(completed_process.stdout):
stdout_str = (
completed_process.stdout.decode("utf8", errors="ignore")
.strip()
.split(
"\n",
)
)
self.log.info("Script stdout:")
for line in stdout_str:
self.log.info(line)
if len(completed_process.stderr):
stderr_str = (
completed_process.stderr.decode("utf8", errors="ignore")
.strip()
.split(
"\n",
)
)
self.log.warning("Script stderr:")
for line in stderr_str:
self.log.warning(line)
def parse_doc_title_w_placeholders( def parse_doc_title_w_placeholders(
title: str, title: str,

View File

@ -1,5 +1,4 @@
from pathlib import Path from pathlib import Path
from subprocess import run
import img2pdf import img2pdf
from django.conf import settings from django.conf import settings
@ -7,6 +6,7 @@ from PIL import Image
from documents.utils import copy_basic_file_stats from documents.utils import copy_basic_file_stats
from documents.utils import maybe_override_pixel_limit from documents.utils import maybe_override_pixel_limit
from documents.utils import run_subprocess
def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path: def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path:
@ -27,7 +27,7 @@ def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path:
# Note the save into the temp folder, so as not to trigger a new # Note the save into the temp folder, so as not to trigger a new
# consume # consume
scratch_image = target_directory / tiff_path.name scratch_image = target_directory / tiff_path.name
run( run_subprocess(
[ [
settings.CONVERT_BINARY, settings.CONVERT_BINARY,
"-alpha", "-alpha",

View File

@ -18,6 +18,7 @@ from django.utils import timezone
from documents.loggers import LoggingMixin from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration from documents.signals import document_consumer_declaration
from documents.utils import copy_file_with_basic_stats from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
# This regular expression will try to find dates in the document at # This regular expression will try to find dates in the document at
# hand and will match the following formats: # hand and will match the following formats:
@ -164,8 +165,12 @@ def run_convert(
logger.debug("Execute: " + " ".join(args), extra={"group": logging_group}) logger.debug("Execute: " + " ".join(args), extra={"group": logging_group})
if not subprocess.Popen(args, env=environment).wait() == 0: try:
raise ParseError(f"Convert failed at {args}") run_subprocess(args, environment, logger)
except subprocess.CalledProcessError as e:
raise ParseError(f"Convert failed at {args}") from e
except Exception as e: # pragma: no cover
raise ParseError("Unknown error running convert") from e
def get_default_thumbnail() -> Path: def get_default_thumbnail() -> Path:
@ -188,9 +193,12 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -
# Ghostscript doesn't handle WebP outputs # Ghostscript doesn't handle WebP outputs
gs_out_path = os.path.join(temp_dir, "gs_out.png") gs_out_path = os.path.join(temp_dir, "gs_out.png")
cmd = [settings.GS_BINARY, "-q", "-sDEVICE=pngalpha", "-o", gs_out_path, in_path] cmd = [settings.GS_BINARY, "-q", "-sDEVICE=pngalpha", "-o", gs_out_path, in_path]
try: try:
if not subprocess.Popen(cmd).wait() == 0: try:
raise ParseError(f"Thumbnail (gs) failed at {cmd}") run_subprocess(cmd, logger=logger)
except subprocess.CalledProcessError as e:
raise ParseError(f"Thumbnail (gs) failed at {cmd}") from e
# then run convert on the output from gs to make WebP # then run convert on the output from gs to make WebP
run_convert( run_convert(
density=300, density=300,

View File

@ -934,7 +934,7 @@ class PreConsumeTestCase(TestCase):
return super().setUp() return super().setUp()
@mock.patch("documents.consumer.run") @mock.patch("documents.consumer.run_subprocess")
@override_settings(PRE_CONSUME_SCRIPT=None) @override_settings(PRE_CONSUME_SCRIPT=None)
def test_no_pre_consume_script(self, m): def test_no_pre_consume_script(self, m):
c = Consumer() c = Consumer()
@ -942,7 +942,7 @@ class PreConsumeTestCase(TestCase):
c.run_pre_consume_script() c.run_pre_consume_script()
m.assert_not_called() m.assert_not_called()
@mock.patch("documents.consumer.run") @mock.patch("documents.consumer.run_subprocess")
@mock.patch("documents.consumer.Consumer._send_progress") @mock.patch("documents.consumer.Consumer._send_progress")
@override_settings(PRE_CONSUME_SCRIPT="does-not-exist") @override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
def test_pre_consume_script_not_found(self, m, m2): def test_pre_consume_script_not_found(self, m, m2):
@ -951,7 +951,7 @@ class PreConsumeTestCase(TestCase):
c.working_copy = "path-to-file" c.working_copy = "path-to-file"
self.assertRaises(ConsumerError, c.run_pre_consume_script) self.assertRaises(ConsumerError, c.run_pre_consume_script)
@mock.patch("documents.consumer.run") @mock.patch("documents.consumer.run_subprocess")
def test_pre_consume_script(self, m): def test_pre_consume_script(self, m):
with tempfile.NamedTemporaryFile() as script: with tempfile.NamedTemporaryFile() as script:
with override_settings(PRE_CONSUME_SCRIPT=script.name): with override_settings(PRE_CONSUME_SCRIPT=script.name):
@ -963,10 +963,10 @@ class PreConsumeTestCase(TestCase):
m.assert_called_once() m.assert_called_once()
args, kwargs = m.call_args args, _ = m.call_args
command = kwargs["args"] command = args[0]
environment = kwargs["env"] environment = args[1]
self.assertEqual(command[0], script.name) self.assertEqual(command[0], script.name)
self.assertEqual(command[1], "path-to-file") self.assertEqual(command[1], "path-to-file")
@ -1050,7 +1050,7 @@ class PostConsumeTestCase(TestCase):
return super().setUp() return super().setUp()
@mock.patch("documents.consumer.run") @mock.patch("documents.consumer.run_subprocess")
@override_settings(POST_CONSUME_SCRIPT=None) @override_settings(POST_CONSUME_SCRIPT=None)
def test_no_post_consume_script(self, m): def test_no_post_consume_script(self, m):
doc = Document.objects.create(title="Test", mime_type="application/pdf") doc = Document.objects.create(title="Test", mime_type="application/pdf")
@ -1075,7 +1075,7 @@ class PostConsumeTestCase(TestCase):
doc, doc,
) )
@mock.patch("documents.consumer.run") @mock.patch("documents.consumer.run_subprocess")
def test_post_consume_script_simple(self, m): def test_post_consume_script_simple(self, m):
with tempfile.NamedTemporaryFile() as script: with tempfile.NamedTemporaryFile() as script:
with override_settings(POST_CONSUME_SCRIPT=script.name): with override_settings(POST_CONSUME_SCRIPT=script.name):
@ -1085,7 +1085,7 @@ class PostConsumeTestCase(TestCase):
m.assert_called_once() m.assert_called_once()
@mock.patch("documents.consumer.run") @mock.patch("documents.consumer.run_subprocess")
def test_post_consume_script_with_correspondent(self, m): def test_post_consume_script_with_correspondent(self, m):
with tempfile.NamedTemporaryFile() as script: with tempfile.NamedTemporaryFile() as script:
with override_settings(POST_CONSUME_SCRIPT=script.name): with override_settings(POST_CONSUME_SCRIPT=script.name):
@ -1106,10 +1106,10 @@ class PostConsumeTestCase(TestCase):
m.assert_called_once() m.assert_called_once()
_, kwargs = m.call_args args, _ = m.call_args
command = kwargs["args"] command = args[0]
environment = kwargs["env"] environment = args[1]
self.assertEqual(command[0], script.name) self.assertEqual(command[0], script.name)
self.assertEqual(command[1], str(doc.pk)) self.assertEqual(command[1], str(doc.pk))

View File

@ -1,6 +1,9 @@
import logging
import shutil import shutil
from os import utime from os import utime
from pathlib import Path from pathlib import Path
from subprocess import CompletedProcess
from subprocess import run
from typing import Optional from typing import Optional
from typing import Union from typing import Union
@ -56,3 +59,54 @@ def maybe_override_pixel_limit() -> None:
if pixel_count == 0: if pixel_count == 0:
pixel_count = None pixel_count = None
Image.MAX_IMAGE_PIXELS = pixel_count Image.MAX_IMAGE_PIXELS = pixel_count
def run_subprocess(
arguments: list[str],
env: Optional[dict[str, str]] = None,
logger: Optional[logging.Logger] = None,
*,
check_exit_code: bool = True,
log_stdout: bool = True,
log_stderr: bool = True,
) -> CompletedProcess:
"""
Runs a subprocess and logs its output, checking return code if requested
"""
proc_name = arguments[0]
completed_proc = run(args=arguments, env=env, capture_output=True, check=False)
if logger:
logger.info(f"{proc_name} exited {completed_proc.returncode}")
if log_stdout and logger and completed_proc.stdout:
stdout_str = (
completed_proc.stdout.decode("utf8", errors="ignore")
.strip()
.split(
"\n",
)
)
logger.info(f"{proc_name} stdout:")
for line in stdout_str:
logger.info(line)
if log_stderr and logger and completed_proc.stderr:
stderr_str = (
completed_proc.stderr.decode("utf8", errors="ignore")
.strip()
.split(
"\n",
)
)
logger.info(f"{proc_name} stderr:")
for line in stderr_str:
logger.warning(line)
# Last, if requested, after logging outputs
if check_exit_code:
completed_proc.check_returncode()
return completed_proc

View File

@ -1,6 +1,5 @@
import os import os
import re import re
import subprocess
import tempfile import tempfile
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
@ -13,6 +12,7 @@ from documents.parsers import DocumentParser
from documents.parsers import ParseError from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf from documents.parsers import make_thumbnail_from_pdf
from documents.utils import maybe_override_pixel_limit from documents.utils import maybe_override_pixel_limit
from documents.utils import run_subprocess
from paperless.config import OcrConfig from paperless.config import OcrConfig
from paperless.models import ArchiveFileChoices from paperless.models import ArchiveFileChoices
from paperless.models import CleanChoices from paperless.models import CleanChoices
@ -103,7 +103,7 @@ class RasterisedDocumentParser(DocumentParser):
def remove_alpha(self, image_path: str) -> Path: def remove_alpha(self, image_path: str) -> Path:
no_alpha_image = Path(self.tempdir) / "image-no-alpha" no_alpha_image = Path(self.tempdir) / "image-no-alpha"
subprocess.run( run_subprocess(
[ [
settings.CONVERT_BINARY, settings.CONVERT_BINARY,
"-alpha", "-alpha",
@ -111,6 +111,7 @@ class RasterisedDocumentParser(DocumentParser):
image_path, image_path,
no_alpha_image, no_alpha_image,
], ],
logger=self.log,
) )
return no_alpha_image return no_alpha_image
@ -169,7 +170,7 @@ class RasterisedDocumentParser(DocumentParser):
mode="w+", mode="w+",
dir=self.tempdir, dir=self.tempdir,
) as tmp: ) as tmp:
subprocess.run( run_subprocess(
[ [
"pdftotext", "pdftotext",
"-q", "-q",
@ -179,6 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
pdf_file, pdf_file,
tmp.name, tmp.name,
], ],
logger=self.log,
) )
text = self.read_file_handle_unicode_errors(Path(tmp.name)) text = self.read_file_handle_unicode_errors(Path(tmp.name))