From d33416288c46bbbb0302832006601669f79e9ba4 Mon Sep 17 00:00:00 2001 From: Shayne Fletcher Date: Fri, 14 Nov 2025 06:53:02 -0800 Subject: [PATCH] : bootstrap: restore default SIGTERM disposition (#1885) Summary: `fbinit()` causes the installation of a glog signal handler that prints a stack trace handling `SIGTERM`. calls to `pm.stop()` result in these traces being written to stderr. this diff restores the default signal disposition after the call to `fbinit()` in `bootstrap_main` and the behavior is extinguished. Differential Revision: D87037324 --- monarch_hyperactor/src/bootstrap.rs | 7 ++ python/tests/test_actor_logging.py | 133 ++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 python/tests/test_actor_logging.py diff --git a/monarch_hyperactor/src/bootstrap.rs b/monarch_hyperactor/src/bootstrap.rs index 0188d6b94..a39a21183 100644 --- a/monarch_hyperactor/src/bootstrap.rs +++ b/monarch_hyperactor/src/bootstrap.rs @@ -36,6 +36,13 @@ pub fn bootstrap_main(py: Python) -> PyResult> { fbinit::perform_init(); }; + // SAFETY: Does not derefrence pointers or rely on undefined + // memory. No other threads are likely to be modifying it + // concurrently. We do this to avoid glog's SIGTERM backtraces. + unsafe { + libc::signal(libc::SIGTERM, libc::SIG_DFL); + } + hyperactor::tracing::debug!("entering async bootstrap"); crate::runtime::future_into_py::<_, ()>(py, async move { // SAFETY: diff --git a/python/tests/test_actor_logging.py b/python/tests/test_actor_logging.py new file mode 100644 index 000000000..edef0e431 --- /dev/null +++ b/python/tests/test_actor_logging.py @@ -0,0 +1,133 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import asyncio +import logging +import os +import re +import sys +import tempfile + +import pytest +from monarch._src.actor.host_mesh import this_host +from monarch.actor import Actor, endpoint + + +class Logger(Actor): + def __init__( + self, stdout_path: str | None = None, stderr_path: str | None = None + ) -> None: + self._logger: logging.Logger = logging.getLogger() + + # If file paths are provided, remove existing handlers to log + # only to files. + if stdout_path or stderr_path: + self._logger.handlers.clear() + + stdout_handler = ( + logging.FileHandler(stdout_path, mode="a") + if stdout_path + else logging.StreamHandler(sys.stdout) + ) + stdout_handler.setLevel(logging.INFO) + stdout_handler.addFilter(lambda record: record.levelno < logging.ERROR) + + stderr_handler = ( + logging.FileHandler(stderr_path, mode="a") + if stderr_path + else logging.StreamHandler(sys.stderr) + ) + stderr_handler.setLevel(logging.ERROR) + + self._logger.addHandler(stdout_handler) + self._logger.addHandler(stderr_handler) + + self._stdout_handler = stdout_handler + self._stderr_handler = stderr_handler + + @endpoint + async def log_warn(self, content: str) -> None: + self._logger.warning(f"{content}") + self._stdout_handler.flush() + self._stderr_handler.flush() + + @endpoint + async def log_info(self, content: str) -> None: + self._logger.info(f"{content}") + self._stdout_handler.flush() + self._stderr_handler.flush() + + @endpoint + async def log_error(self, content: str) -> None: + self._logger.error(f"{content}") + self._stdout_handler.flush() + self._stderr_handler.flush() + + +@pytest.mark.timeout(60) +async def test_actor_logging_smoke() -> None: + # Create temporary files to capture output. + with tempfile.NamedTemporaryFile( + mode="w+", delete=False, suffix="_stdout.log" + ) as stdout_file, tempfile.NamedTemporaryFile( + mode="w+", delete=False, suffix="_stderr.log" + ) as stderr_file: + stdout_path = stdout_file.name + stderr_path = stderr_file.name + + try: + pm = this_host().spawn_procs(per_host={"gpus": 2}) + await pm.logging_option(level=logging.INFO) + + # Log to the terminal. + am_1 = pm.spawn("logger_1", Logger) + await am_1.log_warn.call("hello 1") + await am_1.log_info.call("hello 2") + await am_1.log_error.call("hello 3") + + # Log to files. + am_2 = pm.spawn("logger_2", Logger, stdout_path, stderr_path) + await am_2.log_warn.call("hello 1") + await am_2.log_info.call("hello 2") + await am_2.log_error.call("hello 3") + + # Wait for output to be written. + await asyncio.sleep(1) + + # Read the captured output. + with open(stdout_path, "r") as f: + stdout_content = f.read() + with open(stderr_path, "r") as f: + stderr_content = f.read() + + # Assertions on the captured output. + assert re.search( + r"hello 1", stdout_content + ), f"Expected 'hello 1' in stdout: {stdout_content}" + assert re.search( + r"hello 2", stdout_content + ), f"Expected 'hello 2' in stdout: {stdout_content}" + assert re.search( + r"hello 3", stderr_content + ), f"Expected 'hello 3' in stderr: {stderr_content}" + assert re.search( + r"\[actor=.*Logger.*\]", stdout_content + ), f"Expected actor prefix in stdout: {stdout_content}" + + await pm.stop() + + finally: + # Clean up temp files. + try: + os.unlink(stdout_path) + except OSError: + pass + try: + os.unlink(stderr_path) + except OSError: + pass