set mesh logging defaults for notebooks (#1893)

shayne-fletcher · facebook-github-bot · commit bb63757758c5 · 2025-11-18T14:01:55.000-08:00
Summary:

D86994420 changed defaults:
- `HYPERACTOR_MESH_ENABLE_LOG_FORWARDING=false`
- `HYPERACTOR_MESH_ENABLE_FILE_CAPTURE=false`
- `HYPERACTOR_MESH_TAIL_LOG_LINES=0`

These defaults do not play well with interactive notebooks, so this diff selectively overrides them when the execution environment is interactive (IPython, Jupyter, Bento); concretely, it calls `configure(enable_log_forwarding=True)` when an IPython shell is detected.

Incidentally, this diff adds a new test module, `test_actor_logging.py`. Today it contains a single smoke test; in time I mean to break out the logging tests from `test_python_actor.py`, fix those that need it, and move them here.

Reviewed By: mariusae

Differential Revision: D87098535
diff --git a/hyperactor/src/config/global.rs b/hyperactor/src/config/global.rs
@@ -413,6 +413,16 @@ pub fn get<T: AttrValue + Copy>(key: Key<T>) -> T {
     *key.default().expect("key must have a default")
 }
 
+/// Return the override value for `key` if it is explicitly present in
+/// `overrides`, otherwise fall back to the global value for that key.
+pub fn override_or_global<T: AttrValue + Copy>(overrides: &Attrs, key: Key<T>) -> T {
+    if overrides.contains_key(key) {
+        *overrides.get(key).unwrap()
+    } else {
+        get(key)
+    }
+}
+
 /// Get a key by cloning the value.
 ///
 /// Resolution order: TestOverride -> Runtime -> Env -> File ->
diff --git a/hyperactor_mesh/src/alloc/process.rs b/hyperactor_mesh/src/alloc/process.rs
@@ -49,7 +49,6 @@ use super::ProcStopReason;
 use crate::assign::Ranks;
 use crate::bootstrap;
 use crate::bootstrap::Allocator2Process;
-use crate::bootstrap::MESH_ENABLE_FILE_CAPTURE;
 use crate::bootstrap::MESH_ENABLE_LOG_FORWARDING;
 use crate::bootstrap::MESH_TAIL_LOG_LINES;
 use crate::bootstrap::Process2Allocator;
@@ -447,43 +446,30 @@ impl ProcessAlloc {
         }
         let mut cmd = self.cmd.lock().await;
 
-        // Read config (defaults are in 'bootstrap.rs').
+        // If `MESH_ENABLE_LOG_FORWARDING` is set, the client
+        // execution context is probably a notebook. In that case,
+        // for output from this process's children to reach the
+        // client, we **must** use pipes and copy output from child
+        // to parent (**`Stdio::inherit`** does not work!). So this
+        // variable is used here as a proxy for "use pipes"; a
+        // non-zero `MESH_TAIL_LOG_LINES` likewise selects pipes.
         let enable_forwarding = hyperactor::config::global::get(MESH_ENABLE_LOG_FORWARDING);
-        let enable_file_capture = hyperactor::config::global::get(MESH_ENABLE_FILE_CAPTURE);
         let tail_size = hyperactor::config::global::get(MESH_TAIL_LOG_LINES);
-
-        // We don't support FileAppender in this v0 allocator path; warn if asked.
-        if enable_file_capture {
-            tracing::info!(
-                "MESH_ENABLE_FILE_CAPTURE=true, but ProcessAllocator (v0) has no FileAppender; \
-                 files will NOT be written in this path"
-            );
-        }
-
-        let need_stdio = enable_forwarding || tail_size > 0;
-
-        if need_stdio {
+        if enable_forwarding || tail_size > 0 {
             cmd.stdout(Stdio::piped()).stderr(Stdio::piped());
         } else {
             cmd.stdout(Stdio::inherit()).stderr(Stdio::inherit());
             tracing::info!(
-                enable_forwarding,
-                enable_file_capture,
-                tail_size,
                 "child stdio NOT captured (forwarding/file_capture/tail all disabled); \
                  inheriting parent console"
             );
         }
-
-        // Only allocate & export a log channel when forwarding is
-        // enabled.
-        let log_channel: Option<ChannelAddr> = if enable_forwarding {
-            let addr = ChannelAddr::any(ChannelTransport::Unix);
-            cmd.env(bootstrap::BOOTSTRAP_LOG_CHANNEL, addr.to_string());
-            Some(addr)
-        } else {
-            None
-        };
+        // Regardless of the value of `MESH_ENABLE_LOG_FORWARDING`
+        // (cf. `enable_forwarding`), we do not do log forwarding on
+        // these procs. This is because, now that we are on the v1
+        // path, the only procs we spawn via this code path are those
+        // that support `HostMeshAgent`s.
+        let log_channel: Option<ChannelAddr> = None;
 
         let index = self.created.len();
         self.created.push(ShortUuid::generate());
diff --git a/hyperactor_mesh/src/bootstrap.rs b/hyperactor_mesh/src/bootstrap.rs
@@ -45,6 +45,7 @@ use hyperactor::clock::RealClock;
 use hyperactor::config::CONFIG;
 use hyperactor::config::ConfigAttr;
 use hyperactor::config::global as config;
+use hyperactor::config::global::override_or_global;
 use hyperactor::context;
 use hyperactor::declare_attrs;
 use hyperactor::host::Host;
@@ -1848,6 +1849,13 @@ impl ProcManager for BootstrapProcManager {
         let (callback_addr, mut callback_rx) =
             channel::serve(ChannelAddr::any(ChannelTransport::Unix))?;
 
+        // Decide whether we need to capture stdio.
+        let overrides = &config.client_config_override;
+        let enable_forwarding = override_or_global(overrides, MESH_ENABLE_LOG_FORWARDING);
+        let enable_file_capture = override_or_global(overrides, MESH_ENABLE_FILE_CAPTURE);
+        let tail_size = override_or_global(overrides, MESH_TAIL_LOG_LINES);
+        let need_stdio = enable_forwarding || enable_file_capture || tail_size > 0;
+
         let mode = Bootstrap::Proc {
             proc_id: proc_id.clone(),
             backend_addr,
@@ -1862,12 +1870,6 @@ impl ProcManager for BootstrapProcManager {
                 .map_err(|e| HostError::ProcessConfigurationFailure(proc_id.clone(), e.into()))?,
         );
 
-        // Decide whether we need to capture stdio.
-        let enable_forwarding = hyperactor::config::global::get(MESH_ENABLE_LOG_FORWARDING);
-        let enable_file_capture = hyperactor::config::global::get(MESH_ENABLE_FILE_CAPTURE);
-        let tail_size = hyperactor::config::global::get(MESH_TAIL_LOG_LINES);
-        let need_stdio = enable_forwarding || enable_file_capture || tail_size > 0;
-
         if need_stdio {
             cmd.stdout(Stdio::piped()).stderr(Stdio::piped());
         } else {
diff --git a/python/monarch/_src/actor/__init__.py b/python/monarch/_src/actor/__init__.py
@@ -9,3 +9,26 @@
 """
 Monarch Actor API
 """
+
+from monarch._rust_bindings.monarch_hyperactor.config import configure
+
+# Detect if we're running in IPython/Jupyter
+_in_ipython = False
+try:
+    # pyre-ignore[21]
+    from IPython import get_ipython
+
+    _in_ipython = get_ipython() is not None
+except ImportError:
+    pass
+
+# Set notebook-friendly defaults for stdio piping when spawning procs.
+# This config is read by:
+# 1. Rust BootstrapProcManager::spawn() to decide whether to pipe
+#    child stdio
+# 2. Rust LoggingMeshClient::spawn() to decide whether to spawn
+#    LogForwardActors
+# Only apply these default overrides in notebook/IPython environments
+# where stdout **needs** to be captured.
+if _in_ipython:
+    configure(enable_log_forwarding=True)
diff --git a/python/tests/test_actor_logging.py b/python/tests/test_actor_logging.py
@@ -0,0 +1,133 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+import asyncio
+import logging
+import os
+import re
+import sys
+import tempfile
+
+import pytest
+from monarch._src.actor.host_mesh import this_host
+from monarch.actor import Actor, endpoint
+
+
+class Logger(Actor):
+    def __init__(
+        self, stdout_path: str | None = None, stderr_path: str | None = None
+    ) -> None:
+        self._logger: logging.Logger = logging.getLogger()
+
+        # If file paths are provided, remove existing handlers to log
+        # only to files.
+        if stdout_path or stderr_path:
+            self._logger.handlers.clear()
+
+        stdout_handler = (
+            logging.FileHandler(stdout_path, mode="a")
+            if stdout_path
+            else logging.StreamHandler(sys.stdout)
+        )
+        stdout_handler.setLevel(logging.INFO)
+        stdout_handler.addFilter(lambda record: record.levelno < logging.ERROR)
+
+        stderr_handler = (
+            logging.FileHandler(stderr_path, mode="a")
+            if stderr_path
+            else logging.StreamHandler(sys.stderr)
+        )
+        stderr_handler.setLevel(logging.ERROR)
+
+        self._logger.addHandler(stdout_handler)
+        self._logger.addHandler(stderr_handler)
+
+        self._stdout_handler = stdout_handler
+        self._stderr_handler = stderr_handler
+
+    @endpoint
+    async def log_warn(self, content: str) -> None:
+        self._logger.warning(f"{content}")
+        self._stdout_handler.flush()
+        self._stderr_handler.flush()
+
+    @endpoint
+    async def log_info(self, content: str) -> None:
+        self._logger.info(f"{content}")
+        self._stdout_handler.flush()
+        self._stderr_handler.flush()
+
+    @endpoint
+    async def log_error(self, content: str) -> None:
+        self._logger.error(f"{content}")
+        self._stdout_handler.flush()
+        self._stderr_handler.flush()
+
+
+@pytest.mark.timeout(60)
+async def test_actor_logging_smoke() -> None:
+    # Create temporary files to capture output.
+    with tempfile.NamedTemporaryFile(
+        mode="w+", delete=False, suffix="_stdout.log"
+    ) as stdout_file, tempfile.NamedTemporaryFile(
+        mode="w+", delete=False, suffix="_stderr.log"
+    ) as stderr_file:
+        stdout_path = stdout_file.name
+        stderr_path = stderr_file.name
+
+    try:
+        pm = this_host().spawn_procs(per_host={"gpus": 2})
+        await pm.logging_option(level=logging.INFO)
+
+        # Log to the terminal.
+        am_1 = pm.spawn("logger_1", Logger)
+        await am_1.log_warn.call("hello 1")
+        await am_1.log_info.call("hello 2")
+        await am_1.log_error.call("hello 3")
+
+        # Log to files.
+        am_2 = pm.spawn("logger_2", Logger, stdout_path, stderr_path)
+        await am_2.log_warn.call("hello 1")
+        await am_2.log_info.call("hello 2")
+        await am_2.log_error.call("hello 3")
+
+        # Wait for output to be written.
+        await asyncio.sleep(1)
+
+        # Read the captured output.
+        with open(stdout_path, "r") as f:
+            stdout_content = f.read()
+        with open(stderr_path, "r") as f:
+            stderr_content = f.read()
+
+        # Assertions on the captured output.
+        assert re.search(
+            r"hello 1", stdout_content
+        ), f"Expected 'hello 1' in stdout: {stdout_content}"
+        assert re.search(
+            r"hello 2", stdout_content
+        ), f"Expected 'hello 2' in stdout: {stdout_content}"
+        assert re.search(
+            r"hello 3", stderr_content
+        ), f"Expected 'hello 3' in stderr: {stderr_content}"
+        assert re.search(
+            r"\[actor=.*Logger.*\]", stdout_content
+        ), f"Expected actor prefix in stdout: {stdout_content}"
+
+        await pm.stop()
+
+    finally:
+        # Clean up temp files.
+        try:
+            os.unlink(stdout_path)
+        except OSError:
+            pass
+        try:
+            os.unlink(stderr_path)
+        except OSError:
+            pass