Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file removed examples/slurm/__init__.py
Empty file.
92 changes: 0 additions & 92 deletions examples/slurm/utils.py

This file was deleted.

165 changes: 47 additions & 118 deletions examples/slurm_allreduce.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -5,126 +5,21 @@
"execution_count": null,
"id": "c443b989-5a71-455f-9a59-9963338634ec",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<string>:8: FutureWarning: Setting `workspace='/home/ubuntu/ahmads/monarch/examples'` is deprecated.\n",
"torchx.schedulers.slurm_scheduler 2025-08-29 21:02:20 INFO unable to get job info for `monarch-ubuntu` with `squeue` (squeue: error: Invalid job id: monarch-ubuntu\n",
"), trying `sacct`\n",
"torchx.schedulers.slurm_scheduler 2025-08-29 21:02:20 INFO unable to get job info for `monarch-ubuntu` with `sacct` (sacct: fatal: Bad job/step specified: monarch-ubuntu\n",
")\n",
"monarch.tools.commands 2025-08-29 21:02:20 INFO no existing RUNNING server `slurm:///monarch-ubuntu` creating new one...\n",
"torchx.runner.api 2025-08-29 21:02:20 INFO Tracker configurations: {}\n",
"torchx.runner.api 2025-08-29 21:02:20 INFO Checking for changes in workspace `/home/ubuntu/.monarch/out/tmpkk97qppi/workspace`...\n",
"torchx.runner.api 2025-08-29 21:02:20 INFO To disable workspaces pass: --workspace=\"\" from CLI or workspace=None programmatically.\n",
"torchx.runner.api 2025-08-29 21:02:20 INFO Reusing original image `monarch_default_workspace:latest` for role[0]=mesh0. Either a patch was built or no changes to workspace was detected.\n",
"monarch.tools.commands 2025-08-29 21:02:20 INFO created new `slurm:///418` waiting for it to be ready...\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ahmad: {'requeue': None, 'ntasks-per-node': '1', 'cpus-per-task': '48', 'mem': '186777', 'gpus-per-task': '4', 'ntasks': '1'}\n",
"Ahmad: {'requeue': None, 'ntasks-per-node': '1', 'cpus-per-task': '48', 'mem': '186777', 'gpus-per-task': '4', 'ntasks': '1'}\n",
"Waiting for slurm:///418 to be RUNNING (current: PENDING); will check again in 5.0 seconds. Total wait time: 0:00:20.105986\r"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"slurm.utils 2025-08-29 21:02:45 INFO \n",
"===== Server Info =====\n",
"{\n",
" \"name\": \"418\",\n",
" \"server_handle\": \"slurm:///418\",\n",
" \"state\": \"RUNNING\",\n",
" \"meshes\": {\n",
" \"mesh0\": {\n",
" \"host_type\": \"__UNSET__\",\n",
" \"hosts\": 2,\n",
" \"gpus\": -1,\n",
" \"hostnames\": [\n",
" \"gpu-queue-st-gpu-compute-1\",\n",
" \"gpu-queue-st-gpu-compute-2\"\n",
" ]\n",
" }\n",
" }\n",
"}\n",
"__main__ 2025-08-29 21:02:45 INFO computing world size...\n",
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO no match label `procmesh.monarch.meta.com/name` specified in alloc constraints\n",
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO found a single proc mesh `mesh0` in slurm:///418, will allocate on it\n",
"monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-1:26600` (error: [Errno -5] No address associated with hostname)\n",
"monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.236:26600` for `gpu-queue-st-gpu-compute-1:26600`\n",
"monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-2:26600` (error: [Errno -5] No address associated with hostname)\n",
"monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.132:26600` for `gpu-queue-st-gpu-compute-2:26600`\n",
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO initializing alloc on remote allocator addresses: ['tcp!10.0.2.236:26600', 'tcp!10.0.2.132:26600']\n",
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO no match label `procmesh.monarch.meta.com/name` specified in alloc constraints\n",
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO found a single proc mesh `mesh0` in slurm:///418, will allocate on it\n",
"monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-1:26600` (error: [Errno -5] No address associated with hostname)\n",
"monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.236:26600` for `gpu-queue-st-gpu-compute-1:26600`\n",
"monarch.tools.network 2025-08-29 21:02:45 INFO no AF_INET6 address that can bind TCP sockets for `gpu-queue-st-gpu-compute-2:26600` (error: [Errno -5] No address associated with hostname)\n",
"monarch.tools.network 2025-08-29 21:02:45 INFO resolved AF_INET address `10.0.2.132:26600` for `gpu-queue-st-gpu-compute-2:26600`\n",
"monarch._src.actor.allocator 2025-08-29 21:02:45 INFO initializing alloc on remote allocator addresses: ['tcp!10.0.2.236:26600', 'tcp!10.0.2.132:26600']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"New job `slurm:///418` is ready to serve.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"__main__ 2025-08-29 21:02:53 INFO computed world_sizes:\n",
" ----------------------------------------\n",
" {\n",
" \"rank_0\": 8,\n",
" \"rank_1\": 8,\n",
" \"rank_2\": 8,\n",
" \"rank_3\": 8,\n",
" \"rank_4\": 8,\n",
" \"rank_5\": 8,\n",
" \"rank_6\": 8,\n",
" \"rank_7\": 8\n",
"}\n",
" ----------------------------------------\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[36m>>> Aggregated Logs (2025-08-29 21:02:51) >>>\u001b[0m\n",
"\u001b[33m[8 similar log lines]\u001b[0m Initializing process group `nccl`:\n",
"\u001b[33m[8 similar log lines]\u001b[0m MASTER_ADDR = gpu-queue-st-gpu-compute-1\n",
"\u001b[33m[8 similar log lines]\u001b[0m MASTER_PORT = 29500\n",
"\u001b[33m[8 similar log lines]\u001b[0m RANK = 5\n",
"\u001b[33m[8 similar log lines]\u001b[0m WORLD_SIZE = 8\n",
"\u001b[36m<<< Aggregated Logs (2025-08-29 21:02:54) <<<\u001b[0m\n",
"\n"
]
}
],
"outputs": [],
"source": [
"# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.\n",
"\n",
"# @noautodeps\n",
"# pyre-ignore-all-errors\n",
"import json\n",
"import logging\n",
"import socket\n",
"import sys\n",
"\n",
"import cloudpickle\n",
"from monarch.tools import commands\n",
"from example_actors.compute_world_size_actor import ComputeWorldSizeActor\n",
"from slurm.utils import get_appdef, get_server_info, create_proc_mesh\n",
"from monarch.actor import Actor, endpoint\n",
"from monarch.job import SlurmJob\n",
"\n",
"\n",
"logging.basicConfig(\n",
Expand All @@ -138,21 +33,45 @@
"logger: logging.Logger = logging.getLogger(__name__)\n",
"\n",
"\n",
"class _HostnameActor(Actor):\n",
" \"\"\"Helper actor to get hostname from rank 0\"\"\"\n",
" @endpoint\n",
" def get_hostname(self) -> str:\n",
" return socket.gethostname()\n",
"\n",
"\n",
"async def main():\n",
" num_hosts = 2\n",
" appdef = await get_appdef(num_hosts)\n",
" server_info = await get_server_info(appdef)\n",
" num_nodes = 2\n",
" gpus_per_node = 8\n",
" mesh_name = \"mesh0\"\n",
" master_port = 29500\n",
" \n",
" # Create SLURM job\n",
" slurm_job = SlurmJob(\n",
" meshes={mesh_name: num_nodes},\n",
" job_name=\"monarch_example\",\n",
" gpus_per_node=gpus_per_node,\n",
" time_limit=\"06:00:00\",\n",
" )\n",
"\n",
" try:\n",
" proc_mesh = await create_proc_mesh(num_hosts, appdef, server_info)\n",
" actor = await proc_mesh.spawn(\"compute_world_size_actor\", ComputeWorldSizeActor)\n",
" # Get job state and create process mesh\n",
" job_state = slurm_job.state()\n",
" proc_mesh = job_state.mesh0.spawn_procs({\"gpus\": gpus_per_node})\n",
" \n",
" # Get master_addr from rank 0\n",
" hostname_actor = proc_mesh.spawn(\"hostname_actor\", _HostnameActor)\n",
" hostname_values = await hostname_actor.flatten(\"rank\").slice(rank=0).get_hostname.call()\n",
" master_addr = hostname_values.item()\n",
" \n",
" # Spawn actor\n",
" actor = proc_mesh.spawn(\"compute_world_size_actor\", ComputeWorldSizeActor)\n",
"\n",
" logger.info(\"computing world size...\")\n",
" # this is redundant but is here for example sake\n",
" mesh_name = server_info.get_mesh_spec(\"mesh0\").name\n",
" values = await actor.compute_world_size.call(\n",
" master_addr=server_info.host0(mesh_name),\n",
" master_port=29500,\n",
" master_addr=master_addr,\n",
" master_port=master_port,\n",
" )\n",
"\n",
" values_by_rank = {f\"rank_{p.rank}\": v for p, v in list(values.flatten(\"rank\"))}\n",
Expand All @@ -164,14 +83,24 @@
" {'-'*40}\"\"\"\n",
" )\n",
" finally:\n",
" commands.kill(f\"slurm:///{server_info.name}\")\n",
" # Cancel the SLURM job, releasing all reserved nodes back to the cluster\n",
" slurm_job.kill()\n",
" logger.info(\"Job terminated successfully\")\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" cloudpickle.register_pickle_by_value(sys.modules[ComputeWorldSizeActor.__module__])\n",
"\n",
" await main()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71ca61d9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Loading
Loading