From 069c8f301b492ad0d6536f50fef7697acb8a442d Mon Sep 17 00:00:00 2001 From: Wei Du Date: Mon, 20 Apr 2026 12:44:01 -0700 Subject: [PATCH 1/2] Support path-based Ray templates Signed-off-by: Wei Du --- .../run/torchx_backend/schedulers/slurm.py | 6 ++++++ .../torchx_backend/schedulers/test_slurm.py | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/nemo_run/run/torchx_backend/schedulers/slurm.py b/nemo_run/run/torchx_backend/schedulers/slurm.py index 9af9483a..d1919751 100644 --- a/nemo_run/run/torchx_backend/schedulers/slurm.py +++ b/nemo_run/run/torchx_backend/schedulers/slurm.py @@ -22,6 +22,7 @@ import json import logging import os +import os import threading import time from dataclasses import asdict @@ -139,12 +140,17 @@ def _submit_dryrun(self, app: AppDef, cfg: Executor) -> AppDryRunInfo[Any]: # t command = [app.roles[0].entrypoint] + app.roles[0].args # Use Ray template from executor configuration ray_template_name = executor.ray_template + ray_template_dir = None + if os.path.isabs(ray_template_name) or os.path.dirname(ray_template_name): + ray_template_name = os.path.basename(ray_template_name) + ray_template_dir = os.path.dirname(os.path.abspath(executor.ray_template)) req = SlurmRayRequest( name=app.roles[0].name, launch_cmd=["sbatch", "--requeue", "--parsable"], command=" ".join(command), cluster_dir=os.path.join(executor.tunnel.job_dir, Path(job_dir).name, "ray"), template_name=ray_template_name, + template_dir=ray_template_dir, executor=executor, workdir=f"/{RUNDIR_NAME}/code", nemo_run_dir=os.path.join(executor.tunnel.job_dir, Path(job_dir).name), diff --git a/test/run/torchx_backend/schedulers/test_slurm.py b/test/run/torchx_backend/schedulers/test_slurm.py index 857d7a66..c06f0deb 100644 --- a/test/run/torchx_backend/schedulers/test_slurm.py +++ b/test/run/torchx_backend/schedulers/test_slurm.py @@ -502,6 +502,9 @@ def test_ray_template_executor(slurm_scheduler, slurm_executor, temp_dir): roles=[Role(name="test_role", image="", entrypoint="python", args=["script.py"])], metadata={USE_WITH_RAY_CLUSTER_KEY: True}, ) + custom_template_path = os.path.join(temp_dir, "custom_ray.sub.j2") + with open(custom_template_path, "w", encoding="utf-8") as f: + f.write("#!/bin/bash\n# Custom template") with ( mock.patch.object(SlurmTunnelScheduler, "_initialize_tunnel"), @@ -533,6 +536,21 @@ def test_ray_template_executor(slurm_scheduler, slurm_executor, temp_dir): assert isinstance(dryrun_info.request, SlurmRayRequest) assert dryrun_info.request.template_name == "ray_enroot.sub.j2" + path_executor = SlurmExecutor( + account="test_account", + job_dir=temp_dir, + nodes=1, + ntasks_per_node=1, + tunnel=LocalTunnel(job_dir=temp_dir), + ray_template=custom_template_path, + ) + with mock.patch("nemo_run.core.execution.utils.fill_template") as mock_fill: + mock_fill.return_value = "#!/bin/bash\n# Mock script" + dryrun_info = slurm_scheduler._submit_dryrun(app_def, path_executor) + assert isinstance(dryrun_info.request, SlurmRayRequest) + assert dryrun_info.request.template_name == "custom_ray.sub.j2" + assert dryrun_info.request.template_dir == temp_dir + def test_heterogeneous_ray_cluster_run_as_group(slurm_scheduler, temp_dir): """Test that run_as_group is automatically set for heterogeneous Ray clusters.""" From c64437d3344e5aa290d56b6f194f1c62c0bca5fc Mon Sep 17 00:00:00 2001 From: Wei Du Date: Mon, 20 Apr 2026 12:51:23 -0700 Subject: [PATCH 2/2] fix ruff check Signed-off-by: Wei Du --- nemo_run/run/torchx_backend/schedulers/slurm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_run/run/torchx_backend/schedulers/slurm.py b/nemo_run/run/torchx_backend/schedulers/slurm.py index d1919751..2926f353 100644 --- a/nemo_run/run/torchx_backend/schedulers/slurm.py +++ b/nemo_run/run/torchx_backend/schedulers/slurm.py @@ -22,7 +22,6 @@ import json import logging import os -import os import threading import time from dataclasses import asdict