diff --git a/tools/launcher/examples/Nemotron-h/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/specdec_bench_mtp_vllm.yaml b/tools/launcher/examples/Nemotron-h/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/specdec_bench_mtp_vllm.yaml
new file mode 100644
index 00000000000..1107c074605
--- /dev/null
+++ b/tools/launcher/examples/Nemotron-h/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/specdec_bench_mtp_vllm.yaml
@@ -0,0 +1,77 @@
+# SPEED-bench MTP speculative-decoding run for NVIDIA-Nemotron-3-Super-120B-A12B-BF16 via vLLM.
+#
+# Nemotron-3-Super-120B-A12B is 120B total params (MoE; 12B active per
+# token). BF16 weights are ~240 GB total (2 bytes/param), so tp_size=4 is
+# the minimum on 80 GB H100/A100.
+#
+# Slurm run on cw_dfw — cells override per-cell knobs via
+# pipeline.task_N.args+=[...]:
+#
+#   uv run slurm.py \
+#     --yaml modules/Model-Optimizer/tools/launcher/examples/Nemotron-h/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/specdec_bench_mtp_vllm.yaml \
+#     --yes detach=true \
+#     pipeline.task_0.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep_name>/qualitative","--draft_length 3"] \
+#     pipeline.task_1.args+=["--temperature 0","--max_seq_len 65536","--save_dir /scratchspace/<sweep_name>/throughput_32k","--num_requests 80","--draft_length 3"]
+
+job_name: NVIDIA-Nemotron-3-Super-120B-A12B-BF16_specdec_bench_mtp_vllm
+
+pipeline:
+  global_vars:
+    # Checkpoint path; each task injects it as HF_MODEL_CKPT via <<global_vars.hf_model>>.
+    hf_model: /hf-local/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
+
+  # task_0: SPEED qualitative split
+  task_0:
+    script: common/specdec_bench/run.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative
+      - --engine VLLM
+      - --speculative_algorithm MTP
+      - --draft_length 3
+      - --tp_size 4
+      - --ep_size 1
+      - --concurrency 32
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/{sweep_name_default}/qualitative
+    environment:
+      # NOTE(review): placeholder restored from a bare `<>`; confirm the
+      # launcher's interpolation syntax for global_vars.
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:v0.22.1
+
+  # task_1: SPEED throughput_32k split (concurrency 8, capped at 80 requests)
+  task_1:
+    script: common/specdec_bench/run.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k
+      - --engine VLLM
+      - --speculative_algorithm MTP
+      - --draft_length 3
+      - --tp_size 4
+      - --ep_size 1
+      - --concurrency 8
+      - --num_requests 80
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/{sweep_name_default}/throughput_32k
+    environment:
+      # Same checkpoint placeholder as task_0 (NOTE(review): confirm syntax).
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:v0.22.1