Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1577,6 +1577,60 @@ kimik2.5-fp4-mi355x-vllm-disagg:
additional-settings:
- "DECODE_NODES=2"

# Kimi-K2.5 INT4 on MI355X: multinode vLLM benchmark with disaggregated
# prefill and decode. Both sequence-length scenarios below use the same
# worker layout and the same concurrency sweep; only isl differs.
kimik2.5-int4-mi355x-vllm-disagg:
  image: vllm/vllm-openai-rocm:v0.21.0
  model: moonshotai/Kimi-K2.5-INT4
  model-prefix: kimik2.5
  runner: mi355x-disagg
  precision: int4
  framework: vllm-disagg
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
      - isl: 1024
        osl: 1024
        search-space:
          # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
          - spec-decoding: "none"
            conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
            prefill:
              num-worker: 1
              tp: 8
              ep: 1
              dp-attn: false
              additional-settings:
                - "PREFILL_NODES=1"
                - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
            decode:
              num-worker: 2
              tp: 8
              ep: 8
              dp-attn: false
              additional-settings:
                - "DECODE_NODES=2"

      - isl: 8192
        osl: 1024
        search-space:
          # Same prefill/decode settings as the 1024/1024 scenario above.
          - spec-decoding: "none"
            conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
            prefill:
              num-worker: 1
              tp: 8
              ep: 1
              dp-attn: false
              additional-settings:
                - "PREFILL_NODES=1"
                - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
            decode:
              num-worker: 2
              tp: 8
              ep: 8
              dp-attn: false
              additional-settings:
                - "DECODE_NODES=2"

minimaxm2.5-fp8-mi355x-vllm-disagg:
image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031
model: MiniMaxAI/MiniMax-M2.5
Expand Down
6 changes: 6 additions & 0 deletions benchmarks/multi_node/amd_utils/models_vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ Kimi-K2.5-MXFP4:
env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600"
hf_dir: "models--amd--Kimi-K2.5-MXFP4"

# vLLM server flags and environment for the Kimi-K2.5 INT4 prefill and decode
# workers. decode_flags is prefill_flags plus "--all2all-backend mori".
# NOTE(review): hf_dir names "models--moonshotai--Kimi-K2.5" (no "-INT4"
# suffix) although this entry is keyed "Kimi-K2.5-INT4" -- confirm that this is
# the intended checkpoint directory.
Kimi-K2.5-INT4:
  prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --gpu-memory-utilization 0.9 --mm-encoder-tp-mode data --trust-remote-code"
  decode_flags: "--tensor-parallel-size 8 --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --gpu-memory-utilization 0.9 --mm-encoder-tp-mode data --trust-remote-code"
  env: "VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ROCM_USE_AITER_RMSNORM=0 VLLM_USE_V2_MODEL_RUNNER=1 VLLM_ENGINE_READY_TIMEOUT_S=3600"
  hf_dir: "models--moonshotai--Kimi-K2.5"

MiniMax-M2.5:
# AITER fused-MoE kernel fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384 for gfx950 writes OOB when run with MiniMax's shapes at M=8K(=num batched tokens), crashing vllm during AITER warmup.
# Set token budget to 4k to avoid using that shape, instead of disabling AITER_MOE.
Expand Down
78 changes: 78 additions & 0 deletions benchmarks/multi_node/kimik2.5_int4_mi355x_vllm-disagg.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env bash
#
# Launcher for the Kimi-K2.5 INT4 MI355X vLLM disaggregated (prefill/decode)
# multi-node benchmark.
#
# Steps:
#   1. Load the shared helpers and check the required environment variables.
#   2. Export the job settings and derive the EP / DP-attention toggles.
#   3. Submit the job via amd_utils/submit.sh and print its stdout, which is
#      captured here as JOB_ID.
#
# Exits 1 if the amd_utils directory cannot be entered or submit.sh fails.

source "$(dirname "$0")/../benchmark_lib.sh"

# check_env_vars is provided by benchmark_lib.sh; presumably it aborts when a
# listed variable is missing -- confirm against the library.
check_env_vars \
    CONC_LIST \
    ISL \
    OSL \
    IMAGE \
    SPEC_DECODING \
    MODEL_PATH \
    PREFILL_NUM_WORKERS \
    PREFILL_TP \
    PREFILL_EP \
    PREFILL_DP_ATTN \
    DECODE_NUM_WORKERS \
    DECODE_TP \
    DECODE_EP \
    DECODE_DP_ATTN \
    PREFILL_NODES \
    DECODE_NODES \
    RANDOM_RANGE_RATIO \
    FRAMEWORK

# Log the Slurm placement only when running inside a Slurm job.
if [[ -n "$SLURM_JOB_ID" ]]; then
    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

set -x

# submit.sh is invoked by relative path below, so the working directory matters.
cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1

# Exported for submit.sh and whatever it launches (consumers are not visible
# in this script). MODEL_NAME is not in the required list above, so it may be
# exported empty.
export TIME_LIMIT="08:00:00"
export MODEL_PATH="$MODEL_PATH"
export MODEL_NAME="$MODEL_NAME"
export CONTAINER_IMAGE="$IMAGE"

# An expert-parallel size of 1 (also the default when unset) means EP is off.
if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
    export PREFILL_ENABLE_EP=false
else
    export PREFILL_ENABLE_EP=true
fi

# Only the exact string "true" enables DP attention; anything else disables it.
if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
    export PREFILL_ENABLE_DP=true
else
    export PREFILL_ENABLE_DP=false
fi

if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
    export DECODE_ENABLE_EP=false
else
    export DECODE_ENABLE_EP=true
fi

if [[ "$DECODE_DP_ATTN" == "true" ]]; then
    export DECODE_ENABLE_DP=true
else
    export DECODE_ENABLE_DP=false
fi

# submit.sh takes its inputs positionally. Every expansion is quoted so that an
# empty or whitespace-containing value cannot be dropped or split, which would
# silently shift all later positions. Spaces in CONC_LIST are replaced by "x"
# so the whole list travels as one word. NODELIST is optional and passed as an
# empty argument when unset.
if ! JOB_ID=$(bash ./submit.sh "$PREFILL_NODES" \
    "$PREFILL_NUM_WORKERS" \
    "$DECODE_NODES" \
    "$DECODE_NUM_WORKERS" \
    "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
    "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" \
    "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" \
    "$PREFILL_TP" "$DECODE_TP" \
    "$RANDOM_RANGE_RATIO" \
    "${NODELIST:-}"); then
    echo "Failed to submit job" >&2
    exit 1
fi

echo "$JOB_ID"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New launcher script is verbatim duplicate of existing scripts

Low Severity

kimik2.5_int4_mi355x_vllm-disagg.sh is a verbatim copy of minimaxm2.5_fp8_mi355x_vllm-disagg.sh and nearly identical to kimik2.5_fp4_mi355x_vllm-disagg.sh (differing only by two comment lines). The sglang-disagg launcher scripts (glm5_fp8, qwen3.5_fp8, dsr1_fp8) also share the same logic. There are now 6+ near-identical disagg launchers. Since none contain model-specific logic (all configuration comes from environment variables and models_vllm.yaml), a single shared script could replace all of them, reducing maintenance burden and risk of inconsistent fixes across copies.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 47cb1c7. Configure here.

9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3200,3 +3200,12 @@
- "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523, 1P1D TP8/EP1, dp-attn false, conc [8..512]"
- "MoRI conn.py overlay (48e459bd) via job.slurm; launcher qwen3.5_fp4_mi355x_sglang-disagg.sh"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1579

- config-keys:
    - kimik2.5-int4-mi355x-vllm-disagg
  description:
    - "Add Kimi-K2.5 INT4 MI355X vLLM disaggregated prefill-decode benchmark"
    - "Image: vllm/vllm-openai-rocm:v0.21.0"
    - "1P+2D TP8 topology for 1k1k and 8k1k (conc 8-512), matching the Kimi-K2.5 MXFP4 vLLM-disagg workflow"
    - "Add models_vllm.yaml server flags and multinode launch script kimik2.5_int4_mi355x_vllm-disagg.sh"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1581
Loading