From 88e318cb29e71d45868cdc71a55f3d5e4a6814a8 Mon Sep 17 00:00:00 2001 From: haic0 Date: Thu, 28 May 2026 11:22:30 -0400 Subject: [PATCH 1/2] [AMD] Add Kimi K2.5 INT4 MI355X vLLM disagg Co-authored-by: Cursor --- .github/configs/amd-master.yaml | 54 +++++++++++++ .../multi_node/amd_utils/models_vllm.yaml | 6 ++ .../kimik2.5_int4_mi355x_vllm-disagg.sh | 78 +++++++++++++++++++ 3 files changed, 138 insertions(+) create mode 100755 benchmarks/multi_node/kimik2.5_int4_mi355x_vllm-disagg.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 3544aad49..043d2d8e4 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1577,6 +1577,60 @@ kimik2.5-fp4-mi355x-vllm-disagg: additional-settings: - "DECODE_NODES=2" +kimik2.5-int4-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:v0.21.0 + model: moonshotai/Kimi-K2.5-INT4 + model-prefix: kimik2.5 + runner: mi355x-disagg + precision: int4 + framework: vllm-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "VLLM_MORIIO_CONNECTOR_READ_MODE=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + minimaxm2.5-fp8-mi355x-vllm-disagg: image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031 model: MiniMaxAI/MiniMax-M2.5 diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml index b051de8d9..5658b11c4 100644 --- a/benchmarks/multi_node/amd_utils/models_vllm.yaml +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ -30,6 +30,12 @@ Kimi-K2.5-MXFP4: env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--amd--Kimi-K2.5-MXFP4" +Kimi-K2.5-INT4: + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --gpu-memory-utilization 0.9 --mm-encoder-tp-mode data --trust-remote-code" + decode_flags: "--tensor-parallel-size 8 --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --gpu-memory-utilization 0.9 --mm-encoder-tp-mode data --trust-remote-code" + env: "VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ROCM_USE_AITER_RMSNORM=0 VLLM_USE_V2_MODEL_RUNNER=1 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--moonshotai--Kimi-K2.5" + MiniMax-M2.5: # AITER fused-MoE kernel fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384 for gfx950 writes OOB when run with MiniMax's shapes at M=8K(=num batched tokens), crashing vllm during AITER warmup. # Set token budget to 4k to avoid using that shape, instead of disabling AITER_MOE. diff --git a/benchmarks/multi_node/kimik2.5_int4_mi355x_vllm-disagg.sh b/benchmarks/multi_node/kimik2.5_int4_mi355x_vllm-disagg.sh new file mode 100755 index 000000000..a9a28d889 --- /dev/null +++ b/benchmarks/multi_node/kimik2.5_int4_mi355x_vllm-disagg.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" From 0aceaf2622ce8fb0df7da6503a345ffb93889c9a Mon Sep 17 00:00:00 2001 From: haic0 Date: Thu, 28 May 2026 11:29:49 -0400 Subject: [PATCH 2/2] [AMD] Trigger Kimi K2.5 INT4 MI355X sweep Co-authored-by: Cursor --- perf-changelog.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 935cded22..9a8a69600 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3200,3 +3200,12 @@ - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523, 1P1D TP8/EP1, dp-attn false, conc [8..512]" - "MoRI conn.py overlay (48e459bd) via job.slurm; launcher qwen3.5_fp4_mi355x_sglang-disagg.sh" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1579 + +- config-keys: + - kimik2.5-int4-mi355x-vllm-disagg + description: + - "Add Kimi-K2.5 INT4 MI355X vLLM disaggregated prefill-decode benchmark" + - "Image: vllm/vllm-openai-rocm:v0.21.0" + - "1P+2D TP8 topology for 1k1k and 8k1k (conc 8-512), matching the Kimi-K2.5 MXFP4 vLLM-disagg workflow" + - "Add models_vllm.yaml server flags and multinode launch script kimik2.5_int4_mi355x_vllm-disagg.sh" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1581