Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,26 +67,23 @@ dsr1-fp4-mi355x-atom:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }

dsr1-fp4-mi355x-atom-mtp:
image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
model: amd/DeepSeek-R1-0528-MXFP4
image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
model: amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4
model-prefix: dsr1
runner: mi355x
precision: fp4
# WIP framework (no customers yet)
framework: atom
multinode: false
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
- { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp }
- { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 4, conc-end: 1024, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
#- { tp: 4, conc-start: 32, conc-end: 256, spec-decoding: mtp }
- { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 4, conc-end: 1024, spec-decoding: mtp }

dsr1-fp8-mi300x-sglang:
image: lmsysorg/sglang:v0.5.12-rocm700-mi30x
Expand Down
29 changes: 14 additions & 15 deletions benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,22 @@ PORT=${PORT:-8888}

export OMP_NUM_THREADS=1

# Calculate max-model-len based on ISL and OSL
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
CALCULATED_MAX_MODEL_LEN=""
else
CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 "
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing --max-model-len for the isl=8192 scenario

High Severity

After this change the script sets CALCULATED_MAX_MODEL_LEN only when EVAL_ONLY is true, so it is always empty for normal throughput runs. The same commit, however, also turns on the isl=8192 / osl=1024 scenario in amd-master.yaml. The previous ISL/OSL branch passed --max-model-len 10240 for that case, and sibling ATOM scripts still do. Without a raised limit, the ATOM server may reject or truncate 8192-token prompts.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 14da3ee. Configure here.

fi

CALCULATED_MAX_MODEL_LEN=""
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN "
fi

if [ "$EP_SIZE" -gt 1 ]; then
EP=" --enable-expert-parallel"
else
EP=" "
fi
PARALLEL_ARGS=(-tp "$TP") #TP
if [ "$DP_ATTENTION" = "true" ]; then
if [ "$EP_SIZE" -gt 1 ]; then #DP+EP
PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )
else #DP+TP
PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
fi
fi

SPEC_ARGS=(--method mtp --num-speculative-tokens 3 )

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
Expand All @@ -52,9 +51,9 @@ export AMDGCN_USE_BUFFER_OPS=1
python3 -m atom.entrypoints.openai_server \
--model $MODEL \
--server-port $PORT \
-tp $TP \
--kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN $EP \
--method mtp \
"${PARALLEL_ARGS[@]}" \
"${SPEC_ARGS[@]}" \
--kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN \
> $SERVER_LOG 2>&1 &

SERVER_PID=$!
Expand Down
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3330,6 +3330,16 @@
- "Update vLLM ROCm image from nightly-4f940896a32c9e2a0eba7f50d521bf5f6b4de458 to v0.22.0"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1624

- config-keys:
- dsr1-fp4-mi355x-atom-mtp
description:
- "Update ATOM image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3"
- "Update model from amd/DeepSeek-R1-0528-MXFP4 to amd/DeepSeek-R1-0528-MXFP4-MTP-MoEFP4"
- "Switch to TP=4 only (drop TP=8 search-space); enable isl=8192 TP=4 sweep"
- "isl=1024/osl=1024 TP=4: +5.7% to +16.6% improvement at conc 4-64 vs prior InferenceX numbers"
- "isl=1024/osl=1024 TP=4: -0.6% to -1.9% at high concurrency (conc 128-256)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1629

- config-keys:
- kimik2.5-fp4-mi355x-vllm
description:
Expand Down