SemiAnalysisAI · billishyahao · May 29, 2026 · May 29, 2026 · May 30, 2026 · May 30, 2026
@@ -1861,8 +1861,8 @@ dsr1-fp4-mi355x-sglang-disagg:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=0"
 
-dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
+dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1970,7 +1970,19 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
           additional-settings:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
+
 
+dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
     - isl: 8192
       osl: 1024
       search-space:
@@ -2015,7 +2027,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 
       # 1P2D TP8
       - spec-decoding: "mtp"
-        conc-list: [ 64, 128, 256 ]
+        conc-list: [ 32, 64 ]
         prefill:
           num-worker: 1
           tp: 8
@@ -2030,11 +2042,11 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
           dp-attn: false
           additional-settings:
           - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=2"
+          - "DECODE_MTP_SIZE=3"
 
       # 1*DEP8 + 1*DEP8
       - spec-decoding: "mtp"
-        conc-list: [ 128, 512 ]
+        conc-list: [ 512 ]
         prefill:
           num-worker: 1
           tp: 8
@@ -2049,11 +2061,11 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
           dp-attn: true
           additional-settings:
           - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=1"
+          - "DECODE_MTP_SIZE=3"
 
       # 1*DEP8 + 1*DEP8
       - spec-decoding: "mtp"
-        conc-list: [ 64, 256 ]
+        conc-list: [ 256 ]
         prefill:
           num-worker: 1
           tp: 8
@@ -2068,11 +2080,50 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
           dp-attn: true
           additional-settings:
           - "DECODE_NODES=1"
-          - "DECODE_MTP_SIZE=1"
+          - "DECODE_MTP_SIZE=3"
+
+
+      # 1*DEP8 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 128 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+
+      # 1*DEP8 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 64 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
 
       # 2*DEP8 + 1*DEP8
       - spec-decoding: "mtp"
-        conc-list: [ 1024, 2048, 4096 ]
+        conc-list: [ 2048, 4096 ]
         prefill:
           num-worker: 2
           tp: 8
@@ -2088,7 +2139,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
           additional-settings:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
-      
+
 
 # DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
 # amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
@@ -124,9 +124,11 @@ else
     # =========================================================================
 
     export SGLANG_USE_AITER=1
+    export AITER_LOG_LEVEL=ERROR
 
     export SGLANG_MORI_DISPATCH_DTYPE=auto
-    export SGLANG_MORI_FP8_COMB=true
+    export MORI_COMBINE_DTYPE_PREFILL=fp8_direct_cast
+    export MORI_COMBINE_DTYPE_DECODE=fp8
     export SGLANG_MORI_QP_PER_TRANSFER=4
     export SGLANG_MORI_NUM_WORKERS=4
     export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3201,6 +3201,13 @@
     - "MoRI conn.py overlay (48e459bd) via job.slurm; launcher qwen3.5_fp4_mi355x_sglang-disagg.sh"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1579
 
+- config-keys:
+    - dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp
+  description:
+    - "Bump the image to v0.5.12.post1-rocm720-mi35x-20260529 (May 29)"
+    - "Add new sweep points at conc 128/256"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1584
+
 - config-keys:
     - glm5-fp8-gb300-dynamo-sglang
   description: