Pass disable_ue8m0_cast explicitly to fp8_m_grouped_gemm_nt_masked

Varun Sundar Rabindranath · Varun Sundar Rabindranath · commit dc7a48ef4e19 · 2025-11-09T00:40:01.000-05:00
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@@ -352,12 +352,22 @@ def apply(
             expected_m,
         )
 
+        quant_scale_fmt = DeepGemmQuantScaleFMT.from_target_arch()
         a2q, a2q_scale = persistent_masked_m_silu_mul_quant(
             workspace1,
             expert_num_tokens,
-            quant_scale_fmt=DeepGemmQuantScaleFMT.from_target_arch(),
+            quant_scale_fmt=quant_scale_fmt,
         )
 
+        # If we have committed to the UE8M0 format, this flag must be set so
+        # that DeepGEMM does the same to the weights if they are not already
+        # in UE8M0 format.
+        enable_dg_ue8m0_cast = quant_scale_fmt == DeepGemmQuantScaleFMT.UE8M0
         fp8_m_grouped_gemm_nt_masked(
-            (a2q, a2q_scale), (w2, self.w2_scale), output, expert_num_tokens, expected_m
+            (a2q, a2q_scale),
+            (w2, self.w2_scale),
+            output,
+            expert_num_tokens,
+            expected_m,
+            disable_ue8m0_cast=not enable_dg_ue8m0_cast,
         )
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
@@ -195,9 +195,12 @@ def fp8_m_grouped_gemm_nt_masked(*args, **kwargs):
     _lazy_init()
     if _grouped_masked_impl is None:
         return _missing(*args, **kwargs)
-    return _grouped_masked_impl(
-        *args, disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), **kwargs
-    )
+    if "disable_ue8m0_cast" in kwargs:
+        disable_ue8m0_cast = kwargs["disable_ue8m0_cast"]
+        del kwargs["disable_ue8m0_cast"]
+    else:
+        disable_ue8m0_cast = not is_deep_gemm_e8m0_used()
+    return _grouped_masked_impl(*args, disable_ue8m0_cast=disable_ue8m0_cast, **kwargs)
 
 
 def fp8_mqa_logits(