diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
index 6b4a0b8cf073..3d0c5636d6c0 100644
--- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
@@ -28,13 +28,18 @@ def __init__(
         super().__init__(**kwargs)
         self._shared_experts = shared_experts
 
-        # Disable shared expert overlap if we are not using
-        # flashinfer + DP since there is nothing to be gained in this case.
-        # Disabling the overlap optimization also prevents the shared experts
-        # from being hidden from torch.compile.
+        # Disable shared expert overlap if we are using eplb, because of
+        # correctness issues, or if using flashinfer with DP, since there
+        # is nothing to be gained in this case. Disabling the overlap
+        # optimization also prevents the shared experts from being hidden
+        # from torch.compile.
         self.use_overlapped = (
             use_overlapped
-            and not (self.use_flashinfer_cutlass_kernels and self.dp_size > 1)
+            and not (
+                # TODO(wentao): find the root cause and remove this condition
+                self.enable_eplb
+                or (self.use_flashinfer_cutlass_kernels and self.dp_size > 1)
+            )
             and self._shared_experts is not None
         )
 
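For reference, a minimal standalone sketch of how the new gating condition evaluates. This is not the vLLM implementation; compute_use_overlapped is a hypothetical helper whose parameters mirror the attributes referenced in the diff (enable_eplb, use_flashinfer_cutlass_kernels, dp_size).

    # Sketch of the overlap gating introduced by this change, assuming plain
    # boolean/int inputs in place of the SharedFusedMoE instance attributes.
    def compute_use_overlapped(
        use_overlapped: bool,
        enable_eplb: bool,
        use_flashinfer_cutlass_kernels: bool,
        dp_size: int,
        has_shared_experts: bool,
    ) -> bool:
        # Overlap is disabled when EPLB is enabled (correctness issues) or when
        # flashinfer cutlass kernels are combined with DP (no benefit).
        disabled = enable_eplb or (use_flashinfer_cutlass_kernels and dp_size > 1)
        return use_overlapped and not disabled and has_shared_experts

    # EPLB alone now disables the overlap path.
    assert compute_use_overlapped(True, True, False, 1, True) is False
    # flashinfer + DP still disables it, as before this change.
    assert compute_use_overlapped(True, False, True, 2, True) is False
    # Otherwise the overlap optimization remains enabled.
    assert compute_use_overlapped(True, False, True, 1, True) is True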