34 changes: 14 additions & 20 deletions docker/Dockerfile
@@ -1,10 +1,11 @@
-ARG CUDA_VERSION=12.8.0
+ARG CUDA_VERSION=12.9.0
 FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
 
 ARG PYTHON_VERSION=3.10
 ARG MAMBA_VERSION=24.7.1-0
 ARG VLLM_VERSION=0.16.0
 ARG FLASH_MLA_REF=47c35a7
+ARG DEEPGEMM_REF=891d57b4db1071624b5c8fa0d1e51cb317fa709f
 ARG TARGETPLATFORM
 ARG ENABLE_DEEPEP=1
 ARG ENABLE_NIXL=1
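The new `DEEPGEMM_REF` build argument pins the DeepGEMM checkout used by the build step added further down in this Dockerfile, and it is exposed on the command line through `build.sh --deepgemm-ref` (see the script changes below).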
@@ -78,27 +79,20 @@ RUN if [ "${ENABLE_NIXL}" = "1" ] || [ "${ENABLE_DEEPEP}" = "1" ]; then \
 RUN if [ "${ENABLE_DEEPEP}" = "1" ]; then \
     set -e; \
     ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so; \
-    NVSHMEM_VERSION=3.3.9; \
-    CUDA_ARCHS=90; \
-    wget https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
-    && tar -xf nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz && mv nvshmem_src nvshmem \
-    && cd nvshmem \
-    && rm -f /root/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
-    && NVSHMEM_SHMEM_SUPPORT=0 \
-       NVSHMEM_UCX_SUPPORT=0 \
-       NVSHMEM_USE_NCCL=0 \
-       NVSHMEM_MPI_SUPPORT=0 \
-       NVSHMEM_IBGDA_SUPPORT=1 \
-       NVSHMEM_PMIX_SUPPORT=0 \
-       NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
-       NVSHMEM_USE_GDRCOPY=1 \
-       cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS} \
-    && cmake --build build --target install -j64; \
-    DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58; \
-    cd /root && git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd ..; \
-    cd /root/DeepEP && NVSHMEM_DIR=/root/nvshmem/install python setup.py install; \
+    python -m pip install --upgrade --no-deps \
+        "nvidia-nccl-cu12==2.30.4" \
+        "nvidia-nvshmem-cu12==3.5.21"; \
+    cd /root && git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout b306af06afd412c88e51e71802951606e40b7358; \
+    ln -sf /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvshmem/lib/libnvshmem_host.so.3 /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvshmem/lib/libnvshmem_host.so; \
+    ln -sf /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nccl/lib/libnccl.so.2 /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nccl/lib/libnccl.so; \
+    pip install --no-build-isolation .; \
 fi
 
+RUN cd /root && git clone https://github.com/deepseek-ai/DeepGEMM.git && \
+    cd DeepGEMM && git checkout ${DEEPGEMM_REF} && \
+    git submodule update --init --recursive && \
+    pip install --no-build-isolation .
+
 RUN if [ "${ENABLE_NIXL}" = "1" ]; then \
     apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \
     DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \
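The rewritten DeepEP stage drops the from-source NVSHMEM build (and its pinned `CUDA_ARCHS=90`) in favor of the prebuilt `nvidia-nccl-cu12` and `nvidia-nvshmem-cu12` wheels; the two `ln -sf` lines then expose the unversioned `.so` names that DeepEP's build links against. A minimal smoke test for that layout, run inside the built image, might look like the sketch below (illustrative only, not part of this PR; the Python 3.10 path follows `PYTHON_VERSION` above):

```python
# Hypothetical post-build check: confirm the unversioned symlinks created in
# the Dockerfile resolve to loadable libraries. Not part of this PR.
import ctypes
import pathlib

SITE = pathlib.Path("/opt/conda/lib/python3.10/site-packages/nvidia")

for lib in (SITE / "nvshmem/lib/libnvshmem_host.so",
            SITE / "nccl/lib/libnccl.so"):
    assert lib.is_symlink(), f"expected {lib} to be a symlink to the versioned .so"
    ctypes.CDLL(str(lib))  # raises OSError if the library cannot be resolved
print("NVSHMEM and NCCL host libraries load correctly")
```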
12 changes: 9 additions & 3 deletions docker/scripts/build.sh
@@ -18,7 +18,8 @@ set -euo pipefail
 # --no-nixl Disable NIXL (default: enabled)
 # --no-cache Disable cache (default: enabled)
 # --lite Disable DEEPEP, NIXL and cache in one shot
-# --cuda-version <ver> CUDA version (default: 12.8.0)
+# --cuda-version <ver> CUDA version (default: 12.9.0)
+# --deepgemm-ref <ref> DeepGEMM git ref (default: 891d57b4db1071624b5c8fa0d1e51cb317fa709f)
 # --image-prefix <name> Image prefix (default: lightllm)
 # --image-tag <tag> Image tag (default: generated from enabled features)
 # -h / --help Show help
@@ -27,7 +28,8 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
 cd "${ROOT_DIR}"
 
 IMAGE_PREFIX="${IMAGE_PREFIX:-lightllm}"
-CUDA_VERSION="${CUDA_VERSION:-12.8.0}"
+CUDA_VERSION="${CUDA_VERSION:-12.9.0}"
+DEEPGEMM_REF="${DEEPGEMM_REF:-891d57b4db1071624b5c8fa0d1e51cb317fa709f}"
 IMAGE_TAG="${IMAGE_TAG:-}"
 
 ENABLE_DEEPEP="${ENABLE_DEEPEP:-1}"
@@ -52,6 +54,10 @@ while [[ $# -gt 0 ]]; do
         CUDA_VERSION="${2:-}"
         shift
         ;;
+    --deepgemm-ref)
+        DEEPGEMM_REF="${2:-}"
+        shift
+        ;;
     --image-prefix)
         IMAGE_PREFIX="${2:-}"
         shift
@@ -97,9 +103,9 @@ fi
 
 DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile \
     --build-arg CUDA_VERSION="${CUDA_VERSION}" \
+    --build-arg DEEPGEMM_REF="${DEEPGEMM_REF}" \
     --build-arg ENABLE_DEEPEP="${ENABLE_DEEPEP}" \
     --build-arg ENABLE_NIXL="${ENABLE_NIXL}" \
     --build-arg ENABLE_CACHE="${ENABLE_CACHE}" \
     --progress=plain \
     -t "${IMAGE_PREFIX}:${IMAGE_TAG}" .

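With the new flag, a specific DeepGEMM commit can be selected at build time without editing the Dockerfile, e.g. `bash docker/scripts/build.sh --deepgemm-ref 891d57b4db1071624b5c8fa0d1e51cb317fa709f`; because the script initializes the value with `DEEPGEMM_REF="${DEEPGEMM_REF:-<default>}"`, the same ref can also be supplied as an environment variable.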
27 changes: 27 additions & 0 deletions
@@ -11,6 +11,7 @@
 from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.impl import select_fuse_moe_impl
 from lightllm.common.quantization.quantize_method import QuantizationMethod
 from lightllm.utils.envs_utils import get_redundancy_expert_ids, get_redundancy_expert_num, get_env_start_args
+from lightllm.utils.device_utils import is_sm100_gpu
 from lightllm.utils.dist_utils import get_global_world_size, get_global_rank
 from lightllm.utils.log_utils import init_logger
 
@@ -48,6 +49,7 @@ def __init__(
         self.quant_method = quant_method
         assert num_fused_shared_experts in [0, 1], "num_fused_shared_experts can only support 0 or 1 now."
         self.enable_ep_moe = get_env_start_args().enable_ep_moe
+        self.quant_method = self._maybe_upgrade_quant_method_for_ep_moe(self.quant_method)
         self.n_routed_experts = n_routed_experts
         self.num_fused_shared_experts = num_fused_shared_experts
         self._init_config(network_config)
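Note that the upgrade hook runs immediately after `enable_ep_moe` is read, before `_init_config` and weight creation, so everything downstream sees the final quantization method.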
@@ -66,6 +68,28 @@ def __init__(
         self.lock = threading.Lock()
         self._create_weight()
 
+    def _maybe_upgrade_quant_method_for_ep_moe(self, quant_method: QuantizationMethod) -> QuantizationMethod:
+        if not self.enable_ep_moe:
+            return quant_method
+
+        target_method = "deepgemm-fp8fp4-b32" if is_sm100_gpu() else "deepgemm-fp8w8a8-b128"
+        if quant_method.method_name == "none":
+            from lightllm.common.quantization.registry import QUANTMETHODS
+
+            logger.info(
+                f"enable_ep_moe requires DeepGEMM MoE expert weights; "
+                f"auto-upgrading fused_moe quantization from `none` to `{target_method}`."
+            )
+            quant_method = QUANTMETHODS.get(target_method)
+
+        if quant_method.method_name != target_method:
+            raise ValueError(
+                f"enable_ep_moe currently requires `{target_method}` for fused_moe on this GPU, "
+                f"but got `{quant_method.method_name}`."
+            )
+
+        return quant_method
+
     def _init_config(self, network_config: Dict[str, Any]):
         self.n_group = network_config.get("n_group", 0)
         self.use_grouped_topk = self.n_group > 0
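The rule implemented above: when `enable_ep_moe` is on, an unquantized (`none`) fused-MoE config is auto-upgraded to the DeepGEMM method for the detected GPU generation (`deepgemm-fp8fp4-b32` on SM100, `deepgemm-fp8w8a8-b128` otherwise), and any other mismatch is a hard error. A standalone restatement of that rule, handy for unit-testing it in isolation (`resolve_moe_quant_method` is hypothetical scaffolding, not lightllm API):

```python
# Hypothetical restatement of the upgrade rule for testing in isolation;
# `resolve_moe_quant_method` is illustrative scaffolding, not lightllm API.
def resolve_moe_quant_method(current: str, is_sm100: bool) -> str:
    target = "deepgemm-fp8fp4-b32" if is_sm100 else "deepgemm-fp8w8a8-b128"
    if current == "none":
        return target  # auto-upgrade, mirroring the logger.info branch above
    if current != target:
        raise ValueError(f"enable_ep_moe requires `{target}`, got `{current}`")
    return current

assert resolve_moe_quant_method("none", is_sm100=True) == "deepgemm-fp8fp4-b32"
assert resolve_moe_quant_method("none", is_sm100=False) == "deepgemm-fp8w8a8-b128"
assert resolve_moe_quant_method("deepgemm-fp8w8a8-b128", is_sm100=False) == "deepgemm-fp8w8a8-b128"
```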
@@ -147,6 +171,9 @@ def experts(
             is_prefill=is_prefill,
         )
 
+    def use_sm100_mega_moe(self) -> bool:
+        return bool(getattr(self.fuse_moe_impl, "_use_sm100_fp4_moe", lambda: False)())
+
     def low_latency_dispatch(
         self,
         hidden_states: torch.Tensor,
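`use_sm100_mega_moe` probes whatever fuse-MoE implementation was selected for a private `_use_sm100_fp4_moe` hook, with `lambda: False` as the default so the trailing call is always valid on implementations that never defined the hook. A minimal illustration of the pattern (both classes are hypothetical stand-ins):

```python
# Minimal illustration of the getattr-with-callable-default probe used above;
# both classes are hypothetical stand-ins for fuse-MoE implementations.
class LegacyImpl:
    pass  # no _use_sm100_fp4_moe hook

class Sm100Impl:
    def _use_sm100_fp4_moe(self) -> bool:
        return True

def probe(impl) -> bool:
    # A missing attribute resolves to `lambda: False`, so the trailing call
    # is always valid and the result is normalized to a plain bool.
    return bool(getattr(impl, "_use_sm100_fp4_moe", lambda: False)())

assert probe(LegacyImpl()) is False
assert probe(Sm100Impl()) is True
```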