34 changes: 14 additions & 20 deletions docker/Dockerfile
@@ -1,10 +1,11 @@
-ARG CUDA_VERSION=12.8.0
+ARG CUDA_VERSION=12.9.0
 FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
 
 ARG PYTHON_VERSION=3.10
 ARG MAMBA_VERSION=24.7.1-0
 ARG VLLM_VERSION=0.16.0
 ARG FLASH_MLA_REF=47c35a7
+ARG DEEPGEMM_REF=891d57b4db1071624b5c8fa0d1e51cb317fa709f
 ARG TARGETPLATFORM
 ARG ENABLE_DEEPEP=1
 ARG ENABLE_NIXL=1
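The new `DEEPGEMM_REF` build argument pins the DeepGEMM checkout used by the build step added further down in this Dockerfile, and it is exposed on the command line through `build.sh --deepgemm-ref` (see the script changes below).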
@@ -78,27 +79,20 @@ RUN if [ "${ENABLE_NIXL}" = "1" ] || [ "${ENABLE_DEEPEP}" = "1" ]; then \
 RUN if [ "${ENABLE_DEEPEP}" = "1" ]; then \
     set -e; \
     ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so; \
-    NVSHMEM_VERSION=3.3.9; \
-    CUDA_ARCHS=90; \
-    wget https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
-    && tar -xf nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz && mv nvshmem_src nvshmem \
-    && cd nvshmem \
-    && rm -f /root/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
-    && NVSHMEM_SHMEM_SUPPORT=0 \
-       NVSHMEM_UCX_SUPPORT=0 \
-       NVSHMEM_USE_NCCL=0 \
-       NVSHMEM_MPI_SUPPORT=0 \
-       NVSHMEM_IBGDA_SUPPORT=1 \
-       NVSHMEM_PMIX_SUPPORT=0 \
-       NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
-       NVSHMEM_USE_GDRCOPY=1 \
-       cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS} \
-    && cmake --build build --target install -j64; \
-    DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58; \
-    cd /root && git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd ..; \
-    cd /root/DeepEP && NVSHMEM_DIR=/root/nvshmem/install python setup.py install; \
+    python -m pip install --upgrade --no-deps \
+        "nvidia-nccl-cu12==2.30.4" \
+        "nvidia-nvshmem-cu12==3.5.21"; \
+    cd /root && git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout b306af06afd412c88e51e71802951606e40b7358; \
+    ln -sf /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvshmem/lib/libnvshmem_host.so.3 /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvshmem/lib/libnvshmem_host.so; \
+    ln -sf /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nccl/lib/libnccl.so.2 /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nccl/lib/libnccl.so; \
+    pip install --no-build-isolation .; \
 fi
 
+RUN cd /root && git clone https://github.com/deepseek-ai/DeepGEMM.git && \
+    cd DeepGEMM && git checkout ${DEEPGEMM_REF} && \
+    git submodule update --init --recursive && \
+    pip install --no-build-isolation .
+
 RUN if [ "${ENABLE_NIXL}" = "1" ]; then \
     apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \
     DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \
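The rewritten DeepEP stage drops the from-source NVSHMEM build (and its pinned `CUDA_ARCHS=90`) in favor of the prebuilt `nvidia-nccl-cu12` and `nvidia-nvshmem-cu12` wheels; the two `ln -sf` lines then expose the unversioned `.so` names that DeepEP's build links against. A minimal smoke test for that layout, run inside the built image, might look like the sketch below (illustrative only, not part of this PR; the Python 3.10 path follows `PYTHON_VERSION` above):

```python
# Hypothetical post-build check: confirm the unversioned symlinks created in
# the Dockerfile resolve to loadable libraries. Not part of this PR.
import ctypes
import pathlib

SITE = pathlib.Path("/opt/conda/lib/python3.10/site-packages/nvidia")

for lib in (SITE / "nvshmem/lib/libnvshmem_host.so",
            SITE / "nccl/lib/libnccl.so"):
    assert lib.is_symlink(), f"expected {lib} to be a symlink to the versioned .so"
    ctypes.CDLL(str(lib))  # raises OSError if the library cannot be resolved
print("NVSHMEM and NCCL host libraries load correctly")
```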
12 changes: 9 additions & 3 deletions docker/scripts/build.sh
@@ -18,7 +18,8 @@ set -euo pipefail
 # --no-nixl Disable NIXL (default: enabled)
 # --no-cache Disable cache (default: enabled)
 # --lite Disable DEEPEP, NIXL and cache in one shot
-# --cuda-version <ver> CUDA version (default: 12.8.0)
+# --cuda-version <ver> CUDA version (default: 12.9.0)
+# --deepgemm-ref <ref> DeepGEMM git ref (default: 891d57b4db1071624b5c8fa0d1e51cb317fa709f)
 # --image-prefix <name> Image prefix (default: lightllm)
 # --image-tag <tag> Image tag (default: generated from enabled features)
 # -h / --help Show help
@@ -27,7 +28,8 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
 cd "${ROOT_DIR}"
 
 IMAGE_PREFIX="${IMAGE_PREFIX:-lightllm}"
-CUDA_VERSION="${CUDA_VERSION:-12.8.0}"
+CUDA_VERSION="${CUDA_VERSION:-12.9.0}"
+DEEPGEMM_REF="${DEEPGEMM_REF:-891d57b4db1071624b5c8fa0d1e51cb317fa709f}"
 IMAGE_TAG="${IMAGE_TAG:-}"
 
 ENABLE_DEEPEP="${ENABLE_DEEPEP:-1}"
@@ -52,6 +54,10 @@ while [[ $# -gt 0 ]]; do
         CUDA_VERSION="${2:-}"
         shift
         ;;
+    --deepgemm-ref)
+        DEEPGEMM_REF="${2:-}"
+        shift
+        ;;
     --image-prefix)
         IMAGE_PREFIX="${2:-}"
         shift
@@ -97,9 +103,9 @@ fi
 
 DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile \
     --build-arg CUDA_VERSION="${CUDA_VERSION}" \
+    --build-arg DEEPGEMM_REF="${DEEPGEMM_REF}" \
     --build-arg ENABLE_DEEPEP="${ENABLE_DEEPEP}" \
     --build-arg ENABLE_NIXL="${ENABLE_NIXL}" \
     --build-arg ENABLE_CACHE="${ENABLE_CACHE}" \
     --progress=plain \
     -t "${IMAGE_PREFIX}:${IMAGE_TAG}" .

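With the new flag, a specific DeepGEMM commit can be selected at build time without editing the Dockerfile, e.g. `bash docker/scripts/build.sh --deepgemm-ref 891d57b4db1071624b5c8fa0d1e51cb317fa709f`; because the script initializes the value with `DEEPGEMM_REF="${DEEPGEMM_REF:-<default>}"`, the same ref can also be supplied as an environment variable.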
27 changes: 27 additions & 0 deletions
@@ -11,6 +11,7 @@
 from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.impl import select_fuse_moe_impl
 from lightllm.common.quantization.quantize_method import QuantizationMethod
 from lightllm.utils.envs_utils import get_redundancy_expert_ids, get_redundancy_expert_num, get_env_start_args
+from lightllm.utils.device_utils import is_sm100_gpu
 from lightllm.utils.dist_utils import get_global_world_size, get_global_rank
 from lightllm.utils.log_utils import init_logger
 
@@ -48,6 +49,7 @@ def __init__(
         self.quant_method = quant_method
         assert num_fused_shared_experts in [0, 1], "num_fused_shared_experts can only support 0 or 1 now."
         self.enable_ep_moe = get_env_start_args().enable_ep_moe
+        self.quant_method = self._maybe_upgrade_quant_method_for_ep_moe(self.quant_method)
         self.n_routed_experts = n_routed_experts
         self.num_fused_shared_experts = num_fused_shared_experts
         self._init_config(network_config)
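Note that the upgrade hook runs immediately after `enable_ep_moe` is read, before `_init_config` and weight creation, so everything downstream sees the final quantization method.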
@@ -66,6 +68,28 @@ def __init__(
         self.lock = threading.Lock()
         self._create_weight()
 
+    def _maybe_upgrade_quant_method_for_ep_moe(self, quant_method: QuantizationMethod) -> QuantizationMethod:
+        if not self.enable_ep_moe:
+            return quant_method
+
+        target_method = "deepgemm-fp8fp4-b32" if is_sm100_gpu() else "deepgemm-fp8w8a8-b128"
+        if quant_method.method_name == "none":
+            from lightllm.common.quantization.registry import QUANTMETHODS
+
+            logger.info(
+                f"enable_ep_moe requires DeepGEMM MoE expert weights; "
+                f"auto-upgrading fused_moe quantization from `none` to `{target_method}`."
+            )
+            quant_method = QUANTMETHODS.get(target_method)
+
+        if quant_method.method_name != target_method:
+            raise ValueError(
+                f"enable_ep_moe currently requires `{target_method}` for fused_moe on this GPU, "
+                f"but got `{quant_method.method_name}`."
+            )
+
+        return quant_method
+
     def _init_config(self, network_config: Dict[str, Any]):
         self.n_group = network_config.get("n_group", 0)
         self.use_grouped_topk = self.n_group > 0
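The rule implemented above: when `enable_ep_moe` is on, an unquantized (`none`) fused-MoE config is auto-upgraded to the DeepGEMM method for the detected GPU generation (`deepgemm-fp8fp4-b32` on SM100, `deepgemm-fp8w8a8-b128` otherwise), and any other mismatch is a hard error. A standalone restatement of that rule, handy for unit-testing it in isolation (`resolve_moe_quant_method` is hypothetical scaffolding, not lightllm API):

```python
# Hypothetical restatement of the upgrade rule for testing in isolation;
# `resolve_moe_quant_method` is illustrative scaffolding, not lightllm API.
def resolve_moe_quant_method(current: str, is_sm100: bool) -> str:
    target = "deepgemm-fp8fp4-b32" if is_sm100 else "deepgemm-fp8w8a8-b128"
    if current == "none":
        return target  # auto-upgrade, mirroring the logger.info branch above
    if current != target:
        raise ValueError(f"enable_ep_moe requires `{target}`, got `{current}`")
    return current

assert resolve_moe_quant_method("none", is_sm100=True) == "deepgemm-fp8fp4-b32"
assert resolve_moe_quant_method("none", is_sm100=False) == "deepgemm-fp8w8a8-b128"
assert resolve_moe_quant_method("deepgemm-fp8w8a8-b128", is_sm100=False) == "deepgemm-fp8w8a8-b128"
```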
@@ -147,6 +171,9 @@ def experts(
             is_prefill=is_prefill,
         )
 
+    def use_sm100_mega_moe(self) -> bool:
+        return bool(getattr(self.fuse_moe_impl, "_use_sm100_fp4_moe", lambda: False)())
+
     def low_latency_dispatch(
         self,
         hidden_states: torch.Tensor,
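`use_sm100_mega_moe` probes whatever fuse-MoE implementation was selected for a private `_use_sm100_fp4_moe` hook, with `lambda: False` as the default so the trailing call is always valid on implementations that never defined the hook. A minimal illustration of the pattern (both classes are hypothetical stand-ins):

```python
# Minimal illustration of the getattr-with-callable-default probe used above;
# both classes are hypothetical stand-ins for fuse-MoE implementations.
class LegacyImpl:
    pass  # no _use_sm100_fp4_moe hook

class Sm100Impl:
    def _use_sm100_fp4_moe(self) -> bool:
        return True

def probe(impl) -> bool:
    # A missing attribute resolves to `lambda: False`, so the trailing call
    # is always valid and the result is normalized to a plain bool.
    return bool(getattr(impl, "_use_sm100_fp4_moe", lambda: False)())

assert probe(LegacyImpl()) is False
assert probe(Sm100Impl()) is True
```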