From 261b58daced5f677259c77a9a968a4a1978ba458 Mon Sep 17 00:00:00 2001 From: Pavel Belevich Date: Wed, 19 Nov 2025 21:27:31 +0000 Subject: [PATCH] expert parallelism benchmarks --- .../deepep-benchmark/.gitignore | 1 + .../deepep-benchmark/README.md | 147 +++ .../deepep-benchmark/deepep.Dockerfile | 191 ++++ .../pplx-garden-benchmark/.gitignore | 5 + .../pplx-garden-benchmark/README.md | 58 ++ .../pplx-garden.Dockerfile | 9 + .../pplx-garden-benchmark/pplx-garden.sbatch | 64 ++ .../pplx-garden_uccl-ep.Dockerfile | 13 + .../uccl-pplx-garden.sbatch | 68 ++ .../uccl_bench_all_to_all.py | 838 ++++++++++++++++++ .../pplx-kernels-benchmark/.gitignore | 4 + .../pplx-kernels-benchmark/README.md | 60 ++ .../pplx-kernels-benchmark/coredump/.gitkeep | 0 .../pplx-kernels-benchmark/data/.gitkeep | 0 .../launch_bench_all_to_all.py | 62 ++ .../pplx-kernels.Dockerfile | 197 ++++ .../pplx-kernels.sbatch | 71 ++ .../uccl-ep-benchmark/.gitignore | 3 + .../uccl-ep-benchmark/LICENSE | 21 + .../uccl-ep-benchmark/README.md | 33 + .../uccl-ep-benchmark/test_internode.sbatch | 52 ++ .../uccl-ep-benchmark/test_intranode.sbatch | 51 ++ .../uccl-ep-benchmark/test_low_latency.sbatch | 51 ++ .../uccl-ep-benchmark/uccl-ep.Dockerfile | 136 +++ 24 files changed, 2135 insertions(+) create mode 100644 3.test_cases/expert-parallelism/deepep-benchmark/.gitignore create mode 100644 3.test_cases/expert-parallelism/deepep-benchmark/README.md create mode 100644 3.test_cases/expert-parallelism/deepep-benchmark/deepep.Dockerfile create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/.gitignore create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/README.md create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.Dockerfile create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.sbatch create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden_uccl-ep.Dockerfile create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl-pplx-garden.sbatch create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl_bench_all_to_all.py create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/.gitignore create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/README.md create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/coredump/.gitkeep create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/data/.gitkeep create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/launch_bench_all_to_all.py create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.Dockerfile create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.sbatch create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/.gitignore create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/LICENSE create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/README.md create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/test_internode.sbatch create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/test_intranode.sbatch create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/test_low_latency.sbatch create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/uccl-ep.Dockerfile diff --git a/3.test_cases/expert-parallelism/deepep-benchmark/.gitignore b/3.test_cases/expert-parallelism/deepep-benchmark/.gitignore new file 
mode 100644
index 000000000..270f8d417
--- /dev/null
+++ b/3.test_cases/expert-parallelism/deepep-benchmark/.gitignore
@@ -0,0 +1 @@
+*.sqsh
diff --git a/3.test_cases/expert-parallelism/deepep-benchmark/README.md b/3.test_cases/expert-parallelism/deepep-benchmark/README.md
new file mode 100644
index 000000000..eef71beec
--- /dev/null
+++ b/3.test_cases/expert-parallelism/deepep-benchmark/README.md
@@ -0,0 +1,147 @@
+# DeepEP Benchmark
+https://github.com/deepseek-ai/DeepEP
+
+Updated to [NVSHMEM 3.4.5-0](https://github.com/NVIDIA/nvshmem/commit/df2814155acfba6227534dd81a8bf338da9e55f2) and DeepEP [Sep 25, 2025](https://github.com/deepseek-ai/DeepEP/tree/e02e4d2e1fbfdf09e02e870b6acc5831cbd11e39)
+
+## Git clone NVSHMEM
+
+3.4.5-0:
+```
+git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout df2814155acfba6227534dd81a8bf338da9e55f2 && cd ..
+```
+
+devel branch:
+```
+git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout devel && cd ..
+```
+
+## Building DeepEP Docker image
+
+```bash
+GDRCOPY_VERSION=v2.5.1
+EFA_INSTALLER_VERSION=1.43.2
+NCCL_VERSION=v2.27.7-1
+NCCL_TESTS_VERSION=v2.16.9
+NVSHMEM_VERSION=3.4.5-0
+TAG="efa${EFA_INSTALLER_VERSION}-nccl${NCCL_VERSION}-tests${NCCL_TESTS_VERSION}-nvshmem${NVSHMEM_VERSION}"
+DEEPEP_CONTAINER_IMAGE_NAME_TAG="deepep:${TAG}"
+```
+
+```bash
+docker build --progress=plain -f ./deepep.Dockerfile \
+    --build-arg="EFA_INSTALLER_VERSION=${EFA_INSTALLER_VERSION}" \
+    --build-arg="NCCL_VERSION=${NCCL_VERSION}" \
+    --build-arg="NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}" \
+    --build-arg="NVSHMEM_VERSION=${NVSHMEM_VERSION}" \
+    -t ${DEEPEP_CONTAINER_IMAGE_NAME_TAG} \
+    .
+```
+
+```bash
+enroot import -o ./deepep.sqsh dockerd://${DEEPEP_CONTAINER_IMAGE_NAME_TAG}
+```
+
+## Running DeepEP Benchmark
+
+### Intranode
+
+```bash
+srun --mpi=pmix --cpu-bind=none --container-image ./deepep.sqsh python /DeepEP/tests/test_intranode.py
+```
+
+#### P5en results
+DeepEP commit [e02e4d2e1fbfdf09e02e870b6acc5831cbd11e39](https://github.com/deepseek-ai/DeepEP/tree/e02e4d2e1fbfdf09e02e870b6acc5831cbd11e39)
+```
+[config] num_tokens=4096, hidden=7168, num_topk=8
+[layout] Kernel performance: 0.041 ms
+
+[testing] Running with BF16, without top-k (async=False, previous=False) ... passed
+[testing] Running with BF16, with top-k (async=False, previous=False) ... passed
+[testing] Running with BF16, without top-k (async=False, previous=False) ... passed
+[testing] Running with BF16, with top-k (async=False, previous=False) ... passed
+[testing] Running with FP8, without top-k (async=False, previous=False) ... passed
+[testing] Running with FP8, with top-k (async=False, previous=False) ... passed
+[testing] Running with BF16, without top-k (async=True, previous=False) ... passed
+[testing] Running with BF16, with top-k (async=True, previous=False) ... passed
+[testing] Running with BF16, without top-k (async=True, previous=False) ... passed
+[testing] Running with BF16, with top-k (async=True, previous=False) ... passed
+[testing] Running with FP8, without top-k (async=True, previous=False) ... passed
+[testing] Running with FP8, with top-k (async=True, previous=False) ... passed
+[testing] Running with BF16, without top-k (async=False, previous=True) ... passed
+[testing] Running with BF16, with top-k (async=False, previous=True) ... passed
+[testing] Running with BF16, without top-k (async=False, previous=True) ... passed
+[testing] Running with BF16, with top-k (async=False, previous=True) ... 
passed +[testing] Running with FP8, without top-k (async=False, previous=True) ... passed +[testing] Running with FP8, with top-k (async=False, previous=True) ... passed +[testing] Running with BF16, without top-k (async=True, previous=True) ... passed +[testing] Running with BF16, with top-k (async=True, previous=True) ... passed +[testing] Running with BF16, without top-k (async=True, previous=True) ... passed +[testing] Running with BF16, with top-k (async=True, previous=True) ... passed +[testing] Running with FP8, without top-k (async=True, previous=True) ... passed +[testing] Running with FP8, with top-k (async=True, previous=True) ... passed + +[tuning] SMs 24, NVL chunk 4: 294.24 GB/s (NVL), 544.47 us +[tuning] SMs 24, NVL chunk 6: 320.68 GB/s (NVL), 499.58 us +[tuning] SMs 24, NVL chunk 8: 317.79 GB/s (NVL), 504.13 us +[tuning] SMs 24, NVL chunk 10: 316.46 GB/s (NVL), 506.25 us +[tuning] SMs 24, NVL chunk 12: 308.37 GB/s (NVL), 519.53 us +[tuning] SMs 24, NVL chunk 14: 298.15 GB/s (NVL), 537.34 us +[tuning] SMs 24, NVL chunk 16: 292.44 GB/s (NVL), 547.83 us +[tuning] SMs 24, NVL chunk 18: 297.46 GB/s (NVL), 538.58 us +[tuning] SMs 24, NVL chunk 20: 293.29 GB/s (NVL), 546.24 us +[tuning] SMs 24, NVL chunk 22: 287.31 GB/s (NVL), 557.62 us +[tuning] SMs 24, NVL chunk 24: 287.20 GB/s (NVL), 557.83 us +[tuning] SMs 24, NVL chunk 26: 286.76 GB/s (NVL), 558.67 us +[tuning] SMs 24, NVL chunk 28: 287.96 GB/s (NVL), 556.35 us +[tuning] SMs 24, NVL chunk 30: 282.88 GB/s (NVL), 566.33 us +[tuning] SMs 24, NVL chunk 32: 281.40 GB/s (NVL), 569.32 us +[tuning] SMs 24, NVL chunk default: 319.82 GB/s (NVL), 500.93 us +[tuning] Best dispatch (FP8): SMs 24, NVL chunk 6, 320.68 GB/s (NVL), t: 499.58 us + +[tuning] SMs 24, NVL chunk 4: 331.77 GB/s (NVL), 936.50 us +[tuning] SMs 24, NVL chunk 6: 304.74 GB/s (NVL), 1019.58 us +[tuning] SMs 24, NVL chunk 8: 305.57 GB/s (NVL), 1016.81 us +[tuning] SMs 24, NVL chunk 10: 305.73 GB/s (NVL), 1016.26 us +[tuning] SMs 24, NVL chunk 12: 303.80 GB/s (NVL), 1022.74 us +[tuning] SMs 24, NVL chunk 14: 300.82 GB/s (NVL), 1032.85 us +[tuning] SMs 24, NVL chunk 16: 300.27 GB/s (NVL), 1034.75 us +[tuning] SMs 24, NVL chunk 18: 301.12 GB/s (NVL), 1031.83 us +[tuning] SMs 24, NVL chunk 20: 298.67 GB/s (NVL), 1040.29 us +[tuning] SMs 24, NVL chunk 22: 296.76 GB/s (NVL), 1046.98 us +[tuning] SMs 24, NVL chunk 24: 296.46 GB/s (NVL), 1048.05 us +[tuning] SMs 24, NVL chunk 26: 294.70 GB/s (NVL), 1054.29 us +[tuning] SMs 24, NVL chunk 28: 293.73 GB/s (NVL), 1057.80 us +[tuning] SMs 24, NVL chunk 30: 292.28 GB/s (NVL), 1063.03 us +[tuning] SMs 24, NVL chunk 32: 292.16 GB/s (NVL), 1063.47 us +[tuning] SMs 24, NVL chunk default: 305.72 GB/s (NVL), 1016.31 us +[tuning] Best dispatch (BF16): SMs 24, NVL chunk 4, 331.77 GB/s (NVL), t: 936.50 us + +[tuning] SMs 24, NVL chunk 1: 159.88 GB/s (NVL), 1943.39 us +[tuning] SMs 24, NVL chunk 2: 277.52 GB/s (NVL), 1119.56 us +[tuning] SMs 24, NVL chunk 3: 316.19 GB/s (NVL), 982.64 us +[tuning] SMs 24, NVL chunk 4: 321.89 GB/s (NVL), 965.24 us +[tuning] SMs 24, NVL chunk 5: 311.73 GB/s (NVL), 996.72 us +[tuning] SMs 24, NVL chunk 6: 294.88 GB/s (NVL), 1053.67 us +[tuning] SMs 24, NVL chunk 7: 304.14 GB/s (NVL), 1021.57 us +[tuning] SMs 24, NVL chunk 8: 288.61 GB/s (NVL), 1076.55 us +[tuning] SMs 24, NVL chunk 9: 284.72 GB/s (NVL), 1091.26 us +[tuning] SMs 24, NVL chunk 10: 289.42 GB/s (NVL), 1073.55 us +[tuning] SMs 24, NVL chunk 11: 284.57 GB/s (NVL), 1091.85 us +[tuning] SMs 24, NVL chunk 12: 284.85 GB/s (NVL), 1090.75 us +[tuning] SMs 24, 
NVL chunk 13: 288.21 GB/s (NVL), 1078.05 us +[tuning] SMs 24, NVL chunk 14: 285.78 GB/s (NVL), 1087.20 us +[tuning] SMs 24, NVL chunk 15: 283.55 GB/s (NVL), 1095.76 us +[tuning] SMs 24, NVL chunk 16: 283.94 GB/s (NVL), 1094.27 us +[tuning] SMs 24, NVL chunk default: 319.88 GB/s (NVL), 971.32 us +[tuning] Best combine: SMs 24, NVL chunk 4: 321.89 GB/s (NVL), t: 965.24 us +``` + +### Internode + +```bash +srun \ + -l --mpi=pmix --cpu-bind=none \ + --container-image ./deepep.sqsh \ + -N 2 \ + bash -c 'MASTER_ADDR=${SLURM_NODELIST%%,*} WORLD_SIZE=$SLURM_NNODES RANK=$SLURM_PROCID NVSHMEM_REMOTE_TRANSPORT=libfabric NVSHMEM_LIBFABRIC_PROVIDER=efa python3 -u -X faulthandler /DeepEP/tests/test_internode.py' +``` diff --git a/3.test_cases/expert-parallelism/deepep-benchmark/deepep.Dockerfile b/3.test_cases/expert-parallelism/deepep-benchmark/deepep.Dockerfile new file mode 100644 index 000000000..6bc3277f1 --- /dev/null +++ b/3.test_cases/expert-parallelism/deepep-benchmark/deepep.Dockerfile @@ -0,0 +1,191 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +ARG CUDA_VERSION=12.8.1 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 + +################################ NCCL ######################################## + +ARG GDRCOPY_VERSION=v2.5.1 +ARG EFA_INSTALLER_VERSION=1.43.2 +ARG AWS_OFI_NCCL_VERSION=v1.16.3 +ARG NCCL_VERSION=v2.27.7-1 +ARG NCCL_TESTS_VERSION=v2.16.9 + +RUN apt-get update -y && apt-get upgrade -y +RUN apt-get remove -y --allow-change-held-packages \ + ibverbs-utils \ + libibverbs-dev \ + libibverbs1 \ + libmlx5-1 \ + libnccl2 \ + libnccl-dev + +RUN rm -rf /opt/hpcx \ + && rm -rf /usr/local/mpi \ + && rm -f /etc/ld.so.conf.d/hpcx.conf \ + && ldconfig + +ENV OPAL_PREFIX= + +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + apt-utils \ + autoconf \ + automake \ + build-essential \ + check \ + cmake \ + curl \ + debhelper \ + devscripts \ + git \ + gcc \ + gdb \ + kmod \ + libsubunit-dev \ + libtool \ + openssh-client \ + openssh-server \ + pkg-config \ + python3-distutils \ + vim \ + python3.10-dev \ + python3.10-venv +RUN apt-get purge -y cuda-compat-* + +RUN mkdir -p /var/run/sshd +RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config + +ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/usr/local/lib:$LD_LIBRARY_PATH +ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH + +RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \ + && python3 /tmp/get-pip.py \ + && pip3 install awscli pynvml + +################################################# +## Install NVIDIA GDRCopy +## +## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure +## that the cuda-compat-xx-x package is the latest. 
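+## GDRCopy is built from source and installed under /opt/gdrcopy; the ENV lines below expose
+## its binaries, headers and libraries so the NVSHMEM build further down can find it via GDRCOPY_HOME.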
+RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \ + && cd /tmp/gdrcopy \ + && make prefix=/opt/gdrcopy install + +ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH +ENV LIBRARY_PATH=/opt/gdrcopy/lib:$LIBRARY_PATH +ENV CPATH=/opt/gdrcopy/include:$CPATH +ENV PATH=/opt/gdrcopy/bin:$PATH + +################################################# +## Install EFA installer +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf $HOME/aws-efa-installer + +################################################### +## Install NCCL +RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \ + && cd /opt/nccl \ + && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +################################################### +## Install NCCL-tests +RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ + && cd /opt/nccl-tests \ + && make -j $(nproc) \ + MPI=1 \ + MPI_HOME=/opt/amazon/openmpi/ \ + CUDA_HOME=/usr/local/cuda \ + NCCL_HOME=/opt/nccl/build \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +RUN rm -rf /var/lib/apt/lists/* + +## Set Open MPI variables to exclude network interface and conduit. 
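+## pml=^ucx disables the UCX point-to-point layer (EFA uses libfabric instead), btl=tcp,self keeps
+## plain TCP for MPI control traffic, and the if_exclude/IFNAME lists keep Open MPI and NCCL off
+## the loopback and Docker virtual interfaces.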
+ENV OMPI_MCA_pml=^ucx \ + OMPI_MCA_btl=tcp,self \ + OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\ + OPAL_PREFIX=/opt/amazon/openmpi \ + NCCL_SOCKET_IFNAME=^docker,lo,veth + +## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516 +ENV PMIX_MCA_gds=hash + +## Set LD_PRELOAD for NCCL library +ENV LD_PRELOAD=/opt/nccl/build/lib/libnccl.so + +################################ NVSHMEM ######################################## + +ENV NVSHMEM_DIR=/opt/nvshmem +ENV NVSHMEM_HOME=/opt/nvshmem + +# 3.2.5-1: wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && tar -xvf nvshmem_src_3.2.5-1.txz +# 3.3.9: wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && tar -xvf nvshmem_src_cuda12-all-all-3.3.9.tar.gz +# 3.4.5-0: git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout df2814155acfba6227534dd81a8bf338da9e55f2 +COPY ./nvshmem /nvshmem_src + +RUN cd /nvshmem_src \ + && mkdir -p build \ + && cd build \ + && cmake \ + -DNVSHMEM_PREFIX=/opt/nvshmem \ + -DCMAKE_INSTALL_PREFIX=/opt/nvshmem \ + \ + -DCUDA_HOME=/usr/local/cuda \ + -DCMAKE_CUDA_ARCHITECTURES="90a;100" \ + \ + -DNVSHMEM_USE_GDRCOPY=1 \ + -DGDRCOPY_HOME=/opt/gdrcopy \ + \ + -DNVSHMEM_USE_NCCL=1 \ + -DNCCL_HOME=/opt/nccl/build \ + -DNCCL_INCLUDE=/opt/nccl/build/include \ + \ + -DNVSHMEM_LIBFABRIC_SUPPORT=1 \ + -DLIBFABRIC_HOME=/opt/amazon/efa \ + \ + -DNVSHMEM_MPI_SUPPORT=1 \ + -DMPI_HOME=/opt/amazon/openmpi \ + \ + -DNVSHMEM_PMIX_SUPPORT=1 \ + -DPMIX_HOME=/opt/amazon/pmix \ + -DNVSHMEM_DEFAULT_PMIX=1 \ + \ + -DNVSHMEM_BUILD_TESTS=1 \ + -DNVSHMEM_BUILD_EXAMPLES=1 \ + -DNVSHMEM_BUILD_HYDRA_LAUNCHER=1 \ + -DNVSHMEM_BUILD_TXZ_PACKAGE=1 \ + \ + -DNVSHMEM_IBRC_SUPPORT=1 \ + -DNVSHMEM_IBGDA_SUPPORT=1 \ + \ + -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + .. \ + && make -j$(nproc) \ + && make install + +ENV PATH=/opt/nvshmem/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/nvshmem/lib:$LD_LIBRARY_PATH +# ENV PATH=/opt/nvshmem/bin:$PATH LD_LIBRARY_PATH=/opt/amazon/pmix/lib:/opt/nvshmem/lib:$LD_LIBRARY_PATH NVSHMEM_REMOTE_TRANSPORT=libfabric NVSHMEM_LIBFABRIC_PROVIDER=efa + +################################ PyTorch ######################################## + +RUN pip install torch --index-url https://download.pytorch.org/whl/cu128 +RUN pip install ninja numpy cmake pytest + +################################ DeepEP ######################################## + +ARG DEEPEP_COMMIT=e02e4d2e1fbfdf09e02e870b6acc5831cbd11e39 + +RUN git clone https://github.com/deepseek-ai/DeepEP.git /DeepEP \ + && cd /DeepEP \ + && git checkout ${DEEPEP_COMMIT} \ + && TORCH_CUDA_ARCH_LIST="9.0a+PTX;10.0" pip install . 
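+
+## Pre-create the directory used for core dumps; the kernel's core_pattern can be pointed at
+## /tmp/coredump/core.%e.%p as described in the pplx-kernels benchmark README.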
+RUN mkdir -p /tmp/coredump
diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/.gitignore b/3.test_cases/expert-parallelism/pplx-garden-benchmark/.gitignore
new file mode 100644
index 000000000..d0e8737cc
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/.gitignore
@@ -0,0 +1,5 @@
+*.sqsh
+*.out
+*.err
+pplx-garden
+uccl
\ No newline at end of file
diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/README.md b/3.test_cases/expert-parallelism/pplx-garden-benchmark/README.md
new file mode 100644
index 000000000..a325bd252
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/README.md
@@ -0,0 +1,58 @@
+# PPLX Garden Benchmark
+
+```bash
+git clone https://github.com/perplexityai/pplx-garden.git
+```
+
+```bash
+docker build -t pplx-garden-dev - < ./pplx-garden/docker/dev.Dockerfile && docker build -t pplx-garden -f pplx-garden.Dockerfile .
+```
+
+```bash
+enroot import -o pplx-garden.sqsh dockerd://pplx-garden
+```
+
+```bash
+sbatch pplx-garden.sbatch
+```
+
+## All-to-All Performance Results
+
+Decode (128 tokens) Dispatch (left half) and Combine (right half) Median Latency (μs):
+
+| | My UCCL EP | My pplx-EFA | pplx-EFA | pplx-CX7 | DeepEP-CX7 | x | My UCCL EP | My pplx-EFA | pplx-EFA | pplx-CX7 | DeepEP-CX7 |
+|-------|-----------:|------------:|---------:|---------:|-----------:|---|-----------:|------------:|---------:|---------:|-----------:|
+| EP128 | | 424.5 | | | | x | | 588.5 | | | |
+| EP64 | crash | 268.6 | 266.7 | 187.5 | 177.9 | x | crash | 393.6 | 391.2 | 309.1 | 325.0 |
+| EP32 | 358.0 | 230.9 | 229.1 | 153.9 | 159.1 | x | 689.3 | 336.9 | 335.0 | 266.3 | 285.0 |
+| EP16 | 301.1 | 218.0 | 214.8 | 110.2 | 123.9 | x | 834.5 | 244.6 | 241.5 | 185.5 | 203.0 |
+| EP8 | 66.1 | 50.6 | 49.7 | 50.5 | 42.6 | x | 86.6 | 64.1 | 64.2 | 65.3 | 72.0 |
+
+
+Prefill (4096 tokens) Dispatch (left half) and Combine (right half) Median Latency (μs):
+
+| | My pplx-EFA | pplx-EFA | pplx-CX7 | DeepEP-CX7 | x | My pplx-EFA | pplx-EFA | pplx-CX7 | DeepEP-CX7 |
+|-------|------------:|----------:|----------:|-----------:|---|------------:|----------:|----------:|-----------:|
+| EP128 | 5883.4 | | | | x | 10785.1 | | | |
+| EP64 | 5395.6 | 5334.3 | 4665.2 | 5071.6 | x | 9854.9 | 9779.3 | 8771.1 | 5922.7 |
+| EP32 | 4605.2 | 4619.0 | 4011.8 | 3680.2 | x | 8286.3 | 8271.5 | 7526.8 | 3565.4 |
+| EP16 | 3181.4 | 3196.7 | 2734.8 | 2481.9 | x | 5373.6 | 5379.1 | 1062.2 | 1863.9 |
+| EP8 | 1076.0 | 1052.4 | 5071.1 | 1810.3 | x | 1354.0 | 1396.7 | 1405.1 | 962.9 |
+
+## UCCL-EP Benchmark
+
+```bash
+git clone https://github.com/uccl-project/uccl.git
+```
+
+```bash
+docker build -t pplx-garden_uccl-ep -f pplx-garden_uccl-ep.Dockerfile .
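+# run this from the directory containing the ./uccl checkout from the clone step above,
+# since pplx-garden_uccl-ep.Dockerfile COPYs uccl into the image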
+```
+
+```bash
+enroot import -o pplx-garden_uccl-ep.sqsh dockerd://pplx-garden_uccl-ep
+```
+
+```bash
+sbatch uccl-pplx-garden.sbatch
+```
diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.Dockerfile b/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.Dockerfile
new file mode 100644
index 000000000..77885a12c
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.Dockerfile
@@ -0,0 +1,9 @@
+FROM pplx-garden-dev:latest
+
+COPY pplx-garden /app
+RUN cd /app \
+    && export TORCH_CMAKE_PREFIX_PATH=$(python3 -c "import torch; print(torch.utils.cmake_prefix_path)") \
+    && python3 -m build --wheel \
+    && python3 -m pip install /app/dist/*.whl
+
+WORKDIR /app
diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.sbatch b/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.sbatch
new file mode 100644
index 000000000..1b5e57c33
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.sbatch
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+#SBATCH --job-name=pplx-garden
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node 1
+##SBATCH --output %x_%j.out
+##SBATCH --error %x_%j.err
+#SBATCH --exclusive
+#SBATCH --wait-all-nodes=1
+
+set -ex;
+
+### Disable hyperthreading by setting the tasks per core to 1
+##SBATCH --ntasks-per-core=1
+
+###########################
+###### User Variables #####
+###########################
+
+## Set libfabric flags to use EFA
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
+export FI_EFA_FORK_SAFE=1
+
+## Set this flag for debugging EFA
+# export FI_LOG_LEVEL=warn
+
+## NCCL Environment variables
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=ALL
+
+### Increase the send queue depth, which can make NCCL communications non-blocking.
+### https://www.usenix.org/system/files/atc23-choi.pdf
+export NCCL_BUFFSIZE=8388608
+### Improve performance by increasing the buffer size for Send/Recv, Gather, Scatter and Alltoall communications.
+### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html
+export NCCL_P2P_NET_CHUNKSIZE=1048576
+
+### Improve performance for AllReduce by selecting a specific protocol and algorithm for specific
+### message sizes and numbers of ranks.
+### More information: https://github.com/aws/aws-ofi-nccl/wiki/Algorithm-and-Protocol-Tuner-for-AWS.
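+### Path below assumes aws-ofi-nccl is installed under /opt/amazon/ofi-nccl inside the container image.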
+export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-ofi-tuner.so
+
+#Get Hostname and Instance IDs
+mpirun -N 1 bash -c 'echo $(hostname): $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")'
+
+export MASTER_PORT=29500
+export MASTER_ADDR=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1)
+export NUM_NODES=${SLURM_NNODES}
+export NUM_GPUS=8
+
+srun -l \
+    --container-image ./pplx-garden.sqsh \
+    --mpi=pmix --cpu-bind=none \
+    bash -c 'python3 -X faulthandler -m benchmarks.bench_all_to_all \
+        --world-size $((NUM_NODES * NUM_GPUS)) \
+        --nets-per-gpu 2 \
+        --init-method=tcp://${MASTER_ADDR}:${MASTER_PORT} \
+        --node-rank=${SLURM_NODEID} \
+        --nvlink=8 \
+        --max-num-tokens 128'
diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden_uccl-ep.Dockerfile b/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden_uccl-ep.Dockerfile
new file mode 100644
index 000000000..6b0003eae
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden_uccl-ep.Dockerfile
@@ -0,0 +1,13 @@
+FROM pplx-garden:latest
+
+RUN apt-get update && apt-get install -y \
+    sudo \
+    libnuma-dev
+
+COPY uccl /opt/uccl
+RUN cd /opt/uccl \
+    && cd ep \
+    && ./install_deps.sh \
+    && make -j install
+
+ENV PYTHONPATH=/opt/uccl/ep/bench:$PYTHONPATH
diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl-pplx-garden.sbatch b/3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl-pplx-garden.sbatch
new file mode 100644
index 000000000..dfc488365
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl-pplx-garden.sbatch
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+#SBATCH --job-name=pplx-garden
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node 1
+##SBATCH --output %x_%j.out
+##SBATCH --error %x_%j.err
+#SBATCH --exclusive
+#SBATCH --wait-all-nodes=1
+
+set -ex;
+
+### Disable hyperthreading by setting the tasks per core to 1
+##SBATCH --ntasks-per-core=1
+
+###########################
+###### User Variables #####
+###########################
+
+## Set libfabric flags to use EFA
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
+export FI_EFA_FORK_SAFE=1
+
+## Set this flag for debugging EFA
+# export FI_LOG_LEVEL=warn
+
+## NCCL Environment variables
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=ALL
+
+### Increase the send queue depth, which can make NCCL communications non-blocking.
+### https://www.usenix.org/system/files/atc23-choi.pdf
+export NCCL_BUFFSIZE=8388608
+### Improve performance by increasing the buffer size for Send/Recv, Gather, Scatter and Alltoall communications.
+### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html
+export NCCL_P2P_NET_CHUNKSIZE=1048576
+
+### Improve performance for AllReduce by selecting a specific protocol and algorithm for specific
+### message sizes and numbers of ranks.
+### More information: https://github.com/aws/aws-ofi-nccl/wiki/Algorithm-and-Protocol-Tuner-for-AWS.
+export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-ofi-tuner.so + +#Get Hostname and Instance IDs +mpirun -N 1 bash -c 'echo $(hostname): $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")' + +export MASTER_PORT=29500 +export MASTER_ADDR=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1) +export NUM_NODES=${SLURM_NNODES} +export NUM_GPUS=8 + +export PYTHONPATH=/opt/uccl/ep/bench:$PYTHONPATH + +srun -l \ + --container-image ./pplx-garden_uccl-ep.sqsh \ + --container-mounts ./uccl_bench_all_to_all.py:/app/benchmarks/uccl_bench_all_to_all.py \ + --mpi=pmix --cpu-bind=none \ + bash -c 'python3 -X faulthandler -m benchmarks.uccl_bench_all_to_all \ + --world-size $((NUM_NODES * NUM_GPUS)) \ + --nets-per-gpu 2 \ + --init-method=tcp://${MASTER_ADDR}:${MASTER_PORT} \ + --node-rank=${SLURM_NODEID} \ + --nvlink=8 \ + --max-num-tokens 128 \ + --use-uccl' diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl_bench_all_to_all.py b/3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl_bench_all_to_all.py new file mode 100644 index 000000000..352c38083 --- /dev/null +++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl_bench_all_to_all.py @@ -0,0 +1,838 @@ +# ruff: noqa: B023 + +import argparse +import json +import os +import time +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Optional + +import torch +import torch.distributed as dist + +from pplx_garden.distributed import ParallelGroup, ParallelLaunch +from pplx_garden.kernels.p2p_all_to_all import P2PAllToAll +from pplx_garden.utils import logging_utils +from pplx_garden.utils.math import Statistics, round_up +from pplx_garden.utils.torch import profile_range, str_to_dtype +from tests.p2p_all_to_all.data import RankTestData + +# Add UCCL imports +try: + from uccl.ep import Config + from buffer import Buffer + from utils import inplace_unique + UCCL_AVAILABLE = True +except ImportError: + UCCL_AVAILABLE = False + Config = None + Buffer = None + +logger = logging_utils.get_logger("bench_all_to_all") + + +def rand_topk_idx( + num_tokens: int, + num_experts: int, + num_topk: int, + generator: torch.Generator, + device: torch.device, +) -> torch.Tensor: + scores = torch.randn( + (num_tokens, num_experts), + dtype=torch.float32, + device=device, + generator=generator, + ) + scores = scores.abs() + 1 + topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=True)[1] + return topk_idx.to(torch.uint32) + + +def act(x: torch.Tensor, x_scale: Optional[torch.Tensor]) -> torch.Tensor: + if x_scale is None: + return x * 2 + + b, h = x.shape + _, hs = x_scale.shape + x_reshaped = x.view(b, hs, h // hs) + result = x_reshaped.to(torch.float32) + result *= x_scale.view(b, hs, 1) + result *= 2 + return result.view(b, h) + + +def make_rng(device: torch.device, rank: int) -> torch.Generator: + generator = torch.Generator(device=device) + generator.manual_seed(rank + 123) + return generator + + +@dataclass(slots=True) +class AllToAllConfig: + nets_per_gpu: int + max_num_tokens: int + max_private_tokens: int + num_experts: int + hidden_dim: int + hidden_dim_scale: Optional[int] + num_experts_per_token: int + in_dtype: torch.dtype + out_dtype: torch.dtype + scale_dtype: Optional[torch.dtype] + nvlink: Optional[int] + + @property + def dispatch_bytes(self) -> int: + dispatch_token_dim = self.hidden_dim * self.in_dtype.itemsize + if self.hidden_dim_scale is not None: + assert self.scale_dtype is not None + dispatch_token_dim += self.hidden_dim_scale 
* self.scale_dtype.itemsize + return self.max_num_tokens * self.num_experts_per_token * dispatch_token_dim + + @property + def combine_bytes(self) -> int: + combine_token_dim = self.hidden_dim * self.out_dtype.itemsize + return self.max_num_tokens * self.num_experts_per_token * combine_token_dim + + +class AllToAllResource: + def __init__( + self, + device: torch.device, + dp_group: ParallelGroup, + global_group: ParallelGroup, + cfg: AllToAllConfig, + ) -> None: + self.device = device + self.dp_group = dp_group + self.global_group = global_group + self.cfg = cfg + + self.expert_padding = expert_padding = 1 + self.dp_rank = global_group.rank // dp_group.size + self.num_dp_groups = num_dp_groups = global_group.size // dp_group.size + self.num_local_experts = num_local_experts = ( + cfg.num_experts // global_group.size + ) + self.num_tokens = num_tokens = num_dp_groups * cfg.max_num_tokens + max_recv_tokens = round_up( + max( + min( + num_tokens * cfg.num_experts_per_token + + num_local_experts * (expert_padding - 1), + num_tokens * num_local_experts, + ), + num_local_experts * expert_padding, + ), + expert_padding, + ) + + node_group: Optional[ParallelGroup] = None + if cfg.nvlink is not None: + assert 0 < cfg.nvlink <= min(8, global_group.size, 8) + node_group = global_group.slice_by_count(global_group.size // cfg.nvlink) + + # Instantiate the all-to-all kernel. + self.all_to_all = P2PAllToAll( + max_num_tokens=cfg.max_num_tokens, + num_experts=cfg.num_experts, + expert_padding=expert_padding, + hidden_dim=cfg.hidden_dim, + hidden_dim_scale=cfg.hidden_dim_scale, + max_private_tokens=cfg.max_private_tokens, + in_dtype=cfg.in_dtype, + out_dtype=cfg.out_dtype, + scale_dtype=cfg.scale_dtype, + num_experts_per_token=cfg.num_experts_per_token, + nets_per_gpu=cfg.nets_per_gpu, + device=device, + dp_group=dp_group, + node_group=node_group, + global_group=global_group, + ) + + # Allocate buffers. + self.expert_num_tokens = torch.empty( + (num_local_experts,), + dtype=torch.int32, + device=device, + ) + self.out_expert_x = torch.empty( + (max_recv_tokens, cfg.hidden_dim), + dtype=cfg.in_dtype, + device=device, + ) + self.out_tokens = torch.empty( + (cfg.max_num_tokens, cfg.hidden_dim), + dtype=cfg.out_dtype, + device=device, + ) + self.expert_y = torch.empty( + (max_recv_tokens, cfg.hidden_dim), + dtype=cfg.out_dtype, + device=device, + ) + self.out_expert_x_scale: torch.Tensor | None = None + if cfg.hidden_dim_scale is not None or cfg.scale_dtype is not None: + assert cfg.scale_dtype is not None + assert cfg.hidden_dim_scale is not None + self.out_expert_x_scale = torch.empty( + (max_recv_tokens, cfg.hidden_dim_scale), + dtype=cfg.scale_dtype, + device=device, + ) + + def create_rank_data(self, dp_rank: int) -> RankTestData: + return RankTestData.create( + num_experts=self.cfg.num_experts, + num_experts_per_token=self.cfg.num_experts_per_token, + max_num_tokens=self.cfg.max_num_tokens, + hidden_dim=self.cfg.hidden_dim, + hidden_dim_scale=self.cfg.hidden_dim_scale, + in_dtype=self.cfg.in_dtype, + scale_dtype=self.cfg.scale_dtype, + generator=make_rng(self.device, dp_rank), + device=self.device, + ) + + +class UCCLAllToAllResource: + """Resource wrapper for UCCL-based all-to-all communication.""" + + def __init__( + self, + device: torch.device, + global_group: ParallelGroup, + cfg: AllToAllConfig, + ) -> None: + if not UCCL_AVAILABLE: + raise RuntimeError("UCCL is not available. 
Please install uccl.ep.") + + self.device = device + self.global_group = global_group + self.cfg = cfg + self.rank = global_group.rank + self.num_ranks = global_group.size + + # Convert ParallelGroup to torch.distributed ProcessGroup + self.dist_group = dist.new_group(list(range(self.num_ranks))) + + self.num_local_experts = cfg.num_experts // self.num_ranks + + # Initialize UCCL buffer + num_sms = 24 + num_nvl_bytes = int(2e9) + num_rdma_bytes = int(1e9) + num_qps_per_rank = max(num_sms, self.num_local_experts) + + num_nodes = self.num_ranks // int(os.environ.get("LOCAL_WORLD_SIZE", 8)) + is_intranode = num_nodes == 1 + + self.buffer = Buffer( + self.dist_group, + num_nvl_bytes, + num_rdma_bytes, + low_latency_mode=False, + num_qps_per_rank=num_qps_per_rank, + explicitly_destroy=True, + is_intranode=is_intranode, + ) + + # Default configs + rdma_buffer_size = 512 + nvl_buffer_size = 720 if self.num_ranks in (144, 160) else 512 + if self.num_ranks == 24: + nvl_buffer_size = 540 + self.dispatch_config = Config(num_sms, 8, nvl_buffer_size, 16, rdma_buffer_size) + self.combine_config = Config(num_sms, 8, nvl_buffer_size, 16, rdma_buffer_size) + + def create_rank_data_uccl(self, rank: int): + """Create test data in UCCL format.""" + num_tokens = self.cfg.max_num_tokens + hidden = self.cfg.hidden_dim + num_experts = self.cfg.num_experts + num_topk = self.cfg.num_experts_per_token + + # Generate input tensor + x = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device=self.device) + + # Generate topk indices + scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device=self.device).abs() + 1 + topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=False)[1] + topk_weights = torch.randn((num_tokens, num_topk), dtype=torch.float32, device=self.device) + + # Calculate rank indices + rank_idx = topk_idx // (num_experts // self.num_ranks) + rank_idx.masked_fill_(topk_idx == -1, -1) + inplace_unique(rank_idx, self.num_ranks) + + return x, topk_idx, topk_weights, rank_idx + + def destroy(self): + """Clean up UCCL resources.""" + self.buffer.destroy() + + +def correctness_check(r: AllToAllResource) -> None: + expected_num_tokens_list = [ + RankTestData.rand_indices_and_count( + num_experts=r.cfg.num_experts, + num_experts_per_token=r.cfg.num_experts_per_token, + max_num_tokens=r.cfg.max_num_tokens, + generator=make_rng(r.device, dp_rank), + device=r.device, + )[1] + for dp_rank in range(r.num_dp_groups) + ] + expected_num_tokens = torch.sum( + torch.stack(expected_num_tokens_list, dim=0), + dim=0, + dtype=torch.int32, + ).to("cpu") + + local_rank = r.create_rank_data(r.dp_rank) + ref_out_tokens = act(local_rank.dp_x, local_rank.dp_x_scale).to(r.cfg.out_dtype) + + # Test run. + r.all_to_all.dispatch( + out_expert_num_tokens=r.expert_num_tokens, + out_expert_x=r.out_expert_x, + out_expert_x_scale=r.out_expert_x_scale, + dp_x=local_rank.dp_x, + dp_x_scale=local_rank.dp_x_scale, + indices=local_rank.indices, + weights=local_rank.weights, + ) + expert_y = act(r.out_expert_x, r.out_expert_x_scale).to(r.cfg.out_dtype) + r.all_to_all.combine( + out_tokens=r.out_tokens, + indices=local_rank.indices, + weights=local_rank.weights, + expert_y=expert_y, + bound_m=local_rank.bound_m, + ) + torch.cuda.synchronize() + + # Verify the token counts. 
+ first_expert = r.global_group.rank * r.num_local_experts + last_expert = min(first_expert + r.num_local_experts, r.cfg.num_experts) + expected_local_tokens = expected_num_tokens[first_expert:last_expert] + torch.testing.assert_close(expected_local_tokens, r.expert_num_tokens.to("cpu")) + + # Verify the tokens. + def hash_token(x: torch.Tensor) -> str: + return ",".join(f"{v:.2f}" for v in x.tolist()) + + tokens_on_rank = set() + index = 0 + for n in expected_local_tokens.tolist(): + for token in r.out_expert_x[index : index + n]: + tokens_on_rank.add(hash_token(token)) + + index = round_up(index + n, r.expert_padding) + + # Verify the tokens on the rank. + num_missing = 0 + for i, (token, routes) in enumerate( + zip(list(local_rank.dp_x), local_rank.indices.tolist()) + ): + if not any(first_expert <= route < last_expert for route in routes): + continue + key = hash_token(token) + if key not in tokens_on_rank: + num_missing += 1 + logger.error( + "Token %i: %s not found in output on rank %i (routed to %s)", + i, + key, + r.dp_rank, + ", ".join(str(route) for route in routes), + ) + assert num_missing == 0, f"Missing {num_missing} tokens on rank {r.dp_rank}" + + # Verify the combine output. + torch.testing.assert_close(r.out_tokens, ref_out_tokens) + + +def benchmark( + r: AllToAllResource, num_warmup: int, num_repeats: int, output: Path +) -> None: + local_rank = r.create_rank_data(r.dp_rank) + rng = make_rng(r.device, r.dp_rank) + out_dummy = torch.empty((1,), dtype=torch.float32, device=r.device) + gemm = torch.empty( + (2048, 2048) if r.cfg.max_num_tokens <= 128 else (8192, 8192), + dtype=torch.float32, + device=r.device, + ) + + def wait() -> None: + # Wait to simulate the delay of other layers. + torch.distributed.all_reduce(out_dummy) + _ = gemm @ gemm + torch.distributed.all_reduce(out_dummy) + + def dispatch(do_send: bool, do_recv: bool) -> None: + r.all_to_all.dispatch( + out_expert_num_tokens=r.expert_num_tokens, + out_expert_x=r.out_expert_x, + out_expert_x_scale=r.out_expert_x_scale, + dp_x=local_rank.dp_x, + dp_x_scale=local_rank.dp_x_scale, + indices=local_rank.indices, + weights=local_rank.weights, + bound_m=local_rank.bound_m, + do_send=do_send, + do_recv=do_recv, + ) + + def combine(do_send: bool, do_recv: bool) -> None: + r.all_to_all.combine( + out_tokens=r.out_tokens, + indices=local_rank.indices, + weights=local_rank.weights, + expert_y=r.expert_y, + bound_m=local_rank.bound_m, + do_send=do_send, + do_recv=do_recv, + ) + + # Create and initialize events for timing. 
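+ # Record every event once immediately after creation so each event is in a valid, recorded
+ # state before the timed loop calls elapsed_time() on it.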
+ events = [] + for _ in range(num_warmup + num_repeats): + dispatch_start = torch.cuda.Event(enable_timing=True) + dispatch_end = torch.cuda.Event(enable_timing=True) + combine_start = torch.cuda.Event(enable_timing=True) + combine_end = torch.cuda.Event(enable_timing=True) + dispatch_send_start = torch.cuda.Event(enable_timing=True) + dispatch_send_end = torch.cuda.Event(enable_timing=True) + dispatch_recv_start = torch.cuda.Event(enable_timing=True) + dispatch_recv_end = torch.cuda.Event(enable_timing=True) + combine_send_start = torch.cuda.Event(enable_timing=True) + combine_send_end = torch.cuda.Event(enable_timing=True) + combine_recv_start = torch.cuda.Event(enable_timing=True) + combine_recv_end = torch.cuda.Event(enable_timing=True) + dispatch_start.record() + dispatch_end.record() + combine_start.record() + combine_end.record() + dispatch_send_start.record() + dispatch_send_end.record() + dispatch_recv_start.record() + dispatch_recv_end.record() + combine_send_start.record() + combine_send_end.record() + combine_recv_start.record() + combine_recv_end.record() + events.append( + ( + dispatch_start, + dispatch_end, + combine_start, + combine_end, + dispatch_send_start, + dispatch_send_end, + dispatch_recv_start, + dispatch_recv_end, + combine_send_start, + combine_send_end, + combine_recv_start, + combine_recv_end, + ) + ) + + # Benchmark loop + last_report_time = time.time() + for i in range(num_warmup + num_repeats): + if i + 1 == num_warmup: + # Start profiling one iteration before the bench starts. + torch.cuda.profiler.start() + now = time.time() + if now - last_report_time > 1 or i + 1 == num_warmup + num_repeats: + logger.info("Iteration %i/%i", i + 1, num_warmup + num_repeats) + last_report_time = now + + ( + dispatch_start, + dispatch_end, + combine_start, + combine_end, + dispatch_send_start, + dispatch_send_end, + dispatch_recv_start, + dispatch_recv_end, + combine_send_start, + combine_send_end, + combine_recv_start, + combine_recv_end, + ) = events[i] + + # Update indices + local_rank.indices = rand_topk_idx( + r.cfg.max_num_tokens, + r.cfg.num_experts, + r.cfg.num_experts_per_token, + rng, + r.device, + ) + + # Send-Recv back to back + with profile_range("back-to-back"): + wait() + + dispatch_start.record() + dispatch(do_send=True, do_recv=True) + dispatch_end.record() + + wait() + + combine_start.record() + combine(do_send=True, do_recv=True) + combine_end.record() + + # Insert long kernel in between send and recv + with profile_range("overlap"): + wait() + + dispatch_send_start.record() + dispatch(do_send=True, do_recv=False) + dispatch_send_end.record() + + wait() # Fake overlap work + + dispatch_recv_start.record() + dispatch(do_send=False, do_recv=True) + dispatch_recv_end.record() + + wait() + + combine_send_start.record() + combine(do_send=True, do_recv=False) + combine_send_end.record() + + wait() # Fake overlap work + + combine_recv_start.record() + combine(do_send=False, do_recv=True) + combine_recv_end.record() + + torch.cuda.synchronize() + torch.cuda.profiler.stop() + + dispatch_times: list[float] = [] + dispatch_send_times: list[float] = [] + dispatch_recv_times: list[float] = [] + combine_times: list[float] = [] + combine_send_times: list[float] = [] + combine_recv_times: list[float] = [] + for ( + dispatch_st, + dispatch_en, + combine_st, + combine_en, + dispatch_send_st, + dispatch_send_en, + dispatch_recv_st, + dispatch_recv_en, + combine_send_st, + combine_send_en, + combine_recv_st, + combine_recv_en, + ) in events[num_warmup:]: + 
dispatch_times.append(dispatch_st.elapsed_time(dispatch_en) * 1000) + combine_times.append(combine_st.elapsed_time(combine_en) * 1000) + dispatch_send_times.append( + dispatch_send_st.elapsed_time(dispatch_send_en) * 1000 + ) + dispatch_recv_times.append( + dispatch_recv_st.elapsed_time(dispatch_recv_en) * 1000 + ) + combine_send_times.append(combine_send_st.elapsed_time(combine_send_en) * 1000) + combine_recv_times.append(combine_recv_st.elapsed_time(combine_recv_en) * 1000) + + # All-gather results from all ranks + dispatch_times = sum(r.global_group.all_gather_object(dispatch_times), []) + combine_times = sum(r.global_group.all_gather_object(combine_times), []) + dispatch_send_times = sum(r.global_group.all_gather_object(dispatch_send_times), []) + dispatch_recv_times = sum(r.global_group.all_gather_object(dispatch_recv_times), []) + combine_send_times = sum(r.global_group.all_gather_object(combine_send_times), []) + combine_recv_times = sum(r.global_group.all_gather_object(combine_recv_times), []) + + # Report the results. + if r.global_group.rank == 0: + stat_dispatch = Statistics.create(dispatch_times) + stat_dispatch_send = Statistics.create(dispatch_send_times) + stat_dispatch_recv = Statistics.create(dispatch_recv_times) + stat_combine = Statistics.create(combine_times) + stat_combine_send = Statistics.create(combine_send_times) + stat_combine_recv = Statistics.create(combine_recv_times) + + dispatch_bandwidth = r.cfg.dispatch_bytes / stat_dispatch.p50 * 1e-3 + combine_bandwidth = r.cfg.combine_bytes / stat_combine.p50 * 1e-3 + + logger.info( + "Dispatch both time: %s, %.1f GB/s", + stat_dispatch, + dispatch_bandwidth, + ) + logger.info("Dispatch send time: %s", stat_dispatch_send) + logger.info("Dispatch recv time: %s", stat_dispatch_recv) + + logger.info( + "Combine both time: %s, %.1f GB/s", + stat_combine, + combine_bandwidth, + ) + logger.info("Combine send time: %s", stat_combine_send) + logger.info("Combine recv time: %s", stat_combine_recv) + + data = { + "dispatch": { + "both": asdict(stat_dispatch), + "send": asdict(stat_dispatch_send), + "recv": asdict(stat_dispatch_recv), + }, + "combine": { + "both": asdict(stat_combine), + "send": asdict(stat_combine_send), + "recv": asdict(stat_combine_recv), + }, + } + + with output.open("w") as f: + f.write(json.dumps(data)) + + +def benchmark_uccl( + r: UCCLAllToAllResource, num_warmup: int, num_repeats: int, output: Path +) -> None: + """Benchmark UCCL dispatch and combine latency.""" + + # Create test data + x, topk_idx, topk_weights, rank_idx = r.create_rank_data_uccl(r.rank) + + # Get dispatch layout + ( + num_tokens_per_rank, + num_tokens_per_rdma_rank, + num_tokens_per_expert, + is_token_in_rank, + _, + ) = r.buffer.get_dispatch_layout(topk_idx, r.cfg.num_experts) + + # Initial dispatch to get handle + dispatch_args = { + "x": x, + "num_tokens_per_rank": num_tokens_per_rank, + "num_tokens_per_rdma_rank": num_tokens_per_rdma_rank, + "is_token_in_rank": is_token_in_rank, + "num_tokens_per_expert": num_tokens_per_expert, + "topk_idx": topk_idx, + "topk_weights": topk_weights, + "config": r.dispatch_config, + } + recv_x, recv_topk_idx, recv_topk_weights, recv_num_tokens_per_expert_list, handle, _ = r.buffer.dispatch(**dispatch_args) + + if r.rank == 0: + logger.info("Starting UCCL latency benchmark...") + logger.info("Num warmup: %d, Num repeats: %d", num_warmup, num_repeats) + + # Warmup + for _ in range(num_warmup): + r.buffer.dispatch(x=x, handle=handle, config=r.dispatch_config) + r.buffer.combine(x=recv_x, handle=handle, 
topk_weights=recv_topk_weights, config=r.combine_config) + torch.cuda.synchronize() + + # Benchmark dispatch + dispatch_events = [] + for _ in range(num_repeats): + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + r.buffer.dispatch(x=x, handle=handle, config=r.dispatch_config) + end_event.record() + dispatch_events.append((start_event, end_event)) + + # Benchmark combine + combine_events = [] + for _ in range(num_repeats): + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + r.buffer.combine(x=recv_x, handle=handle, topk_weights=recv_topk_weights, config=r.combine_config) + end_event.record() + combine_events.append((start_event, end_event)) + + torch.cuda.synchronize() + + # Extract timing results (convert from ms to us) + dispatch_times = [start.elapsed_time(end) * 1000 for start, end in dispatch_events] + combine_times = [start.elapsed_time(end) * 1000 for start, end in combine_events] + + # Gather results from all ranks + dispatch_times = sum(r.global_group.all_gather_object(dispatch_times), []) + combine_times = sum(r.global_group.all_gather_object(combine_times), []) + + # Report results + if r.rank == 0: + stat_dispatch = Statistics.create(dispatch_times) + stat_combine = Statistics.create(combine_times) + + dispatch_bandwidth = r.cfg.dispatch_bytes / stat_dispatch.p50 * 1e-3 + combine_bandwidth = r.cfg.combine_bytes / stat_combine.p50 * 1e-3 + + logger.info( + "UCCL Dispatch time: %s, %.1f GB/s", + stat_dispatch, + dispatch_bandwidth, + ) + logger.info( + "UCCL Combine time: %s, %.1f GB/s", + stat_combine, + combine_bandwidth, + ) + + data = { + "uccl_dispatch": asdict(stat_dispatch), + "uccl_combine": asdict(stat_combine), + } + + with output.open("w") as f: + f.write(json.dumps(data)) + + +def _worker( + device: torch.device, + dp_group: Optional[ParallelGroup], + global_group: Optional[ParallelGroup], + config: AllToAllConfig, + num_warmup: int, + num_repeats: int, + output: Path, + check: bool, + use_uccl: bool = False, +) -> None: + """Benchmark worker process.""" + + assert dp_group is not None + assert global_group is not None + + os.environ["LOCAL_RANK"] = str(global_group.local_rank) + os.environ["LOCAL_WORLD_SIZE"] = os.environ.get("NUM_GPUS", "8") + os.environ["RANK"] = str(global_group.rank) + os.environ["WORLD_SIZE"] = str(global_group.size) + + if global_group.rank == 0: + logging_utils.setup(level="INFO") + + if use_uccl: + if not UCCL_AVAILABLE: + logger.error("UCCL requested but not available!") + return + + r_uccl = UCCLAllToAllResource(device, global_group, config) + try: + benchmark_uccl(r_uccl, num_warmup, num_repeats, output) + finally: + global_group.barrier() + r_uccl.destroy() + else: + r = AllToAllResource(device, dp_group, global_group, config) + try: + # Correctness check + if check: + logger.info("Checking correctness...") + ex: Exception | None = None + try: + correctness_check(r) + ok = True + except Exception as e: # noqa: BLE001 + ex = e + ok = False + global_ok = r.global_group.all_gather_object(ok) + bad_rank = next((i for i, ok in enumerate(global_ok) if not ok), None) + if bad_rank is not None: + raise RuntimeError( + f"Correctness check failed on rank {bad_rank}" + ) from ex + logger.info("Correctness check passed") + else: + logger.info("Skipping correctness check") + + # Benchmark. 
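+ # Run the timed dispatch/combine loop; rank 0 aggregates per-rank timings and writes the statistics as JSON to --output.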
+ benchmark(r, num_warmup, num_repeats, output) + finally: + global_group.barrier() + r.all_to_all.destroy() + + +def main() -> None: + """Benchmark entry point.""" + + parser = argparse.ArgumentParser(description="All-to-All Benchmark") + parser.add_argument("--world-size", type=int, required=True) + parser.add_argument( + "--init-method", + type=str, + default=None, + ) + parser.add_argument("--dp-size", type=int, default=1) # TODO: rename to tp_size + parser.add_argument("--node-rank", type=int, default=0) + parser.add_argument("--num-warmup", type=int, default=10000) + parser.add_argument("--num-repeats", type=int, default=10000) + parser.add_argument("--nets-per-gpu", type=int, default=2) + parser.add_argument("--max-num-tokens", type=int, default=128) + parser.add_argument("--max-private-tokens", type=int, default=256) + parser.add_argument("--num-experts", type=int, default=256) + parser.add_argument("--hidden-dim", type=int, default=7168) + parser.add_argument("--hidden-dim-scale", type=int, default=56) + parser.add_argument("--num-experts-per-token", type=int, default=8) + parser.add_argument("--in-dtype", type=str_to_dtype, default=torch.float8_e4m3fn) + parser.add_argument("--out-dtype", type=str_to_dtype, default=torch.bfloat16) + parser.add_argument("--scale-dtype", type=str_to_dtype, default=torch.float32) + parser.add_argument("--nvlink", type=int, default=None) + parser.add_argument("--output", type=Path, default=Path("/dev/stdout")) + parser.add_argument( + "--check", type=bool, default=True, action=argparse.BooleanOptionalAction + ) + parser.add_argument( + "--use-uccl", + action="store_true", + help="Use UCCL backend for benchmarking instead of pplx_garden" + ) + args = parser.parse_args() + + config = AllToAllConfig( + nets_per_gpu=args.nets_per_gpu, + max_num_tokens=args.max_num_tokens, + max_private_tokens=args.max_private_tokens, + num_experts=args.num_experts, + hidden_dim=args.hidden_dim, + hidden_dim_scale=args.hidden_dim_scale, + num_experts_per_token=args.num_experts_per_token, + in_dtype=args.in_dtype, + out_dtype=args.out_dtype, + scale_dtype=args.scale_dtype, + nvlink=args.nvlink, + ) + + ParallelLaunch( + world_size=args.world_size, + init_method=args.init_method, + dp_size=args.dp_size, + node_rank=args.node_rank, + ).run( + _worker, + config, + args.num_warmup, + args.num_repeats, + args.output, + args.check, + args.use_uccl, + ) + + +if __name__ == "__main__": + main() diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/.gitignore b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/.gitignore new file mode 100644 index 000000000..29bd446f5 --- /dev/null +++ b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/.gitignore @@ -0,0 +1,4 @@ +*.sqsh +nvshmem* +*.out +*.err diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/README.md b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/README.md new file mode 100644 index 000000000..b6055062e --- /dev/null +++ b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/README.md @@ -0,0 +1,60 @@ +# Perplexity Kernels Benchmark +https://github.com/ppl-ai/pplx-kernels + +Updated to [NVSHMEM 3.4.5-0](https://github.com/NVIDIA/nvshmem/commit/df2814155acfba6227534dd81a8bf338da9e55f2) and PPLX-KERNELS [Aug 6, 2025](https://github.com/ppl-ai/pplx-kernels/commit/12cecfda252e4e646417ac263d96e994d476ee5d) + +## Git clone NVSHMEM + +3.4.5-0: +``` +git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout df2814155acfba6227534dd81a8bf338da9e55f2 && cd .. 
+```
+
+devel branch:
+```
+git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout devel && cd ..
+```
+
+## Building Perplexity Kernels Docker image
+
+```bash
+GDRCOPY_VERSION=v2.5.1
+EFA_INSTALLER_VERSION=1.43.2
+NCCL_VERSION=v2.27.7-1
+NCCL_TESTS_VERSION=v2.16.9
+NVSHMEM_VERSION=3.4.5-0
+TAG="efa${EFA_INSTALLER_VERSION}-nccl${NCCL_VERSION}-tests${NCCL_TESTS_VERSION}-nvshmem${NVSHMEM_VERSION}"
+PPLX_CONTAINER_IMAGE_NAME_TAG="pplx-kernels:${TAG}"
+```
+
+```bash
+docker build --progress=plain -f ./pplx-kernels.Dockerfile \
+    --build-arg="EFA_INSTALLER_VERSION=${EFA_INSTALLER_VERSION}" \
+    --build-arg="NCCL_VERSION=${NCCL_VERSION}" \
+    --build-arg="NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}" \
+    --build-arg="NVSHMEM_VERSION=${NVSHMEM_VERSION}" \
+    -t ${PPLX_CONTAINER_IMAGE_NAME_TAG} \
+    .
+```
+
+```bash
+enroot import -o ./pplx-kernels.sqsh dockerd://${PPLX_CONTAINER_IMAGE_NAME_TAG}
+```
+
+## Running Perplexity Kernels Benchmark
+
+```bash
+sbatch pplx-kernels.sbatch
+```
+
+## Check the logs
+
+```bash
+tail -f -n +0 slurm-XXX.out
+```
+
+## Core dump
+
+1. run `ulimit -c unlimited` and check that `srun -N bash -c "ulimit -c"` prints `unlimited` for every node
+2. run `srun -N sudo bash -c "mkdir -p /tmp/coredump && echo '/tmp/coredump/core.%e.%p' > /proc/sys/kernel/core_pattern"`
+3. run `sbatch pplx-kernels.sbatch`
diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/coredump/.gitkeep b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/coredump/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/data/.gitkeep b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/data/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/launch_bench_all_to_all.py b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/launch_bench_all_to_all.py
new file mode 100644
index 000000000..11aa08ce4
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/launch_bench_all_to_all.py
@@ -0,0 +1,62 @@
+import argparse
+from typing import Callable, Concatenate, ParamSpec
+import torch
+import os
+from tests.bench_all_to_all import _worker_bench_all_to_all
+from tests.distributed_utils import ProcessGroupInfo
+
+P = ParamSpec("P")
+
+def parallel_launch_from_torchrun(
+    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> None:
+    rank = int(os.environ.get("RANK", 0))
+    world_size = int(os.environ.get("WORLD_SIZE", 1))
+    world_local_size = torch.cuda.device_count()
+    local_rank = rank % world_local_size
+    node_rank = rank // world_local_size
+    device = torch.device("cuda", local_rank)
+    torch.cuda.set_device(device)
+    torch.distributed.init_process_group(backend="cpu:gloo,cuda:nccl", device_id=device)
+    try:
+        worker(
+            ProcessGroupInfo(
+                world_size=world_size,
+                world_local_size=world_local_size,
+                rank=rank,
+                node_rank=node_rank,
+                local_rank=local_rank,
+                device=device,
+            ),
+            *args,
+            **kwargs,
+        )
+    finally:
+        torch.distributed.destroy_process_group()
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dp-size", type=int, default=1)
+    parser.add_argument(
+        "--in-dtype",
+        choices=["bfloat16", "float16", "float8_e4m3fn"],
+        default="float8_e4m3fn",
+    )
+    parser.add_argument(
+        "--out-dtype",
+        choices=["bfloat16", "float16"],
+        default="bfloat16",
+    )
+    args = parser.parse_args()
+    dp_size = int(args.dp_size)
+    in_dtype = 
str(args.in_dtype) + out_dtype = str(args.out_dtype) + + parallel_launch_from_torchrun(_worker_bench_all_to_all, dp_size, in_dtype, out_dtype) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.Dockerfile b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.Dockerfile new file mode 100644 index 000000000..2c194691f --- /dev/null +++ b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.Dockerfile @@ -0,0 +1,197 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +ARG CUDA_VERSION=12.8.1 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 + +################################ NCCL ######################################## + +ARG GDRCOPY_VERSION=v2.5.1 +ARG EFA_INSTALLER_VERSION=1.43.2 +ARG AWS_OFI_NCCL_VERSION=v1.16.3 +ARG NCCL_VERSION=v2.27.7-1 +ARG NCCL_TESTS_VERSION=v2.16.9 + +RUN apt-get update -y && apt-get upgrade -y +RUN apt-get remove -y --allow-change-held-packages \ + ibverbs-utils \ + libibverbs-dev \ + libibverbs1 \ + libmlx5-1 \ + libnccl2 \ + libnccl-dev + +RUN rm -rf /opt/hpcx \ + && rm -rf /usr/local/mpi \ + && rm -f /etc/ld.so.conf.d/hpcx.conf \ + && ldconfig + +ENV OPAL_PREFIX= + +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + apt-utils \ + autoconf \ + automake \ + build-essential \ + check \ + cmake \ + curl \ + debhelper \ + devscripts \ + git \ + gcc \ + gdb \ + kmod \ + libsubunit-dev \ + libtool \ + openssh-client \ + openssh-server \ + pkg-config \ + python3-distutils \ + vim \ + python3.10-dev \ + python3.10-venv +RUN apt-get purge -y cuda-compat-* + +RUN mkdir -p /var/run/sshd +RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config + +ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/usr/local/lib:$LD_LIBRARY_PATH +ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH + +RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \ + && python3 /tmp/get-pip.py \ + && pip3 install awscli pynvml + +################################################# +## Install NVIDIA GDRCopy +## +## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure +## that the cuda-compat-xx-x package is the latest. 
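+##
+## Optional post-build sanity check — a hypothetical invocation, not part of the build;
+## it assumes the host exposes GPUs to Docker and has the gdrdrv kernel module loaded:
+##   docker run --rm --gpus all ${PPLX_CONTAINER_IMAGE_NAME_TAG} /opt/gdrcopy/bin/sanity -v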
+RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \ + && cd /tmp/gdrcopy \ + && make prefix=/opt/gdrcopy install + +ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH +ENV LIBRARY_PATH=/opt/gdrcopy/lib:$LIBRARY_PATH +ENV CPATH=/opt/gdrcopy/include:$CPATH +ENV PATH=/opt/gdrcopy/bin:$PATH + +################################################# +## Install EFA installer +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf $HOME/aws-efa-installer + +################################################### +## Install NCCL +RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \ + && cd /opt/nccl \ + && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +################################################### +## Install NCCL-tests +RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ + && cd /opt/nccl-tests \ + && make -j $(nproc) \ + MPI=1 \ + MPI_HOME=/opt/amazon/openmpi/ \ + CUDA_HOME=/usr/local/cuda \ + NCCL_HOME=/opt/nccl/build \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +RUN rm -rf /var/lib/apt/lists/* + +## Set Open MPI variables to exclude network interface and conduit. 
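+## (Rationale: `pml=^ucx` disables Open MPI's UCX layer and `btl=tcp,self` keeps MPI
+## point-to-point traffic on plain TCP; MPI is only used to bootstrap the job here,
+## while the GPU data path runs over EFA through libfabric/NCCL.)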
+ENV OMPI_MCA_pml=^ucx \ + OMPI_MCA_btl=tcp,self \ + OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\ + OPAL_PREFIX=/opt/amazon/openmpi \ + NCCL_SOCKET_IFNAME=^docker,lo,veth + +## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516 +ENV PMIX_MCA_gds=hash + +## Set LD_PRELOAD for NCCL library +ENV LD_PRELOAD=/opt/nccl/build/lib/libnccl.so + +################################ NVSHMEM ######################################## + +ENV NVSHMEM_DIR=/opt/nvshmem +ENV NVSHMEM_HOME=/opt/nvshmem + +# 3.2.5-1: wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && tar -xvf nvshmem_src_3.2.5-1.txz +# 3.3.9: wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && tar -xvf nvshmem_src_cuda12-all-all-3.3.9.tar.gz +# 3.4.5-0: git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout df2814155acfba6227534dd81a8bf338da9e55f2 +COPY ./nvshmem /nvshmem_src + +RUN cd /nvshmem_src \ + && mkdir -p build \ + && cd build \ + && cmake \ + -DNVSHMEM_PREFIX=/opt/nvshmem \ + -DCMAKE_INSTALL_PREFIX=/opt/nvshmem \ + \ + -DCUDA_HOME=/usr/local/cuda \ + -DCMAKE_CUDA_ARCHITECTURES="90a;100" \ + \ + -DNVSHMEM_USE_GDRCOPY=1 \ + -DGDRCOPY_HOME=/opt/gdrcopy \ + \ + -DNVSHMEM_USE_NCCL=1 \ + -DNCCL_HOME=/opt/nccl/build \ + -DNCCL_INCLUDE=/opt/nccl/build/include \ + \ + -DNVSHMEM_LIBFABRIC_SUPPORT=1 \ + -DLIBFABRIC_HOME=/opt/amazon/efa \ + \ + -DNVSHMEM_MPI_SUPPORT=1 \ + -DMPI_HOME=/opt/amazon/openmpi \ + \ + -DNVSHMEM_PMIX_SUPPORT=1 \ + -DPMIX_HOME=/opt/amazon/pmix \ + -DNVSHMEM_DEFAULT_PMIX=1 \ + \ + -DNVSHMEM_BUILD_TESTS=1 \ + -DNVSHMEM_BUILD_EXAMPLES=1 \ + -DNVSHMEM_BUILD_HYDRA_LAUNCHER=1 \ + -DNVSHMEM_BUILD_TXZ_PACKAGE=1 \ + \ + -DNVSHMEM_IBRC_SUPPORT=1 \ + -DNVSHMEM_IBGDA_SUPPORT=1 \ + \ + -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + .. \ + && make -j$(nproc) \ + && make install + +ENV PATH=/opt/nvshmem/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/nvshmem/lib:$LD_LIBRARY_PATH +# ENV PATH=/opt/nvshmem/bin:$PATH LD_LIBRARY_PATH=/opt/amazon/pmix/lib:/opt/nvshmem/lib:$LD_LIBRARY_PATH NVSHMEM_REMOTE_TRANSPORT=libfabric NVSHMEM_LIBFABRIC_PROVIDER=efa + +################################ PyTorch ######################################## + +RUN pip install torch --index-url https://download.pytorch.org/whl/cu128 +RUN pip install ninja numpy cmake pytest + +################################ PPLX-KERNELS ######################################## + +RUN git clone https://github.com/ppl-ai/pplx-kernels.git /pplx-kernels \ + && cd /pplx-kernels \ + && git checkout 12cecfda252e4e646417ac263d96e994d476ee5d +# COPY pplx-kernels /pplx-kernels + +RUN cd /pplx-kernels \ + && TORCH_CUDA_ARCH_LIST="9.0a+PTX;10.0" python3 setup.py bdist_wheel \ + && pip install dist/*.whl + +ENV PYTHONPATH=/pplx-kernels + +COPY launch_bench_all_to_all.py /launch_bench_all_to_all.py + +RUN mkdir -p /tmp/coredump diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.sbatch b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.sbatch new file mode 100644 index 000000000..85f0d0bc9 --- /dev/null +++ b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.sbatch @@ -0,0 +1,71 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=pplx-ker +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node 8 +##SBATCH --output %x_%j.out +##SBATCH --error %x_%j.err +#SBATCH --exclusive +#SBATCH --wait-all-nodes=1 + +set -ex; + +### Disable hyperthreading by setting the tasks per core to 1 +##SBATCH --ntasks-per-core=1 + +########################### +###### User Variables ##### +########################### + +## Set libfabric flags to use EFA +export FI_PROVIDER=efa +export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d +export FI_EFA_FORK_SAFE=1 + +## Set this flag for debugging EFA +# export FI_LOG_LEVEL=warn + +## NCCL Environment variables +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL + +### Increase the send queue depth and can turn NCCL communications into non-blocking. +### https://www.usenix.org/system/files/atc23-choi.pdf +export NCCL_BUFFSIZE=8388608 +### Improve performance by increasing buffer size for Send/Recv, Gather, Scatter and Alltoall communications +### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html +export NCCL_P2P_NET_CHUNKSIZE=1048576 + +### Improve performance for AllReduce by selecting specific protocol and algorithm for specific +### message size and number of ranks. +### More information https://github.com/aws/aws-ofi-nccl/wiki/Algorithm-and-Protocol-Tuner-for-AWS. +export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-ofi-tuner.so + +export NVSHMEM_DIR=/opt/nvshmem + +export LD_LIBRARY_PATH=/opt/nvshmem/lib:$LD_LIBRARY_PATH + +export NVSHMEM_BOOTSTRAP_PMI=PMIX + +# export NVSHMEM_DEBUG=TRACE +# export NVSHMEM_DEBUG_SUBSYS=ALL + +export NVSHMEM_REMOTE_TRANSPORT=libfabric +export NVSHMEM_LIBFABRIC_PROVIDER=efa + +#Get Hostname and Instance IDs +mpirun -N 1 bash -c 'echo $(hostname): $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")' + +export MASTER_PORT=29500 +export MASTER_ADDR=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1) +export WORLD_SIZE=${SLURM_NPROCS} + +export PYTHONFAULTHANDLER=1 + +srun --container-image ./pplx-kernels.sqsh \ + --mpi=pmix --cpu-bind=none \ + --container-mounts=$(pwd)/coredump:/tmp/coredump,$(pwd)/data:/pplx-kernels/data \ + bash -c "RANK=\${SLURM_PROCID} python3 -X faulthandler launch_bench_all_to_all.py" \ No newline at end of file diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/.gitignore b/3.test_cases/expert-parallelism/uccl-ep-benchmark/.gitignore new file mode 100644 index 000000000..e39592d35 --- /dev/null +++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/.gitignore @@ -0,0 +1,3 @@ +*.sqsh +*.out +*.err diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/LICENSE b/3.test_cases/expert-parallelism/uccl-ep-benchmark/LICENSE new file mode 100644 index 000000000..f4dfd3ce7 --- /dev/null +++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Pavel Belevich + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/README.md b/3.test_cases/expert-parallelism/uccl-ep-benchmark/README.md
new file mode 100644
index 000000000..8a99c45e7
--- /dev/null
+++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/README.md
@@ -0,0 +1,33 @@
+# UCCL-EP Benchmark
+
+https://uccl-project.github.io/posts/uccl-ep/
+
+```bash
+docker build -t uccl-ep -f uccl-ep.Dockerfile .
+```
+
+```bash
+enroot import -o ./uccl-ep.sqsh dockerd://uccl-ep
+```
+
+test_internode.sbatch
+```bash
+sbatch test_internode.sbatch
+```
+
+test_intranode.sbatch
+```bash
+sbatch test_intranode.sbatch
+```
+
+| Type | Dispatch #EP | Bottleneck bandwidth | Combine #EP | Bottleneck bandwidth |
+|:---------:|:-------------:|:--------------------:|:------------:|:--------------------:|
+| Intranode | 8 | 318-321 GB/s (NVLink)| 8 | 320-323 GB/s (NVLink)|
+| Internode | 16 | 48-54 GB/s (RDMA) | 16 | 15-19 GB/s (RDMA) |
+| Internode | 24 | 52-55 GB/s (RDMA) | 24 | 23-29 GB/s (RDMA) |
+| Internode | 32 | 53-55 GB/s (RDMA) | 32 | 40-42 GB/s (RDMA) |
+
+test_low_latency.sbatch
+```bash
+sbatch test_low_latency.sbatch
+```
diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_internode.sbatch b/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_internode.sbatch
new file mode 100644
index 000000000..716c3b60a
--- /dev/null
+++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_internode.sbatch
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+#SBATCH --job-name=internode
+#SBATCH --nodes=6
+#SBATCH --ntasks-per-node 1
+##SBATCH --output %x_%j.out
+##SBATCH --error %x_%j.err
+#SBATCH --exclusive
+#SBATCH --wait-all-nodes=1
+
+set -ex;
+
+###########################
+###### User Variables #####
+###########################
+
+## Set libfabric flags to use EFA
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
+export FI_EFA_FORK_SAFE=1
+
+## Set this flag for debugging EFA
+# export FI_LOG_LEVEL=warn
+
+## NCCL Environment variables
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=ALL
+
+export OMP_NUM_THREADS=8
+export PYTHONFAULTHANDLER=1
+
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=12355
+SLURM_GPUS_PER_NODE=8
+
+srun -l \
+    --container-image ./uccl-ep.sqsh \
+    torchrun \
+    --nnodes=$SLURM_JOB_NUM_NODES \
+    --nproc_per_node=$SLURM_GPUS_PER_NODE \
+    --rdzv_id=$SLURM_JOB_ID \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
+    bench/test_internode.py \
+    --num-tokens=4096 \
+    --hidden=7168 \
+    --num-topk=8 \
+    --num-experts=288 \
+    --test-ll-compatibility
diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_intranode.sbatch b/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_intranode.sbatch
new file mode 100644
index 000000000..62b2c1e9f
--- /dev/null
+++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_intranode.sbatch
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+#SBATCH --job-name=intranode
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node 1
+##SBATCH --output %x_%j.out
+##SBATCH --error %x_%j.err
+#SBATCH --exclusive
+#SBATCH --wait-all-nodes=1
+
+set -ex;
+
+###########################
+###### User Variables #####
+###########################
+
+## Set libfabric flags to use EFA
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
+export FI_EFA_FORK_SAFE=1
+
+## Set this flag for debugging EFA
+# export FI_LOG_LEVEL=warn
+
+## NCCL Environment variables
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=ALL
+
+export OMP_NUM_THREADS=8
+export PYTHONFAULTHANDLER=1
+
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=12355
+SLURM_GPUS_PER_NODE=8
+
+srun -l \
+    --container-image ./uccl-ep.sqsh \
+    torchrun \
+    --nnodes=$SLURM_JOB_NUM_NODES \
+    --nproc_per_node=$SLURM_GPUS_PER_NODE \
+    --rdzv_id=$SLURM_JOB_ID \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
+    bench/test_intranode.py \
+    --num-tokens 4096 \
+    --hidden 7168 \
+    --num-topk 8 \
+    --num-experts 256
diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_low_latency.sbatch b/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_low_latency.sbatch
new file mode 100644
index 000000000..e7690b9e9
--- /dev/null
+++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_low_latency.sbatch
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+#SBATCH --job-name=lowlatency
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node 1
+##SBATCH --output %x_%j.out
+##SBATCH --error %x_%j.err
+#SBATCH --exclusive
+#SBATCH --wait-all-nodes=1
+
+set -ex;
+
+###########################
+###### User Variables #####
+###########################
+
+## Set libfabric flags to use EFA
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
+export FI_EFA_FORK_SAFE=1
+
+## Set this flag for debugging EFA
+# export FI_LOG_LEVEL=warn
+
+## NCCL Environment variables
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=ALL
+
+export OMP_NUM_THREADS=8
+export PYTHONFAULTHANDLER=1
+
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=12355
+SLURM_GPUS_PER_NODE=8
+
+srun -l \
+    --container-image ./uccl-ep.sqsh \
+    torchrun \
+    --nnodes=$SLURM_JOB_NUM_NODES \
+    --nproc_per_node=$SLURM_GPUS_PER_NODE \
+    --rdzv_id=$SLURM_JOB_ID \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
+    bench/test_low_latency.py \
+    --num-tokens=128 \
+    --hidden=7168 \
+    --num-topk=8 \
+    --num-experts=288
diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/uccl-ep.Dockerfile b/3.test_cases/expert-parallelism/uccl-ep-benchmark/uccl-ep.Dockerfile
new file mode 100644
index 000000000..573a646eb
--- /dev/null
+++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/uccl-ep.Dockerfile
@@ -0,0 +1,136 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0 +ARG CUDA_VERSION=12.8.1 +FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 + +ARG GDRCOPY_VERSION=v2.5.1 +ARG EFA_INSTALLER_VERSION=1.43.2 +ARG AWS_OFI_NCCL_VERSION=v1.16.3 +ARG NCCL_VERSION=v2.27.7-1 +ARG NCCL_TESTS_VERSION=v2.16.9 + +RUN apt-get update -y && apt-get upgrade -y +RUN apt-get remove -y --allow-change-held-packages \ + ibverbs-utils \ + libibverbs-dev \ + libibverbs1 \ + libmlx5-1 \ + libnccl2 \ + libnccl-dev + +RUN rm -rf /opt/hpcx \ + && rm -rf /usr/local/mpi \ + && rm -f /etc/ld.so.conf.d/hpcx.conf \ + && ldconfig + +ENV OPAL_PREFIX= + +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + apt-utils \ + autoconf \ + automake \ + build-essential \ + check \ + cmake \ + curl \ + debhelper \ + devscripts \ + git \ + gcc \ + gdb \ + kmod \ + libsubunit-dev \ + libtool \ + openssh-client \ + openssh-server \ + pkg-config \ + python3-distutils \ + vim \ + python3.10-dev \ + python3.10-venv +RUN apt-get purge -y cuda-compat-* + +RUN mkdir -p /var/run/sshd +RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config + +# Set paths for both aarch64 and x86_64 +ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib/aarch64-linux-gnu:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/usr/local/lib:$LD_LIBRARY_PATH +ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH + +RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \ + && python3 /tmp/get-pip.py \ + && pip3 install awscli pynvml + +################################################# +## Install NVIDIA GDRCopy +## +## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure +## that the cuda-compat-xx-x package is the latest. 
+RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \ + && cd /tmp/gdrcopy \ + && make prefix=/opt/gdrcopy install + +ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH +ENV LIBRARY_PATH=/opt/gdrcopy/lib:$LIBRARY_PATH +ENV CPATH=/opt/gdrcopy/include:$CPATH +ENV PATH=/opt/gdrcopy/bin:$PATH + +################################################# +## Install EFA installer +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf $HOME/aws-efa-installer + +################################################### +## Install NCCL +RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \ + && cd /opt/nccl \ + && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +################################################### +## Install NCCL-tests +RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ + && cd /opt/nccl-tests \ + && make -j $(nproc) \ + MPI=1 \ + MPI_HOME=/opt/amazon/openmpi/ \ + CUDA_HOME=/usr/local/cuda \ + NCCL_HOME=/opt/nccl/build \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +RUN rm -rf /var/lib/apt/lists/* + +## Set Open MPI variables to exclude network interface and conduit. +ENV OMPI_MCA_pml=^ucx \ + OMPI_MCA_btl=tcp,self \ + OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\ + OPAL_PREFIX=/opt/amazon/openmpi \ + NCCL_SOCKET_IFNAME=^docker,lo,veth + +## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516 +ENV PMIX_MCA_gds=hash + +## Set LD_PRELOAD for NCCL library +ENV LD_PRELOAD=/opt/nccl/build/lib/libnccl.so + +################################################### +## Install uccl-ep + +RUN apt-get update && apt-get install -y \ + sudo \ + libnuma-dev + +RUN git clone https://github.com/uccl-project/uccl.git /opt/uccl \ + && cd /opt/uccl \ + # && git checkout \ + && cd ep \ + && ./install_deps.sh \ + && make -j install + +WORKDIR /opt/uccl/ep
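+
+## Smoke test (hypothetical invocation mirroring test_intranode.sbatch; assumes a single
+## node with 8 GPUs, run from this WORKDIR inside the container):
+##   torchrun --nproc_per_node=8 bench/test_intranode.py \
+##       --num-tokens 4096 --hidden 7168 --num-topk 8 --num-experts 256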