From 261b58daced5f677259c77a9a968a4a1978ba458 Mon Sep 17 00:00:00 2001 From: Pavel Belevich Date: Wed, 19 Nov 2025 21:27:31 +0000 Subject: [PATCH] expert parallelism benchmarks --- .../deepep-benchmark/.gitignore | 1 + .../deepep-benchmark/README.md | 147 +++ .../deepep-benchmark/deepep.Dockerfile | 191 ++++ .../pplx-garden-benchmark/.gitignore | 5 + .../pplx-garden-benchmark/README.md | 58 ++ .../pplx-garden.Dockerfile | 9 + .../pplx-garden-benchmark/pplx-garden.sbatch | 64 ++ .../pplx-garden_uccl-ep.Dockerfile | 13 + .../uccl-pplx-garden.sbatch | 68 ++ .../uccl_bench_all_to_all.py | 838 ++++++++++++++++++ .../pplx-kernels-benchmark/.gitignore | 4 + .../pplx-kernels-benchmark/README.md | 60 ++ .../pplx-kernels-benchmark/coredump/.gitkeep | 0 .../pplx-kernels-benchmark/data/.gitkeep | 0 .../launch_bench_all_to_all.py | 62 ++ .../pplx-kernels.Dockerfile | 197 ++++ .../pplx-kernels.sbatch | 71 ++ .../uccl-ep-benchmark/.gitignore | 3 + .../uccl-ep-benchmark/LICENSE | 21 + .../uccl-ep-benchmark/README.md | 33 + .../uccl-ep-benchmark/test_internode.sbatch | 52 ++ .../uccl-ep-benchmark/test_intranode.sbatch | 51 ++ .../uccl-ep-benchmark/test_low_latency.sbatch | 51 ++ .../uccl-ep-benchmark/uccl-ep.Dockerfile | 136 +++ 24 files changed, 2135 insertions(+) create mode 100644 3.test_cases/expert-parallelism/deepep-benchmark/.gitignore create mode 100644 3.test_cases/expert-parallelism/deepep-benchmark/README.md create mode 100644 3.test_cases/expert-parallelism/deepep-benchmark/deepep.Dockerfile create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/.gitignore create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/README.md create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.Dockerfile create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.sbatch create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden_uccl-ep.Dockerfile create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl-pplx-garden.sbatch create mode 100644 3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl_bench_all_to_all.py create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/.gitignore create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/README.md create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/coredump/.gitkeep create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/data/.gitkeep create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/launch_bench_all_to_all.py create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.Dockerfile create mode 100644 3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.sbatch create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/.gitignore create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/LICENSE create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/README.md create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/test_internode.sbatch create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/test_intranode.sbatch create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/test_low_latency.sbatch create mode 100644 3.test_cases/expert-parallelism/uccl-ep-benchmark/uccl-ep.Dockerfile diff --git a/3.test_cases/expert-parallelism/deepep-benchmark/.gitignore b/3.test_cases/expert-parallelism/deepep-benchmark/.gitignore new file 
mode 100644
index 000000000..270f8d417
--- /dev/null
+++ b/3.test_cases/expert-parallelism/deepep-benchmark/.gitignore
@@ -0,0 +1 @@
+*.sqsh
diff --git a/3.test_cases/expert-parallelism/deepep-benchmark/README.md b/3.test_cases/expert-parallelism/deepep-benchmark/README.md
new file mode 100644
index 000000000..eef71beec
--- /dev/null
+++ b/3.test_cases/expert-parallelism/deepep-benchmark/README.md
@@ -0,0 +1,147 @@
+# DeepEP Benchmark
+https://github.com/deepseek-ai/DeepEP
+
+Updated to [NVSHMEM 3.4.5-0](https://github.com/NVIDIA/nvshmem/commit/df2814155acfba6227534dd81a8bf338da9e55f2) and DeepEP [Sep 25, 2025](https://github.com/deepseek-ai/DeepEP/tree/e02e4d2e1fbfdf09e02e870b6acc5831cbd11e39)
+
+## Git clone NVSHMEM
+
+3.4.5-0:
+```
+git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout df2814155acfba6227534dd81a8bf338da9e55f2 && cd ..
+```
+
+devel branch:
+```
+git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout devel && cd ..
+```
+
+## Building DeepEP Docker image
+
+```bash
+GDRCOPY_VERSION=v2.5.1
+EFA_INSTALLER_VERSION=1.43.2
+NCCL_VERSION=v2.27.7-1
+NCCL_TESTS_VERSION=v2.16.9
+NVSHMEM_VERSION=3.4.5-0
+TAG="efa${EFA_INSTALLER_VERSION}-nccl${NCCL_VERSION}-tests${NCCL_TESTS_VERSION}-nvshmem${NVSHMEM_VERSION}"
+DEEPEP_CONTAINER_IMAGE_NAME_TAG="deepep:${TAG}"
+```
+
+```bash
+docker build --progress=plain -f ./deepep.Dockerfile \
+    --build-arg="EFA_INSTALLER_VERSION=${EFA_INSTALLER_VERSION}" \
+    --build-arg="NCCL_VERSION=${NCCL_VERSION}" \
+    --build-arg="NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}" \
+    --build-arg="NVSHMEM_VERSION=${NVSHMEM_VERSION}" \
+    -t ${DEEPEP_CONTAINER_IMAGE_NAME_TAG} \
+    .
+```
+
+```bash
+enroot import -o ./deepep.sqsh dockerd://${DEEPEP_CONTAINER_IMAGE_NAME_TAG}
+```
+
+## Running DeepEP Benchmark
+
+### Intranode
+
+```bash
+srun --mpi=pmix --cpu-bind=none --container-image ./deepep.sqsh python /DeepEP/tests/test_intranode.py
+```
+
+#### P5en results
+DeepEP commit [e02e4d2e1fbfdf09e02e870b6acc5831cbd11e39](https://github.com/deepseek-ai/DeepEP/tree/e02e4d2e1fbfdf09e02e870b6acc5831cbd11e39)
+```
+[config] num_tokens=4096, hidden=7168, num_topk=8
+[layout] Kernel performance: 0.041 ms
+
+[testing] Running with BF16, without top-k (async=False, previous=False) ... passed
+[testing] Running with BF16, with top-k (async=False, previous=False) ... passed
+[testing] Running with BF16, without top-k (async=False, previous=False) ... passed
+[testing] Running with BF16, with top-k (async=False, previous=False) ... passed
+[testing] Running with FP8, without top-k (async=False, previous=False) ... passed
+[testing] Running with FP8, with top-k (async=False, previous=False) ... passed
+[testing] Running with BF16, without top-k (async=True, previous=False) ... passed
+[testing] Running with BF16, with top-k (async=True, previous=False) ... passed
+[testing] Running with BF16, without top-k (async=True, previous=False) ... passed
+[testing] Running with BF16, with top-k (async=True, previous=False) ... passed
+[testing] Running with FP8, without top-k (async=True, previous=False) ... passed
+[testing] Running with FP8, with top-k (async=True, previous=False) ... passed
+[testing] Running with BF16, without top-k (async=False, previous=True) ... passed
+[testing] Running with BF16, with top-k (async=False, previous=True) ... passed
+[testing] Running with BF16, without top-k (async=False, previous=True) ... passed
+[testing] Running with BF16, with top-k (async=False, previous=True) ... 
passed +[testing] Running with FP8, without top-k (async=False, previous=True) ... passed +[testing] Running with FP8, with top-k (async=False, previous=True) ... passed +[testing] Running with BF16, without top-k (async=True, previous=True) ... passed +[testing] Running with BF16, with top-k (async=True, previous=True) ... passed +[testing] Running with BF16, without top-k (async=True, previous=True) ... passed +[testing] Running with BF16, with top-k (async=True, previous=True) ... passed +[testing] Running with FP8, without top-k (async=True, previous=True) ... passed +[testing] Running with FP8, with top-k (async=True, previous=True) ... passed + +[tuning] SMs 24, NVL chunk 4: 294.24 GB/s (NVL), 544.47 us +[tuning] SMs 24, NVL chunk 6: 320.68 GB/s (NVL), 499.58 us +[tuning] SMs 24, NVL chunk 8: 317.79 GB/s (NVL), 504.13 us +[tuning] SMs 24, NVL chunk 10: 316.46 GB/s (NVL), 506.25 us +[tuning] SMs 24, NVL chunk 12: 308.37 GB/s (NVL), 519.53 us +[tuning] SMs 24, NVL chunk 14: 298.15 GB/s (NVL), 537.34 us +[tuning] SMs 24, NVL chunk 16: 292.44 GB/s (NVL), 547.83 us +[tuning] SMs 24, NVL chunk 18: 297.46 GB/s (NVL), 538.58 us +[tuning] SMs 24, NVL chunk 20: 293.29 GB/s (NVL), 546.24 us +[tuning] SMs 24, NVL chunk 22: 287.31 GB/s (NVL), 557.62 us +[tuning] SMs 24, NVL chunk 24: 287.20 GB/s (NVL), 557.83 us +[tuning] SMs 24, NVL chunk 26: 286.76 GB/s (NVL), 558.67 us +[tuning] SMs 24, NVL chunk 28: 287.96 GB/s (NVL), 556.35 us +[tuning] SMs 24, NVL chunk 30: 282.88 GB/s (NVL), 566.33 us +[tuning] SMs 24, NVL chunk 32: 281.40 GB/s (NVL), 569.32 us +[tuning] SMs 24, NVL chunk default: 319.82 GB/s (NVL), 500.93 us +[tuning] Best dispatch (FP8): SMs 24, NVL chunk 6, 320.68 GB/s (NVL), t: 499.58 us + +[tuning] SMs 24, NVL chunk 4: 331.77 GB/s (NVL), 936.50 us +[tuning] SMs 24, NVL chunk 6: 304.74 GB/s (NVL), 1019.58 us +[tuning] SMs 24, NVL chunk 8: 305.57 GB/s (NVL), 1016.81 us +[tuning] SMs 24, NVL chunk 10: 305.73 GB/s (NVL), 1016.26 us +[tuning] SMs 24, NVL chunk 12: 303.80 GB/s (NVL), 1022.74 us +[tuning] SMs 24, NVL chunk 14: 300.82 GB/s (NVL), 1032.85 us +[tuning] SMs 24, NVL chunk 16: 300.27 GB/s (NVL), 1034.75 us +[tuning] SMs 24, NVL chunk 18: 301.12 GB/s (NVL), 1031.83 us +[tuning] SMs 24, NVL chunk 20: 298.67 GB/s (NVL), 1040.29 us +[tuning] SMs 24, NVL chunk 22: 296.76 GB/s (NVL), 1046.98 us +[tuning] SMs 24, NVL chunk 24: 296.46 GB/s (NVL), 1048.05 us +[tuning] SMs 24, NVL chunk 26: 294.70 GB/s (NVL), 1054.29 us +[tuning] SMs 24, NVL chunk 28: 293.73 GB/s (NVL), 1057.80 us +[tuning] SMs 24, NVL chunk 30: 292.28 GB/s (NVL), 1063.03 us +[tuning] SMs 24, NVL chunk 32: 292.16 GB/s (NVL), 1063.47 us +[tuning] SMs 24, NVL chunk default: 305.72 GB/s (NVL), 1016.31 us +[tuning] Best dispatch (BF16): SMs 24, NVL chunk 4, 331.77 GB/s (NVL), t: 936.50 us + +[tuning] SMs 24, NVL chunk 1: 159.88 GB/s (NVL), 1943.39 us +[tuning] SMs 24, NVL chunk 2: 277.52 GB/s (NVL), 1119.56 us +[tuning] SMs 24, NVL chunk 3: 316.19 GB/s (NVL), 982.64 us +[tuning] SMs 24, NVL chunk 4: 321.89 GB/s (NVL), 965.24 us +[tuning] SMs 24, NVL chunk 5: 311.73 GB/s (NVL), 996.72 us +[tuning] SMs 24, NVL chunk 6: 294.88 GB/s (NVL), 1053.67 us +[tuning] SMs 24, NVL chunk 7: 304.14 GB/s (NVL), 1021.57 us +[tuning] SMs 24, NVL chunk 8: 288.61 GB/s (NVL), 1076.55 us +[tuning] SMs 24, NVL chunk 9: 284.72 GB/s (NVL), 1091.26 us +[tuning] SMs 24, NVL chunk 10: 289.42 GB/s (NVL), 1073.55 us +[tuning] SMs 24, NVL chunk 11: 284.57 GB/s (NVL), 1091.85 us +[tuning] SMs 24, NVL chunk 12: 284.85 GB/s (NVL), 1090.75 us +[tuning] SMs 24, 
NVL chunk 13: 288.21 GB/s (NVL), 1078.05 us +[tuning] SMs 24, NVL chunk 14: 285.78 GB/s (NVL), 1087.20 us +[tuning] SMs 24, NVL chunk 15: 283.55 GB/s (NVL), 1095.76 us +[tuning] SMs 24, NVL chunk 16: 283.94 GB/s (NVL), 1094.27 us +[tuning] SMs 24, NVL chunk default: 319.88 GB/s (NVL), 971.32 us +[tuning] Best combine: SMs 24, NVL chunk 4: 321.89 GB/s (NVL), t: 965.24 us +``` + +### Internode + +```bash +srun \ + -l --mpi=pmix --cpu-bind=none \ + --container-image ./deepep.sqsh \ + -N 2 \ + bash -c 'MASTER_ADDR=${SLURM_NODELIST%%,*} WORLD_SIZE=$SLURM_NNODES RANK=$SLURM_PROCID NVSHMEM_REMOTE_TRANSPORT=libfabric NVSHMEM_LIBFABRIC_PROVIDER=efa python3 -u -X faulthandler /DeepEP/tests/test_internode.py' +``` diff --git a/3.test_cases/expert-parallelism/deepep-benchmark/deepep.Dockerfile b/3.test_cases/expert-parallelism/deepep-benchmark/deepep.Dockerfile new file mode 100644 index 000000000..6bc3277f1 --- /dev/null +++ b/3.test_cases/expert-parallelism/deepep-benchmark/deepep.Dockerfile @@ -0,0 +1,191 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +ARG CUDA_VERSION=12.8.1 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 + +################################ NCCL ######################################## + +ARG GDRCOPY_VERSION=v2.5.1 +ARG EFA_INSTALLER_VERSION=1.43.2 +ARG AWS_OFI_NCCL_VERSION=v1.16.3 +ARG NCCL_VERSION=v2.27.7-1 +ARG NCCL_TESTS_VERSION=v2.16.9 + +RUN apt-get update -y && apt-get upgrade -y +RUN apt-get remove -y --allow-change-held-packages \ + ibverbs-utils \ + libibverbs-dev \ + libibverbs1 \ + libmlx5-1 \ + libnccl2 \ + libnccl-dev + +RUN rm -rf /opt/hpcx \ + && rm -rf /usr/local/mpi \ + && rm -f /etc/ld.so.conf.d/hpcx.conf \ + && ldconfig + +ENV OPAL_PREFIX= + +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + apt-utils \ + autoconf \ + automake \ + build-essential \ + check \ + cmake \ + curl \ + debhelper \ + devscripts \ + git \ + gcc \ + gdb \ + kmod \ + libsubunit-dev \ + libtool \ + openssh-client \ + openssh-server \ + pkg-config \ + python3-distutils \ + vim \ + python3.10-dev \ + python3.10-venv +RUN apt-get purge -y cuda-compat-* + +RUN mkdir -p /var/run/sshd +RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config + +ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/usr/local/lib:$LD_LIBRARY_PATH +ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH + +RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \ + && python3 /tmp/get-pip.py \ + && pip3 install awscli pynvml + +################################################# +## Install NVIDIA GDRCopy +## +## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure +## that the cuda-compat-xx-x package is the latest. 
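+## GDRCopy is built from source and installed under /opt/gdrcopy; the ENV lines below expose
+## its binaries, headers and libraries so the NVSHMEM build further down can find it via GDRCOPY_HOME.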
+RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \ + && cd /tmp/gdrcopy \ + && make prefix=/opt/gdrcopy install + +ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH +ENV LIBRARY_PATH=/opt/gdrcopy/lib:$LIBRARY_PATH +ENV CPATH=/opt/gdrcopy/include:$CPATH +ENV PATH=/opt/gdrcopy/bin:$PATH + +################################################# +## Install EFA installer +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf $HOME/aws-efa-installer + +################################################### +## Install NCCL +RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \ + && cd /opt/nccl \ + && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +################################################### +## Install NCCL-tests +RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ + && cd /opt/nccl-tests \ + && make -j $(nproc) \ + MPI=1 \ + MPI_HOME=/opt/amazon/openmpi/ \ + CUDA_HOME=/usr/local/cuda \ + NCCL_HOME=/opt/nccl/build \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +RUN rm -rf /var/lib/apt/lists/* + +## Set Open MPI variables to exclude network interface and conduit. 
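+## pml=^ucx disables the UCX point-to-point layer (EFA uses libfabric instead), btl=tcp,self keeps
+## plain TCP for MPI control traffic, and the if_exclude/IFNAME lists keep Open MPI and NCCL off
+## the loopback and Docker virtual interfaces.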
+ENV OMPI_MCA_pml=^ucx \ + OMPI_MCA_btl=tcp,self \ + OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\ + OPAL_PREFIX=/opt/amazon/openmpi \ + NCCL_SOCKET_IFNAME=^docker,lo,veth + +## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516 +ENV PMIX_MCA_gds=hash + +## Set LD_PRELOAD for NCCL library +ENV LD_PRELOAD=/opt/nccl/build/lib/libnccl.so + +################################ NVSHMEM ######################################## + +ENV NVSHMEM_DIR=/opt/nvshmem +ENV NVSHMEM_HOME=/opt/nvshmem + +# 3.2.5-1: wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && tar -xvf nvshmem_src_3.2.5-1.txz +# 3.3.9: wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && tar -xvf nvshmem_src_cuda12-all-all-3.3.9.tar.gz +# 3.4.5-0: git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout df2814155acfba6227534dd81a8bf338da9e55f2 +COPY ./nvshmem /nvshmem_src + +RUN cd /nvshmem_src \ + && mkdir -p build \ + && cd build \ + && cmake \ + -DNVSHMEM_PREFIX=/opt/nvshmem \ + -DCMAKE_INSTALL_PREFIX=/opt/nvshmem \ + \ + -DCUDA_HOME=/usr/local/cuda \ + -DCMAKE_CUDA_ARCHITECTURES="90a;100" \ + \ + -DNVSHMEM_USE_GDRCOPY=1 \ + -DGDRCOPY_HOME=/opt/gdrcopy \ + \ + -DNVSHMEM_USE_NCCL=1 \ + -DNCCL_HOME=/opt/nccl/build \ + -DNCCL_INCLUDE=/opt/nccl/build/include \ + \ + -DNVSHMEM_LIBFABRIC_SUPPORT=1 \ + -DLIBFABRIC_HOME=/opt/amazon/efa \ + \ + -DNVSHMEM_MPI_SUPPORT=1 \ + -DMPI_HOME=/opt/amazon/openmpi \ + \ + -DNVSHMEM_PMIX_SUPPORT=1 \ + -DPMIX_HOME=/opt/amazon/pmix \ + -DNVSHMEM_DEFAULT_PMIX=1 \ + \ + -DNVSHMEM_BUILD_TESTS=1 \ + -DNVSHMEM_BUILD_EXAMPLES=1 \ + -DNVSHMEM_BUILD_HYDRA_LAUNCHER=1 \ + -DNVSHMEM_BUILD_TXZ_PACKAGE=1 \ + \ + -DNVSHMEM_IBRC_SUPPORT=1 \ + -DNVSHMEM_IBGDA_SUPPORT=1 \ + \ + -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + .. \ + && make -j$(nproc) \ + && make install + +ENV PATH=/opt/nvshmem/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/nvshmem/lib:$LD_LIBRARY_PATH +# ENV PATH=/opt/nvshmem/bin:$PATH LD_LIBRARY_PATH=/opt/amazon/pmix/lib:/opt/nvshmem/lib:$LD_LIBRARY_PATH NVSHMEM_REMOTE_TRANSPORT=libfabric NVSHMEM_LIBFABRIC_PROVIDER=efa + +################################ PyTorch ######################################## + +RUN pip install torch --index-url https://download.pytorch.org/whl/cu128 +RUN pip install ninja numpy cmake pytest + +################################ DeepEP ######################################## + +ARG DEEPEP_COMMIT=e02e4d2e1fbfdf09e02e870b6acc5831cbd11e39 + +RUN git clone https://github.com/deepseek-ai/DeepEP.git /DeepEP \ + && cd /DeepEP \ + && git checkout ${DEEPEP_COMMIT} \ + && TORCH_CUDA_ARCH_LIST="9.0a+PTX;10.0" pip install . 
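+
+## Pre-create the directory used for core dumps; the kernel's core_pattern can be pointed at
+## /tmp/coredump/core.%e.%p as described in the pplx-kernels benchmark README.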
+RUN mkdir -p /tmp/coredump
diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/.gitignore b/3.test_cases/expert-parallelism/pplx-garden-benchmark/.gitignore
new file mode 100644
index 000000000..d0e8737cc
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/.gitignore
@@ -0,0 +1,5 @@
+*.sqsh
+*.out
+*.err
+pplx-garden
+uccl
\ No newline at end of file
diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/README.md b/3.test_cases/expert-parallelism/pplx-garden-benchmark/README.md
new file mode 100644
index 000000000..a325bd252
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/README.md
@@ -0,0 +1,58 @@
+# PPLX Garden Benchmark
+
+```bash
+git clone https://github.com/perplexityai/pplx-garden.git
+```
+
+```bash
+docker build -t pplx-garden-dev - < ./pplx-garden/docker/dev.Dockerfile && docker build -t pplx-garden -f pplx-garden.Dockerfile .
+```
+
+```bash
+enroot import -o pplx-garden.sqsh dockerd://pplx-garden
+```
+
+```bash
+sbatch pplx-garden.sbatch
+```
+
+## All-to-All Performance Results
+
+Decode (128 tokens) Dispatch (left half) and Combine (right half) Median Latency (μs):
+
+| | My UCCL EP | My pplx-EFA | pplx-EFA | pplx-CX7 | DeepEP-CX7 | x | My UCCL EP | My pplx-EFA | pplx-EFA | pplx-CX7 | DeepEP-CX7 |
+|-------|-----------:|------------:|---------:|---------:|-----------:|---|-----------:|------------:|---------:|---------:|-----------:|
+| EP128 | | 424.5 | | | | x | | 588.5 | | | |
+| EP64 | crash | 268.6 | 266.7 | 187.5 | 177.9 | x | crash | 393.6 | 391.2 | 309.1 | 325.0 |
+| EP32 | 358.0 | 230.9 | 229.1 | 153.9 | 159.1 | x | 689.3 | 336.9 | 335.0 | 266.3 | 285.0 |
+| EP16 | 301.1 | 218.0 | 214.8 | 110.2 | 123.9 | x | 834.5 | 244.6 | 241.5 | 185.5 | 203.0 |
+| EP8 | 66.1 | 50.6 | 49.7 | 50.5 | 42.6 | x | 86.6 | 64.1 | 64.2 | 65.3 | 72.0 |
+
+
+Prefill (4096 tokens) Dispatch (left half) and Combine (right half) Median Latency (μs):
+
+| | My pplx-EFA | pplx-EFA | pplx-CX7 | DeepEP-CX7 | x | My pplx-EFA | pplx-EFA | pplx-CX7 | DeepEP-CX7 |
+|-------|------------:|----------:|----------:|-----------:|---|------------:|----------:|----------:|-----------:|
+| EP128 | 5883.4 | | | | x | 10785.1 | | | |
+| EP64 | 5395.6 | 5334.3 | 4665.2 | 5071.6 | x | 9854.9 | 9779.3 | 8771.1 | 5922.7 |
+| EP32 | 4605.2 | 4619.0 | 4011.8 | 3680.2 | x | 8286.3 | 8271.5 | 7526.8 | 3565.4 |
+| EP16 | 3181.4 | 3196.7 | 2734.8 | 2481.9 | x | 5373.6 | 5379.1 | 1062.2 | 1863.9 |
+| EP8 | 1076.0 | 1052.4 | 5071.1 | 1810.3 | x | 1354.0 | 1396.7 | 1405.1 | 962.9 |
+
+## UCCL-EP Benchmark
+
+```bash
+git clone https://github.com/uccl-project/uccl.git
+```
+
+```bash
+docker build -t pplx-garden_uccl-ep -f pplx-garden_uccl-ep.Dockerfile .
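+# run this from the directory containing the ./uccl checkout from the clone step above,
+# since pplx-garden_uccl-ep.Dockerfile COPYs uccl into the image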
+```
+
+```bash
+enroot import -o pplx-garden_uccl-ep.sqsh dockerd://pplx-garden_uccl-ep
+```
+
+```bash
+sbatch uccl-pplx-garden.sbatch
+```
diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.Dockerfile b/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.Dockerfile
new file mode 100644
index 000000000..77885a12c
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.Dockerfile
@@ -0,0 +1,9 @@
+FROM pplx-garden-dev:latest
+
+COPY pplx-garden /app
+RUN cd /app \
+    && export TORCH_CMAKE_PREFIX_PATH=$(python3 -c "import torch; print(torch.utils.cmake_prefix_path)") \
+    && python3 -m build --wheel \
+    && python3 -m pip install /app/dist/*.whl
+
+WORKDIR /app
diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.sbatch b/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.sbatch
new file mode 100644
index 000000000..1b5e57c33
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden.sbatch
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+#SBATCH --job-name=pplx-garden
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node 1
+##SBATCH --output %x_%j.out
+##SBATCH --error %x_%j.err
+#SBATCH --exclusive
+#SBATCH --wait-all-nodes=1
+
+set -ex;
+
+### Disable hyperthreading by setting the tasks per core to 1
+##SBATCH --ntasks-per-core=1
+
+###########################
+###### User Variables #####
+###########################
+
+## Set libfabric flags to use EFA
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
+export FI_EFA_FORK_SAFE=1
+
+## Set this flag for debugging EFA
+# export FI_LOG_LEVEL=warn
+
+## NCCL Environment variables
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=ALL
+
+### Increase the send queue depth, which can make NCCL communications non-blocking.
+### https://www.usenix.org/system/files/atc23-choi.pdf
+export NCCL_BUFFSIZE=8388608
+### Improve performance by increasing the buffer size for Send/Recv, Gather, Scatter and Alltoall communications.
+### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html
+export NCCL_P2P_NET_CHUNKSIZE=1048576
+
+### Improve performance for AllReduce by selecting a specific protocol and algorithm for specific
+### message sizes and numbers of ranks.
+### More information: https://github.com/aws/aws-ofi-nccl/wiki/Algorithm-and-Protocol-Tuner-for-AWS.
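+### Path below assumes aws-ofi-nccl is installed under /opt/amazon/ofi-nccl inside the container image.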
+export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-ofi-tuner.so
+
+#Get Hostname and Instance IDs
+mpirun -N 1 bash -c 'echo $(hostname): $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")'
+
+export MASTER_PORT=29500
+export MASTER_ADDR=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1)
+export NUM_NODES=${SLURM_NNODES}
+export NUM_GPUS=8
+
+srun -l \
+    --container-image ./pplx-garden.sqsh \
+    --mpi=pmix --cpu-bind=none \
+    bash -c 'python3 -X faulthandler -m benchmarks.bench_all_to_all \
+        --world-size $((NUM_NODES * NUM_GPUS)) \
+        --nets-per-gpu 2 \
+        --init-method=tcp://${MASTER_ADDR}:${MASTER_PORT} \
+        --node-rank=${SLURM_NODEID} \
+        --nvlink=8 \
+        --max-num-tokens 128'
diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden_uccl-ep.Dockerfile b/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden_uccl-ep.Dockerfile
new file mode 100644
index 000000000..6b0003eae
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/pplx-garden_uccl-ep.Dockerfile
@@ -0,0 +1,13 @@
+FROM pplx-garden:latest
+
+RUN apt-get update && apt-get install -y \
+    sudo \
+    libnuma-dev
+
+COPY uccl /opt/uccl
+RUN cd /opt/uccl \
+    && cd ep \
+    && ./install_deps.sh \
+    && make -j install
+
+ENV PYTHONPATH=/opt/uccl/ep/bench:$PYTHONPATH
diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl-pplx-garden.sbatch b/3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl-pplx-garden.sbatch
new file mode 100644
index 000000000..dfc488365
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl-pplx-garden.sbatch
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+#SBATCH --job-name=pplx-garden
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node 1
+##SBATCH --output %x_%j.out
+##SBATCH --error %x_%j.err
+#SBATCH --exclusive
+#SBATCH --wait-all-nodes=1
+
+set -ex;
+
+### Disable hyperthreading by setting the tasks per core to 1
+##SBATCH --ntasks-per-core=1
+
+###########################
+###### User Variables #####
+###########################
+
+## Set libfabric flags to use EFA
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
+export FI_EFA_FORK_SAFE=1
+
+## Set this flag for debugging EFA
+# export FI_LOG_LEVEL=warn
+
+## NCCL Environment variables
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=ALL
+
+### Increase the send queue depth, which can make NCCL communications non-blocking.
+### https://www.usenix.org/system/files/atc23-choi.pdf
+export NCCL_BUFFSIZE=8388608
+### Improve performance by increasing the buffer size for Send/Recv, Gather, Scatter and Alltoall communications.
+### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html
+export NCCL_P2P_NET_CHUNKSIZE=1048576
+
+### Improve performance for AllReduce by selecting a specific protocol and algorithm for specific
+### message sizes and numbers of ranks.
+### More information: https://github.com/aws/aws-ofi-nccl/wiki/Algorithm-and-Protocol-Tuner-for-AWS.
+export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-ofi-tuner.so + +#Get Hostname and Instance IDs +mpirun -N 1 bash -c 'echo $(hostname): $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")' + +export MASTER_PORT=29500 +export MASTER_ADDR=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1) +export NUM_NODES=${SLURM_NNODES} +export NUM_GPUS=8 + +export PYTHONPATH=/opt/uccl/ep/bench:$PYTHONPATH + +srun -l \ + --container-image ./pplx-garden_uccl-ep.sqsh \ + --container-mounts ./uccl_bench_all_to_all.py:/app/benchmarks/uccl_bench_all_to_all.py \ + --mpi=pmix --cpu-bind=none \ + bash -c 'python3 -X faulthandler -m benchmarks.uccl_bench_all_to_all \ + --world-size $((NUM_NODES * NUM_GPUS)) \ + --nets-per-gpu 2 \ + --init-method=tcp://${MASTER_ADDR}:${MASTER_PORT} \ + --node-rank=${SLURM_NODEID} \ + --nvlink=8 \ + --max-num-tokens 128 \ + --use-uccl' diff --git a/3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl_bench_all_to_all.py b/3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl_bench_all_to_all.py new file mode 100644 index 000000000..352c38083 --- /dev/null +++ b/3.test_cases/expert-parallelism/pplx-garden-benchmark/uccl_bench_all_to_all.py @@ -0,0 +1,838 @@ +# ruff: noqa: B023 + +import argparse +import json +import os +import time +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Optional + +import torch +import torch.distributed as dist + +from pplx_garden.distributed import ParallelGroup, ParallelLaunch +from pplx_garden.kernels.p2p_all_to_all import P2PAllToAll +from pplx_garden.utils import logging_utils +from pplx_garden.utils.math import Statistics, round_up +from pplx_garden.utils.torch import profile_range, str_to_dtype +from tests.p2p_all_to_all.data import RankTestData + +# Add UCCL imports +try: + from uccl.ep import Config + from buffer import Buffer + from utils import inplace_unique + UCCL_AVAILABLE = True +except ImportError: + UCCL_AVAILABLE = False + Config = None + Buffer = None + +logger = logging_utils.get_logger("bench_all_to_all") + + +def rand_topk_idx( + num_tokens: int, + num_experts: int, + num_topk: int, + generator: torch.Generator, + device: torch.device, +) -> torch.Tensor: + scores = torch.randn( + (num_tokens, num_experts), + dtype=torch.float32, + device=device, + generator=generator, + ) + scores = scores.abs() + 1 + topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=True)[1] + return topk_idx.to(torch.uint32) + + +def act(x: torch.Tensor, x_scale: Optional[torch.Tensor]) -> torch.Tensor: + if x_scale is None: + return x * 2 + + b, h = x.shape + _, hs = x_scale.shape + x_reshaped = x.view(b, hs, h // hs) + result = x_reshaped.to(torch.float32) + result *= x_scale.view(b, hs, 1) + result *= 2 + return result.view(b, h) + + +def make_rng(device: torch.device, rank: int) -> torch.Generator: + generator = torch.Generator(device=device) + generator.manual_seed(rank + 123) + return generator + + +@dataclass(slots=True) +class AllToAllConfig: + nets_per_gpu: int + max_num_tokens: int + max_private_tokens: int + num_experts: int + hidden_dim: int + hidden_dim_scale: Optional[int] + num_experts_per_token: int + in_dtype: torch.dtype + out_dtype: torch.dtype + scale_dtype: Optional[torch.dtype] + nvlink: Optional[int] + + @property + def dispatch_bytes(self) -> int: + dispatch_token_dim = self.hidden_dim * self.in_dtype.itemsize + if self.hidden_dim_scale is not None: + assert self.scale_dtype is not None + dispatch_token_dim += self.hidden_dim_scale 
* self.scale_dtype.itemsize + return self.max_num_tokens * self.num_experts_per_token * dispatch_token_dim + + @property + def combine_bytes(self) -> int: + combine_token_dim = self.hidden_dim * self.out_dtype.itemsize + return self.max_num_tokens * self.num_experts_per_token * combine_token_dim + + +class AllToAllResource: + def __init__( + self, + device: torch.device, + dp_group: ParallelGroup, + global_group: ParallelGroup, + cfg: AllToAllConfig, + ) -> None: + self.device = device + self.dp_group = dp_group + self.global_group = global_group + self.cfg = cfg + + self.expert_padding = expert_padding = 1 + self.dp_rank = global_group.rank // dp_group.size + self.num_dp_groups = num_dp_groups = global_group.size // dp_group.size + self.num_local_experts = num_local_experts = ( + cfg.num_experts // global_group.size + ) + self.num_tokens = num_tokens = num_dp_groups * cfg.max_num_tokens + max_recv_tokens = round_up( + max( + min( + num_tokens * cfg.num_experts_per_token + + num_local_experts * (expert_padding - 1), + num_tokens * num_local_experts, + ), + num_local_experts * expert_padding, + ), + expert_padding, + ) + + node_group: Optional[ParallelGroup] = None + if cfg.nvlink is not None: + assert 0 < cfg.nvlink <= min(8, global_group.size, 8) + node_group = global_group.slice_by_count(global_group.size // cfg.nvlink) + + # Instantiate the all-to-all kernel. + self.all_to_all = P2PAllToAll( + max_num_tokens=cfg.max_num_tokens, + num_experts=cfg.num_experts, + expert_padding=expert_padding, + hidden_dim=cfg.hidden_dim, + hidden_dim_scale=cfg.hidden_dim_scale, + max_private_tokens=cfg.max_private_tokens, + in_dtype=cfg.in_dtype, + out_dtype=cfg.out_dtype, + scale_dtype=cfg.scale_dtype, + num_experts_per_token=cfg.num_experts_per_token, + nets_per_gpu=cfg.nets_per_gpu, + device=device, + dp_group=dp_group, + node_group=node_group, + global_group=global_group, + ) + + # Allocate buffers. + self.expert_num_tokens = torch.empty( + (num_local_experts,), + dtype=torch.int32, + device=device, + ) + self.out_expert_x = torch.empty( + (max_recv_tokens, cfg.hidden_dim), + dtype=cfg.in_dtype, + device=device, + ) + self.out_tokens = torch.empty( + (cfg.max_num_tokens, cfg.hidden_dim), + dtype=cfg.out_dtype, + device=device, + ) + self.expert_y = torch.empty( + (max_recv_tokens, cfg.hidden_dim), + dtype=cfg.out_dtype, + device=device, + ) + self.out_expert_x_scale: torch.Tensor | None = None + if cfg.hidden_dim_scale is not None or cfg.scale_dtype is not None: + assert cfg.scale_dtype is not None + assert cfg.hidden_dim_scale is not None + self.out_expert_x_scale = torch.empty( + (max_recv_tokens, cfg.hidden_dim_scale), + dtype=cfg.scale_dtype, + device=device, + ) + + def create_rank_data(self, dp_rank: int) -> RankTestData: + return RankTestData.create( + num_experts=self.cfg.num_experts, + num_experts_per_token=self.cfg.num_experts_per_token, + max_num_tokens=self.cfg.max_num_tokens, + hidden_dim=self.cfg.hidden_dim, + hidden_dim_scale=self.cfg.hidden_dim_scale, + in_dtype=self.cfg.in_dtype, + scale_dtype=self.cfg.scale_dtype, + generator=make_rng(self.device, dp_rank), + device=self.device, + ) + + +class UCCLAllToAllResource: + """Resource wrapper for UCCL-based all-to-all communication.""" + + def __init__( + self, + device: torch.device, + global_group: ParallelGroup, + cfg: AllToAllConfig, + ) -> None: + if not UCCL_AVAILABLE: + raise RuntimeError("UCCL is not available. 
Please install uccl.ep.") + + self.device = device + self.global_group = global_group + self.cfg = cfg + self.rank = global_group.rank + self.num_ranks = global_group.size + + # Convert ParallelGroup to torch.distributed ProcessGroup + self.dist_group = dist.new_group(list(range(self.num_ranks))) + + self.num_local_experts = cfg.num_experts // self.num_ranks + + # Initialize UCCL buffer + num_sms = 24 + num_nvl_bytes = int(2e9) + num_rdma_bytes = int(1e9) + num_qps_per_rank = max(num_sms, self.num_local_experts) + + num_nodes = self.num_ranks // int(os.environ.get("LOCAL_WORLD_SIZE", 8)) + is_intranode = num_nodes == 1 + + self.buffer = Buffer( + self.dist_group, + num_nvl_bytes, + num_rdma_bytes, + low_latency_mode=False, + num_qps_per_rank=num_qps_per_rank, + explicitly_destroy=True, + is_intranode=is_intranode, + ) + + # Default configs + rdma_buffer_size = 512 + nvl_buffer_size = 720 if self.num_ranks in (144, 160) else 512 + if self.num_ranks == 24: + nvl_buffer_size = 540 + self.dispatch_config = Config(num_sms, 8, nvl_buffer_size, 16, rdma_buffer_size) + self.combine_config = Config(num_sms, 8, nvl_buffer_size, 16, rdma_buffer_size) + + def create_rank_data_uccl(self, rank: int): + """Create test data in UCCL format.""" + num_tokens = self.cfg.max_num_tokens + hidden = self.cfg.hidden_dim + num_experts = self.cfg.num_experts + num_topk = self.cfg.num_experts_per_token + + # Generate input tensor + x = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device=self.device) + + # Generate topk indices + scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device=self.device).abs() + 1 + topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=False)[1] + topk_weights = torch.randn((num_tokens, num_topk), dtype=torch.float32, device=self.device) + + # Calculate rank indices + rank_idx = topk_idx // (num_experts // self.num_ranks) + rank_idx.masked_fill_(topk_idx == -1, -1) + inplace_unique(rank_idx, self.num_ranks) + + return x, topk_idx, topk_weights, rank_idx + + def destroy(self): + """Clean up UCCL resources.""" + self.buffer.destroy() + + +def correctness_check(r: AllToAllResource) -> None: + expected_num_tokens_list = [ + RankTestData.rand_indices_and_count( + num_experts=r.cfg.num_experts, + num_experts_per_token=r.cfg.num_experts_per_token, + max_num_tokens=r.cfg.max_num_tokens, + generator=make_rng(r.device, dp_rank), + device=r.device, + )[1] + for dp_rank in range(r.num_dp_groups) + ] + expected_num_tokens = torch.sum( + torch.stack(expected_num_tokens_list, dim=0), + dim=0, + dtype=torch.int32, + ).to("cpu") + + local_rank = r.create_rank_data(r.dp_rank) + ref_out_tokens = act(local_rank.dp_x, local_rank.dp_x_scale).to(r.cfg.out_dtype) + + # Test run. + r.all_to_all.dispatch( + out_expert_num_tokens=r.expert_num_tokens, + out_expert_x=r.out_expert_x, + out_expert_x_scale=r.out_expert_x_scale, + dp_x=local_rank.dp_x, + dp_x_scale=local_rank.dp_x_scale, + indices=local_rank.indices, + weights=local_rank.weights, + ) + expert_y = act(r.out_expert_x, r.out_expert_x_scale).to(r.cfg.out_dtype) + r.all_to_all.combine( + out_tokens=r.out_tokens, + indices=local_rank.indices, + weights=local_rank.weights, + expert_y=expert_y, + bound_m=local_rank.bound_m, + ) + torch.cuda.synchronize() + + # Verify the token counts. 
+ first_expert = r.global_group.rank * r.num_local_experts + last_expert = min(first_expert + r.num_local_experts, r.cfg.num_experts) + expected_local_tokens = expected_num_tokens[first_expert:last_expert] + torch.testing.assert_close(expected_local_tokens, r.expert_num_tokens.to("cpu")) + + # Verify the tokens. + def hash_token(x: torch.Tensor) -> str: + return ",".join(f"{v:.2f}" for v in x.tolist()) + + tokens_on_rank = set() + index = 0 + for n in expected_local_tokens.tolist(): + for token in r.out_expert_x[index : index + n]: + tokens_on_rank.add(hash_token(token)) + + index = round_up(index + n, r.expert_padding) + + # Verify the tokens on the rank. + num_missing = 0 + for i, (token, routes) in enumerate( + zip(list(local_rank.dp_x), local_rank.indices.tolist()) + ): + if not any(first_expert <= route < last_expert for route in routes): + continue + key = hash_token(token) + if key not in tokens_on_rank: + num_missing += 1 + logger.error( + "Token %i: %s not found in output on rank %i (routed to %s)", + i, + key, + r.dp_rank, + ", ".join(str(route) for route in routes), + ) + assert num_missing == 0, f"Missing {num_missing} tokens on rank {r.dp_rank}" + + # Verify the combine output. + torch.testing.assert_close(r.out_tokens, ref_out_tokens) + + +def benchmark( + r: AllToAllResource, num_warmup: int, num_repeats: int, output: Path +) -> None: + local_rank = r.create_rank_data(r.dp_rank) + rng = make_rng(r.device, r.dp_rank) + out_dummy = torch.empty((1,), dtype=torch.float32, device=r.device) + gemm = torch.empty( + (2048, 2048) if r.cfg.max_num_tokens <= 128 else (8192, 8192), + dtype=torch.float32, + device=r.device, + ) + + def wait() -> None: + # Wait to simulate the delay of other layers. + torch.distributed.all_reduce(out_dummy) + _ = gemm @ gemm + torch.distributed.all_reduce(out_dummy) + + def dispatch(do_send: bool, do_recv: bool) -> None: + r.all_to_all.dispatch( + out_expert_num_tokens=r.expert_num_tokens, + out_expert_x=r.out_expert_x, + out_expert_x_scale=r.out_expert_x_scale, + dp_x=local_rank.dp_x, + dp_x_scale=local_rank.dp_x_scale, + indices=local_rank.indices, + weights=local_rank.weights, + bound_m=local_rank.bound_m, + do_send=do_send, + do_recv=do_recv, + ) + + def combine(do_send: bool, do_recv: bool) -> None: + r.all_to_all.combine( + out_tokens=r.out_tokens, + indices=local_rank.indices, + weights=local_rank.weights, + expert_y=r.expert_y, + bound_m=local_rank.bound_m, + do_send=do_send, + do_recv=do_recv, + ) + + # Create and initialize events for timing. 
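+ # Record every event once immediately after creation so each event is in a valid, recorded
+ # state before the timed loop calls elapsed_time() on it.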
+ events = [] + for _ in range(num_warmup + num_repeats): + dispatch_start = torch.cuda.Event(enable_timing=True) + dispatch_end = torch.cuda.Event(enable_timing=True) + combine_start = torch.cuda.Event(enable_timing=True) + combine_end = torch.cuda.Event(enable_timing=True) + dispatch_send_start = torch.cuda.Event(enable_timing=True) + dispatch_send_end = torch.cuda.Event(enable_timing=True) + dispatch_recv_start = torch.cuda.Event(enable_timing=True) + dispatch_recv_end = torch.cuda.Event(enable_timing=True) + combine_send_start = torch.cuda.Event(enable_timing=True) + combine_send_end = torch.cuda.Event(enable_timing=True) + combine_recv_start = torch.cuda.Event(enable_timing=True) + combine_recv_end = torch.cuda.Event(enable_timing=True) + dispatch_start.record() + dispatch_end.record() + combine_start.record() + combine_end.record() + dispatch_send_start.record() + dispatch_send_end.record() + dispatch_recv_start.record() + dispatch_recv_end.record() + combine_send_start.record() + combine_send_end.record() + combine_recv_start.record() + combine_recv_end.record() + events.append( + ( + dispatch_start, + dispatch_end, + combine_start, + combine_end, + dispatch_send_start, + dispatch_send_end, + dispatch_recv_start, + dispatch_recv_end, + combine_send_start, + combine_send_end, + combine_recv_start, + combine_recv_end, + ) + ) + + # Benchmark loop + last_report_time = time.time() + for i in range(num_warmup + num_repeats): + if i + 1 == num_warmup: + # Start profiling one iteration before the bench starts. + torch.cuda.profiler.start() + now = time.time() + if now - last_report_time > 1 or i + 1 == num_warmup + num_repeats: + logger.info("Iteration %i/%i", i + 1, num_warmup + num_repeats) + last_report_time = now + + ( + dispatch_start, + dispatch_end, + combine_start, + combine_end, + dispatch_send_start, + dispatch_send_end, + dispatch_recv_start, + dispatch_recv_end, + combine_send_start, + combine_send_end, + combine_recv_start, + combine_recv_end, + ) = events[i] + + # Update indices + local_rank.indices = rand_topk_idx( + r.cfg.max_num_tokens, + r.cfg.num_experts, + r.cfg.num_experts_per_token, + rng, + r.device, + ) + + # Send-Recv back to back + with profile_range("back-to-back"): + wait() + + dispatch_start.record() + dispatch(do_send=True, do_recv=True) + dispatch_end.record() + + wait() + + combine_start.record() + combine(do_send=True, do_recv=True) + combine_end.record() + + # Insert long kernel in between send and recv + with profile_range("overlap"): + wait() + + dispatch_send_start.record() + dispatch(do_send=True, do_recv=False) + dispatch_send_end.record() + + wait() # Fake overlap work + + dispatch_recv_start.record() + dispatch(do_send=False, do_recv=True) + dispatch_recv_end.record() + + wait() + + combine_send_start.record() + combine(do_send=True, do_recv=False) + combine_send_end.record() + + wait() # Fake overlap work + + combine_recv_start.record() + combine(do_send=False, do_recv=True) + combine_recv_end.record() + + torch.cuda.synchronize() + torch.cuda.profiler.stop() + + dispatch_times: list[float] = [] + dispatch_send_times: list[float] = [] + dispatch_recv_times: list[float] = [] + combine_times: list[float] = [] + combine_send_times: list[float] = [] + combine_recv_times: list[float] = [] + for ( + dispatch_st, + dispatch_en, + combine_st, + combine_en, + dispatch_send_st, + dispatch_send_en, + dispatch_recv_st, + dispatch_recv_en, + combine_send_st, + combine_send_en, + combine_recv_st, + combine_recv_en, + ) in events[num_warmup:]: + 
dispatch_times.append(dispatch_st.elapsed_time(dispatch_en) * 1000) + combine_times.append(combine_st.elapsed_time(combine_en) * 1000) + dispatch_send_times.append( + dispatch_send_st.elapsed_time(dispatch_send_en) * 1000 + ) + dispatch_recv_times.append( + dispatch_recv_st.elapsed_time(dispatch_recv_en) * 1000 + ) + combine_send_times.append(combine_send_st.elapsed_time(combine_send_en) * 1000) + combine_recv_times.append(combine_recv_st.elapsed_time(combine_recv_en) * 1000) + + # All-gather results from all ranks + dispatch_times = sum(r.global_group.all_gather_object(dispatch_times), []) + combine_times = sum(r.global_group.all_gather_object(combine_times), []) + dispatch_send_times = sum(r.global_group.all_gather_object(dispatch_send_times), []) + dispatch_recv_times = sum(r.global_group.all_gather_object(dispatch_recv_times), []) + combine_send_times = sum(r.global_group.all_gather_object(combine_send_times), []) + combine_recv_times = sum(r.global_group.all_gather_object(combine_recv_times), []) + + # Report the results. + if r.global_group.rank == 0: + stat_dispatch = Statistics.create(dispatch_times) + stat_dispatch_send = Statistics.create(dispatch_send_times) + stat_dispatch_recv = Statistics.create(dispatch_recv_times) + stat_combine = Statistics.create(combine_times) + stat_combine_send = Statistics.create(combine_send_times) + stat_combine_recv = Statistics.create(combine_recv_times) + + dispatch_bandwidth = r.cfg.dispatch_bytes / stat_dispatch.p50 * 1e-3 + combine_bandwidth = r.cfg.combine_bytes / stat_combine.p50 * 1e-3 + + logger.info( + "Dispatch both time: %s, %.1f GB/s", + stat_dispatch, + dispatch_bandwidth, + ) + logger.info("Dispatch send time: %s", stat_dispatch_send) + logger.info("Dispatch recv time: %s", stat_dispatch_recv) + + logger.info( + "Combine both time: %s, %.1f GB/s", + stat_combine, + combine_bandwidth, + ) + logger.info("Combine send time: %s", stat_combine_send) + logger.info("Combine recv time: %s", stat_combine_recv) + + data = { + "dispatch": { + "both": asdict(stat_dispatch), + "send": asdict(stat_dispatch_send), + "recv": asdict(stat_dispatch_recv), + }, + "combine": { + "both": asdict(stat_combine), + "send": asdict(stat_combine_send), + "recv": asdict(stat_combine_recv), + }, + } + + with output.open("w") as f: + f.write(json.dumps(data)) + + +def benchmark_uccl( + r: UCCLAllToAllResource, num_warmup: int, num_repeats: int, output: Path +) -> None: + """Benchmark UCCL dispatch and combine latency.""" + + # Create test data + x, topk_idx, topk_weights, rank_idx = r.create_rank_data_uccl(r.rank) + + # Get dispatch layout + ( + num_tokens_per_rank, + num_tokens_per_rdma_rank, + num_tokens_per_expert, + is_token_in_rank, + _, + ) = r.buffer.get_dispatch_layout(topk_idx, r.cfg.num_experts) + + # Initial dispatch to get handle + dispatch_args = { + "x": x, + "num_tokens_per_rank": num_tokens_per_rank, + "num_tokens_per_rdma_rank": num_tokens_per_rdma_rank, + "is_token_in_rank": is_token_in_rank, + "num_tokens_per_expert": num_tokens_per_expert, + "topk_idx": topk_idx, + "topk_weights": topk_weights, + "config": r.dispatch_config, + } + recv_x, recv_topk_idx, recv_topk_weights, recv_num_tokens_per_expert_list, handle, _ = r.buffer.dispatch(**dispatch_args) + + if r.rank == 0: + logger.info("Starting UCCL latency benchmark...") + logger.info("Num warmup: %d, Num repeats: %d", num_warmup, num_repeats) + + # Warmup + for _ in range(num_warmup): + r.buffer.dispatch(x=x, handle=handle, config=r.dispatch_config) + r.buffer.combine(x=recv_x, handle=handle, 
topk_weights=recv_topk_weights, config=r.combine_config) + torch.cuda.synchronize() + + # Benchmark dispatch + dispatch_events = [] + for _ in range(num_repeats): + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + r.buffer.dispatch(x=x, handle=handle, config=r.dispatch_config) + end_event.record() + dispatch_events.append((start_event, end_event)) + + # Benchmark combine + combine_events = [] + for _ in range(num_repeats): + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + r.buffer.combine(x=recv_x, handle=handle, topk_weights=recv_topk_weights, config=r.combine_config) + end_event.record() + combine_events.append((start_event, end_event)) + + torch.cuda.synchronize() + + # Extract timing results (convert from ms to us) + dispatch_times = [start.elapsed_time(end) * 1000 for start, end in dispatch_events] + combine_times = [start.elapsed_time(end) * 1000 for start, end in combine_events] + + # Gather results from all ranks + dispatch_times = sum(r.global_group.all_gather_object(dispatch_times), []) + combine_times = sum(r.global_group.all_gather_object(combine_times), []) + + # Report results + if r.rank == 0: + stat_dispatch = Statistics.create(dispatch_times) + stat_combine = Statistics.create(combine_times) + + dispatch_bandwidth = r.cfg.dispatch_bytes / stat_dispatch.p50 * 1e-3 + combine_bandwidth = r.cfg.combine_bytes / stat_combine.p50 * 1e-3 + + logger.info( + "UCCL Dispatch time: %s, %.1f GB/s", + stat_dispatch, + dispatch_bandwidth, + ) + logger.info( + "UCCL Combine time: %s, %.1f GB/s", + stat_combine, + combine_bandwidth, + ) + + data = { + "uccl_dispatch": asdict(stat_dispatch), + "uccl_combine": asdict(stat_combine), + } + + with output.open("w") as f: + f.write(json.dumps(data)) + + +def _worker( + device: torch.device, + dp_group: Optional[ParallelGroup], + global_group: Optional[ParallelGroup], + config: AllToAllConfig, + num_warmup: int, + num_repeats: int, + output: Path, + check: bool, + use_uccl: bool = False, +) -> None: + """Benchmark worker process.""" + + assert dp_group is not None + assert global_group is not None + + os.environ["LOCAL_RANK"] = str(global_group.local_rank) + os.environ["LOCAL_WORLD_SIZE"] = os.environ.get("NUM_GPUS", "8") + os.environ["RANK"] = str(global_group.rank) + os.environ["WORLD_SIZE"] = str(global_group.size) + + if global_group.rank == 0: + logging_utils.setup(level="INFO") + + if use_uccl: + if not UCCL_AVAILABLE: + logger.error("UCCL requested but not available!") + return + + r_uccl = UCCLAllToAllResource(device, global_group, config) + try: + benchmark_uccl(r_uccl, num_warmup, num_repeats, output) + finally: + global_group.barrier() + r_uccl.destroy() + else: + r = AllToAllResource(device, dp_group, global_group, config) + try: + # Correctness check + if check: + logger.info("Checking correctness...") + ex: Exception | None = None + try: + correctness_check(r) + ok = True + except Exception as e: # noqa: BLE001 + ex = e + ok = False + global_ok = r.global_group.all_gather_object(ok) + bad_rank = next((i for i, ok in enumerate(global_ok) if not ok), None) + if bad_rank is not None: + raise RuntimeError( + f"Correctness check failed on rank {bad_rank}" + ) from ex + logger.info("Correctness check passed") + else: + logger.info("Skipping correctness check") + + # Benchmark. 
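+ # Run the timed dispatch/combine loop; rank 0 aggregates per-rank timings and writes the statistics as JSON to --output.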
+ benchmark(r, num_warmup, num_repeats, output) + finally: + global_group.barrier() + r.all_to_all.destroy() + + +def main() -> None: + """Benchmark entry point.""" + + parser = argparse.ArgumentParser(description="All-to-All Benchmark") + parser.add_argument("--world-size", type=int, required=True) + parser.add_argument( + "--init-method", + type=str, + default=None, + ) + parser.add_argument("--dp-size", type=int, default=1) # TODO: rename to tp_size + parser.add_argument("--node-rank", type=int, default=0) + parser.add_argument("--num-warmup", type=int, default=10000) + parser.add_argument("--num-repeats", type=int, default=10000) + parser.add_argument("--nets-per-gpu", type=int, default=2) + parser.add_argument("--max-num-tokens", type=int, default=128) + parser.add_argument("--max-private-tokens", type=int, default=256) + parser.add_argument("--num-experts", type=int, default=256) + parser.add_argument("--hidden-dim", type=int, default=7168) + parser.add_argument("--hidden-dim-scale", type=int, default=56) + parser.add_argument("--num-experts-per-token", type=int, default=8) + parser.add_argument("--in-dtype", type=str_to_dtype, default=torch.float8_e4m3fn) + parser.add_argument("--out-dtype", type=str_to_dtype, default=torch.bfloat16) + parser.add_argument("--scale-dtype", type=str_to_dtype, default=torch.float32) + parser.add_argument("--nvlink", type=int, default=None) + parser.add_argument("--output", type=Path, default=Path("/dev/stdout")) + parser.add_argument( + "--check", type=bool, default=True, action=argparse.BooleanOptionalAction + ) + parser.add_argument( + "--use-uccl", + action="store_true", + help="Use UCCL backend for benchmarking instead of pplx_garden" + ) + args = parser.parse_args() + + config = AllToAllConfig( + nets_per_gpu=args.nets_per_gpu, + max_num_tokens=args.max_num_tokens, + max_private_tokens=args.max_private_tokens, + num_experts=args.num_experts, + hidden_dim=args.hidden_dim, + hidden_dim_scale=args.hidden_dim_scale, + num_experts_per_token=args.num_experts_per_token, + in_dtype=args.in_dtype, + out_dtype=args.out_dtype, + scale_dtype=args.scale_dtype, + nvlink=args.nvlink, + ) + + ParallelLaunch( + world_size=args.world_size, + init_method=args.init_method, + dp_size=args.dp_size, + node_rank=args.node_rank, + ).run( + _worker, + config, + args.num_warmup, + args.num_repeats, + args.output, + args.check, + args.use_uccl, + ) + + +if __name__ == "__main__": + main() diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/.gitignore b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/.gitignore new file mode 100644 index 000000000..29bd446f5 --- /dev/null +++ b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/.gitignore @@ -0,0 +1,4 @@ +*.sqsh +nvshmem* +*.out +*.err diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/README.md b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/README.md new file mode 100644 index 000000000..b6055062e --- /dev/null +++ b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/README.md @@ -0,0 +1,60 @@ +# Perplexity Kernels Benchmark +https://github.com/ppl-ai/pplx-kernels + +Updated to [NVSHMEM 3.4.5-0](https://github.com/NVIDIA/nvshmem/commit/df2814155acfba6227534dd81a8bf338da9e55f2) and PPLX-KERNELS [Aug 6, 2025](https://github.com/ppl-ai/pplx-kernels/commit/12cecfda252e4e646417ac263d96e994d476ee5d) + +## Git clone NVSHMEM + +3.4.5-0: +``` +git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout df2814155acfba6227534dd81a8bf338da9e55f2 && cd .. 
+```
+
+devel branch:
+```
+git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout devel && cd ..
+```
+
+## Building Perplexity Kernels Docker image
+
+```bash
+GDRCOPY_VERSION=v2.5.1
+EFA_INSTALLER_VERSION=1.43.2
+NCCL_VERSION=v2.27.7-1
+NCCL_TESTS_VERSION=v2.16.9
+NVSHMEM_VERSION=3.4.5-0
+TAG="efa${EFA_INSTALLER_VERSION}-nccl${NCCL_VERSION}-tests${NCCL_TESTS_VERSION}-nvshmem${NVSHMEM_VERSION}"
+PPLX_CONTAINER_IMAGE_NAME_TAG="pplx-kernels:${TAG}"
+```
+
+```bash
+docker build --progress=plain -f ./pplx-kernels.Dockerfile \
+    --build-arg="EFA_INSTALLER_VERSION=${EFA_INSTALLER_VERSION}" \
+    --build-arg="NCCL_VERSION=${NCCL_VERSION}" \
+    --build-arg="NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}" \
+    --build-arg="NVSHMEM_VERSION=${NVSHMEM_VERSION}" \
+    -t ${PPLX_CONTAINER_IMAGE_NAME_TAG} \
+    .
+```
+
+```bash
+enroot import -o ./pplx-kernels.sqsh dockerd://${PPLX_CONTAINER_IMAGE_NAME_TAG}
+```
+
+## Running Perplexity Kernels Benchmark
+
+```bash
+sbatch pplx-kernels.sbatch
+```
+
+## Check the logs
+
+```bash
+tail -f -n +0 slurm-XXX.out
+```
+
+## Core dump
+
+1. run `ulimit -c unlimited` and check that `srun -N bash -c "ulimit -c"` prints `unlimited` for every node
+2. run `srun -N sudo bash -c "mkdir -p /tmp/coredump && echo '/tmp/coredump/core.%e.%p' > /proc/sys/kernel/core_pattern"`
+3. run `sbatch pplx-kernels.sbatch`
diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/coredump/.gitkeep b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/coredump/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/data/.gitkeep b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/data/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/launch_bench_all_to_all.py b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/launch_bench_all_to_all.py
new file mode 100644
index 000000000..11aa08ce4
--- /dev/null
+++ b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/launch_bench_all_to_all.py
@@ -0,0 +1,62 @@
+import argparse
+from typing import Callable, Concatenate, ParamSpec
+import torch
+import os
+from tests.bench_all_to_all import _worker_bench_all_to_all
+from tests.distributed_utils import ProcessGroupInfo
+
+P = ParamSpec("P")
+
+def parallel_launch_from_torchrun(
+    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> None:
+    rank = int(os.environ.get("RANK", 0))
+    world_size = int(os.environ.get("WORLD_SIZE", 1))
+    world_local_size = torch.cuda.device_count()
+    local_rank = rank % world_local_size
+    node_rank = rank // world_local_size
+    device = torch.device("cuda", local_rank)
+    torch.cuda.set_device(device)
+    torch.distributed.init_process_group(backend="cpu:gloo,cuda:nccl", device_id=device)
+    try:
+        worker(
+            ProcessGroupInfo(
+                world_size=world_size,
+                world_local_size=world_local_size,
+                rank=rank,
+                node_rank=node_rank,
+                local_rank=local_rank,
+                device=device,
+            ),
+            *args,
+            **kwargs,
+        )
+    finally:
+        torch.distributed.destroy_process_group()
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dp-size", type=int, default=1)
+    parser.add_argument(
+        "--in-dtype",
+        choices=["bfloat16", "float16", "float8_e4m3fn"],
+        default="float8_e4m3fn",
+    )
+    parser.add_argument(
+        "--out-dtype",
+        choices=["bfloat16", "float16"],
+        default="bfloat16",
+    )
+    args = parser.parse_args()
+    dp_size = int(args.dp_size)
+    in_dtype = 
str(args.in_dtype) + out_dtype = str(args.out_dtype) + + parallel_launch_from_torchrun(_worker_bench_all_to_all, dp_size, in_dtype, out_dtype) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.Dockerfile b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.Dockerfile new file mode 100644 index 000000000..2c194691f --- /dev/null +++ b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.Dockerfile @@ -0,0 +1,197 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +ARG CUDA_VERSION=12.8.1 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 + +################################ NCCL ######################################## + +ARG GDRCOPY_VERSION=v2.5.1 +ARG EFA_INSTALLER_VERSION=1.43.2 +ARG AWS_OFI_NCCL_VERSION=v1.16.3 +ARG NCCL_VERSION=v2.27.7-1 +ARG NCCL_TESTS_VERSION=v2.16.9 + +RUN apt-get update -y && apt-get upgrade -y +RUN apt-get remove -y --allow-change-held-packages \ + ibverbs-utils \ + libibverbs-dev \ + libibverbs1 \ + libmlx5-1 \ + libnccl2 \ + libnccl-dev + +RUN rm -rf /opt/hpcx \ + && rm -rf /usr/local/mpi \ + && rm -f /etc/ld.so.conf.d/hpcx.conf \ + && ldconfig + +ENV OPAL_PREFIX= + +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + apt-utils \ + autoconf \ + automake \ + build-essential \ + check \ + cmake \ + curl \ + debhelper \ + devscripts \ + git \ + gcc \ + gdb \ + kmod \ + libsubunit-dev \ + libtool \ + openssh-client \ + openssh-server \ + pkg-config \ + python3-distutils \ + vim \ + python3.10-dev \ + python3.10-venv +RUN apt-get purge -y cuda-compat-* + +RUN mkdir -p /var/run/sshd +RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config + +ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/usr/local/lib:$LD_LIBRARY_PATH +ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH + +RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \ + && python3 /tmp/get-pip.py \ + && pip3 install awscli pynvml + +################################################# +## Install NVIDIA GDRCopy +## +## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure +## that the cuda-compat-xx-x package is the latest. 
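+##
+## Optional post-build sanity check — a hypothetical invocation, not part of the build;
+## it assumes the host exposes GPUs to Docker and has the gdrdrv kernel module loaded:
+##   docker run --rm --gpus all ${PPLX_CONTAINER_IMAGE_NAME_TAG} /opt/gdrcopy/bin/sanity -v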
+RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \ + && cd /tmp/gdrcopy \ + && make prefix=/opt/gdrcopy install + +ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH +ENV LIBRARY_PATH=/opt/gdrcopy/lib:$LIBRARY_PATH +ENV CPATH=/opt/gdrcopy/include:$CPATH +ENV PATH=/opt/gdrcopy/bin:$PATH + +################################################# +## Install EFA installer +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf $HOME/aws-efa-installer + +################################################### +## Install NCCL +RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \ + && cd /opt/nccl \ + && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +################################################### +## Install NCCL-tests +RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ + && cd /opt/nccl-tests \ + && make -j $(nproc) \ + MPI=1 \ + MPI_HOME=/opt/amazon/openmpi/ \ + CUDA_HOME=/usr/local/cuda \ + NCCL_HOME=/opt/nccl/build \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +RUN rm -rf /var/lib/apt/lists/* + +## Set Open MPI variables to exclude network interface and conduit. 
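+## (Rationale: `pml=^ucx` disables Open MPI's UCX layer and `btl=tcp,self` keeps MPI
+## point-to-point traffic on plain TCP; MPI is only used to bootstrap the job here,
+## while the GPU data path runs over EFA through libfabric/NCCL.)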
+ENV OMPI_MCA_pml=^ucx \ + OMPI_MCA_btl=tcp,self \ + OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\ + OPAL_PREFIX=/opt/amazon/openmpi \ + NCCL_SOCKET_IFNAME=^docker,lo,veth + +## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516 +ENV PMIX_MCA_gds=hash + +## Set LD_PRELOAD for NCCL library +ENV LD_PRELOAD=/opt/nccl/build/lib/libnccl.so + +################################ NVSHMEM ######################################## + +ENV NVSHMEM_DIR=/opt/nvshmem +ENV NVSHMEM_HOME=/opt/nvshmem + +# 3.2.5-1: wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && tar -xvf nvshmem_src_3.2.5-1.txz +# 3.3.9: wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && tar -xvf nvshmem_src_cuda12-all-all-3.3.9.tar.gz +# 3.4.5-0: git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout df2814155acfba6227534dd81a8bf338da9e55f2 +COPY ./nvshmem /nvshmem_src + +RUN cd /nvshmem_src \ + && mkdir -p build \ + && cd build \ + && cmake \ + -DNVSHMEM_PREFIX=/opt/nvshmem \ + -DCMAKE_INSTALL_PREFIX=/opt/nvshmem \ + \ + -DCUDA_HOME=/usr/local/cuda \ + -DCMAKE_CUDA_ARCHITECTURES="90a;100" \ + \ + -DNVSHMEM_USE_GDRCOPY=1 \ + -DGDRCOPY_HOME=/opt/gdrcopy \ + \ + -DNVSHMEM_USE_NCCL=1 \ + -DNCCL_HOME=/opt/nccl/build \ + -DNCCL_INCLUDE=/opt/nccl/build/include \ + \ + -DNVSHMEM_LIBFABRIC_SUPPORT=1 \ + -DLIBFABRIC_HOME=/opt/amazon/efa \ + \ + -DNVSHMEM_MPI_SUPPORT=1 \ + -DMPI_HOME=/opt/amazon/openmpi \ + \ + -DNVSHMEM_PMIX_SUPPORT=1 \ + -DPMIX_HOME=/opt/amazon/pmix \ + -DNVSHMEM_DEFAULT_PMIX=1 \ + \ + -DNVSHMEM_BUILD_TESTS=1 \ + -DNVSHMEM_BUILD_EXAMPLES=1 \ + -DNVSHMEM_BUILD_HYDRA_LAUNCHER=1 \ + -DNVSHMEM_BUILD_TXZ_PACKAGE=1 \ + \ + -DNVSHMEM_IBRC_SUPPORT=1 \ + -DNVSHMEM_IBGDA_SUPPORT=1 \ + \ + -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + .. \ + && make -j$(nproc) \ + && make install + +ENV PATH=/opt/nvshmem/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/nvshmem/lib:$LD_LIBRARY_PATH +# ENV PATH=/opt/nvshmem/bin:$PATH LD_LIBRARY_PATH=/opt/amazon/pmix/lib:/opt/nvshmem/lib:$LD_LIBRARY_PATH NVSHMEM_REMOTE_TRANSPORT=libfabric NVSHMEM_LIBFABRIC_PROVIDER=efa + +################################ PyTorch ######################################## + +RUN pip install torch --index-url https://download.pytorch.org/whl/cu128 +RUN pip install ninja numpy cmake pytest + +################################ PPLX-KERNELS ######################################## + +RUN git clone https://github.com/ppl-ai/pplx-kernels.git /pplx-kernels \ + && cd /pplx-kernels \ + && git checkout 12cecfda252e4e646417ac263d96e994d476ee5d +# COPY pplx-kernels /pplx-kernels + +RUN cd /pplx-kernels \ + && TORCH_CUDA_ARCH_LIST="9.0a+PTX;10.0" python3 setup.py bdist_wheel \ + && pip install dist/*.whl + +ENV PYTHONPATH=/pplx-kernels + +COPY launch_bench_all_to_all.py /launch_bench_all_to_all.py + +RUN mkdir -p /tmp/coredump diff --git a/3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.sbatch b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.sbatch new file mode 100644 index 000000000..85f0d0bc9 --- /dev/null +++ b/3.test_cases/expert-parallelism/pplx-kernels-benchmark/pplx-kernels.sbatch @@ -0,0 +1,71 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=pplx-ker +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node 8 +##SBATCH --output %x_%j.out +##SBATCH --error %x_%j.err +#SBATCH --exclusive +#SBATCH --wait-all-nodes=1 + +set -ex; + +### Disable hyperthreading by setting the tasks per core to 1 +##SBATCH --ntasks-per-core=1 + +########################### +###### User Variables ##### +########################### + +## Set libfabric flags to use EFA +export FI_PROVIDER=efa +export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d +export FI_EFA_FORK_SAFE=1 + +## Set this flag for debugging EFA +# export FI_LOG_LEVEL=warn + +## NCCL Environment variables +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL + +### Increase the send queue depth and can turn NCCL communications into non-blocking. +### https://www.usenix.org/system/files/atc23-choi.pdf +export NCCL_BUFFSIZE=8388608 +### Improve performance by increasing buffer size for Send/Recv, Gather, Scatter and Alltoall communications +### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html +export NCCL_P2P_NET_CHUNKSIZE=1048576 + +### Improve performance for AllReduce by selecting specific protocol and algorithm for specific +### message size and number of ranks. +### More information https://github.com/aws/aws-ofi-nccl/wiki/Algorithm-and-Protocol-Tuner-for-AWS. +export NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-ofi-tuner.so + +export NVSHMEM_DIR=/opt/nvshmem + +export LD_LIBRARY_PATH=/opt/nvshmem/lib:$LD_LIBRARY_PATH + +export NVSHMEM_BOOTSTRAP_PMI=PMIX + +# export NVSHMEM_DEBUG=TRACE +# export NVSHMEM_DEBUG_SUBSYS=ALL + +export NVSHMEM_REMOTE_TRANSPORT=libfabric +export NVSHMEM_LIBFABRIC_PROVIDER=efa + +#Get Hostname and Instance IDs +mpirun -N 1 bash -c 'echo $(hostname): $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")' + +export MASTER_PORT=29500 +export MASTER_ADDR=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1) +export WORLD_SIZE=${SLURM_NPROCS} + +export PYTHONFAULTHANDLER=1 + +srun --container-image ./pplx-kernels.sqsh \ + --mpi=pmix --cpu-bind=none \ + --container-mounts=$(pwd)/coredump:/tmp/coredump,$(pwd)/data:/pplx-kernels/data \ + bash -c "RANK=\${SLURM_PROCID} python3 -X faulthandler launch_bench_all_to_all.py" \ No newline at end of file diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/.gitignore b/3.test_cases/expert-parallelism/uccl-ep-benchmark/.gitignore new file mode 100644 index 000000000..e39592d35 --- /dev/null +++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/.gitignore @@ -0,0 +1,3 @@ +*.sqsh +*.out +*.err diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/LICENSE b/3.test_cases/expert-parallelism/uccl-ep-benchmark/LICENSE new file mode 100644 index 000000000..f4dfd3ce7 --- /dev/null +++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Pavel Belevich + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/README.md b/3.test_cases/expert-parallelism/uccl-ep-benchmark/README.md
new file mode 100644
index 000000000..8a99c45e7
--- /dev/null
+++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/README.md
@@ -0,0 +1,33 @@
+# UCCL-EP Benchmark
+
+https://uccl-project.github.io/posts/uccl-ep/
+
+```bash
+docker build -t uccl-ep -f uccl-ep.Dockerfile .
+```
+
+```bash
+enroot import -o ./uccl-ep.sqsh dockerd://uccl-ep
+```
+
+test_internode.sbatch
+```bash
+sbatch test_internode.sbatch
+```
+
+test_intranode.sbatch
+```bash
+sbatch test_intranode.sbatch
+```
+
+| Type | Dispatch #EP | Bottleneck bandwidth | Combine #EP | Bottleneck bandwidth |
+|:---------:|:-------------:|:--------------------:|:------------:|:--------------------:|
+| Intranode | 8 | 318-321 GB/s (NVLink)| 8 | 320-323 GB/s (NVLink)|
+| Internode | 16 | 48-54 GB/s (RDMA) | 16 | 15-19 GB/s (RDMA) |
+| Internode | 24 | 52-55 GB/s (RDMA) | 24 | 23-29 GB/s (RDMA) |
+| Internode | 32 | 53-55 GB/s (RDMA) | 32 | 40-42 GB/s (RDMA) |
+
+test_low_latency.sbatch
+```bash
+sbatch test_low_latency.sbatch
+```
diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_internode.sbatch b/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_internode.sbatch
new file mode 100644
index 000000000..716c3b60a
--- /dev/null
+++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_internode.sbatch
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+#SBATCH --job-name=internode
+#SBATCH --nodes=6
+#SBATCH --ntasks-per-node 1
+##SBATCH --output %x_%j.out
+##SBATCH --error %x_%j.err
+#SBATCH --exclusive
+#SBATCH --wait-all-nodes=1
+
+set -ex;
+
+###########################
+###### User Variables #####
+###########################
+
+## Set libfabric flags to use EFA
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
+export FI_EFA_FORK_SAFE=1
+
+## Set this flag for debugging EFA
+# export FI_LOG_LEVEL=warn
+
+## NCCL Environment variables
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=ALL
+
+export OMP_NUM_THREADS=8
+export PYTHONFAULTHANDLER=1
+
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=12355
+SLURM_GPUS_PER_NODE=8
+
+srun -l \
+    --container-image ./uccl-ep.sqsh \
+    torchrun \
+    --nnodes=$SLURM_JOB_NUM_NODES \
+    --nproc_per_node=$SLURM_GPUS_PER_NODE \
+    --rdzv_id=$SLURM_JOB_ID \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
+    bench/test_internode.py \
+    --num-tokens=4096 \
+    --hidden=7168 \
+    --num-topk=8 \
+    --num-experts=288 \
+    --test-ll-compatibility
diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_intranode.sbatch b/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_intranode.sbatch
new file mode 100644
index 000000000..62b2c1e9f
--- /dev/null
+++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_intranode.sbatch
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+#SBATCH --job-name=intranode
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node 1
+##SBATCH --output %x_%j.out
+##SBATCH --error %x_%j.err
+#SBATCH --exclusive
+#SBATCH --wait-all-nodes=1
+
+set -ex;
+
+###########################
+###### User Variables #####
+###########################
+
+## Set libfabric flags to use EFA
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
+export FI_EFA_FORK_SAFE=1
+
+## Set this flag for debugging EFA
+# export FI_LOG_LEVEL=warn
+
+## NCCL Environment variables
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=ALL
+
+export OMP_NUM_THREADS=8
+export PYTHONFAULTHANDLER=1
+
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=12355
+SLURM_GPUS_PER_NODE=8
+
+srun -l \
+    --container-image ./uccl-ep.sqsh \
+    torchrun \
+    --nnodes=$SLURM_JOB_NUM_NODES \
+    --nproc_per_node=$SLURM_GPUS_PER_NODE \
+    --rdzv_id=$SLURM_JOB_ID \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
+    bench/test_intranode.py \
+    --num-tokens 4096 \
+    --hidden 7168 \
+    --num-topk 8 \
+    --num-experts 256
diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_low_latency.sbatch b/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_low_latency.sbatch
new file mode 100644
index 000000000..e7690b9e9
--- /dev/null
+++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/test_low_latency.sbatch
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+#SBATCH --job-name=lowlatency
+#SBATCH --nodes=4
+#SBATCH --ntasks-per-node 1
+##SBATCH --output %x_%j.out
+##SBATCH --error %x_%j.err
+#SBATCH --exclusive
+#SBATCH --wait-all-nodes=1
+
+set -ex;
+
+###########################
+###### User Variables #####
+###########################
+
+## Set libfabric flags to use EFA
+export FI_PROVIDER=efa
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
+export FI_EFA_FORK_SAFE=1
+
+## Set this flag for debugging EFA
+# export FI_LOG_LEVEL=warn
+
+## NCCL Environment variables
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=ALL
+
+export OMP_NUM_THREADS=8
+export PYTHONFAULTHANDLER=1
+
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=12355
+SLURM_GPUS_PER_NODE=8
+
+srun -l \
+    --container-image ./uccl-ep.sqsh \
+    torchrun \
+    --nnodes=$SLURM_JOB_NUM_NODES \
+    --nproc_per_node=$SLURM_GPUS_PER_NODE \
+    --rdzv_id=$SLURM_JOB_ID \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
+    bench/test_low_latency.py \
+    --num-tokens=128 \
+    --hidden=7168 \
+    --num-topk=8 \
+    --num-experts=288
diff --git a/3.test_cases/expert-parallelism/uccl-ep-benchmark/uccl-ep.Dockerfile b/3.test_cases/expert-parallelism/uccl-ep-benchmark/uccl-ep.Dockerfile
new file mode 100644
index 000000000..573a646eb
--- /dev/null
+++ b/3.test_cases/expert-parallelism/uccl-ep-benchmark/uccl-ep.Dockerfile
@@ -0,0 +1,136 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0 +ARG CUDA_VERSION=12.8.1 +FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 + +ARG GDRCOPY_VERSION=v2.5.1 +ARG EFA_INSTALLER_VERSION=1.43.2 +ARG AWS_OFI_NCCL_VERSION=v1.16.3 +ARG NCCL_VERSION=v2.27.7-1 +ARG NCCL_TESTS_VERSION=v2.16.9 + +RUN apt-get update -y && apt-get upgrade -y +RUN apt-get remove -y --allow-change-held-packages \ + ibverbs-utils \ + libibverbs-dev \ + libibverbs1 \ + libmlx5-1 \ + libnccl2 \ + libnccl-dev + +RUN rm -rf /opt/hpcx \ + && rm -rf /usr/local/mpi \ + && rm -f /etc/ld.so.conf.d/hpcx.conf \ + && ldconfig + +ENV OPAL_PREFIX= + +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + apt-utils \ + autoconf \ + automake \ + build-essential \ + check \ + cmake \ + curl \ + debhelper \ + devscripts \ + git \ + gcc \ + gdb \ + kmod \ + libsubunit-dev \ + libtool \ + openssh-client \ + openssh-server \ + pkg-config \ + python3-distutils \ + vim \ + python3.10-dev \ + python3.10-venv +RUN apt-get purge -y cuda-compat-* + +RUN mkdir -p /var/run/sshd +RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config + +# Set paths for both aarch64 and x86_64 +ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib/aarch64-linux-gnu:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/usr/local/lib:$LD_LIBRARY_PATH +ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH + +RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \ + && python3 /tmp/get-pip.py \ + && pip3 install awscli pynvml + +################################################# +## Install NVIDIA GDRCopy +## +## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure +## that the cuda-compat-xx-x package is the latest. 
+RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \ + && cd /tmp/gdrcopy \ + && make prefix=/opt/gdrcopy install + +ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH +ENV LIBRARY_PATH=/opt/gdrcopy/lib:$LIBRARY_PATH +ENV CPATH=/opt/gdrcopy/include:$CPATH +ENV PATH=/opt/gdrcopy/bin:$PATH + +################################################# +## Install EFA installer +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf $HOME/aws-efa-installer + +################################################### +## Install NCCL +RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \ + && cd /opt/nccl \ + && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +################################################### +## Install NCCL-tests +RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ + && cd /opt/nccl-tests \ + && make -j $(nproc) \ + MPI=1 \ + MPI_HOME=/opt/amazon/openmpi/ \ + CUDA_HOME=/usr/local/cuda \ + NCCL_HOME=/opt/nccl/build \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" + +RUN rm -rf /var/lib/apt/lists/* + +## Set Open MPI variables to exclude network interface and conduit. +ENV OMPI_MCA_pml=^ucx \ + OMPI_MCA_btl=tcp,self \ + OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\ + OPAL_PREFIX=/opt/amazon/openmpi \ + NCCL_SOCKET_IFNAME=^docker,lo,veth + +## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516 +ENV PMIX_MCA_gds=hash + +## Set LD_PRELOAD for NCCL library +ENV LD_PRELOAD=/opt/nccl/build/lib/libnccl.so + +################################################### +## Install uccl-ep + +RUN apt-get update && apt-get install -y \ + sudo \ + libnuma-dev + +RUN git clone https://github.com/uccl-project/uccl.git /opt/uccl \ + && cd /opt/uccl \ + # && git checkout \ + && cd ep \ + && ./install_deps.sh \ + && make -j install + +WORKDIR /opt/uccl/ep
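+
+## Smoke test (hypothetical invocation mirroring test_intranode.sbatch; assumes a single
+## node with 8 GPUs, run from this WORKDIR inside the container):
+##   torchrun --nproc_per_node=8 bench/test_intranode.py \
+##       --num-tokens 4096 --hidden 7168 --num-topk 8 --num-experts 256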