1- FROM nvcr.io/nvidia/nemo:24.12
2- ARG GDRCOPY_VERSION=v2.4.1
3- ARG EFA_INSTALLER_VERSION=1.37.0
4- ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws
5- ARG NCCL_VERSION=v2.23.4-1
6- ARG NCCL_TESTS_VERSION=v2.13.10
7- ARG TRANSFORMERS_VERSION=4.48.1
1+ FROM nvcr.io/nvidia/nemo:25.07.00
2+ ARG GDRCOPY_VERSION=v2.5
3+ ARG EFA_INSTALLER_VERSION=1.43.1
4+ # ARG AWS_OFI_NCCL_VERSION=v1.13.2-aws # OFI NCCL already packaged into EFA installation (/opt/amazon/ofi-nccl) cf. https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-changelog.html
5+ ARG NCCL_VERSION=v2.23.4-1
6+ ARG NCCL_TESTS_VERSION=v2.16.7
7+ ARG TRANSFORMERS_VERSION=4.54.1
88
9- ARG OPEN_MPI_PATH=/opt/amazon/openmpi
9+ # Open MPI is already packaged into the EFA installation (/opt/amazon/openmpi).
9+ # Note kept on its own line: Dockerfile only treats `#` as a comment at the start of a line.
9+ ARG OPEN_MPI_PATH=/opt/amazon/openmpi
1010
1111# #####################
1212# Update and remove the IB libverbs
@@ -52,7 +52,7 @@ RUN rm -rf /root/.ssh/ \
5252 && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
5353 && printf "Host *\n StrictHostKeyChecking no\n " >> /root/.ssh/config
5454
55- ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:$LD_LIBRARY_PATH
55+ ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib:$LD_LIBRARY_PATH
5656ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH
5757
5858# ################################################
@@ -81,24 +81,24 @@ RUN cd $HOME \
8181
8282# ##################################################
8383# # Install AWS-OFI-NCCL plugin
84- RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
85- # Switch from sh to bash to allow parameter expansion
86- SHELL ["/bin/bash", "-c"]
87- RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
88- && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
89- && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
90- && ./configure --prefix=/opt/aws-ofi-nccl/install \
91- --with-mpi=/opt/amazon/openmpi \
92- --with-libfabric=/opt/amazon/efa \
93- --with-cuda=/usr/local/cuda \
94- --enable-platform-aws \
95- && make -j $(nproc) \
96- && make install \
97- && cd .. \
98- && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
99- && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz
100-
101- SHELL ["/bin/sh", "-c"]
84+ # RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
85+ # # Switch from sh to bash to allow parameter expansion
86+ # SHELL ["/bin/bash", "-c"]
87+ # RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
88+ # && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
89+ # && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
90+ # && ./configure --prefix=/opt/aws-ofi-nccl/install \
91+ # --with-mpi=/opt/amazon/openmpi \
92+ # --with-libfabric=/opt/amazon/efa \
93+ # --with-cuda=/usr/local/cuda \
94+ # --enable-platform-aws \
95+ # && make -j $(nproc) \
96+ # && make install \
97+ # && cd .. \
98+ # && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
99+ # && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz
100+
101+ # SHELL ["/bin/sh", "-c"]
102102
103103# ##################################################
104104RUN rm -rf /var/lib/apt/lists/*
@@ -126,4 +126,7 @@ ENV OMPI_MCA_pml=^cm,ucx \
126126 NCCL_SOCKET_IFNAME=^docker,lo,veth
127127
128128# # Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516
129- ENV PMIX_MCA_gds=hash
129+ ENV PMIX_MCA_gds=hash
130+
131+ # Debug: Verify OFI NCCL and OPENMPI installation
132+ RUN ls -la /opt/amazon/efa/lib/ && ls -la /opt/amazon/ofi-nccl/lib/ && ls -la /opt/amazon/openmpi/lib/
0 commit comments