|
| 1 | +# Docker Build for the Slurmd Deep Learning Container |
| 2 | + |
| 3 | +This build includes Python 3.12.8, PyTorch 2.6.0, CUDA 12.6, NCCL 2.23.4, and EFA Installer 1.38.0 (which bundles the OFI NCCL plugin). |
| 4 | + |
| 5 | +Clone the AWSome Distributed Training repo: |
| 6 | +``` |
| 7 | +git clone https://github.com/aws-samples/awsome-distributed-training.git |
| 8 | +cd awsome-distributed-training/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/ |
| 9 | +
|
| 10 | +``` |
| 11 | + |
| 12 | +Build the container image: |
| 13 | + |
| 14 | +``` |
| 15 | +
|
| 16 | +# Authenticate to DLC repo (Account 763104351884 is publicly known) |
| 17 | +aws ecr get-login-password --region us-east-1 \ |
| 18 | +| docker login --username AWS \ |
| 19 | +--password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com |
| 20 | +
|
| 21 | +# on a Mac |
| 22 | +docker buildx build --platform linux/amd64 -t dlc-slurmd:25.05.0-ubuntu24.04 -f dlc-slurmd.Dockerfile . |
| 23 | +
|
| 24 | +# on Linux |
| 25 | +# docker build -t dlc-slurmd:25.05.0-ubuntu24.04 -f dlc-slurmd.Dockerfile . |
| 26 | +
|
| 27 | +``` |
| 28 | + |
| 29 | +Test locally: |
| 30 | + |
| 31 | +Verify the Python 3.12.8, PyTorch 2.6.0, CUDA 12.6, and NCCL 2.23.4 versions inside the container: |
| 32 | + |
| 33 | +``` |
| 34 | +
|
| 35 | +docker run --platform linux/amd64 -it --entrypoint=/bin/bash dlc-slurmd:25.05.0-ubuntu24.04 |
| 36 | +
|
| 37 | +python3 --version |
| 38 | +# Python 3.12.8 |
| 39 | +
|
| 40 | +which python3 |
| 41 | +# /usr/local/bin/python3 |
| 42 | +
|
| 43 | +nvcc --version |
| 44 | +# nvcc: NVIDIA (R) Cuda compiler driver |
| 45 | +# Copyright (c) 2005-2024 NVIDIA Corporation |
| 46 | +# Built on Tue_Oct_29_23:50:19_PDT_2024 |
| 47 | +# Cuda compilation tools, release 12.6, V12.6.85 |
| 48 | +# Build cuda_12.6.r12.6/compiler.35059454_0 |
| 49 | +
|
| 50 | +python3 -c "import torch; print(torch.__version__)" |
| 51 | +# 2.6.0+cu126 |
| 52 | +
|
| 53 | +python3 -c "import torch; print(torch.cuda.nccl.version())" |
| 54 | +# (2, 23, 4) |
| 55 | +
|
| 56 | +ls -l /usr/local/lib/libnccl* |
| 57 | +# -rwxr-xr-x 1 root root 263726576 Mar 6 23:36 /usr/local/lib/libnccl.so |
| 58 | +# -rwxr-xr-x 1 root root 263726576 Mar 6 23:36 /usr/local/lib/libnccl.so.2 |
| 59 | +# -rwxr-xr-x 1 root root 263726576 Mar 6 23:36 /usr/local/lib/libnccl.so.2.23.4 |
| 60 | +# -rw-r--r-- 1 root root 277972056 Mar 6 23:36 /usr/local/lib/libnccl_static.a |
| 61 | +
|
| 62 | +cat /etc/nccl.conf |
| 63 | +# NCCL_DEBUG=INFO |
| 64 | +# NCCL_SOCKET_IFNAME=^docker0 |
| 65 | +
|
| 66 | +exit |
| 67 | +``` |
| 68 | + |
| 69 | +Create a private ECR repo: |
| 70 | + |
| 71 | +``` |
| 72 | +
|
| 73 | +aws ecr create-repository --repository-name dlc-slurmd |
| 74 | +
|
| 75 | +``` |
| 76 | + |
| 77 | +Authenticate to the repo: |
| 78 | + |
| 79 | +``` |
| 80 | +export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) |
| 81 | +export AWS_REGION=<your-region-here> |
| 82 | +
|
| 83 | +aws ecr get-login-password --region $AWS_REGION \ |
| 84 | + | docker login --username AWS \ |
| 85 | + --password-stdin ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com |
| 86 | + |
| 87 | +``` |
| 88 | + |
| 89 | +Tag the image: |
| 90 | + |
| 91 | +``` |
| 92 | +
|
| 93 | +docker tag dlc-slurmd:25.05.0-ubuntu24.04 \ |
| 94 | + ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/dlc-slurmd:25.05.0-ubuntu24.04 |
| 95 | + |
| 96 | +``` |
| 97 | + |
| 98 | +Push the image to the ECR repo: |
| 99 | + |
| 100 | +``` |
| 101 | +
|
| 102 | +docker push ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/dlc-slurmd:25.05.0-ubuntu24.04 |
| 103 | +
|
| 104 | +``` |
| 105 | + |
| 106 | +Test ECR access: |
| 107 | + |
| 108 | +``` |
| 109 | +
|
| 110 | +kubectl run test-pod \ |
| 111 | + --image=${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/dlc-slurmd:25.05.0-ubuntu24.04 \ |
| 112 | + --restart=Never \ |
| 113 | + --image-pull-policy=Always |
| 114 | +
|
| 115 | +# verify slurm version |
| 116 | +kubectl exec -it test-pod -- slurmd -V |
| 117 | + |
| 118 | +kubectl describe pod test-pod |
| 119 | +
|
| 120 | +# verify additional requirements |
| 121 | +kubectl exec -it test-pod -- ls /usr/local/lib/python3.12/site-packages/ \ |
| 122 | + | egrep "datasets|fsspec|numpy|torch|torchaudio|torchvision|transformers" |
| 123 | + |
| 124 | +kubectl delete pod test-pod |
| 125 | +
|
| 126 | +``` |
| 127 | + |
| 128 | +(Optional) Update the container image used by the Slinky NodeSet: |
| 129 | + |
| 130 | +Note: This step is not required if you specify the image repository and tag in the [values.yaml](./values.yaml) file. It is useful when you want to test a new image build without redeploying the entire Slurm cluster. |
| 131 | + |
| 132 | +``` |
| 133 | +export NODESET_NAME=$(kubectl get nodeset -n slurm -o custom-columns=NAME:metadata.name --no-headers) |
| 134 | +
|
| 135 | +kubectl -n slurm patch nodeset.slinky.slurm.net \ |
| 136 | + $NODESET_NAME \ |
| 137 | + --type='json' \ |
| 138 | + -p="[ |
| 139 | + {\"op\": \"replace\", \"path\": \"/spec/template/spec/containers/0/image\", \"value\":\"${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/dlc-slurmd:25.05.0-ubuntu24.04\"}, |
| 140 | + {\"op\": \"replace\", \"path\": \"/spec/template/spec/containers/0/imagePullPolicy\", \"value\":\"Always\"} |
| 141 | + ]" |
| 142 | + |
| 143 | +``` |
| 144 | + |
| 145 | +Scale the Slinky NodeSet down to zero and back up to trigger replacement (set `--replicas` in the second command to your desired replica count): |
| 146 | + |
| 147 | +``` |
| 148 | +
|
| 149 | +kubectl -n slurm scale nodeset/$NODESET_NAME --replicas=0 |
| 150 | +
|
| 151 | +
|
| 152 | +kubectl -n slurm scale nodeset/$NODESET_NAME --replicas=4 |
| 153 | +
|
| 154 | +``` |
| 155 | + |
0 commit comments