From 604bebd3a2ce344e510124c9bf369805516b82e4 Mon Sep 17 00:00:00 2001 From: David Grove Date: Fri, 30 May 2025 16:23:29 -0400 Subject: [PATCH] Prevent user from defining NCCL_TOPO_FILE when topologyFileConfigMap is set --- .../chart/templates/_helpers.tpl | 6 + .../__snapshot__/helloworld_test.yaml.snap | 154 ++++++++++++++++++ .../chart/tests/helloworld_test.yaml | 19 +++ 3 files changed, 179 insertions(+) diff --git a/tools/pytorchjob-generator/chart/templates/_helpers.tpl b/tools/pytorchjob-generator/chart/templates/_helpers.tpl index 68b09ab..f2ee7a9 100644 --- a/tools/pytorchjob-generator/chart/templates/_helpers.tpl +++ b/tools/pytorchjob-generator/chart/templates/_helpers.tpl @@ -90,6 +90,12 @@ env: fieldPath: metadata.labels['sakkara.member.rank'] {{- end }} {{- if .Values.topologyFileConfigMap }} + {{- range $variable := .Values.environmentVariables }} + {{- if eq $variable.name "NCCL_TOPO_FILE" }} + {{ required "If topologyFileConfigMap is defined, environment variables must not define NCCL_TOPO_FILE" nil }} + {{- end }} + {{- end }} + # Put the path to virtualTopology.xml file that was volume-mounted into the expected environment variable for CUDA - name: NCCL_TOPO_FILE value: /var/run/nvidia-topologyd/virtualTopology.xml {{- end }} diff --git a/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap b/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap index 16870fc..43e9c2d 100644 --- a/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap +++ b/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap @@ -1362,6 +1362,160 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts: - emptyDir: medium: Memory name: dshm +Harmless environment variables can be set when topologyFileConfigMap is provided: + 1: | + apiVersion: workload.codeflare.dev/v1beta2 + kind: AppWrapper + metadata: + annotations: + 
workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.9 + labels: + kueue.x-k8s.io/queue-name: default-queue + name: my-job + namespace: my-namespace + spec: + components: + - template: + apiVersion: kubeflow.org/v1 + kind: PyTorchJob + metadata: + name: my-job + spec: + pytorchReplicaSpecs: + Master: + replicas: 1 + restartPolicy: Never + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: autopilot.ibm.com/gpuhealth + operator: NotIn + values: + - ERR + - TESTING + - EVICT + containers: + - command: + - sh + - -c + - | + echo "Environment variables set by the kubeflow training operator:" + echo ${MASTER_ADDR}:${MASTER_PORT} + echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} + echo My global rank is ${RANK} / ${WORLD_SIZE} + echo "Other injected environment variables:" + echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} + # + # User commands + # + git clone https://github.com/dbarnett/python-helloworld + cd python-helloworld + echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py + torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py + env: + - name: NCCL_TOPO_FILE + value: /var/run/nvidia-topologyd/virtualTopology.xml + - name: EXAMPLE_VAR1 + value: "42" + image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 + imagePullPolicy: IfNotPresent + name: pytorch + resources: + limits: + cpu: 500m + memory: 1Gi + nvidia.com/gpu: 8 + nvidia.com/roce_gdr: 0 + requests: + cpu: 500m + memory: 1Gi + nvidia.com/gpu: 8 + nvidia.com/roce_gdr: 0 + volumeMounts: + - mountPath: /var/run/nvidia-topologyd + name: topology-volume + - mountPath: /dev/shm + name: dshm + imagePullSecrets: [] + priorityClassName: default-priority + volumes: + - configMap: + name: nvidia-topo-gdr 
+ name: topology-volume + - emptyDir: + medium: Memory + name: dshm + Worker: + replicas: 3 + restartPolicy: Never + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: autopilot.ibm.com/gpuhealth + operator: NotIn + values: + - ERR + - TESTING + - EVICT + containers: + - command: + - sh + - -c + - | + echo "Environment variables set by the kubeflow training operator:" + echo ${MASTER_ADDR}:${MASTER_PORT} + echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} + echo My global rank is ${RANK} / ${WORLD_SIZE} + echo "Other injected environment variables:" + echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} + # + # User commands + # + git clone https://github.com/dbarnett/python-helloworld + cd python-helloworld + echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py + torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py + env: + - name: NCCL_TOPO_FILE + value: /var/run/nvidia-topologyd/virtualTopology.xml + - name: EXAMPLE_VAR1 + value: "42" + image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 + imagePullPolicy: IfNotPresent + name: pytorch + resources: + limits: + cpu: 500m + memory: 1Gi + nvidia.com/gpu: 8 + nvidia.com/roce_gdr: 0 + requests: + cpu: 500m + memory: 1Gi + nvidia.com/gpu: 8 + nvidia.com/roce_gdr: 0 + volumeMounts: + - mountPath: /var/run/nvidia-topologyd + name: topology-volume + - mountPath: /dev/shm + name: dshm + imagePullSecrets: [] + priorityClassName: default-priority + volumes: + - configMap: + name: nvidia-topo-gdr + name: topology-volume + - emptyDir: + medium: Memory + name: dshm scheduler can be set: 1: | apiVersion: workload.codeflare.dev/v1beta2 diff --git a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml 
b/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml index 83aa908..a0c2be1 100644 --- a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml +++ b/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml @@ -270,3 +270,22 @@ tests: asserts: - matchSnapshot: path: spec.components[0].template + +- it: Harmless environment variables can be set when topologyFileConfigMap is provided + set: + topologyFileConfigMap: nvidia-topo-gdr + environmentVariables: + - name: EXAMPLE_VAR1 + value: 42 + asserts: + - matchSnapshot: + path: spec.components[0].template + +- it: NCCL_TOPO_FILE environment variables cannot be set when topologyFileConfigMap is provided + set: + topologyFileConfigMap: nvidia-topo-gdr + environmentVariables: + - name: NCCL_TOPO_FILE + value: myFile + asserts: + - failedTemplate: {}