diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh
index 614bd74a054..c484e4332bc 100755
--- a/parts/linux/cloud-init/artifacts/cse_config.sh
+++ b/parts/linux/cloud-init/artifacts/cse_config.sh
@@ -1003,14 +1003,20 @@ configGPUDrivers() {
     if [ "$OS" = "$UBUNTU_OS_NAME" ]; then
         waitForContainerdReady || exit $ERR_GPU_DRIVERS_START_FAIL
         mkdir -p /opt/{actions,gpu}
-        ctr -n k8s.io image pull $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG
+        # The driver image is normally pre-pulled into the VHD; only hit the registry when the exact
+        # ref is missing (-F: literal match, dots in the ref are not regex wildcards) to skip a redundant pull.
+        if ! ctr -n k8s.io images ls -q | grep -qxF -- "$NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG"; then
+            ctr -n k8s.io image pull "$NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG"
+        fi
         retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install"
         ret=$?
         if [ "$ret" -ne 0 ]; then
             echo "Failed to install GPU driver, exiting..."
             exit $ERR_GPU_DRIVERS_START_FAIL
         fi
-        ctr -n k8s.io images rm --sync $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG
+        # Drop the driver image reference so containerd can reclaim its space, but skip --sync so
+        # garbage collection runs asynchronously instead of blocking node provisioning.
+        ctr -n k8s.io images rm "$NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG"
     elif isMarinerOrAzureLinux "$OS" && ! isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then
         downloadGPUDrivers
         installNvidiaContainerToolkit
@@ -1636,7 +1642,9 @@ EOF
     logs_to_events "AKS.CSE.start.nvidia-device-plugin" "systemctlEnableAndStart nvidia-device-plugin 30" || exit $ERR_GPU_DEVICE_PLUGIN_START_FAIL
 
     # 2. Start the nvidia-dcgm service.
-    logs_to_events "AKS.CSE.start.nvidia-dcgm" "systemctlEnableAndStart nvidia-dcgm 30" || exit $ERR_NVIDIA_DCGM_FAIL
+    # DCGM is monitoring/telemetry and does not gate GPU workload scheduling, so start it without
+    # blocking node provisioning; a failure here is only logged as a warning and provisioning continues.
+    logs_to_events "AKS.CSE.start.nvidia-dcgm" "systemctlEnableAndStartNoBlock nvidia-dcgm 30" || echo "warning: nvidia-dcgm could not be enqueued; continuing provisioning without GPU monitoring"
 
     # 3. Start the nvidia-dcgm-exporter service.
     # Create systemd drop-in directory for nvidia-dcgm-exporter service
@@ -1658,7 +1666,9 @@ EOF
     systemctl daemon-reload
 
     # Start the nvidia-dcgm-exporter service.
-    logs_to_events "AKS.CSE.start.nvidia-dcgm-exporter" "systemctlEnableAndStart nvidia-dcgm-exporter 30" || exit $ERR_NVIDIA_DCGM_EXPORTER_FAIL
+    # The exporter is telemetry only and does not gate scheduling, so start it off the critical
+    # path; a failure here is only logged as a warning and provisioning continues.
+    logs_to_events "AKS.CSE.start.nvidia-dcgm-exporter" "systemctlEnableAndStartNoBlock nvidia-dcgm-exporter 30" || echo "warning: nvidia-dcgm-exporter could not be enqueued; continuing provisioning without GPU metrics"
 }
 
 get_compute_sku() {
diff --git a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
index 5051528c554..3b7c970bb3d 100755
--- a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
+++ b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
@@ -1678,4 +1678,54 @@ SETUP_EOF
             The output should include "rm -f /opt/azure/containers/managed-gpu-experience.enabled"
         End
     End
+
+    Describe 'startNvidiaManagedExpServices'
+        logs_to_events() {
+            echo "logs_to_events $1"
+            eval "$2"
+        }
+        systemctlEnableAndStart() {
+            echo "systemctlEnableAndStart $*"
+        }
+        systemctlEnableAndStartNoBlock() {
+            echo "systemctlEnableAndStartNoBlock $*"
+        }
+        mkdir() {
+            echo "mkdir $*"
+        }
+        tee() {
+            cat > /dev/null
+            echo "tee $*"
+        }
+        systemctl() {
+            echo "systemctl $*"
+        }
+
+        BeforeEach 'MIG_NODE="false"'
+
+        It 'starts the device-plugin blocking but dcgm and dcgm-exporter off the critical path'
+            When call startNvidiaManagedExpServices
+
+            # device-plugin gates GPU scheduling, so it must stay blocking.
+            The output should include "systemctlEnableAndStart nvidia-device-plugin 30"
+            # dcgm/dcgm-exporter are telemetry only and must not block provisioning.
+            The output should include "systemctlEnableAndStartNoBlock nvidia-dcgm 30"
+            The output should include "systemctlEnableAndStartNoBlock nvidia-dcgm-exporter 30"
+            The output should not include "systemctlEnableAndStart nvidia-dcgm 30"
+            The output should not include "systemctlEnableAndStart nvidia-dcgm-exporter 30"
+        End
+
+        It 'does not fail when dcgm telemetry services cannot be enqueued'
+            systemctlEnableAndStartNoBlock() {
+                echo "systemctlEnableAndStartNoBlock $*"
+                return 1
+            }
+
+            When call startNvidiaManagedExpServices
+
+            The status should be success
+            The output should include "warning: nvidia-dcgm could not be enqueued"
+            The output should include "warning: nvidia-dcgm-exporter could not be enqueued"
+        End
+    End
 End