From c7eca88f8c88f324824c96bfcdcf197ac22a724a Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Sun, 31 May 2026 12:15:56 -0700 Subject: [PATCH] perf: trim GPU provisioning critical path (skip redundant pull, async image cleanup, defer DCGM) Three low-risk CSE-time optimizations for GPU nodes, none of which change the default driver install behavior: 1. Skip the redundant `ctr image pull` in configGPUDrivers() when the driver image is already present locally. The image is normally pre-pulled into the VHD, so the boot-time pull was paying a wasted manifest/layer round trip to MCR; we still pull as a fallback when the image is genuinely missing. 2. Drop `--sync` from the post-install `ctr images rm` so containerd garbage collection happens asynchronously instead of blocking provisioning. The image reference is still removed to reclaim disk. 3. Start nvidia-dcgm and nvidia-dcgm-exporter with systemctlEnableAndStartNoBlock and treat a slow/failed start as non-fatal. These are telemetry only and do not gate GPU workload scheduling. The nvidia-device-plugin start stays blocking and fatal because it gates the node advertising GPUs to the scheduler. Adds shellspec coverage for startNvidiaManagedExpServices asserting the device-plugin stays blocking while dcgm/dcgm-exporter are enqueued off the critical path and do not fail provisioning. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../linux/cloud-init/artifacts/cse_config.sh | 18 +++++-- .../cloud-init/artifacts/cse_config_spec.sh | 50 +++++++++++++++++++ 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 614bd74a054..c484e4332bc 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -1003,14 +1003,20 @@ configGPUDrivers() { if [ "$OS" = "$UBUNTU_OS_NAME" ]; then waitForContainerdReady || exit $ERR_GPU_DRIVERS_START_FAIL mkdir -p /opt/{actions,gpu} - ctr -n k8s.io image pull $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG + # The driver image is normally pre-pulled into the VHD; only hit the registry when it is + # actually missing so provisioning doesn't pay a redundant manifest/layer round trip. grep -xF compares the whole ref literally, so dots in the registry host or tag are not treated as regex wildcards. + if ! ctr -n k8s.io images ls -q | grep -qxF "$NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG"; then + ctr -n k8s.io image pull $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG + fi retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install" ret=$? if [ "$ret" -ne 0 ]; then echo "Failed to install GPU driver, exiting..." exit $ERR_GPU_DRIVERS_START_FAIL fi - ctr -n k8s.io images rm --sync $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG + # Drop the driver image reference so containerd can reclaim its space, but skip --sync so + # garbage collection runs asynchronously instead of blocking node provisioning. + ctr -n k8s.io images rm $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG elif isMarinerOrAzureLinux "$OS" && ! isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then downloadGPUDrivers installNvidiaContainerToolkit @@ -1636,7 +1642,9 @@ EOF logs_to_events "AKS.CSE.start.nvidia-device-plugin" "systemctlEnableAndStart nvidia-device-plugin 30" || exit $ERR_GPU_DEVICE_PLUGIN_START_FAIL # 2. 
Start the nvidia-dcgm service. - logs_to_events "AKS.CSE.start.nvidia-dcgm" "systemctlEnableAndStart nvidia-dcgm 30" || exit $ERR_NVIDIA_DCGM_FAIL + # DCGM is monitoring/telemetry and does not gate GPU workload scheduling, so start it without + # blocking node provisioning and treat a slow/failed start as non-fatal. + logs_to_events "AKS.CSE.start.nvidia-dcgm" "systemctlEnableAndStartNoBlock nvidia-dcgm 30" || echo "warning: nvidia-dcgm could not be enqueued; GPU monitoring may be unavailable on this node" # 3. Start the nvidia-dcgm-exporter service. # Create systemd drop-in directory for nvidia-dcgm-exporter service @@ -1658,7 +1666,9 @@ EOF systemctl daemon-reload # Start the nvidia-dcgm-exporter service. - logs_to_events "AKS.CSE.start.nvidia-dcgm-exporter" "systemctlEnableAndStart nvidia-dcgm-exporter 30" || exit $ERR_NVIDIA_DCGM_EXPORTER_FAIL + # The exporter is telemetry only and does not gate scheduling, so start it off the critical + # path and treat a slow/failed start as non-fatal. + logs_to_events "AKS.CSE.start.nvidia-dcgm-exporter" "systemctlEnableAndStartNoBlock nvidia-dcgm-exporter 30" || echo "warning: nvidia-dcgm-exporter could not be enqueued; GPU metrics may be unavailable on this node" } get_compute_sku() { diff --git a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh index 5051528c554..3b7c970bb3d 100755 --- a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh @@ -1678,4 +1678,54 @@ SETUP_EOF The output should include "rm -f /opt/azure/containers/managed-gpu-experience.enabled" End End + + Describe 'startNvidiaManagedExpServices' + logs_to_events() { + echo "logs_to_events $1" + eval "$2" + } + systemctlEnableAndStart() { + echo "systemctlEnableAndStart $@" + } + systemctlEnableAndStartNoBlock() { + echo "systemctlEnableAndStartNoBlock $@" + } + mkdir() { + echo "mkdir $@" + } + tee() { + cat > /dev/null + echo "tee $@" + 
} + systemctl() { + echo "systemctl $@" + } + + BeforeEach 'MIG_NODE="false"' + + It 'starts the device-plugin blocking but dcgm and dcgm-exporter off the critical path' + When call startNvidiaManagedExpServices + + # device-plugin gates GPU scheduling, so it must stay blocking. + The output should include "systemctlEnableAndStart nvidia-device-plugin 30" + # dcgm/dcgm-exporter are telemetry only and must not block provisioning. + The output should include "systemctlEnableAndStartNoBlock nvidia-dcgm 30" + The output should include "systemctlEnableAndStartNoBlock nvidia-dcgm-exporter 30" + The output should not include "systemctlEnableAndStart nvidia-dcgm 30" + The output should not include "systemctlEnableAndStart nvidia-dcgm-exporter 30" + End + + It 'does not fail when dcgm telemetry services cannot be enqueued' + systemctlEnableAndStartNoBlock() { + echo "systemctlEnableAndStartNoBlock $@" + return 1 + } + + When call startNvidiaManagedExpServices + + The status should be success + The output should include "warning: nvidia-dcgm could not be enqueued" + The output should include "warning: nvidia-dcgm-exporter could not be enqueued" + End + End End