From c7eca88f8c88f324824c96bfcdcf197ac22a724a Mon Sep 17 00:00:00 2001
From: Ganeshkumar Ashokavardhanan <aganeshkumar@microsoft.com>
Date: Sun, 31 May 2026 12:15:56 -0700
Subject: [PATCH] perf: trim GPU provisioning critical path (skip redundant
 pull, async image cleanup, defer DCGM)

Three low-risk CSE-time optimizations for GPU nodes, none of which change the
default driver install behavior:

1. Skip the redundant `ctr image pull` in configGPUDrivers() when the driver
   image is already present locally. The image is normally pre-pulled into the
   VHD, so the boot-time pull was paying a wasted manifest/layer round trip to
   MCR; we still pull as a fallback when the image is genuinely missing.

2. Drop `--sync` from the post-install `ctr images rm` so containerd garbage
   collection happens asynchronously instead of blocking provisioning. The
   image reference is still removed to reclaim disk.

3. Start nvidia-dcgm and nvidia-dcgm-exporter with
   systemctlEnableAndStartNoBlock and treat a slow/failed start as non-fatal.
   These are telemetry only and do not gate GPU workload scheduling. The
   nvidia-device-plugin start stays blocking and fatal because it gates the
   node advertising GPUs to the scheduler.

Adds shellspec coverage for startNvidiaManagedExpServices asserting the
device-plugin stays blocking while dcgm/dcgm-exporter are enqueued off the
critical path and do not fail provisioning.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../linux/cloud-init/artifacts/cse_config.sh  | 18 +++++--
 .../cloud-init/artifacts/cse_config_spec.sh   | 50 +++++++++++++++++++
 2 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh
index 614bd74a054..c484e4332bc 100755
--- a/parts/linux/cloud-init/artifacts/cse_config.sh
+++ b/parts/linux/cloud-init/artifacts/cse_config.sh
@@ -1003,14 +1003,20 @@ configGPUDrivers() {
     if [ "$OS" = "$UBUNTU_OS_NAME" ]; then
         waitForContainerdReady || exit $ERR_GPU_DRIVERS_START_FAIL
         mkdir -p /opt/{actions,gpu}
-        ctr -n k8s.io image pull $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG
+        # The driver image is normally pre-pulled into the VHD; only hit the registry when it is
+        # actually missing. -F matches the ref literally so its dots are not regex wildcards.
+        if ! ctr -n k8s.io images ls -q | grep -qxF -- "$NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG"; then
+            ctr -n k8s.io image pull "$NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG"
+        fi
         retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install"
         ret=$?
         if [ "$ret" -ne 0 ]; then
             echo "Failed to install GPU driver, exiting..."
             exit $ERR_GPU_DRIVERS_START_FAIL
         fi
-        ctr -n k8s.io images rm --sync $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG
+        # Drop the driver image reference so containerd can reclaim its space, but skip --sync so
+        # garbage collection runs asynchronously instead of blocking node provisioning.
+        ctr -n k8s.io images rm $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG
     elif isMarinerOrAzureLinux "$OS" && ! isAzureLinuxOSGuard "$OS" "$OS_VARIANT"; then
         downloadGPUDrivers
         installNvidiaContainerToolkit
@@ -1636,7 +1642,9 @@ EOF
     logs_to_events "AKS.CSE.start.nvidia-device-plugin" "systemctlEnableAndStart nvidia-device-plugin 30" || exit $ERR_GPU_DEVICE_PLUGIN_START_FAIL
 
     # 2. Start the nvidia-dcgm service.
-    logs_to_events "AKS.CSE.start.nvidia-dcgm" "systemctlEnableAndStart nvidia-dcgm 30" || exit $ERR_NVIDIA_DCGM_FAIL
+    # DCGM is monitoring/telemetry and does not gate GPU workload scheduling, so start it without
+    # blocking node provisioning and treat a slow/failed start as non-fatal.
+    logs_to_events "AKS.CSE.start.nvidia-dcgm" "systemctlEnableAndStartNoBlock nvidia-dcgm 30" || echo "warning: nvidia-dcgm could not be enqueued; continuing provisioning, GPU monitoring may be unavailable"
 
     # 3. Start the nvidia-dcgm-exporter service.
     # Create systemd drop-in directory for nvidia-dcgm-exporter service
@@ -1658,7 +1666,9 @@ EOF
     systemctl daemon-reload
 
     # Start the nvidia-dcgm-exporter service.
-    logs_to_events "AKS.CSE.start.nvidia-dcgm-exporter" "systemctlEnableAndStart nvidia-dcgm-exporter 30" || exit $ERR_NVIDIA_DCGM_EXPORTER_FAIL
+    # The exporter is telemetry only and does not gate scheduling, so start it off the critical
+    # path and treat a slow/failed start as non-fatal.
+    logs_to_events "AKS.CSE.start.nvidia-dcgm-exporter" "systemctlEnableAndStartNoBlock nvidia-dcgm-exporter 30" || echo "warning: nvidia-dcgm-exporter could not be enqueued; continuing provisioning, GPU metrics may be unavailable"
 }
 
 get_compute_sku() {
diff --git a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
index 5051528c554..3b7c970bb3d 100755
--- a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
+++ b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
@@ -1678,4 +1678,54 @@ SETUP_EOF
             The output should include "rm -f /opt/azure/containers/managed-gpu-experience.enabled"
         End
     End
+
+    Describe 'startNvidiaManagedExpServices'
+        logs_to_events() { # stub: print the event name, then run the wrapped command
+            echo "logs_to_events $1"
+            eval "$2"
+        }
+        systemctlEnableAndStart() { # stub: record the blocking start request
+            echo "systemctlEnableAndStart $*"
+        }
+        systemctlEnableAndStartNoBlock() { # stub: record the non-blocking start request
+            echo "systemctlEnableAndStartNoBlock $*"
+        }
+        mkdir() { # stub: record the call instead of touching the filesystem
+            echo "mkdir $*"
+        }
+        tee() { # stub: drain stdin and record the call instead of writing files
+            cat > /dev/null
+            echo "tee $*"
+        }
+        systemctl() { # stub: record the call instead of talking to systemd
+            echo "systemctl $*"
+        }
+
+        BeforeEach 'MIG_NODE="false"'
+
+        It 'starts the device-plugin blocking but dcgm and dcgm-exporter off the critical path'
+            When call startNvidiaManagedExpServices
+
+            # device-plugin gates GPU scheduling, so it must stay blocking.
+            The output should include "systemctlEnableAndStart nvidia-device-plugin 30"
+            # dcgm/dcgm-exporter are telemetry only and must not block provisioning.
+            The output should include "systemctlEnableAndStartNoBlock nvidia-dcgm 30"
+            The output should include "systemctlEnableAndStartNoBlock nvidia-dcgm-exporter 30"
+            The output should not include "systemctlEnableAndStart nvidia-dcgm 30"
+            The output should not include "systemctlEnableAndStart nvidia-dcgm-exporter 30"
+        End
+
+        It 'does not fail when dcgm telemetry services cannot be enqueued'
+            systemctlEnableAndStartNoBlock() { # stub: simulate a failed enqueue
+                echo "systemctlEnableAndStartNoBlock $*"
+                return 1
+            }
+
+            When call startNvidiaManagedExpServices
+
+            The status should be success
+            The output should include "warning: nvidia-dcgm could not be enqueued"
+            The output should include "warning: nvidia-dcgm-exporter could not be enqueued"
+        End
+    End
 End