Skip to content
67 changes: 67 additions & 0 deletions download_azure_grid_driver.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env bash
#
# Script preamble: shell options and the settings shared by the functions below.

# Abort on any unhandled command failure and on use of an unset variable.
set -o errexit -o nounset

# Directory the installer is stored in. Normally supplied through the
# Dockerfile ENV; the fallback below applies only when it is unset or empty.
: "${GRID_INSTALLER_DIR:=/opt/nvidia-grid-install}"

# Human-readable list of the Azure GRID driver versions this script knows about
# (shown in the usage text).
AVAILABLE_VERSIONS="550.144.06, 535.161.08, 525.105.17"

# Print the command-line synopsis followed by the list of supported driver
# versions (taken from AVAILABLE_VERSIONS) on stdout.
print_usage() {
  printf '%s\n' \
    "Usage: $0 <driver_version>" \
    "Available versions: $AVAILABLE_VERSIONS"
}

# Map an Azure GRID driver version to the download URL of its .run installer.
#
# Only one (the latest) version per driver branch is kept in the table, since
# the URLs are hardcoded anyway.
#
# Arguments: $1 - driver version; matched by prefix, so anything that starts
#                 with a known version string selects that version
# Outputs:   the URL on stdout, or an empty line when the version is unknown
# Returns:   0 when a URL was found, 1 otherwise
get_grid_azure_url() {
  # ${1:-} keeps a missing argument from tripping `set -u`; it simply falls
  # through to the "unknown version" branch.
  local version="${1:-}"

  # Azure GRID driver version mapping
  case "$version" in
    550.144.06*)
      echo "https://download.microsoft.com/download/c5319e92-672e-4067-8d85-ab66a7a64db3/NVIDIA-Linux-x86_64-550.144.06-grid-azure.run"
      ;;
    535.161.08*)
      echo "https://download.microsoft.com/download/8/d/a/8da4fb8e-3a9b-4e6a-bc9a-72ff64d7a13c/NVIDIA-Linux-x86_64-535.161.08-grid-azure.run"
      ;;
    525.105.17*)
      echo "https://download.microsoft.com/download/6/b/d/6bd2850f-5883-4e2a-9a35-edbd3dd6808c/NVIDIA-Linux-x86_64-525.105.17-grid-azure.run"
      ;;
    *)
      echo ""
      return 1
      ;;
  esac
  return 0
}

# Download the Azure GRID .run installer for a driver version into
# GRID_INSTALLER_DIR and mark it executable.
#
# Globals:   GRID_INSTALLER_DIR (read) - destination directory, created if missing
# Arguments: $1 - driver version, resolved to a URL by get_grid_azure_url
# Outputs:   progress messages on stdout, errors and usage on stderr
# Exits:     1 when the version is missing or unknown, or the download fails
fetch_grid_azure_installer() {
  # ${1:-} so that a missing argument reaches the friendly error below instead
  # of aborting with "unbound variable" under `set -u`.
  local driver_version="${1:-}"

  if [ -z "$driver_version" ]; then
    echo "ERROR: Driver version must be provided as an argument" >&2
    print_usage >&2
    exit 1
  fi

  mkdir -p "$GRID_INSTALLER_DIR"

  # Declared separately from the assignment so the lookup's exit status is not
  # masked by `local`; testing it inside `if` keeps `set -e` from aborting
  # before the error message is printed.
  local download_url
  if ! download_url=$(get_grid_azure_url "$driver_version") || [ -z "$download_url" ]; then
    echo "ERROR: No Azure GRID driver URL found for version $driver_version" >&2
    print_usage >&2
    exit 1
  fi

  local filename="${download_url##*/}"
  local destination="$GRID_INSTALLER_DIR/$filename"
  echo "Downloading GRID driver from: $download_url"

  # -f fail on HTTP errors, -s silent, -S still show errors, -L follow redirects.
  # (The lowercase -l is curl's --list-only and does not follow redirects.)
  if ! curl -fsSL -o "$destination" "$download_url"; then
    echo "ERROR: Failed to download $download_url" >&2
    # Do not leave a partial file behind for a later step to pick up.
    rm -f -- "$destination"
    exit 1
  fi
  chmod +x "$destination"

  echo "GRID installer downloaded successfully to $destination"
}

# Entry point: forward the script's command-line arguments to the download
# routine, which takes the driver version as its first argument.
fetch_grid_azure_installer "$@"
14 changes: 14 additions & 0 deletions precompiled.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,22 @@ RUN . /versions.env && \
DEP_PACKAGES=$(apt-rdepends $BASE_PACKAGES_NAMES | grep -v "^ " | grep -v "^debconf-2.0$" | grep -v "^linux-image-unsigned-") && \
apt-get install -y --download-only --no-install-recommends --reinstall $BASE_PACKAGES $DEP_PACKAGES

# Download the GRID driver and its dependencies (kernel headers, dkms, linux-modules for video.ko) — Azure only
COPY download_azure_grid_driver.sh /tmp
# linux-modules contains video.ko, which nvidia-modeset depends on for the __acpi_video_get_backlight_type symbol.
# The packages are only fetched here (--download-only); nothing is installed by this step.
# "${KERNEL_VERSION##*-}" is the text after the last '-' in KERNEL_VERSION, so the step
# runs only when that suffix is "azure" (KERNEL_VERSION presumably comes from /versions.env).
# TODO: Azure supports only several GRID driver versions. Temporarily hardcode the version.
RUN . /versions.env && \
if [ "${KERNEL_VERSION##*-}" = "azure" ]; then \
apt-get install -y --download-only --no-install-recommends \
linux-headers-${KERNEL_VERSION} \
linux-modules-${KERNEL_VERSION} \
dkms && \
/tmp/download_azure_grid_driver.sh "550.144.06"; \
fi

# Install the driver scripts. grid-driver is copied into the same directory as
# nvidia-driver because nvidia-driver sources it from "$(dirname "$0")/grid-driver".
# The wrapper script is installed as /usr/local/bin/nvidia-driver.
RUN mkdir -p /opt/nvidia-driver/bin
COPY ubuntu22.04/precompiled/nvidia-driver /opt/nvidia-driver/bin/nvidia-driver
COPY ubuntu22.04/precompiled/grid-driver /opt/nvidia-driver/bin/grid-driver
COPY nvidia-driver-wrapper.sh /usr/local/bin/nvidia-driver

WORKDIR /drivers
Expand Down
103 changes: 103 additions & 0 deletions ubuntu22.04/precompiled/grid-driver
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#! /bin/bash

# Directory that holds the downloaded GRID .run installer. A value already
# present in the environment wins; the default applies when it is unset or empty.
: "${GRID_INSTALLER_DIR:=/opt/nvidia-grid-install}"

# Set "key=value" in a gridd.conf-style file.
# An existing entry at the start of a line (commented out with '#' or not) is
# rewritten in place; when no entry ends up present, one is appended (this also
# creates the file if it does not exist yet).
# Arguments: $1 - config file path, $2 - option name, $3 - option value
_gridd_set_option() {
  local conf_file="$1" key="$2" value="$3"

  if [ -f "$conf_file" ]; then
    sed -i "s/^#\\?[[:space:]]*${key}=.*/${key}=${value}/" "$conf_file"
    if grep -q "^${key}=" "$conf_file"; then
      return 0
    fi
  fi
  echo "${key}=${value}" >> "$conf_file"
}

# Install the NVIDIA GRID driver from the .run installer found in
# GRID_INSTALLER_DIR, then write /etc/nvidia/gridd.conf for Azure NV/NVv3 VMs.
#
# Globals:   GRID_INSTALLER_DIR (read) - directory containing NVIDIA-Linux-*.run
#            KERNEL_VERSION (read)     - used to name the kernel header/module packages
# Outputs:   progress messages on stdout, errors on stderr
# Exits:     1 when the installer directory or file is missing, the kernel
#            packages cannot be installed, or the installer itself fails
_install_grid_driver() {
  echo "Installing NVIDIA GRID driver from Azure package..."

  if [ ! -d "$GRID_INSTALLER_DIR" ]; then
    echo "ERROR: GRID installer directory not found: $GRID_INSTALLER_DIR" >&2
    exit 1
  fi

  # Find the .run installer file (first match only). Declared separately from
  # the assignment so `local` does not hide the pipeline's status; a failed
  # lookup is treated the same as "nothing found".
  local installer_file
  installer_file=$(find "$GRID_INSTALLER_DIR" -maxdepth 1 -type f -name "NVIDIA-Linux-*.run" | head -n 1) \
    || installer_file=""

  if [ -z "$installer_file" ]; then
    echo "ERROR: GRID installer .run file not found in $GRID_INSTALLER_DIR" >&2
    exit 1
  fi

  echo "Using GRID installer: $installer_file"

  # Install kernel headers and modules required for DKMS
  # linux-modules provides video.ko which nvidia-modeset depends on for __acpi_video_get_backlight_type symbol
  # --no-download: apt must not fetch anything here, so the packages have to be
  # available locally already.
  echo "Installing kernel headers and modules for ${KERNEL_VERSION}..."
  if ! apt-get install --no-install-recommends --no-download -y \
    "linux-headers-${KERNEL_VERSION}" \
    "linux-modules-${KERNEL_VERSION}" \
    dkms; then
    echo "ERROR: Failed to install kernel headers and modules for ${KERNEL_VERSION}" >&2
    exit 1
  fi

  # Create temporary directory for installer
  local tmpdir="$GRID_INSTALLER_DIR/nvidia-grid-tmp"
  mkdir -p "$tmpdir"

  # Install GRID driver using the .run installer
  # -s (--silent): non-interactive silent mode
  # --dkms: use DKMS to build and load kernel modules automatically
  # --tmpdir: specify temporary directory for installation
  # Note: GRID drivers do not support --skip-module-load option
  #
  # The installer is run with its arguments passed as separate words rather
  # than through a `bash -c "<string>"`, so paths containing spaces or shell
  # metacharacters are handled correctly. `|| exit_code=$?` records the status
  # without letting a caller's `set -e` abort before the cleanup below.
  local exit_code=0
  sh "$installer_file" -s --dkms --tmpdir "$tmpdir" || exit_code=$?

  # Clean up temporary directory
  rm -rf -- "$tmpdir"

  if [ "$exit_code" -ne 0 ]; then
    echo "ERROR: GRID driver installation failed with exit code $exit_code" >&2
    exit 1
  fi

  # Updating gridd.conf as required for Azure NV/NVv3 VMs.
  # See: https://learn.microsoft.com/en-us/azure/virtual-machines/linux/n-series-driver-setup#install-grid-drivers-on-nv-or-nvv3-series-vms
  local conf_template="/etc/nvidia/gridd.conf.template"
  local conf_file="/etc/nvidia/gridd.conf"

  echo "Creating GRID config"
  # A missing template is not fatal: the required options are appended below,
  # creating the config file if necessary.
  cp "$conf_template" "$conf_file" \
    || echo "WARNING: could not copy $conf_template to $conf_file" >&2

  # Force EnableUI and IgnoreSP to FALSE (replace in place, or append if absent)
  _gridd_set_option "$conf_file" "EnableUI" "FALSE"
  _gridd_set_option "$conf_file" "IgnoreSP" "FALSE"

  # Comment out FeatureType if uncommented
  sed -i 's/^FeatureType=/#FeatureType=/' "$conf_file"

  echo "GRID driver installed successfully"
}

# Check whether an NVIDIA A10 GPU (PCI vendor 0x10de, device 0x2236) is present.
# NVIDIA A10 requires GRID driver on Azure.
#
# Arguments: $1 - (optional) directory with one sub-directory per PCI device,
#                 each holding "vendor" and "device" files;
#                 defaults to /sys/bus/pci/devices
# Outputs:   a one-line detection message on stdout when an A10 is found
# Returns:   0 when an A10 is present, 1 otherwise
_has_nvidia_a10_gpu() {
  local pci_root="${1:-/sys/bus/pci/devices}"
  # Keep the loop variables local: this file is sourced, so globals would leak
  # into (and could clobber variables of) the sourcing script.
  local dev vendor device

  for dev in "$pci_root"/*; do
    # Also skips the literal pattern left behind when the glob matches nothing.
    if [ ! -f "$dev/vendor" ] || [ ! -f "$dev/device" ]; then
      continue
    fi

    vendor=$(<"$dev/vendor")
    device=$(<"$dev/device")

    if [ "$vendor" = "0x10de" ] && [ "$device" = "0x2236" ]; then
      echo "Detected NVIDIA A10 GPU at ${dev##*/}, GRID driver required"
      return 0 # A10 GPU present
    fi
  done

  return 1 # A10 GPU not present
}

# Decide whether the GRID driver has to be installed instead of the
# precompiled one.
# Globals: KERNEL_VERSION (read)
# Returns: 0 when the kernel is an Azure kernel and an NVIDIA A10 GPU is
#          detected, 1 otherwise
_is_grid_driver_required() {
  # The CSP name is whatever follows the last '-' of the kernel version
  # (e.g. "azure" from "5.15.0-1040-azure").
  local kernel_flavor="${KERNEL_VERSION##*-}"

  # Not an Azure kernel: GRID driver not required.
  [ "$kernel_flavor" = "azure" ] || return 1

  # Azure kernel but no A10 GPU: GRID driver not required.
  _has_nvidia_a10_gpu || return 1

  return 0 # GRID driver required
}
12 changes: 11 additions & 1 deletion ubuntu22.04/precompiled/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}
# Kernel module type to use; "auto" unless overridden through the environment.
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
# Directory holding modprobe configuration files.
MODPROBE_CONFIG_DIR="/etc/modprobe.d"

# Load the GRID helpers (_is_grid_driver_required, _install_grid_driver) from
# the grid-driver file located in the same directory as this script.
# NOTE(review): the path is derived from $0, so this assumes the script is
# started via a path that resolves to its real directory — confirm against the wrapper.
source "$(dirname "$0")/grid-driver"

# Install the nvidia-fabricmanager package for the current driver branch,
# pinned to the exact driver version.
# Globals: DRIVER_BRANCH (read), FULL_DRIVER_VERSION (read)
# Returns: the exit status of apt-get
fabricmanager_install() {
  # The package spec is quoted so it always reaches apt-get as a single
  # argument, with no word splitting or globbing on the expanded values.
  # --no-download: apt must not fetch anything, so the package has to be
  # available locally already.
  apt-get install -y --no-install-recommends --no-download \
    "nvidia-fabricmanager-${DRIVER_BRANCH}=${FULL_DRIVER_VERSION}"
}
Expand Down Expand Up @@ -390,7 +392,7 @@ _resolve_kernel_type() {
}

# Link and install the kernel modules from a precompiled packages
_install_driver() {
_install_precompiled_driver() {
# Install necessary driver userspace packages
apt-get install -y --no-install-recommends --no-download \
nvidia-utils-${DRIVER_BRANCH}-server=${FULL_DRIVER_VERSION} \
Expand All @@ -415,6 +417,14 @@ _install_driver() {
fi
}

# Install the driver for this node: the GRID driver when
# _is_grid_driver_required reports it is needed, the precompiled driver
# packages otherwise. The status of the chosen installer is the result.
_install_driver() {
  if ! _is_grid_driver_required; then
    _install_precompiled_driver
    return
  fi

  _install_grid_driver
}

# Mount the driver rootfs into the run directory with the exception of sysfs.
_mount_rootfs() {
echo "Mounting NVIDIA driver rootfs..."
Expand Down