diff --git a/download_azure_grid_driver.sh b/download_azure_grid_driver.sh
new file mode 100755
index 00000000..507791d1
--- /dev/null
+++ b/download_azure_grid_driver.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+#
+# Download the Azure-specific NVIDIA GRID driver installer (.run) for a given
+# driver version into GRID_INSTALLER_DIR and mark it executable.
+#
+# Usage: download_azure_grid_driver.sh DRIVER_VERSION
+
+set -euo pipefail
+
+# GRID_INSTALLER_DIR is provided by Dockerfile ENV
+GRID_INSTALLER_DIR=${GRID_INSTALLER_DIR:-/opt/nvidia-grid-install}
+
+# Available Azure GRID driver versions
+readonly AVAILABLE_VERSIONS="550.144.06, 535.161.08, 525.105.17"
+
+# Print the usage line and the supported versions to stderr.
+print_usage() {
+    echo "Usage: $0 DRIVER_VERSION" >&2
+    echo "Available versions: $AVAILABLE_VERSIONS" >&2
+}
+
+# Print the download URL for a driver version on stdout.
+# Arguments: $1 - driver version, matched by prefix (e.g. "550.144.06")
+# Returns:   0 if the version is known; 1 (printing nothing) otherwise
+get_grid_azure_url() {
+    local version="${1:-}"
+
+    # Azure GRID driver version mapping
+    case "$version" in
+        550.144.06*)
+            echo "https://download.microsoft.com/download/c5319e92-672e-4067-8d85-ab66a7a64db3/NVIDIA-Linux-x86_64-550.144.06-grid-azure.run"
+            ;;
+        535.161.08*)
+            echo "https://download.microsoft.com/download/8/d/a/8da4fb8e-3a9b-4e6a-bc9a-72ff64d7a13c/NVIDIA-Linux-x86_64-535.161.08-grid-azure.run"
+            ;;
+        525.105.17*)
+            echo "https://download.microsoft.com/download/6/b/d/6bd2850f-5883-4e2a-9a35-edbd3dd6808c/NVIDIA-Linux-x86_64-525.105.17-grid-azure.run"
+            ;;
+        *)
+            return 1
+            ;;
+    esac
+}
+
+# Download the installer for the requested version into GRID_INSTALLER_DIR.
+# Arguments: $1 - driver version
+# Exits non-zero if the version is missing or unknown, or the download fails.
+fetch_grid_azure_installer() {
+    # ${1:-} keeps "set -u" from aborting before the usage message is shown
+    local driver_version="${1:-}"
+
+    if [ -z "$driver_version" ]; then
+        echo "ERROR: Driver version must be provided as an argument" >&2
+        print_usage
+        exit 1
+    fi
+
+    # Declared separately so "local" does not mask the lookup's exit status
+    local download_url
+    if ! download_url=$(get_grid_azure_url "$driver_version"); then
+        echo "ERROR: No Azure GRID driver URL found for version $driver_version" >&2
+        print_usage
+        exit 1
+    fi
+
+    local filename="${download_url##*/}"
+    local target="$GRID_INSTALLER_DIR/$filename"
+
+    mkdir -p "$GRID_INSTALLER_DIR"
+
+    echo "Downloading GRID driver from: $download_url"
+
+    # -f: fail on HTTP errors; -sS: no progress meter but still print errors;
+    # -L: follow redirects (lowercase -l is --list-only, which is not wanted)
+    curl -fsSL -o "$target" "$download_url"
+    chmod +x "$target"
+
+    echo "GRID installer downloaded successfully to $target"
+}
+
+fetch_grid_azure_installer "$@"
diff --git a/precompiled.Dockerfile b/precompiled.Dockerfile
index 0f1b760e..2b6755b0 100644
--- a/precompiled.Dockerfile
+++ b/precompiled.Dockerfile
@@ -97,8 +97,22 @@ RUN . /versions.env && \
     DEP_PACKAGES=$(apt-rdepends $BASE_PACKAGES_NAMES | grep -v "^ " | grep -v "^debconf-2.0$" | grep -v "^linux-image-unsigned-") && \
     apt-get install -y --download-only --no-install-recommends --reinstall $BASE_PACKAGES $DEP_PACKAGES
 
+# Download GRID driver and its dependencies: kernel headers, dkms, linux-modules (for video.ko) — Azure only
+COPY download_azure_grid_driver.sh /tmp
+# linux-modules contains video.ko which nvidia-modeset depends on for __acpi_video_get_backlight_type symbol
+# TODO: Azure supports only several GRID driver versions. Temporarily hardcode the version.
+RUN . /versions.env && \
+    if [ "${KERNEL_VERSION##*-}" = "azure" ]; then \
+        apt-get install -y --download-only --no-install-recommends \
+            linux-headers-${KERNEL_VERSION} \
+            linux-modules-${KERNEL_VERSION} \
+            dkms && \
+        /tmp/download_azure_grid_driver.sh "550.144.06"; \
+    fi
+
 RUN mkdir -p /opt/nvidia-driver/bin
 COPY ubuntu22.04/precompiled/nvidia-driver /opt/nvidia-driver/bin/nvidia-driver
+COPY ubuntu22.04/precompiled/grid-driver /opt/nvidia-driver/bin/grid-driver
 COPY nvidia-driver-wrapper.sh /usr/local/bin/nvidia-driver
 
 WORKDIR /drivers
diff --git a/ubuntu22.04/precompiled/grid-driver b/ubuntu22.04/precompiled/grid-driver
new file mode 100644
index 00000000..176b0079
--- /dev/null
+++ b/ubuntu22.04/precompiled/grid-driver
@@ -0,0 +1,134 @@
+#! /bin/bash
+#
+# Helpers for installing the NVIDIA GRID driver on Azure. This file is meant to
+# be sourced: it only defines GRID_INSTALLER_DIR and functions.
+
+GRID_INSTALLER_DIR=${GRID_INSTALLER_DIR:-/opt/nvidia-grid-install}
+
+# Install the GRID driver from the .run installer found in GRID_INSTALLER_DIR
+# and create /etc/nvidia/gridd.conf from /etc/nvidia/gridd.conf.template.
+# Globals: GRID_INSTALLER_DIR (read), KERNEL_VERSION (read)
+# Exits 1 if the installer or gridd.conf template is missing, if the kernel
+# packages cannot be installed, or if the installer itself fails.
+_install_grid_driver() {
+    echo "Installing NVIDIA GRID driver from Azure package..."
+
+    if [ ! -d "$GRID_INSTALLER_DIR" ]; then
+        echo "ERROR: GRID installer directory not found: $GRID_INSTALLER_DIR" >&2
+        exit 1
+    fi
+
+    # Find the .run installer file (first match in glob order)
+    local installer_file="" candidate
+    for candidate in "$GRID_INSTALLER_DIR"/NVIDIA-Linux-*.run; do
+        if [ -f "$candidate" ]; then
+            installer_file="$candidate"
+            break
+        fi
+    done
+
+    if [ -z "$installer_file" ]; then
+        echo "ERROR: GRID installer .run file not found in $GRID_INSTALLER_DIR" >&2
+        exit 1
+    fi
+
+    echo "Using GRID installer: $installer_file"
+
+    # Install kernel headers and modules required for DKMS
+    # linux-modules provides video.ko which nvidia-modeset depends on for __acpi_video_get_backlight_type symbol
+    echo "Installing kernel headers and modules for ${KERNEL_VERSION}..."
+    if ! apt-get install --no-install-recommends --no-download -y \
+        "linux-headers-${KERNEL_VERSION}" \
+        "linux-modules-${KERNEL_VERSION}" \
+        dkms; then
+        echo "ERROR: Failed to install kernel headers and modules for ${KERNEL_VERSION}" >&2
+        exit 1
+    fi
+
+    # Create temporary directory for installer
+    local tmpdir="$GRID_INSTALLER_DIR/nvidia-grid-tmp"
+    mkdir -p "$tmpdir"
+
+    # Install GRID driver using the .run installer
+    # -s (--silent): non-interactive silent mode
+    # --dkms: use DKMS to build and load kernel modules automatically
+    # --tmpdir: specify temporary directory for installation
+    # Note: GRID drivers do not support --skip-module-load option
+    # The status is captured with "||" so that cleanup and the error message
+    # below still run when the sourcing script has "set -e" enabled.
+    local exit_code=0
+    "$installer_file" -s --dkms --tmpdir "$tmpdir" || exit_code=$?
+
+    # Clean up temporary directory
+    rm -rf "$tmpdir"
+
+    if [ "$exit_code" -ne 0 ]; then
+        echo "ERROR: GRID driver installation failed with exit code $exit_code" >&2
+        exit 1
+    fi
+
+    # Updating gridd.conf as required for Azure NV/NVv3 VMs.
+    # See: https://learn.microsoft.com/en-us/azure/virtual-machines/linux/n-series-driver-setup#install-grid-drivers-on-nv-or-nvv3-series-vms
+    echo "Creating GRID config"
+    local gridd_conf="/etc/nvidia/gridd.conf"
+    if ! cp "${gridd_conf}.template" "$gridd_conf"; then
+        echo "ERROR: Failed to create $gridd_conf from ${gridd_conf}.template" >&2
+        exit 1
+    fi
+
+    # Replace EnableUI in place (handles both commented and uncommented)
+    sed -i 's/^#\?[[:space:]]*EnableUI=.*/EnableUI=FALSE/' "$gridd_conf"
+
+    # Add EnableUI if not present anywhere in the file
+    grep -q '^EnableUI=' "$gridd_conf" || echo "EnableUI=FALSE" >> "$gridd_conf"
+
+    # Replace IgnoreSP in place (handles both commented and uncommented)
+    sed -i 's/^#\?[[:space:]]*IgnoreSP=.*/IgnoreSP=FALSE/' "$gridd_conf"
+
+    # Add IgnoreSP if not present anywhere in the file
+    grep -q '^IgnoreSP=' "$gridd_conf" || echo "IgnoreSP=FALSE" >> "$gridd_conf"
+
+    # Comment out FeatureType if uncommented
+    sed -i 's/^FeatureType=/#FeatureType=/' "$gridd_conf"
+
+    echo "GRID driver installed successfully"
+}
+
+# Check whether an NVIDIA A10 GPU is present on the PCI bus.
+# Returns: 0 (after printing the device's sysfs entry name) if found, 1 otherwise
+_has_nvidia_a10_gpu() {
+    # Check for NVIDIA A10 GPU (vendor: 0x10de, device: 0x2236)
+    # NVIDIA A10 requires GRID driver on Azure
+    # Declared local so they do not leak into the script sourcing this file
+    local dev vendor device
+    for dev in /sys/bus/pci/devices/*; do
+        if [ -f "$dev/vendor" ] && [ -f "$dev/device" ]; then
+            vendor=$(cat "$dev/vendor") || continue
+            device=$(cat "$dev/device") || continue
+
+            if [ "$vendor" = "0x10de" ] && [ "$device" = "0x2236" ]; then
+                echo "Detected NVIDIA A10 GPU at ${dev##*/}, GRID driver required"
+                return 0 # A10 GPU present
+            fi
+        fi
+    done
+
+    return 1 # A10 GPU not present
+}
+
+# Tell whether the GRID driver must be installed.
+# Globals: KERNEL_VERSION (read)
+# Returns: 0 on an Azure kernel with an NVIDIA A10 GPU present, 1 otherwise
+_is_grid_driver_required() {
+    # Extract CSP name from kernel version (e.g. "azure" from "5.15.0-1040-azure")
+    # ${KERNEL_VERSION:-} treats an unset variable as "not Azure" under "set -u"
+    local kernel_version="${KERNEL_VERSION:-}"
+    local csp_name="${kernel_version##*-}"
+
+    # Check if this is an Azure instance with NVidia A10 GPU
+    if [ "$csp_name" = "azure" ] && _has_nvidia_a10_gpu; then
+        return 0 # GRID driver required
+    fi
+
+    return 1 # GRID driver not required
+}
diff --git a/ubuntu22.04/precompiled/nvidia-driver b/ubuntu22.04/precompiled/nvidia-driver
index 5471cd46..697b62ca 100755
--- a/ubuntu22.04/precompiled/nvidia-driver
+++ b/ubuntu22.04/precompiled/nvidia-driver
@@ -20,6 +20,8 @@ TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}
 KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
 MODPROBE_CONFIG_DIR="/etc/modprobe.d"
 
+source "$(dirname "$0")/grid-driver"
+
 fabricmanager_install() {
     apt-get install -y --no-install-recommends --no-download nvidia-fabricmanager-${DRIVER_BRANCH}=${FULL_DRIVER_VERSION}
 }
@@ -390,7 +392,7 @@ _resolve_kernel_type() {
 }
 
 # Link and install the kernel modules from a precompiled packages
-_install_driver() {
+_install_precompiled_driver() {
     # Install necessary driver userspace packages
     apt-get install -y --no-install-recommends --no-download \
         nvidia-utils-${DRIVER_BRANCH}-server=${FULL_DRIVER_VERSION} \
@@ -415,6 +417,14 @@ _install_driver() {
     fi
 }
 
+_install_driver() {
+    if _is_grid_driver_required; then
+        _install_grid_driver
+    else
+        _install_precompiled_driver
+    fi
+}
+
 # Mount the driver rootfs into the run directory with the exception of sysfs.
 _mount_rootfs() {
     echo "Mounting NVIDIA driver rootfs..."