diff --git a/scripts/packer/provisioners/cuda.sh b/scripts/packer/provisioners/cuda.sh index 78fe309fde..6815683e2d 100644 --- a/scripts/packer/provisioners/cuda.sh +++ b/scripts/packer/provisioners/cuda.sh @@ -9,14 +9,17 @@ sudo DEBIAN_FRONTEND=noninteractive apt-get install -y linux-headers-$(uname -r) ARCH=$(uname -m) CUDA_DISTRO=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g') -# based on https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#ubuntu-lts +# based on https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/ubuntu.html wget https://developer.download.nvidia.com/compute/cuda/repos/$CUDA_DISTRO/$ARCH/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb rm cuda-keyring_1.1-1_all.deb sudo apt-get update sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - nvidia-driver-$CUDA_DRIVERS_VERSION-server-open \ - nvidia-fabricmanager-$CUDA_DRIVERS_VERSION \ + nvidia-driver-pinning-$CUDA_DRIVERS_VERSION + +sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + nvidia-open \ + nvidia-fabricmanager \ datacenter-gpu-manager-4-core datacenter-gpu-manager-4-proprietary datacenter-gpu-manager-exporter sudo systemctl enable nvidia-fabricmanager diff --git a/scripts/packer/provisioners/downgrade-azure-kernel.sh b/scripts/packer/provisioners/downgrade-azure-kernel.sh index 9c1d278b2e..6cfc4a7687 100755 --- a/scripts/packer/provisioners/downgrade-azure-kernel.sh +++ b/scripts/packer/provisioners/downgrade-azure-kernel.sh @@ -5,9 +5,18 @@ set -e -# Install 6.8 kernel +# Install the latest available 6.8 Azure kernel. The exact revision Azure ships +# in the repos changes over time, so we resolve it dynamically instead of pinning +# a specific one (which eventually gets removed and breaks the build). sudo apt-get update -sudo DEBIAN_FRONTEND=noninteractive apt install linux-image-6.8.0-1015-azure linux-headers-6.8.0-1015-azure -y +KERNEL_VERSION=$(apt-cache search --names-only '^linux-image-6\.8\.0-[0-9]+-azure$' | awk '{print $1}' | sed 's/^linux-image-//' | sort -V | tail -1) + +if [ -z "$KERNEL_VERSION" ]; then + echo "No linux-image-6.8.0-*-azure kernel available in the repositories" >&2 + exit 1 +fi +echo "Installing Azure kernel $KERNEL_VERSION" +sudo DEBIAN_FRONTEND=noninteractive apt install "linux-image-$KERNEL_VERSION" "linux-headers-$KERNEL_VERSION" -y # Update the Grub entry name grub_entry_name="$(sudo grep -Po "menuentry '\KUbuntu, with Linux 6\.8[^(']+" /boot/grub/grub.cfg | sort -V | head -1)" diff --git a/scripts/packer/provisioners/install-nvidia-grid-driver-for-azure.sh b/scripts/packer/provisioners/install-nvidia-grid-driver-for-azure.sh index d9e54c9079..8a57b2a7cf 100755 --- a/scripts/packer/provisioners/install-nvidia-grid-driver-for-azure.sh +++ b/scripts/packer/provisioners/install-nvidia-grid-driver-for-azure.sh @@ -8,7 +8,7 @@ sudo apt-get update sudo DEBIAN_FRONTEND=noninteractive apt-get install build-essential linux-azure -y wget --no-verbose -O NVIDIA-Linux-x86_64-grid.run \ - https://download.microsoft.com/download/c5319e92-672e-4067-8d85-ab66a7a64db3/NVIDIA-Linux-x86_64-550.144.06-grid-azure.run + https://download.microsoft.com/download/2a04ca6a-9eec-40d9-9564-9cdea1ab795f/NVIDIA-Linux-x86_64-570.211.01-grid-azure.run chmod +x NVIDIA-Linux-x86_64-grid.run sudo ./NVIDIA-Linux-x86_64-grid.run --silent --disable-nouveau rm NVIDIA-Linux-x86_64-grid.run diff --git a/scripts/packer/versions.json b/scripts/packer/versions.json index bc471090ee..584a832abb 100644 --- a/scripts/packer/versions.json +++ b/scripts/packer/versions.json @@ -1,4 +1,4 @@ { "docker_version": "27.1.1", - "cuda_drivers_version": "570" + "cuda_drivers_version": "580" } diff --git a/src/dstack/_internal/core/backends/aws/resources.py b/src/dstack/_internal/core/backends/aws/resources.py index b658bf6372..ae79675c6c 100644 --- a/src/dstack/_internal/core/backends/aws/resources.py +++ b/src/dstack/_internal/core/backends/aws/resources.py @@ -37,7 +37,9 @@ def get_image_id_and_username( image_owner = DLAMI_OWNER_ACCOUNT_ID username = "ubuntu" else: - image_name = f"dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + image_name = ( + f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + ) image_owner = DSTACK_ACCOUNT_ID username = "ubuntu" response = ec2_client.describe_images( diff --git a/src/dstack/_internal/core/backends/azure/compute.py b/src/dstack/_internal/core/backends/azure/compute.py index e94d656934..d2843c7f22 100644 --- a/src/dstack/_internal/core/backends/azure/compute.py +++ b/src/dstack/_internal/core/backends/azure/compute.py @@ -442,14 +442,15 @@ def from_instance_type(cls, instance: InstanceType) -> "VMImageVariant": return cls.STANDARD def get_image_name(self) -> str: + prefix = settings.DSTACK_VM_BASE_IMAGE_PREFIX if self is self.GRID: - return f"dstack-grid-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + return f"{prefix}dstack-grid-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" elif self is self.CUDA: - return f"dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + return f"{prefix}dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" elif self is self.CUDA_WITH_PROPRIETARY_KERNEL_MODULES: return f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}" elif self is self.STANDARD: - return f"dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + return f"{prefix}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" else: raise ValueError(f"Unexpected image variant {self!r}") @@ -528,6 +529,13 @@ def _get_image_ref( location: str, variant: VMImageVariant, ) -> ImageReference: + if settings.DSTACK_VM_BASE_IMAGE_PREFIX: + # Staging images are not published to the community gallery, so reference directly. + image = compute_client.images.get( + resource_group_name="dstack-resources-westeurope", + image_name=variant.get_image_name(), + ) + return ImageReference(id=image.id) image = compute_client.community_gallery_images.get( location=location, public_gallery_name="dstack-ebac134d-04b9-4c2b-8b6c-ad3e73904aa7", # Gen2 diff --git a/src/dstack/_internal/core/backends/gcp/compute.py b/src/dstack/_internal/core/backends/gcp/compute.py index d5a5cdce8f..86544b112f 100644 --- a/src/dstack/_internal/core/backends/gcp/compute.py +++ b/src/dstack/_internal/core/backends/gcp/compute.py @@ -1133,12 +1133,17 @@ def _get_image(instance_type_name: str, gpu_name: Optional[str]) -> GCPImage: ) elif gpu_name is not None: if not requires_nvidia_proprietary_kernel_modules(gpu_name): - image_name = f"dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + image_name = ( + f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}" + f"dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + ) else: image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}" is_ufw_installed = True else: - image_name = f"dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + image_name = ( + f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + ) is_ufw_installed = True image_name = image_name.replace(".", "-") return GCPImage( diff --git a/src/dstack/_internal/core/backends/oci/resources.py b/src/dstack/_internal/core/backends/oci/resources.py index ea7a3922de..91e1d890f0 100644 --- a/src/dstack/_internal/core/backends/oci/resources.py +++ b/src/dstack/_internal/core/backends/oci/resources.py @@ -356,10 +356,11 @@ def terminate_instance_if_exists(client: oci.core.ComputeClient, instance_id: st def get_marketplace_listing_and_package( gpu_name: Optional[str], client: oci.marketplace.MarketplaceClient ) -> Tuple[oci.marketplace.models.Listing, oci.marketplace.models.ImageListingPackage]: - listing_name = f"dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + prefix = settings.DSTACK_VM_BASE_IMAGE_PREFIX + listing_name = f"{prefix}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" if gpu_name is not None: if not requires_nvidia_proprietary_kernel_modules(gpu_name): - listing_name = f"dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" + listing_name = f"{prefix}dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}" else: listing_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}" diff --git a/src/dstack/_internal/settings.py b/src/dstack/_internal/settings.py index 9d3517a80a..afb8bb5f0b 100644 --- a/src/dstack/_internal/settings.py +++ b/src/dstack/_internal/settings.py @@ -28,6 +28,7 @@ "DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION", version.docker_base_image_ubuntu_version ) DSTACK_VM_BASE_IMAGE_VERSION = os.getenv("DSTACK_VM_BASE_IMAGE_VERSION", version.vm_base_image) +DSTACK_VM_BASE_IMAGE_PREFIX = os.getenv("DSTACK_VM_BASE_IMAGE_PREFIX", "") # e.g. stgn-123- DSTACK_DIND_IMAGE = os.getenv("DSTACK_DIND_IMAGE", "dstackai/dind") CLI_LOG_LEVEL = os.getenv("DSTACK_CLI_LOG_LEVEL", "INFO").upper() diff --git a/src/tests/_internal/core/backends/azure/test_compute.py b/src/tests/_internal/core/backends/azure/test_compute.py index 50b5a342ac..a0ce0afaef 100644 --- a/src/tests/_internal/core/backends/azure/test_compute.py +++ b/src/tests/_internal/core/backends/azure/test_compute.py @@ -55,9 +55,18 @@ def test_from_instance_type( @pytest.mark.parametrize( ["variant", "expected_name"], [ - [VMImageVariant.GRID, f"dstack-grid-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"], - [VMImageVariant.CUDA, f"dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"], - [VMImageVariant.STANDARD, f"dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"], + [ + VMImageVariant.GRID, + f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-grid-{settings.DSTACK_VM_BASE_IMAGE_VERSION}", + ], + [ + VMImageVariant.CUDA, + f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}", + ], + [ + VMImageVariant.STANDARD, + f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}", + ], ], ) def test_get_image_name(self, variant: VMImageVariant, expected_name: str):