Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions scripts/packer/provisioners/cuda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,17 @@ sudo DEBIAN_FRONTEND=noninteractive apt-get install -y linux-headers-$(uname -r)
ARCH=$(uname -m)
CUDA_DISTRO=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.//g')

# based on https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#ubuntu-lts
# based on https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/ubuntu.html
wget https://developer.download.nvidia.com/compute/cuda/repos/$CUDA_DISTRO/$ARCH/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
rm cuda-keyring_1.1-1_all.deb

sudo apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
nvidia-driver-$CUDA_DRIVERS_VERSION-server-open \
nvidia-fabricmanager-$CUDA_DRIVERS_VERSION \
nvidia-driver-pinning-$CUDA_DRIVERS_VERSION

sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
nvidia-open \
nvidia-fabricmanager \
datacenter-gpu-manager-4-core datacenter-gpu-manager-4-proprietary datacenter-gpu-manager-exporter
sudo systemctl enable nvidia-fabricmanager
13 changes: 11 additions & 2 deletions scripts/packer/provisioners/downgrade-azure-kernel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,18 @@

set -e

# Install 6.8 kernel
# Install the latest available 6.8 Azure kernel. The exact revision Azure ships
# in the repos changes over time, so we resolve it dynamically instead of pinning
# a specific one (which eventually gets removed and breaks the build).
sudo apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt install linux-image-6.8.0-1015-azure linux-headers-6.8.0-1015-azure -y
KERNEL_VERSION=$(apt-cache search --names-only '^linux-image-6\.8\.0-[0-9]+-azure$' | awk '{print $1}' | sed 's/^linux-image-//' | sort -V | tail -1)

if [ -z "$KERNEL_VERSION" ]; then
echo "No linux-image-6.8.0-*-azure kernel available in the repositories" >&2
exit 1
fi
echo "Installing Azure kernel $KERNEL_VERSION"
sudo DEBIAN_FRONTEND=noninteractive apt install "linux-image-$KERNEL_VERSION" "linux-headers-$KERNEL_VERSION" -y

# Update the Grub entry name
grub_entry_name="$(sudo grep -Po "menuentry '\KUbuntu, with Linux 6\.8[^(']+" /boot/grub/grub.cfg | sort -V | head -1)"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ sudo apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get install build-essential linux-azure -y

wget --no-verbose -O NVIDIA-Linux-x86_64-grid.run \
https://download.microsoft.com/download/c5319e92-672e-4067-8d85-ab66a7a64db3/NVIDIA-Linux-x86_64-550.144.06-grid-azure.run
https://download.microsoft.com/download/2a04ca6a-9eec-40d9-9564-9cdea1ab795f/NVIDIA-Linux-x86_64-570.211.01-grid-azure.run
chmod +x NVIDIA-Linux-x86_64-grid.run
sudo ./NVIDIA-Linux-x86_64-grid.run --silent --disable-nouveau
rm NVIDIA-Linux-x86_64-grid.run
Expand Down
2 changes: 1 addition & 1 deletion scripts/packer/versions.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"docker_version": "27.1.1",
"cuda_drivers_version": "570"
"cuda_drivers_version": "580"
}
4 changes: 3 additions & 1 deletion src/dstack/_internal/core/backends/aws/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ def get_image_id_and_username(
image_owner = DLAMI_OWNER_ACCOUNT_ID
username = "ubuntu"
else:
image_name = f"dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
image_name = (
f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
)
image_owner = DSTACK_ACCOUNT_ID
username = "ubuntu"
response = ec2_client.describe_images(
Expand Down
14 changes: 11 additions & 3 deletions src/dstack/_internal/core/backends/azure/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,14 +442,15 @@ def from_instance_type(cls, instance: InstanceType) -> "VMImageVariant":
return cls.STANDARD

def get_image_name(self) -> str:
prefix = settings.DSTACK_VM_BASE_IMAGE_PREFIX
if self is self.GRID:
return f"dstack-grid-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
return f"{prefix}dstack-grid-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
elif self is self.CUDA:
return f"dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
return f"{prefix}dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
elif self is self.CUDA_WITH_PROPRIETARY_KERNEL_MODULES:
return f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
elif self is self.STANDARD:
return f"dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
return f"{prefix}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
else:
raise ValueError(f"Unexpected image variant {self!r}")

Expand Down Expand Up @@ -528,6 +529,13 @@ def _get_image_ref(
location: str,
variant: VMImageVariant,
) -> ImageReference:
if settings.DSTACK_VM_BASE_IMAGE_PREFIX:
# Staging images are not published to the community gallery, so reference directly.
image = compute_client.images.get(
resource_group_name="dstack-resources-westeurope",
image_name=variant.get_image_name(),
)
return ImageReference(id=image.id)
image = compute_client.community_gallery_images.get(
location=location,
public_gallery_name="dstack-ebac134d-04b9-4c2b-8b6c-ad3e73904aa7", # Gen2
Expand Down
9 changes: 7 additions & 2 deletions src/dstack/_internal/core/backends/gcp/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -1133,12 +1133,17 @@ def _get_image(instance_type_name: str, gpu_name: Optional[str]) -> GCPImage:
)
elif gpu_name is not None:
if not requires_nvidia_proprietary_kernel_modules(gpu_name):
image_name = f"dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
image_name = (
f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}"
f"dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
)
else:
image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
is_ufw_installed = True
else:
image_name = f"dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
image_name = (
f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
)
is_ufw_installed = True
image_name = image_name.replace(".", "-")
return GCPImage(
Expand Down
5 changes: 3 additions & 2 deletions src/dstack/_internal/core/backends/oci/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,10 +356,11 @@ def terminate_instance_if_exists(client: oci.core.ComputeClient, instance_id: st
def get_marketplace_listing_and_package(
gpu_name: Optional[str], client: oci.marketplace.MarketplaceClient
) -> Tuple[oci.marketplace.models.Listing, oci.marketplace.models.ImageListingPackage]:
listing_name = f"dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
prefix = settings.DSTACK_VM_BASE_IMAGE_PREFIX
listing_name = f"{prefix}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
if gpu_name is not None:
if not requires_nvidia_proprietary_kernel_modules(gpu_name):
listing_name = f"dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
listing_name = f"{prefix}dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"
else:
listing_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"

Expand Down
1 change: 1 addition & 0 deletions src/dstack/_internal/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION", version.docker_base_image_ubuntu_version
)
DSTACK_VM_BASE_IMAGE_VERSION = os.getenv("DSTACK_VM_BASE_IMAGE_VERSION", version.vm_base_image)
DSTACK_VM_BASE_IMAGE_PREFIX = os.getenv("DSTACK_VM_BASE_IMAGE_PREFIX", "") # e.g. stgn-123-
DSTACK_DIND_IMAGE = os.getenv("DSTACK_DIND_IMAGE", "dstackai/dind")

CLI_LOG_LEVEL = os.getenv("DSTACK_CLI_LOG_LEVEL", "INFO").upper()
Expand Down
15 changes: 12 additions & 3 deletions src/tests/_internal/core/backends/azure/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,18 @@ def test_from_instance_type(
@pytest.mark.parametrize(
["variant", "expected_name"],
[
[VMImageVariant.GRID, f"dstack-grid-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"],
[VMImageVariant.CUDA, f"dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"],
[VMImageVariant.STANDARD, f"dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}"],
[
VMImageVariant.GRID,
f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-grid-{settings.DSTACK_VM_BASE_IMAGE_VERSION}",
],
[
VMImageVariant.CUDA,
f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-cuda-{settings.DSTACK_VM_BASE_IMAGE_VERSION}",
],
[
VMImageVariant.STANDARD,
f"{settings.DSTACK_VM_BASE_IMAGE_PREFIX}dstack-{settings.DSTACK_VM_BASE_IMAGE_VERSION}",
],
],
)
def test_get_image_name(self, variant: VMImageVariant, expected_name: str):
Expand Down
Loading