diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index ae8f76d1ae..bb9abe8f00 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -43,7 +43,7 @@ jobs:
     defaults:
       run:
         working-directory: docker
-    runs-on: ubuntu-latest
+    runs-on: dstack-ubuntu-latest-32-cores
     strategy:
       matrix:
         flavor: ["base", "devel", "devel-efa"]
diff --git a/docker/base/efa/Dockerfile b/docker/base/efa/Dockerfile
index 105650a836..3ea6a49704 100644
--- a/docker/base/efa/Dockerfile
+++ b/docker/base/efa/Dockerfile
@@ -16,15 +16,11 @@ RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
     && apt-get install -y --no-install-recommends \
         cuda-libraries-dev-${cuda_version} \
         cuda-nvcc-${cuda_version} \
-        libhwloc-dev \
-        autoconf \
-        automake \
-        libtool \
     && rm -rf /var/lib/apt/lists/*
 
 # EFA
 
-ARG EFA_VERSION=1.38.1
+ARG EFA_VERSION=1.48.0
 
 RUN cd /tmp \
     && apt-get update \
@@ -36,7 +32,7 @@ RUN cd /tmp \
 
 # NCCL
 
-ARG NCCL_VERSION=2.26.2-1
+ARG NCCL_VERSION=2.27.7-1
 
 RUN cd /tmp \
     && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
@@ -44,26 +40,6 @@ RUN cd /tmp \
     && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \
     && rm -rf /tmp/nccl
 
-# AWS OFI NCCL
-
-ARG OFI_VERSION=1.14.0
-
-RUN cd /tmp \
-    && git clone https://github.com/aws/aws-ofi-nccl.git -b v${OFI_VERSION} \
-    && cd aws-ofi-nccl \
-    && ./autogen.sh \
-    && ./configure \
-        --with-cuda=${CUDA_HOME} \
-        --with-libfabric=${LIBFABRIC_PATH} \
-        --with-mpi=${OPEN_MPI_PATH} \
-        --with-cuda=${CUDA_HOME} \
-        --with-nccl=${NCCL_HOME} \
-        --disable-tests \
-        --prefix=${NCCL_HOME} \
-    && make -j$(nproc) \
-    && make install \
-    && rm -rf /tmp/aws-ofi-nccl /var/lib/apt/lists/*
-
 # NCCL Tests
 
 RUN cd /opt \
diff --git a/docker/base/efa/README.md b/docker/base/efa/README.md
index 29ed748213..9790d84d78 100644
--- a/docker/base/efa/README.md
+++ b/docker/base/efa/README.md
@@ -2,8 +2,7 @@
 
 This image has the following installed:
 
-* CUDA 12.1
-* AWS EFA Installer 1.38.1 (Libfabric + Open MPI 4 + Open MPI 5)
-* NCCL 2.26.2-1
-* AWS OFI NCCL 1.14.0
+* CUDA 12.8
+* AWS EFA Installer 1.48.0 (Libfabric + Open MPI + AWS OFI NCCL 1.19.0)
+* NCCL 2.27.7-1
 * NCCL Tests
diff --git a/src/dstack/_internal/core/backends/aws/compute.py b/src/dstack/_internal/core/backends/aws/compute.py
index 4c0fb4e39a..cd072c1f5f 100644
--- a/src/dstack/_internal/core/backends/aws/compute.py
+++ b/src/dstack/_internal/core/backends/aws/compute.py
@@ -1240,6 +1240,8 @@ def _supported_instances(offer: InstanceOffer) -> bool:
             "t2.small",
             "c5.",
             "m5.",
+            "p6-b300.",
+            "p6-b200.",
             "p5.",
             "p5e.",
             "p4d.",
diff --git a/src/dstack/_internal/core/backends/aws/resources.py b/src/dstack/_internal/core/backends/aws/resources.py
index 67959a19c0..4e92bd84b5 100644
--- a/src/dstack/_internal/core/backends/aws/resources.py
+++ b/src/dstack/_internal/core/backends/aws/resources.py
@@ -191,40 +191,13 @@ def create_instances_struct(
     # AWS allows specifying either NetworkInterfaces for specific subnet_id
     # or instance-level SecurityGroupIds in case of no specific subnet_id, not both.
     if subnet_id is not None:
-        # AWS does not auto-assign a public IPv4 to instances launched with multiple network
-        # interfaces ("AssociatePublicIpAddress [...] You cannot specify more than one network
-        # interface in the request"). For multi-EFA instance types (e.g. p4d, p5, trn1), we
-        # therefore launch all EFA NICs without `AssociatePublicIpAddress` and, when
-        # `public_ips: true`, attach an Elastic IP after launch in `update_provisioning_data`.
-        multi_eni = max_efa_interfaces > 1
-        struct["NetworkInterfaces"] = [
-            {
-                "AssociatePublicIpAddress": allocate_public_ip and not multi_eni,
-                "DeviceIndex": 0,
-                "SubnetId": subnet_id,
-                "Groups": [security_group_id],
-                "InterfaceType": "efa" if max_efa_interfaces > 0 else "interface",
-            },
-        ]
-
-        if multi_eni:
-            for i in range(1, max_efa_interfaces):
-                # Set to efa-only to use interfaces exclusively for GPU-to-GPU communication
-                interface_type = "efa-only"
-                if instance_type == "p5.48xlarge":
-                    # EFA configuration for P5 instances:
-                    # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html#efa-for-p5
-                    interface_type = "efa" if i % 4 == 0 else "efa-only"
-                struct["NetworkInterfaces"].append(
-                    {
-                        "AssociatePublicIpAddress": False,
-                        "NetworkCardIndex": i,
-                        "DeviceIndex": 1,
-                        "SubnetId": subnet_id,
-                        "Groups": [security_group_id],
-                        "InterfaceType": interface_type,
-                    }
-                )
+        struct["NetworkInterfaces"] = _create_network_interfaces_struct(
+            instance_type=instance_type,
+            subnet_id=subnet_id,
+            security_group_id=security_group_id,
+            allocate_public_ip=allocate_public_ip,
+            max_efa_interfaces=max_efa_interfaces,
+        )
     else:
         struct["SecurityGroupIds"] = [security_group_id]
 
@@ -632,6 +605,65 @@ def _is_private_subnet_with_internet_egress(
     return False
 
 
+def _create_network_interfaces_struct(
+    instance_type: str,
+    subnet_id: str,
+    security_group_id: str,
+    allocate_public_ip: bool,
+    max_efa_interfaces: int,
+) -> List[Dict[str, Any]]:
+    """Build the `NetworkInterfaces` launch list: a primary NIC plus secondary EFA NICs."""
+    # AWS does not auto-assign a public IPv4 to instances launched with multiple network
+    # interfaces ("AssociatePublicIpAddress [...] You cannot specify more than one network
+    # interface in the request"). For multi-EFA instance types (e.g. p4d, p5, p6, trn1), we
+    # therefore launch all EFA NICs without `AssociatePublicIpAddress` and, when
+    # `public_ips: true`, attach an Elastic IP after launch in `update_provisioning_data`.
+    primary_supports_efa = _primary_nic_supports_efa(instance_type)
+    # EFA NICs that go on secondary network cards. With an ENA-only primary card every
+    # EFA NIC is secondary, so even a single EFA NIC makes this a multi-ENI launch.
+    secondary_efa_count = max_efa_interfaces - (1 if primary_supports_efa else 0)
+    multi_eni = secondary_efa_count > 0
+    network_interfaces: List[Dict[str, Any]] = [
+        {
+            "AssociatePublicIpAddress": allocate_public_ip and not multi_eni,
+            "DeviceIndex": 0,
+            "SubnetId": subnet_id,
+            "Groups": [security_group_id],
+            "InterfaceType": "efa"
+            if max_efa_interfaces > 0 and primary_supports_efa
+            else "interface",
+        },
+    ]
+
+    if multi_eni:
+        for i in range(1, secondary_efa_count + 1):
+            # Set to efa-only to use interfaces exclusively for GPU-to-GPU communication
+            interface_type = "efa-only"
+            if instance_type == "p5.48xlarge":
+                # EFA configuration for P5 instances:
+                # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html#efa-for-p5
+                interface_type = "efa" if i % 4 == 0 else "efa-only"
+            network_interfaces.append(
+                {
+                    "AssociatePublicIpAddress": False,
+                    "NetworkCardIndex": i,
+                    "DeviceIndex": 1,
+                    "SubnetId": subnet_id,
+                    "Groups": [security_group_id],
+                    "InterfaceType": interface_type,
+                }
+            )
+    return network_interfaces
+
+
+def _primary_nic_supports_efa(instance_type: str) -> bool:
+    """Return True if the primary network card (index 0) supports EFA in addition to ENA.
+    Most EFA-supported instance types do; some support only ENA there, and all their
+    EFA interfaces are placed on the secondary network cards (1..max_efa_interfaces).
+    """
+    return instance_type not in {"p6-b300.48xlarge"}
+
+
 def get_reservation(
     ec2_client: botocore.client.BaseClient,
     reservation_id: str,
diff --git a/src/dstack/_internal/server/services/backends/provisioning.py b/src/dstack/_internal/server/services/backends/provisioning.py
index b02deb8ade..f289a41dce 100644
--- a/src/dstack/_internal/server/services/backends/provisioning.py
+++ b/src/dstack/_internal/server/services/backends/provisioning.py
@@ -12,7 +12,7 @@
 
 # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types
 _AWS_EFA_ENABLED_INSTANCE_TYPE_PATTERNS = [
-    # TODO: p6-b200 isn't supported yet in gpuhunt
+    r"^p6-b300\.(48xlarge)$",
     r"^p6-b200\.(48xlarge)$",
     r"^p5\.(4xlarge|48xlarge)$",
     r"^p5e\.(48xlarge)$",
diff --git a/src/tests/_internal/core/backends/aws/test_resources.py b/src/tests/_internal/core/backends/aws/test_resources.py
index dcec84bf62..4fa9a30fa6 100644
--- a/src/tests/_internal/core/backends/aws/test_resources.py
+++ b/src/tests/_internal/core/backends/aws/test_resources.py
@@ -5,6 +5,7 @@
 
 from dstack._internal.core.backends.aws.models import AWSOSImage, AWSOSImageConfig
 from dstack._internal.core.backends.aws.resources import (
+    _create_network_interfaces_struct,
     _is_valid_tag_key,
     _is_valid_tag_value,
     get_image_id_and_username,
@@ -235,3 +236,174 @@ def test_raises_resource_not_found_if_image_config_property_not_set(
                 image_config=image_config,
             )
         assert "cpu image not configured" in caplog.text
+
+
+class TestCreateNetworkInterfacesStruct:
+    def test_non_efa_instance_single_interface(self):
+        interfaces = _create_network_interfaces_struct(
+            instance_type="m5.large",
+            subnet_id="subnet-1",
+            security_group_id="sg-1",
+            allocate_public_ip=True,
+            max_efa_interfaces=0,
+        )
+        assert interfaces == [
+            {
+                "AssociatePublicIpAddress": True,
+                "DeviceIndex": 0,
+                "SubnetId": "subnet-1",
+                "Groups": ["sg-1"],
+                "InterfaceType": "interface",
+            },
+        ]
+
+    def test_non_efa_instance_no_public_ip(self):
+        interfaces = _create_network_interfaces_struct(
+            instance_type="m5.large",
+            subnet_id="subnet-1",
+            security_group_id="sg-1",
+            allocate_public_ip=False,
+            max_efa_interfaces=0,
+        )
+        assert interfaces[0]["AssociatePublicIpAddress"] is False
+        assert interfaces[0]["InterfaceType"] == "interface"
+
+    def test_single_efa_interface(self):
+        interfaces = _create_network_interfaces_struct(
+            instance_type="g5.8xlarge",
+            subnet_id="subnet-1",
+            security_group_id="sg-1",
+            allocate_public_ip=True,
+            max_efa_interfaces=1,
+        )
+        # multi_eni is False, so the single EFA NIC keeps the public IP
+        assert interfaces == [
+            {
+                "AssociatePublicIpAddress": True,
+                "DeviceIndex": 0,
+                "SubnetId": "subnet-1",
+                "Groups": ["sg-1"],
+                "InterfaceType": "efa",
+            },
+        ]
+
+    def test_multi_efa_instance(self):
+        interfaces = _create_network_interfaces_struct(
+            instance_type="p4d.24xlarge",
+            subnet_id="subnet-1",
+            security_group_id="sg-1",
+            allocate_public_ip=True,
+            max_efa_interfaces=4,
+        )
+        # Multiple NICs disable auto-assigned public IP on every interface
+        assert interfaces[0] == {
+            "AssociatePublicIpAddress": False,
+            "DeviceIndex": 0,
+            "SubnetId": "subnet-1",
+            "Groups": ["sg-1"],
+            "InterfaceType": "efa",
+        }
+        assert interfaces[1:] == [
+            {
+                "AssociatePublicIpAddress": False,
+                "NetworkCardIndex": i,
+                "DeviceIndex": 1,
+                "SubnetId": "subnet-1",
+                "Groups": ["sg-1"],
+                "InterfaceType": "efa-only",
+            }
+            for i in range(1, 4)
+        ]
+
+    def test_p5_uses_efa_every_fourth_interface(self):
+        interfaces = _create_network_interfaces_struct(
+            instance_type="p5.48xlarge",
+            subnet_id="subnet-1",
+            security_group_id="sg-1",
+            allocate_public_ip=True,
+            max_efa_interfaces=32,
+        )
+        assert len(interfaces) == 32
+        assert all(i["NetworkCardIndex"] == idx for idx, i in enumerate(interfaces) if idx > 0)
+        # The primary NIC is a combined efa interface
+        assert interfaces[0]["InterfaceType"] == "efa"
+        assert "NetworkCardIndex" not in interfaces[0]
+        # Every 4th secondary NIC is a combined efa interface, the rest are efa-only
+        for idx, interface in enumerate(interfaces[1:], start=1):
+            expected = "efa" if idx % 4 == 0 else "efa-only"
+            assert interface["InterfaceType"] == expected, idx
+
+    def test_p6_b200_efa_on_every_card(self):
+        # p6-b200 has 8 EFA-capable network cards (indexes 0-7), handled by the generic path
+        interfaces = _create_network_interfaces_struct(
+            instance_type="p6-b200.48xlarge",
+            subnet_id="subnet-1",
+            security_group_id="sg-1",
+            allocate_public_ip=True,
+            max_efa_interfaces=8,
+        )
+        assert len(interfaces) == 8
+        assert interfaces[0] == {
+            "AssociatePublicIpAddress": False,
+            "DeviceIndex": 0,
+            "SubnetId": "subnet-1",
+            "Groups": ["sg-1"],
+            "InterfaceType": "efa",
+        }
+        assert interfaces[1:] == [
+            {
+                "AssociatePublicIpAddress": False,
+                "NetworkCardIndex": i,
+                "DeviceIndex": 1,
+                "SubnetId": "subnet-1",
+                "Groups": ["sg-1"],
+                "InterfaceType": "efa-only",
+            }
+            for i in range(1, 8)
+        ]
+
+    def test_p6_b300_ena_only_primary_nic(self):
+        # p6-b300 has 17 network cards: the primary (index 0) supports only ENA, the remaining
+        # 16 cards (indexes 1-16) support EFA. max_efa_interfaces is 16.
+        interfaces = _create_network_interfaces_struct(
+            instance_type="p6-b300.48xlarge",
+            subnet_id="subnet-1",
+            security_group_id="sg-1",
+            allocate_public_ip=True,
+            max_efa_interfaces=16,
+        )
+        # 1 ENA primary + 16 EFA secondary cards
+        assert len(interfaces) == 17
+        # Primary card is a plain ENA interface, not EFA
+        assert interfaces[0] == {
+            "AssociatePublicIpAddress": False,
+            "DeviceIndex": 0,
+            "SubnetId": "subnet-1",
+            "Groups": ["sg-1"],
+            "InterfaceType": "interface",
+        }
+        # EFA-only interfaces span network card indexes 1-16
+        assert interfaces[1:] == [
+            {
+                "AssociatePublicIpAddress": False,
+                "NetworkCardIndex": i,
+                "DeviceIndex": 1,
+                "SubnetId": "subnet-1",
+                "Groups": ["sg-1"],
+                "InterfaceType": "efa-only",
+            }
+            for i in range(1, 17)
+        ]
+
+    def test_ena_only_primary_nic_with_single_efa_interface(self):
+        # With an ENA-only primary card, even one EFA NIC needs a secondary network card
+        interfaces = _create_network_interfaces_struct(
+            instance_type="p6-b300.48xlarge",
+            subnet_id="subnet-1",
+            security_group_id="sg-1",
+            allocate_public_ip=True,
+            max_efa_interfaces=1,
+        )
+        assert [i["InterfaceType"] for i in interfaces] == ["interface", "efa-only"]
+        assert interfaces[0]["AssociatePublicIpAddress"] is False
+        assert interfaces[1]["NetworkCardIndex"] == 1