From dd4435c3e5209db5c180047d607b082d498b2696 Mon Sep 17 00:00:00 2001 From: Manpreet Singh Date: Mon, 22 Sep 2025 02:26:34 +0530 Subject: [PATCH 1/9] Added ROCm 7.0 support --- .github/workflows/iris-tests-apptainer.yml | 16 ++++++---- apptainer/iris-rocm6.3.1.def | 35 ++++++++++++++++++++++ apptainer/iris-rocm7.0.def | 35 ++++++++++++++++++++++ 3 files changed, 80 insertions(+), 6 deletions(-) create mode 100644 apptainer/iris-rocm6.3.1.def create mode 100644 apptainer/iris-rocm7.0.def diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml index 5e2d9a85..a9c174f2 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-apptainer.yml @@ -15,6 +15,9 @@ jobs: build-apptainer-image: runs-on: [self-hosted, mi3008x] timeout-minutes: 90 + strategy: + matrix: + rocm_version: ["6.3.1", "7.0"] steps: - name: Checkout repository @@ -32,20 +35,21 @@ jobs: mkdir -p ~/apptainer # Build Apptainer image from definition file (only if it doesn't exist) - if [ ! -f ~/apptainer/iris-dev.sif ]; then - echo "Building new Apptainer image..." - apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def + if [ ! -f ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif ]; then + echo "Building new Apptainer image for ROCm ${{ matrix.rocm_version }}..." + apptainer build ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif apptainer/iris-rocm${{ matrix.rocm_version }}.def else - echo "Using existing Apptainer image" + echo "Using existing Apptainer image for ROCm ${{ matrix.rocm_version }}" fi run-tests: - name: ${{ matrix.ranks }}-rank Iris Test + name: ${{ matrix.ranks }}-rank Iris Test (ROCm ${{ matrix.rocm_version }}) needs: build-apptainer-image runs-on: [self-hosted, mi3008x] timeout-minutes: 20 strategy: matrix: ranks: [1, 2, 4, 8] + rocm_version: ["6.3.1", "7.0"] max-parallel: 1 steps: @@ -54,7 +58,7 @@ jobs: - name: Run Iris Tests with ${{ matrix.ranks }} ranks run: | - apptainer exec ~/apptainer/iris-dev.sif bash -c " + apptainer exec ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c " set -e # Exit on any error # Install iris first diff --git a/apptainer/iris-rocm6.3.1.def b/apptainer/iris-rocm6.3.1.def new file mode 100644 index 00000000..9974fd34 --- /dev/null +++ b/apptainer/iris-rocm6.3.1.def @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +Bootstrap: docker +From: rocm/pytorch:rocm6.3.1_ubuntu22.04_py3.10_pytorch + +%post + /bin/bash -c " + apt-get update && apt-get install -y git + export TRITON_PATH=/workspace/triton + conda env list + source /opt/conda/bin/activate py_3.10 + conda install -y -n py_3.10 -c conda-forge jupyter ninja cmake wheel + git clone https://github.com/triton-lang/triton.git \$TRITON_PATH + cd \$TRITON_PATH + git checkout dd5823453bcc7973eabadb65f9d827c43281c434 + pip install -e . + wget https://github.com/ROCm/rocprofiler-systems/releases/download/rocm-6.3.1/rocprofiler-systems-install.py + python3 ./rocprofiler-systems-install.py --prefix /opt/rocprofiler-systems --rocm 6.3 + " + +%environment + # Define environment variables + export TRITON_PATH=/workspace/triton + export PYTHONPATH=$TRITON_PATH/python/ + export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH + export ROCM_PATH=/opt/rocm + export PATH=/opt/conda/envs/py_3.10/bin:/opt/rocm/bin:$PATH + export OMPI_MCA_mtl="^ofi" + export OMPI_MCA_pml="ob1" + +%runscript + echo "Welcome to the ROCm-aware Apptainer image!" + source /opt/conda/bin/activate py_3.10 + exec "$@" \ No newline at end of file diff --git a/apptainer/iris-rocm7.0.def b/apptainer/iris-rocm7.0.def new file mode 100644 index 00000000..0a04ca04 --- /dev/null +++ b/apptainer/iris-rocm7.0.def @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +Bootstrap: docker +From: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0 + +%post + /bin/bash -c " + apt-get update && apt-get install -y git + export TRITON_PATH=/workspace/triton + conda env list + source /opt/conda/bin/activate py_3.10 + conda install -y -n py_3.10 -c conda-forge jupyter ninja cmake wheel + git clone https://github.com/triton-lang/triton.git \$TRITON_PATH + cd \$TRITON_PATH + git checkout dd5823453bcc7973eabadb65f9d827c43281c434 + pip install -e . + wget https://github.com/ROCm/rocprofiler-systems/releases/download/rocm-7.0.0/rocprofiler-systems-install.py + python3 ./rocprofiler-systems-install.py --prefix /opt/rocprofiler-systems --rocm 7.0 + " + +%environment + # Define environment variables + export TRITON_PATH=/workspace/triton + export PYTHONPATH=$TRITON_PATH/python/ + export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH + export ROCM_PATH=/opt/rocm + export PATH=/opt/conda/envs/py_3.10/bin:/opt/rocm/bin:$PATH + export OMPI_MCA_mtl="^ofi" + export OMPI_MCA_pml="ob1" + +%runscript + echo "Welcome to the ROCm-aware Apptainer image!" + source /opt/conda/bin/activate py_3.10 + exec "$@" \ No newline at end of file From ae9987f9f8c5d620335489495ed6a7b1a0e8f1a0 Mon Sep 17 00:00:00 2001 From: Manpreet Singh Date: Mon, 22 Sep 2025 03:15:01 +0530 Subject: [PATCH 2/9] updated roc profiler version --- apptainer/iris-rocm7.0.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apptainer/iris-rocm7.0.def b/apptainer/iris-rocm7.0.def index 0a04ca04..20bff40b 100644 --- a/apptainer/iris-rocm7.0.def +++ b/apptainer/iris-rocm7.0.def @@ -16,7 +16,7 @@ From: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0 git checkout dd5823453bcc7973eabadb65f9d827c43281c434 pip install -e . wget https://github.com/ROCm/rocprofiler-systems/releases/download/rocm-7.0.0/rocprofiler-systems-install.py - python3 ./rocprofiler-systems-install.py --prefix /opt/rocprofiler-systems --rocm 7.0 + python3 ./rocprofiler-systems-install.py --prefix /opt/rocprofiler-systems --rocm 6.4 " %environment From 5fcc4a7ce6963c8a60844b56f2f24ae878e9c289 Mon Sep 17 00:00:00 2001 From: Muhammad Awad <112003944+mawad-amd@users.noreply.github.com> Date: Fri, 3 Oct 2025 20:38:00 -0700 Subject: [PATCH 3/9] Use ROCM 7.0 Apptainer --- apptainer/iris-rocm7.0.def | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/apptainer/iris-rocm7.0.def b/apptainer/iris-rocm7.0.def index 20bff40b..00956150 100644 --- a/apptainer/iris-rocm7.0.def +++ b/apptainer/iris-rocm7.0.def @@ -4,6 +4,38 @@ Bootstrap: docker From: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0 +%post + /bin/bash -c " + apt-get update && apt-get install -y git + export TRITON_PATH=/workspace/triton + #conda env list + #source /opt/conda/bin/activate py_3.10 + #conda install -y -n py_3.10 -c conda-forge jupyter ninja cmake wheel + git clone https://github.com/triton-lang/triton.git \$TRITON_PATH + cd \$TRITON_PATH + git checkout aafec417bded34db6308f5b3d6023daefae43905 + pip install -e . + " + +%environment + # Define environment variables + export TRITON_PATH=/workspace/triton + export PYTHONPATH=$TRITON_PATH/python/ + export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH + export ROCM_PATH=/opt/rocm + export PATH=/opt/conda/envs/py_3.10/bin:/opt/rocm/bin:$PATH + export OMPI_MCA_mtl="^ofi" + export OMPI_MCA_pml="ob1" + +%runscript + echo "Welcome to the ROCm-aware Apptainer image!" + source /opt/conda/bin/activate py_3.10 + exec "$@" +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +Bootstrap: docker +From: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0 + %post /bin/bash -c " apt-get update && apt-get install -y git From ab29b3d0f7d991193fd53870b580abe9f30af60e Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Fri, 3 Oct 2025 22:47:52 -0500 Subject: [PATCH 4/9] Fix up the bad apptainer file --- apptainer/iris-rocm7.0.def | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/apptainer/iris-rocm7.0.def b/apptainer/iris-rocm7.0.def index 00956150..1989575d 100644 --- a/apptainer/iris-rocm7.0.def +++ b/apptainer/iris-rocm7.0.def @@ -27,40 +27,6 @@ From: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0 export OMPI_MCA_mtl="^ofi" export OMPI_MCA_pml="ob1" -%runscript - echo "Welcome to the ROCm-aware Apptainer image!" - source /opt/conda/bin/activate py_3.10 - exec "$@" -# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. - -Bootstrap: docker -From: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0 - -%post - /bin/bash -c " - apt-get update && apt-get install -y git - export TRITON_PATH=/workspace/triton - conda env list - source /opt/conda/bin/activate py_3.10 - conda install -y -n py_3.10 -c conda-forge jupyter ninja cmake wheel - git clone https://github.com/triton-lang/triton.git \$TRITON_PATH - cd \$TRITON_PATH - git checkout dd5823453bcc7973eabadb65f9d827c43281c434 - pip install -e . - wget https://github.com/ROCm/rocprofiler-systems/releases/download/rocm-7.0.0/rocprofiler-systems-install.py - python3 ./rocprofiler-systems-install.py --prefix /opt/rocprofiler-systems --rocm 6.4 - " - -%environment - # Define environment variables - export TRITON_PATH=/workspace/triton - export PYTHONPATH=$TRITON_PATH/python/ - export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH - export ROCM_PATH=/opt/rocm - export PATH=/opt/conda/envs/py_3.10/bin:/opt/rocm/bin:$PATH - export OMPI_MCA_mtl="^ofi" - export OMPI_MCA_pml="ob1" - %runscript echo "Welcome to the ROCm-aware Apptainer image!" source /opt/conda/bin/activate py_3.10 From 050b6d03950a76c1e75da856e4a15f5fd7fcf28c Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Fri, 3 Oct 2025 23:04:32 -0500 Subject: [PATCH 5/9] Add hash step --- .github/workflows/iris-tests-apptainer.yml | 32 +++++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml index a9c174f2..3369b4e6 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-apptainer.yml @@ -34,12 +34,36 @@ jobs: # Create persistent Apptainer directory mkdir -p ~/apptainer - # Build Apptainer image from definition file (only if it doesn't exist) + # Compute hash of the definition file + DEF_FILE="apptainer/iris-rocm${{ matrix.rocm_version }}.def" + CURRENT_HASH=$(sha256sum "$DEF_FILE" | awk '{print $1}') + HASH_FILE=~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif.sha256 + + # Check if we need to rebuild + REBUILD=false if [ ! -f ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif ]; then - echo "Building new Apptainer image for ROCm ${{ matrix.rocm_version }}..." - apptainer build ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif apptainer/iris-rocm${{ matrix.rocm_version }}.def + echo "Apptainer image not found. Building new image for ROCm ${{ matrix.rocm_version }}..." + REBUILD=true + elif [ ! -f "$HASH_FILE" ]; then + echo "Hash file not found. Rebuilding image for ROCm ${{ matrix.rocm_version }}..." + REBUILD=true else - echo "Using existing Apptainer image for ROCm ${{ matrix.rocm_version }}" + STORED_HASH=$(cat "$HASH_FILE") + if [ "$CURRENT_HASH" != "$STORED_HASH" ]; then + echo "Definition file has changed. Rebuilding image for ROCm ${{ matrix.rocm_version }}..." + echo " Previous hash: $STORED_HASH" + echo " Current hash: $CURRENT_HASH" + REBUILD=true + else + echo "Using existing Apptainer image for ROCm ${{ matrix.rocm_version }} (hash: $CURRENT_HASH)" + fi + fi + + # Build if needed + if [ "$REBUILD" = true ]; then + apptainer build ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif "$DEF_FILE" + echo "$CURRENT_HASH" > "$HASH_FILE" + echo "Successfully built and stored hash: $CURRENT_HASH" fi run-tests: name: ${{ matrix.ranks }}-rank Iris Test (ROCm ${{ matrix.rocm_version }}) From 495a7e705a38035b83b28d519521d21b6868e944 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Fri, 3 Oct 2025 23:06:37 -0500 Subject: [PATCH 6/9] Force rebuild --- .github/workflows/iris-tests-apptainer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml index 3369b4e6..d10f887f 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-apptainer.yml @@ -61,7 +61,7 @@ jobs: # Build if needed if [ "$REBUILD" = true ]; then - apptainer build ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif "$DEF_FILE" + apptainer build --force ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif "$DEF_FILE" echo "$CURRENT_HASH" > "$HASH_FILE" echo "Successfully built and stored hash: $CURRENT_HASH" fi From d07de4bd568432df16b784336bbcc9dd378302ec Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Fri, 3 Oct 2025 23:31:04 -0500 Subject: [PATCH 7/9] Add write option --- .github/workflows/iris-tests-apptainer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml index d10f887f..d1ce209d 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-apptainer.yml @@ -82,7 +82,7 @@ jobs: - name: Run Iris Tests with ${{ matrix.ranks }} ranks run: | - apptainer exec ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c " + apptainer exec --writable-tmpfs ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c " set -e # Exit on any error # Install iris first From e549e16ee30b8a2f4f9af95005c65eef1349fd9c Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Sat, 4 Oct 2025 19:48:59 -0500 Subject: [PATCH 8/9] Update all tests to run for both rocm versions --- .../iris-external-validation-test.yml | 44 ++++++++++++--- .github/workflows/iris-pip-install-test.yml | 55 +++++++++++++++---- .github/workflows/iris-tests-apptainer.yml | 9 ++- 3 files changed, 87 insertions(+), 21 deletions(-) diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml index 1dbbe977..d25b683a 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-external-validation-test.yml @@ -15,6 +15,9 @@ jobs: build-apptainer-image: runs-on: [self-hosted, mi3008x] timeout-minutes: 90 + strategy: + matrix: + rocm_version: ["6.3.1", "7.0"] steps: - name: Checkout repository @@ -31,19 +34,46 @@ jobs: # Create persistent Apptainer directory mkdir -p ~/apptainer - # Build Apptainer image from definition file (only if it doesn't exist) - if [ ! -f ~/apptainer/iris-dev.sif ]; then - echo "Building new Apptainer image..." - apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def + # Compute hash of the definition file + DEF_FILE="apptainer/iris-rocm${{ matrix.rocm_version }}.def" + CURRENT_HASH=$(sha256sum "$DEF_FILE" | awk '{print $1}') + HASH_FILE=~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif.sha256 + + # Check if we need to rebuild + REBUILD=false + if [ ! -f ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif ]; then + echo "Apptainer image not found. Building new image for ROCm ${{ matrix.rocm_version }}..." + REBUILD=true + elif [ ! -f "$HASH_FILE" ]; then + echo "Hash file not found. Rebuilding image for ROCm ${{ matrix.rocm_version }}..." + REBUILD=true else - echo "Using existing Apptainer image" + STORED_HASH=$(cat "$HASH_FILE") + if [ "$CURRENT_HASH" != "$STORED_HASH" ]; then + echo "Definition file has changed. Rebuilding image for ROCm ${{ matrix.rocm_version }}..." + echo " Previous hash: $STORED_HASH" + echo " Current hash: $CURRENT_HASH" + REBUILD=true + else + echo "Using existing Apptainer image for ROCm ${{ matrix.rocm_version }} (hash: $CURRENT_HASH)" + fi + fi + + # Build if needed + if [ "$REBUILD" = true ]; then + apptainer build --force ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif "$DEF_FILE" + echo "$CURRENT_HASH" > "$HASH_FILE" + echo "Successfully built and stored hash: $CURRENT_HASH" fi external-validation-test: - name: External Validation Test + name: External Validation Test - ROCm ${{ matrix.rocm_version }} needs: build-apptainer-image runs-on: [self-hosted, mi3008x] timeout-minutes: 30 + strategy: + matrix: + rocm_version: ["6.3.1", "7.0"] steps: - name: Checkout repository @@ -61,7 +91,7 @@ jobs: echo "::group::Running external validation test" apptainer exec --overlay "${OVERLAY}" --no-home --cleanenv \ --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \ - ~/apptainer/iris-dev.sif bash -c " + ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c " pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py python test_iris_distributed.py diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-pip-install-test.yml index 48cec6c9..fc950b00 100644 --- a/.github/workflows/iris-pip-install-test.yml +++ b/.github/workflows/iris-pip-install-test.yml @@ -15,6 +15,9 @@ jobs: build-apptainer-image: runs-on: [self-hosted, mi3008x] timeout-minutes: 90 + strategy: + matrix: + rocm_version: ["6.3.1", "7.0"] steps: - name: Checkout repository @@ -31,18 +34,45 @@ jobs: # Create persistent Apptainer directory mkdir -p ~/apptainer - # Build Apptainer image from definition file (only if it doesn't exist) - if [ ! -f ~/apptainer/iris-dev.sif ]; then - echo "Building new Apptainer image..." - apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def + # Compute hash of the definition file + DEF_FILE="apptainer/iris-rocm${{ matrix.rocm_version }}.def" + CURRENT_HASH=$(sha256sum "$DEF_FILE" | awk '{print $1}') + HASH_FILE=~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif.sha256 + + # Check if we need to rebuild + REBUILD=false + if [ ! -f ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif ]; then + echo "Apptainer image not found. Building new image for ROCm ${{ matrix.rocm_version }}..." + REBUILD=true + elif [ ! -f "$HASH_FILE" ]; then + echo "Hash file not found. Rebuilding image for ROCm ${{ matrix.rocm_version }}..." + REBUILD=true else - echo "Using existing Apptainer image" + STORED_HASH=$(cat "$HASH_FILE") + if [ "$CURRENT_HASH" != "$STORED_HASH" ]; then + echo "Definition file has changed. Rebuilding image for ROCm ${{ matrix.rocm_version }}..." + echo " Previous hash: $STORED_HASH" + echo " Current hash: $CURRENT_HASH" + REBUILD=true + else + echo "Using existing Apptainer image for ROCm ${{ matrix.rocm_version }} (hash: $CURRENT_HASH)" + fi + fi + + # Build if needed + if [ "$REBUILD" = true ]; then + apptainer build --force ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif "$DEF_FILE" + echo "$CURRENT_HASH" > "$HASH_FILE" + echo "Successfully built and stored hash: $CURRENT_HASH" fi test-1-2-4-ranks: - name: Pip Install Test 1/2/4 Ranks (Parallel) + name: Pip Install Test 1/2/4 Ranks (Parallel) - ROCm ${{ matrix.rocm_version }} needs: build-apptainer-image runs-on: [self-hosted, mi3008x] timeout-minutes: 30 + strategy: + matrix: + rocm_version: ["6.3.1", "7.0"] steps: - name: Checkout repository @@ -72,7 +102,7 @@ jobs: echo "Starting 1-rank test on GPUs 0,1..." apptainer exec --overlay "${OVERLAY_1}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1" \ --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \ - ~/apptainer/iris-dev.sif bash -c " + ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c " pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} bash .github/scripts/run_tests.sh 1 " & @@ -81,7 +111,7 @@ jobs: echo "Starting 2-rank test on GPUs 2,3..." apptainer exec --overlay "${OVERLAY_2}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="2,3" \ --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \ - ~/apptainer/iris-dev.sif bash -c " + ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c " pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} bash .github/scripts/run_tests.sh 2 " & @@ -90,7 +120,7 @@ jobs: echo "Starting 4-rank test on GPUs 4,5,6,7..." apptainer exec --overlay "${OVERLAY_4}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="4,5,6,7" \ --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \ - ~/apptainer/iris-dev.sif bash -c " + ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c " pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} bash .github/scripts/run_tests.sh 4 " & @@ -120,10 +150,13 @@ jobs: echo "✅ All parallel tests (1, 2, 4 ranks) passed!" test-8-ranks: - name: Pip Install Test 8 Ranks + name: Pip Install Test 8 Ranks - ROCm ${{ matrix.rocm_version }} needs: build-apptainer-image runs-on: [self-hosted, mi3008x] timeout-minutes: 30 + strategy: + matrix: + rocm_version: ["6.3.1", "7.0"] steps: - name: Checkout repository @@ -143,7 +176,7 @@ jobs: echo "::group::Running 8-rank test on all GPUs" apptainer exec --overlay "${OVERLAY_8}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \ - ~/apptainer/iris-dev.sif bash -c " + ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c " pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} bash .github/scripts/run_tests.sh 8 " diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml index 8a5bad2c..cffd6e18 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-apptainer.yml @@ -14,7 +14,7 @@ concurrency: jobs: build-apptainer-image: runs-on: [self-hosted, mi3008x] - timeout-minutes: 90w + timeout-minutes: 90 strategy: matrix: rocm_version: ["6.3.1", "7.0"] @@ -78,6 +78,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Run 1, 2, 4 rank tests in parallel run: | @@ -152,7 +154,7 @@ jobs: name: Test 8 Ranks - ROCm ${{ matrix.rocm_version }} needs: build-apptainer-image runs-on: [self-hosted, mi3008x] - timeout-minutes: 15 + timeout-minutes: 30 strategy: matrix: rocm_version: ["6.3.1", "7.0"] @@ -160,7 +162,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 - + with: + fetch-depth: 0 - name: Run 8-rank test run: | # Create unique overlay image for isolation From f89c38249b1bfafa59d64b8059f94561f7a8cbb4 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Sat, 4 Oct 2025 20:36:50 -0500 Subject: [PATCH 9/9] Rename tests --- .../workflows/{iris-tests-apptainer.yml => iris-tests-dev.yml} | 2 +- ...ris-external-validation-test.yml => iris-tests-external.yml} | 2 +- .../{iris-pip-install-test.yml => iris-tests-package.yml} | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename .github/workflows/{iris-tests-apptainer.yml => iris-tests-dev.yml} (99%) rename .github/workflows/{iris-external-validation-test.yml => iris-tests-external.yml} (99%) rename .github/workflows/{iris-pip-install-test.yml => iris-tests-package.yml} (99%) diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-dev.yml similarity index 99% rename from .github/workflows/iris-tests-apptainer.yml rename to .github/workflows/iris-tests-dev.yml index cffd6e18..036828c6 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-dev.yml @@ -1,4 +1,4 @@ -name: Iris Tests with Apptainer +name: Iris Development Tests on: push: diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-tests-external.yml similarity index 99% rename from .github/workflows/iris-external-validation-test.yml rename to .github/workflows/iris-tests-external.yml index d25b683a..ff3427a4 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-tests-external.yml @@ -1,4 +1,4 @@ -name: Iris External Validation Test +name: Iris External Validation on: push: diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-tests-package.yml similarity index 99% rename from .github/workflows/iris-pip-install-test.yml rename to .github/workflows/iris-tests-package.yml index fc950b00..31671e3b 100644 --- a/.github/workflows/iris-pip-install-test.yml +++ b/.github/workflows/iris-tests-package.yml @@ -1,4 +1,4 @@ -name: Iris Pip Install Test +name: Iris Package Tests on: push: