From 4147f3be0e75cea3be8aabaf28ee73fb68877325 Mon Sep 17 00:00:00 2001 From: neoblizz Date: Mon, 17 Nov 2025 22:22:07 +0000 Subject: [PATCH 1/5] Revamp testing. --- .github/scripts/container_build.sh | 5 +- .github/scripts/container_exec.sh | 4 +- .github/scripts/container_run.sh | 3 +- .github/scripts/run_tests.sh | 47 +++-- .../iris-external-validation-test.yml | 3 + .../iris-performance-regression-test.yml | 3 + .github/workflows/iris-pip-install-test.yml | 198 ------------------ .github/workflows/iris-tests-apptainer.yml | 111 ---------- .github/workflows/iris-tests.yml | 104 +++++++++ 9 files changed, 142 insertions(+), 336 deletions(-) delete mode 100644 .github/workflows/iris-pip-install-test.yml delete mode 100644 .github/workflows/iris-tests-apptainer.yml create mode 100644 .github/workflows/iris-tests.yml diff --git a/.github/scripts/container_build.sh b/.github/scripts/container_build.sh index a1f7464a..dfe62d1d 100755 --- a/.github/scripts/container_build.sh +++ b/.github/scripts/container_build.sh @@ -36,9 +36,10 @@ if [ "$CONTAINER_RUNTIME" = "apptainer" ]; then elif [ "$CONTAINER_RUNTIME" = "docker" ]; then echo "[INFO] Checking Docker images..." - IMAGE_NAME="iris-dev-triton-aafec41" + # Use GitHub variable if set, otherwise default to iris-dev + IMAGE_NAME=${DOCKER_IMAGE_NAME:-"iris-dev"} - # Check if the triton image exists + # Check if the image exists if docker image inspect "$IMAGE_NAME" &> /dev/null; then echo "[INFO] Using existing Docker image: $IMAGE_NAME" else diff --git a/.github/scripts/container_exec.sh b/.github/scripts/container_exec.sh index 1ef3e327..a02affba 100755 --- a/.github/scripts/container_exec.sh +++ b/.github/scripts/container_exec.sh @@ -86,7 +86,9 @@ if [ "$CONTAINER_RUNTIME" = "apptainer" ]; then exit $EXIT_CODE elif [ "$CONTAINER_RUNTIME" = "docker" ]; then - IMAGE_NAME=${CUSTOM_IMAGE:-${DOCKER_IMAGE_NAME:-"iris-dev-triton-aafec41"}} + # Use custom image if provided, otherwise use GitHub variable or default + # GitHub Actions sets DOCKER_IMAGE_NAME, locally defaults to iris-dev + IMAGE_NAME=${CUSTOM_IMAGE:-${DOCKER_IMAGE_NAME:-"iris-dev"}} if ! docker image inspect "$IMAGE_NAME" &> /dev/null; then echo "[ERROR] Docker image $IMAGE_NAME not found" >&2 diff --git a/.github/scripts/container_run.sh b/.github/scripts/container_run.sh index ce5ffe2e..30b4a535 100755 --- a/.github/scripts/container_run.sh +++ b/.github/scripts/container_run.sh @@ -25,7 +25,8 @@ if [ "$CONTAINER_RUNTIME" = "apptainer" ]; then bash apptainer/run.sh "$@" elif [ "$CONTAINER_RUNTIME" = "docker" ]; then echo "[INFO] Running with Docker..." - IMAGE_NAME=${1:-"iris-dev-triton-aafec41"} + # Use GitHub variable if set, otherwise default to iris-dev + IMAGE_NAME=${1:-${DOCKER_IMAGE_NAME:-"iris-dev"}} WORKSPACE_DIR=${2:-"$(pwd)"} bash docker/run.sh "$IMAGE_NAME" "$WORKSPACE_DIR" fi diff --git a/.github/scripts/run_tests.sh b/.github/scripts/run_tests.sh index c74801d1..160de790 100755 --- a/.github/scripts/run_tests.sh +++ b/.github/scripts/run_tests.sh @@ -3,16 +3,28 @@ # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. # # Run Iris tests in a container -# Usage: run_tests.sh [gpu_devices] +# Usage: run_tests.sh [gpu_devices] +# test_dir: subdirectory under tests/ (e.g., examples, unittests, ccl) +# num_ranks: number of GPU ranks (1, 2, 4, or 8) +# gpu_devices: comma-separated GPU device IDs (optional) set -e -NUM_RANKS=$1 -GPU_DEVICES=${2:-""} +TEST_DIR=$1 +NUM_RANKS=$2 +GPU_DEVICES=${3:-""} -if [ -z "$NUM_RANKS" ]; then - echo "[ERROR] NUM_RANKS not provided" - echo "Usage: $0 [gpu_devices]" +if [ -z "$TEST_DIR" ] || [ -z "$NUM_RANKS" ]; then + echo "[ERROR] Missing required arguments" + echo "Usage: $0 [gpu_devices]" + echo " test_dir: examples, unittests, or ccl" + echo " num_ranks: 1, 2, 4, or 8" + exit 1 +fi + +# Validate test directory +if [ ! -d "tests/$TEST_DIR" ]; then + echo "[ERROR] Test directory tests/$TEST_DIR does not exist" exit 1 fi @@ -29,23 +41,12 @@ fi set -e pip install -e . - # Run examples tests - for test_file in tests/examples/test_*.py; do - echo \"Testing: \$test_file with $NUM_RANKS ranks\" - python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" -v --tb=short --durations=10 + # Run tests in the specified directory + for test_file in tests/$TEST_DIR/test_*.py; do + if [ -f \"\$test_file\" ]; then + echo \"Testing: \$test_file with $NUM_RANKS ranks\" + python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" -v --tb=short --durations=10 + fi done - - # Run unit tests - for test_file in tests/unittests/test_*.py; do - echo \"Testing: \$test_file with $NUM_RANKS ranks\" - python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" -v --tb=short --durations=10 - done - - # Run ccl tests - # DISABLED: CCL host-side APIs have issues for some data types/algorithms - # for test_file in tests/ccl/test_*.py; do - # echo \"Testing: \$test_file with $NUM_RANKS ranks\" - # python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" -v --tb=short --durations=10 - # done " diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml index f609fc7a..655d13f6 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-external-validation-test.yml @@ -11,6 +11,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} +env: + DOCKER_IMAGE_NAME: ${{ vars.DOCKER_IMAGE_NAME || 'iris-dev-triton-aafec41' }} + jobs: build-container-image: runs-on: [self-hosted, mi3008x] diff --git a/.github/workflows/iris-performance-regression-test.yml b/.github/workflows/iris-performance-regression-test.yml index fa017886..137ee2d0 100644 --- a/.github/workflows/iris-performance-regression-test.yml +++ b/.github/workflows/iris-performance-regression-test.yml @@ -11,6 +11,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} +env: + DOCKER_IMAGE_NAME: ${{ vars.DOCKER_IMAGE_NAME || 'iris-dev-triton-aafec41' }} + jobs: build-container-image: runs-on: [self-hosted, mi3008x] diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-pip-install-test.yml deleted file mode 100644 index aa7ee86f..00000000 --- a/.github/workflows/iris-pip-install-test.yml +++ /dev/null @@ -1,198 +0,0 @@ -name: Iris Pip Install Test - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -jobs: - build-container-image: - runs-on: [self-hosted, mi3008x] - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Apptainer (if not available) - run: | - if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then - echo "Neither Apptainer nor Docker found, installing Apptainer..." - apt-get update && apt-get install -y software-properties-common - add-apt-repository -y ppa:apptainer/ppa - apt-get update && apt-get install -y apptainer - else - echo "Container runtime already available" - fi - - - name: Build Iris container - run: | - # Use the universal container build script - bash .github/scripts/container_build.sh - - test-1-2-4-ranks: - name: Pip Install Test 1/2/4 Ranks (Parallel) - needs: build-container-image - runs-on: [self-hosted, mi3008x] - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run pip install tests for 1, 2, 4 ranks in parallel - run: | - # Don't use set -e here - we want to handle errors manually for parallel processes - # Run tests in parallel with different GPU assignments - # Note: Each test gets 2+ GPUs even if it only uses some of them. - # This allows tests like test_empty_device_handling to verify that - # allocating on a different device correctly raises an error. - - echo "::group::Starting parallel tests" - echo "Starting 1-rank test on GPUs 0,1..." - bash .github/scripts/container_exec.sh --gpus "0,1" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - pip install -e . - for test_file in tests/examples/test_*.py; do - echo \"Testing: \$test_file with 1 ranks\" - python tests/run_tests_distributed.py --num_ranks 1 \"\$test_file\" -v --tb=short --durations=10 - done - for test_file in tests/unittests/test_*.py; do - echo \"Testing: \$test_file with 1 ranks\" - python tests/run_tests_distributed.py --num_ranks 1 \"\$test_file\" -v --tb=short --durations=10 - done - " > /tmp/test_1rank.log 2>&1 & - PID1=$! - - echo "Starting 2-rank test on GPUs 2,3..." - bash .github/scripts/container_exec.sh --gpus "2,3" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - pip install -e . - for test_file in tests/examples/test_*.py; do - echo \"Testing: \$test_file with 2 ranks\" - python tests/run_tests_distributed.py --num_ranks 2 \"\$test_file\" -v --tb=short --durations=10 - done - for test_file in tests/unittests/test_*.py; do - echo \"Testing: \$test_file with 2 ranks\" - python tests/run_tests_distributed.py --num_ranks 2 \"\$test_file\" -v --tb=short --durations=10 - done - " > /tmp/test_2rank.log 2>&1 & - PID2=$! - - echo "Starting 4-rank test on GPUs 4,5,6,7..." - bash .github/scripts/container_exec.sh --gpus "4,5,6,7" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - pip install -e . - for test_file in tests/examples/test_*.py; do - echo \"Testing: \$test_file with 4 ranks\" - python tests/run_tests_distributed.py --num_ranks 4 \"\$test_file\" -v --tb=short --durations=10 - done - for test_file in tests/unittests/test_*.py; do - echo \"Testing: \$test_file with 4 ranks\" - python tests/run_tests_distributed.py --num_ranks 4 \"\$test_file\" -v --tb=short --durations=10 - done - " > /tmp/test_4rank.log 2>&1 & - PID4=$! - echo "::endgroup::" - - # Wait for all parallel tests and track failures - echo "::group::Waiting for parallel tests to complete" - FAIL=0 - FAILED_TESTS="" - - # Wait for each process and capture exit status - if ! wait $PID1; then - echo "::error::1-rank test FAILED" - echo "::group::1-rank test logs" - cat /tmp/test_1rank.log || true - echo "::endgroup::" - FAILED_TESTS="$FAILED_TESTS 1-rank" - FAIL=1 - else - echo "✅ 1-rank test passed" - fi - - if ! wait $PID2; then - echo "::error::2-rank test FAILED" - echo "::group::2-rank test logs" - cat /tmp/test_2rank.log || true - echo "::endgroup::" - FAILED_TESTS="$FAILED_TESTS 2-rank" - FAIL=1 - else - echo "✅ 2-rank test passed" - fi - - if ! wait $PID4; then - echo "::error::4-rank test FAILED" - echo "::group::4-rank test logs" - cat /tmp/test_4rank.log || true - echo "::endgroup::" - FAILED_TESTS="$FAILED_TESTS 4-rank" - FAIL=1 - else - echo "✅ 4-rank test passed" - fi - echo "::endgroup::" - - # Clean up log files - rm -f /tmp/test_1rank.log /tmp/test_2rank.log /tmp/test_4rank.log - - if [ $FAIL -eq 1 ]; then - echo "::error::Parallel tests failed:$FAILED_TESTS" - exit 1 - fi - - echo "✅ All parallel tests (1, 2, 4 ranks) passed!" - - test-8-ranks: - name: Pip Install Test 8 Ranks - needs: build-container-image - runs-on: [self-hosted, mi3008x] - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run 8-rank pip install test - run: | - echo "::group::Running 8-rank test on all GPUs" - if bash .github/scripts/container_exec.sh --gpus "0,1,2,3,4,5,6,7" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - pip install -e . - for test_file in tests/examples/test_*.py; do - echo \"Testing: \$test_file with 8 ranks\" - python tests/run_tests_distributed.py --num_ranks 8 \"\$test_file\" -v --tb=short --durations=10 - done - for test_file in tests/unittests/test_*.py; do - echo \"Testing: \$test_file with 8 ranks\" - python tests/run_tests_distributed.py --num_ranks 8 \"\$test_file\" -v --tb=short --durations=10 - done - "; then - echo "::endgroup::" - echo "✅ 8-rank test passed!" - else - echo "::endgroup::" - echo "::error::8-rank test FAILED" - exit 1 - fi diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml deleted file mode 100644 index 8df639bf..00000000 --- a/.github/workflows/iris-tests-apptainer.yml +++ /dev/null @@ -1,111 +0,0 @@ -name: Iris Tests - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -jobs: - build-container-image: - runs-on: [self-hosted, mi3008x] - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Apptainer (if not available) - run: | - if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then - echo "Neither Apptainer nor Docker found, installing Apptainer..." - apt-get update && apt-get install -y software-properties-common - add-apt-repository -y ppa:apptainer/ppa - apt-get update && apt-get install -y apptainer - else - echo "Container runtime already available" - fi - - - name: Build Iris container - run: | - # Use the universal container build script - bash .github/scripts/container_build.sh - test-1-2-4-ranks: - name: Test 1/2/4 Ranks (Parallel) - needs: build-container-image - runs-on: [self-hosted, mi3008x] - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run 1, 2, 4 rank tests in parallel - run: | - set -e - - # Run tests in parallel with different GPU assignments - # Note: Each test gets 2+ GPUs even if it only uses some of them. - # This allows tests like test_empty_device_handling to verify that - # allocating on a different device correctly raises an error. - - echo "::group::Starting parallel tests" - echo "Starting 1-rank test on GPUs 0,1..." - bash .github/scripts/run_tests.sh 1 "0,1" & - PID1=$! - - echo "Starting 2-rank test on GPUs 2,3..." - bash .github/scripts/run_tests.sh 2 "2,3" & - PID2=$! - - echo "Starting 4-rank test on GPUs 4,5,6,7..." - bash .github/scripts/run_tests.sh 4 "4,5,6,7" & - PID4=$! - echo "::endgroup::" - - # Wait for all parallel tests and track failures - echo "::group::Waiting for parallel tests to complete" - FAIL=0 - FAILED_TESTS="" - - wait $PID1 || { echo "::error::1-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 1-rank"; FAIL=1; } - wait $PID2 || { echo "::error::2-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 2-rank"; FAIL=1; } - wait $PID4 || { echo "::error::4-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 4-rank"; FAIL=1; } - echo "::endgroup::" - - if [ $FAIL -eq 1 ]; then - echo "::error::Parallel tests failed:$FAILED_TESTS" - exit 1 - fi - - echo "✅ All parallel tests (1, 2, 4 ranks) passed!" - - test-8-ranks: - name: Test 8 Ranks - needs: build-container-image - runs-on: [self-hosted, mi3008x] - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run 8-rank test - run: | - set -e - - echo "::group::Running 8-rank test on all GPUs" - bash .github/scripts/run_tests.sh 8 "0,1,2,3,4,5,6,7" - echo "::endgroup::" - - echo "✅ 8-rank test passed!" diff --git a/.github/workflows/iris-tests.yml b/.github/workflows/iris-tests.yml new file mode 100644 index 00000000..1fec194b --- /dev/null +++ b/.github/workflows/iris-tests.yml @@ -0,0 +1,104 @@ +name: Iris Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +env: + DOCKER_IMAGE_NAME: ${{ vars.DOCKER_IMAGE_NAME || 'iris-dev-triton-aafec41' }} + +jobs: + build-container-image: + runs-on: [self-hosted, mi3008x] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Apptainer (if not available) + run: | + if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then + echo "Neither Apptainer nor Docker found, installing Apptainer..." + apt-get update && apt-get install -y software-properties-common + add-apt-repository -y ppa:apptainer/ppa + apt-get update && apt-get install -y apptainer + else + echo "Container runtime already available" + fi + + - name: Build Iris container + run: | + bash .github/scripts/container_build.sh + + test: + name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks) + needs: build-container-image + runs-on: [self-hosted, mi3008x] + strategy: + fail-fast: false + matrix: + include: + # Test each subdirectory with each rank count + - test_dir: examples + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: examples + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: examples + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: examples + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + - test_dir: unittests + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: unittests + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: unittests + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: unittests + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + - test_dir: ccl + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: ccl + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: ccl + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: ccl + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Cleanup lingering ports before tests + run: | + bash .github/scripts/cleanup_ports.sh + + - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks + run: | + set -e + echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks" + bash .github/scripts/run_tests.sh \ + "${{ matrix.test_dir }}" \ + "${{ matrix.num_ranks }}" \ + "${{ matrix.gpu_devices }}" + echo "::endgroup::" + echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks passed!" + From 0a197622d2c5b2fbfff046666045d732bf56defa Mon Sep 17 00:00:00 2001 From: neoblizz Date: Tue, 18 Nov 2025 02:52:49 +0000 Subject: [PATCH 2/5] Use preamble --- tests/ccl/test_all_reduce.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/ccl/test_all_reduce.py b/tests/ccl/test_all_reduce.py index 58f88bab..fd4da1e4 100644 --- a/tests/ccl/test_all_reduce.py +++ b/tests/ccl/test_all_reduce.py @@ -75,7 +75,16 @@ def test_all_reduce(variant, dtype, M, N): config.all_reduce_distribution = 0 # striding if variant == "ring": config.all_reduce_num_rings = min(2, config.comm_sms) - shmem.ccl.all_reduce(iris_output_tensor, iris_input_tensor, config=config) + + # Explicitly call preamble to ensure proper initialization and synchronization + # This helps with test isolation when tests run sequentially + workspace = shmem.ccl.all_reduce_preamble( + iris_output_tensor, iris_input_tensor, config=config + ) + shmem.barrier() # Ensure all ranks have completed preamble before starting kernel + + # Now call all_reduce with the prepared workspace + shmem.ccl.all_reduce(iris_output_tensor, iris_input_tensor, config=config, workspace=workspace) torch.cuda.synchronize() # Compare results @@ -131,7 +140,15 @@ def test_all_reduce_two_shot_distribution(distribution, dtype=torch.float32, M=1 shmem.barrier() config = Config(all_reduce_variant="two_shot", all_reduce_distribution=distribution) - shmem.ccl.all_reduce(iris_output_tensor, iris_input_tensor, config=config) + + # Explicitly call preamble to ensure proper initialization and synchronization + workspace = shmem.ccl.all_reduce_preamble( + iris_output_tensor, iris_input_tensor, config=config + ) + shmem.barrier() # Ensure all ranks have completed preamble before starting kernel + + # Now call all_reduce with the prepared workspace + shmem.ccl.all_reduce(iris_output_tensor, iris_input_tensor, config=config, workspace=workspace) torch.cuda.synchronize() atol = 1e-5 From 4983672dd4cd254e768ba44f8af7129af6ec409f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 18 Nov 2025 03:04:16 +0000 Subject: [PATCH 3/5] Apply Ruff auto-fixes --- tests/ccl/test_all_reduce.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tests/ccl/test_all_reduce.py b/tests/ccl/test_all_reduce.py index fd4da1e4..16501c22 100644 --- a/tests/ccl/test_all_reduce.py +++ b/tests/ccl/test_all_reduce.py @@ -75,14 +75,12 @@ def test_all_reduce(variant, dtype, M, N): config.all_reduce_distribution = 0 # striding if variant == "ring": config.all_reduce_num_rings = min(2, config.comm_sms) - + # Explicitly call preamble to ensure proper initialization and synchronization # This helps with test isolation when tests run sequentially - workspace = shmem.ccl.all_reduce_preamble( - iris_output_tensor, iris_input_tensor, config=config - ) + workspace = shmem.ccl.all_reduce_preamble(iris_output_tensor, iris_input_tensor, config=config) shmem.barrier() # Ensure all ranks have completed preamble before starting kernel - + # Now call all_reduce with the prepared workspace shmem.ccl.all_reduce(iris_output_tensor, iris_input_tensor, config=config, workspace=workspace) torch.cuda.synchronize() @@ -140,13 +138,11 @@ def test_all_reduce_two_shot_distribution(distribution, dtype=torch.float32, M=1 shmem.barrier() config = Config(all_reduce_variant="two_shot", all_reduce_distribution=distribution) - + # Explicitly call preamble to ensure proper initialization and synchronization - workspace = shmem.ccl.all_reduce_preamble( - iris_output_tensor, iris_input_tensor, config=config - ) + workspace = shmem.ccl.all_reduce_preamble(iris_output_tensor, iris_input_tensor, config=config) shmem.barrier() # Ensure all ranks have completed preamble before starting kernel - + # Now call all_reduce with the prepared workspace shmem.ccl.all_reduce(iris_output_tensor, iris_input_tensor, config=config, workspace=workspace) torch.cuda.synchronize() From b49a3e894d80881be79148e7b663034fe2eecfa1 Mon Sep 17 00:00:00 2001 From: neoblizz Date: Sat, 22 Nov 2025 20:15:01 +0000 Subject: [PATCH 4/5] Disable ring-reduce. --- tests/ccl/test_all_reduce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ccl/test_all_reduce.py b/tests/ccl/test_all_reduce.py index 16501c22..32cc4774 100644 --- a/tests/ccl/test_all_reduce.py +++ b/tests/ccl/test_all_reduce.py @@ -16,7 +16,7 @@ "variant", [ "atomic", - "ring", + # "ring", "two_shot", "one_shot", "spinlock", From 9ddd9ebd6c69a27fc7adfcaa2fcbefe287dd2b12 Mon Sep 17 00:00:00 2001 From: neoblizz Date: Wed, 3 Dec 2025 20:29:16 +0000 Subject: [PATCH 5/5] Enable installation methods. --- .github/scripts/run_tests.sh | 35 +++++++- .github/workflows/iris-tests.yml | 150 +++++++++++++++++++++++++++++-- 2 files changed, 174 insertions(+), 11 deletions(-) diff --git a/.github/scripts/run_tests.sh b/.github/scripts/run_tests.sh index 160de790..83c84512 100755 --- a/.github/scripts/run_tests.sh +++ b/.github/scripts/run_tests.sh @@ -3,22 +3,28 @@ # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. # # Run Iris tests in a container -# Usage: run_tests.sh [gpu_devices] +# Usage: run_tests.sh [gpu_devices] [install_method] # test_dir: subdirectory under tests/ (e.g., examples, unittests, ccl) # num_ranks: number of GPU ranks (1, 2, 4, or 8) # gpu_devices: comma-separated GPU device IDs (optional) +# install_method: pip install method - "git", "editable", or "install" (optional, default: "editable") +# - "git": pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} +# - "editable": pip install -e . +# - "install": pip install . set -e TEST_DIR=$1 NUM_RANKS=$2 GPU_DEVICES=${3:-""} +INSTALL_METHOD=${4:-"editable"} if [ -z "$TEST_DIR" ] || [ -z "$NUM_RANKS" ]; then echo "[ERROR] Missing required arguments" - echo "Usage: $0 [gpu_devices]" + echo "Usage: $0 [gpu_devices] [install_method]" echo " test_dir: examples, unittests, or ccl" echo " num_ranks: 1, 2, 4, or 8" + echo " install_method: git, editable, or install (default: editable)" exit 1 fi @@ -28,6 +34,13 @@ if [ ! -d "tests/$TEST_DIR" ]; then exit 1 fi +# Validate install method +if [ "$INSTALL_METHOD" != "git" ] && [ "$INSTALL_METHOD" != "editable" ] && [ "$INSTALL_METHOD" != "install" ]; then + echo "[ERROR] Invalid install_method: $INSTALL_METHOD" + echo " Must be one of: git, editable, install" + exit 1 +fi + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Build GPU argument if provided @@ -36,15 +49,29 @@ if [ -n "$GPU_DEVICES" ]; then GPU_ARG="--gpus $GPU_DEVICES" fi +# Build install command based on method +INSTALL_CMD="" +if [ "$INSTALL_METHOD" = "git" ]; then + # For git install, we need the repository and SHA from environment or use defaults + REPO=${GITHUB_REPOSITORY:-"ROCm/iris"} + SHA=${GITHUB_SHA:-"HEAD"} + INSTALL_CMD="pip install git+https://github.com/${REPO}.git@${SHA}" +elif [ "$INSTALL_METHOD" = "editable" ]; then + INSTALL_CMD="pip install -e ." +elif [ "$INSTALL_METHOD" = "install" ]; then + INSTALL_CMD="pip install ." +fi + # Run tests in container "$SCRIPT_DIR/container_exec.sh" $GPU_ARG " set -e - pip install -e . + echo \"Installing iris using method: $INSTALL_METHOD\" + $INSTALL_CMD # Run tests in the specified directory for test_file in tests/$TEST_DIR/test_*.py; do if [ -f \"\$test_file\" ]; then - echo \"Testing: \$test_file with $NUM_RANKS ranks\" + echo \"Testing: \$test_file with $NUM_RANKS ranks (install: $INSTALL_METHOD)\" python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" -v --tb=short --durations=10 fi done diff --git a/.github/workflows/iris-tests.yml b/.github/workflows/iris-tests.yml index 1fec194b..0e4b4da3 100644 --- a/.github/workflows/iris-tests.yml +++ b/.github/workflows/iris-tests.yml @@ -37,15 +37,15 @@ jobs: run: | bash .github/scripts/container_build.sh - test: - name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks) + test-git: + name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, git install) needs: build-container-image runs-on: [self-hosted, mi3008x] strategy: fail-fast: false matrix: include: - # Test each subdirectory with each rank count + # Test each subdirectory with each rank count using git install - test_dir: examples num_ranks: 1 gpu_devices: "0,1" @@ -91,14 +91,150 @@ jobs: run: | bash .github/scripts/cleanup_ports.sh - - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks + - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (git install) + env: + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_SHA: ${{ github.sha }} run: | set -e - echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks" + echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: git)" bash .github/scripts/run_tests.sh \ "${{ matrix.test_dir }}" \ "${{ matrix.num_ranks }}" \ - "${{ matrix.gpu_devices }}" + "${{ matrix.gpu_devices }}" \ + "git" echo "::endgroup::" - echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks passed!" + echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (git) passed!" + + test-editable: + name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, editable install) + needs: [build-container-image, test-git] + runs-on: [self-hosted, mi3008x] + strategy: + fail-fast: false + matrix: + include: + # Test each subdirectory with each rank count using editable install + - test_dir: examples + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: examples + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: examples + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: examples + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + - test_dir: unittests + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: unittests + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: unittests + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: unittests + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + - test_dir: ccl + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: ccl + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: ccl + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: ccl + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Cleanup lingering ports before tests + run: | + bash .github/scripts/cleanup_ports.sh + + - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (editable install) + run: | + set -e + echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: editable)" + bash .github/scripts/run_tests.sh \ + "${{ matrix.test_dir }}" \ + "${{ matrix.num_ranks }}" \ + "${{ matrix.gpu_devices }}" \ + "editable" + echo "::endgroup::" + echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (editable) passed!" + + test-install: + name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, pip install) + needs: [build-container-image, test-editable] + runs-on: [self-hosted, mi3008x] + strategy: + fail-fast: false + matrix: + include: + # Test each subdirectory with each rank count using pip install + - test_dir: examples + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: examples + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: examples + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: examples + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + - test_dir: unittests + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: unittests + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: unittests + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: unittests + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + - test_dir: ccl + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: ccl + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: ccl + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: ccl + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Cleanup lingering ports before tests + run: | + bash .github/scripts/cleanup_ports.sh + + - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (pip install) + run: | + set -e + echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: install)" + bash .github/scripts/run_tests.sh \ + "${{ matrix.test_dir }}" \ + "${{ matrix.num_ranks }}" \ + "${{ matrix.gpu_devices }}" \ + "install" + echo "::endgroup::" + echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install) passed!"