From e782febc5136de0177f85fb315daf88a7907b6ee Mon Sep 17 00:00:00 2001 From: cdunning Date: Wed, 18 Mar 2026 13:57:33 -0700 Subject: [PATCH 01/10] ci: add GPU test job using self-hosted runners Add a test matrix job that runs on self-hosted GPU runners (AWS EC2 Ampere instances). Tests run inside Docker containers with --gpus all using the pre-built test images from GHCR. Also update all image tags to 2026-03-18 builds which include tileiras 13.2 (adds sm_86 support). --- .github/workflows/ci.yml | 72 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c4328ad..7d5ebc9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,12 +20,16 @@ jobs: name: Define Base Images runs-on: ubuntu-latest outputs: - lint: ghcr.io/nvidia/cutile-python/lint:2026-03-02-d33a8a50c68d - docs: ghcr.io/nvidia/cutile-python/docs:2026-03-02-2ab6fb9d9368 - build_py310: ghcr.io/nvidia/cutile-python/build_py_3.10_x86_64:2026-03-02-c7f3f36001fd - build_py311: ghcr.io/nvidia/cutile-python/build_py_3.11_x86_64:2026-03-02-92c972404358 - build_py312: ghcr.io/nvidia/cutile-python/build_py_3.12_x86_64:2026-03-02-299d123ad082 - build_py313: ghcr.io/nvidia/cutile-python/build_py_3.13_x86_64:2026-03-02-8eea98e968b5 + lint: ghcr.io/nvidia/cutile-python/lint:2026-03-18-3ee906b0ced0 + docs: ghcr.io/nvidia/cutile-python/docs:2026-03-18-67c908a4176e + build_py310: ghcr.io/nvidia/cutile-python/build_py_3.10_x86_64:2026-03-18-a2fdea5320fe + build_py311: ghcr.io/nvidia/cutile-python/build_py_3.11_x86_64:2026-03-18-8573f3996301 + build_py312: ghcr.io/nvidia/cutile-python/build_py_3.12_x86_64:2026-03-18-63835ff03f5d + build_py313: ghcr.io/nvidia/cutile-python/build_py_3.13_x86_64:2026-03-18-9cadab6c475e + test_py310: ghcr.io/nvidia/cutile-python/test_py_3.10_x86_64:2026-03-18-09e8ff4f33de + test_py311: ghcr.io/nvidia/cutile-python/test_py_3.11_x86_64:2026-03-18-0f68d8d46ac4 + test_py312: ghcr.io/nvidia/cutile-python/test_py_3.12_x86_64:2026-03-18-3fe476fda925 + test_py313: ghcr.io/nvidia/cutile-python/test_py_3.13_x86_64:2026-03-18-f40db2451d39 steps: - run: echo "Defining image tags" @@ -117,3 +121,59 @@ jobs: path: dist/*.whl if-no-files-found: error retention-days: 7 + + test: + name: Test (Python ${{ matrix.python-version }}) + needs: [images, build] + runs-on: [self-hosted, gpu] + timeout-minutes: 60 + strategy: + matrix: + include: + - python-version: "3.10" + image_key: test_py310 + - python-version: "3.11" + image_key: test_py311 + - python-version: "3.12" + image_key: test_py312 + - python-version: "3.13" + image_key: test_py313 + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Download wheel + uses: actions/download-artifact@v4 + with: + name: wheel-py${{ matrix.python-version }}-linux-x86_64 + path: dist/ + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Pull test image + run: docker pull ${{ needs.images.outputs[matrix.image_key] }} + + - name: Run tests + run: | + docker run --rm --gpus all \ + -v "${{ github.workspace }}":/workspace \ + -w /workspace \ + ${{ needs.images.outputs[matrix.image_key] }} \ + bash -c "pip install dist/*.whl && \ + pytest --ignore internal \ + -m 'not benchmark and not use_mlir' \ + --durations=10 \ + --junitxml=/workspace/test-results.xml" + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-py${{ matrix.python-version }} + path: test-results.xml + retention-days: 7 From 2434e12c891d0f7ba056289f37ea4aeba044ac55 Mon Sep 17 00:00:00 2001 From: cdunning Date: Wed, 18 Mar 2026 14:28:19 -0700 Subject: [PATCH 02/10] ci: fix workspace permissions after docker test runs Docker containers run as root, so files created during tests (e.g. .pytest_cache) are root-owned. Subsequent jobs on the same runner fail when actions/checkout tries to clean the workspace. Fix by restoring ownership after each test run. --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7d5ebc9..2fd0a0f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -177,3 +177,7 @@ jobs: name: test-results-py${{ matrix.python-version }} path: test-results.xml retention-days: 7 + + - name: Fix workspace permissions + if: always() + run: sudo chown -R $(id -u):$(id -g) "${{ github.workspace }}" From 48aef3af46b5a9bbe5ecd9e8009315d050dda0e6 Mon Sep 17 00:00:00 2001 From: cdunning Date: Wed, 18 Mar 2026 16:00:28 -0700 Subject: [PATCH 03/10] ci: add dorny/test-reporter for JUnit results in GitHub UI --- .github/workflows/ci.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2fd0a0f..787d2ef 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -178,6 +178,14 @@ jobs: path: test-results.xml retention-days: 7 + - name: Report test results + if: always() + uses: dorny/test-reporter@v2 + with: + name: Test Results (Python ${{ matrix.python-version }}) + path: test-results.xml + reporter: java-junit + - name: Fix workspace permissions if: always() run: sudo chown -R $(id -u):$(id -g) "${{ github.workspace }}" From 4ebcc25f802c11c8c047660b36c3e3eb1cf83c0d Mon Sep 17 00:00:00 2001 From: cdunning Date: Wed, 25 Mar 2026 11:03:09 -0700 Subject: [PATCH 04/10] ci: use native container directive for GPU test jobs Replace manual docker run/mount/permission-fix workflow with GitHub Actions' built-in container: directive and --gpus all via options. --- .github/workflows/ci.yml | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 787d2ef..61bc634 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -138,6 +138,12 @@ jobs: image_key: test_py312 - python-version: "3.13" image_key: test_py313 + container: + image: ${{ needs.images.outputs[matrix.image_key] }} + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --gpus all steps: - name: Checkout repository uses: actions/checkout@v6 @@ -148,27 +154,15 @@ jobs: name: wheel-py${{ matrix.python-version }}-linux-x86_64 path: dist/ - - name: Log in to GHCR - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Pull test image - run: docker pull ${{ needs.images.outputs[matrix.image_key] }} + - name: Install wheel + run: pip install dist/*.whl - name: Run tests run: | - docker run --rm --gpus all \ - -v "${{ github.workspace }}":/workspace \ - -w /workspace \ - ${{ needs.images.outputs[matrix.image_key] }} \ - bash -c "pip install dist/*.whl && \ - pytest --ignore internal \ - -m 'not benchmark and not use_mlir' \ - --durations=10 \ - --junitxml=/workspace/test-results.xml" + pytest --ignore internal \ + -m 'not benchmark and not use_mlir' \ + --durations=10 \ + --junitxml=test-results.xml - name: Upload test results if: always() @@ -185,7 +179,3 @@ jobs: name: Test Results (Python ${{ matrix.python-version }}) path: test-results.xml reporter: java-junit - - - name: Fix workspace permissions - if: always() - run: sudo chown -R $(id -u):$(id -g) "${{ github.workspace }}" From 0febd8eec6efc5d946a979e36bae49311ce16149 Mon Sep 17 00:00:00 2001 From: cdunning Date: Wed, 25 Mar 2026 11:06:38 -0700 Subject: [PATCH 05/10] ci: bump actions to Node.js 24 versions - actions/upload-artifact v4 -> v6 - actions/download-artifact v4 -> v7 - dorny/test-reporter v2 -> v3 --- .github/workflows/ci.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 61bc634..82b2003 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -71,7 +71,7 @@ jobs: uses: actions/checkout@v6 - name: Download wheel - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v7 with: name: wheel-py3.12-linux-x86_64 path: dist/ @@ -83,7 +83,7 @@ jobs: run: make -C docs html - name: Upload docs artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: docs-html path: docs/build/html @@ -115,7 +115,7 @@ jobs: run: python setup.py bdist_wheel - name: Upload wheel artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: wheel-py${{ matrix.python-version }}-linux-x86_64 path: dist/*.whl @@ -149,7 +149,7 @@ jobs: uses: actions/checkout@v6 - name: Download wheel - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v7 with: name: wheel-py${{ matrix.python-version }}-linux-x86_64 path: dist/ @@ -166,7 +166,7 @@ jobs: - name: Upload test results if: always() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: test-results-py${{ matrix.python-version }} path: test-results.xml @@ -174,7 +174,7 @@ jobs: - name: Report test results if: always() - uses: dorny/test-reporter@v2 + uses: dorny/test-reporter@v3 with: name: Test Results (Python ${{ matrix.python-version }}) path: test-results.xml From 6ea74dd0bff6cdb73a69d622313b53936b9c9d20 Mon Sep 17 00:00:00 2001 From: cdunning Date: Wed, 25 Mar 2026 11:14:40 -0700 Subject: [PATCH 06/10] ci: harden workflow with least-privilege permissions and best practices - Add workflow-level permissions (contents: read, packages: read) - Add checks: write to test job for dorny/test-reporter - Add fail-fast: false to build and test matrices - Replace if: always() with if: !cancelled() on test result steps --- .github/workflows/ci.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 82b2003..2973267 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,6 +15,10 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +permissions: + contents: read + packages: read + jobs: images: name: Define Base Images @@ -95,6 +99,7 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 30 strategy: + fail-fast: false matrix: include: - python-version: "3.10" @@ -127,7 +132,12 @@ jobs: needs: [images, build] runs-on: [self-hosted, gpu] timeout-minutes: 60 + permissions: + contents: read + packages: read + checks: write strategy: + fail-fast: false matrix: include: - python-version: "3.10" @@ -165,7 +175,7 @@ jobs: --junitxml=test-results.xml - name: Upload test results - if: always() + if: ${{ !cancelled() }} uses: actions/upload-artifact@v6 with: name: test-results-py${{ matrix.python-version }} @@ -173,7 +183,7 @@ jobs: retention-days: 7 - name: Report test results - if: always() + if: ${{ !cancelled() }} uses: dorny/test-reporter@v3 with: name: Test Results (Python ${{ matrix.python-version }}) From cf3c138d56fefd2268ccbe2174d3a20d0cf5297f Mon Sep 17 00:00:00 2001 From: cdunning Date: Wed, 25 Mar 2026 12:36:28 -0700 Subject: [PATCH 07/10] ci: update test images with git, simplify pytest command - Update test image tags to include git (needed by dorny/test-reporter) - Remove --ignore internal (no internal folder in OSS) - Remove -m 'not benchmark and not use_mlir' (run all tests, use_mlir auto-skips) --- .github/workflows/ci.yml | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2973267..39d8727 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,10 +30,10 @@ jobs: build_py311: ghcr.io/nvidia/cutile-python/build_py_3.11_x86_64:2026-03-18-8573f3996301 build_py312: ghcr.io/nvidia/cutile-python/build_py_3.12_x86_64:2026-03-18-63835ff03f5d build_py313: ghcr.io/nvidia/cutile-python/build_py_3.13_x86_64:2026-03-18-9cadab6c475e - test_py310: ghcr.io/nvidia/cutile-python/test_py_3.10_x86_64:2026-03-18-09e8ff4f33de - test_py311: ghcr.io/nvidia/cutile-python/test_py_3.11_x86_64:2026-03-18-0f68d8d46ac4 - test_py312: ghcr.io/nvidia/cutile-python/test_py_3.12_x86_64:2026-03-18-3fe476fda925 - test_py313: ghcr.io/nvidia/cutile-python/test_py_3.13_x86_64:2026-03-18-f40db2451d39 + test_py310: ghcr.io/nvidia/cutile-python/test_py_3.10_x86_64:2026-03-25-d688c40b1f28 + test_py311: ghcr.io/nvidia/cutile-python/test_py_3.11_x86_64:2026-03-25-ee977c750e6a + test_py312: ghcr.io/nvidia/cutile-python/test_py_3.12_x86_64:2026-03-25-4a28b7ac9c10 + test_py313: ghcr.io/nvidia/cutile-python/test_py_3.13_x86_64:2026-03-25-daa77b7df120 steps: - run: echo "Defining image tags" @@ -168,11 +168,7 @@ jobs: run: pip install dist/*.whl - name: Run tests - run: | - pytest --ignore internal \ - -m 'not benchmark and not use_mlir' \ - --durations=10 \ - --junitxml=test-results.xml + run: pytest --durations=10 --junitxml=test-results.xml . - name: Upload test results if: ${{ !cancelled() }} From f20f535eacc876b7564ff650ff56d6783eeb991c Mon Sep 17 00:00:00 2001 From: cdunning Date: Wed, 25 Mar 2026 13:22:32 -0700 Subject: [PATCH 08/10] ci: separate benchmark tests into dedicated job - Exclude benchmarks from the regular test job with -m "not benchmark" to prevent GPU OOM from large tensor allocations competing with parallel test jobs - Add a dedicated benchmark job (Python 3.10 only, continue-on-error) mirroring the GitLab CI pattern - Add git safe.directory config to the test job to fix dorny/test-reporter failing with git exit code 128 in Docker containers Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 56 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 39d8727..8f32bc9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -158,6 +158,9 @@ jobs: - name: Checkout repository uses: actions/checkout@v6 + - name: Mark workspace as safe directory + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + - name: Download wheel uses: actions/download-artifact@v7 with: @@ -168,7 +171,7 @@ jobs: run: pip install dist/*.whl - name: Run tests - run: pytest --durations=10 --junitxml=test-results.xml . + run: pytest --durations=10 --junitxml=test-results.xml -m "not benchmark" . - name: Upload test results if: ${{ !cancelled() }} @@ -185,3 +188,54 @@ jobs: name: Test Results (Python ${{ matrix.python-version }}) path: test-results.xml reporter: java-junit + + benchmark: + name: Benchmark + needs: [images, build] + runs-on: [self-hosted, gpu] + timeout-minutes: 60 + continue-on-error: true + permissions: + contents: read + packages: read + checks: write + container: + image: ${{ needs.images.outputs.test_py310 }} + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --gpus all + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Mark workspace as safe directory + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + + - name: Download wheel + uses: actions/download-artifact@v7 + with: + name: wheel-py3.10-linux-x86_64 + path: dist/ + + - name: Install wheel + run: pip install dist/*.whl + + - name: Run benchmarks + run: pytest --durations=10 --junitxml=benchmark-results.xml -m "benchmark" . + + - name: Upload benchmark results + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v6 + with: + name: benchmark-results + path: benchmark-results.xml + retention-days: 7 + + - name: Report benchmark results + if: ${{ !cancelled() }} + uses: dorny/test-reporter@v3 + with: + name: Benchmark Results + path: benchmark-results.xml + reporter: java-junit From 88f0073c5839a3c55784c04b8ab823338d33ec47 Mon Sep 17 00:00:00 2001 From: cdunning Date: Wed, 25 Mar 2026 13:28:38 -0700 Subject: [PATCH 09/10] ci: run benchmark job after test jobs to avoid GPU contention Benchmark was running in parallel with all 4 test jobs, putting 5 concurrent GPU workloads on the same runner and causing OOM for everything. Sequencing benchmark after test ensures it gets the GPU to itself. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8f32bc9..8b28199 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -191,7 +191,7 @@ jobs: benchmark: name: Benchmark - needs: [images, build] + needs: [images, build, test] runs-on: [self-hosted, gpu] timeout-minutes: 60 continue-on-error: true From 88951f1def679b8958a29528afcc591788e02e31 Mon Sep 17 00:00:00 2001 From: cdunning Date: Wed, 25 Mar 2026 13:47:20 -0700 Subject: [PATCH 10/10] ci: set fail-on-error=false on test reporters The dorny/test-reporter steps were failing the job a second time when tests failed. The pytest step is the authoritative failure signal; the reporter is for display only. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8b28199..c37efb5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -188,6 +188,7 @@ jobs: name: Test Results (Python ${{ matrix.python-version }}) path: test-results.xml reporter: java-junit + fail-on-error: false benchmark: name: Benchmark @@ -239,3 +240,4 @@ jobs: name: Benchmark Results path: benchmark-results.xml reporter: java-junit + fail-on-error: false