From 94fd77194361f396888f8f07d5627340eb495a42 Mon Sep 17 00:00:00 2001 From: abetlen Date: Tue, 24 Mar 2026 21:33:45 -0700 Subject: [PATCH 1/2] fix(ci): shrink CUDA wheel fatbins --- .github/workflows/build-wheels-cuda.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index b8d6c9dce..17daaa12a 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -153,9 +153,10 @@ jobs: } $cudaTagVersion = $nvccVersion.Replace('.','') $env:VERBOSE = '1' - # Keep a portable SM set, including sm_70, instead of CMake's `all`, - # which now pulls in future targets the hosted-runner toolchains cannot assemble. - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70;75;80;86;89;90 -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS" + # Build real cubins for the supported GPUs, including sm_70, and keep + # one forward-compatible PTX target instead of embedding PTX for every + # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit. + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS" # if ($env:AVXVER -eq 'AVX') { $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' # } From 36d167517e3a2462be5138d453830873781c9a1b Mon Sep 17 00:00:00 2001 From: abetlen Date: Wed, 25 Mar 2026 01:55:56 -0700 Subject: [PATCH 2/2] docs: update changelog for cuda wheel size fix --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4118f4848..f4a0b55d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix(ci): Shrink CUDA wheel fatbins so CUDA releases stay under GitHub's asset size limit by @abetlen in #2158 + ## [0.3.18] - feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143