From 94fd77194361f396888f8f07d5627340eb495a42 Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Tue, 24 Mar 2026 21:33:45 -0700
Subject: [PATCH 1/2] fix(ci): shrink CUDA wheel fatbins

---
 .github/workflows/build-wheels-cuda.yaml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml
index b8d6c9dce..17daaa12a 100644
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@@ -153,9 +153,10 @@ jobs:
           }
           $cudaTagVersion = $nvccVersion.Replace('.','')
           $env:VERBOSE = '1'
-          # Keep a portable SM set, including sm_70, instead of CMake's `all`,
-          # which now pulls in future targets the hosted-runner toolchains cannot assemble.
-          $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70;75;80;86;89;90 -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS"
+          # Build real cubins for each supported GPU, including sm_70, and embed a
+          # single forward-compatible PTX target (compute_90) instead of PTX for
+          # every SM. This keeps the wheel under GitHub's 2 GiB release-asset limit.
+          $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70-real;75-real;80-real;86-real;89-real;90-real;90-virtual -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS"
           # if ($env:AVXVER -eq 'AVX') {
           $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
           # }

From 36d167517e3a2462be5138d453830873781c9a1b Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Wed, 25 Mar 2026 01:55:56 -0700
Subject: [PATCH 2/2] docs: update changelog for cuda wheel size fix

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4118f4848..f4a0b55d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix(ci): Shrink CUDA wheel fatbins so CUDA releases stay under GitHub's 2 GiB release-asset size limit by @abetlen in #2158
+
 ## [0.3.18]
 
 - feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143