Merged
2 changes: 1 addition & 1 deletion .ci/docker/build.sh
@@ -81,7 +81,7 @@ case "${IMAGE_NAME}" in
LINTRUNNER=""
GCC_VERSION=11
CUDA_WINDOWS_CROSS_COMPILE=yes
CUDA_VERSION=12.6
CUDA_VERSION=12.8
SKIP_PYTORCH=yes
;;
*)
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
358117c166b75167a09bca81ac9925940feda339
release/2.11
Copilot AI Apr 27, 2026

Pinning to a moving ref (release/2.11) makes builds less reproducible and also conflicts with the existing pin-bump automation (.github/scripts/update_pytorch_pin.py), which writes a commit SHA into this file. Consider pinning to a specific commit hash (or at least a tag), and updating or pausing the automation if the intent is to stay on a stable release branch long-term.

Suggested change
release/2.11
v2.11.0

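If staying on the release branch is the intent, one reproducible alternative (a sketch, not something this PR does) is to resolve the branch head to a commit SHA at bump time and write that into the pin file:

# Resolve the current head of release/2.11 to a commit SHA and pin that
# instead; the update_pytorch_pin.py automation could be taught to do the same.
SHA=$(git ls-remote https://github.com/pytorch/pytorch.git refs/heads/release/2.11 | cut -f1)
echo "${SHA}" > .ci/docker/ci_commit_pins/pytorch.txt
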
10 changes: 4 additions & 6 deletions .ci/docker/common/install_cuda_windows_cross_compile.sh
@@ -11,13 +11,12 @@ set -ex

INSTALL_DIR="${WINDOWS_CUDA_INSTALL_DIR:-/opt/cuda-windows}"

# Mapping of CUDA versions to their corresponding driver versions for Windows installers.
# Mapping of CUDA versions to their corresponding driver versions for Windows installers
# Source: https://developer.nvidia.com/cuda-toolkit-archive
# Format: "PATCH_VERSION:DRIVER_VERSION". Starting with CUDA 13.0, NVIDIA dropped the
# driver suffix from the Windows installer filename, so the driver field is empty.
declare -A CUDA_DRIVER_MAP=(
["12.6"]="12.6.3:561.17"
["13.0"]="13.0.3:"
["12.8"]="12.8.1:572.61"
["12.9"]="12.9.1:576.57"
)

install_mingw() {
@@ -84,8 +83,7 @@ install_windows_cuda() {
mkdir -p "${INSTALL_DIR}"
cd "${INSTALL_DIR}"

# CUDA 13.0+ installers no longer include the driver version in the filename.
CUDA_INSTALLER="cuda_${CUDA_VERSION}${CUDA_DRIVER_VERSION:+_${CUDA_DRIVER_VERSION}}_windows.exe"
CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
CUDA_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_INSTALLER}"

# Check if already downloaded and extracted
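For context on the two hunks above, a minimal sketch of how a map entry such as "12.8.1:572.61" drives the installer filename (variable names assumed; the real parsing happens in a collapsed part of this script). The removed ${...:+...} expansion is what appended the driver suffix only when one was present:

declare -A CUDA_DRIVER_MAP=(["12.8"]="12.8.1:572.61")
# Split "PATCH_VERSION:DRIVER_VERSION" on the colon.
IFS=: read -r CUDA_PATCH_VERSION CUDA_DRIVER_VERSION <<< "${CUDA_DRIVER_MAP["12.8"]}"
# ${CUDA_DRIVER_VERSION:+_${CUDA_DRIVER_VERSION}} expands to "_572.61" here and
# to nothing when the driver field is empty, as it was for CUDA 13.0+.
CUDA_INSTALLER="cuda_${CUDA_PATCH_VERSION}${CUDA_DRIVER_VERSION:+_${CUDA_DRIVER_VERSION}}_windows.exe"
echo "${CUDA_INSTALLER}"  # cuda_12.8.1_572.61_windows.exe
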
9 changes: 2 additions & 7 deletions .ci/docker/common/install_pytorch.sh
@@ -27,19 +27,14 @@ install_pytorch_and_domains() {
chown -R ci-user .

export _GLIBCXX_USE_CXX11_ABI=1
# PyTorch's FindARM.cmake hard-fails when the SVE+BF16 compile probe
# doesn't pass — gcc-11 in this image is too old to accept the combined
# NEON/SVE/bfloat16 intrinsics the probe exercises. Executorch's aarch64
# runtime targets (phones, embedded) don't use SVE, so bypass the check.
export BUILD_IGNORE_SVE_UNAVAILABLE=1
# Then build and install PyTorch
conda_run python setup.py bdist_wheel
pip_install "$(echo dist/*.whl)"

# Grab the pinned audio and vision commits from PyTorch
TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
TORCHAUDIO_VERSION=release/2.11
export TORCHAUDIO_VERSION
TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt)
TORCHVISION_VERSION=release/0.26
export TORCHVISION_VERSION

install_domains
2 changes: 1 addition & 1 deletion .ci/scripts/test_model_e2e.sh
@@ -260,7 +260,7 @@ if [ "$AUDIO_URL" != "" ]; then
elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then
conda install -y -c conda-forge "ffmpeg<8"
pip install datasets soundfile
pip install torchcodec==0.12.0.dev20260409 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
pip install torchcodec==0.11.0 --extra-index-url https://download.pytorch.org/whl/test/cpu
python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
fi

22 changes: 11 additions & 11 deletions .ci/scripts/test_wheel_package_qnn.sh
@@ -158,17 +158,17 @@ print(module_vars["TORCH_VERSION"])
PY
)

NIGHTLY_VERSION=$(
"$PYBIN" - <<'PY'
import runpy
module_vars = runpy.run_path("torch_pin.py")
print(module_vars["NIGHTLY_VERSION"])
PY
)
echo "=== [$LABEL] Install torch==${TORCH_VERSION}.${NIGHTLY_VERSION} ==="

# Install torchao based on the pinned PyTorch version
"$PIPBIN" install torch=="${TORCH_VERSION}.${NIGHTLY_VERSION}" --index-url "https://download.pytorch.org/whl/nightly/cpu"
# NIGHTLY_VERSION=$(
# "$PYBIN" - <<'PY'
# import runpy
# module_vars = runpy.run_path("torch_pin.py")
# print(module_vars["NIGHTLY_VERSION"])
# PY
# )
echo "=== [$LABEL] Install torch==${TORCH_VERSION} ==="

# Install torch based on the pinned PyTorch version, preferring the PyTorch test index
"$PIPBIN" install torch=="${TORCH_VERSION}" --extra-index-url "https://download.pytorch.org/whl/test"
Comment on lines +170 to +171
Copilot AI Apr 27, 2026

--extra-index-url https://download.pytorch.org/whl/test does not “prefer” the test index: pip gives extra indexes no priority over the primary index (PyPI), so a same-version wheel can still be resolved from PyPI. Also, other scripts use the per-platform subindex (e.g. .../whl/test/cpu or .../whl/test/cu126), and .../whl/test may not be the correct simple index for these wheels. If the goal is to force the test index, use --index-url with the correct subindex (and optionally add PyPI as an extra index), or update the comment to match the actual behavior.

Suggested change
# Install torch based on the pinned PyTorch version, preferring the PyTorch test index
"$PIPBIN" install torch=="${TORCH_VERSION}" --extra-index-url "https://download.pytorch.org/whl/test"
# Install torch from the PyTorch test CPU wheel index, while allowing PyPI for dependencies
"$PIPBIN" install torch=="${TORCH_VERSION}" --index-url "https://download.pytorch.org/whl/test/cpu" --extra-index-url "https://pypi.org/simple"

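To make the distinction concrete, a sketch of the two invocations (torch version taken from this PR; index behavior as documented by pip):

# With --extra-index-url, pip gives PyPI and the extra index equal standing,
# so a same-version wheel may still come from PyPI:
pip install torch==2.11.0 --extra-index-url https://download.pytorch.org/whl/test
# To actually force the test index, make it primary and keep PyPI for deps:
pip install torch==2.11.0 --index-url https://download.pytorch.org/whl/test/cpu \
    --extra-index-url https://pypi.org/simple
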
"$PIPBIN" install wheel

# Install torchao based on the pinned commit from third-party/ao submodule
Expand Down
6 changes: 3 additions & 3 deletions .ci/scripts/utils.sh
@@ -53,7 +53,7 @@ dedupe_macos_loader_path_rpaths() {
pushd ..
torch_lib_dir=$(python -c "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])")/lib
popd

if [[ -z "${torch_lib_dir}" || ! -d "${torch_lib_dir}" ]]; then
return
fi
@@ -141,9 +141,9 @@ install_pytorch_and_domains() {

dedupe_macos_loader_path_rpaths
# Grab the pinned audio and vision commits from PyTorch
TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
TORCHAUDIO_VERSION=release/2.11
export TORCHAUDIO_VERSION
TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt)
TORCHVISION_VERSION=release/0.26
export TORCHVISION_VERSION

install_domains
8 changes: 4 additions & 4 deletions .github/workflows/cuda-windows.yml
@@ -64,7 +64,7 @@ jobs:
secrets-env: EXECUTORCH_HF_TOKEN
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
gpu-arch-version: 12.8
docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
submodules: recursive
upload-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
@@ -146,7 +146,7 @@ jobs:
timeout: 240
runner: windows.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
gpu-arch-version: 12.8
download-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
@@ -158,7 +158,7 @@ jobs:
\$ErrorActionPreference = 'Stop'
\$PSNativeCommandUseErrorActionPreference = \$true

\$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6'
\$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8'
\$env:CUDA_PATH = \$env:CUDA_HOME
\$env:PATH = \"\$env:CUDA_HOME\bin;\$env:PATH\"
nvcc --version
@@ -169,5 +169,5 @@ jobs:
throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.'
}

.ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.6'
.ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.8'
}"
6 changes: 3 additions & 3 deletions .github/workflows/cuda.yml
@@ -1,6 +1,6 @@
# Test ExecuTorch CUDA Build Compatibility
# This workflow tests whether ExecuTorch can be successfully built with CUDA support
# across different CUDA versions (12.6, 13.0) using the command:
# across different CUDA versions (12.6, 12.8, 12.9, 13.0) using the command:
# ./install_executorch.sh
#
# Note: ExecuTorch automatically detects the system CUDA version using nvcc and
@@ -31,7 +31,7 @@ jobs:
strategy:
fail-fast: false
matrix:
cuda-version: ["12.6", "13.0"]
cuda-version: ["12.6", "12.8", "12.9", "13.0"]

name: test-executorch-cuda-build-${{ matrix.cuda-version }}
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@@ -66,7 +66,7 @@ jobs:
echo "CUDA build results: ${{ needs.test-cuda-builds.result }}"
exit 1
else
echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 13.0) completed successfully!"
echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9, 13.0) completed successfully!"
fi

test-models-cuda:
7 changes: 2 additions & 5 deletions .github/workflows/docker-builds.yml
@@ -33,20 +33,17 @@ jobs:
matrix:
runner: [linux.4xlarge]
docker-image-name: [
executorch-ubuntu-22.04-gcc11,
executorch-ubuntu-22.04-gcc9-nopytorch,
Comment on lines 33 to 37
Copilot AI Apr 27, 2026

The default runner for this matrix is linux.4xlarge, and this list now includes executorch-ubuntu-22.04-gcc11. That image builds PyTorch from source during the Docker build (SKIP_PYTORCH is not set for gcc11), which is very memory-intensive and has previously required larger runners to avoid OOM. Consider restoring an include: override to run the gcc11 image on a larger runner (or skipping the PyTorch-from-source build in this image).

executorch-ubuntu-22.04-clang12,
executorch-ubuntu-22.04-linter,
executorch-ubuntu-22.04-arm-sdk,
executorch-ubuntu-22.04-zephyr-sdk,
executorch-ubuntu-22.04-qnn-sdk,
executorch-ubuntu-22.04-mediatek-sdk,
Comment on lines 39 to 43
Copilot AI Apr 27, 2026

executorch-ubuntu-22.04-zephyr-sdk is also on the default linux.4xlarge runner now, but it builds PyTorch from source during the Docker build (SKIP_PYTORCH is unset for zephyr-sdk). This is likely to OOM on 4xlarge and fail the docker-build workflow. Consider restoring a per-image runner override (e.g. linux.12xlarge) for this image, or otherwise avoiding the from-source PyTorch build here.

executorch-ubuntu-22.04-clang12-android
]
include:
# PyTorch is built from source in these images; 4xlarge OOMs mid-build.
- docker-image-name: executorch-ubuntu-22.04-gcc11
runner: linux.12xlarge
- docker-image-name: executorch-ubuntu-22.04-zephyr-sdk
runner: linux.12xlarge
- docker-image-name: executorch-ubuntu-22.04-gcc11-aarch64
runner: linux.arm64.2xlarge
- docker-image-name: executorch-ubuntu-22.04-gcc11-aarch64-android
2 changes: 1 addition & 1 deletion examples/models/moshi/mimi/install_requirements.sh
@@ -8,7 +8,7 @@
set -x

sudo apt install ffmpeg -y
pip install torchcodec==0.12.0.dev20260409 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
pip install torchcodec==0.11.0 --extra-index-url https://download.pytorch.org/whl/test/cpu
pip install moshi==0.2.11
pip install bitsandbytes soundfile einops
# Run llama2/install requirements for torchao deps
26 changes: 6 additions & 20 deletions install_requirements.py
@@ -12,11 +12,9 @@

from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible

from torch_pin import NIGHTLY_VERSION, TORCH_VERSION

# The pip repository that hosts nightly torch packages.
# This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled.
TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly"
TORCH_URL_BASE = "https://download.pytorch.org/whl/test"

# Since ExecuTorch often uses main-branch features of pytorch, only the nightly
# pip versions will have the required features.
@@ -44,18 +42,14 @@ def install_requirements(use_pytorch_nightly):
sys.exit(1)

# Determine the appropriate PyTorch URL based on CUDA delegate status
torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE)
torch_url = determine_torch_url(TORCH_URL_BASE)

# pip packages needed by exir.
TORCH_PACKAGE = [
# Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note
# that we don't need to set any version number there because they have already
# been installed on CI before this step, so pip won't reinstall them
(
f"torch=={TORCH_VERSION}.{NIGHTLY_VERSION}"
if use_pytorch_nightly
else "torch"
),
("torch==2.11.0" if use_pytorch_nightly else "torch"),
]
Comment on lines 49 to 53
Copilot AI Apr 27, 2026

This hard-codes the torch version string in two places (torch_pin.py and here). That duplication can drift over time and create inconsistent local vs CI installs. Consider importing/reading TORCH_VERSION from torch_pin.py and using it here instead of repeating 2.11.0.

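A sketch of reading the pin from torch_pin.py instead of repeating it, mirroring the heredoc pattern used in test_wheel_package_qnn.sh above (assumes torch_pin.py still defines TORCH_VERSION):

# Pull the single-source pin out of torch_pin.py via runpy.
TORCH_VERSION=$(python - <<'PY'
import runpy
module_vars = runpy.run_path("torch_pin.py")
print(module_vars["TORCH_VERSION"])
PY
)
pip install "torch==${TORCH_VERSION}" --extra-index-url https://download.pytorch.org/whl/test
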

# Install the requirements for core ExecuTorch package.
@@ -114,20 +108,12 @@ def install_requirements(use_pytorch_nightly):

def install_optional_example_requirements(use_pytorch_nightly):
# Determine the appropriate PyTorch URL based on CUDA delegate status
torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE)
torch_url = determine_torch_url(TORCH_URL_BASE)

print("Installing torch domain libraries")
DOMAIN_LIBRARIES = [
(
f"torchvision==0.27.0.{NIGHTLY_VERSION}"
if use_pytorch_nightly
else "torchvision"
),
(
f"torchaudio==2.11.0.{NIGHTLY_VERSION}"
if use_pytorch_nightly
else "torchaudio"
),
("torchvision==0.26.0" if use_pytorch_nightly else "torchvision"),
("torchaudio==2.11.0" if use_pytorch_nightly else "torchaudio"),
]
Comment on lines 113 to 117
Copilot AI Apr 27, 2026

These domain library versions are hard-coded here, which can drift from whatever torch/CI pin is intended and makes upgrades easy to miss. Consider centralizing the torchvision/torchaudio pins alongside the torch pin (or reusing the existing pin mechanism) so version bumps are consistent across scripts.

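One possible shape for that centralization (file name and layout hypothetical, not part of this PR): keep all three pins in one file that both the CI shell scripts and install_requirements.py read. Versions below are the ones this PR hard-codes:

# Hypothetical single source of truth, e.g. ci_pins.sh:
TORCH_VERSION=2.11.0
TORCHVISION_VERSION=0.26.0
TORCHAUDIO_VERSION=2.11.0

# Shell consumers source it; Python can parse the same KEY=VALUE lines.
source ci_pins.sh
pip install "torchvision==${TORCHVISION_VERSION}" "torchaudio==${TORCHAUDIO_VERSION}" \
  --extra-index-url https://download.pytorch.org/whl/test/cpu
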
# Then install domain libraries
subprocess.run(
79 changes: 16 additions & 63 deletions runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
@@ -325,88 +325,41 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
#define C10_HIP_HOST_DEVICE
#endif

#if defined(USE_ROCM)
// C10_WARP_SIZE is only allowed for device code.
// Host code dynamically-sized launch configs _must_ use at::cuda::warp_size().
// Host or device statically-sized arrays _must_ use either
// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed.
//
// Host code _must_ use at::cuda::warp_size()
// HIP header used to define warpSize as a constexpr that was either 32 or 64
// depending on the target device, and then always set it to 64 for host code.
// For a time, that allowed C10_WARP_SIZE to be defined like so:
//
// #ifdef USE_ROCM
// #define C10_WARP_SIZE warpSize
// #else
// #define C10_WARP_SIZE 32
// #endif
//
// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior.
// We can now only use warpSize for C10_WARP_SIZE in device code and this is
// enforced by using __device__ in its definition. In host code where
// C10_WARP_SIZE was previously used as a compile-time constant, this will now
// cause a compile-time error.
//
// If an array was previously expected to be sized at compile-time using
// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or
// C10_WARP_SIZE_LOWER_BOUND depending on the situation.
//
// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users
// must now use at::cuda::warp_size() for the dynamic runtime query.
//
// Unfortunately, C10_WARP_SIZE has been public and available for both host and
// device since approximately 2019, so forcing it to be device-only would break
// existing code in the wild.
#if defined(USE_ROCM)
// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
// set it to something unreasonable to trigger obvious host code errors.

namespace at::cuda {
TORCH_CUDA_CPP_API int warp_size();
}
#if defined(__HIPCC__)
static __host__ inline int C10_WARP_SIZE_INTERNAL() {
#ifdef __HIPCC__
static inline int __host__ C10_WARP_SIZE_INTERNAL() {
return at::cuda::warp_size();
}
// NOTE: __device__ C10_WARP_SIZE_INTERNAL
// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__,
// we can use constexpr. This matches prior behavior. We preserve this for
// backward compatibility instead of forcing old code to use dynamic warpSize
// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv
// could expose where C10_WARP_SIZE was used incorrectly where the dynamic
// warpSize is not allowed.
#if defined(__SPIRV__)
static __device__ inline int C10_WARP_SIZE_INTERNAL() {
return warpSize;
}
#else // __SPIRV__
static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() {

static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() {
#if defined(__GFX9__)
return 64;
#else // __GFX9__
return 32;
#endif // __GFX9__
}
#endif // __SPIRV__
#if defined(__SPIRV__)
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 64
#elif defined(__GFX9__)
#define C10_WARP_SIZE_LOWER_BOUND 64
#define C10_WARP_SIZE_UPPER_BOUND 64
#else
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 32
#endif
#else // !__HIPCC__
#else // __HIPCC__
static inline int C10_WARP_SIZE_INTERNAL() {
return at::cuda::warp_size();
}
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 64
#endif // __HIPCC__

#define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
#else // !USE_ROCM
#define C10_WARP_SIZE_STATIC 64

#else // defined(USE_ROCM)
#define C10_WARP_SIZE 32
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 32
#endif // USE_ROCM
#endif

#if defined(_MSC_VER) && _MSC_VER <= 1900
#define __func__ __FUNCTION__
@@ -676,7 +629,7 @@ __host__ __device__
// This macro is used to find older C++ compilers
// that don't support move optimization for return values.

#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \
#if (defined(__GNUC__) && __GNUC__ < 13) || \
(defined(__clang_major__) && __clang_major__ < 13)
#define C10_RETURN_MOVE_IF_OLD_COMPILER 1
#else