From 78f62763bed8bafcf0092438dd74fcf21b030a31 Mon Sep 17 00:00:00 2001
From: Mergen Nachin <mnachin@meta.com>
Date: Mon, 27 Apr 2026 14:10:57 -0400
Subject: [PATCH] =?UTF-8?q?Revert=20"Move=20torch=20pin=20from=20the=202.1?=
 =?UTF-8?q?1=20to=20the=202026-04-09=20nightly,=20and=20drop=20depr?=
 =?UTF-8?q?=E2=80=A6"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit d7f87188c9c19c3a603f54a34a0898a9712320ab.
---
 .ci/docker/build.sh                           |  2 +-
 .ci/docker/ci_commit_pins/pytorch.txt         |  2 +-
 .../install_cuda_windows_cross_compile.sh     | 10 +--
 .ci/docker/common/install_pytorch.sh          |  9 +--
 .ci/scripts/test_model_e2e.sh                 |  2 +-
 .ci/scripts/test_wheel_package_qnn.sh         | 22 +++---
 .ci/scripts/utils.sh                          |  6 +-
 .github/workflows/cuda-windows.yml            |  8 +-
 .github/workflows/cuda.yml                    |  6 +-
 .github/workflows/docker-builds.yml           |  7 +-
 .../models/moshi/mimi/install_requirements.sh |  2 +-
 install_requirements.py                       | 26 ++----
 .../c10/torch/headeronly/macros/Macros.h      | 79 ++++---------------
 .../c10/torch/headeronly/util/BFloat16.h      | 13 ++-
 torch_pin.py                                  |  4 +-
 15 files changed, 63 insertions(+), 135 deletions(-)

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 5d73835ea15..7c4a80044e4 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -81,7 +81,7 @@ case "${IMAGE_NAME}" in
     LINTRUNNER=""
     GCC_VERSION=11
     CUDA_WINDOWS_CROSS_COMPILE=yes
-    CUDA_VERSION=12.6
+    CUDA_VERSION=12.8
     SKIP_PYTORCH=yes
     ;;
   *)
diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index 25963674d4f..f6e39a63b92 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-358117c166b75167a09bca81ac9925940feda339
+release/2.11
\ No newline at end of file
diff --git a/.ci/docker/common/install_cuda_windows_cross_compile.sh b/.ci/docker/common/install_cuda_windows_cross_compile.sh
index 7f6826a7260..e3529751221 100644
--- a/.ci/docker/common/install_cuda_windows_cross_compile.sh
+++ b/.ci/docker/common/install_cuda_windows_cross_compile.sh
@@ -11,13 +11,12 @@ set -ex
 
 INSTALL_DIR="${WINDOWS_CUDA_INSTALL_DIR:-/opt/cuda-windows}"
 
-# Mapping of CUDA versions to their corresponding driver versions for Windows installers.
+# Maps each CUDA version to "PATCH_VERSION:DRIVER_VERSION" for its Windows installer.
 # Source: https://developer.nvidia.com/cuda-toolkit-archive
-# Format: "PATCH_VERSION:DRIVER_VERSION". Starting with CUDA 13.0, NVIDIA dropped the
-# driver suffix from the Windows installer filename, so the driver field is empty.
 declare -A CUDA_DRIVER_MAP=(
     ["12.6"]="12.6.3:561.17"
-    ["13.0"]="13.0.3:"
+    ["12.8"]="12.8.1:572.61"
+    ["12.9"]="12.9.1:576.57"
 )
 
 install_mingw() {
@@ -84,8 +83,7 @@ install_windows_cuda() {
     mkdir -p "${INSTALL_DIR}"
     cd "${INSTALL_DIR}"
 
-    # CUDA 13.0+ installers no longer include the driver version in the filename.
-    CUDA_INSTALLER="cuda_${CUDA_VERSION}${CUDA_DRIVER_VERSION:+_${CUDA_DRIVER_VERSION}}_windows.exe"
+    CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
     CUDA_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_INSTALLER}"
 
     # Check if already downloaded and extracted
diff --git a/.ci/docker/common/install_pytorch.sh b/.ci/docker/common/install_pytorch.sh
index 0d79671c827..548a24f885d 100755
--- a/.ci/docker/common/install_pytorch.sh
+++ b/.ci/docker/common/install_pytorch.sh
@@ -27,19 +27,14 @@ install_pytorch_and_domains() {
   chown -R ci-user .
 
   export _GLIBCXX_USE_CXX11_ABI=1
-  # PyTorch's FindARM.cmake hard-fails when the SVE+BF16 compile probe
-  # doesn't pass — gcc-11 in this image is too old to accept the combined
-  # NEON/SVE/bfloat16 intrinsics the probe exercises. Executorch's aarch64
-  # runtime targets (phones, embedded) don't use SVE, so bypass the check.
-  export BUILD_IGNORE_SVE_UNAVAILABLE=1
   # Then build and install PyTorch
   conda_run python setup.py bdist_wheel
   pip_install "$(echo dist/*.whl)"
 
   # Grab the pinned audio and vision commits from PyTorch
-  TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
+  TORCHAUDIO_VERSION=release/2.11
   export TORCHAUDIO_VERSION
-  TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt)
+  TORCHVISION_VERSION=release/0.26
   export TORCHVISION_VERSION
 
   install_domains
diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh
index f050538a283..8b8783d0db8 100755
--- a/.ci/scripts/test_model_e2e.sh
+++ b/.ci/scripts/test_model_e2e.sh
@@ -260,7 +260,7 @@ if [ "$AUDIO_URL" != "" ]; then
 elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then
   conda install -y -c conda-forge "ffmpeg<8"
   pip install datasets soundfile
-  pip install torchcodec==0.12.0.dev20260409 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+  pip install torchcodec==0.11.0 --extra-index-url https://download.pytorch.org/whl/test/cpu
   python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
 fi
 
diff --git a/.ci/scripts/test_wheel_package_qnn.sh b/.ci/scripts/test_wheel_package_qnn.sh
index 43be46d1941..763bd8733c1 100644
--- a/.ci/scripts/test_wheel_package_qnn.sh
+++ b/.ci/scripts/test_wheel_package_qnn.sh
@@ -158,17 +158,17 @@ print(module_vars["TORCH_VERSION"])
 PY
 )
 
-  NIGHTLY_VERSION=$(
-  "$PYBIN" - <<'PY'
-import runpy
-module_vars = runpy.run_path("torch_pin.py")
-print(module_vars["NIGHTLY_VERSION"])
-PY
-)
-  echo "=== [$LABEL] Install torch==${TORCH_VERSION}.${NIGHTLY_VERSION} ==="
-
-  # Install torchao based on the pinned PyTorch version
-  "$PIPBIN" install torch=="${TORCH_VERSION}.${NIGHTLY_VERSION}" --index-url "https://download.pytorch.org/whl/nightly/cpu"
+#   NIGHTLY_VERSION=$(
+#   "$PYBIN" - <<'PY'
+# import runpy
+# module_vars = runpy.run_path("torch_pin.py")
+# print(module_vars["NIGHTLY_VERSION"])
+# PY
+# )
+  echo "=== [$LABEL] Install torch==${TORCH_VERSION} ==="
+
+  # Install torch at the pinned PyTorch version; the PyTorch test index is searched in addition to the default index
+  "$PIPBIN" install torch=="${TORCH_VERSION}" --extra-index-url "https://download.pytorch.org/whl/test"
   "$PIPBIN" install wheel
 
   # Install torchao based on the pinned commit from third-party/ao submodule
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
index 18038e36831..86e54b478ef 100644
--- a/.ci/scripts/utils.sh
+++ b/.ci/scripts/utils.sh
@@ -53,7 +53,7 @@ dedupe_macos_loader_path_rpaths() {
   pushd ..
   torch_lib_dir=$(python -c "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])")/lib
   popd
-  
+
   if [[ -z "${torch_lib_dir}" || ! -d "${torch_lib_dir}" ]]; then
     return
   fi
@@ -141,9 +141,9 @@ install_pytorch_and_domains() {
 
   dedupe_macos_loader_path_rpaths
   # Grab the pinned audio and vision commits from PyTorch
-  TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
+  TORCHAUDIO_VERSION=release/2.11
   export TORCHAUDIO_VERSION
-  TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt)
+  TORCHVISION_VERSION=release/0.26
   export TORCHVISION_VERSION
 
   install_domains
diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml
index 105055c669c..265b7e3069d 100644
--- a/.github/workflows/cuda-windows.yml
+++ b/.github/workflows/cuda-windows.yml
@@ -64,7 +64,7 @@ jobs:
       secrets-env: EXECUTORCH_HF_TOKEN
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
-      gpu-arch-version: 12.6
+      gpu-arch-version: 12.8
       docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
       submodules: recursive
       upload-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
@@ -146,7 +146,7 @@ jobs:
       timeout: 240
       runner: windows.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
-      gpu-arch-version: 12.6
+      gpu-arch-version: 12.8
       download-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
@@ -158,7 +158,7 @@ jobs:
           \$ErrorActionPreference = 'Stop'
           \$PSNativeCommandUseErrorActionPreference = \$true
 
-          \$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6'
+          \$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8'
           \$env:CUDA_PATH = \$env:CUDA_HOME
           \$env:PATH = \"\$env:CUDA_HOME\bin;\$env:PATH\"
           nvcc --version
@@ -169,5 +169,5 @@ jobs:
             throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.'
           }
 
-          .ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.6'
+          .ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.8'
         }"
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 077f48ff0c9..c3b7c058ee6 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -1,6 +1,6 @@
 # Test ExecuTorch CUDA Build Compatibility
 # This workflow tests whether ExecuTorch can be successfully built with CUDA support
-# across different CUDA versions (12.6, 13.0) using the command:
+# across different CUDA versions (12.6, 12.8, 12.9, 13.0) using the command:
 #   ./install_executorch.sh
 #
 # Note: ExecuTorch automatically detects the system CUDA version using nvcc and
@@ -31,7 +31,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        cuda-version: ["12.6", "13.0"]
+        cuda-version: ["12.6", "12.8", "12.9", "13.0"]
 
     name: test-executorch-cuda-build-${{ matrix.cuda-version }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@@ -66,7 +66,7 @@ jobs:
             echo "CUDA build results: ${{ needs.test-cuda-builds.result }}"
             exit 1
           else
-            echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 13.0) completed successfully!"
+            echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9, 13.0) completed successfully!"
           fi
 
   test-models-cuda:
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 25234ca387a..0f9778e9e69 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -33,20 +33,17 @@ jobs:
       matrix:
         runner: [linux.4xlarge]
         docker-image-name: [
+          executorch-ubuntu-22.04-gcc11,
           executorch-ubuntu-22.04-gcc9-nopytorch,
           executorch-ubuntu-22.04-clang12,
           executorch-ubuntu-22.04-linter,
           executorch-ubuntu-22.04-arm-sdk,
+          executorch-ubuntu-22.04-zephyr-sdk,
           executorch-ubuntu-22.04-qnn-sdk,
           executorch-ubuntu-22.04-mediatek-sdk,
           executorch-ubuntu-22.04-clang12-android
         ]
         include:
-          # PyTorch is built from source in these images; 4xlarge OOMs mid-build.
-          - docker-image-name: executorch-ubuntu-22.04-gcc11
-            runner: linux.12xlarge
-          - docker-image-name: executorch-ubuntu-22.04-zephyr-sdk
-            runner: linux.12xlarge
           - docker-image-name: executorch-ubuntu-22.04-gcc11-aarch64
             runner: linux.arm64.2xlarge
           - docker-image-name: executorch-ubuntu-22.04-gcc11-aarch64-android
diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh
index 29e9fe10977..9fc12f64bc9 100755
--- a/examples/models/moshi/mimi/install_requirements.sh
+++ b/examples/models/moshi/mimi/install_requirements.sh
@@ -8,7 +8,7 @@
 set -x
 
 sudo apt install ffmpeg -y
-pip install torchcodec==0.12.0.dev20260409 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+pip install torchcodec==0.11.0 --extra-index-url https://download.pytorch.org/whl/test/cpu
 pip install moshi==0.2.11
 pip install bitsandbytes soundfile einops
 # Run llama2/install requirements for torchao deps
diff --git a/install_requirements.py b/install_requirements.py
index 85431bbc8d9..b30068cbdb8 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -12,11 +12,9 @@
 
 from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible
 
-from torch_pin import NIGHTLY_VERSION, TORCH_VERSION
-
 # The pip repository that hosts nightly torch packages.
 # This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled.
-TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly"
+TORCH_URL_BASE = "https://download.pytorch.org/whl/test"
 
 # Since ExecuTorch often uses main-branch features of pytorch, only the nightly
 # pip versions will have the required features.
@@ -44,18 +42,14 @@ def install_requirements(use_pytorch_nightly):
         sys.exit(1)
 
     # Determine the appropriate PyTorch URL based on CUDA delegate status
-    torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE)
+    torch_url = determine_torch_url(TORCH_URL_BASE)
 
     # pip packages needed by exir.
     TORCH_PACKAGE = [
         # Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note
         # that we don't need to set any version number there because they have already
         # been installed on CI before this step, so pip won't reinstall them
-        (
-            f"torch=={TORCH_VERSION}.{NIGHTLY_VERSION}"
-            if use_pytorch_nightly
-            else "torch"
-        ),
+        ("torch==2.11.0" if use_pytorch_nightly else "torch"),
     ]
 
     # Install the requirements for core ExecuTorch package.
@@ -114,20 +108,12 @@ def install_requirements(use_pytorch_nightly):
 
 def install_optional_example_requirements(use_pytorch_nightly):
     # Determine the appropriate PyTorch URL based on CUDA delegate status
-    torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE)
+    torch_url = determine_torch_url(TORCH_URL_BASE)
 
     print("Installing torch domain libraries")
     DOMAIN_LIBRARIES = [
-        (
-            f"torchvision==0.27.0.{NIGHTLY_VERSION}"
-            if use_pytorch_nightly
-            else "torchvision"
-        ),
-        (
-            f"torchaudio==2.11.0.{NIGHTLY_VERSION}"
-            if use_pytorch_nightly
-            else "torchaudio"
-        ),
+        ("torchvision==0.26.0" if use_pytorch_nightly else "torchvision"),
+        ("torchaudio==2.11.0" if use_pytorch_nightly else "torchaudio"),
     ]
     # Then install domain libraries
     subprocess.run(
diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
index cef99df3f56..63aa0d20d8e 100644
--- a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
+++ b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
@@ -325,88 +325,41 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
 #define C10_HIP_HOST_DEVICE
 #endif
 
+#if defined(USE_ROCM)
 // C10_WARP_SIZE is only allowed for device code.
-// Host code dynamically-sized launch configs _must_ use at::cuda::warp_size().
-// Host or device statically-sized arrays _must_ use either
-// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed.
-//
+// Host code _must_ use at::cuda::warp_size()
 // HIP header used to define warpSize as a constexpr that was either 32 or 64
 // depending on the target device, and then always set it to 64 for host code.
-// For a time, that allowed C10_WARP_SIZE to be defined like so:
-//
-// #ifdef USE_ROCM
-// #define C10_WARP_SIZE warpSize
-// #else
-// #define C10_WARP_SIZE 32
-// #endif
-//
-// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior.
-// We can now only use warpSize for C10_WARP_SIZE in device code and this is
-// enforced by using __device__ in its definition.  In host code where
-// C10_WARP_SIZE was previously used as a compile-time constant, this will now
-// cause a compile-time error.
-//
-// If an array was previously expected to be sized at compile-time using
-// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or
-// C10_WARP_SIZE_LOWER_BOUND depending on the situation.
-//
-// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users
-// must now use at::cuda::warp_size() for the dynamic runtime query.
-//
-// Unfortunately, C10_WARP_SIZE has been public and available for both host and
-// device since approximately 2019, so forcing it to be device-only would break
-// existing code in the wild.
-#if defined(USE_ROCM)
+// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
+// set it to something unreasonable to trigger obvious host code errors.
+
 namespace at::cuda {
 TORCH_CUDA_CPP_API int warp_size();
 }
-#if defined(__HIPCC__)
-static __host__ inline int C10_WARP_SIZE_INTERNAL() {
+#ifdef __HIPCC__
+static inline int __host__ C10_WARP_SIZE_INTERNAL() {
   return at::cuda::warp_size();
 }
-// NOTE: __device__ C10_WARP_SIZE_INTERNAL
-// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__,
-// we can use constexpr. This matches prior behavior. We preserve this for
-// backward compatibility instead of forcing old code to use dynamic warpSize
-// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv
-// could expose where C10_WARP_SIZE was used incorrectly where the dynamic
-// warpSize is not allowed.
-#if defined(__SPIRV__)
-static __device__ inline int C10_WARP_SIZE_INTERNAL() {
-  return warpSize;
-}
-#else // __SPIRV__
-static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() {
+
+static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() {
 #if defined(__GFX9__)
   return 64;
 #else // __GFX9__
   return 32;
 #endif // __GFX9__
 }
-#endif // __SPIRV__
-#if defined(__SPIRV__)
-#define C10_WARP_SIZE_LOWER_BOUND 32
-#define C10_WARP_SIZE_UPPER_BOUND 64
-#elif defined(__GFX9__)
-#define C10_WARP_SIZE_LOWER_BOUND 64
-#define C10_WARP_SIZE_UPPER_BOUND 64
-#else
-#define C10_WARP_SIZE_LOWER_BOUND 32
-#define C10_WARP_SIZE_UPPER_BOUND 32
-#endif
-#else // !__HIPCC__
+#else // __HIPCC__
 static inline int C10_WARP_SIZE_INTERNAL() {
   return at::cuda::warp_size();
 }
-#define C10_WARP_SIZE_LOWER_BOUND 32
-#define C10_WARP_SIZE_UPPER_BOUND 64
 #endif // __HIPCC__
+
 #define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
-#else // !USE_ROCM
+#define C10_WARP_SIZE_STATIC 64
+
+#else // defined(USE_ROCM)
 #define C10_WARP_SIZE 32
-#define C10_WARP_SIZE_LOWER_BOUND 32
-#define C10_WARP_SIZE_UPPER_BOUND 32
-#endif // USE_ROCM
+#endif
 
 #if defined(_MSC_VER) && _MSC_VER <= 1900
 #define __func__ __FUNCTION__
@@ -676,7 +629,7 @@ __host__ __device__
 // This macro is used to find older C++ compilers
 // that don't support move optimization for return values.
 
-#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \
+#if (defined(__GNUC__) && __GNUC__ < 13) || \
     (defined(__clang_major__) && __clang_major__ < 13)
 #define C10_RETURN_MOVE_IF_OLD_COMPILER 1
 #else
diff --git a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
index 9aa08c265bd..64479ba36f1 100644
--- a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
+++ b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
@@ -12,7 +12,7 @@
 #include <iosfwd>
 #include <ostream>
 
-#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
+#if defined(__CUDACC__) && !defined(USE_ROCM)
 #include <cuda_bf16.h>
 #endif
 
@@ -46,7 +46,7 @@ struct alignas(2) BFloat16 {
   /* implicit */ inline C10_HOST_DEVICE BFloat16(float value);
   inline C10_HOST_DEVICE operator float() const;
 
-#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
+#if defined(__CUDACC__) && !defined(USE_ROCM)
   inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value);
   explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const;
 #endif
@@ -124,9 +124,8 @@ C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 /// Constructors
 inline C10_HOST_DEVICE BFloat16::BFloat16(float value)
     :
-#if defined(__CUDACC__) &&                                                   \
-    (!defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 || \
-     defined(USE_ROCM) && (TORCH_HIP_VERSION >= 702))
+#if defined(__CUDACC__) && !defined(USE_ROCM) && defined(__CUDA_ARCH__) && \
+    __CUDA_ARCH__ >= 800
       x(__bfloat16_as_ushort(__float2bfloat16(value)))
 #elif defined(__SYCL_DEVICE_ONLY__) && \
     defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
@@ -140,7 +139,7 @@ inline C10_HOST_DEVICE BFloat16::BFloat16(float value)
 
 /// Implicit conversions
 inline C10_HOST_DEVICE BFloat16::operator float() const {
-#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
+#if defined(__CUDACC__) && !defined(USE_ROCM)
   return __bfloat162float(*reinterpret_cast<const __nv_bfloat16*>(&x));
 #elif defined(__SYCL_DEVICE_ONLY__) && \
     defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
@@ -150,7 +149,7 @@ inline C10_HOST_DEVICE BFloat16::operator float() const {
 #endif
 }
 
-#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
+#if defined(__CUDACC__) && !defined(USE_ROCM)
 inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) {
   x = *reinterpret_cast<const unsigned short*>(&value);
 }
diff --git a/torch_pin.py b/torch_pin.py
index 10a015c081c..3575d9a376d 100644
--- a/torch_pin.py
+++ b/torch_pin.py
@@ -1,2 +1,2 @@
-TORCH_VERSION = "2.12.0"
-NIGHTLY_VERSION = "dev20260409"
+TORCH_VERSION = "2.11.0"
+# NIGHTLY_VERSION = "dev20260318"  # Temporarily pinning to stable release candidate. Revert https://github.com/pytorch/executorch/pull/18287