Merged
2 changes: 1 addition & 1 deletion .ci/docker/build.sh
@@ -81,7 +81,7 @@ case "${IMAGE_NAME}" in
LINTRUNNER=""
GCC_VERSION=11
CUDA_WINDOWS_CROSS_COMPILE=yes
CUDA_VERSION=12.6
CUDA_VERSION=12.8
SKIP_PYTORCH=yes
;;
*)
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
358117c166b75167a09bca81ac9925940feda339
release/2.11
Copilot AI Apr 27, 2026

Pinning to a moving ref (release/2.11) makes builds less reproducible and also conflicts with the existing pin-bump automation (.github/scripts/update_pytorch_pin.py), which writes a commit SHA into this file. Consider pinning to a specific commit hash (or at least a tag), and updating or pausing the automation if the intent is to stay on a stable release branch long-term.

Suggested change
release/2.11
v2.11.0

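If staying on the release branch is the intent, one reproducible alternative (a sketch, not something this PR does) is to resolve the branch head to a commit SHA at bump time and write that into the pin file:

# Resolve the current head of release/2.11 to a commit SHA and pin that
# instead; the update_pytorch_pin.py automation could be taught to do the same.
SHA=$(git ls-remote https://github.com/pytorch/pytorch.git refs/heads/release/2.11 | cut -f1)
echo "${SHA}" > .ci/docker/ci_commit_pins/pytorch.txt
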
10 changes: 4 additions & 6 deletions .ci/docker/common/install_cuda_windows_cross_compile.sh
@@ -11,13 +11,12 @@ set -ex

INSTALL_DIR="${WINDOWS_CUDA_INSTALL_DIR:-/opt/cuda-windows}"

# Mapping of CUDA versions to their corresponding driver versions for Windows installers.
# Mapping of CUDA versions to their corresponding driver versions for Windows installers
# Source: https://developer.nvidia.com/cuda-toolkit-archive
# Format: "PATCH_VERSION:DRIVER_VERSION". Starting with CUDA 13.0, NVIDIA dropped the
# driver suffix from the Windows installer filename, so the driver field is empty.
declare -A CUDA_DRIVER_MAP=(
["12.6"]="12.6.3:561.17"
["13.0"]="13.0.3:"
["12.8"]="12.8.1:572.61"
["12.9"]="12.9.1:576.57"
)

install_mingw() {
@@ -84,8 +83,7 @@ install_windows_cuda() {
mkdir -p "${INSTALL_DIR}"
cd "${INSTALL_DIR}"

# CUDA 13.0+ installers no longer include the driver version in the filename.
CUDA_INSTALLER="cuda_${CUDA_VERSION}${CUDA_DRIVER_VERSION:+_${CUDA_DRIVER_VERSION}}_windows.exe"
CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
CUDA_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_INSTALLER}"

# Check if already downloaded and extracted
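For context on the two hunks above, a minimal sketch of how a map entry such as "12.8.1:572.61" drives the installer filename (variable names assumed; the real parsing happens in a collapsed part of this script). The removed ${...:+...} expansion is what appended the driver suffix only when one was present:

declare -A CUDA_DRIVER_MAP=(["12.8"]="12.8.1:572.61")
# Split "PATCH_VERSION:DRIVER_VERSION" on the colon.
IFS=: read -r CUDA_PATCH_VERSION CUDA_DRIVER_VERSION <<< "${CUDA_DRIVER_MAP["12.8"]}"
# ${CUDA_DRIVER_VERSION:+_${CUDA_DRIVER_VERSION}} expands to "_572.61" here and
# to nothing when the driver field is empty, as it was for CUDA 13.0+.
CUDA_INSTALLER="cuda_${CUDA_PATCH_VERSION}${CUDA_DRIVER_VERSION:+_${CUDA_DRIVER_VERSION}}_windows.exe"
echo "${CUDA_INSTALLER}"  # cuda_12.8.1_572.61_windows.exe
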
9 changes: 2 additions & 7 deletions .ci/docker/common/install_pytorch.sh
@@ -27,19 +27,14 @@ install_pytorch_and_domains() {
chown -R ci-user .

export _GLIBCXX_USE_CXX11_ABI=1
# PyTorch's FindARM.cmake hard-fails when the SVE+BF16 compile probe
# doesn't pass — gcc-11 in this image is too old to accept the combined
# NEON/SVE/bfloat16 intrinsics the probe exercises. Executorch's aarch64
# runtime targets (phones, embedded) don't use SVE, so bypass the check.
export BUILD_IGNORE_SVE_UNAVAILABLE=1
# Then build and install PyTorch
conda_run python setup.py bdist_wheel
pip_install "$(echo dist/*.whl)"

# Grab the pinned audio and vision commits from PyTorch
TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
TORCHAUDIO_VERSION=release/2.11
export TORCHAUDIO_VERSION
TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt)
TORCHVISION_VERSION=release/0.26
export TORCHVISION_VERSION

install_domains
2 changes: 1 addition & 1 deletion .ci/scripts/test_model_e2e.sh
@@ -260,7 +260,7 @@ if [ "$AUDIO_URL" != "" ]; then
elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then
conda install -y -c conda-forge "ffmpeg<8"
pip install datasets soundfile
pip install torchcodec==0.12.0.dev20260409 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
pip install torchcodec==0.11.0 --extra-index-url https://download.pytorch.org/whl/test/cpu
python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
fi

22 changes: 11 additions & 11 deletions .ci/scripts/test_wheel_package_qnn.sh
@@ -158,17 +158,17 @@ print(module_vars["TORCH_VERSION"])
PY
)

NIGHTLY_VERSION=$(
"$PYBIN" - <<'PY'
import runpy
module_vars = runpy.run_path("torch_pin.py")
print(module_vars["NIGHTLY_VERSION"])
PY
)
echo "=== [$LABEL] Install torch==${TORCH_VERSION}.${NIGHTLY_VERSION} ==="

# Install torchao based on the pinned PyTorch version
"$PIPBIN" install torch=="${TORCH_VERSION}.${NIGHTLY_VERSION}" --index-url "https://download.pytorch.org/whl/nightly/cpu"
# NIGHTLY_VERSION=$(
# "$PYBIN" - <<'PY'
# import runpy
# module_vars = runpy.run_path("torch_pin.py")
# print(module_vars["NIGHTLY_VERSION"])
# PY
# )
echo "=== [$LABEL] Install torch==${TORCH_VERSION} ==="

# Install torch based on the pinned PyTorch version, preferring the PyTorch test index
"$PIPBIN" install torch=="${TORCH_VERSION}" --extra-index-url "https://download.pytorch.org/whl/test"
Comment on lines +170 to +171
Copilot AI Apr 27, 2026

--extra-index-url https://download.pytorch.org/whl/test does not “prefer” the test index: pip gives extra indexes no priority over the primary index (PyPI), so a same-version wheel can still be resolved from PyPI. Also, other scripts use the per-platform subindex (e.g. .../whl/test/cpu or .../whl/test/cu126), and .../whl/test may not be the correct simple index for these wheels. If the goal is to force the test index, use --index-url with the correct subindex (and optionally add PyPI as an extra index), or update the comment to match the actual behavior.

Suggested change
# Install torch based on the pinned PyTorch version, preferring the PyTorch test index
"$PIPBIN" install torch=="${TORCH_VERSION}" --extra-index-url "https://download.pytorch.org/whl/test"
# Install torch from the PyTorch test CPU wheel index, while allowing PyPI for dependencies
"$PIPBIN" install torch=="${TORCH_VERSION}" --index-url "https://download.pytorch.org/whl/test/cpu" --extra-index-url "https://pypi.org/simple"

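To make the distinction concrete, a sketch of the two invocations (torch version taken from this PR; index behavior as documented by pip):

# With --extra-index-url, pip gives PyPI and the extra index equal standing,
# so a same-version wheel may still come from PyPI:
pip install torch==2.11.0 --extra-index-url https://download.pytorch.org/whl/test
# To actually force the test index, make it primary and keep PyPI for deps:
pip install torch==2.11.0 --index-url https://download.pytorch.org/whl/test/cpu \
    --extra-index-url https://pypi.org/simple
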
"$PIPBIN" install wheel

# Install torchao based on the pinned commit from third-party/ao submodule
Expand Down
6 changes: 3 additions & 3 deletions .ci/scripts/utils.sh
@@ -53,7 +53,7 @@ dedupe_macos_loader_path_rpaths() {
pushd ..
torch_lib_dir=$(python -c "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])")/lib
popd

if [[ -z "${torch_lib_dir}" || ! -d "${torch_lib_dir}" ]]; then
return
fi
@@ -141,9 +141,9 @@ install_pytorch_and_domains() {

dedupe_macos_loader_path_rpaths
# Grab the pinned audio and vision commits from PyTorch
TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
TORCHAUDIO_VERSION=release/2.11
export TORCHAUDIO_VERSION
TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt)
TORCHVISION_VERSION=release/0.26
export TORCHVISION_VERSION

install_domains
8 changes: 4 additions & 4 deletions .github/workflows/cuda-windows.yml
@@ -64,7 +64,7 @@ jobs:
secrets-env: EXECUTORCH_HF_TOKEN
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
gpu-arch-version: 12.8
docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
submodules: recursive
upload-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
@@ -146,7 +146,7 @@ jobs:
timeout: 240
runner: windows.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: 12.6
gpu-arch-version: 12.8
download-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
@@ -158,7 +158,7 @@ jobs:
\$ErrorActionPreference = 'Stop'
\$PSNativeCommandUseErrorActionPreference = \$true

\$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6'
\$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8'
\$env:CUDA_PATH = \$env:CUDA_HOME
\$env:PATH = \"\$env:CUDA_HOME\bin;\$env:PATH\"
nvcc --version
@@ -169,5 +169,5 @@ jobs:
throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.'
}

.ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.6'
.ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.8'
}"
6 changes: 3 additions & 3 deletions .github/workflows/cuda.yml
@@ -1,6 +1,6 @@
# Test ExecuTorch CUDA Build Compatibility
# This workflow tests whether ExecuTorch can be successfully built with CUDA support
# across different CUDA versions (12.6, 13.0) using the command:
# across different CUDA versions (12.6, 12.8, 12.9, 13.0) using the command:
# ./install_executorch.sh
#
# Note: ExecuTorch automatically detects the system CUDA version using nvcc and
@@ -31,7 +31,7 @@ jobs:
strategy:
fail-fast: false
matrix:
cuda-version: ["12.6", "13.0"]
cuda-version: ["12.6", "12.8", "12.9", "13.0"]

name: test-executorch-cuda-build-${{ matrix.cuda-version }}
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@@ -66,7 +66,7 @@ jobs:
echo "CUDA build results: ${{ needs.test-cuda-builds.result }}"
exit 1
else
echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 13.0) completed successfully!"
echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9, 13.0) completed successfully!"
fi

test-models-cuda:
7 changes: 2 additions & 5 deletions .github/workflows/docker-builds.yml
@@ -33,20 +33,17 @@ jobs:
matrix:
runner: [linux.4xlarge]
docker-image-name: [
executorch-ubuntu-22.04-gcc11,
executorch-ubuntu-22.04-gcc9-nopytorch,
Comment on lines 33 to 37
Copilot AI Apr 27, 2026

The default runner for this matrix is linux.4xlarge, and this list now includes executorch-ubuntu-22.04-gcc11. That image builds PyTorch from source during the Docker build (SKIP_PYTORCH is not set for gcc11), which is very memory-intensive and has previously required larger runners to avoid OOM. Consider restoring an include: override to run the gcc11 image on a larger runner (or skipping the PyTorch-from-source build in this image).

executorch-ubuntu-22.04-clang12,
executorch-ubuntu-22.04-linter,
executorch-ubuntu-22.04-arm-sdk,
executorch-ubuntu-22.04-zephyr-sdk,
executorch-ubuntu-22.04-qnn-sdk,
executorch-ubuntu-22.04-mediatek-sdk,
Comment on lines 39 to 43
Copilot AI Apr 27, 2026

executorch-ubuntu-22.04-zephyr-sdk is also on the default linux.4xlarge runner now, but it builds PyTorch from source during the Docker build (SKIP_PYTORCH is unset for zephyr-sdk). This is likely to OOM on 4xlarge and fail the docker-build workflow. Consider restoring a per-image runner override (e.g. linux.12xlarge) for this image, or otherwise avoiding the from-source PyTorch build here.

executorch-ubuntu-22.04-clang12-android
]
include:
# PyTorch is built from source in these images; 4xlarge OOMs mid-build.
- docker-image-name: executorch-ubuntu-22.04-gcc11
runner: linux.12xlarge
- docker-image-name: executorch-ubuntu-22.04-zephyr-sdk
runner: linux.12xlarge
- docker-image-name: executorch-ubuntu-22.04-gcc11-aarch64
runner: linux.arm64.2xlarge
- docker-image-name: executorch-ubuntu-22.04-gcc11-aarch64-android
2 changes: 1 addition & 1 deletion examples/models/moshi/mimi/install_requirements.sh
@@ -8,7 +8,7 @@
set -x

sudo apt install ffmpeg -y
pip install torchcodec==0.12.0.dev20260409 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
pip install torchcodec==0.11.0 --extra-index-url https://download.pytorch.org/whl/test/cpu
pip install moshi==0.2.11
pip install bitsandbytes soundfile einops
# Run llama2/install requirements for torchao deps
26 changes: 6 additions & 20 deletions install_requirements.py
@@ -12,11 +12,9 @@

from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible

from torch_pin import NIGHTLY_VERSION, TORCH_VERSION

# The pip repository that hosts nightly torch packages.
# This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled.
TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly"
TORCH_URL_BASE = "https://download.pytorch.org/whl/test"

# Since ExecuTorch often uses main-branch features of pytorch, only the nightly
# pip versions will have the required features.
@@ -44,18 +42,14 @@ def install_requirements(use_pytorch_nightly):
sys.exit(1)

# Determine the appropriate PyTorch URL based on CUDA delegate status
torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE)
torch_url = determine_torch_url(TORCH_URL_BASE)

# pip packages needed by exir.
TORCH_PACKAGE = [
# Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note
# that we don't need to set any version number there because they have already
# been installed on CI before this step, so pip won't reinstall them
(
f"torch=={TORCH_VERSION}.{NIGHTLY_VERSION}"
if use_pytorch_nightly
else "torch"
),
("torch==2.11.0" if use_pytorch_nightly else "torch"),
]
Comment on lines 49 to 53
Copilot AI Apr 27, 2026

This hard-codes the torch version string in two places (torch_pin.py and here). That duplication can drift over time and create inconsistent local vs CI installs. Consider importing/reading TORCH_VERSION from torch_pin.py and using it here instead of repeating 2.11.0.

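A sketch of reading the pin from torch_pin.py instead of repeating it, mirroring the heredoc pattern used in test_wheel_package_qnn.sh above (assumes torch_pin.py still defines TORCH_VERSION):

# Pull the single-source pin out of torch_pin.py via runpy.
TORCH_VERSION=$(python - <<'PY'
import runpy
module_vars = runpy.run_path("torch_pin.py")
print(module_vars["TORCH_VERSION"])
PY
)
pip install "torch==${TORCH_VERSION}" --extra-index-url https://download.pytorch.org/whl/test
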

# Install the requirements for core ExecuTorch package.
@@ -114,20 +108,12 @@ def install_requirements(use_pytorch_nightly):

def install_optional_example_requirements(use_pytorch_nightly):
# Determine the appropriate PyTorch URL based on CUDA delegate status
torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE)
torch_url = determine_torch_url(TORCH_URL_BASE)

print("Installing torch domain libraries")
DOMAIN_LIBRARIES = [
(
f"torchvision==0.27.0.{NIGHTLY_VERSION}"
if use_pytorch_nightly
else "torchvision"
),
(
f"torchaudio==2.11.0.{NIGHTLY_VERSION}"
if use_pytorch_nightly
else "torchaudio"
),
("torchvision==0.26.0" if use_pytorch_nightly else "torchvision"),
("torchaudio==2.11.0" if use_pytorch_nightly else "torchaudio"),
]
Comment on lines 113 to 117
Copilot AI Apr 27, 2026

These domain library versions are hard-coded here, which can drift from whatever torch/CI pin is intended and makes upgrades easy to miss. Consider centralizing the torchvision/torchaudio pins alongside the torch pin (or reusing the existing pin mechanism) so version bumps are consistent across scripts.

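One possible shape for that centralization (file name and layout hypothetical, not part of this PR): keep all three pins in one file that both the CI shell scripts and install_requirements.py read. Versions below are the ones this PR hard-codes:

# Hypothetical single source of truth, e.g. ci_pins.sh:
TORCH_VERSION=2.11.0
TORCHVISION_VERSION=0.26.0
TORCHAUDIO_VERSION=2.11.0

# Shell consumers source it; Python can parse the same KEY=VALUE lines.
source ci_pins.sh
pip install "torchvision==${TORCHVISION_VERSION}" "torchaudio==${TORCHAUDIO_VERSION}" \
  --extra-index-url https://download.pytorch.org/whl/test/cpu
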
# Then install domain libraries
subprocess.run(
79 changes: 16 additions & 63 deletions runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
@@ -325,88 +325,41 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
#define C10_HIP_HOST_DEVICE
#endif

#if defined(USE_ROCM)
// C10_WARP_SIZE is only allowed for device code.
// Host code dynamically-sized launch configs _must_ use at::cuda::warp_size().
// Host or device statically-sized arrays _must_ use either
// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed.
//
// Host code _must_ use at::cuda::warp_size()
// HIP header used to define warpSize as a constexpr that was either 32 or 64
// depending on the target device, and then always set it to 64 for host code.
// For a time, that allowed C10_WARP_SIZE to be defined like so:
//
// #ifdef USE_ROCM
// #define C10_WARP_SIZE warpSize
// #else
// #define C10_WARP_SIZE 32
// #endif
//
// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior.
// We can now only use warpSize for C10_WARP_SIZE in device code and this is
// enforced by using __device__ in its definition. In host code where
// C10_WARP_SIZE was previously used as a compile-time constant, this will now
// cause a compile-time error.
//
// If an array was previously expected to be sized at compile-time using
// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or
// C10_WARP_SIZE_LOWER_BOUND depending on the situation.
//
// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users
// must now use at::cuda::warp_size() for the dynamic runtime query.
//
// Unfortunately, C10_WARP_SIZE has been public and available for both host and
// device since approximately 2019, so forcing it to be device-only would break
// existing code in the wild.
#if defined(USE_ROCM)
// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
// set it to something unreasonable to trigger obvious host code errors.

namespace at::cuda {
TORCH_CUDA_CPP_API int warp_size();
}
#if defined(__HIPCC__)
static __host__ inline int C10_WARP_SIZE_INTERNAL() {
#ifdef __HIPCC__
static inline int __host__ C10_WARP_SIZE_INTERNAL() {
return at::cuda::warp_size();
}
// NOTE: __device__ C10_WARP_SIZE_INTERNAL
// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__,
// we can use constexpr. This matches prior behavior. We preserve this for
// backward compatibility instead of forcing old code to use dynamic warpSize
// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv
// could expose where C10_WARP_SIZE was used incorrectly where the dynamic
// warpSize is not allowed.
#if defined(__SPIRV__)
static __device__ inline int C10_WARP_SIZE_INTERNAL() {
return warpSize;
}
#else // __SPIRV__
static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() {

static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() {
#if defined(__GFX9__)
return 64;
#else // __GFX9__
return 32;
#endif // __GFX9__
}
#endif // __SPIRV__
#if defined(__SPIRV__)
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 64
#elif defined(__GFX9__)
#define C10_WARP_SIZE_LOWER_BOUND 64
#define C10_WARP_SIZE_UPPER_BOUND 64
#else
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 32
#endif
#else // !__HIPCC__
#else // __HIPCC__
static inline int C10_WARP_SIZE_INTERNAL() {
return at::cuda::warp_size();
}
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 64
#endif // __HIPCC__

#define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
#else // !USE_ROCM
#define C10_WARP_SIZE_STATIC 64

#else // defined(USE_ROCM)
#define C10_WARP_SIZE 32
#define C10_WARP_SIZE_LOWER_BOUND 32
#define C10_WARP_SIZE_UPPER_BOUND 32
#endif // USE_ROCM
#endif

#if defined(_MSC_VER) && _MSC_VER <= 1900
#define __func__ __FUNCTION__
@@ -676,7 +629,7 @@ __host__ __device__
// This macro is used to find older C++ compilers
// that don't support move optimization for return values.

#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \
#if (defined(__GNUC__) && __GNUC__ < 13) || \
(defined(__clang_major__) && __clang_major__ < 13)
#define C10_RETURN_MOVE_IF_OLD_COMPILER 1
#else