Merged
37 commits
d10fa92
Initial commit
Micky774 Oct 24, 2025
eef7dc0
Updated to build from source by default
Micky774 Oct 24, 2025
cc68ab7
Updated for V3 API
Micky774 Oct 31, 2025
4455361
Fixed build, reverted AOTriton bwd changes (now V2)
Micky774 Nov 3, 2025
2586b18
Removed alterations
Micky774 Nov 3, 2025
aa80f81
Removed lazy tensor wrapper
Micky774 Nov 3, 2025
9a91b9e
Streamlined cmakelist, other PR review feedback addressed
Micky774 Nov 4, 2025
023deb4
Removed `pad_between_seqs`
Micky774 Nov 4, 2025
6b8dbe5
Updated typing to be more explicit
Micky774 Nov 4, 2025
68303d0
Minor streamlining and formatting
Micky774 Nov 4, 2025
8181972
Initial implementation
Micky774 Nov 6, 2025
6788a16
Simplified window size func for current non-SWA support
Micky774 Nov 6, 2025
182101a
Removed accidental include
Micky774 Nov 6, 2025
19a9c0f
Merge branch 'zain/aotriton' into zain/aotriton-bwd
Micky774 Nov 6, 2025
fef6baa
Corrected bwd args
Micky774 Nov 6, 2025
3a4fab8
Updated causal window default
Micky774 Nov 10, 2025
917e3c3
Updated window values for causal
Micky774 Nov 10, 2025
ce32e3b
Merge branch 'zain/aotriton' into zain/aotriton-bwd
Micky774 Nov 10, 2025
36045c8
Corrected DQ_ACC buffer, added env var for GPU kernel building
Micky774 Nov 12, 2025
d6e46c1
Update AOTriton to 0.11.1b
Micky774 Nov 12, 2025
1349a48
Merge branch 'dev' into zain/aotriton
Micky774 Nov 24, 2025
8ed0009
Merge branch 'zain/aotriton' into zain/aotriton-bwd
Micky774 Nov 24, 2025
2bd9006
Added AOTriton commit SHA
Micky774 Nov 25, 2025
a9bef37
Merge branch 'dev' into zain/aotriton-bwd
Micky774 Nov 25, 2025
0fdff86
Moved handling of env variable to makefile
Micky774 Nov 26, 2025
3f6e054
Simplified lazy tensor implementation
Micky774 Dec 1, 2025
2246da4
Merge branch 'dev' into zain/aotriton-bwd
Micky774 Dec 10, 2025
2a17f7b
Merge branch 'dev' into zain/aotriton-bwd
Micky774 Jan 29, 2026
1a267cd
Update AOTriton version
Micky774 Jan 30, 2026
51da203
Improved tests
Micky774 Feb 4, 2026
945c8b2
Fix dq_acc stride. AITER ASM expects BHS.
xinyazhang Feb 5, 2026
e478e1a
Merge branch 'dev' into zain/aotriton-bwd
Micky774 Feb 5, 2026
872bc12
Revert unnecessary changes
Micky774 Feb 10, 2026
4478af5
Updated copyright and IS_HIP_EXTENSION guard
Micky774 Feb 11, 2026
8d06ef1
Update test and cmakelist
Micky774 Feb 12, 2026
0654925
Updated tests
Micky774 Feb 17, 2026
f2798b7
Merge branch 'dev' into zain/aotriton-bwd
Micky774 Feb 18, 2026
74 changes: 63 additions & 11 deletions tests/pytorch/attention/test_attention.py
@@ -1281,17 +1281,60 @@ def test_transformer_layer(

# FusedAttention backend
if fused_attn_supported:
fused_attn_fwd, fused_attn_bwd = _run_transformer_layer(
dtype,
config,
"FusedAttention",
ckpt_attn,
qkv_format,
workspace_opt,
fused_qkv_params,
RoPE,
is_training,
)
if len(fused_attn_backends) == 1 or not IS_HIP_EXTENSION:
fused_attn_fwd, fused_attn_bwd = _run_transformer_layer(
dtype,
config,
"FusedAttention",
ckpt_attn,
qkv_format,
workspace_opt,
fused_qkv_params,
RoPE,
is_training,
)
elif len(fused_attn_backends) == 2:
os.environ["NVTE_FUSED_ATTN_CK"] = "0"
os.environ["NVTE_FUSED_ATTN_AOTRITON"] = "1"
fused_attn_fwd, fused_attn_bwd = _run_transformer_layer(
dtype,
config,
"FusedAttention",
ckpt_attn,
qkv_format,
workspace_opt,
fused_qkv_params,
RoPE,
is_training,
)
os.environ["NVTE_FUSED_ATTN_CK"] = "1"
os.environ["NVTE_FUSED_ATTN_AOTRITON"] = "0"
fused_attn_fwd_1, fused_attn_bwd_1 = _run_transformer_layer(
dtype,
config,
"FusedAttention",
ckpt_attn,
qkv_format,
workspace_opt,
fused_qkv_params,
RoPE,
is_training,
)

os.environ["NVTE_CK_USES_FWD_V3"] = "0"
os.environ["NVTE_CK_USES_BWD_V3"] = "0"
fused_attn_fwd_2, fused_attn_bwd_2 = _run_transformer_layer(
dtype,
config,
"FusedAttention",
ckpt_attn,
qkv_format,
workspace_opt,
fused_qkv_params,
RoPE,
is_training,
)


# FlashAttention backend
if flash_attn_supported:
@@ -1320,6 +1363,15 @@ def test_transformer_layer(
logging.info("[test_transformer_layer]: fused attn vs flash attn")
torch.testing.assert_close(fused_attn_fwd, flash_attn_fwd, **tols)
torch.testing.assert_close(fused_attn_bwd, flash_attn_bwd, **tols)
if IS_HIP_EXTENSION and fused_attn_supported and len(fused_attn_backends) == 2:
logging.info("[test_transformer_layer]: fused attn backend 0 vs 1")
torch.testing.assert_close(fused_attn_fwd, fused_attn_fwd_1, **tols)
for i, _ in enumerate(fused_attn_bwd):
torch.testing.assert_close(fused_attn_bwd[i], fused_attn_bwd_1[i], **tols)
logging.info("[test_transformer_layer]: fused attn backend 0 vs 2")
torch.testing.assert_close(fused_attn_fwd, fused_attn_fwd_2, **tols)
for i, _ in enumerate(fused_attn_bwd):
torch.testing.assert_close(fused_attn_bwd[i], fused_attn_bwd_2[i], **tols)
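
Note that the test above flips NVTE_FUSED_ATTN_CK / NVTE_FUSED_ATTN_AOTRITON (and the NVTE_CK_USES_*_V3 toggles) in place and leaves them set for the rest of the process. A minimal sketch of a restore-on-exit wrapper for the same pattern, assuming the variables are re-read each time the backend is selected; `_backend_env` is a hypothetical helper, not part of this test suite:

```python
import os
from contextlib import contextmanager

@contextmanager
def _backend_env(**overrides):
    """Temporarily set backend-selection env vars, restoring them on exit."""
    saved = {k: os.environ.get(k) for k in overrides}
    os.environ.update({k: str(v) for k, v in overrides.items()})
    try:
        yield
    finally:
        for k, v in saved.items():
            if v is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = v

# Hypothetical usage mirroring the test: one run per backend.
with _backend_env(NVTE_FUSED_ATTN_CK="0", NVTE_FUSED_ATTN_AOTRITON="1"):
    pass  # _run_transformer_layer(...) with the AOTriton backend
with _backend_env(NVTE_FUSED_ATTN_CK="1", NVTE_FUSED_ATTN_AOTRITON="0"):
    pass  # _run_transformer_layer(...) with the CK backend
```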


@pytest.mark.skipif(get_cudnn_version() < (8, 9, 1), reason="cuDNN 8.9.1+ is required.")
1 change: 0 additions & 1 deletion transformer_engine/common/CMakeLists.txt
@@ -8,7 +8,6 @@ cmake_minimum_required(VERSION 3.21)

option(USE_ROCM "Use ROCm" ON)
option(USE_FUSED_ATTN_AOTRITON "Use aotriton backend" ON)
option(USE_FUSED_ATTN_AOTRITON_BUILD_GPU_KERNELS "Build AOTriton GPU kernels" OFF)
option(USE_FUSED_ATTN_CK "Use ck backend" ON)
set(USE_CUDA OFF)

25 changes: 11 additions & 14 deletions transformer_engine/common/aotriton/CMakeLists.txt
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2025, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2022-2026, Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT

cmake_minimum_required(VERSION 3.21)
@@ -8,19 +8,16 @@ project(aotriton LANGUAGES CXX)
# The AOTriton C++ runtime will be built from {TE}/3rdparty/aotriton
# Hence there is no need to add multiple ROCM version here

if(DEFINED ENV{AOTRITON_PATH})
set(AOTRITON_PATH $ENV{AOTRITON_PATH})
endif()

set(__AOTRITON_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/aotriton")
set(__AOTRITON_SUFFIX "_TEprivate")

if(NOT DEFINED AOTRITON_PATH)
# If AOTRITON_PATH is not provided, we proceed to build the runtime
# ourselves and either build or download the GPU kernels

Collaborator: Your original changes used an env variable to control it. On the other hand, this feature seems unused.
if(USE_FUSED_ATTN_AOTRITON_BUILD_GPU_KERNELS)
set(AOTRITON_NOIMAGE_MODE OFF)
else()
set(AOTRITON_NOIMAGE_MODE ON)
endif()

set(__AOTRITON_VER "0.11.1b")
set(__AOTRITON_VER "0.11.2b")
set(__AOTRITON_IMAGE_LIST
"amd-gfx942"
"amd-gfx950"
@@ -66,8 +63,7 @@ if(NOT DEFINED AOTRITON_PATH)

# Build the AOTriton runtime from source with custom suffix to avoid
# potential conflict with libaotriton as provided by PyTorch
function(aotriton_build_from_source noimage)
message(STATUS "No-image mode: ${noimage}.")
function(aotriton_build_from_source)
get_git_commit(${TE}/3rdparty/aotriton AOTRITON_SHA)
ExternalProject_Add(aotriton_external
LIST_SEPARATOR ","
@@ -78,7 +74,7 @@ if(NOT DEFINED AOTRITON_PATH)
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DAOTRITON_NO_PYTHON=ON
-DAOTRITON_NAME_SUFFIX=${__AOTRITON_SUFFIX}
-DAOTRITON_NOIMAGE_MODE=${noimage}
-DAOTRITON_NOIMAGE_MODE=ON
-DTE_AOTRITON_COMMIT_SHA1=${AOTRITON_SHA}
-DCMAKE_PROJECT_INCLUDE=${CMAKE_CURRENT_LIST_DIR}/aotriton_custom.cmake
BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/lib/libaotriton${__AOTRITON_SUFFIX}_v2.so"
@@ -97,7 +93,7 @@ if(NOT DEFINED AOTRITON_PATH)
add_library(aotriton INTERFACE)
message(STATUS "Building AOTriton from source.")
string(REPLACE ";" "," ARCH_LIST_COMMA_STR "${CMAKE_HIP_ARCHITECTURES}")
aotriton_build_from_source(${AOTRITON_NOIMAGE_MODE})
aotriton_build_from_source()

# Download GPU kernels if needed
if(NOT USE_FUSED_ATTN_AOTRITON_BUILD_GPU_KERNELS)
@@ -121,8 +117,9 @@ if(NOT DEFINED AOTRITON_PATH)
else()
# Use aotriton built during initial TE building/installation
# When only need rebuild TE library itself
message(STATUS "Using existing AOTriton lib at $ENV{AOTRITON_PATH}")
unset(AOTRITON_LIB CACHE)
find_library(AOTRITON_LIB NAMES aotriton aotriton${__AOTRITON_SUFFIX}_v2 PATHS ${AOTRITON_PATH}/lib REQUIRED NO_DEFAULT_PATH)
find_library(AOTRITON_LIB NAMES aotriton aotriton${__AOTRITON_SUFFIX}_v2 PATHS ${AOTRITON_PATH} REQUIRED NO_DEFAULT_PATH)
add_library( aotriton SHARED IMPORTED )
set_target_properties( aotriton PROPERTIES IMPORTED_LOCATION ${AOTRITON_LIB} )
target_include_directories(aotriton INTERFACE ${AOTRITON_PATH}/include)
3 changes: 3 additions & 0 deletions transformer_engine/common/fused_attn_rocm/fused_attn.cpp
@@ -492,6 +492,7 @@ void nvte_fused_attn_bwd_qkvpacked(const NVTETensor QKV, const NVTETensor O, con
fused_attn_aotriton_bwd_qkvpacked(
b, h, max_seqlen, d,
attn_scale, dropout,
window_size_left, window_size_right,
qkv_layout, bias_type, attn_mask_type,
input_QKV, input_O, input_dO, output_S,
output_dQKV,
@@ -678,6 +679,7 @@ void nvte_fused_attn_bwd_kvpacked(
fused_attn_aotriton_bwd_kvpacked(
b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d,
attn_scale, dropout,
window_size_left, window_size_right,
qkv_layout, bias_type, attn_mask_type,
input_Q, input_KV, input_O, input_dO,
output_S,
@@ -858,6 +860,7 @@ void nvte_fused_attn_bwd(const NVTETensor Q, const NVTETensor K, const NVTETenso
fused_attn_aotriton_bwd(
b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk,
attn_scale, dropout,
window_size_left, window_size_right,
qkv_layout, bias_type, attn_mask_type,
input_Q, input_K, input_V, input_O, input_dO,
output_S,
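
These hunks thread `window_size_left`/`window_size_right` through to the AOTriton backward calls. As an illustration of the parameter convention only (a sketch assuming the usual FlashAttention-style convention, where -1 means unbounded on that side and causal masking is `(-1, 0)`; this sketch is not taken from the PR):

```python
import torch

def sliding_window_mask(seqlen_q, seqlen_kv, window_size_left, window_size_right):
    """Boolean mask (True = attend) for the (left, right) window convention.

    Assumed convention: -1 means unbounded on that side; causal masking is
    commonly expressed as (-1, 0), i.e. unlimited lookback, no lookahead.
    """
    q_idx = torch.arange(seqlen_q).unsqueeze(1)    # [seqlen_q, 1]
    kv_idx = torch.arange(seqlen_kv).unsqueeze(0)  # [1, seqlen_kv]
    # Align the last query with the last key when seqlen_q != seqlen_kv.
    offset = seqlen_kv - seqlen_q
    rel = kv_idx - (q_idx + offset)  # how far the key sits right of the query
    mask = torch.ones(seqlen_q, seqlen_kv, dtype=torch.bool)
    if window_size_left >= 0:
        mask &= rel >= -window_size_left
    if window_size_right >= 0:
        mask &= rel <= window_size_right
    return mask

# Causal under this convention: unlimited left window, zero right window.
print(sliding_window_mask(4, 4, -1, 0))
```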