diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index b9e2b907e0..a6ba451788 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -36,7 +36,11 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) endif() endif() -# Process CMAKE_CUDA_ARCHITECTURES to separate generic and specific architectures +# Process CMAKE_CUDA_ARCHITECTURES to separate standard, generic, and specific architectures. +# - NVTE_STANDARD_ARCHS: pre-Blackwell archs (e.g. 75, 80, 89, 90). Applied to all CUDA sources. +# - NVTE_GENERIC_ARCHS: Blackwell family heads (e.g. 100, 120). Applied to non-arch-specific sources only. +# - NVTE_SPECIFIC_ARCHS: Blackwell specific targets (e.g. 100a, 120f). Applied to arch-specific sources only. +set(NVTE_STANDARD_ARCHS) set(NVTE_GENERIC_ARCHS) set(NVTE_SPECIFIC_ARCHS) @@ -79,6 +83,10 @@ if(NOT arch_120_index EQUAL -1) endif() endif() +# Copy the entries still listed in CMAKE_CUDA_ARCHITECTURES into NVTE_STANDARD_ARCHS (presumably the pre-Blackwell archs left over from the handling above -- TODO confirm). +# These are applied to all CUDA sources (both generic and arch-specific); each entry is spliced verbatim into compute_<arch>/sm_<arch> flags. +set(NVTE_STANDARD_ARCHS ${CMAKE_CUDA_ARCHITECTURES}) + # cuDNN frontend API set(CUDNN_FRONTEND_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include") @@ -192,9 +200,13 @@ list(APPEND transformer_engine_SOURCES ${transformer_engine_cuda_arch_specific_s ${transformer_engine_cuda_sources} ${transformer_engine_cpp_sources}) -# Set compile options for CUDA sources with generic architectures +# Build the per-source --generate-code options for the non-arch-specific CUDA sources. +# These get standard archs (pre-Blackwell) + generic Blackwell family heads. 
foreach(cuda_source IN LISTS transformer_engine_cuda_sources) set(arch_compile_options) + foreach(arch IN LISTS NVTE_STANDARD_ARCHS) + list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}") + endforeach() foreach(arch IN LISTS NVTE_GENERIC_ARCHS) list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}") endforeach() @@ -209,9 +221,14 @@ foreach(cuda_source IN LISTS transformer_engine_cuda_sources) endif() endforeach() -# Set compile options for CUDA sources with specific architectures +# Build the per-source --generate-code options for the arch-specific CUDA sources. +# These get standard archs (pre-Blackwell) + Blackwell specific targets (a/f suffix). +# They must NOT get generic Blackwell archs, as they use family/arch-specific PTX features. foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources) set(arch_compile_options) + foreach(arch IN LISTS NVTE_STANDARD_ARCHS) + list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}") + endforeach() foreach(arch IN LISTS NVTE_SPECIFIC_ARCHS) list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}") endforeach() @@ -232,6 +249,10 @@ list(APPEND transformer_engine_SOURCES endif() add_library(transformer_engine SHARED ${transformer_engine_SOURCES}) +# Set CUDA_ARCHITECTURES to OFF so CMake adds no architecture flags of its own to this target. +# All architectures are handled explicitly via per-source COMPILE_OPTIONS +# using NVTE_STANDARD_ARCHS, NVTE_GENERIC_ARCHS, and NVTE_SPECIFIC_ARCHS above. +set_target_properties(transformer_engine PROPERTIES CUDA_ARCHITECTURES OFF) target_include_directories(transformer_engine PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")