Allow SM103a builds to pass the arch guard in the Blackwell FMHA kernels (#5134)

henrylhtsang · meta-codesync[bot] · commit 29161632e292 · 2025-11-15T14:25:39.000-08:00
Summary:
Pull Request resolved: #5134
X-link: https://github.com/facebookresearch/FBGEMM/pull/2136
NA
Reviewed By: Aya-ZIbra
Differential Revision: D87104671
fbshipit-source-id: cff2f82455f240dc0a6b94d2615d370a0d0e3e51
diff --git a/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/kernel/sm100_fmha_bwd_kernel_tma_warpspecialized.hpp b/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/kernel/sm100_fmha_bwd_kernel_tma_warpspecialized.hpp
@@ -1649,7 +1649,8 @@ struct Sm100FmhaBwdKernelTmaWarpSpecialized {
 
 
   CUTLASS_DEVICE void operator()(Params const& params, char* smem) {
-#if (! defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && ! defined(CUTLASS_ARCH_MMA_SM100F_ENABLED))
+#if (! defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && \
+     ! defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) && ! defined(CUTLASS_ARCH_MMA_SM103A_ENABLED))
     printf("ERROR : Arch conditional MMA instruction used without targeting appropriate compute capability. Aborting.\n");
 #else
     int warp_idx = cutlass::canonical_warp_idx_sync();
diff --git a/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/kernel/sm100_fmha_bwd_mla_kernel_tma_warpspecialized.hpp b/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/kernel/sm100_fmha_bwd_mla_kernel_tma_warpspecialized.hpp
@@ -1481,9 +1481,10 @@ struct Sm100FmhaBwdMlaKernelTmaWarpSpecialized {
 
 
   CUTLASS_DEVICE void operator()(Params const& params, char* smem) {
-#if (! defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && ! defined(CUTLASS_ARCH_MMA_SM100F_ENABLED))
+#if (! defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && \
+     ! defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) && ! defined(CUTLASS_ARCH_MMA_SM103A_ENABLED))
     printf("ERROR : Arch conditional MMA instruction used without targeting appropriate compute capability. Aborting.\n");
-#else    
+#else
     int warp_idx = cutlass::canonical_warp_idx_sync();
     auto role = warp_idx_to_role(warp_idx);
     uint32_t lane_predicate = cute::elect_one_sync();
diff --git a/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp b/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp
@@ -265,7 +265,8 @@ struct Sm100FmhaFwdKernelTmaWarpspecialized {
   }
 
   CUTLASS_DEVICE void operator()(const Params &params, char* smem) {
-#if (! defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && ! defined(CUTLASS_ARCH_MMA_SM100F_ENABLED))
+#if (! defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && \
+     ! defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) && ! defined(CUTLASS_ARCH_MMA_SM103A_ENABLED))
     printf("ERROR : Arch conditional MMA instruction used without targeting appropriate compute capability. Aborting.\n");
 #else
 
diff --git a/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/kernel/sm100_fmha_gen_kernel_warpspecialized.hpp b/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/kernel/sm100_fmha_gen_kernel_warpspecialized.hpp
@@ -248,7 +248,8 @@ struct Sm100FmhaGenKernelWarpspecialized {
   }
 
   CUTLASS_DEVICE void operator()(const Params &params, char* smem) {
-#if (! defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && ! defined(CUTLASS_ARCH_MMA_SM100F_ENABLED))
+#if (! defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && \
+     ! defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) && ! defined(CUTLASS_ARCH_MMA_SM103A_ENABLED))
     printf("ERROR : Arch conditional MMA instruction used without targeting appropriate compute capability. Aborting.\n");
 #else
 
@@ -280,7 +281,7 @@ struct Sm100FmhaGenKernelWarpspecialized {
       shared_storage.pipelines.load_q,
       pipeline_load_q_params,
       ClusterShape{},  cute::true_type{}, /*mask calc*/cute::false_type{});
-    
+
     typename CollectiveMainloop::PipelineKV::Params pipeline_load_kv_params;
     if (role == WarpRole::Load) {
       pipeline_load_kv_params.role = CollectiveMainloop::PipelineKV::ThreadCategory::Producer;
diff --git a/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/kernel/sm100_fmha_mla_tma_warpspecialized.hpp b/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
@@ -508,7 +508,8 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
 
 
   CUTLASS_DEVICE void operator()(Params const& params, char* smem_raw) {
-#if (! defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && ! defined(CUTLASS_ARCH_MMA_SM100F_ENABLED))
+#if (! defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && \
+     ! defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) && ! defined(CUTLASS_ARCH_MMA_SM103A_ENABLED))
     printf("ERROR : Arch conditional MMA instruction used without targeting appropriate compute capability. Aborting.\n");
 #else
 

Original file line number	Diff line number	Diff line change
`@@ -265,7 +265,8 @@ struct Sm100FmhaFwdKernelTmaWarpspecialized {`
`265`	`265`	`}`
`266`	`266`
`267`	`267`	`CUTLASS_DEVICE void operator()(const Params &params, char* smem) {`
`268`		`-#if (! defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && ! defined(CUTLASS_ARCH_MMA_SM100F_ENABLED))`
	`268`	`+#if (! defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && \`
	`269`	`+ ! defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) && ! defined(CUTLASS_ARCH_MMA_SM103A_ENABLED))`
`269`	`270`	`printf("ERROR : Arch conditional MMA instruction used without targeting appropriate compute capability. Aborting.\n");`
`270`	`271`	`#else`
`271`	`272`