
Commit 138c0a3

jbschlosser authored and meta-codesync[bot] committed
Support Blackwell CUTLASS attention kernels in torch.compile (#5136)
Summary:
X-link: https://github.com/facebookresearch/FBGEMM/pull/2138

Pull Request resolved: #5136

Support the fbgemm Blackwell CUTLASS attention kernel in torch.compile by adding a C++-side meta function.

Reviewed By: henrylhtsang

Differential Revision: D86986981

fbshipit-source-id: 066cd6b93c2d815e3f4e180806dd0af243db5724
1 parent bc6d968 commit 138c0a3
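A quick way to see what the new Meta registrations buy: under FakeTensorMode (the mechanism torch.compile uses for shape inference), calls to the fbgemm ops are answered by the C++ meta functions added in this commit rather than the CUDA kernels, so output shapes can be derived without touching a GPU. The sketch below is illustrative only and not part of the commit; it assumes the fbgemm_gpu gen_ai extension that registers fbgemm::fmha_fwd has been loaded, and the positional call relies only on the operator schema shown in the diffs below (query, key, value first, remaining arguments left at their defaults).

import torch
from torch._subclasses.fake_tensor import FakeTensorMode

# Hypothetical sizes for a dense (non-varlen) case.
B, SQ, SK, HQ, HKV, D = 2, 64, 128, 8, 2, 128

with FakeTensorMode():
    # Fake tensors carry only metadata (shape/dtype/device); nothing is allocated,
    # and the op call below dispatches to dispatch_fmha_fwd_meta, not the CUDA kernel.
    q = torch.empty(B, SQ, HQ, D, dtype=torch.bfloat16, device="cuda")
    k = torch.empty(B, SK, HKV, D, dtype=torch.bfloat16, device="cuda")
    v = torch.empty(B, SK, HKV, D, dtype=torch.bfloat16, device="cuda")
    out, lse = torch.ops.fbgemm.fmha_fwd(q, k, v)
    assert out.shape == q.shape        # meta returns at::empty_like(q)
    assert lse.shape == (B, HQ, SQ)    # meta returns logsumexp of shape (B, H_Q, SQ)

This is the same mechanism the new test_compile test exercises end to end via torch.compile(cutlass_blackwell_fmha_func, fullgraph=True).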

File tree

4 files changed, +146 -15 lines


fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/blackwell_fmha_bwd.cu

Lines changed: 27 additions & 4 deletions
@@ -171,6 +171,26 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> dispatch_fmha_bwd(
 }
 }
 
+std::tuple<at::Tensor, at::Tensor, at::Tensor> dispatch_fmha_bwd_meta(
+    const at::Tensor& dOutput,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const at::Tensor& output,
+    const at::Tensor& softmax_lse,
+    const std::optional<at::Tensor>& cu_seqlens_q,
+    const std::optional<at::Tensor>& cu_seqlens_k,
+    std::optional<c10::SymInt> max_seq_len_q,
+    std::optional<c10::SymInt> max_seq_len_k,
+    std::optional<double> softmax_scale,
+    bool causal,
+    c10::SymInt window_size_left,
+    c10::SymInt window_size_right,
+    bool bottom_right,
+    bool deterministic) {
+  return std::make_tuple(at::empty_like(query), at::empty_like(key), at::empty_like(value));
+}
+
 // -------------------------------------------------------------------------------------------------
 // Op registration
 // -------------------------------------------------------------------------------------------------
@@ -185,12 +205,12 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
     " Tensor softmax_lse, "
     " Tensor? cu_seqlens_q=None, "
     " Tensor? cu_seqlens_k=None, "
-    " int? max_seq_len_q=None, "
-    " int? max_seq_len_k=None, "
+    " SymInt? max_seq_len_q=None, "
+    " SymInt? max_seq_len_k=None, "
     " float? softmax_scale=None, "
     " bool causal=False, "
-    " int window_size_left=-1, "
-    " int window_size_right=-1, "
+    " SymInt window_size_left=-1, "
+    " SymInt window_size_right=-1, "
     " bool bottom_right=True, "
     " bool deterministic=False"
     ") -> (Tensor, Tensor, Tensor)");
@@ -199,4 +219,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
 TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) {
   m.impl("fmha_bwd", dispatch_fmha_bwd);
 }
+TORCH_LIBRARY_IMPL(fbgemm, Meta, m) {
+  m.impl("fmha_bwd", dispatch_fmha_bwd_meta);
+}
 #endif // CUTLASS_ARCH_MMA_SM100_SUPPORTED

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/blackwell_fmha_fwd.cu

Lines changed: 33 additions & 5 deletions
@@ -139,6 +139,31 @@ std::tuple<at::Tensor, at::Tensor> dispatch_fmha_fwd(
 }
 }
 
+std::tuple<at::Tensor, at::Tensor> dispatch_fmha_fwd_meta(
+    const at::Tensor& q,
+    const at::Tensor& k, // (batch_size, KV_seqlen, num_KV_heads, head_dim) if non-paged or (num_blocks, page_block_size, num_KV_heads, head_dim) if paged
+    const at::Tensor& v, // (batch_size, KV_seqlen, num_KV_heads, head_dim) if non-paged or (num_blocks, page_block_size, num_KV_heads, head_dim) if paged
+    const std::optional<at::Tensor>& cu_seqlens_q,
+    const std::optional<at::Tensor>& cu_seqlens_k,
+    std::optional<c10::SymInt> max_seq_len_q,
+    std::optional<c10::SymInt> max_seq_len_k,
+    std::optional<double> softmax_scale,
+    bool causal,
+    const std::optional<at::Tensor>& seqlen_kv,
+    const std::optional<at::Tensor>& page_table, // dim: (batch_size, max_num_pages_per_seq) , null if non-paged
+    std::optional<c10::SymInt> seqlen_k,
+    c10::SymInt window_size_left,
+    c10::SymInt window_size_right,
+    bool bottom_right) {
+  auto output = at::empty_like(q);
+  bool k_is_varlen = max_seq_len_q.has_value();
+  auto SQ = k_is_varlen ? q.sym_size(0) : q.sym_size(1);
+  auto H_Q = k_is_varlen ? q.sym_size(1) : q.sym_size(2);
+  auto B = k_is_varlen ? 1 : q.sym_size(0);
+  auto logsumexp = q.new_empty_symint({B, H_Q, SQ}, q.options());
+  return std::make_tuple(output, logsumexp);
+}
+
 // -------------------------------------------------------------------------------------------------
 // Op registration
 // -------------------------------------------------------------------------------------------------
@@ -150,20 +175,23 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
     " Tensor value, "
     " Tensor? cu_seqlens_q=None, "
     " Tensor? cu_seqlens_k=None, "
-    " int? max_seq_len_q=None, "
-    " int? max_seq_len_k=None, "
+    " SymInt? max_seq_len_q=None, "
+    " SymInt? max_seq_len_k=None, "
     " float? softmax_scale=None, "
     " bool causal=False, "
     " Tensor? seqlen_kv=None, "
     " Tensor? page_table=None, "
-    " int? seqlen_k=None, "
-    " int window_size_left=-1, "
-    " int window_size_right=-1, "
+    " SymInt? seqlen_k=None, "
+    " SymInt window_size_left=-1, "
+    " SymInt window_size_right=-1, "
     " bool bottom_right=True"
     ") -> (Tensor, Tensor)");
 }
 
 TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) {
   m.impl("fmha_fwd", dispatch_fmha_fwd);
 }
+TORCH_LIBRARY_IMPL(fbgemm, Meta, m) {
+  m.impl("fmha_fwd", dispatch_fmha_fwd_meta);
+}
 #endif // CUTLASS_ARCH_MMA_SM100_SUPPORTED
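For reference, the shape bookkeeping in dispatch_fmha_fwd_meta above can be written out in plain Python. This mirror is illustrative only (the helper name and layout comments are ours, not part of the commit); "varlen" here means max_seq_len_q was supplied, matching the has_value() check in the C++ code.

def fmha_fwd_meta_shapes(q_shape, varlen):
    # Mirrors dispatch_fmha_fwd_meta: output matches q; logsumexp is (B, H_Q, SQ).
    if varlen:
        # Packed/varlen layout: q is (total_tokens, num_Q_heads, head_dim); batch folds to 1.
        total, heads = q_shape[0], q_shape[1]
        return q_shape, (1, heads, total)
    # Dense layout: q is (batch_size, Q_seqlen, num_Q_heads, head_dim).
    batch, seqlen, heads = q_shape[0], q_shape[1], q_shape[2]
    return q_shape, (batch, heads, seqlen)

assert fmha_fwd_meta_shapes((2, 64, 8, 128), varlen=False) == ((2, 64, 8, 128), (2, 8, 64))
assert fmha_fwd_meta_shapes((4096, 8, 128), varlen=True) == ((4096, 8, 128), (1, 8, 4096))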

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/blackwell_gen_impl.cu

Lines changed: 13 additions & 0 deletions
@@ -310,6 +310,16 @@ at::Tensor dispatch_fmha_gen_fwd(
   });
 }
 
+at::Tensor dispatch_fmha_gen_fwd_meta(
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const at::Tensor& seqlen_kv,
+    const std::optional<at::Tensor>& batch_idx,
+    int64_t kernel_type
+) {
+  return at::empty_like(q);
+}
 
 // -------------------------------------------------------------------------------------------------
 // Op registration
@@ -329,4 +339,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
 TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) {
   m.impl("fmha_gen_fwd", dispatch_fmha_gen_fwd);
 }
+TORCH_LIBRARY_IMPL(fbgemm, Meta, m) {
+  m.impl("fmha_gen_fwd", dispatch_fmha_gen_fwd_meta);
+}
 #endif // CUTLASS_ARCH_MMA_SM100_SUPPORTED

fbgemm_gpu/experimental/gen_ai/test/attention/blackwell_fmha_test.py

Lines changed: 73 additions & 6 deletions
@@ -293,6 +293,7 @@ def _execute_cutlass_blackwell_attn_dense(
         deterministic: bool,
         sm_scale: Optional[float],
         is_paged: Optional[bool],
+        use_compile: bool = False,
     ) -> None:
         device = torch.accelerator.current_accelerator()
         assert device is not None
@@ -369,9 +370,12 @@ def _execute_cutlass_blackwell_attn_dense(
         )
 
         # Run tested kernel
+        func_to_test = cutlass_blackwell_fmha_func
+        if use_compile:
+            func_to_test = torch.compile(func_to_test, fullgraph=True)
         if is_paged:
             assert k_paged is not None and v_paged is not None
-            out_paged = cutlass_blackwell_fmha_func(
+            out_paged = func_to_test(
                 q,
                 k_paged,
                 v_paged,
@@ -384,7 +388,7 @@ def _execute_cutlass_blackwell_attn_dense(
                 softmax_scale=sm_scale,
             )
 
-        out = cutlass_blackwell_fmha_func(
+        out = func_to_test(
             q,
             k,
             v,
@@ -411,7 +415,7 @@ def _execute_cutlass_blackwell_attn_dense(
 
         if deterministic:
             # Rerun the test. The outputs must be bit-wise exact
-            out_d = cutlass_blackwell_fmha_func(
+            out_d = func_to_test(
                 q,
                 cast(torch.Tensor, k_paged) if is_paged else k,
                 cast(torch.Tensor, v_paged) if is_paged else v,
@@ -479,6 +483,7 @@ def _execute_cutlass_blackwell_attn_varlen(
         deterministic: bool,
         sm_scale: Optional[float],
         is_paged: Optional[bool],
+        use_compile: bool = False,
     ) -> None:
         device = torch.accelerator.current_accelerator()
         assert device is not None
@@ -572,9 +577,12 @@ def _execute_cutlass_blackwell_attn_varlen(
             softmax_scale=sm_scale,
         )
 
+        func_to_test = cutlass_blackwell_fmha_func
+        if use_compile:
+            func_to_test = torch.compile(func_to_test, fullgraph=True)
         if is_paged:
             assert k_paged is not None and v_paged is not None
-            out_unpad_paged = cutlass_blackwell_fmha_func(
+            out_unpad_paged = func_to_test(
                 q_unpad,
                 k_paged,
                 v_paged,
@@ -590,7 +598,7 @@ def _execute_cutlass_blackwell_attn_varlen(
             )
             out_paged = output_pad_fn(out_unpad_paged)
 
-        out_unpad = cutlass_blackwell_fmha_func(
+        out_unpad = func_to_test(
             q_unpad,
             k_unpad,
             v_unpad,
@@ -617,7 +625,7 @@ def _execute_cutlass_blackwell_attn_varlen(
 
         if deterministic:
             # Rerun the test. The outputs must be bit-wise exact
-            out_unpad_d = cutlass_blackwell_fmha_func(
+            out_unpad_d = func_to_test(
                 q_unpad,
                 cast(torch.Tensor, k_paged) if is_paged else k_unpad,
                 cast(torch.Tensor, v_paged) if is_paged else v_unpad,
@@ -1165,3 +1173,62 @@ def test_backward(
             sm_scale=sm_scale,
             is_paged=False,
         )
+
+    @skip_cuda_lt_sm100
+    @skip_rocm
+    @parameterized.expand(
+        [
+            (
+                is_varlen,
+                is_mqa,
+                seqlen_q,
+            )
+            for is_varlen in [False, True]
+            for is_mqa in [False, True]
+            for seqlen_q in [1, 64]
+        ]
+    )
+    def test_compile(
+        self,
+        is_varlen: bool,
+        is_mqa: bool,
+        seqlen_q: int,
+    ):
+        test_func = (
+            self._execute_cutlass_blackwell_attn_varlen
+            if is_varlen
+            else self._execute_cutlass_blackwell_attn_dense
+        )
+        q_heads = 8
+        kv_heads = 2 if is_mqa else q_heads
+        batch_size = 2
+        seqlen_k = 128
+        kv_heads = 2
+        head_dim = 128
+        dtype = torch.bfloat16
+        causal = True
+        # Decode kernel does not support sliding window attention yet
+        window_size = (-1, -1)
+        deterministic = False
+        # Backward pass is not supported for generation phase (sq=1)
+        is_decode = seqlen_q == 1
+        fwd_only = is_decode
+        # Decode kernel does not support sm_scale
+        sm_scale = None if is_decode else 1.0 / head_dim
+
+        test_func(
+            batch_size,
+            seqlen_q,
+            seqlen_k,
+            q_heads=q_heads,
+            kv_heads=kv_heads,
+            head_dim=head_dim,
+            page_block_size=0,
+            dtype=dtype,
+            causal=causal,
+            window_size=window_size,
+            fwd_only=fwd_only,
+            deterministic=deterministic,
+            sm_scale=sm_scale,
+            is_paged=False,
+        )
