13 | 13 | from __future__ import annotations |
14 | 14 |
15 | 15 | import math |
| 16 | +from typing import Callable |
16 | 17 |
17 | 18 | import torch |
18 | 19 | from triton.testing import do_bench |
@@ -96,8 +97,8 @@ def _fma_f32x2(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor) -> torch.Tenso |
96 | 97 | static_shapes=True, |
97 | 98 | autotune_accuracy_check=False, |
98 | 99 | ) |
99 | | -def blackwell_attention( |
100 | | - q_in: torch.Tensor, k_in: torch.Tensor, v_in: torch.Tensor |
| 100 | +def blackwell_attention_kernel( |
| 101 | + q_in: torch.Tensor, k_in: torch.Tensor, v_in: torch.Tensor, qk_scale: float |
101 | 102 | ) -> tuple[torch.Tensor, torch.Tensor]: |
102 | 103 | """ |
103 | 104 | Computes scaled dot-product attention. |
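This hunk renames the kernel entry point and hoists the softmax scale out of the body into an explicit `qk_scale` argument, so the call site now chooses the scale. A minimal calling sketch, assuming the kernel from this file is in scope, a Helion-supported GPU is available, and with made-up shapes and dtype:

```python
import math

import torch

# Hypothetical call site; B, H, M, D and the dtype are assumptions.
# Assumes blackwell_attention_kernel from this file is in scope.
B, H, M, D = 2, 4, 1024, 128
q = torch.randn(B, H, M, D, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)

# The scale is no longer derived from D inside the kernel: pass the classic
# 1/sqrt(D), or any other temperature the caller wants.
out, lse = blackwell_attention_kernel(q, k, v, qk_scale=1.0 / math.sqrt(D))
```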
@@ -143,8 +144,7 @@ def blackwell_attention( |
143 | 144 | hl.register_tunable("_triton_config_maxRegAutoWS", EnumFragment(choices=(152, 192))) |
144 | 145 | SUBTILING = True |
145 | 146 | VECT_MUL = 1 |
146 | | - sm_scale = 1.0 / math.sqrt(D) |
147 | | - qk_scale = sm_scale * 1.44269504 # 1/log(2) |
| 147 | + qk_scale = qk_scale * 1.44269504 # 1/log(2) |
148 | 148 | for tile_m in hl.tile(MM, block_size=block_m): |
149 | 149 | m_i = hl.zeros([tile_m]) - float("inf") |
150 | 150 | l_i = hl.zeros([tile_m]) + 1.0 |
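The surviving line folds 1.44269504 ≈ log2(e) (the `# 1/log(2)` constant) into the caller-supplied scale because the tiled softmax is evaluated with base-2 exponentials, which map directly to the GPU's fast `exp2`. A small sketch of the identity being relied on; the vector size and the example 1/sqrt(D) scale are assumptions:

```python
import math

import torch

# exp(x * s) == 2 ** (x * s * log2(e)), so folding log2(e) into qk_scale lets
# the inner loop use exp2 instead of exp.
x = torch.randn(16)
s = 1.0 / math.sqrt(128)   # example 1/sqrt(D) scale (assumption)
log2e = 1.44269504         # log2(e) == 1/ln(2), the same constant as in the kernel
torch.testing.assert_close(torch.exp(x * s), torch.exp2(x * s * log2e))
```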
@@ -205,6 +205,18 @@ def blackwell_attention( |
205 | 205 | return o.reshape(B, H, M, Dv), lse.reshape(B, H, M) |
206 | 206 |
207 | 207 |
| 208 | +def blackwell_attention( |
| 209 | + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor |
| 210 | +) -> tuple[torch.Tensor, torch.Tensor]: |
| 211 | + return blackwell_attention_kernel(q, k, v, qk_scale=math.sqrt(1.0 / q.shape[-1])) |
| 212 | + |
| 213 | + |
| 214 | +def blackwell_attention_tritonbench( |
| 215 | + tb_mod: object, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor |
| 216 | +) -> Callable: |
| 217 | + return lambda: blackwell_attention(q, k, v) |
| 218 | + |
| 219 | + |
208 | 220 | # %% |
209 | 221 | # Testing Function |
210 | 222 | # ---------------- |
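The new `blackwell_attention` wrapper preserves the old default by passing `qk_scale = sqrt(1/D) = 1/sqrt(D)`, and `blackwell_attention_tritonbench` adapts it to TritonBench's convention of returning a zero-argument callable. A hedged smoke test of the default-scale path; shapes, dtype, and tolerances are assumptions, and a Helion-supported GPU is required:

```python
import torch

# Hypothetical sanity check against PyTorch SDPA, whose default scale is also 1/sqrt(D).
# Assumes blackwell_attention from this file is in scope.
B, H, M, D = 2, 4, 512, 128
q = torch.randn(B, H, M, D, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)

out, lse = blackwell_attention(q, k, v)
ref = torch.nn.functional.scaled_dot_product_attention(q, k, v)
torch.testing.assert_close(out, ref, atol=2e-2, rtol=2e-2)
```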