Commit d4d122b

Register tile symbol origin, to support tile + offset use case in blackwell attention (#939)
1 parent 3c8c390 commit d4d122b

7 files changed (+243, -13 lines)
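
The use case this commit enables, excerpted (lightly trimmed) from the test added to test/test_indexing.py below — an expression derived from tile_m.begin is used as a loop-invariant offset when indexing with another tile (hl is helion.language; M, N, MM, block_m, block_n, k, v are as defined in that test):

    # Excerpt from test_tile_with_offset_from_expr (full test in the diff below).
    for tile_m in hl.tile(MM, block_size=block_m):
        start_N = tile_m.begin // M * N              # symbolic offset built from tile_m.begin
        for tile_n in hl.tile(0, N, block_size=block_n):
            k_j = k[tile_n + start_N, :]             # tile + offset indexing
            v_j = v[tile_n + start_N, :]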

helion/_compiler/compile_environment.py

Lines changed: 2 additions & 2 deletions

@@ -416,7 +416,7 @@ def has_current() -> bool:
         except NoCurrentEnvironment:
             return False
 
-    def get_block_id(self, size: int | torch.SymInt | sympy.Expr) -> int | None:
+    def get_block_id(self, size: int | torch.SymInt | sympy.Basic) -> int | None:
         """
         Get the block ID associated with a given size expression.
 
@@ -425,7 +425,7 @@ def get_block_id(self, size: int | torch.SymInt | sympy.Expr) -> int | None:
         symbolic expressions to find their associated block IDs.
 
         Args:
-            size: The size expression to check. Can be an integer, torch.SymInt, or sympy.Expr.
+            size: The size expression to check. Can be an integer, torch.SymInt, or sympy.Basic.
 
         Returns:
             The block ID if the size corresponds to a registered block size, None otherwise.
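
For context (not part of the diff): sympy.Basic is the base class of every sympy object, while sympy.Expr covers only arithmetic expressions, so the widened annotation presumably lets callers pass arbitrary sympy objects without narrowing them first. A minimal plain-sympy illustration of the difference:

    import sympy

    x = sympy.Symbol("x")
    assert isinstance(x + 1, sympy.Expr)                # arithmetic expressions are Expr (and Basic)
    assert isinstance(sympy.Eq(x, 1), sympy.Basic)      # relationals are Basic...
    assert not isinstance(sympy.Eq(x, 1), sympy.Expr)   # ...but not Expr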

helion/_compiler/generate_ast.py

Lines changed: 57 additions & 1 deletion

@@ -25,15 +25,17 @@
 from .inductor_lowering import CodegenState
 from .inductor_lowering import codegen_call_with_graph
 from .program_id import ForEachProgramID
+from .tile_strategy import DeviceLoopState
 from .variable_origin import ArgumentOrigin
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
 
+    import sympy
+
     from ..runtime import Config
     from .host_function import HostFunction
     from .tile_strategy import DeviceLoopOrGridState
-    from .tile_strategy import DeviceLoopState
     from .type_propagation import TensorType
 
 
@@ -97,6 +99,60 @@ def lift(self, expr: ast.AST, *, dce: bool = False, prefix: str = "v") -> ast.Name:
             )
         return create(ast.Name, id=varname, ctx=ast.Load())
 
+    def lift_symnode(
+        self,
+        expr: ast.AST,
+        sym_expr: sympy.Expr,
+        *,
+        dce: bool = False,
+        prefix: str = "symnode",
+    ) -> ast.Name:
+        if isinstance(expr, ast.Name):
+            return expr
+        assert isinstance(expr, ExtendedAST), expr
+
+        target_statements = self.statements_stack[-1]
+        env = CompileEnvironment.current()
+        # Identify every block dimension the symbolic value depends on so we know
+        # which loop nests the expression depends on.
+        dep_block_ids = {
+            block_id
+            for symbol in sym_expr.free_symbols
+            if (block_id := env.get_block_id(symbol)) is not None
+        }
+
+        # Walk outward through the active device loops: as soon as we see a loop
+        # whose block id appears in the dependency set we must stop, otherwise we
+        # can safely hoist into that loop's outer prefix (which executes before the
+        # loop body).
+        for loop_state in reversed(self._active_loop_stack()):
+            if dep_block_ids.intersection(loop_state.block_ids):
+                break
+            target_statements = loop_state.outer_prefix
+
+        with expr:
+            varname = self.tmpvar(dce=dce, prefix=prefix)
+            # Emit the temporary into the chosen statement list so the symbolic
+            # expression is computed exactly once at the appropriate scope.
+            target_statements.append(
+                statement_from_string(f"{varname} = {{expr}}", expr=expr)
+            )
+        # Reuse the temporary everywhere else in the kernel body.
+        return create(ast.Name, id=varname, ctx=ast.Load())
+
+    def _active_loop_stack(self) -> list[DeviceLoopState]:
+        seen: set[int] = set()
+        stack: list[DeviceLoopState] = []
+        for loops in self.active_device_loops.values():
+            for loop_state in loops:
+                if not isinstance(loop_state, DeviceLoopState):
+                    continue
+                key = id(loop_state)
+                if key not in seen:
+                    stack.append(loop_state)
+                    seen.add(key)
+        return stack
+
     @contextlib.contextmanager
     def set_statements(self, new_statements: list[ast.AST] | None) -> Iterator[None]:
         if new_statements is None:
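
A standalone sketch of the hoisting scan performed by lift_symnode, using hypothetical stand-ins for DeviceLoopState and its block_ids/outer_prefix fields (an illustration of the placement logic, not the Helion API):

    from dataclasses import dataclass, field

    @dataclass
    class Loop:
        block_ids: set[int]                                     # block ids this loop introduces
        outer_prefix: list[str] = field(default_factory=list)   # statements emitted just before the loop
        body: list[str] = field(default_factory=list)

    def place(stmt: str, dep_block_ids: set[int], active_loops: list[Loop], current: list[str]) -> None:
        target = current                         # default: innermost statement list
        for loop in reversed(active_loops):      # walk from innermost loop outward
            if dep_block_ids & loop.block_ids:   # depends on this loop's index: cannot hoist past it
                break
            target = loop.outer_prefix           # otherwise hoist above this loop
        target.append(stmt)

    outer, inner = Loop({0}), Loop({1})
    # A value that depends only on the outer loop's block id gets hoisted out of the
    # inner loop (into inner.outer_prefix) but stays inside the outer loop.
    place("symnode_0 = offset_0 // heads", {0}, [outer, inner], inner.body)
    assert inner.outer_prefix == ["symnode_0 = offset_0 // heads"]
    assert outer.outer_prefix == [] and inner.body == []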

helion/language/_tracing_ops.py

Lines changed: 4 additions & 2 deletions

@@ -54,8 +54,10 @@ def _(state: CodegenState) -> ast.AST:
         if block_size_var is None:
             return expr_from_string("1")
         return expr_from_string(block_size_var)
-    return state.codegen.lift(
-        expr_from_string(state.sympy_expr(val._sympy_())),
+    sym_expr = val._sympy_()
+    return state.codegen.lift_symnode(
+        expr_from_string(state.sympy_expr(sym_expr)),
+        sym_expr,
         dce=True,
         prefix="symnode",
     )
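
The new sym_expr argument is what lift_symnode inspects: it walks sym_expr.free_symbols and maps each symbol back to a block id via get_block_id. A plain-sympy illustration of that inspection (symbol names chosen to mirror the symnode expressions in the expected-output diffs below, purely illustrative):

    import sympy

    offset_0, heads = sympy.symbols("offset_0 heads", integer=True, positive=True)
    sym_expr = sympy.floor(offset_0 / heads)   # e.g. the floor-division symnode in the expected output
    assert sym_expr.free_symbols == {offset_0, heads}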

helion/language/tile_ops.py

Lines changed: 24 additions & 6 deletions

@@ -7,6 +7,9 @@
 from .. import exc
 from .._compiler.ast_extension import expr_from_string
 from .._compiler.compile_environment import CompileEnvironment
+from .._compiler.host_function import HostFunction
+from .._compiler.host_function import SymbolOrigin
+from .._compiler.variable_origin import GridOrigin
 from . import _decorators
 
 if TYPE_CHECKING:
@@ -17,6 +20,13 @@
     from .tile_interface import TileInterface
 
 
+def _register_tile_symbol_origin(symbol: torch.SymInt, tile_index: int) -> None:
+    """Register the origin for a tile-related symbol so it can be resolved during codegen."""
+    HostFunction.current().expr_to_origin[symbol._sympy_()] = SymbolOrigin(
+        GridOrigin(tile_index)
+    )
+
+
 @_decorators.api(tiles_as_sizes=True)
 def tile_index(tile: TileInterface) -> torch.Tensor:
     """
@@ -68,10 +78,12 @@ def tile_begin(tile: TileInterface) -> int:
 
 @_decorators.register_fake(tile_begin)
 def _(tile: torch.SymInt) -> torch.SymInt:
-    _disable_flatten_get_tile(tile)  # update config spec if needed
-    return CompileEnvironment.current().cached_create_unbacked_symint(
+    index = _disable_flatten_get_tile(tile)  # update config spec if needed
+    result = CompileEnvironment.current().cached_create_unbacked_symint(
         ("tile_begin", tile)
     )
+    _register_tile_symbol_origin(result, index)
+    return result
 
 
 def _disable_flatten_get_tile(tile: object) -> int:
@@ -109,10 +121,12 @@ def tile_end(tile: TileInterface) -> int:
 
 @_decorators.register_fake(tile_end)
 def _(tile: torch.SymInt) -> torch.SymInt:
-    _disable_flatten_get_tile(tile)  # update config spec if needed
-    return CompileEnvironment.current().cached_create_unbacked_symint(
+    index = _disable_flatten_get_tile(tile)  # update config spec if needed
+    result = CompileEnvironment.current().cached_create_unbacked_symint(
         ("tile_end", tile)
     )
+    _register_tile_symbol_origin(result, index)
+    return result
 
 
 @_decorators.codegen(tile_end)
@@ -175,9 +189,13 @@ def tile_id(tile: TileInterface) -> int:
 
 @_decorators.register_fake(tile_id)
 def _(tile: torch.SymInt) -> torch.SymInt:
-    _disable_flatten_get_tile(tile)  # update config spec if needed
+    index = _disable_flatten_get_tile(tile)  # update config spec if needed
     assert isinstance(tile, torch.SymInt)
-    return CompileEnvironment.current().cached_create_unbacked_symint(("tile_id", tile))
+    result = CompileEnvironment.current().cached_create_unbacked_symint(
+        ("tile_id", tile)
+    )
+    _register_tile_symbol_origin(result, index)
+    return result
 
 
 @_decorators.codegen(tile_id)

test/test_examples.expected

Lines changed: 2 additions & 2 deletions

@@ -987,6 +987,8 @@ def _helion_fp8_attention_kernel(q, k, v, out, out_stride_0, heads, _RDIM_SIZE_2
     pid_0 = tl.program_id(0)
     offset_0 = pid_0
     indices_5 = tl.arange(0, _RDIM_SIZE_2).to(tl.int32)
+    symnode_0 = triton_helpers.div_floor_integer(offset_0, heads)
+    symnode_1 = triton_helpers.remainder_integer(offset_0, heads)
     for offset_4 in tl.range(0, 256, _BLOCK_SIZE_1):
         indices_4 = offset_4 + tl.arange(0, _BLOCK_SIZE_1).to(tl.int32)
         m_i = tl.full([_BLOCK_SIZE_1], float('-inf'), tl.float32)
@@ -1028,8 +1030,6 @@ def _helion_fp8_attention_kernel(q, k, v, out, out_stride_0, heads, _RDIM_SIZE_2
         subscript_2 = l_i[:, None]
         v_11 = acc / subscript_2
         v_12 = tl.cast(v_11, tl.float8e4nv)
-        symnode_0 = triton_helpers.div_floor_integer(offset_0, heads)
-        symnode_1 = triton_helpers.remainder_integer(offset_0, heads)
         tl.store(out + (symnode_0 * out_stride_0 + symnode_1 * 16384 + indices_4[:, None] * 64 + indices_5[None, :] * 1), v_12, None)
 
 def fp8_attention_kernel(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, batch: int, heads: int, *, _launcher=_default_launcher):

test/test_indexing.expected

Lines changed: 87 additions & 0 deletions

@@ -462,6 +462,93 @@ def tile_offset_kernel(x: torch.Tensor, *, _launcher=_default_launcher):
     _launcher(_helion_tile_offset_kernel, (triton.cdiv(out.size(0), _BLOCK_SIZE_0),), out, x, out.size(0), x.size(0), out.stride(0), x.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return out
 
+--- assertExpectedJournal(TestIndexing.test_tile_with_offset_from_expr)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from torch._inductor.runtime import triton_helpers
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_attention(q, k, v, lse, o, _BLOCK_SIZE_0: tl.constexpr, _RDIM_SIZE_2: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    indices_3 = tl.arange(0, _RDIM_SIZE_2).to(tl.int32)
+    full = tl.full([_BLOCK_SIZE_0], 0.0, tl.float32)
+    v_0 = float('inf')
+    v_1 = full - v_0
+    full_1 = tl.full([_BLOCK_SIZE_0], 0.0, tl.float32)
+    v_2 = 1.0
+    v_3 = full_1 + v_2
+    acc = tl.full([_BLOCK_SIZE_0, 64], 0.0, tl.float32)
+    q_i = tl.load(q + (indices_0[:, None] * 64 + indices_3[None, :] * 1), None)
+    symnode_0 = 64 * triton_helpers.div_floor_integer(offset_0, 64)
+    for offset_2 in tl.range(0, 64, _BLOCK_SIZE_1):
+        indices_2 = offset_2 + tl.arange(0, _BLOCK_SIZE_1).to(tl.int32)
+        q_i_copy = q_i
+        v_1_copy = v_1
+        acc_copy = acc
+        v_3_copy = v_3
+        q_i_copy_0 = q_i_copy
+        v_1_copy_0 = v_1_copy
+        acc_copy_0 = acc_copy
+        v_3_copy_0 = v_3_copy
+        v_4 = tl.cast(symnode_0, tl.int32)
+        v_5 = indices_2 + v_4
+        k_j = tl.load(k + ((indices_2 + symnode_0)[:, None] * 64 + indices_3[None, :] * 1), None)
+        v_6 = tl.cast(symnode_0, tl.int32)
+        v_7 = indices_2 + v_6
+        v_j = tl.load(v + ((indices_2 + symnode_0)[:, None] * 64 + indices_3[None, :] * 1), None)
+        permute = tl.permute(k_j, [1, 0])
+        qk = tl.dot(tl.cast(q_i_copy_0, tl.bfloat16), tl.cast(permute, tl.bfloat16), input_precision='tf32', out_dtype=tl.float32)
+        amax = tl.cast(tl.max(qk, 1), tl.float32)
+        v_8 = 0.18033688
+        v_9 = amax * v_8
+        v_10 = triton_helpers.maximum(v_1_copy_0, v_9)
+        v_11 = 0.18033688
+        v_12 = qk * v_11
+        subscript = v_10[:, None]
+        v_13 = v_12 - subscript
+        v_14 = libdevice.exp2(v_13)
+        v_15 = v_1_copy_0 - v_10
+        v_16 = libdevice.exp2(v_15)
+        l_ij = tl.cast(tl.sum(v_14, 1), tl.float32)
+        subscript_1 = v_16[:, None]
+        v_17 = acc_copy_0 * subscript_1
+        v_18 = tl.cast(v_14, tl.bfloat16)
+        acc = tl.dot(tl.cast(v_18, tl.bfloat16), tl.cast(v_j, tl.bfloat16), acc=v_17, input_precision='tf32', out_dtype=tl.float32)
+        v_19 = v_3_copy_0 * v_16
+        v_3 = v_19 + l_ij
+        v_1 = v_10
+    v_21 = libdevice.log2(v_3)
+    v_22 = v_1 + v_21
+    subscript_2 = v_3[:, None]
+    v_23 = acc / subscript_2
+    tl.store(lse + indices_0 * 1, v_22, None)
+    v_24 = tl.cast(v_23, tl.bfloat16)
+    tl.store(o + (indices_0[:, None] * 64 + indices_3[None, :] * 1), v_24, None)
+
+def attention(q_in: torch.Tensor, k_in: torch.Tensor, v_in: torch.Tensor, *, _launcher=_default_launcher):
+    B, H, M, D = q_in.shape
+    Bk, Hk, N, Dk = k_in.shape
+    Bv, Hv, Nv, Dv = v_in.shape
+    D = 64
+    Dv = 64
+    q = q_in.reshape(-1, D)
+    k = k_in.reshape(-1, D)
+    v = v_in.reshape(-1, Dv)
+    MM = q.shape[0]
+    o = q.new_empty(MM, Dv)
+    lse = q.new_empty(MM, dtype=torch.float32)
+    _BLOCK_SIZE_0 = 32
+    _RDIM_SIZE_2 = 64
+    _BLOCK_SIZE_1 = 32
+    _launcher(_helion_attention, (triton.cdiv(8192, _BLOCK_SIZE_0),), q, k, v, lse, o, _BLOCK_SIZE_0, _RDIM_SIZE_2, _BLOCK_SIZE_1, num_warps=4, num_stages=2)
+    return (o.reshape(B, H, M, Dv), lse.reshape(B, H, M))
+
 --- assertExpectedJournal(TestIndexing.test_tile_with_offset_pointer)
 from __future__ import annotations
 
test/test_indexing.py

Lines changed: 67 additions & 0 deletions

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import math
 import unittest
 
 import torch
@@ -1302,6 +1303,72 @@ def tile_offset_2d_kernel(x: torch.Tensor) -> torch.Tensor:
         torch.testing.assert_close(result, x[10:, :])
         self.assertExpectedJournal(code)
 
+    @skipIfRefEager(
+        "Test is block size dependent which is not supported in ref eager mode"
+    )
+    def test_tile_with_offset_from_expr(self):
+        @helion.kernel(
+            autotune_effort="none",
+            static_shapes=True,
+        )
+        def attention(
+            q_in: torch.Tensor, k_in: torch.Tensor, v_in: torch.Tensor
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            B, H, M, D = q_in.shape
+            Bk, Hk, N, Dk = k_in.shape
+            Bv, Hv, Nv, Dv = v_in.shape
+            D = hl.specialize(D)
+            Dv = hl.specialize(Dv)
+            q = q_in.reshape(-1, D)
+            k = k_in.reshape(-1, D)
+            v = v_in.reshape(-1, Dv)
+            MM = q.shape[0]
+            o = q.new_empty(MM, Dv)
+            lse = q.new_empty(MM, dtype=torch.float32)
+            block_m = hl.register_block_size(M)
+            block_n = hl.register_block_size(N)
+            sm_scale = 1.0 / math.sqrt(D)
+            qk_scale = sm_scale * 1.44269504  # 1/log(2)
+            for tile_m in hl.tile(MM, block_size=block_m):
+                m_i = hl.zeros([tile_m]) - float("inf")
+                l_i = hl.zeros([tile_m]) + 1.0
+                acc = hl.zeros([tile_m, Dv])
+                q_i = q[tile_m, :]
+
+                start_N = tile_m.begin // M * N
+                for tile_n in hl.tile(0, N, block_size=block_n):
+                    k_j = k[tile_n + start_N, :]
+                    v_j = v[tile_n + start_N, :]
+                    qk = hl.dot(q_i, k_j.T, out_dtype=torch.float32)
+                    m_ij = torch.maximum(m_i, torch.amax(qk, -1) * qk_scale)
+                    qk = qk * qk_scale - m_ij[:, None]
+                    p = torch.exp2(qk)
+                    alpha = torch.exp2(m_i - m_ij)
+                    l_ij = torch.sum(p, -1)
+                    acc = acc * alpha[:, None]
+                    p = p.to(v.dtype)
+                    acc = hl.dot(p, v_j, acc=acc)
+                    l_i = l_i * alpha + l_ij
+                    m_i = m_ij
+
+                m_i += torch.log2(l_i)
+                acc = acc / l_i[:, None]
+                lse[tile_m] = m_i
+                o[tile_m, :] = acc
+
+            return o.reshape(B, H, M, Dv), lse.reshape(B, H, M)
+
+        z, h, n_ctx, head_dim = 4, 32, 64, 64
+        dtype = torch.bfloat16
+        q, k, v = [
+            torch.randn((z, h, n_ctx, head_dim), dtype=dtype, device=DEVICE)
+            for _ in range(3)
+        ]
+        code, (o, lse) = code_and_output(attention, (q, k, v))
+        torch_out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
+        torch.testing.assert_close(o, torch_out, atol=1e-2, rtol=1e-2)
+        self.assertExpectedJournal(code)
+
 
 if __name__ == "__main__":
     unittest.main()
