Fix CUDA IMA from combination of unrolling + pipelining (#920)

PaulZhang12 · web-flow · commit 276344a48144 · 2025-10-14T17:01:30.000-04:00
diff --git a/helion/_compiler/tile_strategy.py b/helion/_compiler/tile_strategy.py
@@ -5,6 +5,7 @@
 import dataclasses
 import functools
 import itertools
+import math
 import operator
 from typing import TYPE_CHECKING
 from typing import NamedTuple
@@ -147,10 +148,28 @@ def get_tl_range_kwargs(config: Config, block_idx: int) -> list[str]:
         range_num_stages = env.config_spec.range_num_stages.config_get(
             config.range_num_stages, block_idx, 0
         )
+
         if config.indexing == "tensor_descriptor" and range_num_stages > 0:
             # Tensor descriptor + multi-stage tl.range pipelines tend to cause
             # CUDA "misaligned address" or "unspecified launch failure" errors.
             range_num_stages = 0
+        elif (
+            range_num_stages > 1
+            and range_unroll_factor > 1
+            and env.block_sizes[block_idx].size
+            and env.block_sizes[block_idx].numel.is_number
+        ):
+            # Unrolling + pipelining can trigger CUDA illegal memory accesses (IMA).
+            # Clamp num_stages so the unrolled step plus pipelining stays in bounds.
+            loop_numel = int(env.block_sizes[block_idx].numel)
+            block_size = int(env.block_sizes[block_idx].from_config_assert(config))
+            step = range_unroll_factor * block_size
+            last_offset = ((loop_numel - 1) // block_size) * block_size
+            remainder = loop_numel - last_offset
+            range_num_stages = min(
+                max(1, int(math.ceil(remainder / step))), range_num_stages
+            )
+
         if range_num_stages > 0:
             kwargs.append(f"num_stages={range_num_stages}")
 
@@ -194,6 +213,7 @@ def get_range_call_str(
 
         if use_static_range:
             return f"tl.static_range({', '.join(range_args)})"
+
         range_kwargs = TileStrategy.get_tl_range_kwargs(config, block_ids[0])
         return f"tl.range({', '.join(range_args + range_kwargs)})"
 
diff --git a/test/test_loops.expected b/test/test_loops.expected
@@ -1145,3 +1145,46 @@ def three_pass_kernel(x: torch.Tensor, *, _launcher=_default_launcher):
     _BLOCK_SIZE_3 = 8
     _launcher(_helion_three_pass_kernel, (triton.cdiv(B, _BLOCK_SIZE_0),), x, out, out.stride(0), out.stride(1), x.stride(0), x.stride(1), B, M, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, _BLOCK_SIZE_3, num_warps=4, num_stages=2)
     return out
+
+--- assertExpectedJournal(TestLoops.test_unroll_with_pipelining)
+from __future__ import annotations
+
+import torch
+import helion
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_matmul(x, y, out, _NUM_SM: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
+    total_pids = tl.cdiv(256, _BLOCK_SIZE_1) * tl.cdiv(256, _BLOCK_SIZE_0)
+    block_size = tl.cdiv(total_pids, _NUM_SM)
+    start_pid = tl.program_id(0) * block_size
+    end_pid = tl.minimum(start_pid + block_size, total_pids)
+    for virtual_pid in tl.range(start_pid, end_pid, loop_unroll_factor=4, num_stages=1):
+        num_blocks_0 = tl.cdiv(256, _BLOCK_SIZE_1)
+        pid_0 = virtual_pid % num_blocks_0
+        pid_1 = virtual_pid // num_blocks_0
+        offset_1 = pid_0 * _BLOCK_SIZE_1
+        offset_0 = pid_1 * _BLOCK_SIZE_0
+        acc = tl.full([_BLOCK_SIZE_0, _BLOCK_SIZE_1], 0.0, tl.float32)
+        for offset_2 in tl.range(0, 256, _BLOCK_SIZE_2, loop_unroll_factor=4, num_stages=1):
+            acc_copy = acc
+            acc_copy_0 = acc_copy
+            load = tl.load(tl.make_block_ptr(x, [256, 256], [256, 1], [offset_0, offset_2], [_BLOCK_SIZE_0, _BLOCK_SIZE_2], [1, 0]), boundary_check=[0, 1], padding_option='zero')
+            load_1 = tl.load(tl.make_block_ptr(y, [256, 256], [256, 1], [offset_2, offset_1], [_BLOCK_SIZE_2, _BLOCK_SIZE_1], [1, 0]), boundary_check=[0, 1], padding_option='zero')
+            acc = tl.dot(tl.cast(load, tl.bfloat16), tl.cast(load_1, tl.bfloat16), acc=acc_copy_0, input_precision='tf32', out_dtype=tl.float32)
+        v_0 = tl.cast(acc, tl.bfloat16)
+        tl.store(tl.make_block_ptr(out, [256, 256], [256, 1], [offset_0, offset_1], [_BLOCK_SIZE_0, _BLOCK_SIZE_1], [1, 0]), v_0, boundary_check=[0, 1])
+
+def matmul(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+    m, k = x.size()
+    k2, n = y.size()
+    assert k == k2, f'size mismatch {k} != {k2}'
+    out = torch.empty([m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)
+    _NUM_SM = helion.runtime.get_num_sm(x.device)
+    _BLOCK_SIZE_1 = 16
+    _BLOCK_SIZE_0 = 64
+    _BLOCK_SIZE_2 = 16
+    _launcher(_helion_matmul, (_NUM_SM,), x, y, out, _NUM_SM, _BLOCK_SIZE_1, _BLOCK_SIZE_0, _BLOCK_SIZE_2, num_warps=4, num_stages=2)
+    return out
diff --git a/test/test_loops.py b/test/test_loops.py
@@ -1213,6 +1213,47 @@ def three_pass_kernel(x: torch.Tensor) -> torch.Tensor:
         torch.testing.assert_close(result, expected, atol=1e-5, rtol=1e-5)
         self.assertExpectedJournal(code)
 
+    def test_unroll_with_pipelining(self):
+        @helion.kernel(static_shapes=True)
+        def matmul(
+            x: torch.Tensor,
+            y: torch.Tensor,
+        ) -> torch.Tensor:
+            m, k = x.size()
+            k2, n = y.size()
+            assert k == k2, f"size mismatch {k} != {k2}"
+            out = torch.empty(
+                [m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device
+            )
+            for tile_m, tile_n in hl.tile([m, n]):
+                acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
+                for tile_k in hl.tile(k):
+                    acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+                out[tile_m, tile_n] = acc
+            return out
+
+        a = torch.randn(256, 256, device=DEVICE, dtype=torch.bfloat16)
+        b = torch.randn(256, 256, device=DEVICE, dtype=torch.bfloat16)
+
+        code, result = code_and_output(
+            matmul,
+            (a, b),
+            block_sizes=[64, 16, 16],
+            indexing="block_ptr",
+            loop_orders=[[1, 0]],
+            pid_type="persistent_blocked",
+            range_num_stages=[4, 2],
+            range_unroll_factors=[4, 4],
+        )
+
+        expected = torch.matmul(a, b)
+        torch.testing.assert_close(result, expected, atol=1e-2, rtol=1e-2)
+        self.assertExpectedJournal(code)
+
+        # The unroll + pipelining safety logic should clamp the requested
+        # range_num_stages ([4, 2]) so the generated loops use num_stages=1
+        self.assertIn("num_stages=1", code)
+
 
 if __name__ == "__main__":
     unittest.main()