Commit 9449ed8

Merge branch 'main' into autotuner_cudagraph
2 parents 5001a53 + 5fb337c

8 files changed (+626, −112 lines)


helion/_compiler/device_ir.py

Lines changed: 303 additions & 109 deletions
Large diffs are not rendered by default.
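
This file carries the bulk of the commit: device-IR lowering for while loops. For context, the sketch below is generic torch.fx, not Helion's device_ir.py code; it illustrates the constraint the design works around: FX cannot trace data-dependent control flow, so a loop's condition and body are outlined into separately stored sub-graphs, and the loop itself is recorded as one opaque call node carrying their ids, which is exactly the shape of the _while_loop op added in _tracing_ops.py below.

import torch
import torch.fx

def opaque_while(cond_graph_id, body_graph_id, args):
    # tracing placeholder only; never executed (cf. _while_loop below)
    raise AssertionError("should not be called")

# wrap() makes symbolic_trace emit a single call_function node for this call
# instead of tracing into (and executing) the body
torch.fx.wrap("opaque_while")

def f(x):
    return opaque_while(0, 1, [x])  # the ids reference outlined sub-graphs

print(torch.fx.symbolic_trace(f).graph)
# the printed graph contains one opaque node, roughly:
#   call_function[target=opaque_while](args = (0, 1, [%x]), kwargs = {})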

helion/_compiler/type_propagation.py

Lines changed: 4 additions & 1 deletion
@@ -2126,7 +2126,10 @@ def visit_Assert(self, node: ast.Assert) -> TypeInfo:
 
     visit_Raise: _VisitMethod = generic_statement  # pyright: ignore[reportAssignmentType,reportIncompatibleMethodOverride]
     visit_Delete: _VisitMethod = generic_statement  # pyright: ignore[reportAssignmentType,reportIncompatibleMethodOverride]
-    visit_Pass: _VisitMethod = generic_statement  # pyright: ignore[reportAssignmentType,reportIncompatibleMethodOverride]
+
+    def visit_Pass(self, node: ast.Pass) -> TypeInfo:
+        return NoType(origin=self.origin())
+
     visit_TypeAlias: _VisitMethod = generic_statement  # pyright: ignore[reportAssignmentType, reportIncompatibleMethodOverride]
     visit_Import: _VisitMethod = generic_statement  # pyright: ignore[reportAssignmentType,reportIncompatibleMethodOverride]
     visit_ImportFrom: _VisitMethod = generic_statement  # pyright: ignore[reportAssignmentType,reportIncompatibleMethodOverride]
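
With this change, pass no longer routes through generic_statement and instead propagates NoType, i.e. a statement that produces no value. The new device-side while loops rely on this for empty spin-wait bodies. A sketch of such a kernel, reconstructed from the src comments in the test_loops.expected journal at the bottom of this commit (the bare @helion.kernel decorator is an assumption):

import torch
import helion
import helion.language as hl

@helion.kernel  # assumption: decorator options are not shown in the journal
def kernel(grad_x_lock: torch.Tensor):
    for idx in hl.tile(grad_x_lock.size(0)):
        # spin until the lock is acquired; the body is just `pass`, which
        # type propagation must now accept inside device code
        while hl.atomic_cas(grad_x_lock, [idx], 0, 1) == 1:
            pass
        hl.atomic_cas(grad_x_lock, [idx], 1, 0)  # release the lock
    return grad_x_lock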

helion/autotuner/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -25,3 +25,8 @@
     "PatternSearch": PatternSearch,
     "RandomSearch": RandomSearch,
 }
+
+
+cache_classes = {
+    "LocalAutotuneCache": LocalAutotuneCache,
+    "StrictLocalAutotuneCache": StrictLocalAutotuneCache,
+}
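
A minimal sketch of how this registry is consumed (mirroring the default_autotuner_fn change in settings.py below):

from helion.autotuner import cache_classes

cache_cls = cache_classes.get("StrictLocalAutotuneCache")
if cache_cls is None:  # unknown names fail fast instead of silently defaulting
    raise ValueError("unknown cache name")
# the chosen class wraps a search object, e.g.:
#   cache_cls(PatternSearch(bound_kernel, args))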

helion/language/_tracing_ops.py

Lines changed: 17 additions & 0 deletions
@@ -88,6 +88,23 @@ def _(state: CodegenState) -> None:
     return HostFunction.current().device_ir.graphs[state.proxy_arg(0)].codegen(state)  # pyright: ignore[reportArgumentType,reportCallIssue]
 
 
+@has_side_effect
+@_decorators.api()
+def _while_loop(
+    cond_graph_id: int,
+    body_graph_id: int,
+    args: list[object],
+    orelse_graph_id: int | None = None,
+) -> list[object]:
+    """Represent a while loop in FX since FX lacks native control flow."""
+    raise AssertionError("this should never be called")
+
+
+@_decorators.codegen(_while_loop)
+def _(state: CodegenState) -> None:
+    return HostFunction.current().device_ir.graphs[state.proxy_arg(1)].codegen(state)  # pyright: ignore[reportArgumentType,reportCallIssue]
+
+
 @has_side_effect
 @_decorators.api()
 def _if(test: object, graph_id: int, args: list[object]) -> list[object]:
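
The op is a tracing placeholder: calling it directly is an error, and the codegen hook resolves the body graph by id (proxy_arg(1) is body_graph_id). The kind of Helion source it stands for, reconstructed from the src comments in the TestLoops.test_while_accumulates_tensor journal below (the bare decorator is an assumption):

import torch
import helion
import helion.language as hl

@helion.kernel  # assumption: decorator options are not shown in the journal
def kernel(x: torch.Tensor):
    out = torch.empty_like(x)
    for tile in hl.tile(x.shape):
        acc = torch.zeros_like(x[tile])
        steps = torch.zeros([], device=x.device, dtype=torch.int32)
        # traced as a single _while_loop node whose condition and body live
        # in separate sub-graphs referenced by id
        while steps < 4:
            acc = acc + 1
            steps = steps + 1
        out[tile] = acc
    return out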

helion/runtime/settings.py

Lines changed: 28 additions & 2 deletions
@@ -131,6 +131,13 @@ def _env_get_literal(
     )
 
 
+def _env_get_str(var_name: str, default: str) -> str:
+    value = os.environ.get(var_name)
+    if value is None or (value := value.strip()) == "":
+        return default
+    return value
+
+
 def _get_index_dtype() -> torch.dtype:
     value = os.environ.get("HELION_INDEX_DTYPE")
     if value is None or (token := value.strip()) == "":
@@ -184,7 +191,7 @@ def _get_autotune_config_overrides() -> dict[str, object]:
 def default_autotuner_fn(
     bound_kernel: BoundKernel, args: Sequence[object], **kwargs: object
 ) -> BaseAutotuner:
-    from ..autotuner import LocalAutotuneCache
+    from ..autotuner import cache_classes
     from ..autotuner import search_algorithms
 
     autotuner_name = os.environ.get("HELION_AUTOTUNER", "PatternSearch")
@@ -223,7 +230,16 @@
         assert profile.random_search is not None
         kwargs.setdefault("count", profile.random_search.count)
 
-    return LocalAutotuneCache(autotuner_cls(bound_kernel, args, **kwargs))  # pyright: ignore[reportArgumentType]
+    settings = bound_kernel.settings
+    cache_name = settings.autotune_cache
+    cache_cls = cache_classes.get(cache_name)
+    if cache_cls is None:
+        raise ValueError(
+            f"Unknown HELION_AUTOTUNE_CACHE value: {cache_name}, valid options are: "
+            f"{', '.join(cache_classes.keys())}"
+        )
+
+    return cache_cls(autotuner_cls(bound_kernel, args, **kwargs))  # pyright: ignore[reportArgumentType]
 
 
 def _get_autotune_random_seed() -> int:
@@ -348,6 +364,11 @@ class _Settings:
         )
     )
     ref_mode: RefMode = dataclasses.field(default_factory=_get_ref_mode)
+    autotune_cache: str = dataclasses.field(
+        default_factory=functools.partial(
+            _env_get_str, "HELION_AUTOTUNE_CACHE", "LocalAutotuneCache"
+        )
+    )
     autotuner_fn: AutotunerFunction = default_autotuner_fn
     autotune_baseline_fn: Callable[..., object] | None = None
 
@@ -413,6 +434,11 @@ class Settings(_Settings):
             "Should have the same signature as the kernel function. "
            "Pass as @helion.kernel(..., autotune_baseline_fn=my_baseline_fn)."
        ),
+        "autotune_cache": (
+            "The name of the autotuner cache class to use. "
+            "Set HELION_AUTOTUNE_CACHE=StrictLocalAutotuneCache to enable strict caching. "
+            "Defaults to 'LocalAutotuneCache'."
+        ),
     }
 
     def __init__(self, **settings: object) -> None:
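
End to end, cache selection happens per bound kernel at autotune time. A usage sketch (the env-var route documented above; valid values are the class names registered in cache_classes, and because the field is populated by an env-reading default_factory, the variable must be set before the kernel's Settings are constructed):

import os

# pick the strict cache for every kernel bound in this process
os.environ["HELION_AUTOTUNE_CACHE"] = "StrictLocalAutotuneCache"

# an unrecognized name fails fast inside default_autotuner_fn:
#   ValueError: Unknown HELION_AUTOTUNE_CACHE value: InvalidCacheName,
#   valid options are: LocalAutotuneCache, StrictLocalAutotuneCache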

test/test_autotuner.py

Lines changed: 57 additions & 0 deletions
@@ -7,6 +7,7 @@
 import logging
 import math
 import multiprocessing as mp
+import operator
 import os
 from pathlib import Path
 import pickle
@@ -41,6 +42,8 @@
 from helion.autotuner.config_generation import ConfigGeneration
 from helion.autotuner.effort_profile import get_effort_profile
 from helion.autotuner.finite_search import FiniteSearch
+from helion.autotuner.local_cache import LocalAutotuneCache
+from helion.autotuner.local_cache import StrictLocalAutotuneCache
 from helion.autotuner.logger import LambdaLogger
 from helion.autotuner.random_search import RandomSearch
 import helion.language as hl
@@ -955,5 +958,59 @@ def test_autotune_random_seed_from_settings(self) -> None:
         self.assertNotEqual(first, second)
 
 
+class TestAutotuneCacheSelection(TestCase):
+    """Selection of the autotune cache via HELION_AUTOTUNE_CACHE."""
+
+    def _make_bound(self):
+        @helion.kernel(autotune_baseline_fn=operator.add, autotune_log_level=0)
+        def add(a: torch.Tensor, b: torch.Tensor):
+            out = torch.empty_like(a)
+            for tile in hl.tile(out.size()):
+                out[tile] = a[tile] + b[tile]
+            return out
+
+        args = (
+            torch.randn([8], device=DEVICE),
+            torch.randn([8], device=DEVICE),
+        )
+        return add.bind(args), args
+
+    def test_autotune_cache_default_is_local(self):
+        """Default (no env var set) -> LocalAutotuneCache."""
+        with without_env_var("HELION_AUTOTUNE_CACHE"):
+            bound, args = self._make_bound()
+            with patch("torch.accelerator.synchronize", autospec=True) as sync:
+                sync.return_value = None
+                autotuner = bound.settings.autotuner_fn(bound, args)
+        self.assertIsInstance(autotuner, LocalAutotuneCache)
+        self.assertNotIsInstance(autotuner, StrictLocalAutotuneCache)
+
+    def test_autotune_cache_strict_selected_by_env(self):
+        """HELION_AUTOTUNE_CACHE=StrictLocalAutotuneCache -> StrictLocalAutotuneCache."""
+        with patch.dict(
+            os.environ,
+            {"HELION_AUTOTUNE_CACHE": "StrictLocalAutotuneCache"},
+            clear=False,
+        ):
+            bound, args = self._make_bound()
+            with patch("torch.accelerator.synchronize", autospec=True) as sync:
+                sync.return_value = None
+                autotuner = bound.settings.autotuner_fn(bound, args)
+        self.assertIsInstance(autotuner, StrictLocalAutotuneCache)
+
+    def test_autotune_cache_invalid_raises(self):
+        """Invalid HELION_AUTOTUNE_CACHE value should raise a ValueError."""
+        with patch.dict(
+            os.environ, {"HELION_AUTOTUNE_CACHE": "InvalidCacheName"}, clear=False
+        ):
+            bound, args = self._make_bound()
+            with patch("torch.accelerator.synchronize", autospec=True) as sync:
+                sync.return_value = None
+                with self.assertRaisesRegex(
+                    ValueError, "Unknown HELION_AUTOTUNE_CACHE"
+                ):
+                    bound.settings.autotuner_fn(bound, args)
+
+
 if __name__ == "__main__":
     unittest.main()

test/test_loops.expected

Lines changed: 155 additions & 0 deletions
@@ -1730,3 +1730,158 @@ def matmul(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
     _launcher(_helion_matmul, (_NUM_SM,), x, y, out, _NUM_SM, _BLOCK_SIZE_1, _BLOCK_SIZE_0, _BLOCK_SIZE_2, num_warps=4, num_stages=1)
     # src[test_loops.py:N]: return out
     return out
+
+--- assertExpectedJournal(TestLoops.test_while_accumulates_tensor)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_kernel(out, _BLOCK_SIZE_0: tl.constexpr):
+    # src[test_loops.py:N]: for tile in hl.tile(x.shape):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    # src[test_loops.py:N]: acc = torch.zeros_like(x[tile])
+    acc = tl.full([_BLOCK_SIZE_0], 0, tl.float32)
+    # src[test_loops.py:N]: steps = torch.zeros([], device=x.device, dtype=torch.int32)
+    steps = tl.full([], 0, tl.int32)
+    # src[test_loops.py:N]: while steps < 4:
+    # src[test_loops.py:N]: acc = acc + 1
+    # src[test_loops.py:N]: steps = steps + 1
+    steps_copy = steps
+    steps_copy_0 = steps_copy
+    # src[test_loops.py:N]: while steps < 4:
+    v_0 = tl.full([], 4, tl.int32)
+    v_1 = steps_copy_0 < v_0
+    # src[test_loops.py:N]: while steps < 4:
+    # src[test_loops.py:N]: acc = acc + 1
+    # src[test_loops.py:N]: steps = steps + 1
+    while_cond = v_1
+    while while_cond:
+        steps_copy_1 = steps
+        acc_copy = acc
+        steps_copy_1_0 = steps_copy_1
+        acc_copy_0 = acc_copy
+        # src[test_loops.py:N]: acc = acc + 1
+        v_2 = 1.0
+        acc = acc_copy_0 + v_2
+        # src[test_loops.py:N]: steps = steps + 1
+        v_4 = tl.full([], 1, tl.int32)
+        steps = steps_copy_1_0 + v_4
+        # src[test_loops.py:N]: while steps < 4:
+        # src[test_loops.py:N]: acc = acc + 1
+        # src[test_loops.py:N]: steps = steps + 1
+        steps_copy_2 = steps
+        steps_copy_2_0 = steps_copy_2
+        # src[test_loops.py:N]: while steps < 4:
+        v_6 = tl.full([], 4, tl.int32)
+        v_7 = steps_copy_2_0 < v_6
+        # src[test_loops.py:N]: while steps < 4:
+        # src[test_loops.py:N]: acc = acc + 1
+        # src[test_loops.py:N]: steps = steps + 1
+        while_cond = v_7
+    # src[test_loops.py:N]: out[tile] = acc
+    tl.store(out + indices_0 * 1, acc, None)
+
+def kernel(x: torch.Tensor, *, _launcher=_default_launcher):
+    # src[test_loops.py:N]: out = torch.empty_like(x)
+    out = torch.empty_like(x)
+    # src[test_loops.py:N]: for tile in hl.tile(x.shape):
+    _BLOCK_SIZE_0 = 16
+    # src[test_loops.py:N]: for tile in hl.tile(x.shape):
+    # src[test_loops.py:N]: acc = torch.zeros_like(x[tile])
+    # src[test_loops.py:N]: steps = torch.zeros([], device=x.device, dtype=torch.int32)
+    # src[test_loops.py:N-N]: ...
+    _launcher(_helion_kernel, (triton.cdiv(16, _BLOCK_SIZE_0),), out, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
+    # src[test_loops.py:N]: return out
+    return out
+
+--- assertExpectedJournal(TestLoops.test_while_atomic_add_accumulates)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_kernel(counters, values, totals):
+    # src[test_loops.py:N]: for idx in hl.tile(values.size(0)):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0
+    indices_0 = offset_0 + tl.zeros([1], tl.int32)
+    # src[test_loops.py:N]: while hl.atomic_add(counters, [idx], 1).sum() < 1:
+    atomic_add = tl.atomic_add(counters + indices_0 * 1, 1, mask=None, sem='relaxed')
+    sum_1 = tl.cast(tl.sum(atomic_add, 0), tl.float32)
+    v_0 = 1.0
+    v_1 = sum_1 < v_0
+    # src[test_loops.py:N]: while hl.atomic_add(counters, [idx], 1).sum() < 1:
+    # src[test_loops.py:N]: hl.atomic_add(totals, [idx], values[idx])
+    while_cond = v_1
+    while while_cond:
+        # src[test_loops.py:N]: hl.atomic_add(totals, [idx], values[idx])
+        load = tl.load(values + indices_0 * 1, None)
+        tl.atomic_add(totals + indices_0 * 1, load, mask=None, sem='relaxed')
+        # src[test_loops.py:N]: while hl.atomic_add(counters, [idx], 1).sum() < 1:
+        atomic_add_1 = tl.atomic_add(counters + indices_0 * 1, 1, mask=None, sem='relaxed')
+        sum_2 = tl.cast(tl.sum(atomic_add_1, 0), tl.float32)
+        v_2 = 1.0
+        v_3 = sum_2 < v_2
+        # src[test_loops.py:N]: while hl.atomic_add(counters, [idx], 1).sum() < 1:
+        # src[test_loops.py:N]: hl.atomic_add(totals, [idx], values[idx])
+        while_cond = v_3
+
+def kernel(values: torch.Tensor, totals: torch.Tensor, counters: torch.Tensor, *, _launcher=_default_launcher):
+    # src[test_loops.py:N]: for idx in hl.tile(values.size(0)):
+    # src[test_loops.py:N]: while hl.atomic_add(counters, [idx], 1).sum() < 1:
+    # src[test_loops.py:N]: hl.atomic_add(totals, [idx], values[idx])
+    _launcher(_helion_kernel, (8,), counters, values, totals, num_warps=4, num_stages=1)
+    # src[test_loops.py:N]: return totals
+    return totals
+
+--- assertExpectedJournal(TestLoops.test_while_atomic_cas_pass)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_kernel(grad_x_lock, _BLOCK_SIZE_0: tl.constexpr):
+    # src[test_loops.py:N]: for idx in hl.tile(grad_x_lock.size(0)):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    # src[test_loops.py:N]: while hl.atomic_cas(grad_x_lock, [idx], 0, 1) == 1:
+    atomic_cas = tl.atomic_cas(grad_x_lock + indices_0 * 1, 0, 1, sem='relaxed')
+    v_0 = tl.full([], 1, tl.int32)
+    v_1 = atomic_cas == v_0
+    # src[test_loops.py:N]: while hl.atomic_cas(grad_x_lock, [idx], 0, 1) == 1:
+    # src[test_loops.py:N]: pass
+    while_cond = v_1
+    while while_cond:
+        # src[test_loops.py:N]: while hl.atomic_cas(grad_x_lock, [idx], 0, 1) == 1:
+        atomic_cas_1 = tl.atomic_cas(grad_x_lock + indices_0 * 1, 0, 1, sem='relaxed')
+        v_2 = tl.full([], 1, tl.int32)
+        v_3 = atomic_cas_1 == v_2
+        # src[test_loops.py:N]: while hl.atomic_cas(grad_x_lock, [idx], 0, 1) == 1:
+        # src[test_loops.py:N]: pass
+        while_cond = v_3
+    # src[test_loops.py:N]: hl.atomic_cas(grad_x_lock, [idx], 1, 0)
+    tl.atomic_cas(grad_x_lock + indices_0 * 1, 1, 0, sem='relaxed')
+
+def kernel(grad_x_lock: torch.Tensor, *, _launcher=_default_launcher):
+    # src[test_loops.py:N]: for idx in hl.tile(grad_x_lock.size(0)):
+    _BLOCK_SIZE_0 = 16
+    # src[test_loops.py:N]: for idx in hl.tile(grad_x_lock.size(0)):
+    # src[test_loops.py:N]: while hl.atomic_cas(grad_x_lock, [idx], 0, 1) == 1:
+    # src[test_loops.py:N]: pass
+    # src[test_loops.py:N-N]: ...
+    _launcher(_helion_kernel, (triton.cdiv(16, _BLOCK_SIZE_0),), grad_x_lock, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
+    # src[test_loops.py:N]: return grad_x_lock
+    return grad_x_lock
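
For orientation, the Helion source behind the second journal, reconstructed from its src comments (the argument order follows the generated kernel(values, totals, counters) wrapper; the bare decorator is an assumption):

import torch
import helion
import helion.language as hl

@helion.kernel  # assumption: decorator options are not shown in the journal
def kernel(values: torch.Tensor, totals: torch.Tensor, counters: torch.Tensor):
    for idx in hl.tile(values.size(0)):
        # hl.atomic_add returns the pre-increment value, so the body keeps
        # adding values[idx] into totals[idx] until the counter was already >= 1
        while hl.atomic_add(counters, [idx], 1).sum() < 1:
            hl.atomic_add(totals, [idx], values[idx])
    return totals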
