
Commit 51f09a7

Set static_shapes=True (#937)
1 parent e317eb0 commit 51f09a7


41 files changed: +3043 / −3495 lines

README.md

Lines changed: 12 additions & 0 deletions
@@ -189,6 +189,18 @@ and configurations directly from your code.

 **For production deployment**, we recommend using ahead-of-time tuned configurations rather than relying on runtime autotuning. The autotuning process can be time-consuming and resource-intensive, making it unsuitable for production environments where predictable performance and startup times are critical.

+### Static shapes and autotuning keys
+
+By default Helion uses static shapes (`static_shapes=True`). This means each unique input shape/stride signature is treated as its own specialization and will be autotuned separately. This typically yields the best performance, but may increase autotuning time when many shapes are encountered.
+
+If you want to reduce autotuning time by sharing configurations between different shapes, set `static_shapes=False`. In this mode, the autotuning key ignores exact sizes, allowing a single tuned config to be reused across multiple shapes. This can come with a performance penalty compared to fully specialized static shapes.
+
+```python
+@helion.kernel(static_shapes=False)
+def my_kernel(x: torch.Tensor) -> torch.Tensor:
+    ...
+```
+
 ## Configurations

 Helion configurations include the following options:
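
To make the trade-off described in the new README section concrete, here is a minimal usage sketch. The `add_one` kernel, tensor shapes, and device are illustrative assumptions, not part of this commit; the tile/indexing pattern follows the Helion kernels shown elsewhere in this diff.

```python
import torch
import helion
import helion.language as hl

# Illustrative kernel (not part of this commit); the tile pattern mirrors
# the Helion kernels shown in the diffs below.
@helion.kernel()  # default: static_shapes=True
def add_one(x: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    for tile in hl.tile(x.size()):
        out[tile] = x[tile] + 1
    return out

a = torch.randn([512, 512], device="cuda")   # assumes a CUDA device
b = torch.randn([1024, 256], device="cuda")

# With static_shapes=True (the new default), each distinct shape/stride
# signature is its own specialization, so these two calls are autotuned
# separately. Decorating with static_shapes=False instead would let one
# tuned config be shared across both shapes.
add_one(a)
add_one(b)
```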

examples/low_mem_dropout.py

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@


 # %%
-@helion.kernel()
+@helion.kernel(static_shapes=False)
 def low_mem_dropout(p: float, x: torch.Tensor, seed: int) -> torch.Tensor:
     """
     Applies dropout on x using p
@@ -57,7 +57,7 @@ def low_mem_dropout(p: float, x: torch.Tensor, seed: int) -> torch.Tensor:


 # %%
-@helion.kernel()
+@helion.kernel(static_shapes=False)
 def low_mem_dropout_bwd(p: float, grad_y: torch.Tensor, seed: int) -> torch.Tensor:
     """
     For low mem dropout we are applying randomness inside both fwd and bwd
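
The dropout example keeps `static_shapes=False`, which, per the README section above, lets a single tuned config be reused across different input sizes. A hedged usage sketch follows; the import path, sizes, seed, and device are assumptions.

```python
import torch

# Assumed import path for the example kernel shown in the diff above.
from examples.low_mem_dropout import low_mem_dropout

x_small = torch.randn(2048, device="cuda")       # assumes a CUDA device
x_large = torch.randn(1_000_000, device="cuda")

# With static_shapes=False the autotuning key ignores exact sizes, so the
# config tuned for the first call can be reused for the second instead of
# triggering a separate autotune per input size.
y_small = low_mem_dropout(0.25, x_small, 42)
y_large = low_mem_dropout(0.25, x_large, 42)
```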

helion/runtime/settings.py

Lines changed: 1 addition & 1 deletion
@@ -166,7 +166,7 @@ class _Settings:
         "Literal['tf32', 'tf32x3', 'ieee']",
         os.environ.get("TRITON_F32_DEFAULT", "tf32"),
     )
-    static_shapes: bool = False
+    static_shapes: bool = True
     autotune_log_level: int = logging.INFO
     autotune_compile_timeout: int = int(
         os.environ.get("HELION_AUTOTUNE_COMPILE_TIMEOUT", "60")

test/test_associative_scan.expected

Lines changed: 223 additions & 281 deletions
(Large diff not shown.)

test/test_atomic_ops.expected

Lines changed: 59 additions & 68 deletions
(Large diff not shown.)

test/test_broadcasting.expected

Lines changed: 67 additions & 77 deletions
(Large diff not shown.)

test/test_closures.expected

Lines changed: 10 additions & 13 deletions
@@ -12,28 +12,26 @@ from helion.runtime import default_launcher as _default_launcher
 import helion._testing.basic_kernels as _source_module

 @triton.jit
-def _helion_use_globals(a, _source_module_attr_global_tensor, out, a_size_0, a_size_1, _source_module_attr_global_tensor_stride_0, a_stride_0, a_stride_1, out_stride_0, out_stride_1, _source_module_attr_global_float, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
-    num_blocks_0 = tl.cdiv(a_size_0, _BLOCK_SIZE_0)
+def _helion_use_globals(a, _source_module_attr_global_tensor, out, _source_module_attr_global_float, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
+    num_blocks_0 = tl.cdiv(512, _BLOCK_SIZE_0)
     pid_0 = tl.program_id(0) % num_blocks_0
     pid_1 = tl.program_id(0) // num_blocks_0
     offset_0 = pid_0 * _BLOCK_SIZE_0
     indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
-    mask_0 = indices_0 < a_size_0
     offset_1 = pid_1 * _BLOCK_SIZE_1
     indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
-    mask_1 = indices_1 < a_size_1
-    load = tl.load(a + (indices_0[:, None] * a_stride_0 + indices_1[None, :] * a_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
-    load_1 = tl.load(_source_module_attr_global_tensor + indices_1[None, :] * _source_module_attr_global_tensor_stride_0, mask_1[None, :], other=0)
+    load = tl.load(a + (indices_0[:, None] * 512 + indices_1[None, :] * 1), None)
+    load_1 = tl.load(_source_module_attr_global_tensor + indices_1[None, :] * 1, None)
     v_0 = load + load_1
     v_1 = tl_math.sin(v_0)
     v_2 = v_1 + _source_module_attr_global_float
-    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_2, mask_0[:, None] & mask_1[None, :])
+    tl.store(out + (indices_0[:, None] * 512 + indices_1[None, :] * 1), v_2, None)

 def use_globals(a, *, _launcher=_default_launcher):
     out = _source_module.empty_like(a)
     _BLOCK_SIZE_0 = 32
     _BLOCK_SIZE_1 = 32
-    _launcher(_helion_use_globals, (triton.cdiv(a.size(0), _BLOCK_SIZE_0) * triton.cdiv(a.size(1), _BLOCK_SIZE_1),), a, _source_module.global_tensor, out, a.size(0), a.size(1), _source_module.global_tensor.stride(0), a.stride(0), a.stride(1), out.stride(0), out.stride(1), _source_module.global_float, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=2)
+    _launcher(_helion_use_globals, (triton.cdiv(512, _BLOCK_SIZE_0) * triton.cdiv(512, _BLOCK_SIZE_1),), a, _source_module.global_tensor, out, _source_module.global_float, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=2)
     return out

 --- assertExpectedJournal(TestClosures.test_fn_arg_with_closure)
@@ -160,17 +158,16 @@ from helion.runtime import default_launcher as _default_launcher
 import test.test_closures as _source_module

 @triton.jit
-def _helion_call_func_arg_on_host(a, out, a_size_0, a_stride_0, out_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+def _helion_call_func_arg_on_host(a, out, _BLOCK_SIZE_0: tl.constexpr):
     pid_0 = tl.program_id(0)
     offset_0 = pid_0 * _BLOCK_SIZE_0
     indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
-    mask_0 = indices_0 < a_size_0
-    load = tl.load(a + indices_0 * a_stride_0, mask_0, other=0)
+    load = tl.load(a + indices_0 * 1, None)
     v_0 = tl_math.sin(load)
-    tl.store(out + indices_0 * out_stride_0, v_0, mask_0)
+    tl.store(out + indices_0 * 1, v_0, None)

 def call_func_arg_on_host(a, alloc, *, _launcher=_default_launcher):
     out = alloc(a)
     _BLOCK_SIZE_0 = 512
-    _launcher(_helion_call_func_arg_on_host, (triton.cdiv(a.size(0), _BLOCK_SIZE_0),), a, out, a.size(0), a.stride(0), out.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=2)
+    _launcher(_helion_call_func_arg_on_host, (triton.cdiv(512, _BLOCK_SIZE_0),), a, out, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
     return out

test/test_closures.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@
 global_tensor = torch.randn([512], device=DEVICE)


-@helion.kernel
+@helion.kernel(static_shapes=False)
 def sin_func_arg(a, fn) -> torch.Tensor:
     out = torch.empty_like(a)
     for tile in hl.tile(a.size()):
