@@ -60,6 +60,22 @@ def _get_padded_iota_original_length(
     return None
 
 
+def _has_padded_iota_index(state: CodegenState | None, num_indices: int) -> bool:
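+    # Padded hl.arange indices need an explicit bounds mask, so fast paths check for them and bail out.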
+    if state is None:
+        return False
+    for idx in range(num_indices):
+        if _get_padded_iota_original_length(state, idx) is not None:
+            return True
+    return False
+
+
+def _has_multidim_tensor_index(index: list[object]) -> bool:
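+    # True when any index tensor has ndim > 1, i.e. NumPy-style advanced indexing with broadcasting.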
+    for k in index:
+        if isinstance(k, torch.Tensor) and k.ndim > 1:
+            return True
+    return False
+
+
 def _get_tile_with_offset_info(
     k: object, state: CodegenState, k_index: int
 ) -> tuple[int, int | torch.SymInt] | None:
@@ -102,6 +118,7 @@ def _get_tile_with_offset_info(
             return (meta["block_id"], meta["offset"])
 
         return None
+    return None
 
 
 class IndexingStrategy:
@@ -554,6 +571,17 @@ def codegen_store(
     )
 
 
+def _try_python_index_shape(
+    tensor: torch.Tensor, index: list[object]
+) -> list[int | torch.SymInt] | None:
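+    # Index the fake tensor with PyTorch's own advanced-indexing semantics; on failure the caller falls back to manual shape computation.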
+    try:
+        tuple_index = tuple(index)
+        result = tensor[tuple_index]  # pyright: ignore[reportGeneralTypeIssues]
+    except Exception:
+        return None
+    return list(result.size())
+
+
 class SubscriptIndexing(NamedTuple):
     index_expr: ast.AST
     mask_expr: ast.AST
@@ -567,6 +595,17 @@ def has_mask(self) -> bool:
     def compute_shape(
         tensor: torch.Tensor, index: list[object], state: CodegenState | None = None
     ) -> list[int | torch.SymInt]:
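+        # NumPy-style advanced indexing: every dim is indexed by a tensor, at least one index is multi-dim, and no padded iota is involved.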
+        advanced_mode = (
+            isinstance(tensor, torch.Tensor)
+            and len(index) == tensor.ndim
+            and index
+            and all(isinstance(k, torch.Tensor) for k in index)
+            and _has_multidim_tensor_index(index)
+            and not _has_padded_iota_index(state, len(index))
+        )
+        if advanced_mode:
+            if (shape := _try_python_index_shape(tensor, index)) is not None:
+                return shape
         assert isinstance(tensor, torch.Tensor)
         assert isinstance(index, (list, tuple)), index
         input_size = collections.deque(tensor.size())
@@ -605,18 +644,28 @@ def compute_shape(
                 k_index += 1
             elif isinstance(k, slice):
                 size = input_size.popleft()
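+                # A bare ":" keeps the existing (possibly symbolic) dim instead of allocating a new reduction dim.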
-                # Handle slices with steps
-                slice_size = compute_slice_size(k, size)
-
-                if slice_size != 1:
-                    rdim = env.allocate_reduction_dimension(slice_size)
-                    output_size.append(rdim.var)
+                is_full_slice = (
+                    (k.start is None or k.start == 0)
+                    and k.stop is None
+                    and (k.step is None or k.step == 1)
+                )
+
+                if is_full_slice:
+                    if env.known_equal(size, 1):
+                        output_size.append(1)
+                    else:
+                        output_size.append(size)
                 else:
-                    output_size.append(1)
+                    # Handle slices with steps or bounded ranges
+                    slice_size = compute_slice_size(k, size)
+
+                    if slice_size != 1:
+                        rdim = env.allocate_reduction_dimension(slice_size)
+                        output_size.append(rdim.var)
+                    else:
+                        output_size.append(1)
                 k_index += 1
-            elif isinstance(k, torch.Tensor) and (
-                k.ndim == 1 or (len(index) == 1 and tensor.ndim == 1)
-            ):
+            elif isinstance(k, torch.Tensor):
                 input_size.popleft()
                 output_size.extend(k.size())
                 k_index += 1
@@ -664,6 +713,14 @@ def create(
         output_size = SubscriptIndexing.compute_shape(fake_value, index, state)
         env = CompileEnvironment.current()
         dtype = env.triton_index_type()
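+        # Keep in sync with SubscriptIndexing.compute_shape so index emission matches the computed output shape.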
+        advanced_mode = (
+            isinstance(fake_value, torch.Tensor)
+            and len(index) == fake_value.ndim
+            and bool(index)
+            and all(isinstance(k, torch.Tensor) for k in index)
+            and _has_multidim_tensor_index(index)
+            and not _has_padded_iota_index(state, len(index))
+        )
         if dtype == "tl.int32" and SubscriptIndexing._needs_int64(fake_value):
             raise exc.IndexOffsetOutOfRangeForInt32(env.settings.index_dtype)
 
@@ -737,8 +794,26 @@ def _is_size_one(size: int | torch.SymInt) -> bool:
                 else:
                     index_values.append(f"{start}{expand}")
             else:
-                # Full slice or slice without step
-                if not _is_size_one(size):
+                is_full_slice = (
+                    (k.start is None or k.start == 0)
+                    and k.stop is None
+                    and (k.step is None or k.step == 1)
+                )
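+                # For a full slice over a known block, reuse that block's index/mask vars instead of allocating a new reduction dim.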
+                if is_full_slice and not _is_size_one(size):
+                    block_idx = env.get_block_id(size)
+                    if block_idx is not None:
+                        index_var = state.codegen.index_var(block_idx)
+                        index_values.append(f"({index_var}){expand}")
+                        if mask := state.codegen.mask_var(block_idx):
+                            mask_values.setdefault(f"({mask}){expand}")
+                    else:
+                        rdim = env.allocate_reduction_dimension(size)
+                        block_idx = rdim.block_id
+                        index_var = state.codegen.index_var(block_idx)
+                        index_values.append(f"({index_var}){expand}")
+                        if mask := state.codegen.mask_var(block_idx):
+                            mask_values.setdefault(f"({mask}){expand}")
+                elif not _is_size_one(size):
                     rdim = env.allocate_reduction_dimension(size)
                     block_idx = rdim.block_id
                     index_var = state.codegen.index_var(block_idx)
@@ -749,22 +824,31 @@ def _is_size_one(size: int | torch.SymInt) -> bool:
                     index_values.append(f"tl.zeros([1], {dtype}){expand}")
                 output_idx += 1
                 k_index += 1
-            elif isinstance(k, torch.Tensor) and k.ndim == 1:
-                expand = tile_strategy.expand_str(output_size, output_idx)
+            elif isinstance(k, torch.Tensor) and not (
+                len(index) == 1 and fake_value.ndim == 1
+            ):
                 ast_index = state.ast_args[1]
                 assert isinstance(ast_index, (list, tuple))
                 assert len(ast_index) == len(index)
                 index_var = state.codegen.lift(ast_index[n], prefix="index").id
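+                # In advanced mode the index tensors already broadcast to the output shape, so no expand_str suffix is applied.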
-                index_values.append(f"({index_var}){expand}")
-                if (block_idx := env.get_block_id(output_size[output_idx])) is not None:
-                    if mask := state.codegen.mask_var(block_idx):
-                        mask_values.setdefault(f"({mask}){expand}")
+                if advanced_mode:
+                    index_values.append(index_var)
+                else:
+                    expand = tile_strategy.expand_str(output_size, output_idx)
+                    index_values.append(f"({index_var}){expand}")
+                    if (block_idx := env.get_block_id(output_size[output_idx])) is not None:
+                        if mask := state.codegen.mask_var(block_idx):
+                            mask_values.setdefault(f"({mask}){expand}")
                 # Check if this index comes from a padded hl.arange and generate mask
                 if (
                     original_length := _get_padded_iota_original_length(state, n)
                 ) is not None:
-                    mask_values.setdefault(f"({index_var} < {original_length}){expand}")
-                output_idx += 1
+                    if advanced_mode:
+                        mask_values.setdefault(f"({index_var} < {original_length})")
+                    else:
+                        mask_values.setdefault(f"({index_var} < {original_length}){expand}")
+                if not advanced_mode:
+                    output_idx += 1
                 k_index += 1
             elif (
                 isinstance(k, torch.Tensor) and len(index) == 1 and fake_value.ndim == 1
@@ -786,6 +870,8 @@ def _is_size_one(size: int | torch.SymInt) -> bool:
                 k_index += 1
             else:
                 raise exc.InvalidIndexingType(type(k))
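+        # advanced_mode skips the per-index output_idx increments above, so reconcile the count once after the loop.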
+        if advanced_mode:
+            output_idx = len(output_size)
         assert len(output_size) == output_idx
         assert len(index_values) == fake_value.ndim
         index_expr = []
@@ -885,7 +971,11 @@ def need_reshape(self, node: ast.AST) -> bool:
             return True
         env = CompileEnvironment.current()
         for a, b in zip(self.reshaped_size, self.block_shape, strict=True):
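+            # Sizes backed by the same block id are equal by construction, even when symbolic comparison cannot prove it.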
-            if not env.known_equal(a, b):
+            block_id_a = env.resolve_block_id(a)
+            block_id_b = env.resolve_block_id(b)
+            if block_id_a != block_id_b:
+                return True
+            if block_id_a is None and not env.known_equal(a, b):
                 return True
         return False
 
@@ -1035,7 +1125,13 @@ def create(
                 # Full slice or slice without step
                 if size != 1:
                     rdim = env.allocate_reduction_dimension(size)
-                    res.offsets.append(state.codegen.offset_var(rdim.block_id))
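+                    # With no live device loop for this block, there is no loop offset to apply; use 0.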
+                    active_loops = state.codegen.active_device_loops.get(
+                        rdim.block_id
+                    )
+                    if active_loops:
+                        res.offsets.append(state.codegen.offset_var(rdim.block_id))
+                    else:
+                        res.offsets.append("0")
                     res.block_shape.append(rdim.var)
                 else:
                     res.offsets.append("0")