Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backends/vulkan/op_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -1343,7 +1343,7 @@ def register_grid_priors():
@update_features(exir_ops.edge.aten.repeat.default)
def register_repeat():
return OpFeatures(
inputs_storage=utils.ANY_TEXTURE,
inputs_storage=utils.ANY_STORAGE,
inputs_dtypes=utils.FP_INT_BOOL_T,
)

Expand Down
51 changes: 51 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/repeat_buffer.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

${define_required_extensions("buffer", DTYPE)}

#define PRECISION ${PRECISION}

#define T ${buffer_scalar_type(DTYPE)}

${define_active_storage_type("buffer")}

layout(std430) buffer;

#include "indexing.glslh"

${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}

${layout_declare_ubo(B, "BufferMetadata", "out_meta")}
${layout_declare_ubo(B, "BufferMetadata", "in_meta")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

void main() {
  // One invocation per output element: the x dimension of the dispatch is
  // the linear (flat) index into the output buffer.
  const uint out_bufi = gl_GlobalInvocationID.x;
  if (out_of_bounds(out_bufi, out_meta)) {
    return;
  }

  // Convert the flat output index into a per-dimension tensor index.
  TensorIndex out_tidx = linear_idx_to_tensor_idx(out_meta, out_bufi);

  TensorIndex in_tidx;
  initialize(in_tidx);

  // repeat semantics: along each dimension the input tile is repeated, so
  // each output coordinate maps back to the input coordinate modulo the
  // input size in that dimension.
  const int n = int_ndim(out_meta);
  for (int d = 0; d < n; d++) {
    // TensorIndex appears to pack 4 index components per vector slot;
    // div_4/mod_4 select the slot and lane for dimension d — see
    // indexing.glslh for the exact layout.
    in_tidx.data[div_4(d)][mod_4(d)] =
        idx_at(out_tidx, d) % size_at(in_meta, d);
  }

  // Flatten the input tensor index back to a buffer offset and copy the
  // single element.
  const uint in_bufi = tensor_idx_to_linear_idx(in_meta, in_tidx);

  t_out[out_bufi] = t_in[in_bufi];
}
13 changes: 13 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/repeat_buffer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Shader codegen config for the buffer-storage variant of the repeat op.
repeat_buffer:
  parameter_names_with_default_values:
    DTYPE: float
    STORAGE: buffer
  # Emit one compiled variant per supported element dtype.
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
      - VALUE: int32
      - VALUE: int8
      - VALUE: uint8
  shader_variants:
    - NAME: repeat_buffer
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ layout(push_constant) uniform restrict Block {
ivec4 range;
// source tensor sizes in WHCB dims respectively
ivec4 src_dims;
// destination tensor repeats in WHCB dims respectively
ivec4 dst_repeats;
// output tensor sizes in WHCB dims respectively
ivec4 out_dims;
};

#include "indexing_utils.h"
Expand Down Expand Up @@ -58,7 +58,7 @@ void main() {
// if tensors are channel packed
if (packed_dim == C_DIM) {
// the output channels in a batch will be channel size * channel repetitions aligned by 4
const int out_channel_size = alignup4(src_dims.z * dst_repeats.z);
const int out_channel_size = alignup4(out_dims.z);

// batch index in the output
const int out_pos_batch_index = pos.z / out_channel_size;
Expand All @@ -76,7 +76,7 @@ void main() {
channel_index = (pos.z - (batch_index + batch_repetition_index * src_dims.w) * out_channel_size) % src_dims.z;
} else {
// the output channels in a batch will be channel size * channel repetitions
const int out_channel_size = src_dims.z * dst_repeats.z;
const int out_channel_size = out_dims.z;

// source batch index for based on current output pos
batch_index = (pos.z / out_channel_size) % src_dims.w;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
repeat:
repeat_texture:
parameter_names_with_default_values:
DTYPE: float
NDIM: 3
Expand All @@ -11,4 +11,4 @@ repeat:
- VALUE: int8
- VALUE: uint8
shader_variants:
- NAME: repeat
- NAME: repeat_texture3d
141 changes: 52 additions & 89 deletions backends/vulkan/runtime/graph/ops/impl/Repeat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,107 +16,70 @@

namespace vkcompute {

namespace {

void check_args(
ComputeGraph& graph,
const ValueRef in,
const std::vector<int64_t>& repeats,
const ValueRef out) {
VK_CHECK_COND(graph.packed_dim_of(in) == graph.packed_dim_of(out));

VK_CHECK_COND(graph.storage_type_of(in) == graph.storage_type_of(out));
if (graph.storage_type_of(in) == utils::kTexture2D) {
VK_CHECK_COND(graph.dim_of(in) <= 2);
void resize_repeat_node(
ComputeGraph* graph,
const std::vector<ArgGroup>& args,
const std::vector<ValueRef>& extra_args) {
const ValueRef in = args.at(1).refs.at(0);
const ValueRef out = args.at(0).refs.at(0);
const ValueRef repeats_ref = extra_args.at(0);

const std::vector<int64_t> in_sizes = graph->sizes_of(in);
const std::vector<int64_t> repeats =
graph->extract_int_or_symint_list(repeats_ref);

const size_t out_ndim = std::max(in_sizes.size(), repeats.size());
std::vector<int64_t> out_sizes(out_ndim);
for (size_t i = 0; i < out_ndim; i++) {
const size_t in_offset = i + in_sizes.size() - out_ndim;
const size_t rep_offset = i + repeats.size() - out_ndim;
// Prepend 1s to in_sizes if repeats is longer, and vice versa
const int64_t in_size =
(i >= out_ndim - in_sizes.size()) ? in_sizes[in_offset] : 1;
const int64_t r =
(i >= out_ndim - repeats.size()) ? repeats[rep_offset] : 1;
out_sizes[i] = in_size * r;
}

const int64_t in_dim = graph.dim_of(in);
VK_CHECK_COND(
in_dim <= repeats.size(),
"Input tensor dim size must be not greater than the repeat argument's size");

const std::vector<int64_t> in_sizes = graph.sizes_of(in);
const std::vector<int64_t> out_sizes = graph.sizes_of(out);

VK_CHECK_COND(
dim_at<kWidth4D>(in_sizes) * dim_at<kWidth4D>(repeats) ==
dim_at<kWidth4D>(out_sizes),
"Output's width doesn't match input's width * repeat count");

VK_CHECK_COND(
dim_at<kHeight4D>(in_sizes) * dim_at<kHeight4D>(repeats) ==
dim_at<kHeight4D>(out_sizes),
"Output's height doesn't match input's height * repeat count");

VK_CHECK_COND(
dim_at<kChannel4D>(in_sizes) * dim_at<kChannel4D>(repeats) ==
dim_at<kChannel4D>(out_sizes),
"Output's channel doesn't match input's channel * repeat count");

VK_CHECK_COND(
dim_at<kBatch4D>(in_sizes) * dim_at<kBatch4D>(repeats) ==
dim_at<kBatch4D>(out_sizes),
"Output's batch doesn't match input's batch * repeat count");
graph->virtual_resize(out, out_sizes);
}

} // namespace

void add_repeat_node(
ComputeGraph& graph,
ValueRef in,
ValueRef repeats_ref,
ValueRef out) {
const std::vector<int64_t> repeats = *(graph.get_int_list(repeats_ref));

check_args(graph, in, repeats, out);

const std::vector<int64_t> in_sizes = graph.sizes_of(in);
const utils::ivec4 src_dims{
dim_at<kWidth4D>(in_sizes),
dim_at<kHeight4D>(in_sizes),
dim_at<kChannel4D>(in_sizes),
dim_at<kBatch4D>(in_sizes)};
const utils::ivec4 dst_repeats{
dim_at<kWidth4D>(repeats),
dim_at<kHeight4D>(repeats),
dim_at<kChannel4D>(repeats),
dim_at<kBatch4D>(repeats)};

std::string kernel_name = "repeat";
kernel_name.reserve(kShaderNameReserve);
add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
add_dtype_suffix(kernel_name, graph.dtype_of(out));

// A copy of range with the last element set to batch size of the input tensor
const utils::ivec3 wg_size = graph.logical_limits_of(out);

const auto shader = VK_KERNEL_FROM_STR(kernel_name);

graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
VK_KERNEL_FROM_STR(kernel_name),
default_pick_global_wg_size,
default_pick_local_wg_size,
// Inputs and Outputs
{
{out, vkapi::kWrite},
{in, vkapi::kRead},
},
// Parameter buffers
{},
// Push Constants
{
PushConstantDataInfo(&wg_size, sizeof(wg_size), sizeof(utils::ivec4)),
PushConstantDataInfo(
&src_dims, sizeof(src_dims), sizeof(utils::ivec4)),
PushConstantDataInfo(
&dst_repeats, sizeof(dst_repeats), sizeof(utils::ivec4)),
},
// Specialization Constants
{graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
// Resize Args
{},
// Resizing Logic
nullptr));
if (graph.is_buffer_storage(out)) {
graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
VK_KERNEL_FROM_STR(kernel_name),
default_pick_global_wg_size,
default_pick_local_wg_size,
{{out, vkapi::kWrite}, {in, vkapi::kRead}},
{graph.meta_ubo(out), graph.meta_ubo(in)},
{},
{},
{repeats_ref},
resize_repeat_node));
} else {
graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
VK_KERNEL_FROM_STR(kernel_name),
default_pick_global_wg_size,
default_pick_local_wg_size,
{{out, vkapi::kWrite}, {in, vkapi::kRead}},
{},
{graph.logical_limits_pc_of(out),
graph.sizes_pc_of(in),
graph.sizes_pc_of(out)},
{graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
{repeats_ref},
resize_repeat_node));
}
}

void repeat(ComputeGraph& graph, const std::vector<ValueRef>& args) {
Expand Down
4 changes: 2 additions & 2 deletions backends/vulkan/test/op_tests/cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -1308,7 +1308,7 @@ def get_repeat_inputs():
"utils::kHeightPacked",
"utils::kChannelsPacked",
]
test_suite_2d.storage_types = ["utils::kTexture3D"]
test_suite_2d.storage_types = ["utils::kTexture3D", "utils::kBuffer"]
test_suite_2d.data_gen = "make_seq_tensor"
test_suite_2d.dtypes = ["at::kFloat"]
test_suite_2d.test_name_suffix = "2d"
Expand Down Expand Up @@ -1353,7 +1353,7 @@ def get_repeat_inputs():
"utils::kHeightPacked",
"utils::kChannelsPacked",
]
test_suite_3d.storage_types = ["utils::kTexture3D"]
test_suite_3d.storage_types = ["utils::kTexture3D", "utils::kBuffer"]
test_suite_3d.data_gen = "make_seq_tensor"
test_suite_3d.dtypes = ["at::kFloat"]
test_suite_3d.test_name_suffix = "3d"
Expand Down
Loading