From c8ef8233c81c58956578b239c30e28f140b7ee57 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devgpu053.atn3.facebook.com>
Date: Tue, 10 Mar 2026 10:00:45 -0700
Subject: [PATCH] [ET-VK] Generalize constant_pad_nd to support any storage
 type and packed dimension

Replace the old pad_channel and pad_height_width shaders with generalized pad_buffer and pad_texture shaders that work with any storage type and packed dimension using BufferMetadata/TextureMetadata and indexing.glslh utilities.

Differential Revision: [D95970168](https://our.internmc.facebook.com/intern/diff/D95970168/)

[ghstack-poisoned]
---
 backends/vulkan/op_registry.py                |  3 +-
 .../runtime/graph/ops/glsl/pad_buffer.glsl    | 54 +++++++++++++
 .../{pad_channel.yaml => pad_buffer.yaml}     |  7 +-
 .../runtime/graph/ops/glsl/pad_channel.glsl   | 80 ------------------
 .../graph/ops/glsl/pad_height_width.glsl      | 50 ------------
 .../runtime/graph/ops/glsl/pad_texture.glsl   | 81 +++++++++++++++++++
 ...pad_height_width.yaml => pad_texture.yaml} |  7 +-
 .../vulkan/runtime/graph/ops/impl/Pad.cpp     | 43 +++++-----
 backends/vulkan/test/op_tests/cases.py        |  8 ++
 9 files changed, 174 insertions(+), 159 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/pad_buffer.glsl
 rename backends/vulkan/runtime/graph/ops/glsl/{pad_channel.yaml => pad_buffer.yaml} (67%)
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl
 rename backends/vulkan/runtime/graph/ops/glsl/{pad_height_width.yaml => pad_texture.yaml} (65%)

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index bb7c0562bad..d68f62fa0e7 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -1263,8 +1263,9 @@ def register_arange():
 @update_features(exir_ops.edge.aten.constant_pad_nd.default)
 def register_constant_pad_nd():
     return OpFeatures(
-        inputs_storage=utils.CHANNELS_PACKED_TEXTURE,
+        inputs_storage=utils.ANY_STORAGE,
         inputs_dtypes=utils.FP_INT_BOOL_T,
+        supports_resize=True,
     )
 
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.glsl
new file mode 100644
index 00000000000..7c8f661cb8a
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.glsl
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("buffer", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+
+// Needed for the [[unroll]] loop attribute used in main().
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
+
+${layout_declare_ubo(B, "BufferMetadata", "out_meta")}
+${layout_declare_ubo(B, "BufferMetadata", "in_meta")}
+${layout_declare_ubo(B, "ivec4", "pad_per_dim")}
+${layout_declare_ubo(B, "float", "fill_value")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Constant padding for buffer-backed tensors. Each invocation produces one
+// output element: it converts its linear output index to a tensor index,
+// subtracts pad_per_dim from the first four dims to get the input index, and
+// then either copies the matching input element or, when that index lies
+// outside the input, writes fill_value.
+void main() {
+  // The x invocation id is the linear index into the output buffer.
+  const uint out_bufi = gl_GlobalInvocationID.x;
+  if (out_bufi >= numel(out_meta)) {
+    return;
+  }
+
+  TensorIndex out_tidx = linear_idx_to_tensor_idx(out_meta, out_bufi);
+
+  // Subtract pad offsets per dimension to get input tensor index.
+  // Unsigned underflow (when output index < pad offset) wraps to a large
+  // value that fails the out_of_bounds check below.
+  TensorIndex in_tidx = out_tidx;
+  [[unroll]] for (int d = 0; d < 4; d++) {
+    in_tidx.data[0][d] -= uint(pad_per_dim[d]);
+  }
+
+  // Shifted index outside the input: this output element is padding.
+  if (out_of_bounds(in_tidx, in_meta)) {
+    t_out[out_bufi] = T(fill_value);
+    return;
+  }
+
+  // In range: copy the corresponding input element.
+  const uint in_bufi = tensor_idx_to_linear_idx(in_meta, in_tidx);
+  t_out[out_bufi] = t_in[in_bufi];
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.yaml
similarity index 67%
rename from backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/pad_buffer.yaml
index 91306bd4cbf..8271ab2e64c 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.yaml
@@ -1,9 +1,6 @@
-pad_channel:
+pad_buffer:
   parameter_names_with_default_values:
-    NDIM: 3
     DTYPE: float
-    PACKING: C_packed
-    STORAGE: texture3d
   generate_variant_forall:
     DTYPE:
       - VALUE: float
@@ -11,4 +8,4 @@ pad_channel:
       - VALUE: int32
       - VALUE: uint8
   shader_variants:
-    - NAME: pad_channel
+    - NAME: pad_buffer
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl
deleted file mode 100644
index 8c01ebef897..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl
+++ /dev/null
@@ -1,80 +0,0 @@
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
-${layout_declare_ubo(2, "ivec4", "out_sizes")}
-${layout_declare_ubo(3, "ivec4", "in_sizes")}
-${layout_declare_ubo(4, "int", "pad_left", "int", "pad_top", "int", "pad_front")}
-${layout_declare_ubo(5, "float", "fill_value")}
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-  const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim);
-
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
-    return;
-  }
-
-  VEC4_T outtex = VEC4_T(fill_value);
-  // mask_z/y/x is used to determine whether need to fecth data from input tensor
-  bool mask_z = (idx.z + 3) < pad_front || idx.z > (pad_front + in_sizes.z - 1);
-  bool mask_y = idx.y >= pad_top && idx.y <= pad_top + in_sizes.y - 1;
-  bool mask_x = idx.x >= pad_left && idx.x <= pad_left + in_sizes.x - 1;
-
-  if (!mask_z && mask_y && mask_x) {
-    // channel_mask is to determine the situation that when padding channel dimension,
-    // in one texel, some elements are filled vaule and some value are from input tensor
-    ivec4 c_ind = ivec4(idx.z) + ivec4(0, 1, 2, 3);
-    ivec4 channel_mask = ivec4(lessThan(c_ind, ivec4(pad_front))) + ivec4(greaterThan(c_ind, ivec4(pad_front + in_sizes.z - 1)));
-
-    ivec4 in_idx = idx;
-    in_idx.x -= pad_left;
-    in_idx.y -= pad_top;
-    in_idx.z -= divup4(pad_front) * 4;
-    const int shift = pad_front % 4;
-    VEC4_T cur_in_texel = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0);
-    VEC4_T next_in_texel;
-    // When shift is not 0, we need to read 2 texels from input tensor to write into output
-    // for example:
-    // input texel is [[1 2 3 4], [5 6 x x]] and front_pad = 2
-    // output texel is [[p p 1 2], [3 4 5 6]], where p is the filled value then need to fetch 2 texels to fill [3 4 5 6].
-    if (shift != 0) {
-      in_idx.z += 4;
-      next_in_texel = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0);
-    } else {
-      next_in_texel = cur_in_texel;
-    }
-
-    VEC4_T inter_texel;
-    for (int i = 0; i < 4; i++) {
-      if (i < shift) {
-        inter_texel[i] = cur_in_texel[4-shift+i];
-      } else {
-        inter_texel[i] = next_in_texel[i-shift];
-      }
-    }
-    outtex = inter_texel * (VEC4_T(1) - channel_mask) + outtex * channel_mask;
-  }
-
-  int packed_idx = idx[packed_dim];
-  const int packed_dim_size = out_sizes[packed_dim];
-  if (packed_idx + 3 >= packed_dim_size) {
-    ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3);
-    VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size)));
-    outtex = outtex * valid_idx;
-  }
-
-  imageStore(t_out, pos, outtex);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl
deleted file mode 100644
index c5b2c692bdc..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl
+++ /dev/null
@@ -1,50 +0,0 @@
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
-${layout_declare_ubo(2, "ivec4", "out_sizes")}
-${layout_declare_ubo(3, "ivec4", "in_sizes")}
-${layout_declare_ubo(4, "int", "pad_left", "int", "pad_top", "int", "pad_front")}
-${layout_declare_ubo(5, "float", "fill_value")}
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-  const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim);
-
-  if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
-    return;
-  }
-
-  bool mask_height = idx.y >= pad_top && idx.y <= pad_top + in_sizes.y - 1;
-  bool mask_width = idx.x >= pad_left && idx.x <= pad_left + in_sizes.x - 1;
-
-  VEC4_T outtex = VEC4_T(fill_value);
-  if (mask_height && mask_width) {
-    ivec4 in_idx = idx;
-    in_idx.x -= pad_left;
-    in_idx.y -= pad_top;
-    outtex = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0);
-  }
-
-  int packed_idx = idx[packed_dim];
-  const int packed_dim_size = out_sizes[packed_dim];
-  if (packed_idx + 3 >= packed_dim_size) {
-    ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3);
-    VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size)));
-    outtex = outtex * valid_idx;
-  }
-
-  imageStore(t_out, pos, outtex);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl
new file mode 100644
index 00000000000..75a3ba7e87d
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+${define_required_extensions("texture3d", DTYPE)}
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
+#define T ${texel_load_component_type(DTYPE, "texture3d")}
+
+${define_active_storage_type("texture3d")}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+#include "common.glslh"
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}
+
+${layout_declare_ubo(B, "TextureMetadata", "outp")}
+${layout_declare_ubo(B, "TextureMetadata", "inp")}
+${layout_declare_ubo(B, "int", "pad_left", "int", "pad_top", "int", "pad_front")}
+${layout_declare_ubo(B, "float", "fill_value")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Constant padding for texture-backed tensors. Each invocation fills one
+// output texel, resolving each of its (up to four) components separately:
+// a component whose source index lies inside the input is copied from the
+// input texture, otherwise it receives fill_value.
+void main() {
+  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
+  if (out_of_bounds(out_pos, outp)) {
+    return;
+  }
+
+  // Element-space index of component 0 of this texel. Along the packed dim it
+  // is the index of the texel's first element; the other dims are plain
+  // element indices.
+  TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos);
+
+  // Source index for component 0: shift back by the leading pad amounts. A
+  // negative result means the element sits in the leading padding.
+  TensorIndex4D base_in_tidx = out_tidx;
+  base_in_tidx.data[0] -= pad_left;
+  base_in_tidx.data[1] -= pad_top;
+  base_in_tidx.data[2] -= pad_front;
+
+  // The last texel along the packed dim may hold fewer than 4 real elements;
+  // the unused components keep their initial value of 0.
+  const int num_valid =
+      min(4, outp.sizes[outp.packed_dim] - out_tidx.data[outp.packed_dim]);
+
+  VEC4_T out_texel = VEC4_T(0);
+
+  [[unroll]] for (int comp = 0; comp < num_valid; comp++) {
+    // Step along the packed dim to reach this component's source element.
+    TensorIndex4D in_tidx = base_in_tidx;
+    in_tidx.data[outp.packed_dim] += comp;
+
+    const bool in_padding =
+        in_tidx.data[0] < 0 || in_tidx.data[0] >= inp.sizes[0] ||
+        in_tidx.data[1] < 0 || in_tidx.data[1] >= inp.sizes[1] ||
+        in_tidx.data[2] < 0 || in_tidx.data[2] >= inp.sizes[2];
+    if (in_padding) {
+      out_texel[comp] = T(fill_value);
+      continue;
+    }
+
+    // Fetch the input texel holding the source element and pick its lane.
+    TextureElementIndex elem =
+        tensor4d_idx_to_texture_element_idx_simple(inp, in_tidx);
+    VEC4_T in_texel = texelFetch(t_in, elem.pos, 0);
+    out_texel[comp] = T(in_texel[elem.comp]);
+  }
+
+  imageStore(t_out, out_pos, out_texel);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.yaml
similarity index 65%
rename from backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/pad_texture.yaml
index 2eb57291bb2..f2a40d289bf 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.yaml
@@ -1,9 +1,6 @@
-pad_height_width:
+pad_texture:
   parameter_names_with_default_values:
-    NDIM: 3
     DTYPE: float
-    PACKING: C_packed
-    STORAGE: texture3d
   generate_variant_forall:
     DTYPE:
       - VALUE: float
@@ -11,4 +8,4 @@ pad_height_width:
       - VALUE: int32
       - VALUE: uint8
   shader_variants:
-    - NAME: pad_height_width
+    - NAME: pad_texture3d
diff --git a/backends/vulkan/runtime/graph/ops/impl/Pad.cpp b/backends/vulkan/runtime/graph/ops/impl/Pad.cpp
index d225af05633..2b0ebbb98db 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Pad.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Pad.cpp
@@ -59,22 +59,33 @@ void add_constant_pad_nd_node(
     ComputeGraph& graph,
     const ValueRef& in,
     const ValueRef& pad,
-    const ValueRef& fill_value,
+    const ValueRef& fill_value_ref,
     const ValueRef& out) {
-  const float fill_value_val = graph.extract_scalar<float>(fill_value);
+  const float fill_value_val = graph.extract_scalar<float>(fill_value_ref);
   const IntListPtr pad_vec = graph.get_int_list(pad);
-
-  std::string kernel_name = "";
   const PadParam pad_param = creat_pad_param(*pad_vec);
 
-  if (pad_vec->size() <= 4) {
-    kernel_name = "pad_height_width";
-    kernel_name.reserve(kShaderNameReserve);
-    add_dtype_suffix(kernel_name, graph.dtype_of(out));
+  std::string kernel_name = "pad";
+  kernel_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  vkapi::ParamsBindList param_ubos;
+  if (graph.is_buffer_storage(out)) {
+    // BufferMetadata stores sizes/strides in WHCN order (flip_and_unsqueeze
+    // reverses from NCHW). Map pad offsets to match: W=0, H=1, C=2.
+    utils::ivec4 pad_per_dim{pad_param.left, pad_param.top, pad_param.front, 0};
+    param_ubos = {
+        graph.buffer_meta_ubo(out),
+        graph.buffer_meta_ubo(in),
+        graph.create_params_buffer(pad_per_dim),
+        graph.create_params_buffer(fill_value_val)};
   } else {
-    kernel_name = "pad_channel";
-    kernel_name.reserve(kShaderNameReserve);
-    add_dtype_suffix(kernel_name, graph.dtype_of(out));
+    param_ubos = {
+        graph.meta_ubo(out),
+        graph.meta_ubo(in),
+        graph.create_params_buffer(pad_param),
+        graph.create_params_buffer(fill_value_val)};
   }
 
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
@@ -82,13 +93,9 @@ void add_constant_pad_nd_node(
       VK_KERNEL_FROM_STR(kernel_name),
       default_pick_global_wg_size,
       default_pick_local_wg_size,
-      // Inputs and Outputs
       {{out, vkapi::kWrite}, {in, vkapi::kRead}},
-      // Shader params buffers
-      {graph.sizes_ubo(out),
-       graph.sizes_ubo(in),
-       graph.create_params_buffer(pad_param),
-       graph.create_params_buffer(fill_value_val)},
+      // Parameter buffers
+      param_ubos,
       // Push Constants
       {},
       // Specialization Constants
@@ -100,7 +107,7 @@ void add_constant_pad_nd_node(
 }
 
 void constant_pad_nd(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  return add_constant_pad_nd_node(graph, args[0], args[1], args[2], args[3]);
+  add_constant_pad_nd_node(graph, args[0], args[1], args[2], args[3]);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 6a9db70adaa..3b2d6b8c48e 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -1868,6 +1868,14 @@ def get_constant_pad_nd_inputs():
             ([L, M, M1, M2], [3, 3, 3, 3, 3, 3], 12.2),
         ]
     )
+    test_suite.layouts = [
+        "utils::kWidthPacked",
+        "utils::kChannelsPacked",
+    ]
+    test_suite.storage_types = [
+        "utils::kTexture3D",
+        "utils::kBuffer",
+    ]
     return test_suite