From c8ef8233c81c58956578b239c30e28f140b7ee57 Mon Sep 17 00:00:00 2001 From: ssjia Date: Tue, 10 Mar 2026 10:00:45 -0700 Subject: [PATCH] [ET-VK] Generalize constant_pad_nd to support any storage type and packed dimension Replace the old pad_channel and pad_height_width shaders with generalized pad_buffer and pad_texture shaders that work with any storage type and packed dimension using BufferMetadata/TextureMetadata and indexing.glslh utilities. Differential Revision: [D95970168](https://our.internmc.facebook.com/intern/diff/D95970168/) [ghstack-poisoned] --- backends/vulkan/op_registry.py | 3 +- .../runtime/graph/ops/glsl/pad_buffer.glsl | 54 +++++++++++++ .../{pad_channel.yaml => pad_buffer.yaml} | 7 +- .../runtime/graph/ops/glsl/pad_channel.glsl | 80 ------------------ .../graph/ops/glsl/pad_height_width.glsl | 50 ------------ .../runtime/graph/ops/glsl/pad_texture.glsl | 81 +++++++++++++++++++ ...pad_height_width.yaml => pad_texture.yaml} | 7 +- .../vulkan/runtime/graph/ops/impl/Pad.cpp | 43 +++++----- backends/vulkan/test/op_tests/cases.py | 8 ++ 9 files changed, 174 insertions(+), 159 deletions(-) create mode 100644 backends/vulkan/runtime/graph/ops/glsl/pad_buffer.glsl rename backends/vulkan/runtime/graph/ops/glsl/{pad_channel.yaml => pad_buffer.yaml} (67%) delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl create mode 100644 backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl rename backends/vulkan/runtime/graph/ops/glsl/{pad_height_width.yaml => pad_texture.yaml} (65%) diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index bb7c0562bad..d68f62fa0e7 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -1263,8 +1263,9 @@ def register_arange(): @update_features(exir_ops.edge.aten.constant_pad_nd.default) def register_constant_pad_nd(): return OpFeatures( - 
inputs_storage=utils.CHANNELS_PACKED_TEXTURE, + inputs_storage=utils.ANY_STORAGE, inputs_dtypes=utils.FP_INT_BOOL_T, + supports_resize=True, ) diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.glsl new file mode 100644 index 00000000000..7c8f661cb8a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.glsl @@ -0,0 +1,54 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +${define_required_extensions("buffer", DTYPE)} + +#define PRECISION ${PRECISION} + +#define T ${buffer_scalar_type(DTYPE)} + +layout(std430) buffer; + +#include "indexing.glslh" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} +${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")} + +${layout_declare_ubo(B, "BufferMetadata", "out_meta")} +${layout_declare_ubo(B, "BufferMetadata", "in_meta")} +${layout_declare_ubo(B, "ivec4", "pad_per_dim")} +${layout_declare_ubo(B, "float", "fill_value")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const uint out_bufi = gl_GlobalInvocationID.x; + if (out_bufi >= numel(out_meta)) { + return; + } + + TensorIndex out_tidx = linear_idx_to_tensor_idx(out_meta, out_bufi); + + // Subtract pad offsets per dimension to get input tensor index. + // Unsigned underflow (when output index < pad offset) wraps to a large + // value that fails the out_of_bounds check below. 
+ TensorIndex in_tidx = out_tidx; + [[unroll]] for (int d = 0; d < 4; d++) { + in_tidx.data[0][d] -= uint(pad_per_dim[d]); + } + + if (out_of_bounds(in_tidx, in_meta)) { + t_out[out_bufi] = T(fill_value); + return; + } + + const uint in_bufi = tensor_idx_to_linear_idx(in_meta, in_tidx); + t_out[out_bufi] = t_in[in_bufi]; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.yaml similarity index 67% rename from backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml rename to backends/vulkan/runtime/graph/ops/glsl/pad_buffer.yaml index 91306bd4cbf..8271ab2e64c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/pad_buffer.yaml @@ -1,9 +1,6 @@ -pad_channel: +pad_buffer: parameter_names_with_default_values: - NDIM: 3 DTYPE: float - PACKING: C_packed - STORAGE: texture3d generate_variant_forall: DTYPE: - VALUE: float @@ -11,4 +8,4 @@ pad_channel: - VALUE: int32 - VALUE: uint8 shader_variants: - - NAME: pad_channel + - NAME: pad_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl deleted file mode 100644 index 8c01ebef897..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl +++ /dev/null @@ -1,80 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec4", "out_sizes")} -${layout_declare_ubo(3, "ivec4", "in_sizes")} -${layout_declare_ubo(4, "int", "pad_left", "int", "pad_top", "int", "pad_front")} -${layout_declare_ubo(5, "float", "fill_value")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { 
- const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); - - if (pos_out_of_bounds(pos, out_sizes, packed_dim)) { - return; - } - - VEC4_T outtex = VEC4_T(fill_value); - // mask_z/y/x is used to determine whether need to fecth data from input tensor - bool mask_z = (idx.z + 3) < pad_front || idx.z > (pad_front + in_sizes.z - 1); - bool mask_y = idx.y >= pad_top && idx.y <= pad_top + in_sizes.y - 1; - bool mask_x = idx.x >= pad_left && idx.x <= pad_left + in_sizes.x - 1; - - if (!mask_z && mask_y && mask_x) { - // channel_mask is to determine the situation that when padding channel dimension, - // in one texel, some elements are filled vaule and some value are from input tensor - ivec4 c_ind = ivec4(idx.z) + ivec4(0, 1, 2, 3); - ivec4 channel_mask = ivec4(lessThan(c_ind, ivec4(pad_front))) + ivec4(greaterThan(c_ind, ivec4(pad_front + in_sizes.z - 1))); - - ivec4 in_idx = idx; - in_idx.x -= pad_left; - in_idx.y -= pad_top; - in_idx.z -= divup4(pad_front) * 4; - const int shift = pad_front % 4; - VEC4_T cur_in_texel = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0); - VEC4_T next_in_texel; - // When shift is not 0, we need to read 2 texels from input tensor to write into output - // for example: - // input texel is [[1 2 3 4], [5 6 x x]] and front_pad = 2 - // output texel is [[p p 1 2], [3 4 5 6]], where p is the filled value then need to fetch 2 texels to fill [3 4 5 6]. 
- if (shift != 0) { - in_idx.z += 4; - next_in_texel = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0); - } else { - next_in_texel = cur_in_texel; - } - - VEC4_T inter_texel; - for (int i = 0; i < 4; i++) { - if (i < shift) { - inter_texel[i] = cur_in_texel[4-shift+i]; - } else { - inter_texel[i] = next_in_texel[i-shift]; - } - } - outtex = inter_texel * (VEC4_T(1) - channel_mask) + outtex * channel_mask; - } - - int packed_idx = idx[packed_dim]; - const int packed_dim_size = out_sizes[packed_dim]; - if (packed_idx + 3 >= packed_dim_size) { - ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3); - VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size))); - outtex = outtex * valid_idx; - } - - imageStore(t_out, pos, outtex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl deleted file mode 100644 index c5b2c692bdc..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl +++ /dev/null @@ -1,50 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec4", "out_sizes")} -${layout_declare_ubo(3, "ivec4", "in_sizes")} -${layout_declare_ubo(4, "int", "pad_left", "int", "pad_top", "int", "pad_front")} -${layout_declare_ubo(5, "float", "fill_value")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); - - if (pos_out_of_bounds(pos, out_sizes, packed_dim)) { - return; - } - - bool mask_height = idx.y >= pad_top && idx.y <= pad_top + in_sizes.y - 1; - bool mask_width = idx.x 
>= pad_left && idx.x <= pad_left + in_sizes.x - 1; - - VEC4_T outtex = VEC4_T(fill_value); - if (mask_height && mask_width) { - ivec4 in_idx = idx; - in_idx.x -= pad_left; - in_idx.y -= pad_top; - outtex = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0); - } - - int packed_idx = idx[packed_dim]; - const int packed_dim_size = out_sizes[packed_dim]; - if (packed_idx + 3 >= packed_dim_size) { - ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3); - VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size))); - outtex = outtex * valid_idx; - } - - imageStore(t_out, pos, outtex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl new file mode 100644 index 00000000000..75a3ba7e87d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl @@ -0,0 +1,81 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +${define_required_extensions("texture3d", DTYPE)} + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_load_type(DTYPE, "texture3d")} +#define T ${texel_load_component_type(DTYPE, "texture3d")} + +${define_active_storage_type("texture3d")} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +#include "common.glslh" +#include "indexing.glslh" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} + +${layout_declare_ubo(B, "TextureMetadata", "outp")} +${layout_declare_ubo(B, "TextureMetadata", "inp")} +${layout_declare_ubo(B, "int", "pad_left", "int", "pad_top", "int", "pad_front")} +${layout_declare_ubo(B, "float", "fill_value")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 out_pos = ivec3(gl_GlobalInvocationID); + + if (out_of_bounds(out_pos, outp)) { + return; + } + + // Convert the thread position to output tensor indices in element space. + // out_tidx.data[packed_dim] is the element index of the first component in + // this texel; the remaining three dims are scalar element indices. + TensorIndex4D out_tidx = texture_pos_to_tensor4d_idx_simple(outp, out_pos); + + // Tail texels may have fewer than 4 valid elements; leave extras as 0. + const int limit = + min(4, outp.sizes[outp.packed_dim] - out_tidx.data[outp.packed_dim]); + + VEC4_T out_texel = VEC4_T(0); + + // Process each of the (up to 4) elements in this output texel independently. + // For each element: subtract pad offsets to obtain the input element index, + // then copy from the input if in-bounds or write fill_value if in the padding + // region. 
+ [[unroll]] for (int comp = 0; comp < limit; comp++) { + TensorIndex4D in_tidx = out_tidx; + in_tidx.data[outp.packed_dim] += comp; + in_tidx.data[0] -= pad_left; + in_tidx.data[1] -= pad_top; + in_tidx.data[2] -= pad_front; + + // Signed underflow (output index < pad) produces a negative value that + // fails the >= 0 check, correctly identifying the padding region. + if (in_tidx.data[0] >= 0 && in_tidx.data[0] < inp.sizes[0] && + in_tidx.data[1] >= 0 && in_tidx.data[1] < inp.sizes[1] && + in_tidx.data[2] >= 0 && in_tidx.data[2] < inp.sizes[2]) { + TextureElementIndex elem = + tensor4d_idx_to_texture_element_idx_simple(inp, in_tidx); + VEC4_T in_texel = texelFetch(t_in, elem.pos, 0); + out_texel[comp] = T(in_texel[elem.comp]); + } else { + out_texel[comp] = T(fill_value); + } + } + + imageStore(t_out, out_pos, out_texel); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.yaml similarity index 65% rename from backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml rename to backends/vulkan/runtime/graph/ops/glsl/pad_texture.yaml index 2eb57291bb2..f2a40d289bf 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/pad_texture.yaml @@ -1,9 +1,6 @@ -pad_height_width: +pad_texture: parameter_names_with_default_values: - NDIM: 3 DTYPE: float - PACKING: C_packed - STORAGE: texture3d generate_variant_forall: DTYPE: - VALUE: float @@ -11,4 +8,4 @@ pad_height_width: - VALUE: int32 - VALUE: uint8 shader_variants: - - NAME: pad_height_width + - NAME: pad_texture3d diff --git a/backends/vulkan/runtime/graph/ops/impl/Pad.cpp b/backends/vulkan/runtime/graph/ops/impl/Pad.cpp index d225af05633..2b0ebbb98db 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Pad.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Pad.cpp @@ -59,22 +59,33 @@ void add_constant_pad_nd_node( ComputeGraph& graph, const ValueRef& in, const ValueRef& pad, - 
const ValueRef& fill_value, + const ValueRef& fill_value_ref, const ValueRef& out) { - const float fill_value_val = graph.extract_scalar<float>(fill_value); + const float fill_value_val = graph.extract_scalar<float>(fill_value_ref); const IntListPtr pad_vec = graph.get_int_list(pad); - - std::string kernel_name = ""; const PadParam pad_param = creat_pad_param(*pad_vec); - if (pad_vec->size() <= 4) { - kernel_name = "pad_height_width"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); + std::string kernel_name = "pad"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + vkapi::ParamsBindList param_ubos; + if (graph.is_buffer_storage(out)) { + // BufferMetadata stores sizes/strides in WHCN order (flip_and_unsqueeze + // reverses from NCHW). Map pad offsets to match: W=0, H=1, C=2. + utils::ivec4 pad_per_dim{pad_param.left, pad_param.top, pad_param.front, 0}; + param_ubos = { + graph.buffer_meta_ubo(out), + graph.buffer_meta_ubo(in), + graph.create_params_buffer(pad_per_dim), + graph.create_params_buffer(fill_value_val)}; } else { - kernel_name = "pad_channel"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); + param_ubos = { + graph.meta_ubo(out), + graph.meta_ubo(in), + graph.create_params_buffer(pad_param), + graph.create_params_buffer(fill_value_val)}; } graph.execute_nodes().emplace_back(new DynamicDispatchNode( @@ -82,13 +93,9 @@ void add_constant_pad_nd_node( VK_KERNEL_FROM_STR(kernel_name), default_pick_global_wg_size, default_pick_local_wg_size, - // Inputs and Outputs {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.sizes_ubo(out), - graph.sizes_ubo(in), - graph.create_params_buffer(pad_param), - graph.create_params_buffer(fill_value_val)}, + // Parameter buffers + param_ubos, // Push Constants {}, // Specialization Constants @@
-100,7 +107,7 @@ void add_constant_pad_nd_node( } void constant_pad_nd(ComputeGraph& graph, const std::vector<ValueRef>& args) { - return add_constant_pad_nd_node(graph, args[0], args[1], args[2], args[3]); + add_constant_pad_nd_node(graph, args[0], args[1], args[2], args[3]); } REGISTER_OPERATORS { diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 6a9db70adaa..3b2d6b8c48e 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -1868,6 +1868,14 @@ def get_constant_pad_nd_inputs(): ([L, M, M1, M2], [3, 3, 3, 3, 3, 3], 12.2), ] ) + test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kChannelsPacked", + ] + test_suite.storage_types = [ + "utils::kTexture3D", + "utils::kBuffer", + ] return test_suite