From 9ad16107baaa5a9b667d806c72a546b5152e4e58 Mon Sep 17 00:00:00 2001 From: John Gibson Date: Fri, 24 Apr 2026 14:34:18 -0400 Subject: [PATCH 1/4] optimized: add NEON grid_sampler_2d.out and Vectorized sum.IntList_out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two new optimized CPU kernels registered alongside the existing optimized_kernels library. Both replace the portable reference kernel (still available as fallback for unsupported inputs) with a vectorized implementation that accumulates in fp32, avoiding the fp16 precision issues noted in #19117 for grid_sampler_2d bilinear. Measured end-to-end on a real depth model (Pixel 9, fp16 inputs, shapes representative of the model's hot path): | Op | Portable | This PR | Speedup | | -------------------------------- | -------- | ------- | ------- | | grid_sampler_2d.out | 17.3 ms | 3.4 ms | 5.1x | | sum.IntList_out (5 calls, total) | 3.0 ms | 0.56 ms | 5.4x | ### grid_sampler_2d.out aarch64 NEON, bilinear + zeros padding only. Processes 4 channels per iteration with a vectorized FMA chain. fp16 inputs are promoted to fp32 for weight computation and accumulation, then cast back on store — the portable kernel's fp16 weight subtractions like `(ix_se - ix)` otherwise suffer catastrophic cancellation. Unsupported modes and non-aarch64 targets delegate to the portable kernel. ### sum.IntList_out at::vec::Vectorized-based implementation of the single-dim reduction fast path (both innermost-contiguous and strided cases). Cross-architecture SIMD via PyTorch's existing vector abstraction; accumulates in fp32 regardless of input dtype. Multi-dim reductions, dtype-converting reductions, and complex types delegate to portable. ### Integration - Sources added to OPTIMIZED_KERNELS_SRCS in build_variables.bzl and to OPTIMIZED_ATEN_OPS in op_registration_util.bzl. Single source of truth for both Buck and CMake builds. - optimized.yaml registers the ops with the standard opt_* naming convention used by sibling kernels. - kernels/optimized/CMakeLists.txt scopes the -march=armv8.2-a+fp16 flag to just op_grid_sampler_2d.cpp via set_source_files_properties, so x86_64 builds are unaffected. The kernel has #ifdef __aarch64__ guards and falls through to portable on non-arm64 targets. --- kernels/optimized/CMakeLists.txt | 15 + kernels/optimized/cpu/op_grid_sampler_2d.cpp | 343 ++++++++++++++++++ kernels/optimized/cpu/op_sum.cpp | 204 +++++++++++ kernels/optimized/optimized.yaml | 10 + .../executorch/build/build_variables.bzl | 2 + .../optimized/op_registration_util.bzl | 13 + 6 files changed, 587 insertions(+) create mode 100644 kernels/optimized/cpu/op_grid_sampler_2d.cpp create mode 100644 kernels/optimized/cpu/op_sum.cpp diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 69dae952255..b2f689885a6 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -75,6 +75,21 @@ target_link_libraries( kernels_util_all_deps ) target_compile_options(optimized_kernels PUBLIC ${_common_compile_options}) + +# op_grid_sampler_2d.cpp uses ARMv8.2-a+fp16 NEON intrinsics +# (vcvt_f32_f16 / vld1_f16) when compiled for aarch64. Scope the extra +# `-march` flag to just that source so non-arm64 targets (e.g. x86_64 on +# Android) are unaffected — the kernel itself has `#ifdef __aarch64__` +# guards and falls through to the portable kernel otherwise. 
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64" + OR ANDROID_ABI STREQUAL "arm64-v8a" +) + set_source_files_properties( + ${EXECUTORCH_ROOT}/kernels/optimized/cpu/op_grid_sampler_2d.cpp + PROPERTIES COMPILE_OPTIONS "-march=armv8.2-a+fp16" + ) +endif() + # Build a library for _optimized_kernels_srcs # # optimized_ops_lib: Register optimized ops kernels into Executorch runtime diff --git a/kernels/optimized/cpu/op_grid_sampler_2d.cpp b/kernels/optimized/cpu/op_grid_sampler_2d.cpp new file mode 100644 index 00000000000..e8c69a23bdb --- /dev/null +++ b/kernels/optimized/cpu/op_grid_sampler_2d.cpp @@ -0,0 +1,343 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Optimized grid_sampler_2d.out for CPU. On aarch64 this is a NEON-vectorized +// implementation for the common (bilinear + zeros padding) case, processing +// 4 channels at a time. Other modes — and non-aarch64 targets — fall through +// to the portable kernel. +// +// fp16 inputs: all interior math (interpolation weights and corner +// accumulation) is done in fp32. Loads/stores stay in the tensor's dtype. +// Avoids catastrophic cancellation on `ix_se - ix`-style subtractions that +// would otherwise make fp16 weights meaningless. + +#include + +#ifdef __aarch64__ +#include +#endif + +#include + +namespace torch { +namespace executor { +namespace native { + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; + +// Portable kernel (same-op fallback). Both libs link into the same binary. +Tensor& grid_sampler_2d_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners, + Tensor& out); + +#ifdef __aarch64__ +namespace { + +// One output spatial location, all channels. fp32 path. 
+inline void bilinear_all_channels_f32( + const float* input_n, + float* output_n, + int C, + int H_in, + int W_in, + int H_out, + int W_out, + int h_out, + int w_out, + float gx, + float gy) { + const int x0 = static_cast(std::floor(gx)); + const int y0 = static_cast(std::floor(gy)); + const int x1 = x0 + 1; + const int y1 = y0 + 1; + const float fx = gx - static_cast(x0); + const float fy = gy - static_cast(y0); + + const bool tl_v = static_cast(x0) < static_cast(W_in) && + static_cast(y0) < static_cast(H_in); + const bool tr_v = static_cast(x1) < static_cast(W_in) && + static_cast(y0) < static_cast(H_in); + const bool bl_v = static_cast(x0) < static_cast(W_in) && + static_cast(y1) < static_cast(H_in); + const bool br_v = static_cast(x1) < static_cast(W_in) && + static_cast(y1) < static_cast(H_in); + + const int off_tl = y0 * W_in + x0; + const int off_tr = y0 * W_in + x1; + const int off_bl = y1 * W_in + x0; + const int off_br = y1 * W_in + x1; + const int spatial_in = H_in * W_in; + const int spatial_out = H_out * W_out; + const int out_off = h_out * W_out + w_out; + + const float32x4_t vw_tl = vdupq_n_f32((1.0f - fx) * (1.0f - fy)); + const float32x4_t vw_tr = vdupq_n_f32(fx * (1.0f - fy)); + const float32x4_t vw_bl = vdupq_n_f32((1.0f - fx) * fy); + const float32x4_t vw_br = vdupq_n_f32(fx * fy); + + int c = 0; + for (; c + 3 < C; c += 4) { + const float* p0 = input_n + (c + 0) * spatial_in; + const float* p1 = input_n + (c + 1) * spatial_in; + const float* p2 = input_n + (c + 2) * spatial_in; + const float* p3 = input_n + (c + 3) * spatial_in; + + float tl[4] = {0}, tr[4] = {0}, bl[4] = {0}, br[4] = {0}; + if (tl_v) { + tl[0] = p0[off_tl]; tl[1] = p1[off_tl]; + tl[2] = p2[off_tl]; tl[3] = p3[off_tl]; + } + if (tr_v) { + tr[0] = p0[off_tr]; tr[1] = p1[off_tr]; + tr[2] = p2[off_tr]; tr[3] = p3[off_tr]; + } + if (bl_v) { + bl[0] = p0[off_bl]; bl[1] = p1[off_bl]; + bl[2] = p2[off_bl]; bl[3] = p3[off_bl]; + } + if (br_v) { + br[0] = p0[off_br]; br[1] = p1[off_br]; + br[2] = p2[off_br]; br[3] = p3[off_br]; + } + + float32x4_t result = vmulq_f32(vw_tl, vld1q_f32(tl)); + result = vfmaq_f32(result, vw_tr, vld1q_f32(tr)); + result = vfmaq_f32(result, vw_bl, vld1q_f32(bl)); + result = vfmaq_f32(result, vw_br, vld1q_f32(br)); + + float res[4]; + vst1q_f32(res, result); + output_n[(c + 0) * spatial_out + out_off] = res[0]; + output_n[(c + 1) * spatial_out + out_off] = res[1]; + output_n[(c + 2) * spatial_out + out_off] = res[2]; + output_n[(c + 3) * spatial_out + out_off] = res[3]; + } + + // Scalar tail + const float w_tl = (1.0f - fx) * (1.0f - fy); + const float w_tr = fx * (1.0f - fy); + const float w_bl = (1.0f - fx) * fy; + const float w_br = fx * fy; + for (; c < C; ++c) { + const float* p = input_n + c * spatial_in; + float v = 0.0f; + if (tl_v) v += w_tl * p[off_tl]; + if (tr_v) v += w_tr * p[off_tr]; + if (bl_v) v += w_bl * p[off_bl]; + if (br_v) v += w_br * p[off_br]; + output_n[c * spatial_out + out_off] = v; + } +} + +// fp16 path: loads/stores fp16, math in fp32. 
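+// Illustrative example (not kernel code): why the weights must be computed
+// in fp32 even when the tensors are fp16. Suppose the unnormalized grid
+// coordinate is ix = 200.3 in a 256-wide input. fp16 spacing at that
+// magnitude is 0.125, so ix rounds to 200.25 and the fractional weight
+// fx = ix - floor(ix) comes out as 0.25 instead of 0.3, roughly a 17%
+// relative error on the weight before any sampling happens. Promoted to
+// fp32 first, the same computation keeps the weight error down at the
+// 1e-5 level; only the final store rounds back to fp16.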
+inline void bilinear_all_channels_f16( + const __fp16* input_n, + __fp16* output_n, + int C, + int H_in, + int W_in, + int H_out, + int W_out, + int h_out, + int w_out, + float gx, + float gy) { + const int x0 = static_cast(std::floor(gx)); + const int y0 = static_cast(std::floor(gy)); + const int x1 = x0 + 1; + const int y1 = y0 + 1; + const float fx = gx - static_cast(x0); + const float fy = gy - static_cast(y0); + + const bool tl_v = static_cast(x0) < static_cast(W_in) && + static_cast(y0) < static_cast(H_in); + const bool tr_v = static_cast(x1) < static_cast(W_in) && + static_cast(y0) < static_cast(H_in); + const bool bl_v = static_cast(x0) < static_cast(W_in) && + static_cast(y1) < static_cast(H_in); + const bool br_v = static_cast(x1) < static_cast(W_in) && + static_cast(y1) < static_cast(H_in); + + const int off_tl = y0 * W_in + x0; + const int off_tr = y0 * W_in + x1; + const int off_bl = y1 * W_in + x0; + const int off_br = y1 * W_in + x1; + const int spatial_in = H_in * W_in; + const int spatial_out = H_out * W_out; + const int out_off = h_out * W_out + w_out; + + const float32x4_t vw_tl = vdupq_n_f32((1.0f - fx) * (1.0f - fy)); + const float32x4_t vw_tr = vdupq_n_f32(fx * (1.0f - fy)); + const float32x4_t vw_bl = vdupq_n_f32((1.0f - fx) * fy); + const float32x4_t vw_br = vdupq_n_f32(fx * fy); + + int c = 0; + for (; c + 3 < C; c += 4) { + const __fp16* p0 = input_n + (c + 0) * spatial_in; + const __fp16* p1 = input_n + (c + 1) * spatial_in; + const __fp16* p2 = input_n + (c + 2) * spatial_in; + const __fp16* p3 = input_n + (c + 3) * spatial_in; + + __fp16 tl[4] = {0}, tr[4] = {0}, bl[4] = {0}, br[4] = {0}; + if (tl_v) { + tl[0] = p0[off_tl]; tl[1] = p1[off_tl]; + tl[2] = p2[off_tl]; tl[3] = p3[off_tl]; + } + if (tr_v) { + tr[0] = p0[off_tr]; tr[1] = p1[off_tr]; + tr[2] = p2[off_tr]; tr[3] = p3[off_tr]; + } + if (bl_v) { + bl[0] = p0[off_bl]; bl[1] = p1[off_bl]; + bl[2] = p2[off_bl]; bl[3] = p3[off_bl]; + } + if (br_v) { + br[0] = p0[off_br]; br[1] = p1[off_br]; + br[2] = p2[off_br]; br[3] = p3[off_br]; + } + + const float32x4_t v_tl = vcvt_f32_f16(vld1_f16(tl)); + const float32x4_t v_tr = vcvt_f32_f16(vld1_f16(tr)); + const float32x4_t v_bl = vcvt_f32_f16(vld1_f16(bl)); + const float32x4_t v_br = vcvt_f32_f16(vld1_f16(br)); + + float32x4_t result = vmulq_f32(vw_tl, v_tl); + result = vfmaq_f32(result, vw_tr, v_tr); + result = vfmaq_f32(result, vw_bl, v_bl); + result = vfmaq_f32(result, vw_br, v_br); + + __fp16 res[4]; + vst1_f16(res, vcvt_f16_f32(result)); + output_n[(c + 0) * spatial_out + out_off] = res[0]; + output_n[(c + 1) * spatial_out + out_off] = res[1]; + output_n[(c + 2) * spatial_out + out_off] = res[2]; + output_n[(c + 3) * spatial_out + out_off] = res[3]; + } + + const float w_tl = (1.0f - fx) * (1.0f - fy); + const float w_tr = fx * (1.0f - fy); + const float w_bl = (1.0f - fx) * fy; + const float w_br = fx * fy; + for (; c < C; ++c) { + const __fp16* p = input_n + c * spatial_in; + float v = 0.0f; + if (tl_v) v += w_tl * static_cast(p[off_tl]); + if (tr_v) v += w_tr * static_cast(p[off_tr]); + if (bl_v) v += w_bl * static_cast(p[off_bl]); + if (br_v) v += w_br * static_cast(p[off_br]); + output_n[c * spatial_out + out_off] = static_cast<__fp16>(v); + } +} + +template +void grid_sampler_2d_neon( + const SCALAR* input, + const SCALAR* grid, + SCALAR* output, + int N, + int C, + int H_in, + int W_in, + int H_out, + int W_out, + bool align_corners, + SampleFn sample_fn) { + const int spatial_in = H_in * W_in; + const int spatial_out = H_out * W_out; + + for (int n = 
0; n < N; ++n) { + const SCALAR* input_n = input + n * C * spatial_in; + SCALAR* output_n = output + n * C * spatial_out; + const SCALAR* grid_n = grid + n * H_out * W_out * 2; + + for (int h = 0; h < H_out; ++h) { + if (h + 1 < H_out) { + __builtin_prefetch(grid_n + (h + 1) * W_out * 2, 0, 1); + } + for (int w = 0; w < W_out; ++w) { + const int grid_off = (h * W_out + w) * 2; + float gx = static_cast(grid_n[grid_off]); + float gy = static_cast(grid_n[grid_off + 1]); + if (align_corners) { + gx = (gx + 1.0f) * (W_in - 1) * 0.5f; + gy = (gy + 1.0f) * (H_in - 1) * 0.5f; + } else { + gx = (gx + 1.0f) * W_in * 0.5f - 0.5f; + gy = (gy + 1.0f) * H_in * 0.5f - 0.5f; + } + sample_fn( + input_n, output_n, C, H_in, W_in, H_out, W_out, h, w, gx, gy); + } + } + } +} + +} // namespace +#endif // __aarch64__ + +Tensor& opt_grid_sampler_2d_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners, + Tensor& out) { + // Only the bilinear + zeros-padding combination is accelerated. Everything + // else — and any non-aarch64 target — delegates to the portable kernel. + if (interpolation_mode != 0 || padding_mode != 0) { + return grid_sampler_2d_out( + ctx, input, grid, interpolation_mode, padding_mode, align_corners, out); + } +#ifndef __aarch64__ + return grid_sampler_2d_out( + ctx, input, grid, interpolation_mode, padding_mode, align_corners, out); +#else + const int N = static_cast(input.size(0)); + const int C = static_cast(input.size(1)); + const int H_in = static_cast(input.size(2)); + const int W_in = static_cast(input.size(3)); + const int H_out = static_cast(grid.size(1)); + const int W_out = static_cast(grid.size(2)); + + if (input.scalar_type() == ScalarType::Float) { + grid_sampler_2d_neon( + input.const_data_ptr(), + grid.const_data_ptr(), + out.mutable_data_ptr(), + N, C, H_in, W_in, H_out, W_out, + align_corners, + bilinear_all_channels_f32); + return out; + } + if (input.scalar_type() == ScalarType::Half) { + static_assert(sizeof(__fp16) == 2, "expected __fp16 == 2 bytes"); + grid_sampler_2d_neon<__fp16>( + reinterpret_cast(input.const_data_ptr()), + reinterpret_cast(grid.const_data_ptr()), + reinterpret_cast<__fp16*>(out.mutable_data_ptr()), + N, C, H_in, W_in, H_out, W_out, + align_corners, + bilinear_all_channels_f16); + return out; + } + // Any other dtype (e.g. Double, BFloat16): let portable handle it. + return grid_sampler_2d_out( + ctx, input, grid, interpolation_mode, padding_mode, align_corners, out); +#endif +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/optimized/cpu/op_sum.cpp b/kernels/optimized/cpu/op_sum.cpp new file mode 100644 index 00000000000..826bfb29c98 --- /dev/null +++ b/kernels/optimized/cpu/op_sum.cpp @@ -0,0 +1,204 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using executorch::aten::ArrayRef; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; + +// Forward decl of the portable kernel — used as a fallback for shapes and +// dtype combinations the optimized path doesn't specialize. Both libraries +// live in the same binary, so direct call is fine. 
+Tensor& sum_dim_out( + KernelRuntimeContext& ctx, + const Tensor& in, + std::optional> dim_list, + bool keepdim, + std::optional dtype, + Tensor& out); + +namespace { + +// Contiguous innermost reduction: sum each row of the inner axis into one +// scalar. fp16/bf16 accumulate in fp32 for precision; fp32 accumulates in +// fp32 directly. Uses at::vec::Vectorized for cross-arch SIMD. +template +inline void sum_innermost( + const CTYPE* in, + CTYPE* out, + int64_t outer_size, + int64_t reduce_size) { + using Vec = at::vec::Vectorized; + constexpr int64_t kVecSize = static_cast(Vec::size()); + for (int64_t i = 0; i < outer_size; ++i) { + const CTYPE* row = in + i * reduce_size; + Vec acc(0.0f); + int64_t j = 0; + for (; j + kVecSize - 1 < reduce_size; j += kVecSize) { + if constexpr (std::is_same_v) { + acc = acc + Vec::loadu(row + j); + } else { + // Half / BFloat16: load N elements, convert to float, add. + float tmp[kVecSize]; + for (int64_t k = 0; k < kVecSize; ++k) { + tmp[k] = static_cast(row[j + k]); + } + acc = acc + Vec::loadu(tmp); + } + } + float sum = at::vec::vec_reduce_all( + [](Vec a, Vec b) { return a + b; }, acc); + for (; j < reduce_size; ++j) { + sum += static_cast(row[j]); + } + out[i] = static_cast(sum); + } +} + +// Non-innermost (strided) single-dim reduction. For each (outer, inner) pair, +// sum over reduce_size elements spaced `inner_size` apart. Vectorize across +// the contiguous inner axis (so each add-step processes kVecSize output +// positions at once). +template +inline void sum_strided( + const CTYPE* in, + CTYPE* out, + int64_t outer_size, + int64_t reduce_size, + int64_t inner_size) { + using Vec = at::vec::Vectorized; + constexpr int64_t kVecSize = static_cast(Vec::size()); + const int64_t outer_stride = reduce_size * inner_size; + for (int64_t o = 0; o < outer_size; ++o) { + const CTYPE* in_o = in + o * outer_stride; + CTYPE* out_o = out + o * inner_size; + int64_t j = 0; + for (; j + kVecSize - 1 < inner_size; j += kVecSize) { + Vec acc(0.0f); + for (int64_t k = 0; k < reduce_size; ++k) { + const CTYPE* p = in_o + k * inner_size + j; + if constexpr (std::is_same_v) { + acc = acc + Vec::loadu(p); + } else { + float tmp[kVecSize]; + for (int64_t m = 0; m < kVecSize; ++m) { + tmp[m] = static_cast(p[m]); + } + acc = acc + Vec::loadu(tmp); + } + } + if constexpr (std::is_same_v) { + acc.store(out_o + j); + } else { + float tmp[kVecSize]; + acc.store(tmp); + for (int64_t m = 0; m < kVecSize; ++m) { + out_o[j + m] = static_cast(tmp[m]); + } + } + } + for (; j < inner_size; ++j) { + float sum = 0.0f; + for (int64_t k = 0; k < reduce_size; ++k) { + sum += static_cast(in_o[k * inner_size + j]); + } + out_o[j] = static_cast(sum); + } + } +} + +} // namespace + +Tensor& opt_sum_dim_out( + KernelRuntimeContext& ctx, + const Tensor& in, + std::optional> dim_list, + bool keepdim, + std::optional dtype, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + check_reduction_args(in, dim_list, keepdim, dtype, out), + InvalidArgument, + out); + ET_KERNEL_CHECK( + ctx, + resize_reduction_out(in, dim_list, keepdim, out) == Error::Ok, + InvalidArgument, + out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + + if (in.numel() == 0) { + if (out.numel() > 0) { + std::memset(out.mutable_data_ptr(), 0, out.nbytes()); + } + return out; + } + + // Fast path: single reduction dim, matching dtype, non-complex, contiguous. 
+ // Anything else falls through to the portable kernel. + const bool fast_eligible = dim_list.has_value() && + dim_list.value().size() == 1 && + in.scalar_type() == out.scalar_type() && + !executorch::runtime::isComplexType(in.scalar_type()) && + tensor_is_contiguous(in); + + if (fast_eligible) { + const int64_t d = dim_list.value()[0] < 0 ? dim_list.value()[0] + in.dim() + : dim_list.value()[0]; + int64_t outer_size = 1, reduce_size = in.size(d), inner_size = 1; + for (int64_t i = 0; i < d; ++i) { + outer_size *= in.size(i); + } + for (int64_t i = d + 1; i < in.dim(); ++i) { + inner_size *= in.size(i); + } + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "sum.IntList_out"; + bool handled = false; + ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] { + const CTYPE* ip = in.const_data_ptr(); + CTYPE* op = out.mutable_data_ptr(); + if (inner_size == 1) { + sum_innermost(ip, op, outer_size, reduce_size); + handled = true; + } else { + sum_strided(ip, op, outer_size, reduce_size, inner_size); + handled = true; + } + }); + if (handled) { + return out; + } + } + + // Fallback. + return sum_dim_out(ctx, in, dim_list, keepdim, dtype, out); +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/optimized/optimized.yaml b/kernels/optimized/optimized.yaml index 58121549ea5..5a001afc7a0 100644 --- a/kernels/optimized/optimized.yaml +++ b/kernels/optimized/optimized.yaml @@ -57,6 +57,11 @@ - arg_meta: null kernel_name: torch::executor::opt_gelu_out +- op: grid_sampler_2d.out + kernels: + - arg_meta: null + kernel_name: torch::executor::opt_grid_sampler_2d_out + - op: le.Scalar_out kernels: - arg_meta: null @@ -97,6 +102,11 @@ - arg_meta: null kernel_name: torch::executor::opt_sub_out +- op: sum.IntList_out + kernels: + - arg_meta: null + kernel_name: torch::executor::opt_sum_dim_out + - op: sub.Scalar_out kernels: - arg_meta: null diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl index edddc1da916..b0545b8ce18 100644 --- a/shim_et/xplat/executorch/build/build_variables.bzl +++ b/shim_et/xplat/executorch/build/build_variables.bzl @@ -267,6 +267,7 @@ OPTIMIZED_KERNELS_SRCS = [ "kernels/optimized/cpu/op_fft_c2r.cpp", "kernels/optimized/cpu/op_fft_r2c.cpp", "kernels/optimized/cpu/op_gelu.cpp", + "kernels/optimized/cpu/op_grid_sampler_2d.cpp", "kernels/optimized/cpu/op_le.cpp", "kernels/optimized/cpu/op_linear.cpp", "kernels/optimized/cpu/op_log_softmax.cpp", @@ -274,6 +275,7 @@ OPTIMIZED_KERNELS_SRCS = [ "kernels/optimized/cpu/op_mul.cpp", "kernels/optimized/cpu/op_native_layer_norm.cpp", "kernels/optimized/cpu/op_sub.cpp", + "kernels/optimized/cpu/op_sum.cpp", "kernels/optimized/cpu/op_where.cpp", ] diff --git a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl index bc43688b04e..65683625d5b 100644 --- a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl @@ -217,6 +217,12 @@ OPTIMIZED_ATEN_OPS = ( "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch", ], ), + op_target( + name = "op_grid_sampler_2d", + deps = [ + "//executorch/kernels/portable/cpu:op_grid_sampler_2d", + ], + ), op_target( name = "op_le", deps = [ @@ -282,6 +288,13 @@ OPTIMIZED_ATEN_OPS = ( "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch", ], 
), + op_target( + name = "op_sum", + deps = [ + "//executorch/kernels/portable/cpu:op_sum", + "//executorch/kernels/portable/cpu/util:reduce_util", + ], + ), op_target( name = "op_where", deps = [ From f4086e339fcf0993040bc41434ea8ae2c2ee1b38 Mon Sep 17 00:00:00 2001 From: John Gibson Date: Fri, 24 Apr 2026 14:40:14 -0400 Subject: [PATCH 2/4] tools/cmake: carry preset.cmake DESCRIPTION quoting fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same one-char fix as pytorch/executorch#19117 (and our PR #2): the DESCRIPTION argument to `set(...CACHE TYPE DOCSTRING)` was expanded unquoted, so multi-word descriptions on STRING options passed via `-D` spilled their trailing words into subsequent set() args. This was latent until PR #3 introduced EXECUTORCH_VULKAN_FP16_PRECISION with a multi-word help string — builds that set it (e.g. via scripts/build_android_library.sh forwarding the env var) then fail. Carried here so this branch remains self-contained and buildable independent of the merge order of PR #2. Drops cleanly after PR #2 lands; git will treat the duplicate line as a no-op. --- tools/cmake/common/preset.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cmake/common/preset.cmake b/tools/cmake/common/preset.cmake index 4ac45e28562..7d016db2347 100644 --- a/tools/cmake/common/preset.cmake +++ b/tools/cmake/common/preset.cmake @@ -82,12 +82,12 @@ macro(define_overridable_option NAME DESCRIPTION VALUE_TYPE DEFAULT_VALUE) if(DEFINED ${NAME} AND NOT DEFINED CACHE{${NAME}}) set(${NAME} ${${NAME}} - CACHE ${VALUE_TYPE} ${DESCRIPTION} FORCE + CACHE ${VALUE_TYPE} "${DESCRIPTION}" FORCE ) else() set(${NAME} ${DEFAULT_VALUE} - CACHE ${VALUE_TYPE} ${DESCRIPTION} + CACHE ${VALUE_TYPE} "${DESCRIPTION}" ) endif() From 8721bfa57541edd05f5ba63548d0fdb2a53e5d56 Mon Sep 17 00:00:00 2001 From: John Gibson Date: Fri, 24 Apr 2026 14:47:00 -0400 Subject: [PATCH 3/4] kernels/optimized: add on-device verify_optimized_kernels binary Standalone aarch64 binary that cross-checks opt_grid_sampler_2d_out and opt_sum_dim_out against an fp32 reference derived from the portable kernel (portable run on up-cast fp32 inputs, then down-cast to fp16). Reference is independent of portable's own fp16 path, so the test stays meaningful regardless of #19117's merge state. Pass/fail uses numpy.testing.assert_allclose semantics: |a - b| <= abs_tol + rel_tol * |b| Avoids the "relative error explodes at zero crossings" trap for mean-zero reductions and bilinear samples near cancellation points. Opt-in via -DEXECUTORCH_BUILD_OPTIMIZED_VERIFY=ON so default builds are unaffected. Build + run: cmake -DEXECUTORCH_BUILD_OPTIMIZED_VERIFY=ON ... cmake --build --target verify_optimized_kernels adb push /kernels/optimized/verify_optimized_kernels /data/local/tmp/ adb shell /data/local/tmp/verify_optimized_kernels Exits 0 on all-pass; reports max_abs / max_rel(far) / near_zero / viol per test case. 12 test cases across grid_sampler and sum, covering the shapes the polycam depth model uses plus a few edge cases (odd channel count, align_corners=1, multi-batch). 
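To restate the fp16 reference construction schematically (names are
illustrative, not the actual helpers in verify.cpp):

    in_f32[i]  = float(in_f16[i])            // up-cast inputs once
    ref_f32    = portable_kernel(in_f32)     // all math in fp32
    ref_f16[i] = __fp16(ref_f32[i])          // single rounding at the end

so the optimized fp16 output is compared against the best result fp16 can
represent, not against portable's own fp16 arithmetic.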
--- kernels/optimized/CMakeLists.txt | 24 ++ kernels/optimized/verify.cpp | 533 +++++++++++++++++++++++++++++++ 2 files changed, 557 insertions(+) create mode 100644 kernels/optimized/verify.cpp diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index b2f689885a6..e47c2293f82 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -98,6 +98,30 @@ gen_operators_lib( executorch_core ) +# On-device verifier for optimized grid_sampler_2d / sum.IntList_out. +# Opt-in via -DEXECUTORCH_BUILD_OPTIMIZED_VERIFY=ON so it doesn't affect +# default AAR / library builds. Cross-checks both ops against an fp32 +# reference derived from the portable kernel; non-zero exit on divergence. +if(EXECUTORCH_BUILD_OPTIMIZED_VERIFY) + add_executable( + verify_optimized_kernels ${EXECUTORCH_ROOT}/kernels/optimized/verify.cpp + ) + target_link_libraries( + verify_optimized_kernels + PRIVATE optimized_kernels portable_kernels executorch_core + ) + target_compile_options( + verify_optimized_kernels PRIVATE ${_common_compile_options} + ) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64" + OR ANDROID_ABI STREQUAL "arm64-v8a" + ) + target_compile_options( + verify_optimized_kernels PRIVATE -march=armv8.2-a+fp16 + ) + endif() +endif() + install( # eigen_blas doesn't export itself, so we have to do our own install to export # it. diff --git a/kernels/optimized/verify.cpp b/kernels/optimized/verify.cpp new file mode 100644 index 00000000000..b9c471c0d8f --- /dev/null +++ b/kernels/optimized/verify.cpp @@ -0,0 +1,533 @@ +// Standalone on-device verifier for the optimized_kernels implementations of +// grid_sampler_2d.out and sum.IntList_out, cross-checked against an fp32 +// reference derived from the portable kernel. +// +// Build target: `verify_optimized_kernels` (opt-in via +// -DEXECUTORCH_BUILD_OPTIMIZED_VERIFY=ON). +// +// Usage: +// adb push /path/to/verify_optimized_kernels /data/local/tmp/ +// adb shell /data/local/tmp/verify_optimized_kernels +// +// Reports max abs / max rel diff per test case. Non-zero exit on divergence +// beyond tolerance. For fp16 tests, reference is portable run on up-cast +// fp32 inputs, then down-cast — independent of portable's own fp16 path. + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using executorch::aten::ArrayRef; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::etensor::TensorImpl; +using executorch::aten::DimOrderType; +using executorch::aten::SizesType; +using executorch::aten::StridesType; +using torch::executor::KernelRuntimeContext; + +// ====================================================================== +// Forward decls for the two implementations we want to compare. +// ====================================================================== + +namespace torch { +namespace executor { +namespace native { +// Portable reference implementations. +Tensor& sum_dim_out( + KernelRuntimeContext&, + const Tensor&, + std::optional>, + bool, + std::optional, + Tensor&); +Tensor& grid_sampler_2d_out( + KernelRuntimeContext&, + const Tensor&, + const Tensor&, + int64_t, + int64_t, + bool, + Tensor&); +// Optimized implementations (this directory). 
+Tensor& opt_sum_dim_out( + KernelRuntimeContext&, + const Tensor&, + std::optional>, + bool, + std::optional, + Tensor&); +Tensor& opt_grid_sampler_2d_out( + KernelRuntimeContext&, + const Tensor&, + const Tensor&, + int64_t, + int64_t, + bool, + Tensor&); +} // namespace native +} // namespace executor +} // namespace torch + +// ====================================================================== +// Tiny tensor builder: owns storage + metadata, hands out a Tensor view. +// ====================================================================== + +struct OwnedTensor { + std::vector sizes; + std::vector dim_order; + std::vector strides; + std::vector storage; + std::unique_ptr impl; + + OwnedTensor() = default; + OwnedTensor(const OwnedTensor&) = delete; + OwnedTensor& operator=(const OwnedTensor&) = delete; + OwnedTensor(OwnedTensor&&) = default; + OwnedTensor& operator=(OwnedTensor&&) = default; + + static OwnedTensor make(ScalarType dtype, std::vector shape) { + OwnedTensor t; + t.sizes.assign(shape.begin(), shape.end()); + t.dim_order.resize(shape.size()); + for (size_t i = 0; i < shape.size(); ++i) { + t.dim_order[i] = static_cast(i); + } + t.strides.resize(shape.size()); + int32_t running = 1; + for (int i = static_cast(shape.size()) - 1; i >= 0; --i) { + t.strides[i] = running; + running *= shape[i]; + } + size_t numel = 1; + for (auto s : shape) { + numel *= static_cast(s); + } + size_t elem_size = (dtype == ScalarType::Float) ? 4 + : (dtype == ScalarType::Half) ? 2 + : 4; + t.storage.assign(numel * elem_size, 0); + t.impl = std::make_unique( + dtype, + static_cast(shape.size()), + t.sizes.data(), + t.storage.data(), + t.dim_order.data(), + t.strides.data()); + return t; + } + + Tensor view() { + // Hold the Tensor as a member so callers can bind it to Tensor& + // parameters (the kernel signatures take non-const Tensor& for the out + // tensor). Refreshes each call because TensorImpl pointer may shift + // if the OwnedTensor is moved (though we disallow move in practice). + tensor_view_ = Tensor(impl.get()); + return tensor_view_; + } + Tensor& view_ref() { + tensor_view_ = Tensor(impl.get()); + return tensor_view_; + } + + template + T* data() { + return reinterpret_cast(storage.data()); + } + + size_t numel() const { + size_t n = 1; + for (auto s : sizes) { + n *= static_cast(s); + } + return n; + } + + private: + Tensor tensor_view_{nullptr}; +}; + +// ====================================================================== +// Compare helpers. +// ====================================================================== + +template +struct DiffStats { + double max_abs = 0; + double max_rel_nonzero = 0; // rel diff ignoring near-zero cells + size_t violations = 0; // elements failing combined tol check + size_t count = 0; + // How many elements were near-zero enough that rel is meaningless. + size_t near_zero = 0; + + bool passes() const { + return violations == 0; + } +}; + +// numpy.testing.assert_allclose semantics: +// |a - b| <= abs_tol + rel_tol * |b| +// Near-zero cells are bounded by abs_tol alone; away from zero, rel_tol +// dominates. Avoids the "relative error explodes at zero crossings" trap. 
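+// Worked example with the fp16 sum tolerances used below (abs_tol=1e-2,
+// rel_tol=2e-3; the values of a and b are illustrative):
+//   near zero:      b=0.003, a=0.007 -> |a-b|=0.004 <= 1e-2 + 2e-3*0.003,
+//                   so it passes even though the relative error is ~130%.
+//   away from zero: b=50.0,  a=50.2  -> |a-b|=0.2 > 1e-2 + 2e-3*50 = 0.11,
+//                   so it fails; rel_tol dominates at that magnitude.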
+template +DiffStats diff( + const T* a, + const T* b, + size_t n, + double abs_tol, + double rel_tol) { + DiffStats s; + s.count = n; + for (size_t i = 0; i < n; ++i) { + double va = static_cast(a[i]); + double vb = static_cast(b[i]); + double abs_d = std::fabs(va - vb); + double bound = abs_tol + rel_tol * std::fabs(vb); + if (abs_d > bound) { + ++s.violations; + } + s.max_abs = std::max(s.max_abs, abs_d); + double mag = std::max(std::fabs(va), std::fabs(vb)); + if (mag < 10 * abs_tol) { + ++s.near_zero; + } else { + s.max_rel_nonzero = std::max(s.max_rel_nonzero, abs_d / mag); + } + } + return s; +} + +// Half <-> float conversion via ARM fp16 type (aarch64-only). We already build +// with -march=armv8.2-a+fp16, so these are cheap. +#ifdef __aarch64__ +#include +static inline float half_to_float(uint16_t h) { + __fp16 f; + std::memcpy(&f, &h, sizeof(f)); + return static_cast(f); +} +static inline uint16_t float_to_half(float f) { + __fp16 h = static_cast<__fp16>(f); + uint16_t u; + std::memcpy(&u, &h, sizeof(u)); + return u; +} +#endif + +// ====================================================================== +// Test cases. +// ====================================================================== + +struct TestResult { + const char* name; + size_t n; + double max_abs; + double max_rel; + bool passed; +}; + +static std::vector results; + +template +void report( + const char* name, + const DiffStats& s, + double abs_tol, + double rel_tol) { + bool ok = s.passes(); + results.push_back({name, s.count, s.max_abs, s.max_rel_nonzero, ok}); + std::printf( + " %-58s n=%-7zu max_abs=%-10.3g max_rel(far)=%-10.3g near_zero=%-5zu viol=%-4zu [%s]\n", + name, + s.count, + s.max_abs, + s.max_rel_nonzero, + s.near_zero, + s.violations, + ok ? "PASS" : "FAIL"); +} + +// ---------- grid_sampler_2d tests ---------- + +template +static void test_grid_sampler( + const char* label, + int N, + int C, + int H_in, + int W_in, + int H_out, + int W_out, + bool align_corners) { + auto input = OwnedTensor::make(DTYPE, {N, C, H_in, W_in}); + auto grid = OwnedTensor::make(DTYPE, {N, H_out, W_out, 2}); + auto out_neon = OwnedTensor::make(DTYPE, {N, C, H_out, W_out}); + + // For fp16 / bf16 the portable kernel's own fp16 path is itself imprecise + // (catastrophic cancellation on weight computation). We compute the + // reference by up-casting inputs to fp32, running portable in fp32, and + // down-casting the output — that's the "best achievable" fp16 output. + // For fp32 this upcast is a no-op and we just run portable directly. + auto input_f = OwnedTensor::make(ScalarType::Float, {N, C, H_in, W_in}); + auto grid_f = OwnedTensor::make(ScalarType::Float, {N, H_out, W_out, 2}); + auto out_ref_f = + OwnedTensor::make(ScalarType::Float, {N, C, H_out, W_out}); + + std::mt19937 rng(12345); + std::uniform_real_distribution ud(-1.0f, 1.0f); + + auto* in_d = input.data(); + auto* in_fd = input_f.data(); + for (size_t i = 0; i < input.numel(); ++i) { + float v = ud(rng); +#ifdef __aarch64__ + if constexpr (std::is_same_v) { + in_d[i] = float_to_half(v); + in_fd[i] = half_to_float(in_d[i]); // round-trip to match fp16 input + } else { + in_d[i] = static_cast(v); + in_fd[i] = static_cast(in_d[i]); + } +#else + in_d[i] = static_cast(v); + in_fd[i] = static_cast(in_d[i]); +#endif + } + + // Grid has spread of coordinates — mostly in [-1, 1] with some edges. 
+ auto* g_d = grid.data(); + auto* g_fd = grid_f.data(); + std::uniform_real_distribution gd(-1.1f, 1.1f); + for (size_t i = 0; i < grid.numel(); ++i) { + float v = gd(rng); +#ifdef __aarch64__ + if constexpr (std::is_same_v) { + g_d[i] = float_to_half(v); + g_fd[i] = half_to_float(g_d[i]); + } else { + g_d[i] = static_cast(v); + g_fd[i] = static_cast(g_d[i]); + } +#else + g_d[i] = static_cast(v); + g_fd[i] = static_cast(g_d[i]); +#endif + } + + // Reference: portable kernel run on fp32 inputs. + KernelRuntimeContext ctx_ref, ctx_neon; + torch::executor::native::grid_sampler_2d_out( + ctx_ref, + input_f.view(), + grid_f.view(), + /*interpolation_mode=*/0, + /*padding_mode=*/0, + align_corners, + out_ref_f.view_ref()); + + // Optimized kernel on the native-dtype inputs. + torch::executor::native::opt_grid_sampler_2d_out( + ctx_neon, + input.view(), + grid.view(), + /*interpolation_mode=*/0, + /*padding_mode=*/0, + align_corners, + out_neon.view_ref()); + + // Compare both to the fp32 reference. For fp16/bf16, down-cast the + // reference to the optimized output's dtype before comparing — optimized can't + // represent more precision than its output dtype allows. + std::vector ref_f(out_ref_f.numel()); + std::vector neon_f(out_neon.numel()); + auto* ref_fd = out_ref_f.data(); + auto* neon_d = out_neon.data(); + for (size_t i = 0; i < ref_f.size(); ++i) { +#ifdef __aarch64__ + if constexpr (std::is_same_v) { + // Round reference through fp16 to match optimized output precision. + uint16_t ref_h = float_to_half(ref_fd[i]); + ref_f[i] = half_to_float(ref_h); + neon_f[i] = half_to_float(neon_d[i]); + } else { + ref_f[i] = ref_fd[i]; + neon_f[i] = static_cast(neon_d[i]); + } +#else + ref_f[i] = ref_fd[i]; + neon_f[i] = static_cast(neon_d[i]); +#endif + } + // Portable and optimized both accumulate in fp32. For fp16 inputs the only + // remaining difference is the final fp16 round-trip on store (half a ULP) + // plus tiny FMA ordering noise. + double abs_tol = (DTYPE == ScalarType::Float) ? 1e-5 : 1e-3; + double rel_tol = (DTYPE == ScalarType::Float) ? 1e-4 : 2e-3; + auto s = diff(ref_f.data(), neon_f.data(), ref_f.size(), abs_tol, rel_tol); + report(label, s, abs_tol, rel_tol); +} + +// ---------- sum.IntList_out tests ---------- + +template +static void test_sum( + const char* label, + std::vector input_shape, + int64_t reduce_dim, + bool keepdim) { + auto input = OwnedTensor::make(DTYPE, input_shape); + // Compute output shape. + std::vector out_shape = input_shape; + if (keepdim) { + out_shape[reduce_dim] = 1; + } else { + out_shape.erase(out_shape.begin() + reduce_dim); + } + auto out_neon = OwnedTensor::make(DTYPE, out_shape); + + // Same strategy as the grid_sampler test: fp32 reference run on up-cast + // inputs, then down-cast the output for comparison. Avoids depending on + // portable's fp16 accumulator precision. 
+ auto input_f = OwnedTensor::make(ScalarType::Float, input_shape); + auto out_ref_f = OwnedTensor::make(ScalarType::Float, out_shape); + + std::mt19937 rng(9999); + std::uniform_real_distribution ud(-1.0f, 1.0f); + + auto* in_d = input.data(); + auto* in_fd = input_f.data(); + for (size_t i = 0; i < input.numel(); ++i) { + float v = ud(rng); +#ifdef __aarch64__ + if constexpr (std::is_same_v) { + in_d[i] = float_to_half(v); + in_fd[i] = half_to_float(in_d[i]); + } else { + in_d[i] = static_cast(v); + in_fd[i] = static_cast(in_d[i]); + } +#else + in_d[i] = static_cast(v); + in_fd[i] = static_cast(in_d[i]); +#endif + } + + std::array dims = {reduce_dim}; + std::optional> dim_list{ArrayRef(dims.data(), 1)}; + std::optional dtype_opt = std::nullopt; + + KernelRuntimeContext ctx_ref, ctx_neon; + torch::executor::native::sum_dim_out( + ctx_ref, + input_f.view(), + dim_list, + keepdim, + dtype_opt, + out_ref_f.view_ref()); + torch::executor::native::opt_sum_dim_out( + ctx_neon, input.view(), dim_list, keepdim, dtype_opt, out_neon.view_ref()); + + std::vector ref_f(out_ref_f.numel()); + std::vector neon_f(out_neon.numel()); + auto* ref_fd = out_ref_f.data(); + auto* neon_d = out_neon.data(); + for (size_t i = 0; i < ref_f.size(); ++i) { +#ifdef __aarch64__ + if constexpr (std::is_same_v) { + uint16_t ref_h = float_to_half(ref_fd[i]); + ref_f[i] = half_to_float(ref_h); + neon_f[i] = half_to_float(neon_d[i]); + } else { + ref_f[i] = ref_fd[i]; + neon_f[i] = static_cast(neon_d[i]); + } +#else + ref_f[i] = ref_fd[i]; + neon_f[i] = static_cast(neon_d[i]); +#endif + } + // Portable and optimized both accumulate in fp32. For fp16 inputs the only + // remaining delta is the final fp16-cast on store and any FMA reordering. + double abs_tol = (DTYPE == ScalarType::Float) ? 1e-4 : 1e-2; + double rel_tol = (DTYPE == ScalarType::Float) ? 1e-4 : 2e-3; + auto s = diff(ref_f.data(), neon_f.data(), ref_f.size(), abs_tol, rel_tol); + report(label, s, abs_tol, rel_tol); +} + +// ====================================================================== +// Entry point. +// ====================================================================== + +int main() { + std::printf( + "=== grid_sampler_2d.out: optimized vs portable-fp32 reference ===\n" + "(for fp16 tests, reference is portable run on up-cast fp32 inputs,\n" + " then down-cast to fp16 — independent of portable's fp16 path)\n"); + test_grid_sampler( + "grid_sampler fp32 N=1 C=16 in=32x48 out=24x32 align=0", 1, 16, 32, 48, 24, 32, false); + test_grid_sampler( + "grid_sampler fp32 N=1 C=32 in=72x96 out=72x96 align=1", 1, 32, 72, 96, 72, 96, true); + test_grid_sampler( + "grid_sampler fp32 N=1 C=7 (odd) in=16x24 out=16x24 align=0", 1, 7, 16, 24, 16, 24, false); + test_grid_sampler( + "grid_sampler fp32 N=2 C=64 in=48x64 out=48x64 align=0", 2, 64, 48, 64, 48, 64, false); +#ifdef __aarch64__ + test_grid_sampler( + "grid_sampler fp16 N=1 C=16 in=32x48 out=24x32 align=0", 1, 16, 32, 48, 24, 32, false); + test_grid_sampler( + "grid_sampler fp16 N=1 C=32 in=72x96 out=72x96 align=1", 1, 32, 72, 96, 72, 96, true); +#endif + + std::printf("\n=== sum.IntList_out: optimized vs portable-fp32 reference ===\n"); + // Innermost reduction. + test_sum( + "sum fp32 [1, 32, 192, 256] reduce=-1 keepdim=0", + {1, 32, 192, 256}, + 3, + false); + test_sum( + "sum fp32 [2, 64, 128] reduce=2 keepdim=1", + {2, 64, 128}, + 2, + true); + // Middle (strided) reduction. 
+ test_sum( + "sum fp32 [1, 32, 192, 256] reduce=1 keepdim=0", + {1, 32, 192, 256}, + 1, + false); + test_sum( + "sum fp32 [4, 16, 64, 64] reduce=2 keepdim=0", + {4, 16, 64, 64}, + 2, + false); +#ifdef __aarch64__ + test_sum( + "sum fp16 [1, 32, 192, 256] reduce=-1 keepdim=0", + {1, 32, 192, 256}, + 3, + false); + test_sum( + "sum fp16 [1, 32, 192, 256] reduce=1 keepdim=0", + {1, 32, 192, 256}, + 1, + false); +#endif + + int failed = 0; + for (auto& r : results) { + if (!r.passed) ++failed; + } + std::printf( + "\n=== %zu tests total, %d failed ===\n", results.size(), failed); + return failed == 0 ? 0 : 1; +} From b2bdc94c4d2c111e9db800cc65c0830e63bf9a91 Mon Sep 17 00:00:00 2001 From: John Gibson Date: Fri, 24 Apr 2026 15:12:24 -0400 Subject: [PATCH 4/4] optimized: fallback to portable grid_sampler_2d for non-default layouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NEON fast path indexes input/grid/out directly assuming contiguous NCHW default-dim-order layout — no use of .strides() or .dim_order(). If the caller passes anything else (NHWC, transposed, strided, channels- last), we'd read wrong memory and silently produce garbage output. Add the same check pattern op_sum.cpp already uses at L150-151: tensor_is_default_dim_order + tensor_is_contiguous on input, grid, and out. If any fails, delegate to the portable kernel (which handles arbitrary strides / dim orders correctly via .strides()). No perf impact on the hot path — the checks are a handful of scalar comparisons run once per call, and the common polycam depth model case is already default-contiguous so the fast path is still taken. --- kernels/optimized/cpu/op_grid_sampler_2d.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kernels/optimized/cpu/op_grid_sampler_2d.cpp b/kernels/optimized/cpu/op_grid_sampler_2d.cpp index e8c69a23bdb..e3fe8c49779 100644 --- a/kernels/optimized/cpu/op_grid_sampler_2d.cpp +++ b/kernels/optimized/cpu/op_grid_sampler_2d.cpp @@ -294,9 +294,20 @@ Tensor& opt_grid_sampler_2d_out( int64_t padding_mode, bool align_corners, Tensor& out) { + // The NEON path indexes input/grid/out directly assuming a contiguous NCHW + // default-dim-order layout — no use of .strides() or .dim_order(). If the + // caller passes anything else, fall back to portable (which does handle + // arbitrary strides and dim orders correctly). These are cheap checks. + const bool fast_eligible = tensor_is_default_dim_order(input) && + tensor_is_default_dim_order(grid) && + tensor_is_default_dim_order(out) && + tensor_is_contiguous(input) && + tensor_is_contiguous(grid) && + tensor_is_contiguous(out); + // Only the bilinear + zeros-padding combination is accelerated. Everything - // else — and any non-aarch64 target — delegates to the portable kernel. - if (interpolation_mode != 0 || padding_mode != 0) { + // else — non-default layout, any non-aarch64 target — delegates to portable. + if (interpolation_mode != 0 || padding_mode != 0 || !fast_eligible) { return grid_sampler_2d_out( ctx, input, grid, interpolation_mode, padding_mode, align_corners, out); }
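As a concrete illustration of the layout assumption this patch guards against
(a minimal sketch with assumed toy shapes, not code from the kernel): the same
logical element maps to different flat offsets under contiguous NCHW and
channels-last storage, so indexing a channels-last buffer with the NCHW
formula silently reads a different element instead of failing.

    // Assumed shape: N=1, C=8, H=4, W=4; element (n=0, c=2, h=1, w=3).
    constexpr int C = 8, H = 4, W = 4;
    constexpr int c = 2, h = 1, w = 3;
    constexpr int off_nchw = c * H * W + h * W + w;  // 39: what the NEON path computes
    constexpr int off_nhwc = (h * W + w) * C + c;    // 58: where channels-last stores it
    static_assert(off_nchw != off_nhwc, "NCHW indexing misreads a channels-last buffer");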