From 9ad16107baaa5a9b667d806c72a546b5152e4e58 Mon Sep 17 00:00:00 2001 From: John Gibson Date: Fri, 24 Apr 2026 14:34:18 -0400 Subject: [PATCH 1/4] optimized: add NEON grid_sampler_2d.out and Vectorized sum.IntList_out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two new optimized CPU kernels registered alongside the existing optimized_kernels library. Both replace the portable reference kernel (still available as fallback for unsupported inputs) with a vectorized implementation that accumulates in fp32, avoiding the fp16 precision issues noted in #19117 for grid_sampler_2d bilinear. Measured end-to-end on a real depth model (Pixel 9, fp16 inputs, shapes representative of the model's hot path): | Op | Portable | This PR | Speedup | | -------------------------------- | -------- | ------- | ------- | | grid_sampler_2d.out | 17.3 ms | 3.4 ms | 5.1x | | sum.IntList_out (5 calls, total) | 3.0 ms | 0.56 ms | 5.4x | ### grid_sampler_2d.out aarch64 NEON, bilinear + zeros padding only. Processes 4 channels per iteration with a vectorized FMA chain. fp16 inputs are promoted to fp32 for weight computation and accumulation, then cast back on store — the portable kernel's fp16 weight subtractions like `(ix_se - ix)` otherwise suffer catastrophic cancellation. Unsupported modes and non-aarch64 targets delegate to the portable kernel. ### sum.IntList_out at::vec::Vectorized-based implementation of the single-dim reduction fast path (both innermost-contiguous and strided cases). Cross-architecture SIMD via PyTorch's existing vector abstraction; accumulates in fp32 regardless of input dtype. Multi-dim reductions, dtype-converting reductions, and complex types delegate to portable. ### Integration - Sources added to OPTIMIZED_KERNELS_SRCS in build_variables.bzl and to OPTIMIZED_ATEN_OPS in op_registration_util.bzl. Single source of truth for both Buck and CMake builds. - optimized.yaml registers the ops with the standard opt_* naming convention used by sibling kernels. - kernels/optimized/CMakeLists.txt scopes the -march=armv8.2-a+fp16 flag to just op_grid_sampler_2d.cpp via set_source_files_properties, so x86_64 builds are unaffected. The kernel has #ifdef __aarch64__ guards and falls through to portable on non-arm64 targets. --- kernels/optimized/CMakeLists.txt | 15 + kernels/optimized/cpu/op_grid_sampler_2d.cpp | 343 ++++++++++++++++++ kernels/optimized/cpu/op_sum.cpp | 204 +++++++++++ kernels/optimized/optimized.yaml | 10 + .../executorch/build/build_variables.bzl | 2 + .../optimized/op_registration_util.bzl | 13 + 6 files changed, 587 insertions(+) create mode 100644 kernels/optimized/cpu/op_grid_sampler_2d.cpp create mode 100644 kernels/optimized/cpu/op_sum.cpp diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 69dae952255..b2f689885a6 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -75,6 +75,21 @@ target_link_libraries( kernels_util_all_deps ) target_compile_options(optimized_kernels PUBLIC ${_common_compile_options}) + +# op_grid_sampler_2d.cpp uses ARMv8.2-a+fp16 NEON intrinsics +# (vcvt_f32_f16 / vld1_f16) when compiled for aarch64. Scope the extra +# `-march` flag to just that source so non-arm64 targets (e.g. x86_64 on +# Android) are unaffected — the kernel itself has `#ifdef __aarch64__` +# guards and falls through to the portable kernel otherwise. 
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64" + OR ANDROID_ABI STREQUAL "arm64-v8a" +) + set_source_files_properties( + ${EXECUTORCH_ROOT}/kernels/optimized/cpu/op_grid_sampler_2d.cpp + PROPERTIES COMPILE_OPTIONS "-march=armv8.2-a+fp16" + ) +endif() + # Build a library for _optimized_kernels_srcs # # optimized_ops_lib: Register optimized ops kernels into Executorch runtime diff --git a/kernels/optimized/cpu/op_grid_sampler_2d.cpp b/kernels/optimized/cpu/op_grid_sampler_2d.cpp new file mode 100644 index 00000000000..e8c69a23bdb --- /dev/null +++ b/kernels/optimized/cpu/op_grid_sampler_2d.cpp @@ -0,0 +1,343 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Optimized grid_sampler_2d.out for CPU. On aarch64 this is a NEON-vectorized +// implementation for the common (bilinear + zeros padding) case, processing +// 4 channels at a time. Other modes — and non-aarch64 targets — fall through +// to the portable kernel. +// +// fp16 inputs: all interior math (interpolation weights and corner +// accumulation) is done in fp32. Loads/stores stay in the tensor's dtype. +// Avoids catastrophic cancellation on `ix_se - ix`-style subtractions that +// would otherwise make fp16 weights meaningless. + +#include + +#ifdef __aarch64__ +#include +#endif + +#include + +namespace torch { +namespace executor { +namespace native { + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; + +// Portable kernel (same-op fallback). Both libs link into the same binary. +Tensor& grid_sampler_2d_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners, + Tensor& out); + +#ifdef __aarch64__ +namespace { + +// One output spatial location, all channels. fp32 path. 
+inline void bilinear_all_channels_f32( + const float* input_n, + float* output_n, + int C, + int H_in, + int W_in, + int H_out, + int W_out, + int h_out, + int w_out, + float gx, + float gy) { + const int x0 = static_cast(std::floor(gx)); + const int y0 = static_cast(std::floor(gy)); + const int x1 = x0 + 1; + const int y1 = y0 + 1; + const float fx = gx - static_cast(x0); + const float fy = gy - static_cast(y0); + + const bool tl_v = static_cast(x0) < static_cast(W_in) && + static_cast(y0) < static_cast(H_in); + const bool tr_v = static_cast(x1) < static_cast(W_in) && + static_cast(y0) < static_cast(H_in); + const bool bl_v = static_cast(x0) < static_cast(W_in) && + static_cast(y1) < static_cast(H_in); + const bool br_v = static_cast(x1) < static_cast(W_in) && + static_cast(y1) < static_cast(H_in); + + const int off_tl = y0 * W_in + x0; + const int off_tr = y0 * W_in + x1; + const int off_bl = y1 * W_in + x0; + const int off_br = y1 * W_in + x1; + const int spatial_in = H_in * W_in; + const int spatial_out = H_out * W_out; + const int out_off = h_out * W_out + w_out; + + const float32x4_t vw_tl = vdupq_n_f32((1.0f - fx) * (1.0f - fy)); + const float32x4_t vw_tr = vdupq_n_f32(fx * (1.0f - fy)); + const float32x4_t vw_bl = vdupq_n_f32((1.0f - fx) * fy); + const float32x4_t vw_br = vdupq_n_f32(fx * fy); + + int c = 0; + for (; c + 3 < C; c += 4) { + const float* p0 = input_n + (c + 0) * spatial_in; + const float* p1 = input_n + (c + 1) * spatial_in; + const float* p2 = input_n + (c + 2) * spatial_in; + const float* p3 = input_n + (c + 3) * spatial_in; + + float tl[4] = {0}, tr[4] = {0}, bl[4] = {0}, br[4] = {0}; + if (tl_v) { + tl[0] = p0[off_tl]; tl[1] = p1[off_tl]; + tl[2] = p2[off_tl]; tl[3] = p3[off_tl]; + } + if (tr_v) { + tr[0] = p0[off_tr]; tr[1] = p1[off_tr]; + tr[2] = p2[off_tr]; tr[3] = p3[off_tr]; + } + if (bl_v) { + bl[0] = p0[off_bl]; bl[1] = p1[off_bl]; + bl[2] = p2[off_bl]; bl[3] = p3[off_bl]; + } + if (br_v) { + br[0] = p0[off_br]; br[1] = p1[off_br]; + br[2] = p2[off_br]; br[3] = p3[off_br]; + } + + float32x4_t result = vmulq_f32(vw_tl, vld1q_f32(tl)); + result = vfmaq_f32(result, vw_tr, vld1q_f32(tr)); + result = vfmaq_f32(result, vw_bl, vld1q_f32(bl)); + result = vfmaq_f32(result, vw_br, vld1q_f32(br)); + + float res[4]; + vst1q_f32(res, result); + output_n[(c + 0) * spatial_out + out_off] = res[0]; + output_n[(c + 1) * spatial_out + out_off] = res[1]; + output_n[(c + 2) * spatial_out + out_off] = res[2]; + output_n[(c + 3) * spatial_out + out_off] = res[3]; + } + + // Scalar tail + const float w_tl = (1.0f - fx) * (1.0f - fy); + const float w_tr = fx * (1.0f - fy); + const float w_bl = (1.0f - fx) * fy; + const float w_br = fx * fy; + for (; c < C; ++c) { + const float* p = input_n + c * spatial_in; + float v = 0.0f; + if (tl_v) v += w_tl * p[off_tl]; + if (tr_v) v += w_tr * p[off_tr]; + if (bl_v) v += w_bl * p[off_bl]; + if (br_v) v += w_br * p[off_br]; + output_n[c * spatial_out + out_off] = v; + } +} + +// fp16 path: loads/stores fp16, math in fp32. 
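+// Illustrative example (not kernel code): why the weights must be computed
+// in fp32 even when the tensors are fp16. Suppose the unnormalized grid
+// coordinate is ix = 200.3 in a 256-wide input. fp16 spacing at that
+// magnitude is 0.125, so ix rounds to 200.25 and the fractional weight
+// fx = ix - floor(ix) comes out as 0.25 instead of 0.3, roughly a 17%
+// relative error on the weight before any sampling happens. Promoted to
+// fp32 first, the same computation keeps the weight error down at the
+// 1e-5 level; only the final store rounds back to fp16.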
+inline void bilinear_all_channels_f16( + const __fp16* input_n, + __fp16* output_n, + int C, + int H_in, + int W_in, + int H_out, + int W_out, + int h_out, + int w_out, + float gx, + float gy) { + const int x0 = static_cast(std::floor(gx)); + const int y0 = static_cast(std::floor(gy)); + const int x1 = x0 + 1; + const int y1 = y0 + 1; + const float fx = gx - static_cast(x0); + const float fy = gy - static_cast(y0); + + const bool tl_v = static_cast(x0) < static_cast(W_in) && + static_cast(y0) < static_cast(H_in); + const bool tr_v = static_cast(x1) < static_cast(W_in) && + static_cast(y0) < static_cast(H_in); + const bool bl_v = static_cast(x0) < static_cast(W_in) && + static_cast(y1) < static_cast(H_in); + const bool br_v = static_cast(x1) < static_cast(W_in) && + static_cast(y1) < static_cast(H_in); + + const int off_tl = y0 * W_in + x0; + const int off_tr = y0 * W_in + x1; + const int off_bl = y1 * W_in + x0; + const int off_br = y1 * W_in + x1; + const int spatial_in = H_in * W_in; + const int spatial_out = H_out * W_out; + const int out_off = h_out * W_out + w_out; + + const float32x4_t vw_tl = vdupq_n_f32((1.0f - fx) * (1.0f - fy)); + const float32x4_t vw_tr = vdupq_n_f32(fx * (1.0f - fy)); + const float32x4_t vw_bl = vdupq_n_f32((1.0f - fx) * fy); + const float32x4_t vw_br = vdupq_n_f32(fx * fy); + + int c = 0; + for (; c + 3 < C; c += 4) { + const __fp16* p0 = input_n + (c + 0) * spatial_in; + const __fp16* p1 = input_n + (c + 1) * spatial_in; + const __fp16* p2 = input_n + (c + 2) * spatial_in; + const __fp16* p3 = input_n + (c + 3) * spatial_in; + + __fp16 tl[4] = {0}, tr[4] = {0}, bl[4] = {0}, br[4] = {0}; + if (tl_v) { + tl[0] = p0[off_tl]; tl[1] = p1[off_tl]; + tl[2] = p2[off_tl]; tl[3] = p3[off_tl]; + } + if (tr_v) { + tr[0] = p0[off_tr]; tr[1] = p1[off_tr]; + tr[2] = p2[off_tr]; tr[3] = p3[off_tr]; + } + if (bl_v) { + bl[0] = p0[off_bl]; bl[1] = p1[off_bl]; + bl[2] = p2[off_bl]; bl[3] = p3[off_bl]; + } + if (br_v) { + br[0] = p0[off_br]; br[1] = p1[off_br]; + br[2] = p2[off_br]; br[3] = p3[off_br]; + } + + const float32x4_t v_tl = vcvt_f32_f16(vld1_f16(tl)); + const float32x4_t v_tr = vcvt_f32_f16(vld1_f16(tr)); + const float32x4_t v_bl = vcvt_f32_f16(vld1_f16(bl)); + const float32x4_t v_br = vcvt_f32_f16(vld1_f16(br)); + + float32x4_t result = vmulq_f32(vw_tl, v_tl); + result = vfmaq_f32(result, vw_tr, v_tr); + result = vfmaq_f32(result, vw_bl, v_bl); + result = vfmaq_f32(result, vw_br, v_br); + + __fp16 res[4]; + vst1_f16(res, vcvt_f16_f32(result)); + output_n[(c + 0) * spatial_out + out_off] = res[0]; + output_n[(c + 1) * spatial_out + out_off] = res[1]; + output_n[(c + 2) * spatial_out + out_off] = res[2]; + output_n[(c + 3) * spatial_out + out_off] = res[3]; + } + + const float w_tl = (1.0f - fx) * (1.0f - fy); + const float w_tr = fx * (1.0f - fy); + const float w_bl = (1.0f - fx) * fy; + const float w_br = fx * fy; + for (; c < C; ++c) { + const __fp16* p = input_n + c * spatial_in; + float v = 0.0f; + if (tl_v) v += w_tl * static_cast(p[off_tl]); + if (tr_v) v += w_tr * static_cast(p[off_tr]); + if (bl_v) v += w_bl * static_cast(p[off_bl]); + if (br_v) v += w_br * static_cast(p[off_br]); + output_n[c * spatial_out + out_off] = static_cast<__fp16>(v); + } +} + +template +void grid_sampler_2d_neon( + const SCALAR* input, + const SCALAR* grid, + SCALAR* output, + int N, + int C, + int H_in, + int W_in, + int H_out, + int W_out, + bool align_corners, + SampleFn sample_fn) { + const int spatial_in = H_in * W_in; + const int spatial_out = H_out * W_out; + + for (int n = 
0; n < N; ++n) { + const SCALAR* input_n = input + n * C * spatial_in; + SCALAR* output_n = output + n * C * spatial_out; + const SCALAR* grid_n = grid + n * H_out * W_out * 2; + + for (int h = 0; h < H_out; ++h) { + if (h + 1 < H_out) { + __builtin_prefetch(grid_n + (h + 1) * W_out * 2, 0, 1); + } + for (int w = 0; w < W_out; ++w) { + const int grid_off = (h * W_out + w) * 2; + float gx = static_cast(grid_n[grid_off]); + float gy = static_cast(grid_n[grid_off + 1]); + if (align_corners) { + gx = (gx + 1.0f) * (W_in - 1) * 0.5f; + gy = (gy + 1.0f) * (H_in - 1) * 0.5f; + } else { + gx = (gx + 1.0f) * W_in * 0.5f - 0.5f; + gy = (gy + 1.0f) * H_in * 0.5f - 0.5f; + } + sample_fn( + input_n, output_n, C, H_in, W_in, H_out, W_out, h, w, gx, gy); + } + } + } +} + +} // namespace +#endif // __aarch64__ + +Tensor& opt_grid_sampler_2d_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners, + Tensor& out) { + // Only the bilinear + zeros-padding combination is accelerated. Everything + // else — and any non-aarch64 target — delegates to the portable kernel. + if (interpolation_mode != 0 || padding_mode != 0) { + return grid_sampler_2d_out( + ctx, input, grid, interpolation_mode, padding_mode, align_corners, out); + } +#ifndef __aarch64__ + return grid_sampler_2d_out( + ctx, input, grid, interpolation_mode, padding_mode, align_corners, out); +#else + const int N = static_cast(input.size(0)); + const int C = static_cast(input.size(1)); + const int H_in = static_cast(input.size(2)); + const int W_in = static_cast(input.size(3)); + const int H_out = static_cast(grid.size(1)); + const int W_out = static_cast(grid.size(2)); + + if (input.scalar_type() == ScalarType::Float) { + grid_sampler_2d_neon( + input.const_data_ptr(), + grid.const_data_ptr(), + out.mutable_data_ptr(), + N, C, H_in, W_in, H_out, W_out, + align_corners, + bilinear_all_channels_f32); + return out; + } + if (input.scalar_type() == ScalarType::Half) { + static_assert(sizeof(__fp16) == 2, "expected __fp16 == 2 bytes"); + grid_sampler_2d_neon<__fp16>( + reinterpret_cast(input.const_data_ptr()), + reinterpret_cast(grid.const_data_ptr()), + reinterpret_cast<__fp16*>(out.mutable_data_ptr()), + N, C, H_in, W_in, H_out, W_out, + align_corners, + bilinear_all_channels_f16); + return out; + } + // Any other dtype (e.g. Double, BFloat16): let portable handle it. + return grid_sampler_2d_out( + ctx, input, grid, interpolation_mode, padding_mode, align_corners, out); +#endif +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/optimized/cpu/op_sum.cpp b/kernels/optimized/cpu/op_sum.cpp new file mode 100644 index 00000000000..826bfb29c98 --- /dev/null +++ b/kernels/optimized/cpu/op_sum.cpp @@ -0,0 +1,204 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using executorch::aten::ArrayRef; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; + +// Forward decl of the portable kernel — used as a fallback for shapes and +// dtype combinations the optimized path doesn't specialize. Both libraries +// live in the same binary, so direct call is fine. 
+Tensor& sum_dim_out( + KernelRuntimeContext& ctx, + const Tensor& in, + std::optional> dim_list, + bool keepdim, + std::optional dtype, + Tensor& out); + +namespace { + +// Contiguous innermost reduction: sum each row of the inner axis into one +// scalar. fp16/bf16 accumulate in fp32 for precision; fp32 accumulates in +// fp32 directly. Uses at::vec::Vectorized for cross-arch SIMD. +template +inline void sum_innermost( + const CTYPE* in, + CTYPE* out, + int64_t outer_size, + int64_t reduce_size) { + using Vec = at::vec::Vectorized; + constexpr int64_t kVecSize = static_cast(Vec::size()); + for (int64_t i = 0; i < outer_size; ++i) { + const CTYPE* row = in + i * reduce_size; + Vec acc(0.0f); + int64_t j = 0; + for (; j + kVecSize - 1 < reduce_size; j += kVecSize) { + if constexpr (std::is_same_v) { + acc = acc + Vec::loadu(row + j); + } else { + // Half / BFloat16: load N elements, convert to float, add. + float tmp[kVecSize]; + for (int64_t k = 0; k < kVecSize; ++k) { + tmp[k] = static_cast(row[j + k]); + } + acc = acc + Vec::loadu(tmp); + } + } + float sum = at::vec::vec_reduce_all( + [](Vec a, Vec b) { return a + b; }, acc); + for (; j < reduce_size; ++j) { + sum += static_cast(row[j]); + } + out[i] = static_cast(sum); + } +} + +// Non-innermost (strided) single-dim reduction. For each (outer, inner) pair, +// sum over reduce_size elements spaced `inner_size` apart. Vectorize across +// the contiguous inner axis (so each add-step processes kVecSize output +// positions at once). +template +inline void sum_strided( + const CTYPE* in, + CTYPE* out, + int64_t outer_size, + int64_t reduce_size, + int64_t inner_size) { + using Vec = at::vec::Vectorized; + constexpr int64_t kVecSize = static_cast(Vec::size()); + const int64_t outer_stride = reduce_size * inner_size; + for (int64_t o = 0; o < outer_size; ++o) { + const CTYPE* in_o = in + o * outer_stride; + CTYPE* out_o = out + o * inner_size; + int64_t j = 0; + for (; j + kVecSize - 1 < inner_size; j += kVecSize) { + Vec acc(0.0f); + for (int64_t k = 0; k < reduce_size; ++k) { + const CTYPE* p = in_o + k * inner_size + j; + if constexpr (std::is_same_v) { + acc = acc + Vec::loadu(p); + } else { + float tmp[kVecSize]; + for (int64_t m = 0; m < kVecSize; ++m) { + tmp[m] = static_cast(p[m]); + } + acc = acc + Vec::loadu(tmp); + } + } + if constexpr (std::is_same_v) { + acc.store(out_o + j); + } else { + float tmp[kVecSize]; + acc.store(tmp); + for (int64_t m = 0; m < kVecSize; ++m) { + out_o[j + m] = static_cast(tmp[m]); + } + } + } + for (; j < inner_size; ++j) { + float sum = 0.0f; + for (int64_t k = 0; k < reduce_size; ++k) { + sum += static_cast(in_o[k * inner_size + j]); + } + out_o[j] = static_cast(sum); + } + } +} + +} // namespace + +Tensor& opt_sum_dim_out( + KernelRuntimeContext& ctx, + const Tensor& in, + std::optional> dim_list, + bool keepdim, + std::optional dtype, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + check_reduction_args(in, dim_list, keepdim, dtype, out), + InvalidArgument, + out); + ET_KERNEL_CHECK( + ctx, + resize_reduction_out(in, dim_list, keepdim, out) == Error::Ok, + InvalidArgument, + out); + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + + if (in.numel() == 0) { + if (out.numel() > 0) { + std::memset(out.mutable_data_ptr(), 0, out.nbytes()); + } + return out; + } + + // Fast path: single reduction dim, matching dtype, non-complex, contiguous. 
+ // Anything else falls through to the portable kernel. + const bool fast_eligible = dim_list.has_value() && + dim_list.value().size() == 1 && + in.scalar_type() == out.scalar_type() && + !executorch::runtime::isComplexType(in.scalar_type()) && + tensor_is_contiguous(in); + + if (fast_eligible) { + const int64_t d = dim_list.value()[0] < 0 ? dim_list.value()[0] + in.dim() + : dim_list.value()[0]; + int64_t outer_size = 1, reduce_size = in.size(d), inner_size = 1; + for (int64_t i = 0; i < d; ++i) { + outer_size *= in.size(i); + } + for (int64_t i = d + 1; i < in.dim(); ++i) { + inner_size *= in.size(i); + } + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "sum.IntList_out"; + bool handled = false; + ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] { + const CTYPE* ip = in.const_data_ptr(); + CTYPE* op = out.mutable_data_ptr(); + if (inner_size == 1) { + sum_innermost(ip, op, outer_size, reduce_size); + handled = true; + } else { + sum_strided(ip, op, outer_size, reduce_size, inner_size); + handled = true; + } + }); + if (handled) { + return out; + } + } + + // Fallback. + return sum_dim_out(ctx, in, dim_list, keepdim, dtype, out); +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/optimized/optimized.yaml b/kernels/optimized/optimized.yaml index 58121549ea5..5a001afc7a0 100644 --- a/kernels/optimized/optimized.yaml +++ b/kernels/optimized/optimized.yaml @@ -57,6 +57,11 @@ - arg_meta: null kernel_name: torch::executor::opt_gelu_out +- op: grid_sampler_2d.out + kernels: + - arg_meta: null + kernel_name: torch::executor::opt_grid_sampler_2d_out + - op: le.Scalar_out kernels: - arg_meta: null @@ -97,6 +102,11 @@ - arg_meta: null kernel_name: torch::executor::opt_sub_out +- op: sum.IntList_out + kernels: + - arg_meta: null + kernel_name: torch::executor::opt_sum_dim_out + - op: sub.Scalar_out kernels: - arg_meta: null diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl index edddc1da916..b0545b8ce18 100644 --- a/shim_et/xplat/executorch/build/build_variables.bzl +++ b/shim_et/xplat/executorch/build/build_variables.bzl @@ -267,6 +267,7 @@ OPTIMIZED_KERNELS_SRCS = [ "kernels/optimized/cpu/op_fft_c2r.cpp", "kernels/optimized/cpu/op_fft_r2c.cpp", "kernels/optimized/cpu/op_gelu.cpp", + "kernels/optimized/cpu/op_grid_sampler_2d.cpp", "kernels/optimized/cpu/op_le.cpp", "kernels/optimized/cpu/op_linear.cpp", "kernels/optimized/cpu/op_log_softmax.cpp", @@ -274,6 +275,7 @@ OPTIMIZED_KERNELS_SRCS = [ "kernels/optimized/cpu/op_mul.cpp", "kernels/optimized/cpu/op_native_layer_norm.cpp", "kernels/optimized/cpu/op_sub.cpp", + "kernels/optimized/cpu/op_sum.cpp", "kernels/optimized/cpu/op_where.cpp", ] diff --git a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl index bc43688b04e..65683625d5b 100644 --- a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl @@ -217,6 +217,12 @@ OPTIMIZED_ATEN_OPS = ( "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch", ], ), + op_target( + name = "op_grid_sampler_2d", + deps = [ + "//executorch/kernels/portable/cpu:op_grid_sampler_2d", + ], + ), op_target( name = "op_le", deps = [ @@ -282,6 +288,13 @@ OPTIMIZED_ATEN_OPS = ( "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch", ], 
), + op_target( + name = "op_sum", + deps = [ + "//executorch/kernels/portable/cpu:op_sum", + "//executorch/kernels/portable/cpu/util:reduce_util", + ], + ), op_target( name = "op_where", deps = [ From f4086e339fcf0993040bc41434ea8ae2c2ee1b38 Mon Sep 17 00:00:00 2001 From: John Gibson Date: Fri, 24 Apr 2026 14:40:14 -0400 Subject: [PATCH 2/4] tools/cmake: carry preset.cmake DESCRIPTION quoting fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same one-char fix as pytorch/executorch#19117 (and our PR #2): the DESCRIPTION argument to `set(...CACHE TYPE DOCSTRING)` was expanded unquoted, so multi-word descriptions on STRING options passed via `-D` spilled their trailing words into subsequent set() args. This was latent until PR #3 introduced EXECUTORCH_VULKAN_FP16_PRECISION with a multi-word help string — builds that set it (e.g. via scripts/build_android_library.sh forwarding the env var) then fail. Carried here so this branch remains self-contained and buildable independent of the merge order of PR #2. Drops cleanly after PR #2 lands; git will treat the duplicate line as a no-op. --- tools/cmake/common/preset.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cmake/common/preset.cmake b/tools/cmake/common/preset.cmake index 4ac45e28562..7d016db2347 100644 --- a/tools/cmake/common/preset.cmake +++ b/tools/cmake/common/preset.cmake @@ -82,12 +82,12 @@ macro(define_overridable_option NAME DESCRIPTION VALUE_TYPE DEFAULT_VALUE) if(DEFINED ${NAME} AND NOT DEFINED CACHE{${NAME}}) set(${NAME} ${${NAME}} - CACHE ${VALUE_TYPE} ${DESCRIPTION} FORCE + CACHE ${VALUE_TYPE} "${DESCRIPTION}" FORCE ) else() set(${NAME} ${DEFAULT_VALUE} - CACHE ${VALUE_TYPE} ${DESCRIPTION} + CACHE ${VALUE_TYPE} "${DESCRIPTION}" ) endif() From 8721bfa57541edd05f5ba63548d0fdb2a53e5d56 Mon Sep 17 00:00:00 2001 From: John Gibson Date: Fri, 24 Apr 2026 14:47:00 -0400 Subject: [PATCH 3/4] kernels/optimized: add on-device verify_optimized_kernels binary Standalone aarch64 binary that cross-checks opt_grid_sampler_2d_out and opt_sum_dim_out against an fp32 reference derived from the portable kernel (portable run on up-cast fp32 inputs, then down-cast to fp16). Reference is independent of portable's own fp16 path, so the test stays meaningful regardless of #19117's merge state. Pass/fail uses numpy.testing.assert_allclose semantics: |a - b| <= abs_tol + rel_tol * |b| Avoids the "relative error explodes at zero crossings" trap for mean-zero reductions and bilinear samples near cancellation points. Opt-in via -DEXECUTORCH_BUILD_OPTIMIZED_VERIFY=ON so default builds are unaffected. Build + run: cmake -DEXECUTORCH_BUILD_OPTIMIZED_VERIFY=ON ... cmake --build --target verify_optimized_kernels adb push /kernels/optimized/verify_optimized_kernels /data/local/tmp/ adb shell /data/local/tmp/verify_optimized_kernels Exits 0 on all-pass; reports max_abs / max_rel(far) / near_zero / viol per test case. 12 test cases across grid_sampler and sum, covering the shapes the polycam depth model uses plus a few edge cases (odd channel count, align_corners=1, multi-batch). 
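To restate the fp16 reference construction schematically (names are
illustrative, not the actual helpers in verify.cpp):

    in_f32[i]  = float(in_f16[i])            // up-cast inputs once
    ref_f32    = portable_kernel(in_f32)     // all math in fp32
    ref_f16[i] = __fp16(ref_f32[i])          // single rounding at the end

so the optimized fp16 output is compared against the best result fp16 can
represent, not against portable's own fp16 arithmetic.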
--- kernels/optimized/CMakeLists.txt | 24 ++ kernels/optimized/verify.cpp | 533 +++++++++++++++++++++++++++++++ 2 files changed, 557 insertions(+) create mode 100644 kernels/optimized/verify.cpp diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index b2f689885a6..e47c2293f82 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -98,6 +98,30 @@ gen_operators_lib( executorch_core ) +# On-device verifier for optimized grid_sampler_2d / sum.IntList_out. +# Opt-in via -DEXECUTORCH_BUILD_OPTIMIZED_VERIFY=ON so it doesn't affect +# default AAR / library builds. Cross-checks both ops against an fp32 +# reference derived from the portable kernel; non-zero exit on divergence. +if(EXECUTORCH_BUILD_OPTIMIZED_VERIFY) + add_executable( + verify_optimized_kernels ${EXECUTORCH_ROOT}/kernels/optimized/verify.cpp + ) + target_link_libraries( + verify_optimized_kernels + PRIVATE optimized_kernels portable_kernels executorch_core + ) + target_compile_options( + verify_optimized_kernels PRIVATE ${_common_compile_options} + ) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64" + OR ANDROID_ABI STREQUAL "arm64-v8a" + ) + target_compile_options( + verify_optimized_kernels PRIVATE -march=armv8.2-a+fp16 + ) + endif() +endif() + install( # eigen_blas doesn't export itself, so we have to do our own install to export # it. diff --git a/kernels/optimized/verify.cpp b/kernels/optimized/verify.cpp new file mode 100644 index 00000000000..b9c471c0d8f --- /dev/null +++ b/kernels/optimized/verify.cpp @@ -0,0 +1,533 @@ +// Standalone on-device verifier for the optimized_kernels implementations of +// grid_sampler_2d.out and sum.IntList_out, cross-checked against an fp32 +// reference derived from the portable kernel. +// +// Build target: `verify_optimized_kernels` (opt-in via +// -DEXECUTORCH_BUILD_OPTIMIZED_VERIFY=ON). +// +// Usage: +// adb push /path/to/verify_optimized_kernels /data/local/tmp/ +// adb shell /data/local/tmp/verify_optimized_kernels +// +// Reports max abs / max rel diff per test case. Non-zero exit on divergence +// beyond tolerance. For fp16 tests, reference is portable run on up-cast +// fp32 inputs, then down-cast — independent of portable's own fp16 path. + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using executorch::aten::ArrayRef; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::etensor::TensorImpl; +using executorch::aten::DimOrderType; +using executorch::aten::SizesType; +using executorch::aten::StridesType; +using torch::executor::KernelRuntimeContext; + +// ====================================================================== +// Forward decls for the two implementations we want to compare. +// ====================================================================== + +namespace torch { +namespace executor { +namespace native { +// Portable reference implementations. +Tensor& sum_dim_out( + KernelRuntimeContext&, + const Tensor&, + std::optional>, + bool, + std::optional, + Tensor&); +Tensor& grid_sampler_2d_out( + KernelRuntimeContext&, + const Tensor&, + const Tensor&, + int64_t, + int64_t, + bool, + Tensor&); +// Optimized implementations (this directory). 
+Tensor& opt_sum_dim_out( + KernelRuntimeContext&, + const Tensor&, + std::optional>, + bool, + std::optional, + Tensor&); +Tensor& opt_grid_sampler_2d_out( + KernelRuntimeContext&, + const Tensor&, + const Tensor&, + int64_t, + int64_t, + bool, + Tensor&); +} // namespace native +} // namespace executor +} // namespace torch + +// ====================================================================== +// Tiny tensor builder: owns storage + metadata, hands out a Tensor view. +// ====================================================================== + +struct OwnedTensor { + std::vector sizes; + std::vector dim_order; + std::vector strides; + std::vector storage; + std::unique_ptr impl; + + OwnedTensor() = default; + OwnedTensor(const OwnedTensor&) = delete; + OwnedTensor& operator=(const OwnedTensor&) = delete; + OwnedTensor(OwnedTensor&&) = default; + OwnedTensor& operator=(OwnedTensor&&) = default; + + static OwnedTensor make(ScalarType dtype, std::vector shape) { + OwnedTensor t; + t.sizes.assign(shape.begin(), shape.end()); + t.dim_order.resize(shape.size()); + for (size_t i = 0; i < shape.size(); ++i) { + t.dim_order[i] = static_cast(i); + } + t.strides.resize(shape.size()); + int32_t running = 1; + for (int i = static_cast(shape.size()) - 1; i >= 0; --i) { + t.strides[i] = running; + running *= shape[i]; + } + size_t numel = 1; + for (auto s : shape) { + numel *= static_cast(s); + } + size_t elem_size = (dtype == ScalarType::Float) ? 4 + : (dtype == ScalarType::Half) ? 2 + : 4; + t.storage.assign(numel * elem_size, 0); + t.impl = std::make_unique( + dtype, + static_cast(shape.size()), + t.sizes.data(), + t.storage.data(), + t.dim_order.data(), + t.strides.data()); + return t; + } + + Tensor view() { + // Hold the Tensor as a member so callers can bind it to Tensor& + // parameters (the kernel signatures take non-const Tensor& for the out + // tensor). Refreshes each call because TensorImpl pointer may shift + // if the OwnedTensor is moved (though we disallow move in practice). + tensor_view_ = Tensor(impl.get()); + return tensor_view_; + } + Tensor& view_ref() { + tensor_view_ = Tensor(impl.get()); + return tensor_view_; + } + + template + T* data() { + return reinterpret_cast(storage.data()); + } + + size_t numel() const { + size_t n = 1; + for (auto s : sizes) { + n *= static_cast(s); + } + return n; + } + + private: + Tensor tensor_view_{nullptr}; +}; + +// ====================================================================== +// Compare helpers. +// ====================================================================== + +template +struct DiffStats { + double max_abs = 0; + double max_rel_nonzero = 0; // rel diff ignoring near-zero cells + size_t violations = 0; // elements failing combined tol check + size_t count = 0; + // How many elements were near-zero enough that rel is meaningless. + size_t near_zero = 0; + + bool passes() const { + return violations == 0; + } +}; + +// numpy.testing.assert_allclose semantics: +// |a - b| <= abs_tol + rel_tol * |b| +// Near-zero cells are bounded by abs_tol alone; away from zero, rel_tol +// dominates. Avoids the "relative error explodes at zero crossings" trap. 
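+// Worked example with the fp16 sum tolerances used below (abs_tol=1e-2,
+// rel_tol=2e-3; the values of a and b are illustrative):
+//   near zero:      b=0.003, a=0.007 -> |a-b|=0.004 <= 1e-2 + 2e-3*0.003,
+//                   so it passes even though the relative error is ~130%.
+//   away from zero: b=50.0,  a=50.2  -> |a-b|=0.2 > 1e-2 + 2e-3*50 = 0.11,
+//                   so it fails; rel_tol dominates at that magnitude.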
+template +DiffStats diff( + const T* a, + const T* b, + size_t n, + double abs_tol, + double rel_tol) { + DiffStats s; + s.count = n; + for (size_t i = 0; i < n; ++i) { + double va = static_cast(a[i]); + double vb = static_cast(b[i]); + double abs_d = std::fabs(va - vb); + double bound = abs_tol + rel_tol * std::fabs(vb); + if (abs_d > bound) { + ++s.violations; + } + s.max_abs = std::max(s.max_abs, abs_d); + double mag = std::max(std::fabs(va), std::fabs(vb)); + if (mag < 10 * abs_tol) { + ++s.near_zero; + } else { + s.max_rel_nonzero = std::max(s.max_rel_nonzero, abs_d / mag); + } + } + return s; +} + +// Half <-> float conversion via ARM fp16 type (aarch64-only). We already build +// with -march=armv8.2-a+fp16, so these are cheap. +#ifdef __aarch64__ +#include +static inline float half_to_float(uint16_t h) { + __fp16 f; + std::memcpy(&f, &h, sizeof(f)); + return static_cast(f); +} +static inline uint16_t float_to_half(float f) { + __fp16 h = static_cast<__fp16>(f); + uint16_t u; + std::memcpy(&u, &h, sizeof(u)); + return u; +} +#endif + +// ====================================================================== +// Test cases. +// ====================================================================== + +struct TestResult { + const char* name; + size_t n; + double max_abs; + double max_rel; + bool passed; +}; + +static std::vector results; + +template +void report( + const char* name, + const DiffStats& s, + double abs_tol, + double rel_tol) { + bool ok = s.passes(); + results.push_back({name, s.count, s.max_abs, s.max_rel_nonzero, ok}); + std::printf( + " %-58s n=%-7zu max_abs=%-10.3g max_rel(far)=%-10.3g near_zero=%-5zu viol=%-4zu [%s]\n", + name, + s.count, + s.max_abs, + s.max_rel_nonzero, + s.near_zero, + s.violations, + ok ? "PASS" : "FAIL"); +} + +// ---------- grid_sampler_2d tests ---------- + +template +static void test_grid_sampler( + const char* label, + int N, + int C, + int H_in, + int W_in, + int H_out, + int W_out, + bool align_corners) { + auto input = OwnedTensor::make(DTYPE, {N, C, H_in, W_in}); + auto grid = OwnedTensor::make(DTYPE, {N, H_out, W_out, 2}); + auto out_neon = OwnedTensor::make(DTYPE, {N, C, H_out, W_out}); + + // For fp16 / bf16 the portable kernel's own fp16 path is itself imprecise + // (catastrophic cancellation on weight computation). We compute the + // reference by up-casting inputs to fp32, running portable in fp32, and + // down-casting the output — that's the "best achievable" fp16 output. + // For fp32 this upcast is a no-op and we just run portable directly. + auto input_f = OwnedTensor::make(ScalarType::Float, {N, C, H_in, W_in}); + auto grid_f = OwnedTensor::make(ScalarType::Float, {N, H_out, W_out, 2}); + auto out_ref_f = + OwnedTensor::make(ScalarType::Float, {N, C, H_out, W_out}); + + std::mt19937 rng(12345); + std::uniform_real_distribution ud(-1.0f, 1.0f); + + auto* in_d = input.data(); + auto* in_fd = input_f.data(); + for (size_t i = 0; i < input.numel(); ++i) { + float v = ud(rng); +#ifdef __aarch64__ + if constexpr (std::is_same_v) { + in_d[i] = float_to_half(v); + in_fd[i] = half_to_float(in_d[i]); // round-trip to match fp16 input + } else { + in_d[i] = static_cast(v); + in_fd[i] = static_cast(in_d[i]); + } +#else + in_d[i] = static_cast(v); + in_fd[i] = static_cast(in_d[i]); +#endif + } + + // Grid has spread of coordinates — mostly in [-1, 1] with some edges. 
+ auto* g_d = grid.data(); + auto* g_fd = grid_f.data(); + std::uniform_real_distribution gd(-1.1f, 1.1f); + for (size_t i = 0; i < grid.numel(); ++i) { + float v = gd(rng); +#ifdef __aarch64__ + if constexpr (std::is_same_v) { + g_d[i] = float_to_half(v); + g_fd[i] = half_to_float(g_d[i]); + } else { + g_d[i] = static_cast(v); + g_fd[i] = static_cast(g_d[i]); + } +#else + g_d[i] = static_cast(v); + g_fd[i] = static_cast(g_d[i]); +#endif + } + + // Reference: portable kernel run on fp32 inputs. + KernelRuntimeContext ctx_ref, ctx_neon; + torch::executor::native::grid_sampler_2d_out( + ctx_ref, + input_f.view(), + grid_f.view(), + /*interpolation_mode=*/0, + /*padding_mode=*/0, + align_corners, + out_ref_f.view_ref()); + + // Optimized kernel on the native-dtype inputs. + torch::executor::native::opt_grid_sampler_2d_out( + ctx_neon, + input.view(), + grid.view(), + /*interpolation_mode=*/0, + /*padding_mode=*/0, + align_corners, + out_neon.view_ref()); + + // Compare both to the fp32 reference. For fp16/bf16, down-cast the + // reference to the optimized output's dtype before comparing — optimized can't + // represent more precision than its output dtype allows. + std::vector ref_f(out_ref_f.numel()); + std::vector neon_f(out_neon.numel()); + auto* ref_fd = out_ref_f.data(); + auto* neon_d = out_neon.data(); + for (size_t i = 0; i < ref_f.size(); ++i) { +#ifdef __aarch64__ + if constexpr (std::is_same_v) { + // Round reference through fp16 to match optimized output precision. + uint16_t ref_h = float_to_half(ref_fd[i]); + ref_f[i] = half_to_float(ref_h); + neon_f[i] = half_to_float(neon_d[i]); + } else { + ref_f[i] = ref_fd[i]; + neon_f[i] = static_cast(neon_d[i]); + } +#else + ref_f[i] = ref_fd[i]; + neon_f[i] = static_cast(neon_d[i]); +#endif + } + // Portable and optimized both accumulate in fp32. For fp16 inputs the only + // remaining difference is the final fp16 round-trip on store (half a ULP) + // plus tiny FMA ordering noise. + double abs_tol = (DTYPE == ScalarType::Float) ? 1e-5 : 1e-3; + double rel_tol = (DTYPE == ScalarType::Float) ? 1e-4 : 2e-3; + auto s = diff(ref_f.data(), neon_f.data(), ref_f.size(), abs_tol, rel_tol); + report(label, s, abs_tol, rel_tol); +} + +// ---------- sum.IntList_out tests ---------- + +template +static void test_sum( + const char* label, + std::vector input_shape, + int64_t reduce_dim, + bool keepdim) { + auto input = OwnedTensor::make(DTYPE, input_shape); + // Compute output shape. + std::vector out_shape = input_shape; + if (keepdim) { + out_shape[reduce_dim] = 1; + } else { + out_shape.erase(out_shape.begin() + reduce_dim); + } + auto out_neon = OwnedTensor::make(DTYPE, out_shape); + + // Same strategy as the grid_sampler test: fp32 reference run on up-cast + // inputs, then down-cast the output for comparison. Avoids depending on + // portable's fp16 accumulator precision. 
+ auto input_f = OwnedTensor::make(ScalarType::Float, input_shape); + auto out_ref_f = OwnedTensor::make(ScalarType::Float, out_shape); + + std::mt19937 rng(9999); + std::uniform_real_distribution ud(-1.0f, 1.0f); + + auto* in_d = input.data(); + auto* in_fd = input_f.data(); + for (size_t i = 0; i < input.numel(); ++i) { + float v = ud(rng); +#ifdef __aarch64__ + if constexpr (std::is_same_v) { + in_d[i] = float_to_half(v); + in_fd[i] = half_to_float(in_d[i]); + } else { + in_d[i] = static_cast(v); + in_fd[i] = static_cast(in_d[i]); + } +#else + in_d[i] = static_cast(v); + in_fd[i] = static_cast(in_d[i]); +#endif + } + + std::array dims = {reduce_dim}; + std::optional> dim_list{ArrayRef(dims.data(), 1)}; + std::optional dtype_opt = std::nullopt; + + KernelRuntimeContext ctx_ref, ctx_neon; + torch::executor::native::sum_dim_out( + ctx_ref, + input_f.view(), + dim_list, + keepdim, + dtype_opt, + out_ref_f.view_ref()); + torch::executor::native::opt_sum_dim_out( + ctx_neon, input.view(), dim_list, keepdim, dtype_opt, out_neon.view_ref()); + + std::vector ref_f(out_ref_f.numel()); + std::vector neon_f(out_neon.numel()); + auto* ref_fd = out_ref_f.data(); + auto* neon_d = out_neon.data(); + for (size_t i = 0; i < ref_f.size(); ++i) { +#ifdef __aarch64__ + if constexpr (std::is_same_v) { + uint16_t ref_h = float_to_half(ref_fd[i]); + ref_f[i] = half_to_float(ref_h); + neon_f[i] = half_to_float(neon_d[i]); + } else { + ref_f[i] = ref_fd[i]; + neon_f[i] = static_cast(neon_d[i]); + } +#else + ref_f[i] = ref_fd[i]; + neon_f[i] = static_cast(neon_d[i]); +#endif + } + // Portable and optimized both accumulate in fp32. For fp16 inputs the only + // remaining delta is the final fp16-cast on store and any FMA reordering. + double abs_tol = (DTYPE == ScalarType::Float) ? 1e-4 : 1e-2; + double rel_tol = (DTYPE == ScalarType::Float) ? 1e-4 : 2e-3; + auto s = diff(ref_f.data(), neon_f.data(), ref_f.size(), abs_tol, rel_tol); + report(label, s, abs_tol, rel_tol); +} + +// ====================================================================== +// Entry point. +// ====================================================================== + +int main() { + std::printf( + "=== grid_sampler_2d.out: optimized vs portable-fp32 reference ===\n" + "(for fp16 tests, reference is portable run on up-cast fp32 inputs,\n" + " then down-cast to fp16 — independent of portable's fp16 path)\n"); + test_grid_sampler( + "grid_sampler fp32 N=1 C=16 in=32x48 out=24x32 align=0", 1, 16, 32, 48, 24, 32, false); + test_grid_sampler( + "grid_sampler fp32 N=1 C=32 in=72x96 out=72x96 align=1", 1, 32, 72, 96, 72, 96, true); + test_grid_sampler( + "grid_sampler fp32 N=1 C=7 (odd) in=16x24 out=16x24 align=0", 1, 7, 16, 24, 16, 24, false); + test_grid_sampler( + "grid_sampler fp32 N=2 C=64 in=48x64 out=48x64 align=0", 2, 64, 48, 64, 48, 64, false); +#ifdef __aarch64__ + test_grid_sampler( + "grid_sampler fp16 N=1 C=16 in=32x48 out=24x32 align=0", 1, 16, 32, 48, 24, 32, false); + test_grid_sampler( + "grid_sampler fp16 N=1 C=32 in=72x96 out=72x96 align=1", 1, 32, 72, 96, 72, 96, true); +#endif + + std::printf("\n=== sum.IntList_out: optimized vs portable-fp32 reference ===\n"); + // Innermost reduction. + test_sum( + "sum fp32 [1, 32, 192, 256] reduce=-1 keepdim=0", + {1, 32, 192, 256}, + 3, + false); + test_sum( + "sum fp32 [2, 64, 128] reduce=2 keepdim=1", + {2, 64, 128}, + 2, + true); + // Middle (strided) reduction. 
+ test_sum( + "sum fp32 [1, 32, 192, 256] reduce=1 keepdim=0", + {1, 32, 192, 256}, + 1, + false); + test_sum( + "sum fp32 [4, 16, 64, 64] reduce=2 keepdim=0", + {4, 16, 64, 64}, + 2, + false); +#ifdef __aarch64__ + test_sum( + "sum fp16 [1, 32, 192, 256] reduce=-1 keepdim=0", + {1, 32, 192, 256}, + 3, + false); + test_sum( + "sum fp16 [1, 32, 192, 256] reduce=1 keepdim=0", + {1, 32, 192, 256}, + 1, + false); +#endif + + int failed = 0; + for (auto& r : results) { + if (!r.passed) ++failed; + } + std::printf( + "\n=== %zu tests total, %d failed ===\n", results.size(), failed); + return failed == 0 ? 0 : 1; +} From b2bdc94c4d2c111e9db800cc65c0830e63bf9a91 Mon Sep 17 00:00:00 2001 From: John Gibson Date: Fri, 24 Apr 2026 15:12:24 -0400 Subject: [PATCH 4/4] optimized: fallback to portable grid_sampler_2d for non-default layouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NEON fast path indexes input/grid/out directly assuming contiguous NCHW default-dim-order layout — no use of .strides() or .dim_order(). If the caller passes anything else (NHWC, transposed, strided, channels- last), we'd read wrong memory and silently produce garbage output. Add the same check pattern op_sum.cpp already uses at L150-151: tensor_is_default_dim_order + tensor_is_contiguous on input, grid, and out. If any fails, delegate to the portable kernel (which handles arbitrary strides / dim orders correctly via .strides()). No perf impact on the hot path — the checks are a handful of scalar comparisons run once per call, and the common polycam depth model case is already default-contiguous so the fast path is still taken. --- kernels/optimized/cpu/op_grid_sampler_2d.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kernels/optimized/cpu/op_grid_sampler_2d.cpp b/kernels/optimized/cpu/op_grid_sampler_2d.cpp index e8c69a23bdb..e3fe8c49779 100644 --- a/kernels/optimized/cpu/op_grid_sampler_2d.cpp +++ b/kernels/optimized/cpu/op_grid_sampler_2d.cpp @@ -294,9 +294,20 @@ Tensor& opt_grid_sampler_2d_out( int64_t padding_mode, bool align_corners, Tensor& out) { + // The NEON path indexes input/grid/out directly assuming a contiguous NCHW + // default-dim-order layout — no use of .strides() or .dim_order(). If the + // caller passes anything else, fall back to portable (which does handle + // arbitrary strides and dim orders correctly). These are cheap checks. + const bool fast_eligible = tensor_is_default_dim_order(input) && + tensor_is_default_dim_order(grid) && + tensor_is_default_dim_order(out) && + tensor_is_contiguous(input) && + tensor_is_contiguous(grid) && + tensor_is_contiguous(out); + // Only the bilinear + zeros-padding combination is accelerated. Everything - // else — and any non-aarch64 target — delegates to the portable kernel. - if (interpolation_mode != 0 || padding_mode != 0) { + // else — non-default layout, any non-aarch64 target — delegates to portable. + if (interpolation_mode != 0 || padding_mode != 0 || !fast_eligible) { return grid_sampler_2d_out( ctx, input, grid, interpolation_mode, padding_mode, align_corners, out); }
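As a concrete illustration of the layout assumption this patch guards against
(a minimal sketch with assumed toy shapes, not code from the kernel): the same
logical element maps to different flat offsets under contiguous NCHW and
channels-last storage, so indexing a channels-last buffer with the NCHW
formula silently reads a different element instead of failing.

    // Assumed shape: N=1, C=8, H=4, W=4; element (n=0, c=2, h=1, w=3).
    constexpr int C = 8, H = 4, W = 4;
    constexpr int c = 2, h = 1, w = 3;
    constexpr int off_nchw = c * H * W + h * W + w;  // 39: what the NEON path computes
    constexpr int off_nhwc = (h * W + w) * C + c;    // 58: where channels-last stores it
    static_assert(off_nchw != off_nhwc, "NCHW indexing misreads a channels-last buffer");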