diff --git a/extension/llm/custom_ops/TARGETS b/extension/llm/custom_ops/TARGETS
index 9a437e7dad5..5dda2318f3f 100644
--- a/extension/llm/custom_ops/TARGETS
+++ b/extension/llm/custom_ops/TARGETS
@@ -60,5 +60,6 @@ runtime.python_test(
     ],
     deps = [
         "//caffe2:torch",
+        "//executorch/extension/pybindings:portable_lib",
     ],
 )
diff --git a/extension/llm/custom_ops/test_quantized_sdpa.py b/extension/llm/custom_ops/test_quantized_sdpa.py
index 87026d5c251..e6edf6ffbb1 100644
--- a/extension/llm/custom_ops/test_quantized_sdpa.py
+++ b/extension/llm/custom_ops/test_quantized_sdpa.py
@@ -12,6 +12,7 @@
 import torch.nn.functional as F
 
 from executorch.extension.llm.custom_ops import custom_ops  # noqa
+from executorch.extension.pybindings.portable_lib import _unsafe_reset_threadpool
 
 
 def is_fbcode():
@@ -40,6 +41,11 @@ def setUp(self):
         self.q_shape = None
         self.kv_shape = None
         self.is_seq_at_dim_2 = True
+        # With 4 threads this test is flaky: it intermittently fails with
+        # "OMP: Error #131: Thread identifier invalid". It is not clear why
+        # that happens, but a smaller threadpool avoids the error, so cap the
+        # pool at 3 threads here.
+        _unsafe_reset_threadpool(3)
 
     def _scale_tensor(self, tensor, min_value, max_value, scale=True):
         normalized_tensor = (tensor - tensor.min()) / (tensor.max() - tensor.min())
diff --git a/kernels/quantized/cpu/op_choose_qparams.cpp b/kernels/quantized/cpu/op_choose_qparams.cpp
index 5335f4bfbd2..acb8e100af6 100644
--- a/kernels/quantized/cpu/op_choose_qparams.cpp
+++ b/kernels/quantized/cpu/op_choose_qparams.cpp
@@ -8,6 +8,7 @@
 
 #include
 #include
+#include
 #include
 #include
 #include
@@ -202,17 +203,42 @@ void choose_qparams_per_token(
     num_tokens *= input.size(i);
   }
   auto token_dim_size = input.size(input.dim() - 1);
-  for (auto i = 0; i < num_tokens; i++) {
-    // vec_minf uses std::min_element. Check if it actually
-    // gets vectorized.
-    float min = torch::executor::vec_minf(x_fp32, token_dim_size);
-    float max = torch::executor::vec_maxf(x_fp32, token_dim_size);
-    double scale;
-    int32_t zero_point;
-    calculate_scale_and_zero_point(min, max, qmin, qmax, scale, zero_point);
-    scale_out.mutable_data_ptr()[i] = scale;
-    zero_point_out.mutable_data_ptr()[i] = zero_point;
-    x_fp32 += token_dim_size;
+
+  const int64_t total_elements = num_tokens * token_dim_size;
+  constexpr int64_t MIN_ELEMENTS_FOR_PARALLEL = 512;
+  const bool use_parallel = total_elements >= MIN_ELEMENTS_FOR_PARALLEL;
+
+  if (use_parallel) {
+    auto* scale_data = scale_out.mutable_data_ptr();
+    auto* zero_point_data = zero_point_out.mutable_data_ptr();
+
+    ::executorch::extension::parallel_for(
+        0, num_tokens, 1, [&](const int64_t begin, const int64_t end) {
+          for (int64_t i = begin; i < end; i++) {
+            const float* token_data = x_fp32 + i * token_dim_size;
+            float min = torch::executor::vec_minf(token_data, token_dim_size);
+            float max = torch::executor::vec_maxf(token_data, token_dim_size);
+            double scale;
+            int32_t zero_point;
+            calculate_scale_and_zero_point(
+                min, max, qmin, qmax, scale, zero_point);
+            scale_data[i] = scale;
+            zero_point_data[i] = zero_point;
+          }
+        });
+  } else {
+    for (auto i = 0; i < num_tokens; i++) {
+      // vec_minf uses std::min_element. Check if it actually
+      // gets vectorized.
+ float min = torch::executor::vec_minf(x_fp32, token_dim_size); + float max = torch::executor::vec_maxf(x_fp32, token_dim_size); + double scale; + int32_t zero_point; + calculate_scale_and_zero_point(min, max, qmin, qmax, scale, zero_point); + scale_out.mutable_data_ptr()[i] = scale; + zero_point_out.mutable_data_ptr()[i] = zero_point; + x_fp32 += token_dim_size; + } } } } // namespace diff --git a/kernels/quantized/cpu/op_quantize.cpp b/kernels/quantized/cpu/op_quantize.cpp index 5586f8a77eb..fefb07b1e59 100644 --- a/kernels/quantized/cpu/op_quantize.cpp +++ b/kernels/quantized/cpu/op_quantize.cpp @@ -11,6 +11,10 @@ #include #include +#if defined(__aarch64__) || defined(__ARM_NEON__) +#include +#endif + /** * For an input tensor, use the scale and zero_point arguments to quantize it. */ @@ -105,6 +109,143 @@ T quantize_val( return static_cast(qvalue); } +#if defined(__aarch64__) || defined(__ARM_NEON__) + +// Traits for type-specific NEON operations +template +struct NeonQuantizeTraits; + +template <> +struct NeonQuantizeTraits { + // Narrow int16x8 to uint8x8 with saturation (unsigned) + static inline uint8x8_t narrow_and_saturate(int16x8_t v) { + return vqmovun_s16(v); + } + + // Store uint8x8 to memory + static inline void store(uint8_t* ptr, uint8x8_t v) { + vst1_u8(ptr, v); + } + + // Scalar clamping for uint8 + static inline uint8_t clamp_scalar(int32_t val) { + return static_cast(std::min(255, std::max(0, val))); + } +}; + +template <> +struct NeonQuantizeTraits { + // Narrow int16x8 to int8x8 with saturation (signed) + static inline int8x8_t narrow_and_saturate(int16x8_t v) { + return vqmovn_s16(v); + } + + // Store int8x8 to memory + static inline void store(int8_t* ptr, int8x8_t v) { + vst1_s8(ptr, v); + } + + // Scalar clamping for int8 + static inline int8_t clamp_scalar(int32_t val) { + return static_cast(std::min(127, std::max(-128, val))); + } +}; + +// Unified ARM NEON optimized quantization for contiguous blocks +// Processes N elements with a single scale/zero_point pair +// Used for both per-tensor (entire tensor) and per-channel (one block per +// channel) +template +void quantize_arm( + const float* __restrict__ in, + T* __restrict__ out, + const int64_t N, + const float inv_scale, + const int32_t zero_point, + const int32_t quant_min, + const int32_t quant_max) { + using Traits = NeonQuantizeTraits; + const float32x4_t vinv_scale = vdupq_n_f32(inv_scale); + +#if defined(__aarch64__) + // ARMv8: Use vcvtnq_s32_f32 for rounding + const int16x8_t vzero_point = vdupq_n_s16(static_cast(zero_point)); + const int16x8_t vquant_min = vdupq_n_s16(static_cast(quant_min)); + const int16x8_t vquant_max = vdupq_n_s16(static_cast(quant_max)); + + int64_t i = 0; + // Process 8 elements at a time + for (; i + 8 <= N; i += 8) { + const float32x4_t vin0123 = vld1q_f32(in + i); + const float32x4_t vin4567 = vld1q_f32(in + i + 4); + + // Multiply by inv_scale and round + const int32x4_t v0123_rounded = + vcvtnq_s32_f32(vmulq_f32(vin0123, vinv_scale)); + const int32x4_t v4567_rounded = + vcvtnq_s32_f32(vmulq_f32(vin4567, vinv_scale)); + + // Combine to int16 and add zero_point + int16x8_t v01234567_packed = vqaddq_s16( + vqmovn_high_s32(vqmovn_s32(v0123_rounded), v4567_rounded), vzero_point); + + // Clamp to quant_min/quant_max + v01234567_packed = vmaxq_s16(v01234567_packed, vquant_min); + v01234567_packed = vminq_s16(v01234567_packed, vquant_max); + + // Convert to T (int8/uint8) with saturation using type-specific operation + const auto vout01234567 = 
Traits::narrow_and_saturate(v01234567_packed); + Traits::store(out + i, vout01234567); + } + + // Handle remaining elements with proper quant_min/quant_max clamping + for (; i < N; ++i) { + float val = in[i] * inv_scale; + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::max(quant_min, std::min(quant_max, qval)); + out[i] = static_cast(qval); + } + +#else + // ARMv7: Use magic float rounding + const int32x4_t voffset = vdupq_n_s32(zero_point - 0x4B400000); + const float32x4_t vmagic_float = vdupq_n_f32(12582912.0f); + + int64_t i = 0; + // Process 8 elements at a time + for (; i + 8 <= N; i += 8) { + const float32x4_t vin0123 = vld1q_f32(in + i); + const float32x4_t vin4567 = vld1q_f32(in + i + 4); + + const int32x4_t vraw0123 = vaddq_s32( + voffset, + vreinterpretq_s32_f32( + vaddq_f32(vmagic_float, vmulq_f32(vin0123, vinv_scale)))); + const int32x4_t vraw4567 = vaddq_s32( + voffset, + vreinterpretq_s32_f32( + vaddq_f32(vmagic_float, vmulq_f32(vin4567, vinv_scale)))); + + const int16x8_t vraw01234567 = + vcombine_s16(vqmovn_s32(vraw0123), vqmovn_s32(vraw4567)); + + // Convert to T (int8/uint8) with saturation using type-specific operation + const auto vout01234567 = Traits::narrow_and_saturate(vraw01234567); + Traits::store(out + i, vout01234567); + } + + // Handle remaining elements with proper quant_min/quant_max clamping + for (; i < N; ++i) { + float val = in[i] * inv_scale; + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::max(quant_min, std::min(quant_max, qval)); + out[i] = static_cast(qval); + } +#endif +} + +#endif // defined(__aarch64__) || defined(__ARM_NEON__) + Tensor& quantize_per_tensor_out( const Tensor& input, double scale, @@ -120,19 +261,44 @@ Tensor& quantize_per_tensor_out( check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out); - // calculate the quantized input -#define QUANTIZE_IMPL(IN_CTYPE, OUT_CTYPE, out_dtype) \ - case ScalarType::out_dtype: { \ - /* Hoist these function calls out of our inner loop because they might not \ - * get inlined without LTO, particularly in ATen mode. 
*/ \ - auto* out_data_ptr = out.mutable_data_ptr(); \ - const auto* input_data_ptr = input.const_data_ptr(); \ - const auto input_numel = input.numel(); \ - for (size_t i = 0; i < input_numel; i++) { \ - IN_CTYPE value = input_data_ptr[i]; \ - out_data_ptr[i] = quantize_val( \ - scale, zero_point, value, quant_min, quant_max); \ - } \ + // Try ARM NEON optimized path for float->int8/uint8 quantization +#if defined(__aarch64__) || defined(__ARM_NEON__) + if (input.scalar_type() == ScalarType::Float) { + if (dtype == ScalarType::Byte) { + quantize_arm( + input.const_data_ptr(), + out.mutable_data_ptr(), + input.numel(), + 1.0f / static_cast(scale), + static_cast(zero_point), + static_cast(quant_min), + static_cast(quant_max)); + return out; + } else if (dtype == ScalarType::Char) { + quantize_arm( + input.const_data_ptr(), + out.mutable_data_ptr(), + input.numel(), + 1.0f / static_cast(scale), + static_cast(zero_point), + static_cast(quant_min), + static_cast(quant_max)); + return out; + } + } +#endif + + // Fallback scalar implementation for all other cases +#define QUANTIZE_IMPL(IN_CTYPE, OUT_CTYPE, out_dtype) \ + case ScalarType::out_dtype: { \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + const auto input_numel = input.numel(); \ + for (size_t i = 0; i < input_numel; i++) { \ + IN_CTYPE value = input_data_ptr[i]; \ + out_data_ptr[i] = quantize_val( \ + scale, zero_point, value, quant_min, quant_max); \ + } \ } break; #define CALCULATE_FLOAT_TYPE(IN_CTYPE, in_dtype) \ case ScalarType::in_dtype: \ @@ -284,29 +450,85 @@ Tensor& quantize_per_channel_out( const double* scale_data = scale.const_data_ptr(); const int64_t* zero_point_data = zero_point.const_data_ptr(); - // High-performance single loop with direct channel calculation -#define QUANTIZE_IMPL(CTYPE_IN, CTYPE_OUT, out_dtype) \ - case ScalarType::out_dtype: { \ - auto* out_data_ptr = out.mutable_data_ptr(); \ - const auto* input_data_ptr = input.const_data_ptr(); \ - const int64_t input_numel = input.numel(); \ - const int64_t axis_size = input.size(axis); \ - /* Calculate the stride pattern for efficient channel index calculation */ \ - int64_t axis_block_size = 1; \ - for (int64_t i = axis + 1; i < input.dim(); i++) { \ - axis_block_size *= input.size(i); \ - } \ - /* Single loop over all elements */ \ - for (int64_t i = 0; i < input_numel; i++) { \ - /* Calculate which channel this element belongs to */ \ - int64_t channel_idx = (i / axis_block_size) % axis_size; \ - /* Get quantization parameters for this channel */ \ - double _scale = scale_data[channel_idx]; \ - int64_t _zero_point = zero_point_data[channel_idx]; \ - /* Apply quantization */ \ - out_data_ptr[i] = quantize_val( \ - _scale, _zero_point, input_data_ptr[i], quant_min, quant_max); \ - } \ + // Calculate the block size for each channel + int64_t axis_block_size = 1; + for (int64_t i = axis + 1; i < input.dim(); i++) { + axis_block_size *= input.size(i); + } + const int64_t axis_size = input.size(axis); + + // Try ARM NEON optimized path for float->int8/uint8 quantization +#if defined(__aarch64__) || defined(__ARM_NEON__) + if (input.scalar_type() == ScalarType::Float) { + const int64_t num_blocks = input.numel() / axis_block_size; + + if (dtype == ScalarType::Byte) { + auto* out_data_ptr = out.mutable_data_ptr(); + const auto* input_data_ptr = input.const_data_ptr(); + + // Process each contiguous block (which shares the same scale/zero_point) + for (int64_t block = 0; block < num_blocks; ++block) { + 
int64_t channel_idx = block % axis_size; + float inv_scale = 1.0f / static_cast(scale_data[channel_idx]); + int32_t zp = static_cast(zero_point_data[channel_idx]); + + const float* in_ptr = input_data_ptr + block * axis_block_size; + uint8_t* out_ptr = out_data_ptr + block * axis_block_size; + + quantize_arm( + in_ptr, + out_ptr, + axis_block_size, + inv_scale, + zp, + static_cast(quant_min), + static_cast(quant_max)); + } + return out; + } else if (dtype == ScalarType::Char) { + auto* out_data_ptr = out.mutable_data_ptr(); + const auto* input_data_ptr = input.const_data_ptr(); + + // Process each contiguous block (which shares the same scale/zero_point) + for (int64_t block = 0; block < num_blocks; ++block) { + int64_t channel_idx = block % axis_size; + float inv_scale = 1.0f / static_cast(scale_data[channel_idx]); + int32_t zp = static_cast(zero_point_data[channel_idx]); + + const float* in_ptr = input_data_ptr + block * axis_block_size; + int8_t* out_ptr = out_data_ptr + block * axis_block_size; + + quantize_arm( + in_ptr, + out_ptr, + axis_block_size, + inv_scale, + zp, + static_cast(quant_min), + static_cast(quant_max)); + } + return out; + } + } +#endif + + // Fallback scalar implementation +#define QUANTIZE_IMPL(CTYPE_IN, CTYPE_OUT, out_dtype) \ + case ScalarType::out_dtype: { \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + const int64_t input_numel = input.numel(); \ + /* Single loop over all elements */ \ + for (int64_t i = 0; i < input_numel; i++) { \ + /* Calculate which channel this element belongs to */ \ + int64_t channel_idx = (i / axis_block_size) % axis_size; \ + /* Get quantization parameters for this channel */ \ + double _scale = scale_data[channel_idx]; \ + int64_t _zero_point = zero_point_data[channel_idx]; \ + /* Apply quantization */ \ + out_data_ptr[i] = quantize_val( \ + _scale, _zero_point, input_data_ptr[i], quant_min, quant_max); \ + } \ } break; #define CALCULATE_FLOAT_TYPE(CTYPE_IN, in_dtype) \ diff --git a/kernels/quantized/cpu/targets.bzl b/kernels/quantized/cpu/targets.bzl index f29f1f013b7..1da0d482485 100644 --- a/kernels/quantized/cpu/targets.bzl +++ b/kernels/quantized/cpu/targets.bzl @@ -9,6 +9,7 @@ _QUANT_OPS = ( name = "op_choose_qparams", deps = [ "//executorch/kernels/portable/cpu:vec_ops", + "//executorch/extension/threadpool:threadpool", ], ), op_target( diff --git a/kernels/quantized/test/op_choose_qparams_test.cpp b/kernels/quantized/test/op_choose_qparams_test.cpp index 13426bfdd86..dc92df80488 100644 --- a/kernels/quantized/test/op_choose_qparams_test.cpp +++ b/kernels/quantized/test/op_choose_qparams_test.cpp @@ -15,6 +15,7 @@ #include #include +#include #include using namespace ::testing; @@ -163,3 +164,97 @@ TEST(OpChooseQparamsPerTokenAsymmetricTensorOutTest, DynamicShapeFloat) { EXPECT_TENSOR_CLOSE_WITH_TOL(scale_out, new_expected_scale, 1e-4, 1e-4); EXPECT_TENSOR_EQ(zero_point_out, new_expected_zero_point); } + +TEST( + OpChooseQparamsPerTokenAsymmetricTensorOutTest, + LargeInputParallelization) { + et_pal_init(); + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + // Create input with 8 tokens x 128 elements per token = 1024 total elements + // This exceeds the MIN_ELEMENTS_FOR_PARALLEL threshold of 512 + const int num_tokens = 8; + const int token_size = 128; + std::vector input_data(num_tokens * token_size); + + // Generate test data with known min/max per token for easier verification + std::vector expected_min(num_tokens); + std::vector 
expected_max(num_tokens); + + for (int i = 0; i < num_tokens; i++) { + float token_min = -1.0f * (i + 1); + float token_max = 2.0f * (i + 1); + expected_min[i] = token_min; + expected_max[i] = token_max; + + for (int j = 0; j < token_size; j++) { + // Linearly interpolate between min and max + float t = j / static_cast(token_size - 1); + input_data[i * token_size + j] = token_min + t * (token_max - token_min); + } + } + + Tensor input = tf_float.make({num_tokens, token_size}, input_data); + Tensor scale_out = tf_double.zeros({num_tokens, 1}); + Tensor zero_point_out = tf_long.zeros({num_tokens, 1}); + + choose_qparams_per_token_asymmetric_out( + input, ScalarType::Float, scale_out, zero_point_out); + + // Manually calculate expected scale and zero_point using the same algorithm + // as calculate_scale_and_zero_point function + const int32_t qmin = -128; + const int32_t qmax = 127; + const float SMALL_SCALE_THRESHOLD = 6.1e-5f; + + for (int i = 0; i < num_tokens; i++) { + float min = std::min(expected_min[i], 0.0f); + float max = std::max(expected_max[i], 0.0f); + + // Calculate scale + double scale = (static_cast(max) - min) / (qmax - qmin); + if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) { + scale = 0.1; + } + + // Cut off small scale + if (scale < SMALL_SCALE_THRESHOLD) { + scale = SMALL_SCALE_THRESHOLD; + if (min == 0.0f) { + max = SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else if (max == 0.0f) { + min = -SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else { + float amplifier = SMALL_SCALE_THRESHOLD / scale; + min *= amplifier; + max *= amplifier; + } + } + + // Calculate zero_point + double zero_point_from_min = qmin - min / scale; + double zero_point_from_max = qmax - max / scale; + double zero_point_from_min_error = std::abs(qmin) - std::abs(min / scale); + double zero_point_from_max_error = std::abs(qmax) - std::abs(max / scale); + double initial_zero_point = + zero_point_from_min_error < zero_point_from_max_error + ? 
zero_point_from_min + : zero_point_from_max; + + int32_t nudged_zero_point = 0; + if (initial_zero_point < qmin) { + nudged_zero_point = qmin; + } else if (initial_zero_point > qmax) { + nudged_zero_point = qmax; + } else { + nudged_zero_point = + std::nearbyint(static_cast(initial_zero_point)); + } + + // Verify computed values match expected + EXPECT_NEAR(scale_out.const_data_ptr()[i], scale, 1e-6); + EXPECT_EQ(zero_point_out.const_data_ptr()[i], nudged_zero_point); + } +} diff --git a/kernels/quantized/test/op_quantize_test.cpp b/kernels/quantized/test/op_quantize_test.cpp index 4ac835c24ce..b450ec0ee33 100644 --- a/kernels/quantized/test/op_quantize_test.cpp +++ b/kernels/quantized/test/op_quantize_test.cpp @@ -14,7 +14,6 @@ #include #include -#include using namespace ::testing; using executorch::aten::ArrayRef; @@ -446,3 +445,539 @@ TEST(OpQuantizeOutTest, QuantizePerChannelClampingBehavior) { EXPECT_TENSOR_EQ(out, expected); } + +TEST(OpQuantizeOutTest, LargePerChannelClampingSIMDPath) { + // Test quant_min/quant_max clamping with large tensor to exercise SIMD path + // Shape: [3, 80] with axis=0 (3 channels, 80 elements each) + // 80 elements = 10 SIMD iterations (8 elements each) + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + const int num_channels = 3; + const int block_size = 80; + std::vector input_data(num_channels * block_size); + + // Create input data with values that exceed quant_min/quant_max + for (int ch = 0; ch < num_channels; ch++) { + for (int i = 0; i < block_size; i++) { + // Generate values from -150 to 150 to test clamping + input_data[ch * block_size + i] = + static_cast((i % 40) - 20) * 5.0f * (ch + 1); + } + } + Tensor input = tf_float.make({num_channels, block_size}, input_data); + + // Use uniform scale and zero_point for all channels + Tensor scale = tf_double.make({num_channels}, {1.0, 1.0, 1.0}); + Tensor zero_point = tf_long.make({num_channels}, {0, 0, 0}); + + // Set narrow quant_min/quant_max to force clamping + int64_t quant_min = -20; + int64_t quant_max = 20; + + TensorFactory tfo; + Tensor out = tfo.zeros({num_channels, block_size}); + + // Compute expected values with clamping + std::vector expected_data(num_channels * block_size); + for (int ch = 0; ch < num_channels; ch++) { + double ch_scale = scale.const_data_ptr()[ch]; + int64_t ch_zero_point = zero_point.const_data_ptr()[ch]; + + for (int i = 0; i < block_size; i++) { + int idx = ch * block_size + i; + // Use double precision to avoid overflow + double val = static_cast(input_data[idx]) / ch_scale; + // Clamp before converting to int to avoid overflow + val = std::max(-1000.0, std::min(1000.0, val)); + int32_t qval = static_cast(std::nearbyint(val)) + + static_cast(ch_zero_point); + // Apply quant_min/quant_max clamping + qval = std::max( + static_cast(quant_min), + std::min(static_cast(quant_max), qval)); + expected_data[idx] = static_cast(qval); + } + } + Tensor expected = tfo.make({num_channels, block_size}, expected_data); + + quantize_per_channel_out( + input, scale, zero_point, 0, quant_min, quant_max, ScalarType::Char, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +// Large tensor tests to ensure ARM NEON SIMD path is exercised + +TEST(OpQuantizeOutTest, LargeTensorUInt8SIMDPath) { + // Test with 64 elements to fully exercise SIMD path (8 elements per + // iteration) + TensorFactory tf_float; + + // Create input with known values for verification + std::vector input_data(64); + for (size_t i = 0; i < 64; i++) { + input_data[i] = static_cast(i) * 
0.5f; // 0.0, 0.5, 1.0, 1.5, ... + } + Tensor input = tf_float.make({64}, input_data); + + double scale = 0.1; + int64_t zero_point = 10; + int64_t quant_min = 0; + int64_t quant_max = 255; + + TensorFactory tfo; + Tensor out = tfo.zeros({64}); + + // Compute expected values: round(value / scale) + zero_point + std::vector expected_data(64); + for (size_t i = 0; i < 64; i++) { + float val = input_data[i] / static_cast(scale); + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::min(255, std::max(0, qval)); + expected_data[i] = static_cast(qval); + } + Tensor expected = tfo.make({64}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, LargeTensorInt8SIMDPath) { + // Test with 72 elements (9 SIMD iterations of 8) to test both vectorized and + // scalar paths + TensorFactory tf_float; + + std::vector input_data(72); + for (size_t i = 0; i < 72; i++) { + // Mix of positive and negative values + input_data[i] = static_cast(static_cast(i) - 36) * 0.25f; + } + Tensor input = tf_float.make({72}, input_data); + + double scale = 0.2; + int64_t zero_point = 0; + int64_t quant_min = -128; + int64_t quant_max = 127; + + TensorFactory tfo; + Tensor out = tfo.zeros({72}); + + // Compute expected values + std::vector expected_data(72); + for (size_t i = 0; i < 72; i++) { + float val = input_data[i] / static_cast(scale); + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::min(127, std::max(-128, qval)); + expected_data[i] = static_cast(qval); + } + Tensor expected = tfo.make({72}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Char, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, LargeTensorWithRemainderUInt8) { + // Test with 100 elements (12 SIMD iterations + 4 remainder) to test remainder + // handling + TensorFactory tf_float; + + std::vector input_data(100); + for (size_t i = 0; i < 100; i++) { + input_data[i] = static_cast(i % 50) * 0.3f; + } + Tensor input = tf_float.make({100}, input_data); + + double scale = 0.15; + int64_t zero_point = 128; + int64_t quant_min = 0; + int64_t quant_max = 255; + + TensorFactory tfo; + Tensor out = tfo.zeros({100}); + + std::vector expected_data(100); + for (size_t i = 0; i < 100; i++) { + float val = input_data[i] / static_cast(scale); + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::min(255, std::max(0, qval)); + expected_data[i] = static_cast(qval); + } + Tensor expected = tfo.make({100}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, LargeTensorWithRemainderInt8) { + // Test with 99 elements (12 SIMD iterations + 3 remainder) + TensorFactory tf_float; + + std::vector input_data(99); + for (size_t i = 0; i < 99; i++) { + input_data[i] = std::sin(static_cast(i) * 0.1f) * 10.0f; + } + Tensor input = tf_float.make({99}, input_data); + + double scale = 0.1; + int64_t zero_point = 5; + int64_t quant_min = -128; + int64_t quant_max = 127; + + TensorFactory tfo; + Tensor out = tfo.zeros({99}); + + std::vector expected_data(99); + for (size_t i = 0; i < 99; i++) { + float val = input_data[i] / static_cast(scale); + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::min(127, std::max(-128, qval)); + expected_data[i] 
= static_cast(qval); + } + Tensor expected = tfo.make({99}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Char, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, VeryLargeTensor2DUInt8) { + // Test with realistic 2D tensor size that would be used in neural networks + // 256x256 = 65536 elements (8192 SIMD iterations) + TensorFactory tf_float; + + std::vector input_data(256 * 256); + for (size_t i = 0; i < 256 * 256; i++) { + // Generate diverse values in a safe range + input_data[i] = + static_cast((static_cast(i % 256) - 128)) * 0.05f; + } + Tensor input = tf_float.make({256, 256}, input_data); + + double scale = 0.05; + int64_t zero_point = 128; + int64_t quant_min = 0; + int64_t quant_max = 255; + + TensorFactory tfo; + Tensor out = tfo.zeros({256, 256}); + + // Compute expected values with proper overflow handling + std::vector expected_data(256 * 256); + for (size_t i = 0; i < 256 * 256; i++) { + // Use double precision to avoid overflow + double val = static_cast(input_data[i]) / scale; + // Clamp before converting to int to avoid overflow + val = std::max(-1000.0, std::min(1000.0, val)); + int32_t qval = static_cast(std::nearbyint(val)) + + static_cast(zero_point); + qval = std::min(255, std::max(0, qval)); + expected_data[i] = static_cast(qval); + } + Tensor expected = tfo.make({256, 256}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, VeryLargeTensor3DInt8) { + // Test with 3D tensor (batch_size=2, height=64, width=128) = 16384 elements + TensorFactory tf_float; + + const size_t total_elements = 2 * 64 * 128; + std::vector input_data(total_elements); + for (size_t i = 0; i < total_elements; i++) { + input_data[i] = std::cos(static_cast(i) * 0.01f) * 8.0f; + } + Tensor input = tf_float.make({2, 64, 128}, input_data); + + double scale = 0.0625; // 1/16 + int64_t zero_point = -10; + int64_t quant_min = -128; + int64_t quant_max = 127; + + TensorFactory tfo; + Tensor out = tfo.zeros({2, 64, 128}); + + std::vector expected_data(total_elements); + for (size_t i = 0; i < total_elements; i++) { + float val = input_data[i] / static_cast(scale); + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::min(127, std::max(-128, qval)); + expected_data[i] = static_cast(qval); + } + Tensor expected = tfo.make({2, 64, 128}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Char, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, EdgeCaseSizesSIMD) { + // Test specific sizes around SIMD boundaries + TensorFactory tf_float; + TensorFactory tfo; + + double scale = 0.1; + int64_t zero_point = 100; + int64_t quant_min = 0; + int64_t quant_max = 255; + + // Test sizes: 7 (just before SIMD), 8 (exactly 1 SIMD), 9 (1 SIMD + 1), 15, + // 16, 17 + std::vector test_sizes = { + 7, 8, 9, 15, 16, 17, 23, 24, 25, 31, 32, 33}; + + for (size_t size : test_sizes) { + std::vector input_data(size); + std::vector expected_data(size); + + for (size_t i = 0; i < size; i++) { + input_data[i] = static_cast(i) * 0.3f; + float val = input_data[i] / static_cast(scale); + int32_t qval = static_cast(std::nearbyint(val)) + zero_point; + qval = std::min(255, std::max(0, qval)); + expected_data[i] = static_cast(qval); + } + + Tensor input = tf_float.make({static_cast(size)}, input_data); + Tensor out = 
tfo.zeros({static_cast(size)}); + Tensor expected = tfo.make({static_cast(size)}, expected_data); + + quantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); + } +} + +// Large tensor tests for per-channel quantization to ensure SIMD path is +// exercised + +TEST(OpQuantizeOutTest, LargePerChannelUInt8SIMDPath) { + // Test per-channel quantization with large blocks (64 elements per channel) + // Shape: [4, 64] with axis=1 (4 channels, 64 elements each) + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + const int num_channels = 4; + const int block_size = 64; + std::vector input_data(num_channels * block_size); + + // Create varying input data for each channel + for (int ch = 0; ch < num_channels; ch++) { + for (int i = 0; i < block_size; i++) { + input_data[ch * block_size + i] = static_cast((ch + 1) * i) * 0.1f; + } + } + Tensor input = tf_float.make({num_channels, block_size}, input_data); + + // Different scale and zero_point for each channel + Tensor scale = tf_double.make({num_channels}, {0.1, 0.2, 0.15, 0.25}); + Tensor zero_point = tf_long.make({num_channels}, {10, 20, 15, 25}); + + int64_t quant_min = 0; + int64_t quant_max = 255; + + TensorFactory tfo; + Tensor out = tfo.zeros({num_channels, block_size}); + + // Compute expected values + std::vector expected_data(num_channels * block_size); + for (int ch = 0; ch < num_channels; ch++) { + double ch_scale = scale.const_data_ptr()[ch]; + int64_t ch_zero_point = zero_point.const_data_ptr()[ch]; + + for (int i = 0; i < block_size; i++) { + int idx = ch * block_size + i; + float val = input_data[idx] / static_cast(ch_scale); + int32_t qval = static_cast(std::nearbyint(val)) + + static_cast(ch_zero_point); + qval = std::min(255, std::max(0, qval)); + expected_data[idx] = static_cast(qval); + } + } + Tensor expected = tfo.make({num_channels, block_size}, expected_data); + + quantize_per_channel_out( + input, scale, zero_point, 0, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, LargePerChannelInt8SIMDPath) { + // Test per-channel quantization with int8 and large blocks + // Shape: [3, 100] with axis=1 (3 channels, 100 elements each) + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + const int num_channels = 3; + const int block_size = 100; // 12 SIMD iterations + 4 remainder + std::vector input_data(num_channels * block_size); + + // Create varying input data with negative values + for (int ch = 0; ch < num_channels; ch++) { + for (int i = 0; i < block_size; i++) { + input_data[ch * block_size + i] = + static_cast(i - 50) * 0.2f * (ch + 1); + } + } + Tensor input = tf_float.make({num_channels, block_size}, input_data); + + Tensor scale = tf_double.make({num_channels}, {0.1, 0.15, 0.2}); + Tensor zero_point = tf_long.make({num_channels}, {0, -5, 5}); + + int64_t quant_min = -128; + int64_t quant_max = 127; + + TensorFactory tfo; + Tensor out = tfo.zeros({num_channels, block_size}); + + // Compute expected values + std::vector expected_data(num_channels * block_size); + for (int ch = 0; ch < num_channels; ch++) { + double ch_scale = scale.const_data_ptr()[ch]; + int64_t ch_zero_point = zero_point.const_data_ptr()[ch]; + + for (int i = 0; i < block_size; i++) { + int idx = ch * block_size + i; + float val = input_data[idx] / static_cast(ch_scale); + int32_t qval = static_cast(std::nearbyint(val)) + + static_cast(ch_zero_point); + qval = 
std::min(127, std::max(-128, qval)); + expected_data[idx] = static_cast(qval); + } + } + Tensor expected = tfo.make({num_channels, block_size}, expected_data); + + quantize_per_channel_out( + input, scale, zero_point, 0, quant_min, quant_max, ScalarType::Char, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, VeryLargePerChannel2DUInt8) { + // Test realistic neural network weight tensor + // Shape: [128, 256] with axis=0 (128 channels, 256 elements each) + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + const int num_channels = 128; + const int block_size = 256; + const int total_elements = num_channels * block_size; + + std::vector input_data(total_elements); + for (int i = 0; i < total_elements; i++) { + input_data[i] = std::sin(static_cast(i) * 0.01f) * 5.0f; + } + Tensor input = tf_float.make({num_channels, block_size}, input_data); + + // Create varying scales and zero_points for each channel + std::vector scales(num_channels); + std::vector zero_points(num_channels); + for (int ch = 0; ch < num_channels; ch++) { + scales[ch] = 0.02 + (ch % 10) * 0.001; // Varying scales + zero_points[ch] = 128 + (ch % 5); // Varying zero_points + } + Tensor scale = tf_double.make({num_channels}, scales); + Tensor zero_point = tf_long.make({num_channels}, zero_points); + + int64_t quant_min = 0; + int64_t quant_max = 255; + + TensorFactory tfo; + Tensor out = tfo.zeros({num_channels, block_size}); + + // Compute expected values + std::vector expected_data(total_elements); + for (int ch = 0; ch < num_channels; ch++) { + float inv_scale = 1.0f / static_cast(scales[ch]); + int64_t ch_zero_point = zero_points[ch]; + + for (int i = 0; i < block_size; i++) { + int idx = ch * block_size + i; + float val = input_data[idx] * inv_scale; + // Clamp before converting to avoid overflow + val = std::max(-1000.0f, std::min(1000.0f, val)); + int32_t qval = static_cast(std::nearbyint(val)) + + static_cast(ch_zero_point); + + qval = std::min(255, std::max(0, qval)); + expected_data[idx] = static_cast(qval); + } + } + Tensor expected = tfo.make({num_channels, block_size}, expected_data); + + quantize_per_channel_out( + input, scale, zero_point, 0, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST(OpQuantizeOutTest, PerChannelAxis1LargeBlocks) { + // Test per-channel quantization with axis=1 and large contiguous blocks + // Shape: [2, 3, 64] with axis=1 (2 batches, 3 channels, 64 elements each) + TensorFactory tf_float; + TensorFactory tf_double; + TensorFactory tf_long; + + const int batch_size = 2; + const int num_channels = 3; + const int block_size = 64; + const int total_elements = batch_size * num_channels * block_size; + + std::vector input_data(total_elements); + for (int i = 0; i < total_elements; i++) { + input_data[i] = static_cast(i % 100) * 0.1f; + } + Tensor input = + tf_float.make({batch_size, num_channels, block_size}, input_data); + + Tensor scale = tf_double.make({num_channels}, {0.05, 0.1, 0.15}); + Tensor zero_point = tf_long.make({num_channels}, {100, 110, 120}); + + int64_t quant_min = 0; + int64_t quant_max = 255; + + TensorFactory tfo; + Tensor out = tfo.zeros({batch_size, num_channels, block_size}); + + // Compute expected values + std::vector expected_data(total_elements); + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < num_channels; ch++) { + double ch_scale = scale.const_data_ptr()[ch]; + int64_t ch_zero_point = zero_point.const_data_ptr()[ch]; + + for (int i = 0; i < block_size; 
i++) { + int idx = (b * num_channels + ch) * block_size + i; + float val = input_data[idx] / static_cast(ch_scale); + int32_t qval = static_cast(std::nearbyint(val)) + + static_cast(ch_zero_point); + qval = std::min(255, std::max(0, qval)); + expected_data[idx] = static_cast(qval); + } + } + } + Tensor expected = + tfo.make({batch_size, num_channels, block_size}, expected_data); + + quantize_per_channel_out( + input, scale, zero_point, 1, quant_min, quant_max, ScalarType::Byte, out); + + EXPECT_TENSOR_EQ(out, expected); +}
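
Note on the choose_qparams_per_token change: the per-token loop is handed to the shared threadpool once the tensor reaches MIN_ELEMENTS_FOR_PARALLEL (512) elements, since each token's min/max reduction is independent. The sketch below is a minimal model of that dispatch shape, not the kernel itself: parallel_for_stub stands in for ::executorch::extension::parallel_for (same begin/end/grain/body contract), and std::min_element / std::max_element stand in for vec_minf / vec_maxf.

#include <algorithm>
#include <cstdint>
#include <functional>

// Stand-in for parallel_for: runs [begin, end) in chunks of `grain` on the
// calling thread. The real version hands chunks to the threadpool, but the
// kernel only relies on body(chunk_begin, chunk_end) over disjoint ranges.
static void parallel_for_stub(
    int64_t begin,
    int64_t end,
    int64_t grain,
    const std::function<void(int64_t, int64_t)>& body) {
  for (int64_t b = begin; b < end; b += grain) {
    body(b, std::min(end, b + grain));
  }
}

// Per-token min/max in the same shape as the diff: each token is a contiguous
// row, so rows can run on different threads with no synchronization.
void per_token_min_max(
    const float* x,
    int64_t num_tokens,
    int64_t token_dim_size,
    float* min_out,
    float* max_out) {
  const bool use_parallel = num_tokens * token_dim_size >= 512;
  auto body = [&](int64_t begin, int64_t end) {
    for (int64_t i = begin; i < end; i++) {
      const float* row = x + i * token_dim_size;
      min_out[i] = *std::min_element(row, row + token_dim_size);
      max_out[i] = *std::max_element(row, row + token_dim_size);
    }
  };
  if (use_parallel) {
    parallel_for_stub(0, num_tokens, 1, body);
  } else {
    body(0, num_tokens);
  }
}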
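
Note on quantize_arm: per element it computes clamp(round_to_nearest(x * (1/scale)) + zero_point, quant_min, quant_max), with the rounding done by vcvtnq_s32_f32 on ARMv8 and the final narrowing done with saturating vqmovn_s16 / vqmovun_s16. The scalar sketch below simply restates that contract in the same way the diff's remainder loops and the tests' expected-value computations do; it is a reference model, not the vector code. The explicit quant_min/quant_max clamp matters even though the narrows already saturate to the int8/uint8 range, because callers may request a narrower range (the LargePerChannelClampingSIMDPath test uses [-20, 20]); in the diff the ARMv8 vector loop applies that clamp with vmaxq_s16/vminq_s16, while the ARMv7 vector loop relies on the saturating narrows alone.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar reference for one quantized value.
template <typename T>
T quantize_one(
    float x,
    float inv_scale,
    int32_t zero_point,
    int32_t quant_min,
    int32_t quant_max) {
  const float scaled = x * inv_scale;
  // std::nearbyint uses the current rounding mode, which defaults to
  // round-to-nearest-even, matching vcvtnq_s32_f32.
  int32_t q = static_cast<int32_t>(std::nearbyint(scaled)) + zero_point;
  q = std::max(quant_min, std::min(quant_max, q));
  return static_cast<T>(q);
}

// Example matching the LargeTensorUInt8SIMDPath test setup
// (scale = 0.1, zero_point = 10, range [0, 255]):
// quantize_one<uint8_t>(1.5f, 10.0f, 10, 0, 255) == 25.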
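
Note on the ARMv7 branch: vcvtnq_s32_f32 is ARMv8-only, so the ARMv7 path rounds with the magic-float trick. 12582912.0f is 1.5 * 2^23 and its IEEE-754 bit pattern is 0x4B400000; for |x| < 2^22, the float addition x + 12582912.0f rounds x to the nearest integer and leaves that integer in the low mantissa bits, so reinterpreting the sum as an int32 and subtracting 0x4B400000 recovers round-to-nearest(x). The kernel folds the zero_point addition into the same constant via voffset = zero_point - 0x4B400000. A standalone scalar demonstration (not the kernel code):

#include <cstdint>
#include <cstdio>
#include <cstring>

// bits(x + magic) - 0x4B400000 == round-to-nearest-even(x) for |x| < 2^22;
// the zero_point is folded into the subtraction, as in the ARMv7 loop.
int32_t magic_round(float x, int32_t zero_point) {
  const float magic = 12582912.0f; // 1.5 * 2^23, bit pattern 0x4B400000
  float shifted = x + magic;
  int32_t bits;
  std::memcpy(&bits, &shifted, sizeof(bits));
  return bits + (zero_point - 0x4B400000);
}

int main() {
  std::printf("%d\n", (int)magic_round(3.7f, 0));    // 4
  std::printf("%d\n", (int)magic_round(-2.5f, 0));   // -2 (ties to even)
  std::printf("%d\n", (int)magic_round(10.4f, 128)); // 138
  return 0;
}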
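
Note on the per-channel fast path: for a contiguous tensor quantized along `axis`, the data decomposes into numel / axis_block_size contiguous blocks that each share a single scale/zero_point, with block b belonging to channel b % axis_size. That is what lets the per-channel path call the single-scale quantize_arm kernel once per block, and it must agree with the element-wise formula (i / axis_block_size) % axis_size used by the scalar fallback macro. The check below is illustrative only (it uses the [2, 3, 64], axis = 1 shape from the PerChannelAxis1LargeBlocks test) and simply asserts that the two index formulas coincide.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const std::vector<int64_t> sizes = {2, 3, 64};
  const int64_t axis = 1;

  // Elements after `axis` form one contiguous block per channel slice.
  int64_t axis_block_size = 1;
  for (size_t d = axis + 1; d < sizes.size(); d++) {
    axis_block_size *= sizes[d];
  }
  const int64_t axis_size = sizes[axis];
  const int64_t numel = sizes[0] * sizes[1] * sizes[2];
  const int64_t num_blocks = numel / axis_block_size;

  for (int64_t block = 0; block < num_blocks; block++) {
    const int64_t block_channel = block % axis_size;
    for (int64_t j = 0; j < axis_block_size; j++) {
      const int64_t i = block * axis_block_size + j;
      // Block-wise channel index matches the scalar macro's per-element one.
      assert(block_channel == (i / axis_block_size) % axis_size);
    }
  }
  return 0;
}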