From d9c6c0adccfc36b2ede458eaae6e767b7b434272 Mon Sep 17 00:00:00 2001
From: Gregory Comer <gjcomer@meta.com>
Date: Fri, 12 Jun 2026 14:29:53 -0700
Subject: [PATCH 1/3] Update

[ghstack-poisoned]
---
 backends/xnnpack/CMakeLists.txt               |   5 +
 backends/xnnpack/runtime/core/dtype.h         |  24 ++
 .../xnnpack/runtime/core/quant_params.cpp     | 202 ++++++++++
 backends/xnnpack/runtime/core/quant_params.h  | 150 ++++++++
 backends/xnnpack/runtime/core/tensor.cpp      | 152 ++++++++
 backends/xnnpack/runtime/core/tensor.h        |  56 +++
 backends/xnnpack/runtime/core/variant_util.h  |   8 +
 backends/xnnpack/test/CMakeLists.txt          |  21 ++
 .../test/runtime/test_quant_params.cpp        | 354 ++++++++++++++++++
 9 files changed, 972 insertions(+)
 create mode 100644 backends/xnnpack/runtime/core/dtype.h
 create mode 100644 backends/xnnpack/runtime/core/quant_params.cpp
 create mode 100644 backends/xnnpack/runtime/core/quant_params.h
 create mode 100644 backends/xnnpack/runtime/core/tensor.cpp
 create mode 100644 backends/xnnpack/runtime/core/tensor.h
 create mode 100644 backends/xnnpack/runtime/core/variant_util.h
 create mode 100644 backends/xnnpack/test/runtime/test_quant_params.cpp
diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt
index cd0d945a84f..81453e80d2f 100644
--- a/backends/xnnpack/CMakeLists.txt
+++ b/backends/xnnpack/CMakeLists.txt
@@ -100,6 +100,11 @@ set(xnnpack_third_party pthreadpool extension_threadpool cpuinfo)
 
 include(cmake/Dependencies.cmake)
 
+# Graph runtime sources.
+list(APPEND _xnnpack_backend__srcs backends/xnnpack/runtime/core/tensor.cpp
+     backends/xnnpack/runtime/core/quant_params.cpp
+)
+
 list(TRANSFORM _xnnpack_backend__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(xnnpack_backend ${_xnnpack_backend__srcs})
 target_link_libraries(
diff --git a/backends/xnnpack/runtime/core/dtype.h b/backends/xnnpack/runtime/core/dtype.h
new file mode 100644
index 00000000000..d3ac48920f7
--- /dev/null
+++ b/backends/xnnpack/runtime/core/dtype.h
@@ -0,0 +1,24 @@
+#pragma once
+
+namespace executorch::backends::xnnpack::core {
+
+enum class DType {
+  // Floating point
+  Float32, // 4 bytes per element
+  Float16, // 2 bytes per element
+  BFloat16, // 2 bytes per element
+
+  // Non-quantized integer
+  Int64, // 8 bytes per element
+  UInt64, // 8 bytes per element
+
+  // Quantized — signed
+  QInt8, // 1 byte per element
+  QInt4, // sub-byte: two elements are packed into each byte
+  QInt32, // 4 bytes per element
+
+  // Quantized — unsigned
+  QUInt8, // 1 byte per element
+};
+
+} // namespace executorch::backends::xnnpack::core
diff --git a/backends/xnnpack/runtime/core/quant_params.cpp b/backends/xnnpack/runtime/core/quant_params.cpp
new file mode 100644
index 00000000000..c9abdeaf24e
--- /dev/null
+++ b/backends/xnnpack/runtime/core/quant_params.cpp
@@ -0,0 +1,280 @@
+#include <executorch/backends/xnnpack/runtime/core/quant_params.h>
+
+#include <executorch/backends/xnnpack/runtime/core/tensor.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/platform/log.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+
+namespace executorch::backends::xnnpack::core {
+
+using executorch::runtime::Span;
+
+namespace {
+
+// Multiplies `acc` by `factor` in place. Returns false, leaving `acc`
+// unchanged, when `factor` or the product is not representable as size_t.
+bool checked_mul_inplace(size_t& acc, uint64_t factor) {
+  const auto narrowed = static_cast<size_t>(factor);
+  // Round-trip compare catches truncation when size_t is under 64 bits.
+  if (static_cast<uint64_t>(narrowed) != factor) {
+    return false;
+  }
+  if (narrowed != 0 && acc > std::numeric_limits<size_t>::max() / narrowed) {
+    return false;
+  }
+  acc *= narrowed;
+  return true;
+}
+
+// Returns the product of every entry of `sizes` except the one at
+// `skip_axis`, or InvalidArgument if that product overflows size_t.
+runtime::Result<size_t> product_excluding_axis(
+    Span<const uint64_t> sizes,
+    size_t skip_axis) {
+  size_t count = 1;
+  for (size_t i = 0; i < sizes.size(); i++) {
+    if (i == skip_axis) {
+      continue;
+    }
+    ET_CHECK_OR_RETURN_ERROR(
+        checked_mul_inplace(count, sizes[i]),
+        InvalidArgument,
+        "Overflow computing quantization scale count at dimension %zu",
+        i);
+  }
+  return count;
+}
+
+} // namespace
+
+// Preset: symmetric per-axis (per-channel) params along `axis`.
+QuantParams qint8_per_channel_sym(int8_t axis) {
+  return PerAxisQuantParams{.axis = axis, .has_zero_point = false};
+}
+
+// Preset: symmetric per-tensor params with the given scale.
+QuantParams qint8_per_tensor_sym(float scale) {
+  return PerTensorQuantParams{
+      .scale = scale, .zero_point = 0, .has_zero_point = false};
+}
+
+// Preset: asymmetric per-tensor params (scale plus zero point).
+QuantParams quint8_per_tensor_asym(float scale, int32_t zero_point) {
+  return PerTensorQuantParams{
+      .scale = scale, .zero_point = zero_point, .has_zero_point = true};
+}
+
+// Preset: asymmetric per-row params that reduce over `axis`.
+QuantParams quint8_per_row_asym(int8_t axis) {
+  return PerRowQuantParams{.axis = axis, .has_zero_point = true};
+}
+
+// Preset: asymmetric per-row params over the last dim (axis = -1).
+QuantParams quint8_per_token_asym() {
+  return PerRowQuantParams{.axis = -1, .has_zero_point = true};
+}
+
+// Preset: symmetric per-block params along `axis`.
+QuantParams qint4_blockwise_sym(int8_t axis, int32_t block_size) {
+  return PerBlockQuantParams{
+      .axis = axis, .block_size = block_size, .has_zero_point = false};
+}
+
+bool is_quantized(DType dtype) {
+  switch (dtype) {
+    case DType::Float32:
+    case DType::Float16:
+    case DType::BFloat16:
+    case DType::Int64:
+    case DType::UInt64:
+      return false;
+    case DType::QInt8:
+    case DType::QInt4:
+    case DType::QInt32:
+    case DType::QUInt8:
+      return true;
+  }
+  // Only reached for a value outside the enum; keeps control from flowing
+  // off the end of a non-void function.
+  return false;
+}
+
+bool is_subbyte(DType dtype) {
+  switch (dtype) {
+    case DType::QInt4:
+      return true;
+    case DType::Float32:
+    case DType::Float16:
+    case DType::BFloat16:
+    case DType::Int64:
+    case DType::UInt64:
+    case DType::QInt8:
+    case DType::QInt32:
+    case DType::QUInt8:
+      return false;
+  }
+  // Only reached for a value outside the enum.
+  return false;
+}
+
+size_t byte_stride(DType dtype) {
+  switch (dtype) {
+    case DType::QInt8:
+    case DType::QUInt8:
+      return 1;
+    case DType::Float16:
+    case DType::BFloat16:
+      return 2;
+    case DType::Float32:
+    case DType::QInt32:
+      return 4;
+    case DType::Int64:
+    case DType::UInt64:
+      return 8;
+    case DType::QInt4:
+      // Sub-byte; no whole-byte stride. Guard callers with is_subbyte().
+      abort();
+  }
+  // A value outside the enum has no defined stride either.
+  abort();
+}
+
+bool is_asymmetric(const QuantParams& params) {
+  return std::visit([](const auto& p) { return p.has_zero_point; }, params);
+}
+
+uint8_t aux_buffer_count(DType dtype, const QuantParams& params) {
+  if (!is_quantized(dtype))
+    return 0;
+
+  uint8_t count = 1; // scales
+  if (is_asymmetric(params))
+    count++; // zero_points
+  return count;
+}
+
+// Returns how many scale values `params` implies for a tensor of shape
+// `sizes`, or InvalidArgument if the params do not fit the shape or the
+// count overflows size_t.
+static runtime::Result<size_t> scale_element_count(
+    Span<const uint64_t> sizes,
+    const QuantParams& params) {
+  return std::visit(
+      overloaded{
+          [](const PerTensorQuantParams&) -> runtime::Result<size_t> {
+            return 1;
+          },
+          [&](const PerAxisQuantParams& p) -> runtime::Result<size_t> {
+            ET_CHECK_OR_RETURN_ERROR(
+                p.axis >= 0 && static_cast<size_t>(p.axis) < sizes.size(),
+                InvalidArgument,
+                "Per-axis quant axis %d is out of range for a %zu-dim tensor",
+                static_cast<int>(p.axis),
+                sizes.size());
+            // One scale per index along the axis; reject a dimension that
+            // does not fit in size_t instead of silently narrowing it.
+            size_t count = 1;
+            ET_CHECK_OR_RETURN_ERROR(
+                checked_mul_inplace(count, sizes[p.axis]),
+                InvalidArgument,
+                "Per-axis quant axis %d size does not fit in size_t",
+                static_cast<int>(p.axis));
+            return count;
+          },
+          [&](const PerRowQuantParams& p) -> runtime::Result<size_t> {
+            int rank = static_cast<int>(sizes.size());
+            // Negative axes index from the end.
+            int axis = p.axis < 0 ? p.axis + rank : p.axis;
+            ET_CHECK_OR_RETURN_ERROR(
+                axis >= 0 && axis < rank,
+                InvalidArgument,
+                "Per-row quant axis %d is out of range for a %d-dim tensor",
+                static_cast<int>(p.axis),
+                rank);
+            // One scale per combination of the non-reduced dimensions.
+            return product_excluding_axis(sizes, static_cast<size_t>(axis));
+          },
+          [&](const PerBlockQuantParams& p) -> runtime::Result<size_t> {
+            ET_CHECK_OR_RETURN_ERROR(
+                p.axis >= 0 && static_cast<size_t>(p.axis) < sizes.size(),
+                InvalidArgument,
+                "Per-block quant axis %d is out of range for a %zu-dim tensor",
+                static_cast<int>(p.axis),
+                sizes.size());
+            ET_CHECK_OR_RETURN_ERROR(
+                p.block_size > 0,
+                InvalidArgument,
+                "Per-block quant block_size must be positive, got %d",
+                p.block_size);
+            auto axis = static_cast<size_t>(p.axis);
+            const auto block_size = static_cast<uint64_t>(p.block_size);
+            ET_CHECK_OR_RETURN_ERROR(
+                sizes[axis] % block_size == 0,
+                InvalidArgument,
+                "Per-block quant block_size %d must evenly divide axis %d (size %zu)",
+                p.block_size,
+                static_cast<int>(p.axis),
+                static_cast<size_t>(sizes[axis]));
+            // Scales = (blocks along the axis) * (product of other dims).
+            ET_UNWRAP(other_dims, product_excluding_axis(sizes, axis));
+            size_t count = other_dims;
+            ET_CHECK_OR_RETURN_ERROR(
+                checked_mul_inplace(count, sizes[axis] / block_size),
+                InvalidArgument,
+                "Overflow computing per-block quantization scale count");
+            return count;
+          },
+      },
+      params);
+}
+
+// Returns the dtype used to store the scales for `params`.
+static DType scale_dtype_of(const QuantParams& params) {
+  return std::visit(
+      overloaded{
+          [](const PerTensorQuantParams& p) { return p.scale_dtype; },
+          [](const PerAxisQuantParams& p) { return p.scale_dtype; },
+          [](const PerRowQuantParams& p) { return p.scale_dtype; },
+          [](const PerBlockQuantParams& p) { return p.scale_dtype; },
+      },
+      params);
+}
+
+// Returns the byte size of each auxiliary buffer for a tensor of shape
+// `sizes`: the scales first, then the int32 zero points when `params` is
+// asymmetric. Non-quantized dtypes yield an empty list, matching
+// aux_buffer_count().
+runtime::Result<std::vector<size_t>> compute_aux_storage_sizes(
+    Span<const uint64_t> sizes,
+    DType dtype,
+    const QuantParams& params) {
+  std::vector<size_t> result;
+  if (!is_quantized(dtype)) {
+    return result;
+  }
+
+  ET_UNWRAP(num_scales, scale_element_count(sizes, params));
+  const uint64_t scale_shape[] = {static_cast<uint64_t>(num_scales)};
+  ET_UNWRAP(
+      scale_bytes, compute_storage_size(scale_shape, scale_dtype_of(params)));
+  result.push_back(scale_bytes);
+
+  if (is_asymmetric(params)) {
+    // Checked separately: the scale dtype may be narrower than int32, so a
+    // valid scale buffer size does not bound the zero-point buffer size.
+    size_t zp_bytes = num_scales;
+    ET_CHECK_OR_RETURN_ERROR(
+        checked_mul_inplace(zp_bytes, sizeof(int32_t)),
+        InvalidArgument,
+        "Overflow computing zero-point storage size in bytes");
+    result.push_back(zp_bytes);
+  }
+
+  return result;
+}
+
+} // namespace executorch::backends::xnnpack::core
diff --git a/backends/xnnpack/runtime/core/quant_params.h b/backends/xnnpack/runtime/core/quant_params.h
new file mode 100644
index 00000000000..c0319250674
--- /dev/null
+++ b/backends/xnnpack/runtime/core/quant_params.h
@@ -0,0 +1,150 @@
+#pragma once
+
+#include <executorch/backends/xnnpack/runtime/core/dtype.h>
+#include <executorch/backends/xnnpack/runtime/core/variant_util.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+
+#include <cstdint>
+#include <variant>
+#include <vector>
+
+/*
+ * This file contains types and methods related to quantization parameters.
+ * Quant params, in combination with dtype, should provide enough information
+ * to interpret raw tensor memory and inform kernel dispatch.
+ */
+
+namespace executorch::backends::xnnpack::core {
+
+/*
+ * Represents quantization parameters for per-tensor quantization. This means
+ * that there is a single scale and zero point for the entire tensor.
+ *
+ * For a tensor of shape [A, B, C], this is equivalent to a block size of
+ * [A, B, C].
+ */
+struct PerTensorQuantParams {
+  DType scale_dtype = DType::Float32;
+  float scale = 0.0f;
+  int32_t zero_point = 0;
+  bool has_zero_point = false;
+
+  bool operator==(const PerTensorQuantParams& o) const {
+    return scale_dtype == o.scale_dtype && scale == o.scale &&
+        zero_point == o.zero_point && has_zero_point == o.has_zero_point;
+  }
+};
+
+/*
+ * Represents per-axis quantization parameters. Scale and zero point are
+ * shared by all elements with the same index along the target axis.
+ *
+ * For a tensor of shape [A, B, C] and axis=1, this is equivalent to a block
+ * size of [A, 1, C] with a scale shape [1, B, 1].
+ */
+struct PerAxisQuantParams {
+  int8_t axis;
+  DType scale_dtype = DType::Float32;
+  bool has_zero_point = false;
+
+  bool operator==(const PerAxisQuantParams& o) const {
+    return axis == o.axis && scale_dtype == o.scale_dtype &&
+        has_zero_point == o.has_zero_point;
+  }
+};
+
+/*
+ * Represents per-row quantization parameters. Scale and zero point are
+ * shared by all elements with the same indices along non-target axes; `axis`
+ * is the reduced dim, negative values index from the end, and it defaults to
+ * -1 (the last dim, i.e. per-token).
+ *
+ * For a tensor of shape [A, B, C] and axis=1, this is equivalent to a block
+ * size of [1, B, 1] with a scale shape of [A, 1, C].
+ */
+struct PerRowQuantParams {
+  int8_t axis = -1;
+  DType scale_dtype = DType::Float32;
+  bool has_zero_point = false;
+
+  bool operator==(const PerRowQuantParams& o) const {
+    return axis == o.axis && scale_dtype == o.scale_dtype &&
+        has_zero_point == o.has_zero_point;
+  }
+};
+
+/*
+ * Represents per-block quantization parameters. Elements are grouped along
+ * `axis` into groups of `block_size`. Elements within a group share a scale
+ * and zero point. The block size must evenly divide the input tensor shape
+ * along the target axis.
+ *
+ * For a tensor of shape [A, B, C] and axis=1, blocks are size
+ * [1, block_size, 1] with a scale shape of [A, B / block_size, C].
+ */
+struct PerBlockQuantParams {
+  int8_t axis;
+  int32_t block_size;
+  DType scale_dtype = DType::Float32;
+  bool has_zero_point = false;
+
+  bool operator==(const PerBlockQuantParams& o) const {
+    return axis == o.axis && block_size == o.block_size &&
+        scale_dtype == o.scale_dtype && has_zero_point == o.has_zero_point;
+  }
+};
+
+/*
+ * Quantization parameter descriptor. Describes the type and granularity of
+ * the quantization scheme. Does not contain the actual scale and zero point
+ * data, as these are stored in the auxiliary storage on the tensor.
+ */
+using QuantParams = std::variant<
+    PerTensorQuantParams,
+    PerAxisQuantParams,
+    PerRowQuantParams,
+    PerBlockQuantParams>;
+
+QuantParams qint8_per_channel_sym(int8_t axis);
+QuantParams qint8_per_tensor_sym(float scale);
+QuantParams quint8_per_tensor_asym(float scale, int32_t zero_point);
+QuantParams quint8_per_row_asym(int8_t axis);
+QuantParams quint8_per_token_asym();
+QuantParams qint4_blockwise_sym(int8_t axis, int32_t block_size);
+
+/*
+ * Returns true if the given dtype is quantized. Quantized types
+ * require additional metadata to interpret.
+ */
+bool is_quantized(DType dtype);
+
+/*
+ * Returns true if the dtype's elements are smaller than a byte (e.g. packed
+ * 4-bit), and so are not individually byte-addressable.
+ */
+bool is_subbyte(DType dtype);
+
+/*
+ * Returns the size in bytes of a single element. Precondition: the dtype is
+ * byte-aligned (!is_subbyte); sub-byte types have no whole-byte stride.
+ */
+size_t byte_stride(DType dtype);
+
+/*
+ * Returns true if the given quant params have a zero point.
+ */
+bool is_asymmetric(const QuantParams& params);
+
+/*
+ * Returns the number of auxiliary storage buffers required to
+ * store the parameters (scales + zero points) for the given quant
+ * scheme.
+ */
+uint8_t aux_buffer_count(DType dtype, const QuantParams& params);
+runtime::Result<std::vector<size_t>> compute_aux_storage_sizes(
+    runtime::Span<const uint64_t> sizes,
+    DType dtype,
+    const QuantParams& params);
+
+} // namespace executorch::backends::xnnpack::core
diff --git a/backends/xnnpack/runtime/core/tensor.cpp b/backends/xnnpack/runtime/core/tensor.cpp
new file mode 100644
index 00000000000..26dd1fb4733
--- /dev/null
+++ b/backends/xnnpack/runtime/core/tensor.cpp
@@ -0,0 +1,178 @@
+#include <executorch/backends/xnnpack/runtime/core/tensor.h>
+
+#include <c10/util/safe_numerics.h>
+#include <executorch/runtime/platform/log.h>
+
+#include <cstdint>
+#include <cstdlib>
+#include <utility>
+
+namespace executorch::backends::xnnpack::core {
+
+using executorch::runtime::Span;
+
+// Frees the buffer only when this Storage allocated it (StorageOwner::Self).
+Storage::~Storage() {
+  if (owner == StorageOwner::Self) {
+    std::free(data);
+  }
+}
+
+// Takes over `other`'s buffer and leaves `other` empty and non-owning so its
+// destructor does not free the transferred memory.
+Storage::Storage(Storage&& other) noexcept
+    : data(other.data), owner(other.owner), size_in_bytes(other.size_in_bytes) {
+  other.data = nullptr;
+  other.owner = StorageOwner::External;
+  other.size_in_bytes = 0;
+}
+
+// Releases any self-owned buffer, then takes over `other`'s buffer.
+Storage& Storage::operator=(Storage&& other) noexcept {
+  if (this != &other) {
+    if (owner == StorageOwner::Self) {
+      std::free(data);
+    }
+    data = other.data;
+    owner = other.owner;
+    size_in_bytes = other.size_in_bytes;
+    other.data = nullptr;
+    other.owner = StorageOwner::External;
+    other.size_in_bytes = 0;
+  }
+  return *this;
+}
+
+// Allocates `size_in_bytes` bytes with malloc and returns a self-owning
+// Storage, or MemoryAllocationFailed if a non-empty allocation fails.
+runtime::Result<Storage> Storage::create_owned(size_t size_in_bytes) {
+  void* data = std::malloc(size_in_bytes);
+  ET_CHECK_OR_RETURN_ERROR(
+      data != nullptr || size_in_bytes == 0,
+      MemoryAllocationFailed,
+      "Failed to allocate %zu bytes for tensor storage",
+      size_in_bytes);
+
+  Storage s;
+  s.data = data;
+  s.owner = StorageOwner::Self;
+  s.size_in_bytes = size_in_bytes;
+  return s;
+}
+
+namespace {
+
+// True when `value` survives a round trip through size_t, i.e. it is not
+// truncated on targets where size_t is narrower than 64 bits.
+bool fits_in_size_t(uint64_t value) {
+  return static_cast<uint64_t>(static_cast<size_t>(value)) == value;
+}
+
+// Returns the product of `sizes` (1 for an empty shape), or InvalidArgument
+// if the running product stops fitting in size_t.
+runtime::Result<size_t> checked_num_elements(Span<const uint64_t> sizes) {
+  // Accumulate in uint64_t so no dimension is narrowed before it is checked.
+  uint64_t num_elements = 1;
+  for (size_t i = 0; i < sizes.size(); i++) {
+    uint64_t next = 0;
+    ET_CHECK_OR_RETURN_ERROR(
+        !c10::mul_overflows(num_elements, sizes[i], &next) &&
+            fits_in_size_t(next),
+        InvalidArgument,
+        "Overflow computing number of elements at dimension %zu",
+        i);
+    num_elements = next;
+  }
+  return static_cast<size_t>(num_elements);
+}
+
+} // namespace
+
+// Returns the product of `sizes`, or InvalidArgument on overflow.
+runtime::Result<size_t> Tensor::numel() const {
+  return checked_num_elements({sizes.data(), sizes.size()});
+}
+
+// Changes the shape to `new_sizes`. The current buffer is kept when it is
+// already large enough; otherwise a self-owned buffer is grown with realloc.
+// Growing a buffer this tensor does not own returns NotSupported.
+runtime::Error Tensor::resize(std::vector<uint64_t> new_sizes) {
+  ET_UNWRAP(
+      new_size_in_bytes,
+      compute_storage_size({new_sizes.data(), new_sizes.size()}, dtype));
+
+  // Fits in the existing allocation: only the shape changes.
+  if (new_size_in_bytes <= storage.size_in_bytes) {
+    sizes = std::move(new_sizes);
+    return runtime::Error::Ok;
+  }
+
+  ET_CHECK_OR_RETURN_ERROR(
+      storage.owner == StorageOwner::Self,
+      NotSupported,
+      "Cannot grow storage of a non-owned tensor");
+
+  void* new_data = std::realloc(storage.data, new_size_in_bytes);
+  ET_CHECK_OR_RETURN_ERROR(
+      new_data != nullptr,
+      MemoryAllocationFailed,
+      "Failed to reallocate %zu bytes during resize",
+      new_size_in_bytes);
+
+  storage.data = new_data;
+  storage.size_in_bytes = new_size_in_bytes;
+  sizes = std::move(new_sizes);
+  return runtime::Error::Ok;
+}
+
+// Returns the number of bytes needed to hold a tensor of shape `sizes` with
+// element type `dtype`, or InvalidArgument on overflow or an unknown dtype.
+runtime::Result<size_t> compute_storage_size(
+    Span<const uint64_t> sizes,
+    DType dtype) {
+  ET_UNWRAP(num_elements, checked_num_elements(sizes));
+
+  uint64_t bytes_per_element = 0;
+  switch (dtype) {
+    case DType::Int64:
+    case DType::UInt64:
+      bytes_per_element = 8;
+      break;
+    case DType::Float32:
+    case DType::QInt32:
+      bytes_per_element = 4;
+      break;
+    case DType::Float16:
+    case DType::BFloat16:
+      bytes_per_element = 2;
+      break;
+    case DType::QInt8:
+    case DType::QUInt8:
+      bytes_per_element = 1;
+      break;
+    case DType::QInt4:
+      // Two 4-bit elements per byte, rounded up (written to avoid overflow
+      // in the round-up).
+      return num_elements / 2 + (num_elements % 2);
+  }
+
+  if (bytes_per_element == 0) {
+    // No case matched, so `dtype` holds a value outside the enum.
+    ET_LOG(
+        Error,
+        "Unknown DType %d in compute_storage_size",
+        static_cast<int>(dtype));
+    return runtime::Error::InvalidArgument;
+  }
+
+  uint64_t bytes = 0;
+  ET_CHECK_OR_RETURN_ERROR(
+      !c10::mul_overflows(
+          static_cast<uint64_t>(num_elements), bytes_per_element, &bytes) &&
+          fits_in_size_t(bytes),
+      InvalidArgument,
+      "Overflow computing storage size in bytes");
+  return static_cast<size_t>(bytes);
+}
+
+} // namespace executorch::backends::xnnpack::core
diff --git a/backends/xnnpack/runtime/core/tensor.h b/backends/xnnpack/runtime/core/tensor.h
new file mode 100644
index 00000000000..c794a61f97f
--- /dev/null
+++ b/backends/xnnpack/runtime/core/tensor.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include <executorch/backends/xnnpack/runtime/core/dtype.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace executorch::backends::xnnpack::core {
+
+/*
+ * Identifies who is responsible for a Storage buffer. Only Self buffers are
+ * freed by Storage itself.
+ */
+enum class StorageOwner { Arena, External, Self };
+
+/*
+ * A move-only handle to a raw byte buffer. The buffer is released with
+ * std::free on destruction only when `owner` is StorageOwner::Self.
+ */
+struct Storage {
+  void* data = nullptr;
+  StorageOwner owner = StorageOwner::External;
+  size_t size_in_bytes = 0;
+
+  Storage() = default;
+  ~Storage();
+
+  Storage(const Storage&) = delete;
+  Storage& operator=(const Storage&) = delete;
+
+  Storage(Storage&& other) noexcept;
+  Storage& operator=(Storage&& other) noexcept;
+
+  /*
+   * Allocates a new self-owned buffer of `size_in_bytes` bytes. Returns
+   * MemoryAllocationFailed if the allocation fails.
+   */
+  static runtime::Result<Storage> create_owned(size_t size_in_bytes);
+};
+
+/*
+ * A dtype, a shape, the primary data buffer, and any auxiliary buffers.
+ */
+struct Tensor {
+  // Defaulted so a default-constructed Tensor never holds an indeterminate
+  // dtype.
+  DType dtype = DType::Float32;
+  std::vector<uint64_t> sizes;
+  Storage storage;
+  std::vector<Storage> aux_storage;
+
+  // Typed read-only view of the primary buffer; no dtype check is performed.
+  template <class T>
+  const T* data_const() const {
+    return static_cast<const T*>(storage.data);
+  }
+  // Typed mutable view of the primary buffer; no dtype check is performed.
+  template <class T>
+  T* data_mut() {
+    return static_cast<T*>(storage.data);
+  }
+
+  // Number of elements (product of `sizes`); errors on overflow.
+  runtime::Result<size_t> numel() const;
+  /*
+   * Changes the shape to `new_sizes`. The existing buffer is reused when it
+   * is large enough; otherwise a self-owned buffer is grown with realloc and
+   * a non-self-owned buffer yields NotSupported.
+   */
+  runtime::Error resize(std::vector<uint64_t> new_sizes);
+};
+
+/*
+ * Returns the number of bytes needed to store a tensor of shape `sizes` and
+ * element type `dtype`, or InvalidArgument on overflow.
+ */
+runtime::Result<size_t> compute_storage_size(
+    runtime::Span<const uint64_t> sizes,
+    DType dtype);
+
+} // namespace executorch::backends::xnnpack::core
diff --git a/backends/xnnpack/runtime/core/variant_util.h b/backends/xnnpack/runtime/core/variant_util.h
new file mode 100644
index 00000000000..ce73d8f6961
--- /dev/null
+++ b/backends/xnnpack/runtime/core/variant_util.h
@@ -0,0 +1,8 @@
+#pragma once
+
+template <class... Ts>
+struct overloaded : Ts... { // Callable that inherits every Ts::operator().
+  using Ts::operator()...; // Merge them into a single overload set.
+};
+template <class... Ts>
+overloaded(Ts...) -> overloaded<Ts...>; // Deduction guide for overloaded{...}.
diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt
index 3d9c77d6ad6..667cd2580b6 100644
--- a/backends/xnnpack/test/CMakeLists.txt
+++ b/backends/xnnpack/test/CMakeLists.txt
@@ -40,3 +40,24 @@ target_include_directories(
           ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
           ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
 )
+
+# Graph runtime unit tests.
+set(_graph_runtime_test_srcs runtime/test_quant_params.cpp)
+
+et_cxx_test(
+  backends_xnnpack_graph_runtime_test
+  SOURCES
+  ${_graph_runtime_test_srcs}
+  EXTRA_LIBS
+  xnnpack_backend
+  XNNPACK
+  pthreadpool
+  cpuinfo
+  xnnpack-microkernels-prod
+)
+target_include_directories(
+  backends_xnnpack_graph_runtime_test
+  PRIVATE ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/XNNPACK/include
+          ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
+          ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
+)
diff --git a/backends/xnnpack/test/runtime/test_quant_params.cpp b/backends/xnnpack/test/runtime/test_quant_params.cpp
new file mode 100644
index 00000000000..8db0887fcc6
--- /dev/null
+++ b/backends/xnnpack/test/runtime/test_quant_params.cpp
@@ -0,0 +1,354 @@
+#include <gtest/gtest.h>
+
+#include <executorch/backends/xnnpack/runtime/core/dtype.h>
+#include <executorch/backends/xnnpack/runtime/core/quant_params.h>
+#include <executorch/backends/xnnpack/runtime/core/tensor.h>
+
+using namespace executorch::backends::xnnpack::core;
+
+// --- is_quantized ---
+
+TEST(TestQuantParams, is_quantized_float) {
+  EXPECT_FALSE(is_quantized(DType::Float32));
+}
+
+TEST(TestQuantParams, is_quantized_qint8sym) {
+  EXPECT_TRUE(is_quantized(DType::QInt8));
+}
+
+TEST(TestQuantParams, is_quantized_qint4sym) {
+  EXPECT_TRUE(is_quantized(DType::QInt4));
+}
+
+TEST(TestQuantParams, is_quantized_quint8asym) {
+  EXPECT_TRUE(is_quantized(DType::QUInt8));
+}
+
+TEST(TestQuantParams, is_quantized_nonquantized_types) {
+  EXPECT_FALSE(is_quantized(DType::Float16));
+  EXPECT_FALSE(is_quantized(DType::BFloat16));
+  EXPECT_FALSE(is_quantized(DType::Int64));
+  EXPECT_FALSE(is_quantized(DType::UInt64));
+}
+
+// --- is_asymmetric (now derived from QuantParams, not DType) ---
+
+TEST(TestQuantParams, is_asymmetric_sym) {
+  EXPECT_FALSE(is_asymmetric(qint8_per_channel_sym(0)));
+  EXPECT_FALSE(is_asymmetric(qint8_per_tensor_sym(0.5f)));
+  EXPECT_FALSE(is_asymmetric(qint4_blockwise_sym(1, 32)));
+}
+
+TEST(TestQuantParams, is_asymmetric_asym) {
+  EXPECT_TRUE(is_asymmetric(quint8_per_tensor_asym(0.25f, 128)));
+  EXPECT_TRUE(is_asymmetric(quint8_per_row_asym(-1)));
+  EXPECT_TRUE(is_asymmetric(quint8_per_token_asym()));
+}
+
+// --- is_subbyte / byte_stride ---
+
+TEST(TestQuantParams, is_subbyte) {
+  EXPECT_TRUE(is_subbyte(DType::QInt4));
+  EXPECT_FALSE(is_subbyte(DType::Float32));
+  EXPECT_FALSE(is_subbyte(DType::Float16));
+  EXPECT_FALSE(is_subbyte(DType::BFloat16));
+  EXPECT_FALSE(is_subbyte(DType::Int64));
+  EXPECT_FALSE(is_subbyte(DType::UInt64));
+  EXPECT_FALSE(is_subbyte(DType::QInt8));
+  EXPECT_FALSE(is_subbyte(DType::QUInt8));
+  EXPECT_FALSE(is_subbyte(DType::QInt32));
+}
+
+TEST(TestQuantParams, byte_stride) {
+  EXPECT_EQ(byte_stride(DType::QInt8), 1);
+  EXPECT_EQ(byte_stride(DType::QUInt8), 1);
+  EXPECT_EQ(byte_stride(DType::Float16), 2);
+  EXPECT_EQ(byte_stride(DType::BFloat16), 2);
+  EXPECT_EQ(byte_stride(DType::Float32), 4);
+  EXPECT_EQ(byte_stride(DType::QInt32), 4);
+  EXPECT_EQ(byte_stride(DType::Int64), 8);
+  EXPECT_EQ(byte_stride(DType::UInt64), 8);
+}
+
+// --- compute_storage_size ---
+
+TEST(TestQuantParams, storage_size_qint8sym) {
+  const uint64_t sizes[] = {4, 8};
+  auto r = compute_storage_size(sizes, DType::QInt8);
+  ASSERT_TRUE(r.ok());
+  EXPECT_EQ(r.get(), 32);
+}
+
+TEST(TestQuantParams, storage_size_quint8asym) {
+  const uint64_t sizes[] = {2, 5};
+  auto r = compute_storage_size(sizes, DType::QUInt8);
+  ASSERT_TRUE(r.ok());
+  EXPECT_EQ(r.get(), 10);
+}
+
+TEST(TestQuantParams, storage_size_float16) {
+  const uint64_t sizes[] = {4, 8};
+  auto r = compute_storage_size(sizes, DType::Float16);
+  ASSERT_TRUE(r.ok());
+  EXPECT_EQ(r.get(), 64); // 32 elements * 2 bytes
+}
+
+TEST(TestQuantParams, storage_size_int64) {
+  const uint64_t sizes[] = {4, 8};
+  auto r = compute_storage_size(sizes, DType::Int64);
+  ASSERT_TRUE(r.ok());
+  EXPECT_EQ(r.get(), 256); // 32 elements * 8 bytes
+}
+
+TEST(TestQuantParams, storage_size_qint4sym_even) {
+  const uint64_t sizes[] = {2, 4};
+  auto r = compute_storage_size(sizes, DType::QInt4);
+  ASSERT_TRUE(r.ok());
+  EXPECT_EQ(r.get(), 4);
+}
+
+TEST(TestQuantParams, storage_size_qint4sym_odd) {
+  const uint64_t sizes[] = {7};
+  auto r = compute_storage_size(sizes, DType::QInt4);
+  ASSERT_TRUE(r.ok());
+  EXPECT_EQ(r.get(), 4);
+}
+
+TEST(TestQuantParams, storage_size_qint4sym_one) {
+  const uint64_t sizes[] = {1};
+  auto r = compute_storage_size(sizes, DType::QInt4);
+  ASSERT_TRUE(r.ok());
+  EXPECT_EQ(r.get(), 1);
+}
+
+TEST(TestQuantParams, storage_size_overflow_returns_error) {
+  const uint64_t sizes[] = {SIZE_MAX, 2};
+  auto r = compute_storage_size(sizes, DType::QInt8);
+  EXPECT_FALSE(r.ok());
+  EXPECT_EQ(r.error(), executorch::runtime::Error::InvalidArgument);
+}
+
+TEST(TestQuantParams, storage_size_byte_overflow_returns_error) {
+  // num_elements fits in size_t but num_elements * 4 overflows.
+  const uint64_t sizes[] = {SIZE_MAX / 2};
+  auto r = compute_storage_size(sizes, DType::Float32);
+  EXPECT_FALSE(r.ok());
+  EXPECT_EQ(r.error(), executorch::runtime::Error::InvalidArgument);
+}
+
+// --- Preset factories ---
+
+TEST(TestQuantParams, preset_qint8_per_channel_sym) {
+  auto p = qint8_per_channel_sym(0);
+  auto* pa = std::get_if<PerAxisQuantParams>(&p);
+  ASSERT_NE(pa, nullptr);
+  EXPECT_EQ(pa->axis, 0);
+  EXPECT_EQ(pa->scale_dtype, DType::Float32);
+  EXPECT_FALSE(pa->has_zero_point);
+}
+
+TEST(TestQuantParams, preset_qint8_per_tensor_sym) {
+  auto p = qint8_per_tensor_sym(0.5f);
+  auto* pt = std::get_if<PerTensorQuantParams>(&p);
+  ASSERT_NE(pt, nullptr);
+  EXPECT_FLOAT_EQ(pt->scale, 0.5f);
+  EXPECT_EQ(pt->zero_point, 0);
+  EXPECT_FALSE(pt->has_zero_point);
+}
+
+TEST(TestQuantParams, preset_quint8_per_tensor_asym) {
+  auto p = quint8_per_tensor_asym(0.25f, 128);
+  auto* pt = std::get_if<PerTensorQuantParams>(&p);
+  ASSERT_NE(pt, nullptr);
+  EXPECT_FLOAT_EQ(pt->scale, 0.25f);
+  EXPECT_EQ(pt->zero_point, 128);
+  EXPECT_TRUE(pt->has_zero_point);
+}
+
+TEST(TestQuantParams, preset_quint8_per_row_asym) {
+  auto p = quint8_per_row_asym(1);
+  auto* pr = std::get_if<PerRowQuantParams>(&p);
+  ASSERT_NE(pr, nullptr);
+  EXPECT_EQ(pr->axis, 1);
+  EXPECT_EQ(pr->scale_dtype, DType::Float32);
+  EXPECT_TRUE(pr->has_zero_point);
+}
+
+TEST(TestQuantParams, preset_quint8_per_token_asym) {
+  auto p = quint8_per_token_asym();
+  auto* pr = std::get_if<PerRowQuantParams>(&p);
+  ASSERT_NE(pr, nullptr);
+  EXPECT_EQ(pr->axis, -1);
+  EXPECT_TRUE(pr->has_zero_point);
+}
+
+TEST(TestQuantParams, preset_qint4_blockwise_sym) {
+  auto p = qint4_blockwise_sym(1, 32);
+  auto* pb = std::get_if<PerBlockQuantParams>(&p);
+  ASSERT_NE(pb, nullptr);
+  EXPECT_EQ(pb->axis, 1);
+  EXPECT_EQ(pb->block_size, 32);
+  EXPECT_EQ(pb->scale_dtype, DType::Float32);
+}
+
+// --- aux_buffer_count ---
+
+TEST(TestQuantParams, aux_buffer_count_float) {
+  QuantParams dummy = PerTensorQuantParams{};
+  EXPECT_EQ(aux_buffer_count(DType::Float32, dummy), 0);
+}
+
+TEST(TestQuantParams, aux_buffer_count_sym) {
+  auto p = qint8_per_channel_sym(0);
+  EXPECT_EQ(aux_buffer_count(DType::QInt8, p), 1);
+}
+
+TEST(TestQuantParams, aux_buffer_count_asym) {
+  auto p = quint8_per_tensor_asym(1.0f, 0);
+  EXPECT_EQ(aux_buffer_count(DType::QUInt8, p), 2);
+}
+
+// --- compute_aux_storage_sizes ---
+
+TEST(TestQuantParams, aux_sizes_per_tensor_sym) {
+  auto p = qint8_per_tensor_sym(1.0f);
+  const uint64_t shape[] = {4, 8};
+  auto sizes = compute_aux_storage_sizes(shape, DType::QInt8, p).get();
+  ASSERT_EQ(sizes.size(), 1);
+  EXPECT_EQ(sizes[0], sizeof(float)); // 1 scale, float32
+}
+
+TEST(TestQuantParams, aux_sizes_per_axis_keep_axis0) {
+  // [4, 8], keep axis=0 -> one scale per index along axis 0 -> 4 scales.
+  auto p = qint8_per_channel_sym(0);
+  const uint64_t shape[] = {4, 8};
+  auto sizes = compute_aux_storage_sizes(shape, DType::QInt8, p).get();
+  ASSERT_EQ(sizes.size(), 1);
+  EXPECT_EQ(sizes[0], 4 * sizeof(float));
+}
+
+TEST(TestQuantParams, aux_sizes_per_axis_keep_axis1) {
+  // [4, 8], keep axis=1 -> 8 scales.
+  auto p = qint8_per_channel_sym(1);
+  const uint64_t shape[] = {4, 8};
+  auto sizes = compute_aux_storage_sizes(shape, DType::QInt8, p).get();
+  ASSERT_EQ(sizes.size(), 1);
+  EXPECT_EQ(sizes[0], 8 * sizeof(float));
+}
+
+TEST(TestQuantParams, aux_sizes_per_channel_conv3d) {
+  // conv3d weight [out=4, in=8, kT=3, kH=3, kW=3], per-output-channel keeps
+  // axis 0 and reduces the rest -> one scale per output channel -> 4.
+  auto p = qint8_per_channel_sym(0);
+  const uint64_t shape[] = {4, 8, 3, 3, 3};
+  auto sizes = compute_aux_storage_sizes(shape, DType::QInt8, p).get();
+  ASSERT_EQ(sizes.size(), 1);
+  EXPECT_EQ(sizes[0], 4 * sizeof(float));
+}
+
+TEST(TestQuantParams, aux_sizes_per_row_asym_2d) {
+  // [4, 8], per-token reduces the last dim -> one scale per row -> 4 scales
+  // + 4 zero_points.
+  auto p = quint8_per_token_asym();
+  const uint64_t shape[] = {4, 8};
+  auto sizes = compute_aux_storage_sizes(shape, DType::QUInt8, p).get();
+  ASSERT_EQ(sizes.size(), 2);
+  EXPECT_EQ(sizes[0], 4 * sizeof(float)); // scales
+  EXPECT_EQ(sizes[1], 4 * sizeof(int32_t)); // zero_points
+}
+
+TEST(TestQuantParams, aux_sizes_per_token_3d) {
+  // [batch=2, seqlen=3, features=8], per-token reduces the last dim ->
+  // one scale per [batch, seqlen] combo -> 2*3 = 6 scales.
+  auto p = quint8_per_token_asym();
+  const uint64_t shape[] = {2, 3, 8};
+  auto sizes = compute_aux_storage_sizes(shape, DType::QUInt8, p).get();
+  ASSERT_EQ(sizes.size(), 2);
+  EXPECT_EQ(sizes[0], 6 * sizeof(float)); // scales
+  EXPECT_EQ(sizes[1], 6 * sizeof(int32_t)); // zero_points
+}
+
+TEST(TestQuantParams, aux_sizes_per_row_explicit_dim) {
+  // [2, 3, 8], reduce dim 1 -> keep dims 0 and 2 -> 2*8 = 16 scales.
+  auto p = quint8_per_row_asym(1);
+  const uint64_t shape[] = {2, 3, 8};
+  auto sizes = compute_aux_storage_sizes(shape, DType::QUInt8, p).get();
+  ASSERT_EQ(sizes.size(), 2);
+  EXPECT_EQ(sizes[0], 16 * sizeof(float));
+  EXPECT_EQ(sizes[1], 16 * sizeof(int32_t));
+}
+
+TEST(TestQuantParams, aux_sizes_per_row_negative_dim) {
+  // dim=-1 on [2, 3, 8] reduces the last dim -> 6 scales (same as per-token).
+  auto p = quint8_per_row_asym(-1);
+  const uint64_t shape[] = {2, 3, 8};
+  auto sizes = compute_aux_storage_sizes(shape, DType::QUInt8, p).get();
+  ASSERT_EQ(sizes.size(), 2);
+  EXPECT_EQ(sizes[0], 6 * sizeof(float));
+}
+
+TEST(TestQuantParams, aux_sizes_per_row_dim_out_of_range_errors) {
+  // dim=3 is invalid for a 3-dim tensor.
+  auto p = quint8_per_row_asym(3);
+  const uint64_t shape[] = {2, 3, 8};
+  auto result = compute_aux_storage_sizes(shape, DType::QUInt8, p);
+  EXPECT_FALSE(result.ok());
+  EXPECT_EQ(result.error(), executorch::runtime::Error::InvalidArgument);
+}
+
+TEST(TestQuantParams, aux_sizes_per_row_negative_dim_out_of_range_errors) {
+  // dim=-4 resolves to -1 for a 3-dim tensor, which is invalid.
+  auto p = quint8_per_row_asym(-4);
+  const uint64_t shape[] = {2, 3, 8};
+  auto result = compute_aux_storage_sizes(shape, DType::QUInt8, p);
+  EXPECT_FALSE(result.ok());
+  EXPECT_EQ(result.error(), executorch::runtime::Error::InvalidArgument);
+}
+
+TEST(TestQuantParams, aux_sizes_blockwise_sym) {
+  // [4, 128], blockwise along axis=1, block_size=32
+  // num_blocks = 128/32 = 4
+  // other_dims = 4 (axis=1, so dim 0 contributes)
+  // total scales = 4 * 4 = 16
+  auto p = qint4_blockwise_sym(1, 32);
+  const uint64_t shape[] = {4, 128};
+  auto sizes = compute_aux_storage_sizes(shape, DType::QInt4, p).get();
+  ASSERT_EQ(sizes.size(), 1);
+  EXPECT_EQ(sizes[0], 16 * sizeof(float));
+}
+
+TEST(TestQuantParams, aux_sizes_per_block_not_divisible_errors) {
+  // [4, 100], block_size=32 along axis=1 does not evenly divide 100.
+  auto p = qint4_blockwise_sym(1, 32);
+  const uint64_t shape[] = {4, 100};
+  auto result = compute_aux_storage_sizes(shape, DType::QInt4, p);
+  EXPECT_FALSE(result.ok());
+  EXPECT_EQ(result.error(), executorch::runtime::Error::InvalidArgument);
+}
+
+// --- compute_aux_storage_sizes validation ---
+
+TEST(TestQuantParams, aux_sizes_axis_out_of_range_errors) {
+  // axis=2 is invalid for a 2-dim tensor.
+  auto p = qint8_per_channel_sym(2);
+  const uint64_t shape[] = {4, 8};
+  auto result = compute_aux_storage_sizes(shape, DType::QInt8, p);
+  EXPECT_FALSE(result.ok());
+  EXPECT_EQ(result.error(), executorch::runtime::Error::InvalidArgument);
+}
+
+TEST(TestQuantParams, aux_sizes_negative_axis_errors) {
+  auto p = qint8_per_channel_sym(-1);
+  const uint64_t shape[] = {4, 8};
+  auto result = compute_aux_storage_sizes(shape, DType::QInt8, p);
+  EXPECT_FALSE(result.ok());
+  EXPECT_EQ(result.error(), executorch::runtime::Error::InvalidArgument);
+}
+
+TEST(TestQuantParams, aux_sizes_zero_block_size_errors) {
+  auto p = qint4_blockwise_sym(1, 0);
+  const uint64_t shape[] = {4, 128};
+  auto result = compute_aux_storage_sizes(shape, DType::QInt4, p);
+  EXPECT_FALSE(result.ok());
+  EXPECT_EQ(result.error(), executorch::runtime::Error::InvalidArgument);
+}

From 7cc1262c33ab2ceb7fb22ecf1a9a9a9c525bb881 Mon Sep 17 00:00:00 2001
From: Gregory Comer <gjcomer@meta.com>
Date: Fri, 12 Jun 2026 15:33:55 -0700
Subject: [PATCH 2/3] Update

[ghstack-poisoned]
---
 backends/xnnpack/CMakeLists.txt                    | 5 -----
 backends/xnnpack/targets.bzl                       | 5 +++++
 shim_et/xplat/executorch/build/build_variables.bzl | 2 ++
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt
index 81453e80d2f..cd0d945a84f 100644
--- a/backends/xnnpack/CMakeLists.txt
+++ b/backends/xnnpack/CMakeLists.txt
@@ -100,11 +100,6 @@ set(xnnpack_third_party pthreadpool extension_threadpool cpuinfo)
 
 include(cmake/Dependencies.cmake)
 
-# Graph runtime sources.
-list(APPEND _xnnpack_backend__srcs backends/xnnpack/runtime/core/tensor.cpp
-     backends/xnnpack/runtime/core/quant_params.cpp
-)
-
 list(TRANSFORM _xnnpack_backend__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(xnnpack_backend ${_xnnpack_backend__srcs})
 target_link_libraries(
diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl
index b3af589df10..4bafb6c4c5f 100644
--- a/backends/xnnpack/targets.bzl
+++ b/backends/xnnpack/targets.bzl
@@ -41,6 +41,11 @@ def define_common_targets():
             headers = native.glob([
                 "runtime/*.h",
                 "runtime/profiling/*.h",
+                "runtime/core/*.h",
+                "runtime/graph/*.h",
+                "runtime/operators/*.h",
+                "runtime/executor/*.h",
+                "runtime/plan/*.h",
             ]),
             visibility = ["PUBLIC"],
             preprocessor_flags = [
diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl
index 659a128994f..4a8cc479075 100644
--- a/shim_et/xplat/executorch/build/build_variables.bzl
+++ b/shim_et/xplat/executorch/build/build_variables.bzl
@@ -480,6 +480,8 @@ XNNPACK_BACKEND_BUCK_SRCS = [
     "runtime/XNNWorkspaceManager.cpp",
     "runtime/XnnpackBackendOptions.cpp",
     "runtime/profiling/XNNProfiler.cpp",
+    "runtime/core/tensor.cpp",
+    "runtime/core/quant_params.cpp",
 ]
 
 XNNPACK_BACKEND_SRCS = ["backends/xnnpack/" + x for x in XNNPACK_BACKEND_BUCK_SRCS]

From 1b00ce36d43bbce970ac3df721175250a29e6970 Mon Sep 17 00:00:00 2001
From: Gregory Comer <gjcomer@meta.com>
Date: Fri, 12 Jun 2026 22:19:00 -0700
Subject: [PATCH 3/3] Update

[ghstack-poisoned]
---
 backends/xnnpack/runtime/core/quant_params.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/backends/xnnpack/runtime/core/quant_params.h b/backends/xnnpack/runtime/core/quant_params.h
index c0319250674..7922015512e 100644
--- a/backends/xnnpack/runtime/core/quant_params.h
+++ b/backends/xnnpack/runtime/core/quant_params.h
@@ -67,10 +67,15 @@ struct PerRowQuantParams {
   int8_t axis = -1;
   DType scale_dtype = DType::Float32;
   bool has_zero_point = false;
+  // When true, this is a dynamically-quantized activation (XNNPACK qdint8):
+  // the per-row scale/zero point are computed at runtime rather than stored.
+  // `axis` is the reduced (channel) dim, so the number of trailing "row" dims
+  // (XNNPACK's num_nonbatch_dims) is -axis for the usual negative axis.
+  bool is_dynamic = false;
 
   bool operator==(const PerRowQuantParams& o) const {
     return axis == o.axis && scale_dtype == o.scale_dtype &&
-        has_zero_point == o.has_zero_point;
+        has_zero_point == o.has_zero_point && is_dynamic == o.is_dynamic;
   }
 };