From d9c6c0adccfc36b2ede458eaae6e767b7b434272 Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Fri, 12 Jun 2026 14:29:53 -0700 Subject: [PATCH 1/3] Update [ghstack-poisoned] --- backends/xnnpack/CMakeLists.txt | 5 + backends/xnnpack/runtime/core/dtype.h | 24 ++ .../xnnpack/runtime/core/quant_params.cpp | 202 ++++++++++ backends/xnnpack/runtime/core/quant_params.h | 150 ++++++++ backends/xnnpack/runtime/core/tensor.cpp | 152 ++++++++ backends/xnnpack/runtime/core/tensor.h | 56 +++ backends/xnnpack/runtime/core/variant_util.h | 8 + backends/xnnpack/test/CMakeLists.txt | 21 ++ .../test/runtime/test_quant_params.cpp | 354 ++++++++++++++++++ 9 files changed, 972 insertions(+) create mode 100644 backends/xnnpack/runtime/core/dtype.h create mode 100644 backends/xnnpack/runtime/core/quant_params.cpp create mode 100644 backends/xnnpack/runtime/core/quant_params.h create mode 100644 backends/xnnpack/runtime/core/tensor.cpp create mode 100644 backends/xnnpack/runtime/core/tensor.h create mode 100644 backends/xnnpack/runtime/core/variant_util.h create mode 100644 backends/xnnpack/test/runtime/test_quant_params.cpp diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index cd0d945a84f..81453e80d2f 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -100,6 +100,11 @@ set(xnnpack_third_party pthreadpool extension_threadpool cpuinfo) include(cmake/Dependencies.cmake) +# Graph runtime sources. 
+list(APPEND _xnnpack_backend__srcs backends/xnnpack/runtime/core/tensor.cpp + backends/xnnpack/runtime/core/quant_params.cpp +) + list(TRANSFORM _xnnpack_backend__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(xnnpack_backend ${_xnnpack_backend__srcs}) target_link_libraries( diff --git a/backends/xnnpack/runtime/core/dtype.h b/backends/xnnpack/runtime/core/dtype.h new file mode 100644 index 00000000000..d3ac48920f7 --- /dev/null +++ b/backends/xnnpack/runtime/core/dtype.h @@ -0,0 +1,24 @@ +#pragma once + +namespace executorch::backends::xnnpack::core { + +enum class DType { + // Floating point + Float32, + Float16, + BFloat16, + + // Non-quantized integer + Int64, + UInt64, + + // Quantized — signed + QInt8, + QInt4, + QInt32, + + // Quantized — unsigned + QUInt8, +}; + +} // namespace executorch::backends::xnnpack::core diff --git a/backends/xnnpack/runtime/core/quant_params.cpp b/backends/xnnpack/runtime/core/quant_params.cpp new file mode 100644 index 00000000000..c9abdeaf24e --- /dev/null +++ b/backends/xnnpack/runtime/core/quant_params.cpp @@ -0,0 +1,202 @@ +#include + +#include +#include +#include + +#include + +namespace executorch::backends::xnnpack::core { + +using executorch::runtime::Span; + +QuantParams qint8_per_channel_sym(int8_t axis) { + return PerAxisQuantParams{.axis = axis, .has_zero_point = false}; +} + +QuantParams qint8_per_tensor_sym(float scale) { + return PerTensorQuantParams{ + .scale = scale, .zero_point = 0, .has_zero_point = false}; +} + +QuantParams quint8_per_tensor_asym(float scale, int32_t zero_point) { + return PerTensorQuantParams{ + .scale = scale, .zero_point = zero_point, .has_zero_point = true}; +} + +QuantParams quint8_per_row_asym(int8_t axis) { + return PerRowQuantParams{.axis = axis, .has_zero_point = true}; +} + +QuantParams quint8_per_token_asym() { + return PerRowQuantParams{.axis = -1, .has_zero_point = true}; +} + +QuantParams qint4_blockwise_sym(int8_t axis, int32_t block_size) { + return PerBlockQuantParams{ + 
.axis = axis, .block_size = block_size, .has_zero_point = false}; +} + +bool is_quantized(DType dtype) { + switch (dtype) { + case DType::Float32: + case DType::Float16: + case DType::BFloat16: + case DType::Int64: + case DType::UInt64: + return false; + case DType::QInt8: + case DType::QInt4: + case DType::QInt32: + case DType::QUInt8: + return true; + } +} + +bool is_subbyte(DType dtype) { + switch (dtype) { + case DType::QInt4: + return true; + case DType::Float32: + case DType::Float16: + case DType::BFloat16: + case DType::Int64: + case DType::UInt64: + case DType::QInt8: + case DType::QInt32: + case DType::QUInt8: + return false; + } +} + +size_t byte_stride(DType dtype) { + switch (dtype) { + case DType::QInt8: + case DType::QUInt8: + return 1; + case DType::Float16: + case DType::BFloat16: + return 2; + case DType::Float32: + case DType::QInt32: + return 4; + case DType::Int64: + case DType::UInt64: + return 8; + case DType::QInt4: + // Sub-byte; no whole-byte stride. Guard callers with is_subbyte(). + abort(); + } +} + +bool is_asymmetric(const QuantParams& params) { + return std::visit([](const auto& p) { return p.has_zero_point; }, params); +} + +uint8_t aux_buffer_count(DType dtype, const QuantParams& params) { + if (!is_quantized(dtype)) + return 0; + + uint8_t count = 1; // scales + if (is_asymmetric(params)) + count++; // zero_points + return count; +} + +static runtime::Result scale_element_count( + Span sizes, + const QuantParams& params) { + return std::visit( + overloaded{ + [](const PerTensorQuantParams&) -> runtime::Result { + return 1; + }, + [&](const PerAxisQuantParams& p) -> runtime::Result { + ET_CHECK_OR_RETURN_ERROR( + p.axis >= 0 && static_cast(p.axis) < sizes.size(), + InvalidArgument, + "Per-axis quant axis %d is out of range for a %zu-dim tensor", + static_cast(p.axis), + sizes.size()); + return sizes[p.axis]; + }, + [&](const PerRowQuantParams& p) -> runtime::Result { + int rank = static_cast(sizes.size()); + int axis = p.axis < 0 ? 
p.axis + rank : p.axis; + ET_CHECK_OR_RETURN_ERROR( + axis >= 0 && axis < rank, + InvalidArgument, + "Per-row quant axis %d is out of range for a %d-dim tensor", + static_cast(p.axis), + rank); + size_t count = 1; + for (size_t i = 0; i < sizes.size(); i++) { + if (i != static_cast(axis)) + count *= sizes[i]; + } + return count; + }, + [&](const PerBlockQuantParams& p) -> runtime::Result { + ET_CHECK_OR_RETURN_ERROR( + p.axis >= 0 && static_cast(p.axis) < sizes.size(), + InvalidArgument, + "Per-block quant axis %d is out of range for a %zu-dim tensor", + static_cast(p.axis), + sizes.size()); + ET_CHECK_OR_RETURN_ERROR( + p.block_size > 0, + InvalidArgument, + "Per-block quant block_size must be positive, got %d", + p.block_size); + auto axis = static_cast(p.axis); + ET_CHECK_OR_RETURN_ERROR( + sizes[axis] % static_cast(p.block_size) == 0, + InvalidArgument, + "Per-block quant block_size %d must evenly divide axis %d (size %zu)", + p.block_size, + static_cast(p.axis), + static_cast(sizes[axis])); + size_t num_blocks = sizes[axis] / p.block_size; + size_t other_dims = 1; + for (size_t i = 0; i < sizes.size(); i++) { + if (i != axis) + other_dims *= sizes[i]; + } + return num_blocks * other_dims; + }, + }, + params); +} + +static DType scale_dtype_of(const QuantParams& params) { + return std::visit( + overloaded{ + [](const PerTensorQuantParams& p) { return p.scale_dtype; }, + [](const PerAxisQuantParams& p) { return p.scale_dtype; }, + [](const PerRowQuantParams& p) { return p.scale_dtype; }, + [](const PerBlockQuantParams& p) { return p.scale_dtype; }, + }, + params); +} + +runtime::Result> compute_aux_storage_sizes( + Span sizes, + DType dtype, + const QuantParams& params) { + std::vector result; + + ET_UNWRAP(num_scales, scale_element_count(sizes, params)); + const uint64_t scale_shape[] = {static_cast(num_scales)}; + ET_UNWRAP( + scale_bytes, compute_storage_size(scale_shape, scale_dtype_of(params))); + result.push_back(scale_bytes); + + if 
(is_asymmetric(params)) { + auto zp_bytes = num_scales * sizeof(int32_t); + result.push_back(zp_bytes); + } + + return result; +} + +} // namespace executorch::backends::xnnpack::core diff --git a/backends/xnnpack/runtime/core/quant_params.h b/backends/xnnpack/runtime/core/quant_params.h new file mode 100644 index 00000000000..c0319250674 --- /dev/null +++ b/backends/xnnpack/runtime/core/quant_params.h @@ -0,0 +1,150 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +/* + * This file contains types and methods related to quantization parameters. + * Quant params, in combination with dtype, should provide enough information + * to interpret raw tensor memory and inform kernel dispatch. + */ + +namespace executorch::backends::xnnpack::core { + +/* + * Represents quantization parameters for per-tensor quantization. This means + * that there is a single scale and zero point for the entire tensor. + * + * For a tensor of shape [A, B, C], this is equivalent to a block size of + * [A, B, C]. + */ +struct PerTensorQuantParams { + DType scale_dtype = DType::Float32; + float scale = 0.0f; + int32_t zero_point = 0; + bool has_zero_point = false; + + bool operator==(const PerTensorQuantParams& o) const { + return scale_dtype == o.scale_dtype && scale == o.scale && + zero_point == o.zero_point && has_zero_point == o.has_zero_point; + } +}; + +/* + * Represents per-axis quantization parameters. Scale and zero point are + * shared by all elements with the same index along the target axis. + * + * For a tensor of shape [A, B, C] and axis=1, this is equivalent to a block + * size of [A, 1, C] with a scale shape [1, B, 1]. 
+ */ +struct PerAxisQuantParams { + int8_t axis; + DType scale_dtype = DType::Float32; + bool has_zero_point = false; + + bool operator==(const PerAxisQuantParams& o) const { + return axis == o.axis && scale_dtype == o.scale_dtype && + has_zero_point == o.has_zero_point; + } +}; + +/* + * Represents per-row quantization parameters. Scale and zero point are + * shared by all elements with the same indices along non-target axes; `axis` + * is the reduced dim, negative values index from the end, and it defaults to + * -1 (the last dim, i.e. per-token). + * + * For a tensor of shape [A, B, C] and axis=1, this is equivalent to a block + * size of [1, B, 1] with a scale shape of [A, 1, C]. + */ +struct PerRowQuantParams { + int8_t axis = -1; + DType scale_dtype = DType::Float32; + bool has_zero_point = false; + + bool operator==(const PerRowQuantParams& o) const { + return axis == o.axis && scale_dtype == o.scale_dtype && + has_zero_point == o.has_zero_point; + } +}; + +/* + * Represents per-block quantization parameters. Elements are grouped along + * `axis` into groups of `block_size`. Elements within a group share a scale + * and zero point. The block size must evenly divide the input tensor shape + * along the target axis. + * + * For a tensor of shape [A, B, C] and axis=1, blocks are size + * [1, block_size, 1] with a scale shape of [A, B / block_size, C]. + */ +struct PerBlockQuantParams { + int8_t axis; + int32_t block_size; + DType scale_dtype = DType::Float32; + bool has_zero_point = false; + + bool operator==(const PerBlockQuantParams& o) const { + return axis == o.axis && block_size == o.block_size && + scale_dtype == o.scale_dtype && has_zero_point == o.has_zero_point; + } +}; + +/* + * Quantization parameter descriptor. Describes the type and granularity of + * the quantization scheme. Does not contain the actual scale and zero point + * data, as these are stored in the auxiliary storage on the tensor. 
+ */ +using QuantParams = std::variant< + PerTensorQuantParams, + PerAxisQuantParams, + PerRowQuantParams, + PerBlockQuantParams>; + +QuantParams qint8_per_channel_sym(int8_t axis); +QuantParams qint8_per_tensor_sym(float scale); +QuantParams quint8_per_tensor_asym(float scale, int32_t zero_point); +QuantParams quint8_per_row_asym(int8_t axis); +QuantParams quint8_per_token_asym(); +QuantParams qint4_blockwise_sym(int8_t axis, int32_t block_size); + +/* + * Returns true if the given dtype is quantized. Quantized types + * require additional metadata to interpret. + */ +bool is_quantized(DType dtype); + +/* + * Returns true if the dtype's elements are smaller than a byte (e.g. packed + * 4-bit), and so are not individually byte-addressable. + */ +bool is_subbyte(DType dtype); + +/* + * Returns the size in bytes of a single element. Precondition: the dtype is + * byte-aligned (!is_subbyte); sub-byte types have no whole-byte stride. + */ +size_t byte_stride(DType dtype); + +/* + * Returns true if the given quant params have a zero point. + */ +bool is_asymmetric(const QuantParams& params); + +/* + * Returns the number of auxiliary storage buffers required to + * store the parameters (scales + zero points) for the given quant + * scheme. 
+ */ +uint8_t aux_buffer_count(DType dtype, const QuantParams& params); +runtime::Result> compute_aux_storage_sizes( + runtime::Span sizes, + DType dtype, + const QuantParams& params); + +} // namespace executorch::backends::xnnpack::core diff --git a/backends/xnnpack/runtime/core/tensor.cpp b/backends/xnnpack/runtime/core/tensor.cpp new file mode 100644 index 00000000000..26dd1fb4733 --- /dev/null +++ b/backends/xnnpack/runtime/core/tensor.cpp @@ -0,0 +1,152 @@ +#include + +#include +#include + +#include + +namespace executorch::backends::xnnpack::core { + +using executorch::runtime::Span; + +Storage::~Storage() { + if (owner == StorageOwner::Self) { + std::free(data); + } +} + +Storage::Storage(Storage&& other) noexcept + : data(other.data), owner(other.owner), size_in_bytes(other.size_in_bytes) { + other.data = nullptr; + other.owner = StorageOwner::External; + other.size_in_bytes = 0; +} + +Storage& Storage::operator=(Storage&& other) noexcept { + if (this != &other) { + if (owner == StorageOwner::Self) { + std::free(data); + } + data = other.data; + owner = other.owner; + size_in_bytes = other.size_in_bytes; + other.data = nullptr; + other.owner = StorageOwner::External; + other.size_in_bytes = 0; + } + return *this; +} + +runtime::Result Storage::create_owned(size_t size_in_bytes) { + void* data = std::malloc(size_in_bytes); + ET_CHECK_OR_RETURN_ERROR( + data != nullptr || size_in_bytes == 0, + MemoryAllocationFailed, + "Failed to allocate %zu bytes for tensor storage", + size_in_bytes); + + Storage s; + s.data = data; + s.owner = StorageOwner::Self; + s.size_in_bytes = size_in_bytes; + return s; +} + +namespace { +runtime::Result checked_num_elements(Span sizes) { + size_t num_elements = 1; + for (size_t i = 0; i < sizes.size(); i++) { + size_t next; + ET_CHECK_OR_RETURN_ERROR( + !c10::mul_overflows(num_elements, static_cast(sizes[i]), &next), + InvalidArgument, + "Overflow computing number of elements at dimension %zu", + i); + num_elements = next; + } + 
return num_elements; +} +} // namespace + +runtime::Result Tensor::numel() const { + return checked_num_elements({sizes.data(), sizes.size()}); +} + +runtime::Error Tensor::resize(std::vector new_sizes) { + ET_UNWRAP( + new_size_in_bytes, + compute_storage_size({new_sizes.data(), new_sizes.size()}, dtype)); + + if (new_size_in_bytes <= storage.size_in_bytes) { + sizes = std::move(new_sizes); + return runtime::Error::Ok; + } + + ET_CHECK_OR_RETURN_ERROR( + storage.owner == StorageOwner::Self, + NotSupported, + "Cannot grow storage of a non-owned tensor"); + + void* new_data = std::realloc(storage.data, new_size_in_bytes); + ET_CHECK_OR_RETURN_ERROR( + new_data != nullptr, + MemoryAllocationFailed, + "Failed to reallocate %zu bytes during resize", + new_size_in_bytes); + + storage.data = new_data; + storage.size_in_bytes = new_size_in_bytes; + sizes = std::move(new_sizes); + return runtime::Error::Ok; +} + +runtime::Result compute_storage_size( + Span sizes, + DType dtype) { + ET_UNWRAP(num_elements, checked_num_elements(sizes)); + + switch (dtype) { + case DType::Int64: + case DType::UInt64: { + size_t bytes; + ET_CHECK_OR_RETURN_ERROR( + !c10::mul_overflows(num_elements, size_t{8}, &bytes), + InvalidArgument, + "Overflow computing storage size in bytes"); + return bytes; + } + case DType::Float32: + case DType::QInt32: { + size_t bytes; + ET_CHECK_OR_RETURN_ERROR( + !c10::mul_overflows(num_elements, size_t{4}, &bytes), + InvalidArgument, + "Overflow computing storage size in bytes"); + return bytes; + } + case DType::Float16: + case DType::BFloat16: { + size_t bytes; + ET_CHECK_OR_RETURN_ERROR( + !c10::mul_overflows(num_elements, size_t{2}, &bytes), + InvalidArgument, + "Overflow computing storage size in bytes"); + return bytes; + } + case DType::QInt8: + case DType::QUInt8: + return num_elements; + case DType::QInt4: + // Two 4-bit elements per byte, rounded up (written to avoid overflow + // in the round-up). 
+ return num_elements / 2 + (num_elements % 2); + } + + ET_LOG( + Error, + "Unknown DType %d in compute_storage_size", + static_cast(dtype)); + return runtime::Error::InvalidArgument; +} + +} // namespace executorch::backends::xnnpack::core diff --git a/backends/xnnpack/runtime/core/tensor.h b/backends/xnnpack/runtime/core/tensor.h new file mode 100644 index 00000000000..c794a61f97f --- /dev/null +++ b/backends/xnnpack/runtime/core/tensor.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace executorch::backends::xnnpack::core { + +enum class StorageOwner { Arena, External, Self }; + +struct Storage { + void* data = nullptr; + StorageOwner owner = StorageOwner::External; + size_t size_in_bytes = 0; + + Storage() = default; + ~Storage(); + + Storage(const Storage&) = delete; + Storage& operator=(const Storage&) = delete; + + Storage(Storage&& other) noexcept; + Storage& operator=(Storage&& other) noexcept; + + static runtime::Result create_owned(size_t size_in_bytes); +}; + +struct Tensor { + DType dtype; + std::vector sizes; + Storage storage; + std::vector aux_storage; + + template + const T* data_const() const { + return static_cast(storage.data); + } + template + T* data_mut() { + return static_cast(storage.data); + } + + runtime::Result numel() const; + runtime::Error resize(std::vector new_sizes); +}; + +runtime::Result compute_storage_size( + runtime::Span sizes, + DType dtype); + +} // namespace executorch::backends::xnnpack::core diff --git a/backends/xnnpack/runtime/core/variant_util.h b/backends/xnnpack/runtime/core/variant_util.h new file mode 100644 index 00000000000..ce73d8f6961 --- /dev/null +++ b/backends/xnnpack/runtime/core/variant_util.h @@ -0,0 +1,8 @@ +#pragma once + +template +struct overloaded : Ts... { + using Ts::operator()...; +}; +template +overloaded(Ts...) 
-> overloaded; diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt index 3d9c77d6ad6..667cd2580b6 100644 --- a/backends/xnnpack/test/CMakeLists.txt +++ b/backends/xnnpack/test/CMakeLists.txt @@ -40,3 +40,24 @@ target_include_directories( ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include ) + +# Graph runtime unit tests. +set(_graph_runtime_test_srcs runtime/test_quant_params.cpp) + +et_cxx_test( + backends_xnnpack_graph_runtime_test + SOURCES + ${_graph_runtime_test_srcs} + EXTRA_LIBS + xnnpack_backend + XNNPACK + pthreadpool + cpuinfo + xnnpack-microkernels-prod +) +target_include_directories( + backends_xnnpack_graph_runtime_test + PRIVATE ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/XNNPACK/include + ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include + ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include +) diff --git a/backends/xnnpack/test/runtime/test_quant_params.cpp b/backends/xnnpack/test/runtime/test_quant_params.cpp new file mode 100644 index 00000000000..8db0887fcc6 --- /dev/null +++ b/backends/xnnpack/test/runtime/test_quant_params.cpp @@ -0,0 +1,354 @@ +#include + +#include +#include +#include + +using namespace executorch::backends::xnnpack::core; + +// --- is_quantized --- + +TEST(TestQuantParams, is_quantized_float) { + EXPECT_FALSE(is_quantized(DType::Float32)); +} + +TEST(TestQuantParams, is_quantized_qint8sym) { + EXPECT_TRUE(is_quantized(DType::QInt8)); +} + +TEST(TestQuantParams, is_quantized_qint4sym) { + EXPECT_TRUE(is_quantized(DType::QInt4)); +} + +TEST(TestQuantParams, is_quantized_quint8asym) { + EXPECT_TRUE(is_quantized(DType::QUInt8)); +} + +TEST(TestQuantParams, is_quantized_nonquantized_types) { + EXPECT_FALSE(is_quantized(DType::Float16)); + EXPECT_FALSE(is_quantized(DType::BFloat16)); + EXPECT_FALSE(is_quantized(DType::Int64)); + EXPECT_FALSE(is_quantized(DType::UInt64)); +} + +// --- 
is_asymmetric (now derived from QuantParams, not DType) --- + +TEST(TestQuantParams, is_asymmetric_sym) { + EXPECT_FALSE(is_asymmetric(qint8_per_channel_sym(0))); + EXPECT_FALSE(is_asymmetric(qint8_per_tensor_sym(0.5f))); + EXPECT_FALSE(is_asymmetric(qint4_blockwise_sym(1, 32))); +} + +TEST(TestQuantParams, is_asymmetric_asym) { + EXPECT_TRUE(is_asymmetric(quint8_per_tensor_asym(0.25f, 128))); + EXPECT_TRUE(is_asymmetric(quint8_per_row_asym(-1))); + EXPECT_TRUE(is_asymmetric(quint8_per_token_asym())); +} + +// --- is_subbyte / byte_stride --- + +TEST(TestQuantParams, is_subbyte) { + EXPECT_TRUE(is_subbyte(DType::QInt4)); + EXPECT_FALSE(is_subbyte(DType::Float32)); + EXPECT_FALSE(is_subbyte(DType::Float16)); + EXPECT_FALSE(is_subbyte(DType::BFloat16)); + EXPECT_FALSE(is_subbyte(DType::Int64)); + EXPECT_FALSE(is_subbyte(DType::UInt64)); + EXPECT_FALSE(is_subbyte(DType::QInt8)); + EXPECT_FALSE(is_subbyte(DType::QUInt8)); + EXPECT_FALSE(is_subbyte(DType::QInt32)); +} + +TEST(TestQuantParams, byte_stride) { + EXPECT_EQ(byte_stride(DType::QInt8), 1); + EXPECT_EQ(byte_stride(DType::QUInt8), 1); + EXPECT_EQ(byte_stride(DType::Float16), 2); + EXPECT_EQ(byte_stride(DType::BFloat16), 2); + EXPECT_EQ(byte_stride(DType::Float32), 4); + EXPECT_EQ(byte_stride(DType::QInt32), 4); + EXPECT_EQ(byte_stride(DType::Int64), 8); + EXPECT_EQ(byte_stride(DType::UInt64), 8); +} + +// --- compute_storage_size --- + +TEST(TestQuantParams, storage_size_qint8sym) { + const uint64_t sizes[] = {4, 8}; + auto r = compute_storage_size(sizes, DType::QInt8); + ASSERT_TRUE(r.ok()); + EXPECT_EQ(r.get(), 32); +} + +TEST(TestQuantParams, storage_size_quint8asym) { + const uint64_t sizes[] = {2, 5}; + auto r = compute_storage_size(sizes, DType::QUInt8); + ASSERT_TRUE(r.ok()); + EXPECT_EQ(r.get(), 10); +} + +TEST(TestQuantParams, storage_size_float16) { + const uint64_t sizes[] = {4, 8}; + auto r = compute_storage_size(sizes, DType::Float16); + ASSERT_TRUE(r.ok()); + EXPECT_EQ(r.get(), 64); // 32 elements 
* 2 bytes +} + +TEST(TestQuantParams, storage_size_int64) { + const uint64_t sizes[] = {4, 8}; + auto r = compute_storage_size(sizes, DType::Int64); + ASSERT_TRUE(r.ok()); + EXPECT_EQ(r.get(), 256); // 32 elements * 8 bytes +} + +TEST(TestQuantParams, storage_size_qint4sym_even) { + const uint64_t sizes[] = {2, 4}; + auto r = compute_storage_size(sizes, DType::QInt4); + ASSERT_TRUE(r.ok()); + EXPECT_EQ(r.get(), 4); +} + +TEST(TestQuantParams, storage_size_qint4sym_odd) { + const uint64_t sizes[] = {7}; + auto r = compute_storage_size(sizes, DType::QInt4); + ASSERT_TRUE(r.ok()); + EXPECT_EQ(r.get(), 4); +} + +TEST(TestQuantParams, storage_size_qint4sym_one) { + const uint64_t sizes[] = {1}; + auto r = compute_storage_size(sizes, DType::QInt4); + ASSERT_TRUE(r.ok()); + EXPECT_EQ(r.get(), 1); +} + +TEST(TestQuantParams, storage_size_overflow_returns_error) { + const uint64_t sizes[] = {SIZE_MAX, 2}; + auto r = compute_storage_size(sizes, DType::QInt8); + EXPECT_FALSE(r.ok()); + EXPECT_EQ(r.error(), executorch::runtime::Error::InvalidArgument); +} + +TEST(TestQuantParams, storage_size_byte_overflow_returns_error) { + // num_elements fits in size_t but num_elements * 4 overflows. 
+ const uint64_t sizes[] = {SIZE_MAX / 2}; + auto r = compute_storage_size(sizes, DType::Float32); + EXPECT_FALSE(r.ok()); + EXPECT_EQ(r.error(), executorch::runtime::Error::InvalidArgument); +} + +// --- Preset factories --- + +TEST(TestQuantParams, preset_qint8_per_channel_sym) { + auto p = qint8_per_channel_sym(0); + auto* pa = std::get_if(&p); + ASSERT_NE(pa, nullptr); + EXPECT_EQ(pa->axis, 0); + EXPECT_EQ(pa->scale_dtype, DType::Float32); + EXPECT_FALSE(pa->has_zero_point); +} + +TEST(TestQuantParams, preset_qint8_per_tensor_sym) { + auto p = qint8_per_tensor_sym(0.5f); + auto* pt = std::get_if(&p); + ASSERT_NE(pt, nullptr); + EXPECT_FLOAT_EQ(pt->scale, 0.5f); + EXPECT_EQ(pt->zero_point, 0); + EXPECT_FALSE(pt->has_zero_point); +} + +TEST(TestQuantParams, preset_quint8_per_tensor_asym) { + auto p = quint8_per_tensor_asym(0.25f, 128); + auto* pt = std::get_if(&p); + ASSERT_NE(pt, nullptr); + EXPECT_FLOAT_EQ(pt->scale, 0.25f); + EXPECT_EQ(pt->zero_point, 128); + EXPECT_TRUE(pt->has_zero_point); +} + +TEST(TestQuantParams, preset_quint8_per_row_asym) { + auto p = quint8_per_row_asym(1); + auto* pr = std::get_if(&p); + ASSERT_NE(pr, nullptr); + EXPECT_EQ(pr->axis, 1); + EXPECT_EQ(pr->scale_dtype, DType::Float32); + EXPECT_TRUE(pr->has_zero_point); +} + +TEST(TestQuantParams, preset_quint8_per_token_asym) { + auto p = quint8_per_token_asym(); + auto* pr = std::get_if(&p); + ASSERT_NE(pr, nullptr); + EXPECT_EQ(pr->axis, -1); + EXPECT_TRUE(pr->has_zero_point); +} + +TEST(TestQuantParams, preset_qint4_blockwise_sym) { + auto p = qint4_blockwise_sym(1, 32); + auto* pb = std::get_if(&p); + ASSERT_NE(pb, nullptr); + EXPECT_EQ(pb->axis, 1); + EXPECT_EQ(pb->block_size, 32); + EXPECT_EQ(pb->scale_dtype, DType::Float32); +} + +// --- aux_buffer_count --- + +TEST(TestQuantParams, aux_buffer_count_float) { + QuantParams dummy = PerTensorQuantParams{}; + EXPECT_EQ(aux_buffer_count(DType::Float32, dummy), 0); +} + +TEST(TestQuantParams, aux_buffer_count_sym) { + auto p = 
qint8_per_channel_sym(0); + EXPECT_EQ(aux_buffer_count(DType::QInt8, p), 1); +} + +TEST(TestQuantParams, aux_buffer_count_asym) { + auto p = quint8_per_tensor_asym(1.0f, 0); + EXPECT_EQ(aux_buffer_count(DType::QUInt8, p), 2); +} + +// --- compute_aux_storage_sizes --- + +TEST(TestQuantParams, aux_sizes_per_tensor_sym) { + auto p = qint8_per_tensor_sym(1.0f); + const uint64_t shape[] = {4, 8}; + auto sizes = compute_aux_storage_sizes(shape, DType::QInt8, p).get(); + ASSERT_EQ(sizes.size(), 1); + EXPECT_EQ(sizes[0], sizeof(float)); // 1 scale, float32 +} + +TEST(TestQuantParams, aux_sizes_per_axis_keep_axis0) { + // [4, 8], keep axis=0 -> one scale per index along axis 0 -> 4 scales. + auto p = qint8_per_channel_sym(0); + const uint64_t shape[] = {4, 8}; + auto sizes = compute_aux_storage_sizes(shape, DType::QInt8, p).get(); + ASSERT_EQ(sizes.size(), 1); + EXPECT_EQ(sizes[0], 4 * sizeof(float)); +} + +TEST(TestQuantParams, aux_sizes_per_axis_keep_axis1) { + // [4, 8], keep axis=1 -> 8 scales. + auto p = qint8_per_channel_sym(1); + const uint64_t shape[] = {4, 8}; + auto sizes = compute_aux_storage_sizes(shape, DType::QInt8, p).get(); + ASSERT_EQ(sizes.size(), 1); + EXPECT_EQ(sizes[0], 8 * sizeof(float)); +} + +TEST(TestQuantParams, aux_sizes_per_channel_conv3d) { + // conv3d weight [out=4, in=8, kT=3, kH=3, kW=3], per-output-channel keeps + // axis 0 and reduces the rest -> one scale per output channel -> 4. + auto p = qint8_per_channel_sym(0); + const uint64_t shape[] = {4, 8, 3, 3, 3}; + auto sizes = compute_aux_storage_sizes(shape, DType::QInt8, p).get(); + ASSERT_EQ(sizes.size(), 1); + EXPECT_EQ(sizes[0], 4 * sizeof(float)); +} + +TEST(TestQuantParams, aux_sizes_per_row_asym_2d) { + // [4, 8], per-token reduces the last dim -> one scale per row -> 4 scales + // + 4 zero_points. 
+ auto p = quint8_per_token_asym(); + const uint64_t shape[] = {4, 8}; + auto sizes = compute_aux_storage_sizes(shape, DType::QUInt8, p).get(); + ASSERT_EQ(sizes.size(), 2); + EXPECT_EQ(sizes[0], 4 * sizeof(float)); // scales + EXPECT_EQ(sizes[1], 4 * sizeof(int32_t)); // zero_points +} + +TEST(TestQuantParams, aux_sizes_per_token_3d) { + // [batch=2, seqlen=3, features=8], per-token reduces the last dim -> + // one scale per [batch, seqlen] combo -> 2*3 = 6 scales. + auto p = quint8_per_token_asym(); + const uint64_t shape[] = {2, 3, 8}; + auto sizes = compute_aux_storage_sizes(shape, DType::QUInt8, p).get(); + ASSERT_EQ(sizes.size(), 2); + EXPECT_EQ(sizes[0], 6 * sizeof(float)); // scales + EXPECT_EQ(sizes[1], 6 * sizeof(int32_t)); // zero_points +} + +TEST(TestQuantParams, aux_sizes_per_row_explicit_dim) { + // [2, 3, 8], reduce dim 1 -> keep dims 0 and 2 -> 2*8 = 16 scales. + auto p = quint8_per_row_asym(1); + const uint64_t shape[] = {2, 3, 8}; + auto sizes = compute_aux_storage_sizes(shape, DType::QUInt8, p).get(); + ASSERT_EQ(sizes.size(), 2); + EXPECT_EQ(sizes[0], 16 * sizeof(float)); + EXPECT_EQ(sizes[1], 16 * sizeof(int32_t)); +} + +TEST(TestQuantParams, aux_sizes_per_row_negative_dim) { + // dim=-1 on [2, 3, 8] reduces the last dim -> 6 scales (same as per-token). + auto p = quint8_per_row_asym(-1); + const uint64_t shape[] = {2, 3, 8}; + auto sizes = compute_aux_storage_sizes(shape, DType::QUInt8, p).get(); + ASSERT_EQ(sizes.size(), 2); + EXPECT_EQ(sizes[0], 6 * sizeof(float)); +} + +TEST(TestQuantParams, aux_sizes_per_row_dim_out_of_range_errors) { + // dim=3 is invalid for a 3-dim tensor. 
+ auto p = quint8_per_row_asym(3); + const uint64_t shape[] = {2, 3, 8}; + auto result = compute_aux_storage_sizes(shape, DType::QUInt8, p); + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.error(), executorch::runtime::Error::InvalidArgument); +} + +TEST(TestQuantParams, aux_sizes_per_row_negative_dim_out_of_range_errors) { + // dim=-4 resolves to -1 for a 3-dim tensor, which is invalid. + auto p = quint8_per_row_asym(-4); + const uint64_t shape[] = {2, 3, 8}; + auto result = compute_aux_storage_sizes(shape, DType::QUInt8, p); + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.error(), executorch::runtime::Error::InvalidArgument); +} + +TEST(TestQuantParams, aux_sizes_blockwise_sym) { + // [4, 128], blockwise along axis=1, block_size=32 + // num_blocks = 128/32 = 4 + // other_dims = 4 (axis=1, so dim 0 contributes) + // total scales = 4 * 4 = 16 + auto p = qint4_blockwise_sym(1, 32); + const uint64_t shape[] = {4, 128}; + auto sizes = compute_aux_storage_sizes(shape, DType::QInt4, p).get(); + ASSERT_EQ(sizes.size(), 1); + EXPECT_EQ(sizes[0], 16 * sizeof(float)); +} + +TEST(TestQuantParams, aux_sizes_per_block_not_divisible_errors) { + // [4, 100], block_size=32 along axis=1 does not evenly divide 100. + auto p = qint4_blockwise_sym(1, 32); + const uint64_t shape[] = {4, 100}; + auto result = compute_aux_storage_sizes(shape, DType::QInt4, p); + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.error(), executorch::runtime::Error::InvalidArgument); +} + +// --- compute_aux_storage_sizes validation --- + +TEST(TestQuantParams, aux_sizes_axis_out_of_range_errors) { + // axis=2 is invalid for a 2-dim tensor. 
+ auto p = qint8_per_channel_sym(2); + const uint64_t shape[] = {4, 8}; + auto result = compute_aux_storage_sizes(shape, DType::QInt8, p); + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.error(), executorch::runtime::Error::InvalidArgument); +} + +TEST(TestQuantParams, aux_sizes_negative_axis_errors) { + auto p = qint8_per_channel_sym(-1); + const uint64_t shape[] = {4, 8}; + auto result = compute_aux_storage_sizes(shape, DType::QInt8, p); + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.error(), executorch::runtime::Error::InvalidArgument); +} + +TEST(TestQuantParams, aux_sizes_zero_block_size_errors) { + auto p = qint4_blockwise_sym(1, 0); + const uint64_t shape[] = {4, 128}; + auto result = compute_aux_storage_sizes(shape, DType::QInt4, p); + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.error(), executorch::runtime::Error::InvalidArgument); +} From 7cc1262c33ab2ceb7fb22ecf1a9a9a9c525bb881 Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Fri, 12 Jun 2026 15:33:55 -0700 Subject: [PATCH 2/3] Update [ghstack-poisoned] --- backends/xnnpack/CMakeLists.txt | 5 ----- backends/xnnpack/targets.bzl | 5 +++++ shim_et/xplat/executorch/build/build_variables.bzl | 2 ++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 81453e80d2f..cd0d945a84f 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -100,11 +100,6 @@ set(xnnpack_third_party pthreadpool extension_threadpool cpuinfo) include(cmake/Dependencies.cmake) -# Graph runtime sources. 
-list(APPEND _xnnpack_backend__srcs backends/xnnpack/runtime/core/tensor.cpp - backends/xnnpack/runtime/core/quant_params.cpp -) - list(TRANSFORM _xnnpack_backend__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(xnnpack_backend ${_xnnpack_backend__srcs}) target_link_libraries( diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index b3af589df10..4bafb6c4c5f 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -41,6 +41,11 @@ def define_common_targets(): headers = native.glob([ "runtime/*.h", "runtime/profiling/*.h", + "runtime/core/*.h", + "runtime/graph/*.h", + "runtime/operators/*.h", + "runtime/executor/*.h", + "runtime/plan/*.h", ]), visibility = ["PUBLIC"], preprocessor_flags = [ diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl index 659a128994f..4a8cc479075 100644 --- a/shim_et/xplat/executorch/build/build_variables.bzl +++ b/shim_et/xplat/executorch/build/build_variables.bzl @@ -480,6 +480,8 @@ XNNPACK_BACKEND_BUCK_SRCS = [ "runtime/XNNWorkspaceManager.cpp", "runtime/XnnpackBackendOptions.cpp", "runtime/profiling/XNNProfiler.cpp", + "runtime/core/tensor.cpp", + "runtime/core/quant_params.cpp", ] XNNPACK_BACKEND_SRCS = ["backends/xnnpack/" + x for x in XNNPACK_BACKEND_BUCK_SRCS] From 1b00ce36d43bbce970ac3df721175250a29e6970 Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Fri, 12 Jun 2026 22:19:00 -0700 Subject: [PATCH 3/3] Update [ghstack-poisoned] --- backends/xnnpack/runtime/core/quant_params.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/backends/xnnpack/runtime/core/quant_params.h b/backends/xnnpack/runtime/core/quant_params.h index c0319250674..7922015512e 100644 --- a/backends/xnnpack/runtime/core/quant_params.h +++ b/backends/xnnpack/runtime/core/quant_params.h @@ -67,10 +67,15 @@ struct PerRowQuantParams { int8_t axis = -1; DType scale_dtype = DType::Float32; bool has_zero_point = false; + // When true, 
this is a dynamically-quantized activation (XNNPACK qdint8): + // the per-row scale/zero point are computed at runtime rather than stored. + // `axis` is the reduced (channel) dim, so the number of trailing "row" dims + // (XNNPACK's num_nonbatch_dims) is -axis for the usual negative axis. + bool is_dynamic = false; bool operator==(const PerRowQuantParams& o) const { return axis == o.axis && scale_dtype == o.scale_dtype && - has_zero_point == o.has_zero_point; + has_zero_point == o.has_zero_point && is_dynamic == o.is_dynamic; } };