diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index 3a5d6ab7958..ec5ce9ef7dc 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -6,18 +6,25 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include #include #include #include +#include #include #include #include +#include #include +#include +#include +#include #include #include +#include #pragma clang diagnostic ignored "-Wglobal-constructors" @@ -41,6 +48,16 @@ using executorch::runtime::FreeableBuffer; using executorch::runtime::Result; using executorch::runtime::Span; +struct XnnpackDelegateHandle { + bool is_graph_runtime = false; + // Legacy path: XNNExecutor placed via runtime allocator. + xnnpack::delegate::XNNExecutor* legacy_executor = nullptr; + // Graph path: heap-allocated Executor. + xnnpack::executor::Executor* graph_executor = nullptr; + std::vector input_external_ids; + std::vector output_external_ids; +}; + class XnnpackBackend final : public ::executorch::ET_RUNTIME_NAMESPACE::BackendInterface { public: @@ -66,26 +83,67 @@ class XnnpackBackend final BackendInitContext& context, FreeableBuffer* processed, ArrayRef compile_specs) const override { + auto* handle = context.get_runtime_allocator() + ->allocateInstance(); + if (handle == nullptr) { + return Error::MemoryAllocationFailed; + } + new (handle) XnnpackDelegateHandle(); + + bool use_graph_runtime = options_.resolve_graph_runtime(context); + handle->is_graph_runtime = use_graph_runtime; + + if (use_graph_runtime) { + auto t0 = std::chrono::steady_clock::now(); + const NamedDataMap* named_data_map = context.get_named_data_map(); + ET_UNWRAP( + result, + xnnpack::FlatbufferGraphBuilder::build( + processed->data(), processed->size(), named_data_map)); + processed->Free(); + auto t1 = std::chrono::steady_clock::now(); + + ET_UNWRAP( + built_executor, xnnpack::executor::Executor::build(result.graph)); + auto* executor = + new xnnpack::executor::Executor(std::move(built_executor)); + auto t2 = std::chrono::steady_clock::now(); + handle->graph_executor = executor; + handle->input_external_ids = std::move(result.input_external_ids); + handle->output_external_ids = std::move(result.output_external_ids); + ET_LOG( + Info, + "Graph runtime init: deserialize=%lldms executor_build=%lldms", + (long long)std::chrono::duration_cast( + t1 - t0) + .count(), + (long long)std::chrono::duration_cast( + t2 - t1) + .count()); + return handle; + } + auto executor = context.get_runtime_allocator() ->allocateInstance(); if (executor == nullptr) { + handle->~XnnpackDelegateHandle(); return Error::MemoryAllocationFailed; } const NamedDataMap* named_data_map = context.get_named_data_map(); - // thread safe. This can happen when multiple threads call init() on - // the same backend instance. auto program_id = reinterpret_cast(context.get_runtime_allocator()); auto sharing_mode_result = options_.resolve_sharing_mode(context); if (!sharing_mode_result.ok()) { + handle->~XnnpackDelegateHandle(); return sharing_mode_result.error(); } auto workspace_result = options_.workspace_manager().get_or_create_workspace( program_id, sharing_mode_result.get()); if (!workspace_result.ok()) { + handle->~XnnpackDelegateHandle(); return workspace_result.error(); } auto workspace = workspace_result.get(); @@ -128,23 +186,27 @@ class XnnpackBackend final processed->Free(); if (err != Error::Ok) { - // destroy() won't be called on this handle, so we need to clean it up - // now. executor->~XNNExecutor(); - + handle->~XnnpackDelegateHandle(); ET_LOG( Error, "XNNCompiler::compileModel failed: 0x%x", (unsigned int)err); return err; } - - return executor; + handle->legacy_executor = executor; + return handle; } Error execute( BackendExecutionContext& context, DelegateHandle* handle, Span args) const override { - auto executor = static_cast(handle); + auto* delegate = static_cast(handle); + + if (delegate->is_graph_runtime) { + return execute_graph(delegate, args); + } + + auto executor = delegate->legacy_executor; auto workspace = executor->get_workspace(); @@ -176,7 +238,15 @@ class XnnpackBackend final void destroy(DelegateHandle* handle) const override { if (handle != nullptr) { - auto executor = static_cast(handle); + auto* delegate = static_cast(handle); + + if (delegate->is_graph_runtime) { + delete delegate->graph_executor; + delegate->~XnnpackDelegateHandle(); + return; + } + + auto executor = delegate->legacy_executor; auto workspace = executor->get_workspace(); const std::lock_guard lock_weights_cache( @@ -200,6 +270,7 @@ class XnnpackBackend final // XNNExecutor is not trivially destructible. Since this was constructed // manually in init(), we must destroy it manually here. executor->~XNNExecutor(); + delegate->~XnnpackDelegateHandle(); } } @@ -228,6 +299,107 @@ class XnnpackBackend final } private: + Error execute_graph(XnnpackDelegateHandle* delegate, Span args) + const { + auto* executor = delegate->graph_executor; + + // Build input tensors from EValue args. + std::vector inputs; + inputs.reserve(delegate->input_external_ids.size()); + for (uint32_t ext_id : delegate->input_external_ids) { + ET_CHECK_OR_RETURN_ERROR( + ext_id < args.size(), + InvalidProgram, + "Input external id %u out of range (%zu args)", + ext_id, + args.size()); + auto& et_tensor = args[ext_id]->toTensor(); + xnnpack::core::Tensor t; + // The external-value dtype is taken from the serialized graph spec; this + // field is informational for the input wrapper. Defaulting to Float32 + // matches the supported (float) input set. + t.dtype = xnnpack::core::DType::Float32; + if (et_tensor.dim() == 0) { + t.sizes = {1}; + } else { + // Pass dims in physical (dim-order-permuted) layout so a channels-last + // input matches the NHWC layout XNNPACK expects, mirroring the legacy + // XNNExecutor path. + size_t num_dims = et_tensor.dim(); + executorch::aten::DimOrderType + dim_order[::executorch::runtime::kTensorDimensionLimit]; + ET_CHECK_OK_OR_RETURN_ERROR(ET_RUNTIME_NAMESPACE::get_dim_order( + et_tensor, dim_order, num_dims)); + t.sizes.resize(num_dims); + for (size_t d = 0; d < num_dims; d++) { + t.sizes[d] = static_cast( + et_tensor.size(static_cast(dim_order[d]))); + } + } + t.storage.data = et_tensor.mutable_data_ptr(); + t.storage.size_in_bytes = et_tensor.nbytes(); + t.storage.owner = xnnpack::core::StorageOwner::External; + inputs.push_back(std::move(t)); + } + + ET_UNWRAP(outputs, executor->run({inputs.data(), inputs.size()})); + + ET_CHECK_OR_RETURN_ERROR( + outputs.size() == delegate->output_external_ids.size(), + Internal, + "Executor produced %zu outputs, expected %zu", + outputs.size(), + delegate->output_external_ids.size()); + + // Copy output data back to EValue tensors. + for (size_t i = 0; i < delegate->output_external_ids.size(); i++) { + uint32_t ext_id = delegate->output_external_ids[i]; + ET_CHECK_OR_RETURN_ERROR( + ext_id < args.size(), + InvalidProgram, + "Output external id %u out of range (%zu args)", + ext_id, + args.size()); + auto& et_tensor = args[ext_id]->toTensor(); + auto& out_tensor = outputs[i]; + + // Resize the output EValue tensor to match the computed shape. The + // executor reports dims in XNNPACK physical (channels-last) order; + // scatter them back to the tensor's logical order via its dim_order, + // mirroring the legacy XNNExecutor::resize_outputs path. + size_t num_dims = out_tensor.sizes.size(); + std::vector new_sizes_vec(num_dims); + executorch::aten::DimOrderType + out_dim_order[::executorch::runtime::kTensorDimensionLimit]; + ET_CHECK_OK_OR_RETURN_ERROR(ET_RUNTIME_NAMESPACE::get_dim_order( + et_tensor, out_dim_order, num_dims)); + for (size_t d = 0; d < num_dims; d++) { + new_sizes_vec[out_dim_order[d]] = + static_cast(out_tensor.sizes[d]); + } + executorch::aten::ArrayRef new_sizes( + new_sizes_vec.data(), new_sizes_vec.size()); + ET_CHECK_OK_OR_RETURN_ERROR( + executorch::runtime::resize_tensor(et_tensor, new_sizes)); + + if (out_tensor.storage.data != et_tensor.mutable_data_ptr()) { + ET_CHECK_OR_RETURN_ERROR( + out_tensor.storage.size_in_bytes <= et_tensor.nbytes(), + Internal, + "Output %zu is %zu bytes, exceeds tensor capacity %zu", + i, + out_tensor.storage.size_in_bytes, + et_tensor.nbytes()); + std::memcpy( + et_tensor.mutable_data_ptr(), + out_tensor.storage.data, + out_tensor.storage.size_in_bytes); + } + } + + return Error::Ok; + } + mutable xnnpack::XnnpackBackendOptions options_; // Weights cache is global to all delegate instances. diff --git a/backends/xnnpack/runtime/XNNPACKBackend.h b/backends/xnnpack/runtime/XNNPACKBackend.h index e3492c3f5f3..473eba2a1e1 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.h +++ b/backends/xnnpack/runtime/XNNPACKBackend.h @@ -20,6 +20,10 @@ const char weight_cache_option_key[] = "weight_cache_enabled"; // @lint-ignore CLANGTIDY facebook-hte-CArray const char packed_cache_path_option_key[] = "packed_cache_path"; +/// The key for the graph runtime option. When enabled, the new graph-based +/// runtime is used instead of the legacy XNNCompiler/XNNExecutor path. +const char use_graph_runtime_option_key[] = "use_graph_runtime"; + /// Workspace sharing mode. This is a backend option that can be set via the /// set_option API to control memory sharing between CALL_DELEGATE instances. /// This is useful for reducing memory consumption. diff --git a/backends/xnnpack/runtime/XnnpackBackendOptions.cpp b/backends/xnnpack/runtime/XnnpackBackendOptions.cpp index ffaba9508d8..0eea1528874 100644 --- a/backends/xnnpack/runtime/XnnpackBackendOptions.cpp +++ b/backends/xnnpack/runtime/XnnpackBackendOptions.cpp @@ -43,6 +43,8 @@ Error XnnpackBackendOptions::get_option(BackendOption& option) const { std::min(packed_cache_path_.size(), runtime::kMaxOptionValueLength - 1); memcpy(arr.data(), packed_cache_path_.data(), len); option.value = arr; + } else if (strcmp(option.key, use_graph_runtime_option_key) == 0) { + option.value = use_graph_runtime_.load(); } return Error::Ok; } @@ -84,6 +86,14 @@ Error XnnpackBackendOptions::set_option(const BackendOption& option) { Debug, "Setting XNNPACK packed cache path to %s.", packed_cache_path_.c_str()); + } else if (strcmp(option.key, use_graph_runtime_option_key) == 0) { + auto* val = std::get_if(&option.value); + if (!val) { + ET_LOG(Error, "XNNPACK use_graph_runtime must be a bool."); + return Error::InvalidArgument; + } + ET_LOG(Debug, "Setting XNNPACK use_graph_runtime to %d.", *val); + use_graph_runtime_.store(*val); } return Error::Ok; } @@ -114,6 +124,12 @@ XnnpackBackendOptions::resolve_sharing_mode( return static_cast(raw_mode); } +bool XnnpackBackendOptions::resolve_graph_runtime( + const ET_RUNTIME_NAMESPACE::BackendInitContext& context) const { + return resolve_option( + context, use_graph_runtime_option_key, use_graph_runtime_.load()); +} + WorkspaceSharingMode XnnpackBackendOptions::get_sharing_mode() const { return sharing_mode_.load(); } diff --git a/backends/xnnpack/runtime/XnnpackBackendOptions.h b/backends/xnnpack/runtime/XnnpackBackendOptions.h index aed037ac835..69a3626aaf6 100644 --- a/backends/xnnpack/runtime/XnnpackBackendOptions.h +++ b/backends/xnnpack/runtime/XnnpackBackendOptions.h @@ -37,6 +37,9 @@ class XnnpackBackendOptions { runtime::Result resolve_sharing_mode( const ET_RUNTIME_NAMESPACE::BackendInitContext& context) const; + bool resolve_graph_runtime( + const ET_RUNTIME_NAMESPACE::BackendInitContext& context) const; + WorkspaceSharingMode get_sharing_mode() const; XNNWorkspaceManager& workspace_manager(); const XNNWorkspaceManager& workspace_manager() const; @@ -61,6 +64,7 @@ class XnnpackBackendOptions { #endif std::string packed_cache_path_; + std::atomic use_graph_runtime_{false}; }; } // namespace executorch::backends::xnnpack diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt index 024e26d9e56..ea16c346547 100644 --- a/backends/xnnpack/test/CMakeLists.txt +++ b/backends/xnnpack/test/CMakeLists.txt @@ -69,3 +69,22 @@ target_include_directories( ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include ) + +# Graph runtime E2E tests (requires XNNPACK runtime). +et_cxx_test( + backends_xnnpack_graph_e2e_test + SOURCES + runtime/test_e2e.cpp + EXTRA_LIBS + xnnpack_backend + XNNPACK + pthreadpool + cpuinfo + xnnpack-microkernels-prod +) +target_include_directories( + backends_xnnpack_graph_e2e_test + PRIVATE ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/XNNPACK/include + ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include + ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include +) diff --git a/backends/xnnpack/test/runtime/test_e2e.cpp b/backends/xnnpack/test/runtime/test_e2e.cpp new file mode 100644 index 00000000000..8e63108a1e7 --- /dev/null +++ b/backends/xnnpack/test/runtime/test_e2e.cpp @@ -0,0 +1,975 @@ +#include + +#include +#include +#include + +#include +#include +#include +#include + +using namespace executorch::backends::xnnpack::core; +using namespace executorch::backends::xnnpack::executor; +using namespace executorch::backends::xnnpack::graph; + +static Storage make_owned(size_t size_in_bytes) { + return std::move(Storage::create_owned(size_in_bytes).get()); +} + +TEST(TestE2E, add) { + // Build graph: output = input_a + input_b + // Shape: [1, 4] float32, static sizes. + auto builder = GraphBuilder(); + + auto spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(4)}}; + + auto a = builder.createInput(spec); + auto b = builder.createInput(spec); + auto add = builder.createOperator(Operator::Add, spec, a, b); + builder.createOutput(add); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + // Create input tensors. + Tensor ta; + ta.dtype = DType::Float32; + ta.sizes = {1, 4}; + ta.storage = make_owned(4 * sizeof(float)); + float* da = ta.data_mut(); + da[0] = 1.0f; + da[1] = 2.0f; + da[2] = 3.0f; + da[3] = 4.0f; + + Tensor tb; + tb.dtype = DType::Float32; + tb.sizes = {1, 4}; + tb.storage = make_owned(4 * sizeof(float)); + float* db = tb.data_mut(); + db[0] = 10.0f; + db[1] = 20.0f; + db[2] = 30.0f; + db[3] = 40.0f; + + std::vector inputs; + inputs.push_back(std::move(ta)); + inputs.push_back(std::move(tb)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto& outputs = *outputs_result; + + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs[0].sizes, (std::vector{1, 4})); + + auto* out = outputs[0].data_const(); + EXPECT_FLOAT_EQ(out[0], 11.0f); + EXPECT_FLOAT_EQ(out[1], 22.0f); + EXPECT_FLOAT_EQ(out[2], 33.0f); + EXPECT_FLOAT_EQ(out[3], 44.0f); +} + +/* + * Helper: build and run a single binary op over two [1,n] float32 inputs. + * Returns the executor (owns the arena) and the output tensors (alias it). + */ +struct BinaryOpResult { + Executor executor; + std::vector outputs; +}; + +BinaryOpResult +run_binary_op(Operator op, const float* a, const float* b, size_t n) { + auto builder = GraphBuilder(); + auto spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = { + DimSizeSpec::constant(1), + DimSizeSpec::constant(static_cast(n))}}; + + auto ia = builder.createInput(spec); + auto ib = builder.createInput(spec); + auto out = builder.createOperator(op, spec, ia, ib); + builder.createOutput(out); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + EXPECT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + Tensor ta; + ta.dtype = DType::Float32; + ta.sizes = {1, static_cast(n)}; + ta.storage = make_owned(n * sizeof(float)); + std::memcpy(ta.data_mut(), a, n * sizeof(float)); + + Tensor tb; + tb.dtype = DType::Float32; + tb.sizes = {1, static_cast(n)}; + tb.storage = make_owned(n * sizeof(float)); + std::memcpy(tb.data_mut(), b, n * sizeof(float)); + + std::vector inputs; + inputs.push_back(std::move(ta)); + inputs.push_back(std::move(tb)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + EXPECT_TRUE(outputs_result.ok()); + return {std::move(executor), std::move(*outputs_result)}; +} + +TEST(TestE2E, subtract) { + float a[] = {10, 20, 30, 40}; + float b[] = {1, 2, 3, 4}; + auto [executor, outputs] = run_binary_op(Operator::Subtract, a, b, 4); + auto* d = outputs[0].data_const(); + EXPECT_FLOAT_EQ(d[0], 9.0f); + EXPECT_FLOAT_EQ(d[1], 18.0f); + EXPECT_FLOAT_EQ(d[2], 27.0f); + EXPECT_FLOAT_EQ(d[3], 36.0f); +} + +TEST(TestE2E, multiply) { + float a[] = {2, 3, 4, 5}; + float b[] = {10, 20, 30, 40}; + auto [executor, outputs] = run_binary_op(Operator::Multiply, a, b, 4); + auto* d = outputs[0].data_const(); + EXPECT_FLOAT_EQ(d[0], 20.0f); + EXPECT_FLOAT_EQ(d[1], 60.0f); + EXPECT_FLOAT_EQ(d[2], 120.0f); + EXPECT_FLOAT_EQ(d[3], 200.0f); +} + +TEST(TestE2E, divide) { + float a[] = {10, 20, 30, 40}; + float b[] = {2, 4, 5, 8}; + auto [executor, outputs] = run_binary_op(Operator::Divide, a, b, 4); + auto* d = outputs[0].data_const(); + EXPECT_FLOAT_EQ(d[0], 5.0f); + EXPECT_FLOAT_EQ(d[1], 5.0f); + EXPECT_FLOAT_EQ(d[2], 6.0f); + EXPECT_FLOAT_EQ(d[3], 5.0f); +} + +TEST(TestE2E, maximum) { + float a[] = {1, 20, 3, 40}; + float b[] = {10, 2, 30, 4}; + auto [executor, outputs] = run_binary_op(Operator::Maximum, a, b, 4); + auto* d = outputs[0].data_const(); + EXPECT_FLOAT_EQ(d[0], 10.0f); + EXPECT_FLOAT_EQ(d[1], 20.0f); + EXPECT_FLOAT_EQ(d[2], 30.0f); + EXPECT_FLOAT_EQ(d[3], 40.0f); +} + +TEST(TestE2E, minimum) { + float a[] = {1, 20, 3, 40}; + float b[] = {10, 2, 30, 4}; + auto [executor, outputs] = run_binary_op(Operator::Minimum, a, b, 4); + auto* d = outputs[0].data_const(); + EXPECT_FLOAT_EQ(d[0], 1.0f); + EXPECT_FLOAT_EQ(d[1], 2.0f); + EXPECT_FLOAT_EQ(d[2], 3.0f); + EXPECT_FLOAT_EQ(d[3], 4.0f); +} + +TEST(TestE2E, copysign) { + float a[] = {5, -5, 5, -5}; + float b[] = {1, -1, -1, 1}; + auto [executor, outputs] = run_binary_op(Operator::CopySign, a, b, 4); + auto* d = outputs[0].data_const(); + EXPECT_FLOAT_EQ(d[0], 5.0f); + EXPECT_FLOAT_EQ(d[1], -5.0f); + EXPECT_FLOAT_EQ(d[2], -5.0f); + EXPECT_FLOAT_EQ(d[3], 5.0f); +} + +TEST(TestE2E, squared_difference) { + float a[] = {5, 10, 3, 8}; + float b[] = {2, 7, 1, 4}; + auto [executor, outputs] = + run_binary_op(Operator::SquaredDifference, a, b, 4); + auto* d = outputs[0].data_const(); + EXPECT_FLOAT_EQ(d[0], 9.0f); + EXPECT_FLOAT_EQ(d[1], 9.0f); + EXPECT_FLOAT_EQ(d[2], 4.0f); + EXPECT_FLOAT_EQ(d[3], 16.0f); +} + +TEST(TestE2E, prelu) { + float a[] = {1, -2, 3, -4}; + float b[] = {0.5f, 0.5f, 0.5f, 0.5f}; + auto [executor, outputs] = run_binary_op(Operator::PReLU, a, b, 4); + auto* d = outputs[0].data_const(); + EXPECT_FLOAT_EQ(d[0], 1.0f); + EXPECT_FLOAT_EQ(d[1], -1.0f); + EXPECT_FLOAT_EQ(d[2], 3.0f); + EXPECT_FLOAT_EQ(d[3], -2.0f); +} + +TEST(TestE2E, add_dynamic_shape) { + // Build graph: output = A + B with shape [1, s0] (dynamic second dim). + auto builder = GraphBuilder(); + auto s0 = builder.createSymInt(); + + auto spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::sym(s0)}}; + + auto a = builder.createInput(spec); + auto b = builder.createInput(spec); + auto add = builder.createOperator(Operator::Add, spec, a, b); + builder.createOutput(add); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + // First run: n=4 + { + float da[] = {1, 2, 3, 4}; + float db[] = {10, 20, 30, 40}; + + Tensor ta; + ta.dtype = DType::Float32; + ta.sizes = {1, 4}; + ta.storage = make_owned(4 * sizeof(float)); + std::memcpy(ta.data_mut(), da, sizeof(da)); + + Tensor tb; + tb.dtype = DType::Float32; + tb.sizes = {1, 4}; + tb.storage = make_owned(4 * sizeof(float)); + std::memcpy(tb.data_mut(), db, sizeof(db)); + + std::vector inputs; + inputs.push_back(std::move(ta)); + inputs.push_back(std::move(tb)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto& outputs = *outputs_result; + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs[0].sizes, (std::vector{1, 4})); + auto* out = outputs[0].data_const(); + EXPECT_FLOAT_EQ(out[0], 11.0f); + EXPECT_FLOAT_EQ(out[1], 22.0f); + EXPECT_FLOAT_EQ(out[2], 33.0f); + EXPECT_FLOAT_EQ(out[3], 44.0f); + } + + // Second run: n=2 (different dynamic size, same executor) + { + float da[] = {100, 200}; + float db[] = {5, 6}; + + Tensor ta; + ta.dtype = DType::Float32; + ta.sizes = {1, 2}; + ta.storage = make_owned(2 * sizeof(float)); + std::memcpy(ta.data_mut(), da, sizeof(da)); + + Tensor tb; + tb.dtype = DType::Float32; + tb.sizes = {1, 2}; + tb.storage = make_owned(2 * sizeof(float)); + std::memcpy(tb.data_mut(), db, sizeof(db)); + + std::vector inputs; + inputs.push_back(std::move(ta)); + inputs.push_back(std::move(tb)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto& outputs = *outputs_result; + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs[0].sizes, (std::vector{1, 2})); + auto* out = outputs[0].data_const(); + EXPECT_FLOAT_EQ(out[0], 105.0f); + EXPECT_FLOAT_EQ(out[1], 206.0f); + } +} + +TEST(TestE2E, three_adds_two_outputs) { + auto builder = GraphBuilder(); + + auto spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(4)}}; + + auto a = builder.createInput(spec); + auto b = builder.createInput(spec); + auto c = builder.createInput(spec); + auto add0 = builder.createOperator(Operator::Add, spec, a, b); + auto add1 = builder.createOperator(Operator::Add, spec, add0, c); + auto add2 = builder.createOperator(Operator::Add, spec, add0, add1); + builder.createOutput(add0); + builder.createOutput(add2); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + Tensor ta; + ta.dtype = DType::Float32; + ta.sizes = {1, 4}; + ta.storage = make_owned(4 * sizeof(float)); + float* da = ta.data_mut(); + da[0] = 1.0f; + da[1] = 2.0f; + da[2] = 3.0f; + da[3] = 4.0f; + + Tensor tb; + tb.dtype = DType::Float32; + tb.sizes = {1, 4}; + tb.storage = make_owned(4 * sizeof(float)); + float* db = tb.data_mut(); + db[0] = 10.0f; + db[1] = 20.0f; + db[2] = 30.0f; + db[3] = 40.0f; + + Tensor tc; + tc.dtype = DType::Float32; + tc.sizes = {1, 4}; + tc.storage = make_owned(4 * sizeof(float)); + float* dc = tc.data_mut(); + dc[0] = 100.0f; + dc[1] = 200.0f; + dc[2] = 300.0f; + dc[3] = 400.0f; + + std::vector inputs; + inputs.push_back(std::move(ta)); + inputs.push_back(std::move(tb)); + inputs.push_back(std::move(tc)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto& outputs = *outputs_result; + + // Add0 = A + B = {11, 22, 33, 44} + ASSERT_EQ(outputs.size(), 2); + ASSERT_EQ(outputs[0].sizes, (std::vector{1, 4})); + auto* out0 = outputs[0].data_const(); + EXPECT_FLOAT_EQ(out0[0], 11.0f); + EXPECT_FLOAT_EQ(out0[1], 22.0f); + EXPECT_FLOAT_EQ(out0[2], 33.0f); + EXPECT_FLOAT_EQ(out0[3], 44.0f); + + // Add1 = Add0 + C = {111, 222, 333, 444} + // Add2 = Add0 + Add1 = {122, 244, 366, 488} + ASSERT_EQ(outputs[1].sizes, (std::vector{1, 4})); + auto* out1 = outputs[1].data_const(); + EXPECT_FLOAT_EQ(out1[0], 122.0f); + EXPECT_FLOAT_EQ(out1[1], 244.0f); + EXPECT_FLOAT_EQ(out1[2], 366.0f); + EXPECT_FLOAT_EQ(out1[3], 488.0f); +} + +TEST(TestE2E, linear) { + auto builder = GraphBuilder(); + + auto input_spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(3)}}; + auto output_spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(2)}}; + + auto input = builder.createInput(input_spec); + + auto filter_tensor = std::make_shared(); + filter_tensor->dtype = DType::Float32; + filter_tensor->sizes = {2, 3}; + filter_tensor->storage = make_owned(6 * sizeof(float)); + float* fw = filter_tensor->data_mut(); + fw[0] = 1; + fw[1] = 0; + fw[2] = 0; + fw[3] = 0; + fw[4] = 1; + fw[5] = 0; + auto filter = builder.createConstant(filter_tensor); + + auto bias_tensor = std::make_shared(); + bias_tensor->dtype = DType::Float32; + bias_tensor->sizes = {2}; + bias_tensor->storage = make_owned(2 * sizeof(float)); + float* bw = bias_tensor->data_mut(); + bw[0] = 10; + bw[1] = 20; + auto bias = builder.createConstant(bias_tensor); + + auto out = builder.createOperator( + Operator::Linear, output_spec, {input, filter, bias}); + builder.createOutput(out); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + Tensor ti; + ti.dtype = DType::Float32; + ti.sizes = {1, 3}; + ti.storage = make_owned(3 * sizeof(float)); + float* di = ti.data_mut(); + di[0] = 1; + di[1] = 2; + di[2] = 3; + + std::vector inputs; + inputs.push_back(std::move(ti)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto& outputs = *outputs_result; + + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs[0].sizes, (std::vector{1, 2})); + auto* d = outputs[0].data_const(); + EXPECT_FLOAT_EQ(d[0], 11.0f); // 1*1+2*0+3*0+10 + EXPECT_FLOAT_EQ(d[1], 22.0f); // 1*0+2*1+3*0+20 +} + +TEST(TestE2E, linear_no_bias) { + auto builder = GraphBuilder(); + + auto input_spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(3)}}; + auto output_spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(2)}}; + + auto input = builder.createInput(input_spec); + + auto filter_tensor = std::make_shared(); + filter_tensor->dtype = DType::Float32; + filter_tensor->sizes = {2, 3}; + filter_tensor->storage = make_owned(6 * sizeof(float)); + float* fw = filter_tensor->data_mut(); + fw[0] = 1; + fw[1] = 0; + fw[2] = 0; + fw[3] = 0; + fw[4] = 1; + fw[5] = 0; + auto filter = builder.createConstant(filter_tensor); + + auto out = builder.createOperator( + Operator::Linear, output_spec, {input, filter, ValueHandle::null()}); + builder.createOutput(out); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + Tensor ti; + ti.dtype = DType::Float32; + ti.sizes = {1, 3}; + ti.storage = make_owned(3 * sizeof(float)); + float* di = ti.data_mut(); + di[0] = 1; + di[1] = 2; + di[2] = 3; + + std::vector inputs; + inputs.push_back(std::move(ti)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto& outputs = *outputs_result; + + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs[0].sizes, (std::vector{1, 2})); + auto* d = outputs[0].data_const(); + EXPECT_FLOAT_EQ(d[0], 1.0f); // 1*1+2*0+3*0 + EXPECT_FLOAT_EQ(d[1], 2.0f); // 1*0+2*1+3*0 +} + +TEST(TestE2E, linear_no_bias_larger) { + // Larger dimensions to exercise tiled packing in the in-tree SME kernel path. + // On non-SME hardware this goes through XNNPACK and validates the same math. + constexpr size_t M = 4; + constexpr size_t K = 8; + constexpr size_t N = 6; + + auto builder = GraphBuilder(); + + auto input_spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(M), DimSizeSpec::constant(K)}}; + auto output_spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(M), DimSizeSpec::constant(N)}}; + + auto input = builder.createInput(input_spec); + + // Weight tensor: N x K (transposed convention for Linear). + auto filter_tensor = std::make_shared(); + filter_tensor->dtype = DType::Float32; + filter_tensor->sizes = {N, K}; + filter_tensor->storage = make_owned(N * K * sizeof(float)); + float* fw = filter_tensor->data_mut(); + for (size_t i = 0; i < N * K; i++) { + fw[i] = static_cast(i + 1) * 0.1f; + } + auto filter = builder.createConstant(filter_tensor); + + auto out = builder.createOperator( + Operator::Linear, output_spec, {input, filter, ValueHandle::null()}); + builder.createOutput(out); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + // Input: M x K + Tensor ti; + ti.dtype = DType::Float32; + ti.sizes = {M, K}; + ti.storage = make_owned(M * K * sizeof(float)); + float* di = ti.data_mut(); + for (size_t i = 0; i < M * K; i++) { + di[i] = static_cast(i + 1); + } + + std::vector inputs; + inputs.push_back(std::move(ti)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto& outputs = *outputs_result; + + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs[0].sizes, (std::vector{M, N})); + + // Reference: out[m][n] = sum_k(input[m][k] * weight[n][k]) + auto* d = outputs[0].data_const(); + for (size_t m = 0; m < M; m++) { + for (size_t n = 0; n < N; n++) { + float expected = 0; + for (size_t k = 0; k < K; k++) { + expected += static_cast(m * K + k + 1) * fw[n * K + k]; + } + EXPECT_NEAR(d[m * N + n], expected, std::abs(expected) * 1e-5f) + << "at (" << m << ", " << n << ")"; + } + } +} + +TEST(TestE2E, linear_qint8_static_dequantized) { + auto builder = GraphBuilder(); + + auto input_spec = TensorSpec{ + .dtype = DType::QInt8, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(3)}, + .quant_params = qint8_per_tensor_sym(1.0f), + }; + auto quant_output_spec = TensorSpec{ + .dtype = DType::QInt8, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(2)}, + .quant_params = qint8_per_tensor_sym(1.0f), + }; + auto float_output_spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(2)}, + }; + + auto input = builder.createInput(input_spec); + + auto filter_tensor = std::make_shared(); + filter_tensor->dtype = DType::QInt8; + filter_tensor->sizes = {2, 3}; + filter_tensor->storage = make_owned(6); + auto* fw = filter_tensor->data_mut(); + fw[0] = 1; + fw[1] = 0; + fw[2] = 0; + fw[3] = 0; + fw[4] = 1; + fw[5] = 0; + filter_tensor->aux_storage.push_back(make_owned(2 * sizeof(float))); + auto* scales = static_cast(filter_tensor->aux_storage[0].data); + scales[0] = 1.0f; + scales[1] = 1.0f; + auto filter = builder.createConstant(filter_tensor, qint8_per_channel_sym(0)); + + auto linear_out = builder.createOperator( + Operator::Linear, + quant_output_spec, + {input, filter, ValueHandle::null()}); + auto dequant_out = builder.createOperator( + Operator::Dequantize, float_output_spec, {linear_out}); + builder.createOutput(dequant_out); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + Tensor ti; + ti.dtype = DType::QInt8; + ti.sizes = {1, 3}; + ti.storage = make_owned(3); + auto* di = ti.data_mut(); + di[0] = 1; + di[1] = 2; + di[2] = 3; + + std::vector inputs; + inputs.push_back(std::move(ti)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto& outputs = *outputs_result; + + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs[0].sizes, (std::vector{1, 2})); + auto* d = outputs[0].data_const(); + EXPECT_FLOAT_EQ(d[0], 1.0f); + EXPECT_FLOAT_EQ(d[1], 2.0f); +} + +TEST(TestE2E, linear_qcint4_static_dequantized) { + auto builder = GraphBuilder(); + + auto input_spec = TensorSpec{ + .dtype = DType::QInt8, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(4)}, + .quant_params = qint8_per_tensor_sym(1.0f), + }; + auto quant_output_spec = TensorSpec{ + .dtype = DType::QInt8, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(2)}, + .quant_params = qint8_per_tensor_sym(1.0f), + }; + auto float_output_spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(2)}, + }; + + auto input = builder.createInput(input_spec); + + // Weight [2, 4], qcint4 per-channel (axis 0), scales = [1, 1]. + // Logical values: row0 = [1,0,0,0], row1 = [0,1,0,0]. + // qcint4 zero_point is 8, so stored nibble = value + 8. + // row0 nibbles = [9,8,8,8], row1 nibbles = [8,9,8,8]. + // Packed 2 nibbles/byte, element i in byte i/2 (low nibble for even i). + auto filter_tensor = std::make_shared(); + filter_tensor->dtype = DType::QInt4; + filter_tensor->sizes = {2, 4}; + filter_tensor->storage = make_owned(4); // 8 nibbles -> 4 bytes + auto* fw = filter_tensor->data_mut(); + fw[0] = (8 << 4) | 9; // row0 nibbles 0,1 = 9,8 + fw[1] = (8 << 4) | 8; // row0 nibbles 2,3 = 8,8 + fw[2] = (9 << 4) | 8; // row1 nibbles 0,1 = 8,9 + fw[3] = (8 << 4) | 8; // row1 nibbles 2,3 = 8,8 + filter_tensor->aux_storage.push_back(make_owned(2 * sizeof(float))); + auto* scales = static_cast(filter_tensor->aux_storage[0].data); + scales[0] = 1.0f; + scales[1] = 1.0f; + auto filter = builder.createConstant(filter_tensor, qint8_per_channel_sym(0)); + + auto linear_out = builder.createOperator( + Operator::Linear, + quant_output_spec, + {input, filter, ValueHandle::null()}); + auto dequant_out = builder.createOperator( + Operator::Dequantize, float_output_spec, {linear_out}); + builder.createOutput(dequant_out); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + Tensor ti; + ti.dtype = DType::QInt8; + ti.sizes = {1, 4}; + ti.storage = make_owned(4); + auto* di = ti.data_mut(); + di[0] = 1; + di[1] = 2; + di[2] = 3; + di[3] = 4; + + std::vector inputs; + inputs.push_back(std::move(ti)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto& outputs = *outputs_result; + + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs[0].sizes, (std::vector{1, 2})); + auto* d = outputs[0].data_const(); + // out[0] = row0 . input = 1*1 = 1 ; out[1] = row1 . input = 1*2 = 2 + EXPECT_FLOAT_EQ(d[0], 1.0f); + EXPECT_FLOAT_EQ(d[1], 2.0f); +} + +TEST(TestE2E, linear_qd8_qcint4_dynamic) { + auto builder = GraphBuilder(); + + // Float input [2, 4]; dynamically quantized to qdint8 before the matmul. + auto float_input_spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(2), DimSizeSpec::constant(4)}, + }; + auto dyn_quant_spec = TensorSpec{ + .dtype = DType::QInt8, + .sizes = {DimSizeSpec::constant(2), DimSizeSpec::constant(4)}, + .quant_params = PerRowQuantParams{.axis = -1, .is_dynamic = true}, + }; + auto float_output_spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(2), DimSizeSpec::constant(2)}, + }; + + auto input = builder.createInput(float_input_spec); + auto qinput = + builder.createOperator(Operator::Quantize, dyn_quant_spec, {input}); + + // Weight [2, 4], qcint4 per-channel (axis 0), non-trivial signed nibbles + // and per-channel scales to exercise sign, high/low nibble order, and scale. + // row0 logical = [ 3, -2, 1, 0], scale0 = 2.0 + // row1 logical = [-4, 7, -1, 2], scale1 = 0.5 + // nibble = logical + 8: + // row0 = [11, 6, 9, 8], row1 = [4, 15, 7, 10] + auto filter_tensor = std::make_shared(); + filter_tensor->dtype = DType::QInt4; + filter_tensor->sizes = {2, 4}; + filter_tensor->storage = make_owned(4); + auto* fw = filter_tensor->data_mut(); + fw[0] = (6 << 4) | 11; // row0 nibbles 0,1 + fw[1] = (8 << 4) | 9; // row0 nibbles 2,3 + fw[2] = (15 << 4) | 4; // row1 nibbles 0,1 + fw[3] = (10 << 4) | 7; // row1 nibbles 2,3 + filter_tensor->aux_storage.push_back(make_owned(2 * sizeof(float))); + auto* scales = static_cast(filter_tensor->aux_storage[0].data); + scales[0] = 2.0f; + scales[1] = 0.5f; + auto filter = builder.createConstant(filter_tensor, qint8_per_channel_sym(0)); + + auto linear_out = builder.createOperator( + Operator::Linear, + float_output_spec, + {qinput, filter, ValueHandle::null()}); + builder.createOutput(linear_out); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + Tensor ti; + ti.dtype = DType::Float32; + ti.sizes = {2, 4}; + ti.storage = make_owned(8 * sizeof(float)); + auto* di = ti.data_mut(); + // row0 = [1,2,3,4], row1 = [2,1,0,-1] + di[0] = 1; + di[1] = 2; + di[2] = 3; + di[3] = 4; + di[4] = 2; + di[5] = 1; + di[6] = 0; + di[7] = -1; + + std::vector inputs; + inputs.push_back(std::move(ti)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto& outputs = *outputs_result; + + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs[0].sizes, (std::vector{2, 2})); + auto* d = outputs[0].data_const(); + // input row0 = [1,2,3,4]: + // out[0][0] = 2.0*(3*1 - 2*2 + 1*3 + 0*4) = 2.0*2 = 4.0 + // out[0][1] = 0.5*(-4*1 + 7*2 - 1*3 + 2*4) = 0.5*15 = 7.5 + // input row1 = [2,1,0,-1]: + // out[1][0] = 2.0*(3*2 - 2*1 + 1*0 + 0*-1) = 2.0*4 = 8.0 + // out[1][1] = 0.5*(-4*2 + 7*1 - 1*0 + 2*-1) = 0.5*-3 = -1.5 + EXPECT_NEAR(d[0], 4.0f, 1e-1f); + EXPECT_NEAR(d[1], 7.5f, 1e-1f); + EXPECT_NEAR(d[2], 8.0f, 1e-1f); + EXPECT_NEAR(d[3], -1.5f, 1e-1f); +} + +TEST(TestE2E, linear_qint8_static_requantized) { + auto builder = GraphBuilder(); + + auto input_spec = TensorSpec{ + .dtype = DType::QInt8, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(3)}, + .quant_params = qint8_per_tensor_sym(1.0f), + }; + auto output_spec = TensorSpec{ + .dtype = DType::QInt8, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(2)}, + .quant_params = qint8_per_tensor_sym(1.0f), + }; + + auto input = builder.createInput(input_spec); + + auto filter_tensor = std::make_shared(); + filter_tensor->dtype = DType::QInt8; + filter_tensor->sizes = {2, 3}; + filter_tensor->storage = make_owned(6); + auto* fw = filter_tensor->data_mut(); + fw[0] = 1; + fw[1] = 0; + fw[2] = 0; + fw[3] = 0; + fw[4] = 1; + fw[5] = 0; + filter_tensor->aux_storage.push_back(make_owned(2 * sizeof(float))); + auto* scales = static_cast(filter_tensor->aux_storage[0].data); + scales[0] = 1.0f; + scales[1] = 1.0f; + auto filter = builder.createConstant(filter_tensor, qint8_per_channel_sym(0)); + + auto out = builder.createOperator( + Operator::Linear, output_spec, {input, filter, ValueHandle::null()}); + builder.createOutput(out); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + Tensor ti; + ti.dtype = DType::QInt8; + ti.sizes = {1, 3}; + ti.storage = make_owned(3); + auto* di = ti.data_mut(); + di[0] = 1; + di[1] = 2; + di[2] = 3; + + std::vector inputs; + inputs.push_back(std::move(ti)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto& outputs = *outputs_result; + + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs[0].sizes, (std::vector{1, 2})); + auto* d = outputs[0].data_const(); + EXPECT_EQ(d[0], 1); + EXPECT_EQ(d[1], 2); +} + +TEST(TestE2E, dequantize_quint8) { + auto builder = GraphBuilder(); + + auto input_spec = TensorSpec{ + .dtype = DType::QUInt8, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(4)}, + .quant_params = quint8_per_tensor_asym(0.5f, 1), + }; + auto output_spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(4)}, + }; + + auto input = builder.createInput(input_spec); + auto out = builder.createOperator(Operator::Dequantize, output_spec, {input}); + builder.createOutput(out); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + Tensor ti; + ti.dtype = DType::QUInt8; + ti.sizes = {1, 4}; + ti.storage = make_owned(4); + auto* di = ti.data_mut(); + di[0] = 0; + di[1] = 1; + di[2] = 2; + di[3] = 3; + + std::vector inputs; + inputs.push_back(std::move(ti)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto& outputs = *outputs_result; + + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs[0].sizes, (std::vector{1, 4})); + auto* d = outputs[0].data_const(); + EXPECT_FLOAT_EQ(d[0], -0.5f); + EXPECT_FLOAT_EQ(d[1], 0.0f); + EXPECT_FLOAT_EQ(d[2], 0.5f); + EXPECT_FLOAT_EQ(d[3], 1.0f); +} + +TEST(TestE2E, quantize_quint8) { + auto builder = GraphBuilder(); + + auto input_spec = TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(4)}, + }; + auto output_spec = TensorSpec{ + .dtype = DType::QUInt8, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(4)}, + .quant_params = quint8_per_tensor_asym(0.5f, 1), + }; + + auto input = builder.createInput(input_spec); + auto out = builder.createOperator(Operator::Quantize, output_spec, {input}); + builder.createOutput(out); + + auto graph = builder.build(); + auto executor_result = Executor::build(graph); + ASSERT_TRUE(executor_result.ok()); + auto& executor = *executor_result; + + Tensor ti; + ti.dtype = DType::Float32; + ti.sizes = {1, 4}; + ti.storage = make_owned(4 * sizeof(float)); + auto* di = ti.data_mut(); + di[0] = -0.5f; + di[1] = 0.0f; + di[2] = 0.5f; + di[3] = 1.0f; + + std::vector inputs; + inputs.push_back(std::move(ti)); + + auto outputs_result = executor.run({inputs.data(), inputs.size()}); + ASSERT_TRUE(outputs_result.ok()); + auto& outputs = *outputs_result; + + ASSERT_EQ(outputs.size(), 1); + ASSERT_EQ(outputs[0].sizes, (std::vector{1, 4})); + auto* d = outputs[0].data_const(); + EXPECT_EQ(d[0], 0); + EXPECT_EQ(d[1], 1); + EXPECT_EQ(d[2], 2); + EXPECT_EQ(d[3], 3); +}