From a8cd74b1fd685ec35fba4999c0f26ae9255b2309 Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Fri, 12 Jun 2026 14:30:22 -0700 Subject: [PATCH] Update [ghstack-poisoned] --- backends/xnnpack/CMakeLists.txt | 3 + .../xnnpack/runtime/plan/execution_plan.cpp | 118 +++ .../xnnpack/runtime/plan/execution_plan.h | 48 + backends/xnnpack/runtime/plan/schedule.cpp | 58 ++ backends/xnnpack/runtime/plan/schedule.h | 17 + .../xnnpack/runtime/plan/xnn_subgraph.cpp | 879 ++++++++++++++++++ backends/xnnpack/runtime/plan/xnn_subgraph.h | 44 + backends/xnnpack/test/CMakeLists.txt | 2 +- .../xnnpack/test/runtime/test_schedule.cpp | 162 ++++ 9 files changed, 1330 insertions(+), 1 deletion(-) create mode 100644 backends/xnnpack/runtime/plan/execution_plan.cpp create mode 100644 backends/xnnpack/runtime/plan/execution_plan.h create mode 100644 backends/xnnpack/runtime/plan/schedule.cpp create mode 100644 backends/xnnpack/runtime/plan/schedule.h create mode 100644 backends/xnnpack/runtime/plan/xnn_subgraph.cpp create mode 100644 backends/xnnpack/runtime/plan/xnn_subgraph.h create mode 100644 backends/xnnpack/test/runtime/test_schedule.cpp diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 02fd8373275..c0cf4db0921 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -113,6 +113,9 @@ list( backends/xnnpack/runtime/executor/shape_env.cpp backends/xnnpack/runtime/plan/xnn_support.cpp backends/xnnpack/runtime/plan/partition.cpp + backends/xnnpack/runtime/plan/xnn_subgraph.cpp + backends/xnnpack/runtime/plan/schedule.cpp + backends/xnnpack/runtime/plan/execution_plan.cpp ) list(TRANSFORM _xnnpack_backend__srcs PREPEND "${EXECUTORCH_ROOT}/") diff --git a/backends/xnnpack/runtime/plan/execution_plan.cpp b/backends/xnnpack/runtime/plan/execution_plan.cpp new file mode 100644 index 00000000000..326d38cb59e --- /dev/null +++ b/backends/xnnpack/runtime/plan/execution_plan.cpp @@ -0,0 +1,118 @@ +#include +#include +#include 
+#include +#include +#include +#include +#include + +namespace executorch::backends::xnnpack::plan { + +using executorch::runtime::Span; +using namespace graph; + +namespace { + +uint32_t assign_value_slots( + graph::Graph& graph, + Span linear_schedule) { + uint32_t next_slot = 0; + for (auto nh : linear_schedule) { + graph.nodes[nh].tag = next_slot; + next_slot += graph.nodes[nh].output_count(); + } + return next_slot; +} + +runtime::Result> create_plan_steps( + const graph::Graph& graph, + Span linear_schedule) { + std::vector steps; + steps.reserve(linear_schedule.size()); + + for (auto node_handle : linear_schedule) { + auto& node = graph.nodes[node_handle]; + + runtime::Error err = runtime::Error::Ok; + std::visit( + overloaded{ + [&](const CallSubgraphNode& n) { + std::vector external_value_slots; + external_value_slots.reserve(n.args.size() + node.output_count()); + + for (const auto& arg : n.args) { + auto slot = graph.nodes[arg.node].tag + arg.output; + external_value_slots.push_back(slot); + } + + for (uint32_t i = 0; i < node.output_count(); i++) { + external_value_slots.push_back(node.tag + i); + } + + auto runtime_result = compile_xnn_subgraph(*n.subgraph, nullptr); + if (!runtime_result.ok()) { + err = runtime_result.error(); + return; + } + + RunXnnSubgraphStep step; + step.runtime = std::move(*runtime_result); + step.external_value_slots = std::move(external_value_slots); + step.num_external_inputs = n.args.size(); + steps.push_back(std::move(step)); + }, + [](const InputNode&) {}, + [](const ConstantNode&) {}, + [&steps, &node, &graph, &err](const CallOperatorNode& n) { + std::vector input_slots; + input_slots.reserve(n.args.size()); + for (const auto& arg : n.args) { + if (arg.is_null()) + continue; + input_slots.push_back(graph.nodes[arg.node].tag + arg.output); + } + + std::vector output_slots; + for (uint32_t i = 0; i < node.output_count(); i++) { + output_slots.push_back(node.tag + i); + } + + auto op = operators::create_operator(n.op); + if 
(op == nullptr) { + err = runtime::Error::NotSupported; + return; + } + op->setup({n.constant_args.data(), n.constant_args.size()}); + + RunOperatorStep step; + step.op = std::move(op); + step.input_slots = std::move(input_slots); + step.output_slots = std::move(output_slots); + steps.push_back(std::move(step)); + }}, + node.value); + ET_CHECK_OK_OR_RETURN_ERROR(err); + } + + return steps; +} +} // namespace +/* Builds an ExecutionPlan: runs partition_xnn_subgraphs on the graph, computes a linear schedule, assigns value slots along that schedule, and turns the scheduled nodes into plan steps. Failures from partitioning or step creation are handed back through the ET_* macros. */ +runtime::Result create_execution_plan(graph::Graph& graph) { + ET_CHECK_OK_OR_RETURN_ERROR(partition_xnn_subgraphs(graph)); + + auto linear_schedule = schedule(graph); + Span schedule_span( + linear_schedule.data(), linear_schedule.size()); + auto num_value_slots = assign_value_slots(graph, schedule_span); + (void)num_value_slots; /* the total slot count is not used further in this function */ + + ET_UNWRAP(plan_steps, create_plan_steps(graph, schedule_span)); + ExecutionPlan plan; + plan.steps = std::move(plan_steps); + + return plan; +} + +} // namespace executorch::backends::xnnpack::plan diff --git a/backends/xnnpack/runtime/plan/execution_plan.h b/backends/xnnpack/runtime/plan/execution_plan.h new file mode 100644 index 00000000000..2d3b75cf0f1 --- /dev/null +++ b/backends/xnnpack/runtime/plan/execution_plan.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace executorch::backends::xnnpack::graph { +struct Graph; +} + +namespace executorch::backends::xnnpack::plan { + +using ValueSlot = uint32_t; + +/* Run an operator with the given inputs and outputs. */ +struct RunOperatorStep { + std::unique_ptr op; + std::vector input_slots; + std::vector output_slots; +}; + +/* Run a subgraph delegated to XNNPACK. */ +struct RunXnnSubgraphStep { + XnnRuntime runtime; + std::vector external_value_slots; + uint32_t num_external_inputs = 0; +}; + +using PlanStep = std::variant; + +/* + * Describes the planned execution steps for a compiled graph. + */ +struct ExecutionPlan { + std::vector steps; +}; + +/* + * Build an execution plan from a model graph. 
This pre-processes the + * graph into a format suitable for efficient execution. + */ +runtime::Result create_execution_plan(graph::Graph& graph); + +} // namespace executorch::backends::xnnpack::plan diff --git a/backends/xnnpack/runtime/plan/schedule.cpp b/backends/xnnpack/runtime/plan/schedule.cpp new file mode 100644 index 00000000000..512eb621d71 --- /dev/null +++ b/backends/xnnpack/runtime/plan/schedule.cpp @@ -0,0 +1,58 @@ +#include + +#include +#include +#include +#include + +namespace executorch::backends::xnnpack::plan { + +using namespace graph; +/* Kahn-style topological sort. For every node it counts the non-null args that are not produced by the two node kinds tested with std::holds_alternative (their template arguments are not visible in this text; presumably InputNode and ConstantNode, to be confirmed). Nodes with a zero count seed a FIFO queue; each popped node is appended to the order and, unless it is one of those two kinds, decrements the count of every entry in its users list. Relies on nodes[i].users being populated before the call. */ +std::vector schedule(const graph::Graph& graph) { + const auto& nodes = graph.nodes; + + std::vector in_edges(nodes.size(), 0); + std::deque queue; + + for (NodeHandle n = 0; n < nodes.size(); n++) { + auto& args = nodes[n].get_args(); + in_edges[n] = + std::count_if(args.begin(), args.end(), [&](const ValueHandle& a) { + if (a.is_null()) + return false; + return !std::holds_alternative(nodes[a.node].value) && + !std::holds_alternative(nodes[a.node].value); + }); + if (in_edges[n] == 0) { + queue.push_back(n); + } + } + + std::vector order; + order.reserve(nodes.size()); + + while (!queue.empty()) { + auto nh = queue.front(); + queue.pop_front(); + order.push_back(nh); + + if (std::holds_alternative(nodes[nh].value) || + std::holds_alternative(nodes[nh].value)) { + continue; + } + + for (auto user : nodes[nh].users) { + assert(in_edges[user] > 0); + in_edges[user]--; + if (in_edges[user] == 0) { + queue.push_back(user); + } + } + } + + assert(order.size() == nodes.size()); /* NOTE(review): assert is a no-op under NDEBUG, so a partial order would be returned silently if some node never reaches zero in_edges; confirm this is acceptable */ + return order; +} + +} // namespace executorch::backends::xnnpack::plan diff --git a/backends/xnnpack/runtime/plan/schedule.h b/backends/xnnpack/runtime/plan/schedule.h new file mode 100644 index 00000000000..08c860ff98d --- /dev/null +++ b/backends/xnnpack/runtime/plan/schedule.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +#include + +namespace executorch::backends::xnnpack::plan { + +/* + * Flatten a computational graph down 
to a linear schedule. This is + * an ordering of nodes that respects dependency orders - i.e. a + * topological sort. + */ +std::vector schedule(const graph::Graph& graph); + +} // namespace executorch::backends::xnnpack::plan diff --git a/backends/xnnpack/runtime/plan/xnn_subgraph.cpp b/backends/xnnpack/runtime/plan/xnn_subgraph.cpp new file mode 100644 index 00000000000..31b387bb5f2 --- /dev/null +++ b/backends/xnnpack/runtime/plan/xnn_subgraph.cpp @@ -0,0 +1,879 @@ +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace executorch::backends::xnnpack::plan { + +using core::DType; +using executorch::runtime::Span; +using graph::CallOperatorNode; +using graph::ConstantNode; +using graph::InputNode; +using graph::NodeHandle; +using graph::Operator; + +namespace { + +runtime::Result map_xnn_datatype(const graph::TensorSpec& spec) { + if (!spec.quant_params) { + switch (spec.dtype) { + case DType::Float32: + return xnn_datatype_fp32; + case DType::Float16: + return xnn_datatype_fp16; + default: + ET_LOG(Error, "Unsupported dtype for XNNPACK delegation"); + return runtime::Error::NotSupported; + } + } + switch (spec.dtype) { + case DType::QUInt8: + return xnn_datatype_quint8; + case DType::QInt8: + if (std::holds_alternative( + *spec.quant_params)) { + return xnn_datatype_qcint8; + } + return xnn_datatype_qint8; + case DType::QInt32: + return xnn_datatype_qint32; + default: + ET_LOG(Error, "Unsupported quantized dtype for XNNPACK delegation"); + return runtime::Error::NotSupported; + } +} + +std::optional map_binary_op(Operator op) { + switch (op) { + case Operator::Add: + return xnn_binary_add; + case Operator::Subtract: + return xnn_binary_subtract; + case Operator::Multiply: + return xnn_binary_multiply; + case Operator::Divide: + return xnn_binary_divide; + case Operator::Maximum: + return xnn_binary_maximum; + case Operator::Minimum: + return xnn_binary_minimum; + case 
Operator::CopySign: + return xnn_binary_copysign; + case Operator::SquaredDifference: + return xnn_binary_squared_difference; + case Operator::Modulus: + return xnn_binary_modulus; + case Operator::Atan2: + return xnn_binary_atan2; + case Operator::Pow: + return xnn_binary_pow; + default: + return std::nullopt; + } +} + +std::optional map_unary_op(Operator op) { + switch (op) { + case Operator::Abs: + return xnn_unary_abs; + case Operator::Negate: + return xnn_unary_negate; + case Operator::Clamp: + return xnn_unary_clamp; + case Operator::Ceiling: + return xnn_unary_ceiling; + case Operator::Floor: + return xnn_unary_floor; + case Operator::Round: + return xnn_unary_bankers_rounding; + case Operator::Square: + return xnn_unary_square; + case Operator::SquareRoot: + return xnn_unary_square_root; + case Operator::ReciprocalSquareRoot: + return xnn_unary_reciprocal_square_root; + case Operator::Exp: + return xnn_unary_exp; + case Operator::Log: + return xnn_unary_log; + case Operator::Sigmoid: + return xnn_unary_sigmoid; + case Operator::Tanh: + return xnn_unary_tanh; + case Operator::ELU: + return xnn_unary_elu; + case Operator::GELU: + return xnn_unary_gelu; + case Operator::HardSwish: + return xnn_unary_hardswish; + case Operator::LeakyReLU: + return xnn_unary_leaky_relu; + case Operator::Sine: + return xnn_unary_sine; + case Operator::Cosine: + return xnn_unary_cosine; + case Operator::Sign: + return xnn_unary_sign; + case Operator::ReLU: + return xnn_unary_clamp; + default: + return std::nullopt; + } +} + +template +T get_const(const graph::ConstantArg& arg) { + return std::get(arg); +} + +template +std::vector to_size_vec(const std::vector& v) { + return {v.begin(), v.end()}; +} + +runtime::Error define_node( + const CallOperatorNode& op, + uint32_t output_id, + Span xnn_ids, + xnn_subgraph_t subgraph) { + if (auto bin_op = map_binary_op(op.op)) { + xnn_binary_params params = { + .output_min = op.output_min, + .output_max = op.output_max, + }; + auto status = 
xnn_define_binary( + subgraph, + *bin_op, + &params, + xnn_ids[op.args[0].node], + xnn_ids[op.args[1].node], + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::PReLU) { + auto status = xnn_define_prelu( + subgraph, + xnn_ids[op.args[0].node], + xnn_ids[op.args[1].node], + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (auto unary_op = map_unary_op(op.op)) { + xnn_unary_params params = {}; + + if (op.op == Operator::Clamp) { + params.clamp.min = + static_cast(get_const(op.constant_args[0])); + params.clamp.max = + static_cast(get_const(op.constant_args[1])); + } else if (op.op == Operator::ReLU) { + params.clamp.min = 0.0f; + params.clamp.max = INFINITY; + } else if (op.op == Operator::ELU) { + params.elu.alpha = op.constant_args.empty() + ? 1.0f + : static_cast(get_const(op.constant_args[0])); + } else if (op.op == Operator::LeakyReLU) { + params.leaky_relu.negative_slope = + static_cast(get_const(op.constant_args[0])); + } + + auto status = xnn_define_unary( + subgraph, + *unary_op, + &params, + xnn_ids[op.args[0].node], + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::Linear) { + auto bias_id = + op.args[2].is_null() ? 
XNN_INVALID_VALUE_ID : xnn_ids[op.args[2].node]; + auto status = xnn_define_fully_connected( + subgraph, + op.output_min, + op.output_max, + xnn_ids[op.args[0].node], + xnn_ids[op.args[1].node], + bias_id, + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::BatchMatrixMultiply) { + auto status = xnn_define_batch_matrix_multiply( + subgraph, + xnn_ids[op.args[0].node], + xnn_ids[op.args[1].node], + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::Conv2d) { + auto stride = get_const>(op.constant_args[0]); + auto padding = get_const>(op.constant_args[1]); + auto dilation = get_const>(op.constant_args[2]); + auto groups = get_const(op.constant_args[3]); + auto kernel = get_const>(op.constant_args[4]); + auto group_input_channels = get_const(op.constant_args[5]); + auto group_output_channels = get_const(op.constant_args[6]); + auto bias_id = + op.args[2].is_null() ? 
XNN_INVALID_VALUE_ID : xnn_ids[op.args[2].node]; + + auto status = xnn_define_convolution_2d( + subgraph, + padding[0], + padding[1], + padding[0], + padding[1], + kernel[0], + kernel[1], + stride[0], + stride[1], + dilation[0], + dilation[1], + groups, + group_input_channels, + group_output_channels, + op.output_min, + op.output_max, + xnn_ids[op.args[0].node], + xnn_ids[op.args[1].node], + bias_id, + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::DepthwiseConv2d) { + auto stride = get_const>(op.constant_args[0]); + auto padding = get_const>(op.constant_args[1]); + auto dilation = get_const>(op.constant_args[2]); + auto groups = get_const(op.constant_args[3]); + auto kernel = get_const>(op.constant_args[4]); + auto group_input_channels = get_const(op.constant_args[5]); + auto group_output_channels = get_const(op.constant_args[6]); + auto bias_id = + op.args[2].is_null() ? 
XNN_INVALID_VALUE_ID : xnn_ids[op.args[2].node]; + + auto status = xnn_define_depthwise_convolution_2d( + subgraph, + padding[0], + padding[1], + padding[0], + padding[1], + kernel[0], + kernel[1], + stride[0], + stride[1], + dilation[0], + dilation[1], + group_output_channels / group_input_channels, // depth_multiplier + groups, // input_channels + op.output_min, + op.output_max, + xnn_ids[op.args[0].node], + xnn_ids[op.args[1].node], + bias_id, + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::ConvTranspose2d) { + auto stride = get_const>(op.constant_args[0]); + auto padding = get_const>(op.constant_args[1]); + auto output_padding = get_const>(op.constant_args[2]); + auto groups = get_const(op.constant_args[3]); + auto dilation = get_const>(op.constant_args[4]); + auto kernel = get_const>(op.constant_args[5]); + auto group_input_channels = get_const(op.constant_args[6]); + auto group_output_channels = get_const(op.constant_args[7]); + auto bias_id = + op.args[2].is_null() ? 
XNN_INVALID_VALUE_ID : xnn_ids[op.args[2].node]; + + auto status = xnn_define_deconvolution_2d( + subgraph, + padding[0], + padding[1], + padding[0], + padding[1], + output_padding[0], + output_padding[1], + kernel[0], + kernel[1], + stride[0], + stride[1], + dilation[0], + dilation[1], + groups, + group_input_channels, + group_output_channels, + op.output_min, + op.output_max, + xnn_ids[op.args[0].node], + xnn_ids[op.args[1].node], + bias_id, + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::AvgPool2d) { + auto kernel = get_const>(op.constant_args[0]); + auto stride = get_const>(op.constant_args[1]); + auto padding = get_const>(op.constant_args[2]); + uint32_t flags = 0; + auto status = xnn_define_average_pooling_2d( + subgraph, /* NOTE(review): the four padding arguments below are passed as [0],[0],[1],[1], whereas the deconvolution call above passes [0],[1],[0],[1]. If XNNPACK takes (top, right, bottom, left) here and padding is a two-element (height, width) list, this would swap right and bottom; confirm the serialized AvgPool2d padding layout. */ + padding[0], + padding[0], + padding[1], + padding[1], + kernel[0], + kernel[1], + stride[0], + stride[1], + -INFINITY, + INFINITY, + xnn_ids[op.args[0].node], + output_id, + flags); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::AdaptiveAvgPool2d) { + auto status = xnn_define_global_average_pooling_2d( + subgraph, /* lowered to global average pooling with unbounded output range and XNN_FLAG_KEEP_DIMS */ + -INFINITY, + INFINITY, + xnn_ids[op.args[0].node], + output_id, + XNN_FLAG_KEEP_DIMS); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::MaxPool2d) { + auto kernel = get_const>(op.constant_args[0]); + auto stride = get_const>(op.constant_args[1]); + auto padding = get_const>(op.constant_args[2]); + auto dilation = get_const>(op.constant_args[3]); + // padding is [top, left, bottom, right]; xnn wants (top, right, bottom, + // left). ceil_mode pooling relies on the asymmetric bottom/right padding. 
+ auto status = xnn_define_max_pooling_2d( + subgraph, + padding[0], + padding[3], + padding[2], + padding[1], + kernel[0], + kernel[1], + stride[0], + stride[1], + dilation[0], + dilation[1], + -INFINITY, + INFINITY, + xnn_ids[op.args[0].node], + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::Softmax) { + auto status = xnn_define_softmax( + subgraph, xnn_ids[op.args[0].node], output_id, /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::Mean || op.op == Operator::Sum) { + auto dims = get_const>(op.constant_args[0]); + auto keepdim = get_const(op.constant_args[1]); + std::vector reduction_axes(dims.begin(), dims.end()); + uint32_t flags = keepdim ? XNN_FLAG_KEEP_DIMS : 0; + + auto reduce_op = + (op.op == Operator::Mean) ? 
xnn_reduce_mean : xnn_reduce_sum; + + auto status = xnn_define_static_reduce( + subgraph, + reduce_op, + reduction_axes.size(), + reduction_axes.data(), + xnn_ids[op.args[0].node], + output_id, + flags); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::Reshape || op.op == Operator::View) { + auto shape = get_const>(op.constant_args[0]); + auto new_shape = to_size_vec(shape); + auto status = xnn_define_static_reshape( + subgraph, + new_shape.size(), + new_shape.data(), + xnn_ids[op.args[0].node], + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::Transpose || op.op == Operator::Permute) { + auto perm = get_const>(op.constant_args[0]); + auto perm_sz = to_size_vec(perm); + auto status = xnn_define_static_transpose( + subgraph, + perm_sz.size(), + perm_sz.data(), + xnn_ids[op.args[0].node], + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::Slice) { + auto dim = get_const(op.constant_args[0]); + auto start = get_const(op.constant_args[1]); + auto end = get_const(op.constant_args[2]); + (void)end; + + auto& out_spec = std::get(op.output_specs); + auto ndims = out_spec.sizes.size(); + std::vector offsets(ndims, 0); + std::vector sizes(ndims, 0); + for (size_t i = 0; i < ndims; i++) { + sizes[i] = static_cast(out_spec.sizes[i].offset); + } + offsets[dim] = static_cast(start); + + auto status = xnn_define_static_slice( + subgraph, + ndims, + offsets.data(), + sizes.data(), + xnn_ids[op.args[0].node], + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + 
+ if (op.op == Operator::Cat) { + auto axis = get_const(op.constant_args[0]); + std::vector input_ids; + for (auto& arg : op.args) { + input_ids.push_back(xnn_ids[arg.node]); + } + auto status = xnn_define_concatenate( + subgraph, + static_cast(axis), + input_ids.size(), + input_ids.data(), + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::Unsqueeze) { + auto dim = get_const(op.constant_args[0]); + size_t axis = static_cast(dim); + auto status = xnn_define_static_expand_dims( + subgraph, 1, &axis, xnn_ids[op.args[0].node], output_id, /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::Expand) { + auto shape = get_const>(op.constant_args[0]); + auto new_shape = to_size_vec(shape); + auto status = xnn_define_static_broadcast( + subgraph, + new_shape.size(), + new_shape.data(), + xnn_ids[op.args[0].node], + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::Clone) { + auto status = xnn_define_copy( + subgraph, xnn_ids[op.args[0].node], output_id, /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::Pad) { + auto pad = get_const>(op.constant_args[0]); + auto& out_spec = std::get(op.output_specs); + auto ndims = out_spec.sizes.size(); + std::vector pre_paddings(ndims, 0); + std::vector post_paddings(ndims, 0); + // The serialized pre/post paddings are already per-dimension in ascending + // (XNNPACK channels-last) order, matching xnn_define_static_constant_pad. 
+ for (size_t i = 0; i < pad.size() / 2 && i < ndims; i++) { + pre_paddings[i] = static_cast(pad[2 * i]); + post_paddings[i] = static_cast(pad[2 * i + 1]); + } + + float padding_value = op.constant_args.size() > 1 + ? static_cast(get_const(op.constant_args[1])) + : 0.0f; + auto status = xnn_define_static_constant_pad( + subgraph, + pre_paddings.data(), + post_paddings.data(), + padding_value, + xnn_ids[op.args[0].node], + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::StaticResizeBilinear2D) { + auto new_height = get_const(op.constant_args[0]); + auto new_width = get_const(op.constant_args[1]); + auto flags = static_cast(get_const(op.constant_args[2])); + auto status = xnn_define_static_resize_bilinear_2d( + subgraph, + static_cast(new_height), + static_cast(new_width), + xnn_ids[op.args[0].node], + output_id, + flags); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + if (op.op == Operator::Quantize || op.op == Operator::Dequantize) { + auto status = xnn_define_unary( + subgraph, + xnn_unary_convert, + nullptr, + xnn_ids[op.args[0].node], + output_id, + /*flags=*/0); + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK node"); + return runtime::Error::Ok; + } + + ET_LOG(Error, "Unsupported operator for XNNPACK delegation"); + return runtime::Error::NotSupported; +} + +runtime::Result define_tensor( + const graph::TensorSpec& spec, + xnn_subgraph_t subgraph, + bool is_input = false, + bool is_output = false, + uint32_t external_id = XNN_INVALID_VALUE_ID, + const core::Tensor* constant_tensor = nullptr) { + std::vector dims(spec.sizes.size()); + for (auto i = 0u; i < spec.sizes.size(); i++) { + auto& s = spec.sizes[i]; + dims[i] = s.is_constant() ? 
static_cast(s.offset) : 1; + } + + uint32_t flags = 0; + if (is_input) { + flags |= XNN_VALUE_FLAG_EXTERNAL_INPUT; + } + if (is_output) { + flags |= XNN_VALUE_FLAG_EXTERNAL_OUTPUT; + } + + const void* data = constant_tensor ? constant_tensor->storage.data : nullptr; + ET_UNWRAP(xnn_dtype, map_xnn_datatype(spec)); + + uint32_t id = 0; + xnn_status status; + + if (!spec.quant_params) { + status = xnn_define_tensor_value( + subgraph, + xnn_dtype, + spec.sizes.size(), + dims.data(), + data, + external_id, + flags, + &id); + } else if ( + auto* pt = std::get_if(&*spec.quant_params)) { + status = xnn_define_quantized_tensor_value( + subgraph, + xnn_dtype, + pt->zero_point, + pt->scale, + spec.sizes.size(), + dims.data(), + data, + external_id, + flags, + &id); + } else if ( + auto* pa = std::get_if(&*spec.quant_params)) { + ET_CHECK_OR_RETURN_ERROR( + constant_tensor != nullptr && !constant_tensor->aux_storage.empty(), + NotSupported, + "Per-axis quantized tensor is missing scale data"); + auto* scales = + static_cast(constant_tensor->aux_storage[0].data); + int32_t zero_point = (xnn_dtype == xnn_datatype_qcint4) ? 
8 : 0; + status = xnn_define_channelwise_quantized_tensor_value_v2( + subgraph, + xnn_dtype, + zero_point, + scales, + spec.sizes.size(), + static_cast(pa->axis), + dims.data(), + data, + external_id, + flags, + &id); + } else { + ET_LOG(Error, "Unsupported quantization scheme for XNNPACK delegation"); + return runtime::Error::NotSupported; + } + + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to define XNNPACK tensor value: 0x%x", + (unsigned int)status); + return id; +} + +} // namespace + +runtime::Result build_xnn_subgraph(const graph::Graph& graph) { + auto num_external_values = + static_cast(graph.input_specs.size() + graph.outputs.size()); + + xnn_subgraph_t raw_subgraph = nullptr; + auto status = xnn_create_subgraph( + num_external_values, + /*flags=*/0, + &raw_subgraph); + + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to create XNNPACK subgraph"); + + auto subgraph = XnnSubgraph(raw_subgraph); + + std::vector xnn_input_ids(graph.input_specs.size()); + for (auto i = 0u; i < graph.input_specs.size(); i++) { + ET_UNWRAP( + input_id, + define_tensor( + graph.input_specs[i], + subgraph.get(), + /*is_input=*/true, + /*is_output=*/false, + /*external_id=*/i)); + xnn_input_ids[i] = input_id; + } + + std::vector xnn_output_ids(graph.outputs.size()); + for (auto i = 0u; i < graph.outputs.size(); i++) { + auto external_id = static_cast(i + graph.input_specs.size()); + ET_UNWRAP( + output_id, + define_tensor( + graph.get_tensor_spec(graph.outputs[i]), + subgraph.get(), + /*is_input=*/false, + /*is_output=*/true, + external_id)); + xnn_output_ids[i] = output_id; + } + + std::vector xnn_ids(graph.nodes.size(), XNN_INVALID_VALUE_ID); + + for (NodeHandle n = 0; n < graph.nodes.size(); n++) { + if (auto* inp = std::get_if(&graph.nodes[n].value)) { + xnn_ids[n] = xnn_input_ids[inp->input]; + } + } + + for (NodeHandle n = 0; n < graph.nodes.size(); n++) { + if (auto* cn = 
std::get_if(&graph.nodes[n].value)) { + auto spec = + std::get(graph.get_output_spec_for_node(n)); + ET_UNWRAP( + const_id, + define_tensor( + spec, + subgraph.get(), + /*is_input=*/false, + /*is_output=*/false, + XNN_INVALID_VALUE_ID, + cn->tensor.get())); + xnn_ids[n] = const_id; + } + } + + for (auto i = 0u; i < graph.outputs.size(); i++) { + xnn_ids[graph.outputs[i].node] = xnn_output_ids[i]; + } + + for (NodeHandle n = 0; n < graph.nodes.size(); n++) { + auto* op = std::get_if(&graph.nodes[n].value); + if (!op) + continue; + + if (xnn_ids[n] == XNN_INVALID_VALUE_ID) { + auto spec = std::get(op->output_specs); + ET_UNWRAP(op_output_id, define_tensor(spec, subgraph.get())); + xnn_ids[n] = op_output_id; + } + } + + for (NodeHandle n = 0; n < graph.nodes.size(); n++) { + auto* op = std::get_if(&graph.nodes[n].value); + if (!op) + continue; + + ET_CHECK_OK_OR_RETURN_ERROR(define_node( + *op, xnn_ids[n], {xnn_ids.data(), xnn_ids.size()}, subgraph.get())); + } + + return subgraph; +} + +runtime::Result compile_xnn_subgraph( + const graph::Graph& graph, + xnn_workspace_t workspace) { + ET_UNWRAP(subgraph, build_xnn_subgraph(graph)); + + xnn_runtime_t runtime = nullptr; + auto status = xnn_create_runtime_v4( + subgraph.get(), + /*weights_cache=*/nullptr, + workspace, + ::executorch::extension::threadpool::get_pthreadpool(), + /*flags=*/0, + &runtime); + + ET_CHECK_OR_RETURN_ERROR( + status == xnn_status_success, + Internal, + "Failed to create XNNPACK runtime"); + + return XnnRuntime(runtime); +} + +} // namespace executorch::backends::xnnpack::plan diff --git a/backends/xnnpack/runtime/plan/xnn_subgraph.h b/backends/xnnpack/runtime/plan/xnn_subgraph.h new file mode 100644 index 00000000000..53e48108fb9 --- /dev/null +++ b/backends/xnnpack/runtime/plan/xnn_subgraph.h @@ -0,0 +1,44 @@ +#pragma once + +#include + +#include + +#include + +namespace executorch::backends::xnnpack::graph { +struct Graph; +} + +namespace executorch::backends::xnnpack::plan { + +struct 
XnnSubgraphDeleter { + void operator()(xnn_subgraph_t subgraph) const { + xnn_delete_subgraph(subgraph); + } +}; + +struct XnnRuntimeDeleter { + void operator()(xnn_runtime_t runtime) const { + xnn_delete_runtime(runtime); + } +}; + +struct XnnWorkspaceDeleter { + void operator()(xnn_workspace_t workspace) const { + xnn_release_workspace(workspace); + } +}; + +using XnnSubgraph = std::unique_ptr; +using XnnRuntime = std::unique_ptr; +using XnnWorkspace = std::unique_ptr; + +/* + * Prepare a (sub)graph for XNNPACK execution. + */ +runtime::Result compile_xnn_subgraph( + const graph::Graph& graph, + xnn_workspace_t workspace); + +} // namespace executorch::backends::xnnpack::plan diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt index 0e2be1aacc4..bcf84ef6593 100644 --- a/backends/xnnpack/test/CMakeLists.txt +++ b/backends/xnnpack/test/CMakeLists.txt @@ -45,7 +45,7 @@ target_include_directories( set(_graph_runtime_test_srcs runtime/test_quant_params.cpp runtime/test_graph_builder.cpp runtime/test_shape_env.cpp runtime/test_arena.cpp - runtime/test_partition.cpp + runtime/test_partition.cpp runtime/test_schedule.cpp ) et_cxx_test( diff --git a/backends/xnnpack/test/runtime/test_schedule.cpp b/backends/xnnpack/test/runtime/test_schedule.cpp new file mode 100644 index 00000000000..93ffb4b33c5 --- /dev/null +++ b/backends/xnnpack/test/runtime/test_schedule.cpp @@ -0,0 +1,162 @@ +#include + +#include +#include +#include + +#include +#include + +using namespace executorch::backends::xnnpack::core; +using namespace executorch::backends::xnnpack::graph; +using namespace executorch::backends::xnnpack::plan; + +// Helper: check that the order is a valid topological sort of the graph. +static void assert_topological( + const Graph& graph, + const std::vector& order) { + ASSERT_EQ(order.size(), graph.nodes.size()); + + // Build position map: node -> index in order. 
+ std::vector pos(graph.nodes.size()); + for (uint32_t i = 0; i < order.size(); i++) { + pos[order[i]] = i; + } + + // Every arg source must appear before the node that uses it. + for (NodeHandle n = 0; n < graph.nodes.size(); n++) { + for (auto& arg : graph.nodes[n].get_args()) { + EXPECT_LT(pos[arg.node], pos[n]) + << "Node " << arg.node << " (arg) should appear before node " << n; + } + } +} + +// Helper: check that all nodes appear exactly once. +static void assert_all_nodes_present( + const Graph& graph, + const std::vector& order) { + ASSERT_EQ(order.size(), graph.nodes.size()); + + std::unordered_set seen(order.begin(), order.end()); + EXPECT_EQ(seen.size(), graph.nodes.size()); +} + +static TensorSpec make_spec() { + return TensorSpec{ + .dtype = DType::Float32, + .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(10)}}; +} + +TEST(TestSchedule, linear_chain) { + // input -> op1 -> op2 -> output + auto builder = GraphBuilder(); + auto spec = make_spec(); + + auto input = builder.createInput(spec); + auto op1 = builder.createOperator(Operator::Add, spec, input, input); + auto op2 = builder.createOperator(Operator::Add, spec, op1, input); + builder.createOutput(op2); + + auto graph = builder.build(); + graph.update_users(); + + auto order = schedule(graph); + + assert_all_nodes_present(graph, order); + assert_topological(graph, order); + + // Input must be first, then op1, then op2. 
+ EXPECT_EQ(order[0], input.node); + EXPECT_EQ(order[1], op1.node); + EXPECT_EQ(order[2], op2.node); +} + +TEST(TestSchedule, diamond) { + // input_a, input_b -> add1; input_b -> add2; add1, add2 -> add3 + auto builder = GraphBuilder(); + auto spec = make_spec(); + + auto input_a = builder.createInput(spec); + auto input_b = builder.createInput(spec); + auto add1 = builder.createOperator(Operator::Add, spec, input_a, input_b); + auto add2 = builder.createOperator(Operator::Add, spec, add1, input_b); + builder.createOutput(add2); + + auto graph = builder.build(); + graph.update_users(); + + auto order = schedule(graph); + + assert_all_nodes_present(graph, order); + assert_topological(graph, order); + + // Both inputs before add1, add1 before add2. + std::vector pos(graph.nodes.size()); + for (uint32_t i = 0; i < order.size(); i++) { + pos[order[i]] = i; + } + EXPECT_LT(pos[input_a.node], pos[add1.node]); + EXPECT_LT(pos[input_b.node], pos[add1.node]); + EXPECT_LT(pos[add1.node], pos[add2.node]); +} + +TEST(TestSchedule, post_fusion) { + auto builder = GraphBuilder(); + auto spec = make_spec(); + + auto input_a = builder.createInput(spec); + auto input_b = builder.createInput(spec); + auto add1 = builder.createOperator(Operator::Add, spec, input_a, input_b); + auto add2 = builder.createOperator(Operator::Add, spec, add1, input_b); + auto add3 = builder.createOperator(Operator::Add, spec, add2, input_b); + builder.createOutput(add3); + + auto graph = builder.build(); + + // Mark all ops for XNNPACK so they get fused. + graph.nodes[add1.node].flags |= NodeFlags::UseXnnpack; + graph.nodes[add2.node].flags |= NodeFlags::UseXnnpack; + graph.nodes[add3.node].flags |= NodeFlags::UseXnnpack; + + ASSERT_EQ(partition_xnn_subgraphs(graph), executorch::runtime::Error::Ok); + + auto order = schedule(graph); + + assert_all_nodes_present(graph, order); + assert_topological(graph, order); + + // Find the CallSubgraphNode and verify it comes after inputs. 
+ std::vector pos(graph.nodes.size()); + for (uint32_t i = 0; i < order.size(); i++) { + pos[order[i]] = i; + } + + for (NodeHandle n = 0; n < graph.nodes.size(); n++) { + if (std::holds_alternative(graph.nodes[n].value)) { + auto& fused = std::get(graph.nodes[n].value); + for (auto& arg : fused.args) { + EXPECT_LT(pos[arg.node], pos[n]); + } + } + } +} + +TEST(TestSchedule, multiple_inputs_no_ops) { + // Graph with only input nodes (degenerate case). + auto builder = GraphBuilder(); + auto spec = make_spec(); + + auto input_a = builder.createInput(spec); + auto input_b = builder.createInput(spec); + builder.createOutput(input_a); + builder.createOutput(input_b); + + auto graph = builder.build(); + graph.update_users(); + + auto order = schedule(graph); + + assert_all_nodes_present(graph, order); + EXPECT_EQ(order.size(), 2u); +}