From a8cd74b1fd685ec35fba4999c0f26ae9255b2309 Mon Sep 17 00:00:00 2001
From: Gregory Comer <gjcomer@meta.com>
Date: Fri, 12 Jun 2026 14:30:22 -0700
Subject: [PATCH] Update

[ghstack-poisoned]
---
 backends/xnnpack/CMakeLists.txt               |   3 +
 .../xnnpack/runtime/plan/execution_plan.cpp   | 118 +++
 .../xnnpack/runtime/plan/execution_plan.h     |  48 +
 backends/xnnpack/runtime/plan/schedule.cpp    |  58 ++
 backends/xnnpack/runtime/plan/schedule.h      |  17 +
 .../xnnpack/runtime/plan/xnn_subgraph.cpp     | 879 ++++++++++++++++++
 backends/xnnpack/runtime/plan/xnn_subgraph.h  |  44 +
 backends/xnnpack/test/CMakeLists.txt          |   2 +-
 .../xnnpack/test/runtime/test_schedule.cpp    | 162 ++++
 9 files changed, 1330 insertions(+), 1 deletion(-)
 create mode 100644 backends/xnnpack/runtime/plan/execution_plan.cpp
 create mode 100644 backends/xnnpack/runtime/plan/execution_plan.h
 create mode 100644 backends/xnnpack/runtime/plan/schedule.cpp
 create mode 100644 backends/xnnpack/runtime/plan/schedule.h
 create mode 100644 backends/xnnpack/runtime/plan/xnn_subgraph.cpp
 create mode 100644 backends/xnnpack/runtime/plan/xnn_subgraph.h
 create mode 100644 backends/xnnpack/test/runtime/test_schedule.cpp
diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt
index 02fd8373275..c0cf4db0921 100644
--- a/backends/xnnpack/CMakeLists.txt
+++ b/backends/xnnpack/CMakeLists.txt
@@ -113,6 +113,9 @@ list(
   backends/xnnpack/runtime/executor/shape_env.cpp
   backends/xnnpack/runtime/plan/xnn_support.cpp
   backends/xnnpack/runtime/plan/partition.cpp
+  backends/xnnpack/runtime/plan/xnn_subgraph.cpp
+  backends/xnnpack/runtime/plan/schedule.cpp
+  backends/xnnpack/runtime/plan/execution_plan.cpp
 )
 
 list(TRANSFORM _xnnpack_backend__srcs PREPEND "${EXECUTORCH_ROOT}/")
diff --git a/backends/xnnpack/runtime/plan/execution_plan.cpp b/backends/xnnpack/runtime/plan/execution_plan.cpp
new file mode 100644
index 00000000000..326d38cb59e
--- /dev/null
+++ b/backends/xnnpack/runtime/plan/execution_plan.cpp
@@ -0,0 +1,118 @@
+#include <executorch/backends/xnnpack/runtime/operators/operator.h>
+#include <executorch/backends/xnnpack/runtime/plan/execution_plan.h>
+#include <executorch/backends/xnnpack/runtime/plan/partition.h>
+#include <executorch/backends/xnnpack/runtime/plan/schedule.h>
+#include <executorch/backends/xnnpack/runtime/plan/xnn_subgraph.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/span.h>
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch::backends::xnnpack::plan {
+
+using executorch::runtime::Span;
+using namespace graph;
+
+namespace {
+
+uint32_t assign_value_slots(
+    graph::Graph& graph,
+    Span<const NodeHandle> linear_schedule) {
+  uint32_t slot_count = 0; // doubles as the base slot for the next node
+  for (const NodeHandle handle : linear_schedule) {
+    graph.nodes[handle].tag = slot_count; // node's first output slot
+    slot_count += graph.nodes[handle].output_count();
+  }
+  return slot_count;
+}
+
+runtime::Result<std::vector<PlanStep>> create_plan_steps(
+    const graph::Graph& graph,
+    Span<const NodeHandle> linear_schedule) {
+  std::vector<PlanStep> plan;
+  plan.reserve(linear_schedule.size());
+  // A value's slot is its producer's base slot (tag) plus the output index.
+  const auto slot_of = [&graph](const auto& value) -> ValueSlot {
+    return graph.nodes[value.node].tag + value.output;
+  };
+
+  for (const NodeHandle handle : linear_schedule) {
+    const auto& current = graph.nodes[handle];
+    runtime::Error status = runtime::Error::Ok;
+    // Lowers a delegated subgraph: slots are listed inputs first, then outputs.
+    const auto lower_subgraph = [&](const CallSubgraphNode& call) {
+      const auto num_outputs = current.output_count();
+      std::vector<ValueSlot> io_slots;
+      io_slots.reserve(call.args.size() + num_outputs);
+      for (const auto& arg : call.args) {
+        io_slots.push_back(slot_of(arg));
+      }
+      for (uint32_t out = 0; out < num_outputs; out++) {
+        io_slots.push_back(current.tag + out);
+      }
+      auto compiled = compile_xnn_subgraph(*call.subgraph, nullptr);
+      if (!compiled.ok()) {
+        status = compiled.error();
+        return;
+      }
+      RunXnnSubgraphStep step;
+      step.runtime = std::move(*compiled);
+      step.external_value_slots = std::move(io_slots);
+      step.num_external_inputs = call.args.size();
+      plan.push_back(std::move(step));
+    };
+
+    // Lowers a standalone operator; null (absent) args get no input slot.
+    const auto lower_operator = [&](const CallOperatorNode& call) {
+      std::vector<ValueSlot> inputs;
+      inputs.reserve(call.args.size());
+      for (const auto& arg : call.args) {
+        if (!arg.is_null()) {
+          inputs.push_back(slot_of(arg));
+        }
+      }
+      std::vector<ValueSlot> outputs;
+      for (uint32_t out = 0; out < current.output_count(); out++) {
+        outputs.push_back(current.tag + out);
+      }
+      auto kernel = operators::create_operator(call.op);
+      if (kernel == nullptr) {
+        status = runtime::Error::NotSupported;
+        return;
+      }
+      kernel->setup({call.constant_args.data(), call.constant_args.size()});
+      RunOperatorStep step;
+      step.op = std::move(kernel);
+      step.input_slots = std::move(inputs);
+      step.output_slots = std::move(outputs);
+      plan.push_back(std::move(step));
+    };
+    // Input and constant nodes produce no step.
+    const auto skip_input = [](const InputNode&) {};
+    const auto skip_constant = [](const ConstantNode&) {};
+    std::visit(
+        overloaded{lower_subgraph, skip_input, skip_constant, lower_operator},
+        current.value);
+    ET_CHECK_OK_OR_RETURN_ERROR(status);
+  }
+
+  return plan;
+}
+} // namespace
+
+runtime::Result<ExecutionPlan> create_execution_plan(graph::Graph& graph) {
+  // Partition before scheduling; a partitioning error is returned as-is.
+  ET_CHECK_OK_OR_RETURN_ERROR(partition_xnn_subgraphs(graph));
+  const auto node_order = schedule(graph);
+  Span<const NodeHandle> order_view(node_order.data(), node_order.size());
+
+  // Records each node's base slot in its tag; the total is not consumed yet.
+  const auto slot_total = assign_value_slots(graph, order_view);
+  (void)slot_total;
+
+  ET_UNWRAP(plan_steps, create_plan_steps(graph, order_view));
+  ExecutionPlan result;
+  result.steps = std::move(plan_steps);
+  return result;
+}
+
+} // namespace executorch::backends::xnnpack::plan
diff --git a/backends/xnnpack/runtime/plan/execution_plan.h b/backends/xnnpack/runtime/plan/execution_plan.h
new file mode 100644
index 00000000000..2d3b75cf0f1
--- /dev/null
+++ b/backends/xnnpack/runtime/plan/execution_plan.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include <executorch/backends/xnnpack/runtime/operators/operator.h>
+#include <executorch/backends/xnnpack/runtime/plan/xnn_subgraph.h>
+#include <executorch/runtime/core/result.h>
+
+#include <memory>
+#include <variant>
+#include <vector>
+
+namespace executorch::backends::xnnpack::graph {
+struct Graph;
+}
+
+namespace executorch::backends::xnnpack::plan {
+
+using ValueSlot = uint32_t;
+
+/* Run an operator with the given inputs and outputs. */
+struct RunOperatorStep {
+  std::unique_ptr<operators::Operator> op; // operator instance to run
+  std::vector<ValueSlot> input_slots; // value slots passed as inputs
+  std::vector<ValueSlot> output_slots; // value slots receiving the outputs
+};
+
+/* Run a subgraph delegated to XNNPACK. */
+struct RunXnnSubgraphStep {
+  XnnRuntime runtime; // compiled XNNPACK runtime for the subgraph
+  std::vector<ValueSlot> external_value_slots; // inputs first, then outputs
+  uint32_t num_external_inputs = 0; // count of leading input entries
+};
+
+using PlanStep = std::variant<RunOperatorStep, RunXnnSubgraphStep>;
+
+/*
+ * Describes the planned execution steps for a compiled graph.
+ */
+struct ExecutionPlan {
+  std::vector<PlanStep> steps; // steps in the order they were planned
+};
+
+/*
+ * Build an execution plan from a model graph. This pre-processes the
+ * graph into a format suitable for efficient execution.
+ */
+runtime::Result<ExecutionPlan> create_execution_plan(graph::Graph& graph);
+
+} // namespace executorch::backends::xnnpack::plan
diff --git a/backends/xnnpack/runtime/plan/schedule.cpp b/backends/xnnpack/runtime/plan/schedule.cpp
new file mode 100644
index 00000000000..512eb621d71
--- /dev/null
+++ b/backends/xnnpack/runtime/plan/schedule.cpp
@@ -0,0 +1,58 @@
+#include <executorch/backends/xnnpack/runtime/plan/schedule.h>
+
+#include <algorithm>
+#include <cassert>
+#include <deque>
+#include <variant>
+
+namespace executorch::backends::xnnpack::plan {
+
+using namespace graph;
+
+std::vector<NodeHandle> schedule(const graph::Graph& graph) {
+  const auto& nodes = graph.nodes;
+  // Input and constant producers are not counted as dependencies.
+  const auto is_source = [&nodes](auto h) {
+    return std::holds_alternative<InputNode>(nodes[h].value) ||
+        std::holds_alternative<ConstantNode>(nodes[h].value);
+  };
+
+  // pending[n]: non-null args of n whose producer is not a source node.
+  std::vector<uint32_t> pending(nodes.size(), 0);
+  std::deque<NodeHandle> ready;
+  for (NodeHandle n = 0; n < nodes.size(); n++) {
+    for (const ValueHandle& arg : nodes[n].get_args()) {
+      if (!arg.is_null() && !is_source(arg.node)) {
+        pending[n]++;
+      }
+    }
+    if (pending[n] == 0) {
+      ready.push_back(n);
+    }
+  }
+
+  // Emit nodes in FIFO order, releasing a user once its count reaches zero.
+  std::vector<NodeHandle> order;
+  order.reserve(nodes.size());
+  while (!ready.empty()) {
+    const NodeHandle current = ready.front();
+    ready.pop_front();
+    order.push_back(current);
+    if (is_source(current)) {
+      continue; // source nodes were never counted, so nothing to release
+    }
+
+    for (auto user : nodes[current].users) {
+      assert(pending[user] > 0);
+      if (--pending[user] == 0) {
+        ready.push_back(user);
+      }
+    }
+  }
+
+  // Debug-only check that every node was scheduled.
+  assert(order.size() == nodes.size());
+  return order;
+}
+
+} // namespace executorch::backends::xnnpack::plan
diff --git a/backends/xnnpack/runtime/plan/schedule.h b/backends/xnnpack/runtime/plan/schedule.h
new file mode 100644
index 00000000000..08c860ff98d
--- /dev/null
+++ b/backends/xnnpack/runtime/plan/schedule.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <executorch/backends/xnnpack/runtime/graph/graph.h>
+#include <executorch/backends/xnnpack/runtime/graph/handles.h>
+
+#include <vector>
+
+namespace executorch::backends::xnnpack::plan {
+
+/*
+ * Flatten a computational graph down to a linear schedule. This is
+ * an ordering of nodes that respects dependency orders - i.e. a
+ * topological sort.
+ */
+std::vector<graph::NodeHandle> schedule(const graph::Graph& graph);
+
+} // namespace executorch::backends::xnnpack::plan
diff --git a/backends/xnnpack/runtime/plan/xnn_subgraph.cpp b/backends/xnnpack/runtime/plan/xnn_subgraph.cpp
new file mode 100644
index 00000000000..31b387bb5f2
--- /dev/null
+++ b/backends/xnnpack/runtime/plan/xnn_subgraph.cpp
@@ -0,0 +1,879 @@
+#include <executorch/backends/xnnpack/runtime/plan/xnn_subgraph.h>
+
+#include <executorch/backends/xnnpack/runtime/core/quant_params.h>
+#include <executorch/backends/xnnpack/runtime/core/tensor.h>
+
+#include <executorch/backends/xnnpack/runtime/graph/graph.h>
+#include <executorch/backends/xnnpack/runtime/graph/tensor_spec.h>
+
+#include <executorch/extension/threadpool/threadpool.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+#include <executorch/runtime/platform/log.h>
+
+#include <cassert>
+#include <cmath>
+#include <optional>
+
+namespace executorch::backends::xnnpack::plan {
+
+using core::DType;
+using executorch::runtime::Span;
+using graph::CallOperatorNode;
+using graph::ConstantNode;
+using graph::InputNode;
+using graph::NodeHandle;
+using graph::Operator;
+
+namespace {
+
+runtime::Result<xnn_datatype> map_xnn_datatype(const graph::TensorSpec& spec) {
+  if (spec.quant_params) {
+    // Quantized: only QInt8 distinguishes per-axis from other params.
+    switch (spec.dtype) {
+      case DType::QUInt8:
+        return xnn_datatype_quint8;
+      case DType::QInt8:
+        return std::holds_alternative<core::PerAxisQuantParams>(
+                   *spec.quant_params)
+            ? xnn_datatype_qcint8
+            : xnn_datatype_qint8;
+      case DType::QInt32:
+        return xnn_datatype_qint32;
+      default:
+        ET_LOG(Error, "Unsupported quantized dtype for XNNPACK delegation");
+        return runtime::Error::NotSupported;
+    }
+  }
+  switch (spec.dtype) {
+    case DType::Float32:
+      return xnn_datatype_fp32;
+    case DType::Float16:
+      return xnn_datatype_fp16;
+    default:
+      ET_LOG(Error, "Unsupported dtype for XNNPACK delegation");
+      return runtime::Error::NotSupported;
+  }
+}
+
+std::optional<xnn_binary_operator> map_binary_op(Operator op) {
+  switch (op) { // each listed operator has its own XNNPACK binary operator
+    case Operator::Add:
+      return xnn_binary_add;
+    case Operator::Subtract:
+      return xnn_binary_subtract;
+    case Operator::Multiply:
+      return xnn_binary_multiply;
+    case Operator::Divide:
+      return xnn_binary_divide;
+    case Operator::Maximum:
+      return xnn_binary_maximum;
+    case Operator::Minimum:
+      return xnn_binary_minimum;
+    case Operator::CopySign:
+      return xnn_binary_copysign;
+    case Operator::SquaredDifference:
+      return xnn_binary_squared_difference;
+    case Operator::Modulus:
+      return xnn_binary_modulus;
+    case Operator::Atan2:
+      return xnn_binary_atan2;
+    case Operator::Pow:
+      return xnn_binary_pow;
+    default:
+      return std::nullopt; // not handled as a binary operator
+  }
+}
+
+std::optional<xnn_unary_operator> map_unary_op(Operator op) {
+  switch (op) { // maps graph operators onto XNNPACK unary operators
+    case Operator::Abs:
+      return xnn_unary_abs;
+    case Operator::Negate:
+      return xnn_unary_negate;
+    case Operator::Clamp:
+      return xnn_unary_clamp;
+    case Operator::Ceiling:
+      return xnn_unary_ceiling;
+    case Operator::Floor:
+      return xnn_unary_floor;
+    case Operator::Round:
+      return xnn_unary_bankers_rounding; // Round uses bankers rounding
+    case Operator::Square:
+      return xnn_unary_square;
+    case Operator::SquareRoot:
+      return xnn_unary_square_root;
+    case Operator::ReciprocalSquareRoot:
+      return xnn_unary_reciprocal_square_root;
+    case Operator::Exp:
+      return xnn_unary_exp;
+    case Operator::Log:
+      return xnn_unary_log;
+    case Operator::Sigmoid:
+      return xnn_unary_sigmoid;
+    case Operator::Tanh:
+      return xnn_unary_tanh;
+    case Operator::ELU:
+      return xnn_unary_elu;
+    case Operator::GELU:
+      return xnn_unary_gelu;
+    case Operator::HardSwish:
+      return xnn_unary_hardswish;
+    case Operator::LeakyReLU:
+      return xnn_unary_leaky_relu;
+    case Operator::Sine:
+      return xnn_unary_sine;
+    case Operator::Cosine:
+      return xnn_unary_cosine;
+    case Operator::Sign:
+      return xnn_unary_sign;
+    case Operator::ReLU:
+      return xnn_unary_clamp; // shares the clamp operator with Clamp
+    default:
+      return std::nullopt; // not handled as a unary operator
+  }
+}
+
+template <typename T>
+T get_const(const graph::ConstantArg& arg) {
+  return std::get<T>(arg); // returns a copy of the T held by arg
+}
+
+template <typename T>
+std::vector<size_t> to_size_vec(const std::vector<T>& v) {
+  return std::vector<size_t>(v.begin(), v.end()); // per-element conversion
+}
+
+// Emits the XNNPACK subgraph node for `op`, writing to value `output_id`.
+// Input value IDs are looked up in `xnn_ids` by each argument's node handle.
+// Returns NotSupported for unknown operators, Internal on XNNPACK failure.
+runtime::Error define_node(
+    const CallOperatorNode& op,
+    uint32_t output_id,
+    Span<const uint32_t> xnn_ids,
+    xnn_subgraph_t subgraph) {
+  if (auto bin_op = map_binary_op(op.op)) {
+    xnn_binary_params params = {
+        .output_min = op.output_min,
+        .output_max = op.output_max,
+    };
+    auto status = xnn_define_binary(
+        subgraph,
+        *bin_op,
+        &params,
+        xnn_ids[op.args[0].node],
+        xnn_ids[op.args[1].node],
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::PReLU) {
+    auto status = xnn_define_prelu(
+        subgraph,
+        xnn_ids[op.args[0].node],
+        xnn_ids[op.args[1].node],
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (auto unary_op = map_unary_op(op.op)) {
+    xnn_unary_params params = {};
+    if (op.op == Operator::Clamp) {
+      params.clamp.min =
+          static_cast<float>(get_const<double>(op.constant_args[0]));
+      params.clamp.max =
+          static_cast<float>(get_const<double>(op.constant_args[1]));
+    } else if (op.op == Operator::ReLU) {
+      params.clamp.min = 0.0f;
+      params.clamp.max = INFINITY;
+    } else if (op.op == Operator::ELU) {
+      params.elu.alpha = op.constant_args.empty()
+          ? 1.0f
+          : static_cast<float>(get_const<double>(op.constant_args[0]));
+    } else if (op.op == Operator::LeakyReLU) {
+      params.leaky_relu.negative_slope =
+          static_cast<float>(get_const<double>(op.constant_args[0]));
+    }
+    auto status = xnn_define_unary(
+        subgraph,
+        *unary_op,
+        &params,
+        xnn_ids[op.args[0].node],
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::Linear) {
+    auto bias_id =
+        op.args[2].is_null() ? XNN_INVALID_VALUE_ID : xnn_ids[op.args[2].node];
+    auto status = xnn_define_fully_connected(
+        subgraph,
+        op.output_min,
+        op.output_max,
+        xnn_ids[op.args[0].node],
+        xnn_ids[op.args[1].node],
+        bias_id,
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::BatchMatrixMultiply) {
+    auto status = xnn_define_batch_matrix_multiply(
+        subgraph,
+        xnn_ids[op.args[0].node],
+        xnn_ids[op.args[1].node],
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::Conv2d) {
+    auto stride = get_const<std::vector<int64_t>>(op.constant_args[0]);
+    auto padding = get_const<std::vector<int64_t>>(op.constant_args[1]);
+    auto dilation = get_const<std::vector<int64_t>>(op.constant_args[2]);
+    auto groups = get_const<int64_t>(op.constant_args[3]);
+    auto kernel = get_const<std::vector<int64_t>>(op.constant_args[4]);
+    auto group_input_channels = get_const<int64_t>(op.constant_args[5]);
+    auto group_output_channels = get_const<int64_t>(op.constant_args[6]);
+    auto bias_id =
+        op.args[2].is_null() ? XNN_INVALID_VALUE_ID : xnn_ids[op.args[2].node];
+
+    auto status = xnn_define_convolution_2d(
+        subgraph,
+        padding[0],
+        padding[1],
+        padding[0],
+        padding[1],
+        kernel[0],
+        kernel[1],
+        stride[0],
+        stride[1],
+        dilation[0],
+        dilation[1],
+        groups,
+        group_input_channels,
+        group_output_channels,
+        op.output_min,
+        op.output_max,
+        xnn_ids[op.args[0].node],
+        xnn_ids[op.args[1].node],
+        bias_id,
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::DepthwiseConv2d) {
+    auto stride = get_const<std::vector<int64_t>>(op.constant_args[0]);
+    auto padding = get_const<std::vector<int64_t>>(op.constant_args[1]);
+    auto dilation = get_const<std::vector<int64_t>>(op.constant_args[2]);
+    auto groups = get_const<int64_t>(op.constant_args[3]);
+    auto kernel = get_const<std::vector<int64_t>>(op.constant_args[4]);
+    auto group_input_channels = get_const<int64_t>(op.constant_args[5]);
+    auto group_output_channels = get_const<int64_t>(op.constant_args[6]);
+    auto bias_id =
+        op.args[2].is_null() ? XNN_INVALID_VALUE_ID : xnn_ids[op.args[2].node];
+
+    auto status = xnn_define_depthwise_convolution_2d(
+        subgraph,
+        padding[0],
+        padding[1],
+        padding[0],
+        padding[1],
+        kernel[0],
+        kernel[1],
+        stride[0],
+        stride[1],
+        dilation[0],
+        dilation[1],
+        group_output_channels / group_input_channels, // depth_multiplier
+        groups, // input_channels
+        op.output_min,
+        op.output_max,
+        xnn_ids[op.args[0].node],
+        xnn_ids[op.args[1].node],
+        bias_id,
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::ConvTranspose2d) {
+    auto stride = get_const<std::vector<int64_t>>(op.constant_args[0]);
+    auto padding = get_const<std::vector<int64_t>>(op.constant_args[1]);
+    auto output_padding = get_const<std::vector<int64_t>>(op.constant_args[2]);
+    auto groups = get_const<int64_t>(op.constant_args[3]);
+    auto dilation = get_const<std::vector<int64_t>>(op.constant_args[4]);
+    auto kernel = get_const<std::vector<int64_t>>(op.constant_args[5]);
+    auto group_input_channels = get_const<int64_t>(op.constant_args[6]);
+    auto group_output_channels = get_const<int64_t>(op.constant_args[7]);
+    auto bias_id =
+        op.args[2].is_null() ? XNN_INVALID_VALUE_ID : xnn_ids[op.args[2].node];
+
+    auto status = xnn_define_deconvolution_2d(
+        subgraph,
+        padding[0],
+        padding[1],
+        padding[0],
+        padding[1],
+        output_padding[0],
+        output_padding[1],
+        kernel[0],
+        kernel[1],
+        stride[0],
+        stride[1],
+        dilation[0],
+        dilation[1],
+        groups,
+        group_input_channels,
+        group_output_channels,
+        op.output_min,
+        op.output_max,
+        xnn_ids[op.args[0].node],
+        xnn_ids[op.args[1].node],
+        bias_id,
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::AvgPool2d) {
+    auto kernel = get_const<std::vector<int64_t>>(op.constant_args[0]);
+    auto stride = get_const<std::vector<int64_t>>(op.constant_args[1]);
+    auto padding = get_const<std::vector<int64_t>>(op.constant_args[2]);
+    // [h, w] padding, as in Conv2d: (top, right, bottom, left) = (h, w, h, w).
+    auto status = xnn_define_average_pooling_2d(
+        subgraph,
+        padding[0],
+        padding[1],
+        padding[0],
+        padding[1],
+        kernel[0],
+        kernel[1],
+        stride[0],
+        stride[1],
+        -INFINITY,
+        INFINITY,
+        xnn_ids[op.args[0].node],
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::AdaptiveAvgPool2d) {
+    auto status = xnn_define_global_average_pooling_2d(
+        subgraph,
+        -INFINITY,
+        INFINITY,
+        xnn_ids[op.args[0].node],
+        output_id,
+        XNN_FLAG_KEEP_DIMS);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::MaxPool2d) {
+    auto kernel = get_const<std::vector<int64_t>>(op.constant_args[0]);
+    auto stride = get_const<std::vector<int64_t>>(op.constant_args[1]);
+    auto padding = get_const<std::vector<int64_t>>(op.constant_args[2]);
+    auto dilation = get_const<std::vector<int64_t>>(op.constant_args[3]);
+    // padding is [top, left, bottom, right]; xnn wants (top, right, bottom,
+    // left). ceil_mode pooling relies on the asymmetric bottom/right padding.
+    auto status = xnn_define_max_pooling_2d(
+        subgraph,
+        padding[0],
+        padding[3],
+        padding[2],
+        padding[1],
+        kernel[0],
+        kernel[1],
+        stride[0],
+        stride[1],
+        dilation[0],
+        dilation[1],
+        -INFINITY,
+        INFINITY,
+        xnn_ids[op.args[0].node],
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::Softmax) {
+    auto status = xnn_define_softmax(
+        subgraph, xnn_ids[op.args[0].node], output_id, /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::Mean || op.op == Operator::Sum) {
+    auto dims = get_const<std::vector<int64_t>>(op.constant_args[0]);
+    auto keepdim = get_const<int64_t>(op.constant_args[1]);
+    std::vector<size_t> reduction_axes(dims.begin(), dims.end());
+    uint32_t flags = keepdim ? XNN_FLAG_KEEP_DIMS : 0;
+    auto reduce_op =
+        (op.op == Operator::Mean) ? xnn_reduce_mean : xnn_reduce_sum;
+    auto status = xnn_define_static_reduce(
+        subgraph,
+        reduce_op,
+        reduction_axes.size(),
+        reduction_axes.data(),
+        xnn_ids[op.args[0].node],
+        output_id,
+        flags);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::Reshape || op.op == Operator::View) {
+    auto shape = get_const<std::vector<int64_t>>(op.constant_args[0]);
+    auto new_shape = to_size_vec(shape);
+    auto status = xnn_define_static_reshape(
+        subgraph,
+        new_shape.size(),
+        new_shape.data(),
+        xnn_ids[op.args[0].node],
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::Transpose || op.op == Operator::Permute) {
+    auto perm = get_const<std::vector<int64_t>>(op.constant_args[0]);
+    auto perm_sz = to_size_vec(perm);
+    auto status = xnn_define_static_transpose(
+        subgraph,
+        perm_sz.size(),
+        perm_sz.data(),
+        xnn_ids[op.args[0].node],
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::Slice) {
+    auto dim = get_const<int64_t>(op.constant_args[0]);
+    auto start = get_const<int64_t>(op.constant_args[1]);
+    auto end = get_const<int64_t>(op.constant_args[2]);
+    (void)end; // slice extents are taken from the output spec instead
+    auto& out_spec = std::get<graph::TensorSpec>(op.output_specs);
+    auto ndims = out_spec.sizes.size();
+    ET_CHECK_OR_RETURN_ERROR(
+        dim >= 0 && static_cast<size_t>(dim) < ndims,
+        Internal,
+        "Slice dim is out of range");
+    std::vector<size_t> offsets(ndims, 0);
+    std::vector<size_t> sizes(ndims, 0);
+    for (size_t i = 0; i < ndims; i++) {
+      sizes[i] = static_cast<size_t>(out_spec.sizes[i].offset);
+    }
+    offsets[dim] = static_cast<size_t>(start); // dim was range-checked above
+    auto status = xnn_define_static_slice(
+        subgraph,
+        ndims,
+        offsets.data(),
+        sizes.data(),
+        xnn_ids[op.args[0].node],
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::Cat) {
+    auto axis = get_const<int64_t>(op.constant_args[0]);
+    std::vector<uint32_t> input_ids;
+    for (auto& arg : op.args) {
+      input_ids.push_back(xnn_ids[arg.node]);
+    }
+    auto status = xnn_define_concatenate(
+        subgraph,
+        static_cast<size_t>(axis),
+        input_ids.size(),
+        input_ids.data(),
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::Unsqueeze) {
+    auto dim = get_const<int64_t>(op.constant_args[0]);
+    size_t axis = static_cast<size_t>(dim);
+    auto status = xnn_define_static_expand_dims(
+        subgraph, 1, &axis, xnn_ids[op.args[0].node], output_id, /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::Expand) {
+    auto shape = get_const<std::vector<int64_t>>(op.constant_args[0]);
+    auto new_shape = to_size_vec(shape);
+    auto status = xnn_define_static_broadcast(
+        subgraph,
+        new_shape.size(),
+        new_shape.data(),
+        xnn_ids[op.args[0].node],
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::Clone) {
+    auto status = xnn_define_copy(
+        subgraph, xnn_ids[op.args[0].node], output_id, /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::Pad) {
+    auto pad = get_const<std::vector<int64_t>>(op.constant_args[0]);
+    auto& out_spec = std::get<graph::TensorSpec>(op.output_specs);
+    auto ndims = out_spec.sizes.size();
+    std::vector<size_t> pre_paddings(ndims, 0);
+    std::vector<size_t> post_paddings(ndims, 0);
+    // The serialized pre/post paddings are already per-dimension in ascending
+    // (XNNPACK channels-last) order, matching xnn_define_static_constant_pad.
+    for (size_t i = 0; i < pad.size() / 2 && i < ndims; i++) {
+      pre_paddings[i] = static_cast<size_t>(pad[2 * i]);
+      post_paddings[i] = static_cast<size_t>(pad[2 * i + 1]);
+    }
+    float padding_value = op.constant_args.size() > 1
+        ? static_cast<float>(get_const<double>(op.constant_args[1]))
+        : 0.0f;
+    auto status = xnn_define_static_constant_pad(
+        subgraph,
+        pre_paddings.data(),
+        post_paddings.data(),
+        padding_value,
+        xnn_ids[op.args[0].node],
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::StaticResizeBilinear2D) {
+    auto new_height = get_const<int64_t>(op.constant_args[0]);
+    auto new_width = get_const<int64_t>(op.constant_args[1]);
+    auto flags = static_cast<uint32_t>(get_const<int64_t>(op.constant_args[2]));
+    auto status = xnn_define_static_resize_bilinear_2d(
+        subgraph,
+        static_cast<size_t>(new_height),
+        static_cast<size_t>(new_width),
+        xnn_ids[op.args[0].node],
+        output_id,
+        flags);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  if (op.op == Operator::Quantize || op.op == Operator::Dequantize) {
+    auto status = xnn_define_unary(
+        subgraph,
+        xnn_unary_convert,
+        nullptr,
+        xnn_ids[op.args[0].node],
+        output_id,
+        /*flags=*/0);
+    ET_CHECK_OR_RETURN_ERROR(
+        status == xnn_status_success,
+        Internal,
+        "Failed to define XNNPACK node");
+    return runtime::Error::Ok;
+  }
+
+  ET_LOG(Error, "Unsupported operator for XNNPACK delegation");
+  return runtime::Error::NotSupported;
+}
+// Defines one XNNPACK value in `subgraph` for `spec` and returns its id.
+runtime::Result<uint32_t> define_tensor(
+    const graph::TensorSpec& spec,
+    xnn_subgraph_t subgraph,
+    bool is_input = false, // sets XNN_VALUE_FLAG_EXTERNAL_INPUT
+    bool is_output = false, // sets XNN_VALUE_FLAG_EXTERNAL_OUTPUT
+    uint32_t external_id = XNN_INVALID_VALUE_ID,
+    const core::Tensor* constant_tensor = nullptr) { // static data, if any
+  std::vector<size_t> dims(spec.sizes.size());
+  for (auto i = 0u; i < spec.sizes.size(); i++) {
+    auto& s = spec.sizes[i];
+    dims[i] = s.is_constant() ? static_cast<size_t>(s.offset) : 1;
+  }
+  // Non-constant sizes were given extent 1 above (TODO confirm this is safe).
+  uint32_t flags = 0;
+  if (is_input) {
+    flags |= XNN_VALUE_FLAG_EXTERNAL_INPUT;
+  }
+  if (is_output) {
+    flags |= XNN_VALUE_FLAG_EXTERNAL_OUTPUT;
+  }
+  // Only constants carry data; every other value passes a null pointer.
+  const void* data = constant_tensor ? constant_tensor->storage.data : nullptr;
+  ET_UNWRAP(xnn_dtype, map_xnn_datatype(spec));
+  // `id` is filled in by whichever define call runs below.
+  uint32_t id = 0;
+  xnn_status status;
+  // The define call depends on the spec's quantization parameters.
+  if (!spec.quant_params) { // not quantized
+    status = xnn_define_tensor_value(
+        subgraph,
+        xnn_dtype,
+        spec.sizes.size(),
+        dims.data(),
+        data,
+        external_id,
+        flags,
+        &id);
+  } else if ( // per-tensor: a single zero point and scale
+      auto* pt = std::get_if<core::PerTensorQuantParams>(&*spec.quant_params)) {
+    status = xnn_define_quantized_tensor_value(
+        subgraph,
+        xnn_dtype,
+        pt->zero_point,
+        pt->scale,
+        spec.sizes.size(),
+        dims.data(),
+        data,
+        external_id,
+        flags,
+        &id);
+  } else if ( // per-axis: needs a constant tensor that carries the scales
+      auto* pa = std::get_if<core::PerAxisQuantParams>(&*spec.quant_params)) {
+    ET_CHECK_OR_RETURN_ERROR(
+        constant_tensor != nullptr && !constant_tensor->aux_storage.empty(),
+        NotSupported,
+        "Per-axis quantized tensor is missing scale data");
+    auto* scales = // scale array is read from the first aux_storage entry
+        static_cast<const float*>(constant_tensor->aux_storage[0].data);
+    int32_t zero_point = (xnn_dtype == xnn_datatype_qcint4) ? 8 : 0;
+    status = xnn_define_channelwise_quantized_tensor_value_v2(
+        subgraph,
+        xnn_dtype,
+        zero_point,
+        scales,
+        spec.sizes.size(),
+        static_cast<size_t>(pa->axis),
+        dims.data(),
+        data,
+        external_id,
+        flags,
+        &id);
+  } else {
+    ET_LOG(Error, "Unsupported quantization scheme for XNNPACK delegation");
+    return runtime::Error::NotSupported;
+  }
+  // Every branch that reaches this point has assigned `status`.
+  ET_CHECK_OR_RETURN_ERROR(
+      status == xnn_status_success,
+      Internal,
+      "Failed to define XNNPACK tensor value: 0x%x",
+      (unsigned int)status);
+  return id;
+}
+
+} // namespace
+// Translates `graph` into an XNNPACK subgraph: values first, then nodes.
+runtime::Result<XnnSubgraph> build_xnn_subgraph(const graph::Graph& graph) {
+  auto num_external_values =
+      static_cast<uint32_t>(graph.input_specs.size() + graph.outputs.size());
+  // External ids: inputs use [0, #inputs); outputs use the ids after that.
+  xnn_subgraph_t raw_subgraph = nullptr;
+  auto status = xnn_create_subgraph(
+      num_external_values,
+      /*flags=*/0,
+      &raw_subgraph);
+
+  ET_CHECK_OR_RETURN_ERROR(
+      status == xnn_status_success,
+      Internal,
+      "Failed to create XNNPACK subgraph");
+  // Wrap the raw handle so the XnnSubgraph deleter releases it on any return.
+  auto subgraph = XnnSubgraph(raw_subgraph);
+  // Step 1: one external input value per graph input spec.
+  std::vector<uint32_t> xnn_input_ids(graph.input_specs.size());
+  for (auto i = 0u; i < graph.input_specs.size(); i++) {
+    ET_UNWRAP(
+        input_id,
+        define_tensor(
+            graph.input_specs[i],
+            subgraph.get(),
+            /*is_input=*/true,
+            /*is_output=*/false,
+            /*external_id=*/i));
+    xnn_input_ids[i] = input_id;
+  }
+  // Step 2: one external output value per graph output.
+  std::vector<uint32_t> xnn_output_ids(graph.outputs.size());
+  for (auto i = 0u; i < graph.outputs.size(); i++) {
+    auto external_id = static_cast<uint32_t>(i + graph.input_specs.size());
+    ET_UNWRAP(
+        output_id,
+        define_tensor(
+            graph.get_tensor_spec(graph.outputs[i]),
+            subgraph.get(),
+            /*is_input=*/false,
+            /*is_output=*/true,
+            external_id));
+    xnn_output_ids[i] = output_id;
+  }
+  // Step 3: map each graph node handle to the XNNPACK value id it uses.
+  std::vector<uint32_t> xnn_ids(graph.nodes.size(), XNN_INVALID_VALUE_ID);
+
+  for (NodeHandle n = 0; n < graph.nodes.size(); n++) {
+    if (auto* inp = std::get_if<InputNode>(&graph.nodes[n].value)) {
+      xnn_ids[n] = xnn_input_ids[inp->input];
+    }
+  }
+  // Constant nodes become values backed by the node's tensor data.
+  for (NodeHandle n = 0; n < graph.nodes.size(); n++) {
+    if (auto* cn = std::get_if<ConstantNode>(&graph.nodes[n].value)) {
+      auto spec =
+          std::get<graph::TensorSpec>(graph.get_output_spec_for_node(n));
+      ET_UNWRAP(
+          const_id,
+          define_tensor(
+              spec,
+              subgraph.get(),
+              /*is_input=*/false,
+              /*is_output=*/false,
+              XNN_INVALID_VALUE_ID,
+              cn->tensor.get()));
+      xnn_ids[n] = const_id;
+    }
+  }
+  // Nodes that feed a graph output use the external output value instead.
+  for (auto i = 0u; i < graph.outputs.size(); i++) {
+    xnn_ids[graph.outputs[i].node] = xnn_output_ids[i];
+  }
+  // NOTE(review): that also replaces ids of input/constant output nodes.
+  for (NodeHandle n = 0; n < graph.nodes.size(); n++) {
+    auto* op = std::get_if<CallOperatorNode>(&graph.nodes[n].value);
+    if (!op)
+      continue;
+    // Operators not bound to an output above get a fresh internal value.
+    if (xnn_ids[n] == XNN_INVALID_VALUE_ID) {
+      auto spec = std::get<graph::TensorSpec>(op->output_specs);
+      ET_UNWRAP(op_output_id, define_tensor(spec, subgraph.get()));
+      xnn_ids[n] = op_output_id;
+    }
+  }
+  // Step 4: define the operator nodes now that the value ids are assigned.
+  for (NodeHandle n = 0; n < graph.nodes.size(); n++) {
+    auto* op = std::get_if<CallOperatorNode>(&graph.nodes[n].value);
+    if (!op)
+      continue;
+
+    ET_CHECK_OK_OR_RETURN_ERROR(define_node(
+        *op, xnn_ids[n], {xnn_ids.data(), xnn_ids.size()}, subgraph.get()));
+  }
+
+  return subgraph;
+}
+// Builds the XNNPACK subgraph for `graph` and creates a runtime from it.
+runtime::Result<XnnRuntime> compile_xnn_subgraph(
+    const graph::Graph& graph,
+    xnn_workspace_t workspace) {
+  ET_UNWRAP(subgraph, build_xnn_subgraph(graph));
+  // No weights cache is used; threads come from the extension threadpool.
+  xnn_runtime_t runtime = nullptr;
+  auto status = xnn_create_runtime_v4(
+      subgraph.get(),
+      /*weights_cache=*/nullptr,
+      workspace,
+      ::executorch::extension::threadpool::get_pthreadpool(),
+      /*flags=*/0,
+      &runtime);
+
+  ET_CHECK_OR_RETURN_ERROR(
+      status == xnn_status_success,
+      Internal,
+      "Failed to create XNNPACK runtime");
+  // `subgraph` is local: assumes the runtime no longer needs it (TODO confirm).
+  return XnnRuntime(runtime);
+}
+
+} // namespace executorch::backends::xnnpack::plan
diff --git a/backends/xnnpack/runtime/plan/xnn_subgraph.h b/backends/xnnpack/runtime/plan/xnn_subgraph.h
new file mode 100644
index 00000000000..53e48108fb9
--- /dev/null
+++ b/backends/xnnpack/runtime/plan/xnn_subgraph.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <executorch/runtime/core/result.h>
+
+#include <memory>
+
+#include <xnnpack.h>
+
+namespace executorch::backends::xnnpack::graph {
+struct Graph; // forward declaration; this header only uses it by reference
+}
+
+namespace executorch::backends::xnnpack::plan {
+// unique_ptr deleter: frees a subgraph handle with xnn_delete_subgraph.
+struct XnnSubgraphDeleter {
+  void operator()(xnn_subgraph_t subgraph) const {
+    xnn_delete_subgraph(subgraph);
+  }
+};
+// unique_ptr deleter: frees a runtime handle with xnn_delete_runtime.
+struct XnnRuntimeDeleter {
+  void operator()(xnn_runtime_t runtime) const {
+    xnn_delete_runtime(runtime);
+  }
+};
+// unique_ptr deleter: hands a workspace back with xnn_release_workspace.
+struct XnnWorkspaceDeleter {
+  void operator()(xnn_workspace_t workspace) const {
+    xnn_release_workspace(workspace);
+  }
+};
+// Owning handle types: each pairs an XNNPACK object with its deleter above.
+using XnnSubgraph = std::unique_ptr<xnn_subgraph, XnnSubgraphDeleter>;
+using XnnRuntime = std::unique_ptr<xnn_runtime, XnnRuntimeDeleter>;
+using XnnWorkspace = std::unique_ptr<xnn_workspace, XnnWorkspaceDeleter>;
+
+/*
+ * Builds an XNNPACK runtime for `graph` using `workspace`; errors via Result.
+ */
+runtime::Result<XnnRuntime> compile_xnn_subgraph(
+    const graph::Graph& graph,
+    xnn_workspace_t workspace);
+
+} // namespace executorch::backends::xnnpack::plan
diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt
index 0e2be1aacc4..bcf84ef6593 100644
--- a/backends/xnnpack/test/CMakeLists.txt
+++ b/backends/xnnpack/test/CMakeLists.txt
@@ -45,7 +45,7 @@ target_include_directories(
 set(_graph_runtime_test_srcs
     runtime/test_quant_params.cpp runtime/test_graph_builder.cpp
     runtime/test_shape_env.cpp runtime/test_arena.cpp
-    runtime/test_partition.cpp
+    runtime/test_partition.cpp runtime/test_schedule.cpp
 )
 
 et_cxx_test(
diff --git a/backends/xnnpack/test/runtime/test_schedule.cpp b/backends/xnnpack/test/runtime/test_schedule.cpp
new file mode 100644
index 00000000000..93ffb4b33c5
--- /dev/null
+++ b/backends/xnnpack/test/runtime/test_schedule.cpp
@@ -0,0 +1,162 @@
+#include <gtest/gtest.h>
+
+#include <executorch/backends/xnnpack/runtime/graph/graph_builder.h>
+#include <executorch/backends/xnnpack/runtime/plan/partition.h>
+#include <executorch/backends/xnnpack/runtime/plan/schedule.h>
+
+#include <algorithm>
+#include <unordered_set>
+
+using namespace executorch::backends::xnnpack::core;
+using namespace executorch::backends::xnnpack::graph;
+using namespace executorch::backends::xnnpack::plan;
+
+// Helper: check that `order` lists every arg before the node that uses it.
+static void assert_topological(
+    const Graph& graph,
+    const std::vector<NodeHandle>& order) {
+  ASSERT_EQ(order.size(), graph.nodes.size());
+  // A failed ASSERT_EQ only leaves this helper; the calling test continues.
+  // Build position map: node -> index in order.
+  std::vector<uint32_t> pos(graph.nodes.size());
+  for (uint32_t i = 0; i < order.size(); i++) {
+    pos[order[i]] = i;
+  }
+
+  // Every arg source must appear before the node that uses it.
+  for (NodeHandle n = 0; n < graph.nodes.size(); n++) {
+    for (auto& arg : graph.nodes[n].get_args()) {
+      EXPECT_LT(pos[arg.node], pos[n])
+          << "Node " << arg.node << " (arg) should appear before node " << n;
+    }
+  }
+}
+
+// Helper: check that `order` has one entry per graph node and no repeats.
+static void assert_all_nodes_present(
+    const Graph& graph,
+    const std::vector<NodeHandle>& order) {
+  ASSERT_EQ(order.size(), graph.nodes.size());
+  // The set drops duplicates, so a smaller size means a repeated handle.
+  std::unordered_set<NodeHandle> seen(order.begin(), order.end());
+  EXPECT_EQ(seen.size(), graph.nodes.size());
+}
+// Shared fixture: a Float32 tensor spec with constant sizes {1, 10}.
+static TensorSpec make_spec() {
+  return TensorSpec{
+      .dtype = DType::Float32,
+      .sizes = {DimSizeSpec::constant(1), DimSizeSpec::constant(10)}};
+}
+
+TEST(TestSchedule, linear_chain) {
+  // input -> op1 -> op2 -> output (op2 also reads input directly)
+  auto builder = GraphBuilder();
+  auto spec = make_spec();
+
+  auto input = builder.createInput(spec);
+  auto op1 = builder.createOperator(Operator::Add, spec, input, input);
+  auto op2 = builder.createOperator(Operator::Add, spec, op1, input);
+  builder.createOutput(op2);
+
+  auto graph = builder.build();
+  graph.update_users(); // presumably refreshes user lists read by schedule()
+
+  auto order = schedule(graph);
+
+  assert_all_nodes_present(graph, order);
+  assert_topological(graph, order);
+  // The chain admits a single valid order, so the exact slots are pinned.
+  // Input must be first, then op1, then op2.
+  EXPECT_EQ(order[0], input.node);
+  EXPECT_EQ(order[1], op1.node);
+  EXPECT_EQ(order[2], op2.node);
+}
+
+TEST(TestSchedule, diamond) {
+  // input_a, input_b -> add1; input_b -> add2; add1, add2 -> add3
+  auto builder = GraphBuilder();
+  auto spec = make_spec();
+
+  auto input_a = builder.createInput(spec);
+  auto input_b = builder.createInput(spec);
+  auto add1 = builder.createOperator(Operator::Add, spec, input_a, input_b);
+  auto add2 = builder.createOperator(Operator::Add, spec, input_b, input_b);
+  auto add3 = builder.createOperator(Operator::Add, spec, add1, add2);
+  builder.createOutput(add3);
+
+  auto graph = builder.build();
+  graph.update_users();
+  auto order = schedule(graph);
+
+  assert_all_nodes_present(graph, order);
+  assert_topological(graph, order);
+
+  // Both branches (add1, add2) must be scheduled before the join (add3).
+  std::vector<uint32_t> pos(graph.nodes.size());
+  for (uint32_t i = 0; i < order.size(); i++) {
+    pos[order[i]] = i;
+  }
+  EXPECT_LT(pos[input_b.node], pos[add2.node]);
+  EXPECT_LT(pos[add1.node], pos[add3.node]);
+  EXPECT_LT(pos[add2.node], pos[add3.node]);
+}
+// Scheduling must still be valid after partition_xnn_subgraphs has run.
+TEST(TestSchedule, post_fusion) {
+  auto builder = GraphBuilder();
+  auto spec = make_spec();
+  // Chain: add1(a, b) -> add2(add1, b) -> add3(add2, b) -> output.
+  auto input_a = builder.createInput(spec);
+  auto input_b = builder.createInput(spec);
+  auto add1 = builder.createOperator(Operator::Add, spec, input_a, input_b);
+  auto add2 = builder.createOperator(Operator::Add, spec, add1, input_b);
+  auto add3 = builder.createOperator(Operator::Add, spec, add2, input_b);
+  builder.createOutput(add3);
+
+  auto graph = builder.build();
+
+  // Mark all ops for XNNPACK so they get fused.
+  graph.nodes[add1.node].flags |= NodeFlags::UseXnnpack;
+  graph.nodes[add2.node].flags |= NodeFlags::UseXnnpack;
+  graph.nodes[add3.node].flags |= NodeFlags::UseXnnpack;
+  // NOTE(review): unlike the other tests, update_users() is not called here.
+  ASSERT_EQ(partition_xnn_subgraphs(graph), executorch::runtime::Error::Ok);
+
+  auto order = schedule(graph);
+
+  assert_all_nodes_present(graph, order);
+  assert_topological(graph, order);
+
+  // Find the CallSubgraphNode and verify it comes after inputs.
+  std::vector<uint32_t> pos(graph.nodes.size());
+  for (uint32_t i = 0; i < order.size(); i++) {
+    pos[order[i]] = i;
+  }
+  // NOTE(review): passes vacuously if no CallSubgraphNode exists - confirm.
+  for (NodeHandle n = 0; n < graph.nodes.size(); n++) {
+    if (std::holds_alternative<CallSubgraphNode>(graph.nodes[n].value)) {
+      auto& fused = std::get<CallSubgraphNode>(graph.nodes[n].value);
+      for (auto& arg : fused.args) {
+        EXPECT_LT(pos[arg.node], pos[n]);
+      }
+    }
+  }
+}
+
+TEST(TestSchedule, multiple_inputs_no_ops) {
+  // Graph with only input nodes (degenerate case).
+  auto builder = GraphBuilder();
+  auto spec = make_spec();
+
+  auto input_a = builder.createInput(spec);
+  auto input_b = builder.createInput(spec);
+  builder.createOutput(input_a);
+  builder.createOutput(input_b);
+
+  auto graph = builder.build();
+  graph.update_users();
+
+  auto order = schedule(graph);
+  // Two inputs and no operators: the schedule should hold exactly two nodes.
+  assert_all_nodes_present(graph, order);
+  EXPECT_EQ(order.size(), 2u);
+}