Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
190 changes: 181 additions & 9 deletions backends/xnnpack/runtime/XNNPACKBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,25 @@
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/xnnpack/runtime/FlatbufferGraphBuilder.h>
#include <executorch/backends/xnnpack/runtime/XNNCompiler.h>
#include <executorch/backends/xnnpack/runtime/XNNPACKBackend.h>
#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
#include <executorch/backends/xnnpack/runtime/XNNWorkspace.h>
#include <executorch/backends/xnnpack/runtime/XnnpackBackendOptions.h>
#include <executorch/backends/xnnpack/runtime/executor/executor.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <executorch/runtime/executor/pte_data_map.h>
#include <executorch/runtime/platform/log.h>
#include <chrono>

#include <cstring>
#include <memory>
#include <mutex>
#include <vector>

#pragma clang diagnostic ignored "-Wglobal-constructors"

Expand All @@ -41,6 +48,16 @@ using executorch::runtime::FreeableBuffer;
using executorch::runtime::Result;
using executorch::runtime::Span;

// State that init() returns as the delegate handle and that execute() and
// destroy() cast the handle back to. Exactly one of the two executor pointers
// is populated, selected by `is_graph_runtime`.
struct XnnpackDelegateHandle {
  // True when init() took the graph-runtime path; execute() and destroy()
  // branch on this flag.
  bool is_graph_runtime{false};

  // Legacy path only: XNNExecutor whose storage comes from the runtime
  // allocator. destroy() invokes its destructor explicitly.
  xnnpack::delegate::XNNExecutor* legacy_executor{nullptr};

  // Graph path only: Executor created with `new` in init() and released with
  // `delete` in destroy().
  xnnpack::executor::Executor* graph_executor{nullptr};

  // Graph path only: external ids taken from the graph builder result. Each id
  // is used as an index into the delegate's argument span.
  std::vector<uint32_t> input_external_ids;
  std::vector<uint32_t> output_external_ids;
};

class XnnpackBackend final
: public ::executorch::ET_RUNTIME_NAMESPACE::BackendInterface {
public:
Expand All @@ -66,26 +83,67 @@ class XnnpackBackend final
BackendInitContext& context,
FreeableBuffer* processed,
ArrayRef<CompileSpec> compile_specs) const override {
auto* handle = context.get_runtime_allocator()
->allocateInstance<XnnpackDelegateHandle>();
if (handle == nullptr) {
return Error::MemoryAllocationFailed;
}
new (handle) XnnpackDelegateHandle();

bool use_graph_runtime = options_.resolve_graph_runtime(context);
handle->is_graph_runtime = use_graph_runtime;

if (use_graph_runtime) {
auto t0 = std::chrono::steady_clock::now();
const NamedDataMap* named_data_map = context.get_named_data_map();
ET_UNWRAP(
result,
xnnpack::FlatbufferGraphBuilder::build(
processed->data(), processed->size(), named_data_map));
processed->Free();
auto t1 = std::chrono::steady_clock::now();

ET_UNWRAP(
built_executor, xnnpack::executor::Executor::build(result.graph));
auto* executor =
new xnnpack::executor::Executor(std::move(built_executor));
auto t2 = std::chrono::steady_clock::now();
handle->graph_executor = executor;
handle->input_external_ids = std::move(result.input_external_ids);
handle->output_external_ids = std::move(result.output_external_ids);
ET_LOG(
Info,
"Graph runtime init: deserialize=%lldms executor_build=%lldms",
(long long)std::chrono::duration_cast<std::chrono::milliseconds>(
t1 - t0)
.count(),
(long long)std::chrono::duration_cast<std::chrono::milliseconds>(
t2 - t1)
.count());
return handle;
}

auto executor = context.get_runtime_allocator()
->allocateInstance<xnnpack::delegate::XNNExecutor>();
if (executor == nullptr) {
handle->~XnnpackDelegateHandle();
return Error::MemoryAllocationFailed;
}

const NamedDataMap* named_data_map = context.get_named_data_map();
// thread safe. This can happen when multiple threads call init() on
// the same backend instance.

auto program_id =
reinterpret_cast<uintptr_t>(context.get_runtime_allocator());
auto sharing_mode_result = options_.resolve_sharing_mode(context);
if (!sharing_mode_result.ok()) {
handle->~XnnpackDelegateHandle();
return sharing_mode_result.error();
}
auto workspace_result =
options_.workspace_manager().get_or_create_workspace(
program_id, sharing_mode_result.get());
if (!workspace_result.ok()) {
handle->~XnnpackDelegateHandle();
return workspace_result.error();
}
auto workspace = workspace_result.get();
Expand Down Expand Up @@ -128,23 +186,27 @@ class XnnpackBackend final
processed->Free();

if (err != Error::Ok) {
// destroy() won't be called on this handle, so we need to clean it up
// now.
executor->~XNNExecutor();

handle->~XnnpackDelegateHandle();
ET_LOG(
Error, "XNNCompiler::compileModel failed: 0x%x", (unsigned int)err);
return err;
}

return executor;
handle->legacy_executor = executor;
return handle;
}

Error execute(
BackendExecutionContext& context,
DelegateHandle* handle,
Span<EValue*> args) const override {
auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);
auto* delegate = static_cast<XnnpackDelegateHandle*>(handle);

if (delegate->is_graph_runtime) {
return execute_graph(delegate, args);
}

auto executor = delegate->legacy_executor;

auto workspace = executor->get_workspace();

Expand Down Expand Up @@ -176,7 +238,15 @@ class XnnpackBackend final

void destroy(DelegateHandle* handle) const override {
if (handle != nullptr) {
auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);
auto* delegate = static_cast<XnnpackDelegateHandle*>(handle);

if (delegate->is_graph_runtime) {
delete delegate->graph_executor;
delegate->~XnnpackDelegateHandle();
return;
}

auto executor = delegate->legacy_executor;
auto workspace = executor->get_workspace();

const std::lock_guard<std::mutex> lock_weights_cache(
Expand All @@ -200,6 +270,7 @@ class XnnpackBackend final
// XNNExecutor is not trivially destructible. Since this was constructed
// manually in init(), we must destroy it manually here.
executor->~XNNExecutor();
delegate->~XnnpackDelegateHandle();
}
}

Expand Down Expand Up @@ -228,6 +299,107 @@ class XnnpackBackend final
}

private:
/**
 * Runs one inference through the graph-runtime executor.
 *
 * Each input EValue tensor is wrapped as a non-owning xnnpack::core::Tensor
 * (shape expressed in dim-order-permuted layout), the executor is run, and
 * every output EValue tensor is then resized to the computed shape. Output
 * bytes are copied into the EValue tensor only when the executor's output
 * buffer is not already that tensor's data pointer.
 *
 * @param delegate Handle whose `graph_executor`, `input_external_ids` and
 *     `output_external_ids` are used. The external ids index into `args`.
 * @param args Delegate arguments. Every entry referenced by an external id
 *     must be a non-null tensor EValue.
 * @return Error::Ok on success. InvalidProgram if an external id is out of
 *     range, InvalidArgument if a referenced argument is not a tensor,
 *     Internal if the executor's output count or byte size does not fit the
 *     destination, or whatever error the get_dim_order, run, or resize_tensor
 *     calls report.
 */
Error execute_graph(XnnpackDelegateHandle* delegate, Span<EValue*> args)
    const {
  auto* executor = delegate->graph_executor;

  // Build input tensors from EValue args.
  std::vector<xnnpack::core::Tensor> inputs;
  inputs.reserve(delegate->input_external_ids.size());
  for (uint32_t ext_id : delegate->input_external_ids) {
    ET_CHECK_OR_RETURN_ERROR(
        ext_id < args.size(),
        InvalidProgram,
        "Input external id %u out of range (%zu args)",
        ext_id,
        args.size());
    // toTensor() on a non-tensor EValue is a hard failure, so validate first
    // and report a recoverable error instead.
    EValue* input_arg = args[ext_id];
    ET_CHECK_OR_RETURN_ERROR(
        input_arg != nullptr && input_arg->isTensor(),
        InvalidArgument,
        "Input external id %u does not refer to a tensor argument",
        ext_id);
    auto& et_tensor = input_arg->toTensor();
    xnnpack::core::Tensor t;
    // The external-value dtype is taken from the serialized graph spec; this
    // field is informational for the input wrapper. Defaulting to Float32
    // matches the supported (float) input set.
    t.dtype = xnnpack::core::DType::Float32;
    if (et_tensor.dim() == 0) {
      // Zero-dimensional tensors are presented to the executor as shape {1}.
      t.sizes = {1};
    } else {
      // Pass dims in physical (dim-order-permuted) layout so a channels-last
      // input matches the NHWC layout XNNPACK expects, mirroring the legacy
      // XNNExecutor path.
      size_t num_dims = et_tensor.dim();
      executorch::aten::DimOrderType
          dim_order[::executorch::runtime::kTensorDimensionLimit];
      ET_CHECK_OK_OR_RETURN_ERROR(ET_RUNTIME_NAMESPACE::get_dim_order(
          et_tensor, dim_order, num_dims));
      t.sizes.resize(num_dims);
      for (size_t d = 0; d < num_dims; d++) {
        t.sizes[d] = static_cast<uint64_t>(
            et_tensor.size(static_cast<int>(dim_order[d])));
      }
    }
    // The wrapper borrows the EValue tensor's buffer; it does not own it.
    t.storage.data = et_tensor.mutable_data_ptr();
    t.storage.size_in_bytes = et_tensor.nbytes();
    t.storage.owner = xnnpack::core::StorageOwner::External;
    inputs.push_back(std::move(t));
  }

  ET_UNWRAP(outputs, executor->run({inputs.data(), inputs.size()}));

  ET_CHECK_OR_RETURN_ERROR(
      outputs.size() == delegate->output_external_ids.size(),
      Internal,
      "Executor produced %zu outputs, expected %zu",
      outputs.size(),
      delegate->output_external_ids.size());

  // Copy output data back to EValue tensors.
  for (size_t i = 0; i < delegate->output_external_ids.size(); i++) {
    uint32_t ext_id = delegate->output_external_ids[i];
    ET_CHECK_OR_RETURN_ERROR(
        ext_id < args.size(),
        InvalidProgram,
        "Output external id %u out of range (%zu args)",
        ext_id,
        args.size());
    // Same validation as for inputs: never call toTensor() on a non-tensor.
    EValue* output_arg = args[ext_id];
    ET_CHECK_OR_RETURN_ERROR(
        output_arg != nullptr && output_arg->isTensor(),
        InvalidArgument,
        "Output external id %u does not refer to a tensor argument",
        ext_id);
    auto& et_tensor = output_arg->toTensor();
    auto& out_tensor = outputs[i];

    // Resize the output EValue tensor to match the computed shape. The
    // executor reports dims in XNNPACK physical (channels-last) order;
    // scatter them back to the tensor's logical order via its dim_order,
    // mirroring the legacy XNNExecutor::resize_outputs path.
    size_t num_dims = out_tensor.sizes.size();
    std::vector<executorch::aten::SizesType> new_sizes_vec(num_dims);
    executorch::aten::DimOrderType
        out_dim_order[::executorch::runtime::kTensorDimensionLimit];
    ET_CHECK_OK_OR_RETURN_ERROR(ET_RUNTIME_NAMESPACE::get_dim_order(
        et_tensor, out_dim_order, num_dims));
    for (size_t d = 0; d < num_dims; d++) {
      new_sizes_vec[out_dim_order[d]] =
          static_cast<executorch::aten::SizesType>(out_tensor.sizes[d]);
    }
    executorch::aten::ArrayRef<executorch::aten::SizesType> new_sizes(
        new_sizes_vec.data(), new_sizes_vec.size());
    ET_CHECK_OK_OR_RETURN_ERROR(
        executorch::runtime::resize_tensor(et_tensor, new_sizes));

    // Skip the copy when the executor already wrote into the EValue buffer.
    void* dst = et_tensor.mutable_data_ptr();
    if (out_tensor.storage.data != dst) {
      ET_CHECK_OR_RETURN_ERROR(
          out_tensor.storage.size_in_bytes <= et_tensor.nbytes(),
          Internal,
          "Output %zu is %zu bytes, exceeds tensor capacity %zu",
          i,
          out_tensor.storage.size_in_bytes,
          et_tensor.nbytes());
      // memcpy requires valid pointers even for a zero-byte copy, and an
      // empty output may carry a null buffer, so only copy when there is
      // data to move.
      if (out_tensor.storage.size_in_bytes > 0) {
        std::memcpy(
            dst, out_tensor.storage.data, out_tensor.storage.size_in_bytes);
      }
    }
  }

  return Error::Ok;
}

mutable xnnpack::XnnpackBackendOptions options_;

// Weights cache is global to all delegate instances.
Expand Down
4 changes: 4 additions & 0 deletions backends/xnnpack/runtime/XNNPACKBackend.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ const char weight_cache_option_key[] = "weight_cache_enabled";
// @lint-ignore CLANGTIDY facebook-hte-CArray
const char packed_cache_path_option_key[] = "packed_cache_path";

/// The key for the graph runtime option. When enabled, the new graph-based
/// runtime is used instead of the legacy XNNCompiler/XNNExecutor path.
const char use_graph_runtime_option_key[] = "use_graph_runtime";

/// Workspace sharing mode. This is a backend option that can be set via the
/// set_option API to control memory sharing between CALL_DELEGATE instances.
/// This is useful for reducing memory consumption.
Expand Down
16 changes: 16 additions & 0 deletions backends/xnnpack/runtime/XnnpackBackendOptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ Error XnnpackBackendOptions::get_option(BackendOption& option) const {
std::min(packed_cache_path_.size(), runtime::kMaxOptionValueLength - 1);
memcpy(arr.data(), packed_cache_path_.data(), len);
option.value = arr;
} else if (strcmp(option.key, use_graph_runtime_option_key) == 0) {
option.value = use_graph_runtime_.load();
}
return Error::Ok;
}
Expand Down Expand Up @@ -84,6 +86,14 @@ Error XnnpackBackendOptions::set_option(const BackendOption& option) {
Debug,
"Setting XNNPACK packed cache path to %s.",
packed_cache_path_.c_str());
} else if (strcmp(option.key, use_graph_runtime_option_key) == 0) {
auto* val = std::get_if<bool>(&option.value);
if (!val) {
ET_LOG(Error, "XNNPACK use_graph_runtime must be a bool.");
return Error::InvalidArgument;
}
ET_LOG(Debug, "Setting XNNPACK use_graph_runtime to %d.", *val);
use_graph_runtime_.store(*val);
}
return Error::Ok;
}
Expand Down Expand Up @@ -114,6 +124,12 @@ XnnpackBackendOptions::resolve_sharing_mode(
return static_cast<WorkspaceSharingMode>(raw_mode);
}

// Decides whether the graph runtime is used for the given init context.
// Looks up `use_graph_runtime_option_key` through resolve_option, handing it
// the currently stored use_graph_runtime_ value (presumably the fallback when
// the context carries no override; confirm against resolve_option).
bool XnnpackBackendOptions::resolve_graph_runtime(
    const ET_RUNTIME_NAMESPACE::BackendInitContext& context) const {
  const bool stored_value = use_graph_runtime_.load();
  return resolve_option<bool>(
      context, use_graph_runtime_option_key, stored_value);
}

WorkspaceSharingMode XnnpackBackendOptions::get_sharing_mode() const {
return sharing_mode_.load();
}
Expand Down
4 changes: 4 additions & 0 deletions backends/xnnpack/runtime/XnnpackBackendOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ class XnnpackBackendOptions {
runtime::Result<WorkspaceSharingMode> resolve_sharing_mode(
const ET_RUNTIME_NAMESPACE::BackendInitContext& context) const;

bool resolve_graph_runtime(
const ET_RUNTIME_NAMESPACE::BackendInitContext& context) const;

WorkspaceSharingMode get_sharing_mode() const;
XNNWorkspaceManager& workspace_manager();
const XNNWorkspaceManager& workspace_manager() const;
Expand All @@ -61,6 +64,7 @@ class XnnpackBackendOptions {
#endif

std::string packed_cache_path_;
std::atomic<bool> use_graph_runtime_{false};
};

} // namespace executorch::backends::xnnpack
19 changes: 19 additions & 0 deletions backends/xnnpack/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,22 @@ target_include_directories(
${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
)

# End-to-end tests for the graph runtime. They need the XNNPACK runtime, so the
# backend and its third-party libraries are linked in and the third-party
# include directories are added privately to the test target.
et_cxx_test(
  backends_xnnpack_graph_e2e_test
  SOURCES runtime/test_e2e.cpp
  EXTRA_LIBS xnnpack_backend XNNPACK pthreadpool cpuinfo
             xnnpack-microkernels-prod
)
target_include_directories(
  backends_xnnpack_graph_e2e_test
  PRIVATE
    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/XNNPACK/include
    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
)
Loading
Loading