Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions backends/webgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -118,4 +118,10 @@ endfunction()
# Native WebGPU test executables; only declared when
# EXECUTORCH_BUILD_WEBGPU_TEST is enabled. Each call passes a target name and
# the single source file for that test to add_webgpu_native_test (defined
# earlier in this file, outside this hunk).
if(EXECUTORCH_BUILD_WEBGPU_TEST)
add_webgpu_native_test(webgpu_native_test test/test_webgpu_native.cpp)
add_webgpu_native_test(webgpu_rms_norm_test test/native/test_rms_norm.cpp)
add_webgpu_native_test(
webgpu_dispatch_order_test test/native/test_dispatch_order.cpp
)
add_webgpu_native_test(
webgpu_scratch_buffer_test test/native/test_scratch_buffer.cpp
)
endif()
15 changes: 12 additions & 3 deletions backends/webgpu/scripts/test_webgpu_native_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,14 @@ fi
cd "${EXECUTORCH_ROOT}"

# ── Exports for the model-driven executables (best-effort) ───────────────────
# native_test + rms_norm read .pte/golden inputs via WEBGPU_TEST_* env and
# self-skip if absent; dispatch_order + scratch are standalone (no exports).
# native_test + rms_norm + dispatch_order read .pte/golden inputs via env/dir and
# self-skip if absent; scratch is standalone (generates its own inputs).
PTE_MODEL="/tmp/webgpu_add_test.pte"
PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
RMS_NORM_DIR="/tmp/rmsn"
RMS_NORM_OK=1
DISPATCH_ORDER_DIR="/tmp/dispatch_order"
DISPATCH_ORDER_OK=1

$PYTHON_EXECUTABLE -c "
from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
Expand All @@ -55,6 +57,11 @@ from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rm
export_rms_norm_cases('${RMS_NORM_DIR}')
" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_OK=0; }

$PYTHON_EXECUTABLE -c "
from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
export_dispatch_order_cases('${DISPATCH_ORDER_DIR}')
" || { echo "WARN: dispatch_order export failed; skipping dispatch_order native test"; DISPATCH_ORDER_OK=0; }

# ── Configure (Dawn-only: no -DWEBGPU_IMPL; Dawn is the sole backend) ─────────
echo "=== Configure WebGPU native tests on Dawn ==="
rm -rf "${BUILD_DIR}"
Expand Down Expand Up @@ -115,7 +122,9 @@ fi
if [[ "${RMS_NORM_OK}" == "1" && -x "${BIN_DIR}/webgpu_rms_norm_test" ]]; then
"${BIN_DIR}/webgpu_rms_norm_test" "${RMS_NORM_DIR}"
fi
[[ -x "${BIN_DIR}/webgpu_dispatch_order_test" ]] && "${BIN_DIR}/webgpu_dispatch_order_test"
if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then
"${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
fi
[[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"

echo "=== WebGPU native tests on Dawn: all run targets passed ==="
167 changes: 167 additions & 0 deletions backends/webgpu/test/native/test_dispatch_order.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <string>
#include <vector>

using namespace executorch::backends::webgpu;
using namespace executorch::extension;
using namespace executorch::runtime;

namespace {

// One exported dispatch-order test case.
struct Case {
// File stem of the case: run_case reads <dir>/<name>.input.bin,
// <dir>/<name>.golden.bin and <dir>/<name>.pte.
const char* name;
// Input tensor dimensions passed to make_tensor_ptr; their product must equal
// the number of floats in <name>.input.bin.
std::vector<int32_t> sizes;
};

// Mirrors _CASES in test_dispatch_order.py (add-chain or rms_norm+add chain).
// Each entry is {case name, input tensor sizes}; main() runs every entry, and
// run_case fails a case whose input file does not hold exactly the number of
// elements implied by its sizes, so names and shapes here must match what the
// exporter wrote.
const std::vector<Case> kCases = {
{"single", {16, 16}},
{"chain3", {64, 64}},
{"chain5_tiny", {1, 1}},
{"chain5_wide", {7, 896}},
{"chain8", {256, 256}},
{"deep32", {128, 128}},
{"large_chain", {1024, 1024}},
{"het_small", {1, 1, 7, 896}},
{"het_deep", {1, 1, 5, 256}},
};

// Loads a file of tightly packed float32 values (bytes are copied verbatim
// into float storage, so the file must use the host's float representation).
//
// Returns the decoded values, or an empty vector when the file cannot be
// opened, its size cannot be determined, its size is not a multiple of
// sizeof(float), or fewer bytes than expected could be read. Callers treat an
// empty result as failure.
std::vector<float> read_f32_bin(const std::string& path) {
  // Open positioned at the end so tellg() yields the file size.
  std::ifstream f(path, std::ios::binary | std::ios::ate);
  if (!f) {
    return {};
  }

  // tellg() reports failure as -1; reject it explicitly instead of letting it
  // wrap to a huge size_t.
  const std::streamoff end_pos = f.tellg();
  if (end_pos < 0) {
    return {};
  }
  const auto file_size = static_cast<size_t>(end_pos);
  if (file_size % sizeof(float) != 0) {
    return {}; // truncated/corrupt golden; caller treats empty as failure
  }

  std::vector<float> data(file_size / sizeof(float));
  f.seekg(0);
  // A short or failed read would otherwise leave zero-initialized tail
  // elements that look like valid data; surface it as a failure instead.
  if (!f.read(
          reinterpret_cast<char*>(data.data()),
          static_cast<std::streamsize>(file_size))) {
    return {};
  }
  return data;
}

// Runs a single dispatch-order case and compares the result with its golden.
//
// Reads <dir>/<tc.name>.input.bin and .golden.bin, loads <dir>/<tc.name>.pte,
// runs forward() on the input shaped as tc.sizes, and compares output[0]
// element-wise against the golden values.
//
// Returns true when the case passes. Returns false (after printing a FAIL
// line) when a file is missing/corrupt, the program cannot be loaded or run,
// element counts disagree, the comparison hits a NaN/Inf mismatch, or the
// error exceeds the tolerance.
bool run_case(const std::string& dir, const Case& tc) {
  constexpr float kTolerance = 1e-3f;

  printf("\n--- dispatch_order[%s] ---\n", tc.name);
  const std::string base = dir + "/" + tc.name;
  const std::vector<float> input = read_f32_bin(base + ".input.bin");
  const std::vector<float> golden = read_f32_bin(base + ".golden.bin");
  if (input.empty() || golden.empty()) {
    printf("FAIL: could not read input/golden for %s\n", tc.name);
    return false;
  }

  Module module(base + ".pte");
  if (module.load_forward() != Error::Ok) {
    printf("FAIL: could not load %s.pte\n", tc.name);
    return false;
  }

  // The input file must contain exactly the element count implied by the
  // case's shape; otherwise the exported data and kCases are out of sync.
  size_t expected = 1;
  for (const int32_t d : tc.sizes) {
    expected *= static_cast<size_t>(d);
  }
  if (input.size() != expected) {
    printf(
        "FAIL: input numel %zu != expected %zu for %s\n",
        input.size(),
        expected,
        tc.name);
    return false;
  }

  auto x = make_tensor_ptr(tc.sizes, std::vector<float>(input));
  auto result = module.forward({EValue(x)});
  if (!result.ok()) {
    printf(
        "FAIL: forward failed (error %d)\n", static_cast<int>(result.error()));
    return false;
  }
  const auto& outputs = result.get();
  if (outputs.empty() || !outputs[0].isTensor()) {
    printf("FAIL: no tensor output\n");
    return false;
  }
  const auto& out_tensor = outputs[0].toTensor();
  if (static_cast<size_t>(out_tensor.numel()) != golden.size()) {
    printf(
        "FAIL: output numel %zu != golden %zu\n",
        static_cast<size_t>(out_tensor.numel()),
        golden.size());
    return false;
  }
  // NOTE(review): assumes the output tensor holds float32 data; the dtype is
  // not checked here.
  const float* out_data = out_tensor.const_data_ptr<float>();

  float max_abs_err = 0.0f;
  float max_rel_err = 0.0f;
  size_t non_finite_mismatches = 0;
  for (size_t i = 0; i < golden.size(); i++) {
    // Bit-for-bit equal values (including matching infinities) contribute
    // zero error; skipping them avoids inf - inf producing NaN.
    if (out_data[i] == golden[i]) {
      continue;
    }
    const float abs_err = std::abs(out_data[i] - golden[i]);
    // std::max(a, NaN) returns a, so a NaN error would otherwise vanish from
    // the running maxima and the case would pass. Count it explicitly.
    if (!std::isfinite(abs_err)) {
      ++non_finite_mismatches;
      continue;
    }
    max_abs_err = std::max(max_abs_err, abs_err);
    // Clamp the denominator so near-zero goldens do not blow up rel error.
    const float denom = std::max(std::abs(golden[i]), 1e-6f);
    max_rel_err = std::max(max_rel_err, abs_err / denom);
  }
  printf(
      "Max abs error: %e Max rel error: %e (%zu elements)\n",
      max_abs_err,
      max_rel_err,
      golden.size());
  if (non_finite_mismatches != 0) {
    printf(
        "FAIL: dispatch_order[%s] has %zu NaN/Inf mismatches\n",
        tc.name,
        non_finite_mismatches);
    return false;
  }
  // Lenient gate: pass iff abs<=tol OR rel<=tol (near-zero goldens). The two
  // maxima are tracked independently, so they may come from different
  // elements.
  if (max_abs_err > kTolerance && max_rel_err > kTolerance) {
    printf("FAIL: dispatch_order[%s] exceeds tolerance 1e-3\n", tc.name);
    return false;
  }
  printf("PASS: dispatch_order[%s]\n", tc.name);
  return true;
}

} // namespace

// Entry point: runs every case in kCases against the exported artifacts.
//
// Case directory resolution (highest priority first):
//   1. WEBGPU_DISPATCH_ORDER_DIR environment variable
//   2. argv[1]
//   3. "/tmp/dispatch_order"
//
// Exit code is 0 when all cases pass or when no WebGPU context could be
// created (reported as SKIP), and 1 when at least one case fails.
int main(int argc, char** argv) {
  const char* env_dir = std::getenv("WEBGPU_DISPATCH_ORDER_DIR");
  std::string dir;
  if (env_dir != nullptr) {
    dir = env_dir;
  } else if (argc > 1) {
    dir = argv[1];
  } else {
    dir = "/tmp/dispatch_order";
  }

  // Context creation reports failure by throwing; treat that as a skip
  // rather than a test failure.
  WebGPUContext ctx;
  try {
    ctx = create_webgpu_context();
  } catch (const std::exception& e) {
    printf("SKIP: %s\n", e.what());
    return 0;
  }
  set_default_webgpu_context(&ctx);
  printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str());

  // Run every case even after a failure so the log covers all of them.
  bool all_passed = true;
  for (const auto& test_case : kCases) {
    if (!run_case(dir, test_case)) {
      all_passed = false;
    }
  }

  // Unregister the default context before destroying it.
  set_default_webgpu_context(nullptr);
  destroy_webgpu_context(ctx);

  if (all_passed) {
    printf("\nAll dispatch_order tests passed\n");
  }
  return all_passed ? 0 : 1;
}
Loading
Loading