From 8fba37c5e1cfe6d1698ff2855b986232fe6d7f1f Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Thu, 11 Jun 2026 16:41:44 -0700 Subject: [PATCH] Update [ghstack-poisoned] --- .../webgpu/scripts/test_webgpu_native_ci.sh | 6 + .../test/ops/quantized_linear/__init__.py | 5 + .../quantized_linear/test_quantized_linear.py | 158 +++++++++++++++ backends/webgpu/test/test_webgpu_native.cpp | 184 ++++++++++++++++++ 4 files changed, 353 insertions(+) create mode 100644 backends/webgpu/test/ops/quantized_linear/__init__.py create mode 100644 backends/webgpu/test/ops/quantized_linear/test_quantized_linear.py diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh index 29263cc934d..89ecd9c8d29 100644 --- a/backends/webgpu/scripts/test_webgpu_native_ci.sh +++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh @@ -54,6 +54,11 @@ export_add_model('${PTE_MODEL}') export_chained_add_model('${PTE_CHAINED_MODEL}') " || echo "WARN: add export failed; webgpu_native_test self-skips models whose .pte is absent" +$PYTHON_EXECUTABLE -c " +from executorch.backends.webgpu.test.ops.quantized_linear.test_quantized_linear import export_all_quantized_linear_models +export_all_quantized_linear_models('/tmp') +" || echo "WARN: q4gsw export failed; required configs will FAIL in webgpu_native_test" + $PYTHON_EXECUTABLE -c " from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases export_rms_norm_cases('${RMS_NORM_DIR}') @@ -141,6 +146,7 @@ if [[ -x "${BIN_DIR}/webgpu_native_test" && -f "${PTE_MODEL}" ]]; then env WEBGPU_TEST_MODEL="${PTE_MODEL}" \ WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \ WEBGPU_TEST_SDPA_DIR=/tmp/ \ + WEBGPU_TEST_QUANTIZED_LINEAR_DIR=/tmp/ \ "${BIN_DIR}/webgpu_native_test" else echo "(skipping webgpu_native_test: no exported .pte — needs the executorch python wheel)" diff --git a/backends/webgpu/test/ops/quantized_linear/__init__.py 
b/backends/webgpu/test/ops/quantized_linear/__init__.py new file mode 100644 index 00000000000..2e41cd717f6 --- /dev/null +++ b/backends/webgpu/test/ops/quantized_linear/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/backends/webgpu/test/ops/quantized_linear/test_quantized_linear.py b/backends/webgpu/test/ops/quantized_linear/test_quantized_linear.py new file mode 100644 index 00000000000..1ee69ca9ea5 --- /dev/null +++ b/backends/webgpu/test/ops/quantized_linear/test_quantized_linear.py @@ -0,0 +1,158 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""4-bit weight-only quantized linear (`et_vk.linear_q4gsw`) export + fp64 golden. + +Mirrors test_sdpa.py: a named CONFIGS sweep over real Llama-3.2-1B linear shapes +(q/o/k/v/gate/up/down proj + lm_head) plus large-M (4k/8k) prefill stress, each +exported through VulkanPartitioner (which fuses dq+linear into +`et_vk.linear_q4gsw.default`). The golden is the fp64 dequant-matmul truth +(x @ dequant(W).T), so the GPU's fp32 error is measured against truth, not another +fp32 approximation. The native test (test_webgpu_native.cpp) mirrors the same +CONFIGS table and reconstructs the identical deterministic ramp input bit-for-bit. 
+""" + +import os +import unittest +from dataclasses import dataclass + +import numpy as np +import torch + +from executorch.backends.vulkan import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower +from torchao.quantization.granularity import PerGroup +from torchao.quantization.quant_api import IntxWeightOnlyConfig, quantize_ + + +@dataclass(frozen=True) +class Q4gswConfig: + name: str + m: int # rows (tokens) + k: int # in_features (reduction dim) + n: int # out_features + group_size: int = 32 # K % group_size == 0, K % 8 == 0, N % 8 == 0 + # heavy = huge fixture / slow on a CPU rasterizer; export_all skips unless asked. + heavy: bool = False + + +# Single source of truth, mirrored by the C++ kQ4gswConfigs table. Llama-3.2-1B: +# hidden=2048, n_heads=32 head_dim=64 (q/o=2048->2048), n_kv=8 (k/v=2048->512), +# FFN=8192 (gate/up=2048->8192), down=8192->2048, vocab=128256 (lm_head). +CONFIGS = [ + # name M K N + Q4gswConfig("q_proj", 1, 2048, 2048), # also covers o_proj (same shape) + Q4gswConfig("kv_proj", 1, 2048, 512), # k_proj / v_proj + Q4gswConfig("gate_proj", 1, 2048, 8192), # gate_proj / up_proj + Q4gswConfig("down_proj", 1, 8192, 2048), # big reduction K + Q4gswConfig("lm_head", 1, 2048, 128256, heavy=True), # 131MB packed .pte + Q4gswConfig("q_proj_4k", 4096, 2048, 2048), # 4k-token prefill + Q4gswConfig("kv_proj_4k", 4096, 2048, 512), + Q4gswConfig("q_proj_8k", 8192, 2048, 2048, heavy=True), # 67MB golden + Q4gswConfig("kv_proj_8k", 8192, 2048, 512, heavy=True), +] + + +def _make_quantized_model(k: int, n: int, group_size: int) -> torch.nn.Module: + torch.manual_seed(0) # load-bearing: fixes the weights the golden derives from + m = torch.nn.Linear(k, n, bias=False).eval() + quantize_( + m, + IntxWeightOnlyConfig(weight_dtype=torch.int4, granularity=PerGroup(group_size)), + ) + return m + + +def _ramp_input(m_rows: int, k: int) -> torch.Tensor: + """Deterministic fp32 input [M,K]; C++ q4gsw_ramp reconstructs it bit-for-bit. 
+ + x[flat] = ((flat % 17) - 8) / 16 over the flat row-major index -- exact in fp32 + (small modulus, power-of-two denominator). + """ + flat = np.arange(m_rows * k, dtype=np.int64) + x = ((flat % 17) - 8).astype(np.float32) / np.float32(16.0) + return torch.from_numpy(x).reshape(m_rows, k) + + +def _fp64_golden(m: torch.nn.Module, x: torch.Tensor) -> np.ndarray: + """fp64 truth: x @ dequant(W).T. The kernel computes the same dequant-matmul, so + fp64 makes this the true answer -- GPU fp32 error is measured vs truth, not vs a + second fp32 approximation. torchao handles the signed-nibble recovery in dequantize(). + """ + wq = m.weight.dequantize() # AffineQuantizedTensor -> dequantized weight [N,K] + golden = x.double() @ wq.double().t() # [M,N] in fp64 + return golden.to(torch.float32).numpy().astype(" None: + # Every config must fuse to a VulkanBackend delegate (q4gsw). Fusion is + # shape-independent, so M=1 keeps even the heavy configs cheap to check. + for cfg in CONFIGS: + with self.subTest(config=cfg.name): + m = _make_quantized_model(cfg.k, cfg.n, cfg.group_size) + et = _export(m, _ramp_input(1, cfg.k)) + found = any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + self.assertTrue(found, f"no VulkanBackend delegate in {cfg.name}") + + def test_golden_matches_eager(self) -> None: + # Dual oracle (mirrors SDPA test_golden_matches_eager_op): the fp64 dequant- + # matmul truth and torchao's own fp32 quantized forward are independent refs + # that must agree -- guards a bug in the fp64 oracle / dequantize() accessor. + # M=1 shapes only (cheap; the math is shape-independent). 
+ for cfg in CONFIGS: + if cfg.m != 1: + continue + with self.subTest(config=cfg.name): + m = _make_quantized_model(cfg.k, cfg.n, cfg.group_size) + x = _ramp_input(1, cfg.k) + golden = torch.from_numpy(_fp64_golden(m, x)) + torch.testing.assert_close(m(x), golden, atol=1e-2, rtol=1e-2) + + +def export_quantized_linear_model( + cfg: Q4gswConfig, pte_path: str, golden_path: str +) -> None: + """Export one config's q4gsw .pte + its fp64 golden (raw LE fp32).""" + m = _make_quantized_model(cfg.k, cfg.n, cfg.group_size) + x = _ramp_input(cfg.m, cfg.k) + et = _export(m, x) + with open(pte_path, "wb") as f: + f.write(et.buffer) + _fp64_golden(m, x).tofile(golden_path) + print(f"Exported {pte_path}; golden {golden_path} ({cfg.m * cfg.n} floats)") + + +def export_all_quantized_linear_models( + out_dir: str, include_heavy: bool = False +) -> None: + """Write q4gsw_<name>.pte + q4gsw_<name>.golden.bin for each config. + + Heavy configs (lm_head 131MB .pte; M=8k 67MB goldens) are skipped unless + include_heavy -- plain CI never writes them; a real-GPU run opts in. + """ + for cfg in CONFIGS: + if cfg.heavy and not include_heavy: + print(f"(skipping heavy config {cfg.name}; set include_heavy=True)") + continue + pte = os.path.join(out_dir, f"q4gsw_{cfg.name}.pte") + golden = os.path.join(out_dir, f"q4gsw_{cfg.name}.golden.bin") + export_quantized_linear_model(cfg, pte, golden) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp index 794481e988a..6e4a777593e 100644 --- a/backends/webgpu/test/test_webgpu_native.cpp +++ b/backends/webgpu/test/test_webgpu_native.cpp @@ -373,6 +373,165 @@ static bool sdpa_within_tol( return ok; } +// linear_q4gsw sweep config; mirrors CONFIGS in test_quantized_linear.py.
struct Q4gswConfig {
  const char* name; // config id (used in log lines and fixture file names)
  int m; // rows (tokens)
  int k; // in_features (reduction dim)
  int n; // out_features
  float tol_abs; // per-element abs gate
  float tol_rel; // per-element rel gate
  bool required; // dir set + .pte absent => FAIL (not skip)
  bool heavy; // huge/slow: export-gated; runs only if WEBGPU_TEST_HEAVY
};

// Llama-3.2-1B linear shapes (q/o/k/v/gate/up/down + lm_head) + 4k/8k prefill.
// Rows with a larger K or M carry looser tolerances (1e-3 abs / 1e-2 rel).
static const Q4gswConfig kQ4gswConfigs[] = {
    // name        M     K     N       tol_abs tol_rel req    heavy
    {"q_proj", 1, 2048, 2048, 1e-4f, 1e-3f, true, false},
    {"kv_proj", 1, 2048, 512, 1e-4f, 1e-3f, true, false},
    {"gate_proj", 1, 2048, 8192, 1e-4f, 1e-3f, true, false},
    {"down_proj", 1, 8192, 2048, 1e-3f, 1e-2f, true, false}, // big-K accum
    {"lm_head", 1, 2048, 128256, 1e-4f, 1e-3f, false, true},
    {"q_proj_4k", 4096, 2048, 2048, 1e-3f, 1e-2f, true, false},
    {"kv_proj_4k", 4096, 2048, 512, 1e-3f, 1e-2f, true, false},
    {"q_proj_8k", 8192, 2048, 2048, 1e-3f, 1e-2f, false, true},
    {"kv_proj_8k", 8192, 2048, 512, 1e-3f, 1e-2f, false, true},
};

// /16 ramp over the flat index; mirrors test_quantized_linear.py _ramp_input.
// Returns ((i % 17) - 8) / 16, i.e. one of 17 values in [-0.5, 0.5], each
// exactly representable in fp32 (small integer over a power-of-two divisor).
// Expects i >= 0: C++ `%` truncates toward zero, so a negative index would
// not follow the same 17-periodic pattern.
static float q4gsw_ramp(int i) {
  return static_cast<float>((i % 17) - 8) / 16.0f;
}

// Per-element dual tolerance (abs OR rel), parameterized like sdpa_within_tol.
//
// An element is accepted when its absolute error is <= atol OR its relative
// error is <= rtol; the relative error divides by max(|golden|, 1e-6) so a
// zero golden does not divide by zero. Returns true iff all n elements are
// accepted (vacuously true for n <= 0). The largest absolute / relative errors
// seen are written to *ma / *mr, which must be non-null.
//
// NaN handling: every comparison against NaN is false, so the acceptance test
// is phrased as "not within tolerance" to make a NaN output (or NaN golden)
// fail instead of slipping through, and the running maxima are made sticky on
// NaN because std::max would silently discard a NaN operand.
static bool quant_within_tol(
    const float* out,
    const float* golden,
    int n,
    float atol,
    float rtol,
    float* ma,
    float* mr) {
  float max_abs = 0.0f, max_rel = 0.0f;
  bool ok = true;
  for (int i = 0; i < n; i++) {
    const float ae = std::abs(out[i] - golden[i]);
    const float re = ae / std::max(std::abs(golden[i]), 1e-6f);
    if (std::isnan(ae) || ae > max_abs) {
      max_abs = ae;
    }
    if (std::isnan(re) || re > max_rel) {
      max_rel = re;
    }
    // Equivalent to (ae > atol && re > rtol) for ordinary values, but also
    // true when either error is NaN.
    if (!(ae <= atol || re <= rtol)) {
      ok = false;
    }
  }
  *ma = max_abs;
  *mr = max_rel;
  return ok;
}

// Reconstruct _ramp_input bit-for-bit, run the op, compare to the fp64 golden.
+static bool test_q4gsw_config( + const Q4gswConfig& cfg, + const std::string& pte, + const std::string& golden_path) { + printf( + "\n--- Test: linear_q4gsw (%s: M=%d,K=%d,N=%d) ---\n", + cfg.name, + cfg.m, + cfg.k, + cfg.n); + + Module module(pte); + if (module.load_forward() != Error::Ok) { + printf("FAIL: could not load %s\n", pte.c_str()); + return false; + } + + const int in_numel = cfg.m * cfg.k; + const int out_numel = cfg.m * cfg.n; + std::vector input(in_numel); + for (int i = 0; i < in_numel; i++) { + input[i] = q4gsw_ramp(i); + } + + auto x = make_tensor_ptr({cfg.m, cfg.k}, std::vector(input)); + auto result = module.forward({EValue(x)}); + if (!result.ok()) { + printf("FAIL: forward failed (error %d)\n", (int)result.error()); + return false; + } + const auto& outputs = result.get(); + if (outputs.empty() || !outputs[0].isTensor()) { + printf("FAIL: no tensor output\n"); + return false; + } + const auto& out_tensor = outputs[0].toTensor(); + if (out_tensor.numel() != out_numel) { + printf( + "FAIL: output numel %zu != expected %d\n", + (size_t)out_tensor.numel(), + out_numel); + return false; + } + const float* out_data = out_tensor.const_data_ptr(); + + std::vector golden = load_golden(golden_path, out_numel); + if (golden.empty()) { + printf("FAIL: could not load golden %s\n", golden_path.c_str()); + return false; + } + + float ma = 0.0f, mr = 0.0f; + const bool pass = quant_within_tol( + out_data, golden.data(), out_numel, cfg.tol_abs, cfg.tol_rel, &ma, &mr); + printf( + "Max abs error: %e Max rel error: %e (checked %d elements)\n", + ma, + mr, + out_numel); + if (!pass) { + printf( + "FAIL: linear_q4gsw %s exceeds tolerance (abs %g OR rel %g)\n", + cfg.name, + cfg.tol_abs, + cfg.tol_rel); + return false; + } + printf("PASS: linear_q4gsw %s\n", cfg.name); + return true; +} + +// q4gsw sweep: self-discover q4gsw_.pte; required=FAIL, heavy=gate, *ran. 
+static bool test_q4gsw_sweep(const std::string& dir, bool* ran) { + bool ok = true; + const bool heavy_run = std::getenv("WEBGPU_TEST_HEAVY") != nullptr; + for (const auto& cfg : kQ4gswConfigs) { + const std::string pte = dir + "q4gsw_" + cfg.name + ".pte"; + FILE* f = std::fopen(pte.c_str(), "rb"); + if (!f) { + if (cfg.required && !dir.empty()) { + printf( + "FAIL: required q4gsw config %s has no .pte in %s\n", + cfg.name, + dir.c_str()); + ok = false; + } + continue; + } + std::fclose(f); + if (cfg.heavy && !heavy_run) { + printf( + "SKIP: heavy q4gsw config %s (set WEBGPU_TEST_HEAVY=1 on a real GPU)\n", + cfg.name); + continue; + } + const std::string golden = dir + "q4gsw_" + cfg.name + ".golden.bin"; + *ran = true; + ok = test_q4gsw_config(cfg, pte, golden) && ok; + } + return ok; +} + // Fused sdpa_with_kv_cache sweep config. Mirrors the Python CONFIGS table in // test_sdpa.py exactly (name, Hq, Hkv, D, S, Cmax, input_pos). struct SdpaConfig { @@ -1274,6 +1433,15 @@ int main(int argc, char** argv) { update_cache_model_path = env; } + // Quantized-linear sweep dir (mirrors WEBGPU_TEST_SDPA_DIR). + std::string qlinear_dir; + if (const char* env = std::getenv("WEBGPU_TEST_QUANTIZED_LINEAR_DIR")) { + qlinear_dir = env; + if (!qlinear_dir.empty() && qlinear_dir.back() != '/') { + qlinear_dir += '/'; + } + } + // SDPA sweep: configs self-discover their sdpa_<name>.pte/.golden.bin under // this directory (default "" = the embedded-file root / cwd). Set // WEBGPU_TEST_SDPA_DIR to point at the exported .pte directory (e.g. /tmp/). @@ -1308,6 +1476,22 @@ int main(int argc, char** argv) { ok = test_update_cache(update_cache_model_path) && ok; } + bool q4gsw_ran = false; + bool q4gsw_ok = test_q4gsw_sweep(qlinear_dir, &q4gsw_ran); + if (q4gsw_ran) { + ok = q4gsw_ok && ok; + } + // Guard python<->C++ ramp bit-identity: q4gsw_ramp(0) = -0.5 exactly.
+ if (std::abs(q4gsw_ramp(0) - (-0.5f)) > 1e-12f) { + printf("FAIL: q4gsw_ramp bit-identity check\n"); + ok = false; + } + if (!qlinear_dir.empty() && !q4gsw_ran) { + printf( + "FAIL: WEBGPU_TEST_QUANTIZED_LINEAR_DIR set but no q4gsw config ran\n"); + ok = false; + } + bool sdpa_ran = false; bool sdpa_ok = test_sdpa_sweep(sdpa_dir, &sdpa_ran); if (sdpa_ran) {