diff --git a/Cargo.lock b/Cargo.lock
index 690940f1..41910f15 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -23,6 +23,23 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "aimdb-bench"
+version = "0.1.0"
+dependencies = [
+ "aimdb-core",
+ "aimdb-embassy-adapter",
+ "aimdb-tokio-adapter",
+ "criterion",
+ "critical-section",
+ "defmt 1.0.1",
+ "embassy-time-driver",
+ "futures",
+ "serde",
+ "serde_json",
+ "tokio",
+]
+
 [[package]]
 name = "aimdb-cli"
 version = "0.6.0"
@@ -301,6 +318,7 @@ dependencies = [
  "serde_json",
  "tokio",
  "tokio-test",
+ "tokio-util",
  "tracing",
 ]
 
@@ -383,6 +401,12 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "anes"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
+
 [[package]]
 name = "anstream"
 version = "0.6.21"
@@ -636,6 +660,12 @@ version = "1.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
 
+[[package]]
+name = "cast"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
+
 [[package]]
 name = "cc"
 version = "1.2.43"
@@ -682,6 +712,33 @@ dependencies = [
  "windows-link 0.2.1",
 ]
 
+[[package]]
+name = "ciborium"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
+dependencies = [
+ "ciborium-io",
+ "ciborium-ll",
+ "serde",
+]
+
+[[package]]
+name = "ciborium-io"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
+
+[[package]]
+name = "ciborium-ll"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
+dependencies = [
+ "ciborium-io",
+ "half",
+]
+
 [[package]]
 name = "clap"
 version = "4.5.51"
@@ -849,6 +906,40 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "criterion"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+dependencies = [
+ "anes",
+ "cast",
+ "ciborium",
+ "clap",
+ "criterion-plot",
+ "is-terminal",
+ "itertools",
+ "num-traits",
+ "once_cell",
+ "oorandom",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "walkdir",
+]
+
+[[package]]
+name = "criterion-plot"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+dependencies = [
+ "cast",
+ "itertools",
+]
+
 [[package]]
 name = "critical-section"
 version = "1.2.0"
@@ -861,6 +952,12 @@ version = "0.8.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
 
+[[package]]
+name = "crunchy"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
+
 [[package]]
 name = "crypto-common"
 version = "0.1.7"
@@ -1017,6 +1114,32 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "either"
+version = "1.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
+
+[[package]]
+name = "embassy-bench-stm32h5"
+version = "0.1.0"
+dependencies = [
+ "aimdb-core",
+ "aimdb-embassy-adapter",
+ "cortex-m",
+ "cortex-m-rt",
+ "critical-section",
+ "defmt 1.0.1",
+ "defmt-rtt",
+ "embassy-executor",
+ "embassy-futures",
+ "embassy-stm32",
+ "embassy-sync",
+ "embassy-time",
+ "embedded-alloc",
+ "panic-probe",
+]
+
 [[package]]
 name = "embassy-embedded-hal"
 version = "0.6.0"
@@ -1736,6 +1859,17 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "half"
+version = "2.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+ "zerocopy",
+]
+
 [[package]]
 name = "hash32"
 version = "0.3.1"
@@ -1824,6 +1958,12 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "hermit-abi"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
+
 [[package]]
 name = "http"
 version = "1.4.0"
@@ -2124,12 +2264,32 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "is-terminal"
+version = "0.4.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "is_terminal_polyfill"
 version = "1.70.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
 
+[[package]]
+name = "itertools"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.15"
@@ -2458,6 +2618,12 @@ version = "1.70.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
 
+[[package]]
+name = "oorandom"
+version = "11.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
+
 [[package]]
 name = "openssl"
 version = "0.10.80"
@@ -3249,11 +3415,11 @@ dependencies = [
 [[package]]
 name = "stm32-metapac"
 version = "21.0.0"
-source = "git+https://github.com/embassy-rs/stm32-data-generated?tag=stm32-data-be62608f8f93a21fe76c8f70c0fa9d30c9ab2503#46619beefd7015dffaa1f8e756f614718f0dd0bf"
+source = "git+https://github.com/embassy-rs/stm32-data-generated?tag=stm32-data-98c747c5a5eb2fe1bfa452d6375445a1c3c51628#ff0350aebb88f1498cbf5800305de327df02012c"
 dependencies = [
  "cortex-m",
  "cortex-m-rt",
- "defmt 0.3.100",
+ "defmt 1.0.1",
 ]
 
 [[package]]
@@ -3460,6 +3626,16 @@ dependencies = [
  "zerovec",
 ]
 
+[[package]]
+name = "tinytemplate"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "tokio"
 version = "1.48.0"
diff --git a/Cargo.toml b/Cargo.toml
index 2b7dbb45..f3b5ef57 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -27,6 +27,7 @@ members = [
     "examples/embassy-mqtt-connector-demo",
     "examples/embassy-knx-connector-demo",
     "examples/embassy-serial-connector-demo",
+    "examples/embassy-bench-stm32h5",
     "examples/sync-api-demo",
     "examples/remote-access-demo",
     "examples/weather-mesh-demo/weather-mesh-common",
@@ -36,6 +37,8 @@ members = [
     "examples/weather-mesh-demo/weather-station-gamma",
     "examples/hello-mailbox",
     "examples/hello-single-latest-async",
+    # Benchmarking infrastructure — host-only, excluded from default-members
+    "aimdb-bench",
 ]
 exclude = ["_external"]
 resolver = "2"
diff --git a/Makefile b/Makefile
index 3ea61cef..eeb97b7d 100644
--- a/Makefile
+++ b/Makefile
@@ -106,6 +106,8 @@ build:
 	cargo build --package aimdb-serial-connector --no-default-features --features "tokio-runtime"
 	@printf "$(YELLOW)  → Building WASM adapter$(NC)\n"
 	cargo build --package aimdb-wasm-adapter --target wasm32-unknown-unknown --features "wasm-runtime"
+	@printf "$(YELLOW)  → Building benchmarking infrastructure (host-only, incl. benches)$(NC)\n"
+	cargo build --package aimdb-bench --benches
 
 test:
 	@printf "$(GREEN)Running all tests (valid combinations)...$(NC)\n"
@@ -176,7 +178,7 @@ test:
 
 fmt:
 	@printf "$(GREEN)Formatting code (workspace members only)...$(NC)\n"
-	@for pkg in aimdb-executor aimdb-derive aimdb-data-contracts aimdb-core aimdb-client aimdb-embassy-adapter aimdb-tokio-adapter aimdb-wasm-adapter aimdb-sync aimdb-persistence aimdb-persistence-sqlite aimdb-mqtt-connector aimdb-knx-connector aimdb-ws-protocol aimdb-websocket-connector aimdb-uds-connector aimdb-serial-connector aimdb-codegen aimdb-cli aimdb-mcp sync-api-demo tokio-mqtt-connector-demo embassy-mqtt-connector-demo tokio-knx-connector-demo embassy-knx-connector-demo embassy-serial-connector-demo weather-mesh-common weather-hub weather-station-alpha weather-station-beta hello-mailbox hello-single-latest-async; do \
+	@for pkg in aimdb-executor aimdb-derive aimdb-data-contracts aimdb-core aimdb-client aimdb-embassy-adapter aimdb-tokio-adapter aimdb-wasm-adapter aimdb-sync aimdb-persistence aimdb-persistence-sqlite aimdb-mqtt-connector aimdb-knx-connector aimdb-ws-protocol aimdb-websocket-connector aimdb-uds-connector aimdb-serial-connector aimdb-codegen aimdb-cli aimdb-mcp sync-api-demo tokio-mqtt-connector-demo embassy-mqtt-connector-demo tokio-knx-connector-demo embassy-knx-connector-demo embassy-serial-connector-demo embassy-bench-stm32h5 weather-mesh-common weather-hub weather-station-alpha weather-station-beta hello-mailbox hello-single-latest-async aimdb-bench; do \
 		printf "$(YELLOW)  → Formatting $$pkg$(NC)\n"; \
 		cargo fmt -p $$pkg 2>/dev/null || true; \
 	done
@@ -185,7 +187,7 @@ fmt:
 fmt-check:
 	@printf "$(GREEN)Checking code formatting (workspace members only)...$(NC)\n"
 	@FAILED=0; \
-	for pkg in aimdb-executor aimdb-derive aimdb-data-contracts aimdb-core aimdb-client aimdb-embassy-adapter aimdb-tokio-adapter aimdb-wasm-adapter aimdb-sync aimdb-persistence aimdb-persistence-sqlite aimdb-mqtt-connector aimdb-knx-connector aimdb-ws-protocol aimdb-websocket-connector aimdb-uds-connector aimdb-serial-connector aimdb-codegen aimdb-cli aimdb-mcp sync-api-demo tokio-mqtt-connector-demo embassy-mqtt-connector-demo tokio-knx-connector-demo embassy-knx-connector-demo embassy-serial-connector-demo weather-mesh-common weather-hub weather-station-alpha weather-station-beta hello-mailbox hello-single-latest-async; do \
+	for pkg in aimdb-executor aimdb-derive aimdb-data-contracts aimdb-core aimdb-client aimdb-embassy-adapter aimdb-tokio-adapter aimdb-wasm-adapter aimdb-sync aimdb-persistence aimdb-persistence-sqlite aimdb-mqtt-connector aimdb-knx-connector aimdb-ws-protocol aimdb-websocket-connector aimdb-uds-connector aimdb-serial-connector aimdb-codegen aimdb-cli aimdb-mcp sync-api-demo tokio-mqtt-connector-demo embassy-mqtt-connector-demo tokio-knx-connector-demo embassy-knx-connector-demo embassy-serial-connector-demo embassy-bench-stm32h5 weather-mesh-common weather-hub weather-station-alpha weather-station-beta hello-mailbox hello-single-latest-async aimdb-bench; do \
 		printf "$(YELLOW)  → Checking $$pkg$(NC)\n"; \
 		if ! cargo fmt -p $$pkg -- --check 2>&1; then \
 			printf "$(RED)❌ Formatting check failed for $$pkg$(NC)\n"; \
@@ -262,6 +264,8 @@ clippy:
 	cargo clippy --package aimdb-serial-connector --target thumbv7em-none-eabihf --no-default-features --features "embassy-runtime,defmt" -- -D warnings
 	@printf "$(YELLOW)  → Clippy on WASM adapter$(NC)\n"
 	cargo clippy --package aimdb-wasm-adapter --target wasm32-unknown-unknown --features "wasm-runtime" -- -D warnings
+	@printf "$(YELLOW)  → Clippy on benchmarking infrastructure (host-only, incl. benches)$(NC)\n"
+	cargo clippy --package aimdb-bench --all-targets -- -D warnings
 
 doc:
 	@printf "$(GREEN)Generating dual-platform documentation...$(NC)\n"
@@ -372,6 +376,8 @@ examples:
 	cargo build --package embassy-knx-connector-demo --target thumbv7em-none-eabihf
 	@printf "$(YELLOW)  → Building embassy-serial-connector-demo (embedded, embassy runtime)$(NC)\n"
 	cargo build --package embassy-serial-connector-demo --target thumbv7em-none-eabihf
+	@printf "$(YELLOW)  → Building embassy-bench-stm32h5 (B3 on-target profiling, embassy runtime)$(NC)\n"
+	cargo build --package embassy-bench-stm32h5 --target thumbv7em-none-eabihf
 	@printf "$(YELLOW)  → Building weather-mesh-demo: weather-mesh-common$(NC)\n"
 	cargo build --package weather-mesh-common
 	@printf "$(YELLOW)  → Building weather-mesh-demo: weather-hub (cloud aggregator)$(NC)\n"
diff --git a/_external/embassy b/_external/embassy
index 9e316798..20f6d85e 160000
--- a/_external/embassy
+++ b/_external/embassy
@@ -1 +1 @@
-Subproject commit 9e31679810ce57f10d0466fbe62afd3502d98357
+Subproject commit 20f6d85e827d3ecf50419001f2746fe5ec2186cc
diff --git a/aimdb-bench/Cargo.toml b/aimdb-bench/Cargo.toml
new file mode 100644
index 00000000..9d3ab4ce
--- /dev/null
+++ b/aimdb-bench/Cargo.toml
@@ -0,0 +1,77 @@
+[package]
+name = "aimdb-bench"
+version = "0.1.0"
+edition = "2021"
+publish = false
+license = "Apache-2.0"
+description = "Benchmarking infrastructure for AimDB — not for production use"
+
+[lib]
+name = "aimdb_bench"
+
+[[bench]]
+name = "b0_alloc_tokio"
+harness = false
+
+# Combined B1 (latency) + B2 (throughput) — a throughput-annotated Criterion
+# bench already reports per-iteration latency, so a separate b1_latency target
+# would be redundant. See design 038 §3–§4.
+[[bench]]
+name = "b1_b2_tokio"
+harness = false
+
+[[bench]]
+name = "b_runner_pipeline"
+harness = false
+
+[[bench]]
+name = "b_alloc_pipeline"
+harness = false
+
+[[bench]]
+name = "b0_alloc_embassy"
+harness = false
+
+[[bench]]
+name = "b1_b2_embassy"
+harness = false
+
+[dependencies]
+# Core AimDB types
+aimdb-core = { path = "../aimdb-core", features = ["std"] }
+
+# Tokio runtime adapter (the primary target for B0-B2)
+aimdb-tokio-adapter = { path = "../aimdb-tokio-adapter", features = [
+    "tokio-runtime",
+] }
+
+aimdb-embassy-adapter = { path = "../aimdb-embassy-adapter", default-features = false, features = [
+    "alloc",
+    "embassy-sync",
+    "embassy-time",
+] }
+
+# Async runtime — current-thread executor is used for noise reduction in B0
+tokio = { workspace = true }
+
+# JSON output for baseline snapshots
+serde = { workspace = true }
+serde_json = { workspace = true }
+
+[dev-dependencies]
+# Statistical benchmarking for B1/B2
+criterion = { version = "0.5", default-features = false, features = [
+    "cargo_bench_support",
+] }
+
+# Host driver for the Embassy buffer futures (matches the adapter's own host
+# tests, which also use `futures::executor::block_on`).
+futures = "0.3"
+
+# `critical-section` host impl for embassy-sync's `CriticalSectionRawMutex`.
+# Linked only into the embassy bench binaries; the lib rlib does not need it.
+critical-section = { version = "1.1", features = ["std"] }
+
+# defmt logger / panic-handler + embassy-time driver stubs for the host bench
+defmt = { workspace = true }
+embassy-time-driver = { path = "../_external/embassy/embassy-time-driver" }
diff --git a/aimdb-bench/README.md b/aimdb-bench/README.md
new file mode 100644
index 00000000..c87c940a
--- /dev/null
+++ b/aimdb-bench/README.md
@@ -0,0 +1,131 @@
+# aimdb-bench
+
+Benchmarking infrastructure for AimDB. **Not for production use.**
+
+Measures three classes of performance across three canonical workload profiles:
+
+| Class | Tool | Purpose | CI gate? |
+|---|---|---|---|
+| **B0** — allocations/msg | hand-rolled `CountingAllocator` | regression detection on the consume path | phase 5 (planned) |
+| **B1** — push-to-recv latency | Criterion p50/p99 | trend tracking | no |
+| **B2** — steady-state throughput | Criterion msgs/sec | trend tracking | no |
+
+**B1 and B2 share one bench per adapter** (`b1_b2_tokio`, `b1_b2_embassy`). A throughput-annotated Criterion
+bench reports both the per-iteration time (**B1 latency**, the `time` column) and
+messages/second (**B2 throughput**, the `thrpt` column) from the same runs, so
+there is no separate `b1_latency` target.
+
+Plus two informational benches that exercise the full runner-driven pipeline.
+
+**Adapters covered:**
+
+- **Tokio** — `b0_alloc_tokio`, `b1_b2_tokio` (host).
+- **Embassy** — `b0_alloc_embassy`, `b1_b2_embassy`
+  (host). These drive the real `EmbassyBuffer` backend via
+  `futures::executor::block_on` over embassy-sync's poll methods — no
+  `embassy-runtime`, no cortex-m executor, no hardware. The buffer constructors
+  live in [`profiles_embassy`](src/profiles_embassy.rs).
+- **Embassy on-target (B3)** — cycle-accurate per-message profiling (`DWT`
+  `CYCCNT`) on an STM32H563ZI lives in a separate hardware-only crate,
+  [`examples/embassy-bench-stm32h5`](../examples/embassy-bench-stm32h5), because
+  it cannot run on a host. It also re-validates 0 allocs/msg against the real
+  embedded allocator.
+
+---
+
+## Workload profiles
+
+Every bench runs the same three profiles, matching the three AimDB buffer types:
+
+| Profile | Buffer | Tokio primitive | Payload |
+|---|---|---|---|
+| **Telemetry** | `SpmcRing` | `broadcast` | small (32 B) |
+| **State** | `SingleLatest` | `watch` | medium (48 B) |
+| **Command** | `Mailbox` | `Mutex + Notify` | small (32 B) |
+
+---
+
+## Running
+
+Always run from the workspace root (`/aimdb_ws/aimdb`).
+
+```sh
+# B0 — allocation gate (buffer layer)
+cargo bench -p aimdb-bench --bench b0_alloc_tokio
+
+# B1 + B2 — latency (time/iter) and throughput (msgs/sec), one Criterion suite
+cargo bench -p aimdb-bench --bench b1_b2_tokio
+
+# Embassy buffer backend (host) — same classes
+cargo bench -p aimdb-bench --bench b0_alloc_embassy
+cargo bench -p aimdb-bench --bench b1_b2_embassy
+
+# Informational: allocation count through the runner pipeline
+cargo bench -p aimdb-bench --bench b_alloc_pipeline
+
+# Informational: runner-pipeline throughput (Criterion)
+cargo bench -p aimdb-bench --bench b_runner_pipeline
+
+# All at once
+cargo bench -p aimdb-bench
+```
+
+### Criterion baselines
+
+B1 and B2 use Criterion's built-in baseline system:
+
+```sh
+# Save a named baseline before a change
+cargo bench -p aimdb-bench --bench b1_b2_tokio -- --save-baseline pre-w8
+
+# Compare against it after
+cargo bench -p aimdb-bench --bench b1_b2_tokio -- --baseline pre-w8
+```
+
+Criterion writes HTML reports to `target/criterion/`.
+
+---
+
+## B0 — allocation gate
+
+`b0_alloc_tokio` does not use Criterion. It runs a fixed warmup + batch cycle and writes JSON results to `aimdb-bench/target/bench-results/b0_alloc_tokio.json` (the path is anchored to the crate dir, so it is the same regardless of the directory you run from).
+
+**Measurement model:**
+1. Create buffer + reader.
+2. Warmup ≥ 200 push → recv cycles (excluded from counters).
+3. Reset allocation counters.
+4. Run 512 push → recv cycles.
+5. Snapshot counters; divide by 512 for per-message figures.
+
+The committed baseline lives in `data/baselines/b0_alloc_tokio.json`. When a change intentionally improves or changes allocation behaviour, re-run the bench and commit the updated JSON with a clear rationale in the commit message.
+
+> **W8 result (design 037).** Since the zero-allocation consume path landed, the baseline records **0 allocs/msg** across all three tokio profiles (down from 1 — the boxed `recv()` future is gone). The committed baseline is therefore the target value; any nonzero B0 on these profiles is a regression to investigate.
+
+`b0_alloc_embassy` mirrors this against the Embassy buffer backend: it writes `aimdb-bench/target/bench-results/b0_alloc_embassy.json`, and its committed baseline is `data/baselines/b0_alloc_embassy.json` — also **0 allocs/msg** across all three profiles, confirming the Embassy `poll_recv` path is allocation-free on the host. The on-target B3 bench (`examples/embassy-bench-stm32h5`) re-checks the same 0-alloc claim against the real embedded allocator.
+
+> **Embassy priming.** Unlike Tokio's `broadcast`, an Embassy `SpmcRing` reader registers its embassy `Subscriber` *lazily, on first poll* — a message pushed before that first poll is missed, and the next `recv()` would block forever. The embassy benches call `profiles_embassy::prime()` on each reader before the first `push` to force registration (a no-op for Watch/Mailbox readers).
+
+**Noise reduction:** a `new_current_thread()` Tokio executor is used so there are no work-stealing threads and Tokio's scheduler does not allocate per-poll in the hot path.
+
+**Production isolation:** `#[global_allocator]` is a per-binary link-time declaration. `CountingAllocator` exists only in bench binaries. Nothing in the production dependency graph is affected.
+
+---
+
+## Informational pipeline benches
+
+`b_alloc_pipeline` and `b_runner_pipeline` exercise the same three profiles through a real `AimDbRunner` pipeline (`.source()` → buffer → `.tap()`). These include runner/stage machinery overhead on top of the buffer consume path.
+
+Use them as a comparison point, not a regression gate. If they regress, `b0_alloc_tokio` tells you whether the issue is in the consume path itself.
+
+---
+
+## Caveats
+
+- All benches are single-threaded: the Tokio benches use one current-thread Tokio executor and the Embassy benches use `futures::executor::block_on`. Results do not predict multi-threaded or work-stealing scheduler behavior.
+- B0 is a counter, not a memory profiler. It reports allocation count and byte total; not per-call precision or heap fragmentation.
+- B0's `bytes_per_msg` measures AimDB-added per-message heap allocations, not the message payload. Pre-W8 this was the `Box::pin` boxed `recv()` future (a single ~144 B type shared across all three buffer arms, hence identical byte counts); since design 037 / W8 the consume path is poll-based and this is **0 B/msg** on the clean path. A nonzero value flags a regression — e.g. the broadcast error path still allocates its `buffer_name` string, so a B0 run that triggers `BufferLagged`/`BufferClosed` will report > 0.
+- Criterion p99 can vary ±5–10% on noisy CI runners. Use p50 medians for trend comparisons.
+- `cargo bench` builds with the optimized `bench` profile by default. If you override it (`--profile`), use the same profile for every run you compare; optimizations differ by 5–50×.
+- `b_alloc_pipeline` uses a paced source: per-message pace tokens and notification channels. The coordination overhead is included in the measured window.
+
+
diff --git a/aimdb-bench/benches/b0_alloc_embassy.rs b/aimdb-bench/benches/b0_alloc_embassy.rs
new file mode 100644
index 00000000..99753585
--- /dev/null
+++ b/aimdb-bench/benches/b0_alloc_embassy.rs
@@ -0,0 +1,122 @@
+//! B0 — Allocation counting on the Embassy adapter (host-driven).
+//!
+//! The Embassy companion to [`b0_alloc_tokio`]. Measures per-message allocation
+//! cost for each workload profile against the **Embassy** buffer backend
+//! ([`EmbassyBuffer`]), driven on the host via `futures::executor::block_on`
+//! over embassy-sync's `poll_*` methods — no `embassy-runtime`, no cortex-m
+//! executor, no hardware. `poll_recv` drives those methods with no per-message
+//! future box, so the expected result is **0 allocs/msg**, same as the Tokio
+//! suite; the one-time `Box::new(reader)` and lazy subscriber registration
+//! happen during setup/warmup, before the counters are reset.
+//!
+//! **Measurement model** (identical to `b0_alloc_tokio`): create buffer +
+//! reader and **prime** it (forces lazy SpmcRing subscriber registration — see
+//! [`profiles_embassy`]), warm up `WARMUP_ITERS` cycles, `reset()`, run
+//! `BATCH_SIZE` cycles, then `snapshot()` and divide by `BATCH_SIZE`.
+//!
+//! Run `cargo bench -p aimdb-bench --bench b0_alloc_embassy`; results are
+//! written to `aimdb-bench/target/bench-results/b0_alloc_embassy.json` (anchored
+//! to the crate dir).
+
+// The Embassy adapter calls `defmt::*` unconditionally and links embassy-time;
+// on the host neither a logger nor a time driver exists. This expands no-op
+// stubs so the bench binary links. Must appear exactly once, at top level.
+aimdb_embassy_adapter::host_test_stubs!();
+
+use aimdb_bench::{
+    alloc::{reset, snapshot},
+    profiles::{command_msg, state_msg, telemetry_msg, BATCH_SIZE, WARMUP_ITERS},
+    profiles_embassy::{command_buffer, prime, state_buffer, telemetry_buffer},
+    reports::AllocReport,
+};
+use aimdb_core::buffer::{Buffer, Reader};
+use futures::executor::block_on;
+
+/// Runs the three profiles on the Embassy buffers, prints each report and writes them as JSON
+/// to `<crate>/target/bench-results/`; panics if serialization or a filesystem call fails.
+fn main() {
+    println!("=== B0 Allocation Benchmarks (Embassy adapter, buffer layer, host) ===");
+    println!("  Warmup iters : {WARMUP_ITERS}");
+    println!("  Batch size   : {BATCH_SIZE}");
+    println!();
+
+    // ── Telemetry: SpmcRing / PubSubChannel ──────────────────────────────────
+    // `prime()` is REQUIRED: the SpmcRing subscriber is created on the reader's first poll,
+    // so without it the first pushed message is missed and `recv()` blocks forever.
+    let telemetry_report = block_on(async {
+        let buf = telemetry_buffer();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+        prime(&mut reader);
+
+        for i in 0..WARMUP_ITERS {
+            buf.push(telemetry_msg(i as u64));
+            let _ = reader.recv().await; // whatever `recv()` yields is discarded
+        }
+
+        reset(); // counters restart here, so setup and warmup allocations are excluded
+        for i in 0..BATCH_SIZE {
+            buf.push(telemetry_msg((WARMUP_ITERS + i) as u64)); // index continues past warmup
+            let _ = reader.recv().await;
+        }
+        let (allocs, bytes) = snapshot(); // batch totals accumulated since `reset()`
+        AllocReport::new("Telemetry", "SpmcRing", BATCH_SIZE, allocs, bytes)
+    });
+    telemetry_report.print();
+
+    // ── State: SingleLatest / Watch ──────────────────────────────────────────
+    let state_report = block_on(async {
+        let buf = state_buffer();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+        prime(&mut reader); // NOTE(review): presumably a no-op for Watch readers — confirm
+
+        for i in 0..WARMUP_ITERS {
+            buf.push(state_msg(i as u64));
+            let _ = reader.recv().await;
+        }
+
+        reset(); // same warmup → reset → batch → snapshot sequence as the telemetry profile
+        for i in 0..BATCH_SIZE {
+            buf.push(state_msg((WARMUP_ITERS + i) as u64));
+            let _ = reader.recv().await;
+        }
+        let (allocs, bytes) = snapshot();
+        AllocReport::new("State", "SingleLatest", BATCH_SIZE, allocs, bytes)
+    });
+    state_report.print();
+
+    // ── Command: Mailbox / Channel(capacity=1) ───────────────────────────────
+    //
+    // Tight 1:1 push → recv loop matches Mailbox semantics. Do NOT batch pushes
+    // ahead of the consumer: the single slot overwrites earlier values.
+    let command_report = block_on(async {
+        let buf = command_buffer();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+        prime(&mut reader); // NOTE(review): presumably a no-op for Mailbox readers — confirm
+
+        for i in 0..WARMUP_ITERS {
+            buf.push(command_msg(i as u64));
+            let _ = reader.recv().await;
+        }
+
+        reset(); // same warmup → reset → batch → snapshot sequence as the telemetry profile
+        for i in 0..BATCH_SIZE {
+            buf.push(command_msg((WARMUP_ITERS + i) as u64));
+            let _ = reader.recv().await;
+        }
+        let (allocs, bytes) = snapshot();
+        AllocReport::new("Command", "Mailbox", BATCH_SIZE, allocs, bytes)
+    });
+    command_report.print();
+
+    println!();
+    println!("Expected: 0 allocs/msg — allocation-free consume path, same as the Tokio B0 suite.");
+
+    // Persist results for baseline comparison; the output dir is fixed at compile time.
+    let reports = vec![telemetry_report, state_report, command_report];
+    let json = serde_json::to_string_pretty(&reports).expect("failed to serialize reports");
+    let out_dir = concat!(env!("CARGO_MANIFEST_DIR"), "/target/bench-results");
+    std::fs::create_dir_all(out_dir).expect("failed to create results directory");
+    let out_path = format!("{out_dir}/b0_alloc_embassy.json");
+    std::fs::write(&out_path, &json).expect("failed to write results");
+    println!("\nResults written to {out_path}");
+}
diff --git a/aimdb-bench/benches/b0_alloc_tokio.rs b/aimdb-bench/benches/b0_alloc_tokio.rs
new file mode 100644
index 00000000..db152123
--- /dev/null
+++ b/aimdb-bench/benches/b0_alloc_tokio.rs
@@ -0,0 +1,117 @@
+//! B0 — Allocation counting on the Tokio adapter.
+//!
+//! Measures per-message allocation cost for each workload profile against
+//! `TokioBuffer<T>` directly (not the full `AimDb` stack). The consume path
+//! polls the reader without boxing a future per `recv()`, so the expected
+//! result is **0 allocs/msg**; a non-zero figure flags an allocation regression
+//! in the consume path.
+//!
+//! **Measurement model:** create buffer + reader, warm up `WARMUP_ITERS`
+//! push → recv cycles, `reset()` the counters, run `BATCH_SIZE` cycles, then
+//! `snapshot()` and divide by `BATCH_SIZE`. A current-thread Tokio runtime
+//! keeps scheduler allocation out of the hot path so the counter isolates
+//! AimDB's per-message contribution.
+//!
+//! Run `cargo bench -p aimdb-bench --bench b0_alloc_tokio`; results are written
+//! to `aimdb-bench/target/bench-results/b0_alloc_tokio.json` (anchored to the
+//! crate dir).
+
+use aimdb_bench::{
+    alloc::{reset, snapshot},
+    profiles::{
+        command_buffer, command_msg, state_buffer, state_msg, telemetry_buffer, telemetry_msg,
+        BATCH_SIZE, WARMUP_ITERS,
+    },
+    reports::AllocReport,
+};
+use aimdb_core::buffer::{Buffer, Reader};
+
+/// Runs the three profiles on the Tokio buffers, prints each report and writes them as JSON
+/// to `<crate>/target/bench-results/`; panics if the runtime, serialization or file I/O fails.
+fn main() {
+    // Current-thread executor — no work-stealing threads, clean allocation signal.
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .build() // no `enable_*` call on the builder: no I/O or time driver is requested
+        .expect("failed to build current-thread Tokio runtime");
+
+    println!("=== B0 Allocation Benchmarks (Tokio adapter, buffer layer) ===");
+    println!("  Warmup iters : {WARMUP_ITERS}");
+    println!("  Batch size   : {BATCH_SIZE}");
+    println!();
+
+    // ── Telemetry: SpmcRing / broadcast ─────────────────────────────────────
+    // Subscribe before pushing so the reader holds its read position from the
+    // start — a reader created after sends are in flight misses them.
+    let telemetry_report = rt.block_on(async {
+        let buf = telemetry_buffer();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+
+        for i in 0..WARMUP_ITERS {
+            buf.push(telemetry_msg(i as u64));
+            let _ = reader.recv().await; // whatever `recv()` yields is discarded
+        }
+
+        reset(); // counters restart here, so setup and warmup allocations are excluded
+        for i in 0..BATCH_SIZE {
+            buf.push(telemetry_msg((WARMUP_ITERS + i) as u64)); // index continues past warmup
+            let _ = reader.recv().await;
+        }
+        let (allocs, bytes) = snapshot(); // batch totals accumulated since `reset()`
+        AllocReport::new("Telemetry", "SpmcRing", BATCH_SIZE, allocs, bytes)
+    });
+    telemetry_report.print();
+
+    // ── State: SingleLatest / watch ──────────────────────────────────────────
+    let state_report = rt.block_on(async {
+        let buf = state_buffer();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+
+        for i in 0..WARMUP_ITERS {
+            buf.push(state_msg(i as u64));
+            let _ = reader.recv().await;
+        }
+
+        reset(); // same warmup → reset → batch → snapshot sequence as the telemetry profile
+        for i in 0..BATCH_SIZE {
+            buf.push(state_msg((WARMUP_ITERS + i) as u64));
+            let _ = reader.recv().await;
+        }
+        let (allocs, bytes) = snapshot();
+        AllocReport::new("State", "SingleLatest", BATCH_SIZE, allocs, bytes)
+    });
+    state_report.print();
+
+    // ── Command: Mailbox / Mutex slot + waker list ───────────────────────────
+    // Tight 1:1 push → recv loop. Do NOT batch pushes ahead of the consumer:
+    // the single slot overwrites earlier values, leaving only the last write.
+    let command_report = rt.block_on(async {
+        let buf = command_buffer();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+
+        for i in 0..WARMUP_ITERS {
+            buf.push(command_msg(i as u64));
+            let _ = reader.recv().await;
+        }
+
+        reset(); // same warmup → reset → batch → snapshot sequence as the telemetry profile
+        for i in 0..BATCH_SIZE {
+            buf.push(command_msg((WARMUP_ITERS + i) as u64));
+            let _ = reader.recv().await;
+        }
+        let (allocs, bytes) = snapshot();
+        AllocReport::new("Command", "Mailbox", BATCH_SIZE, allocs, bytes)
+    });
+    command_report.print();
+
+    println!();
+    println!("Expected: 0 allocs/msg — the consume path is allocation-free in steady state.");
+
+    // Persist results for baseline comparison; the output dir is fixed at compile time.
+    let reports = vec![telemetry_report, state_report, command_report];
+    let json = serde_json::to_string_pretty(&reports).expect("failed to serialize reports");
+    let out_dir = concat!(env!("CARGO_MANIFEST_DIR"), "/target/bench-results");
+    std::fs::create_dir_all(out_dir).expect("failed to create results directory");
+    let out_path = format!("{out_dir}/b0_alloc_tokio.json");
+    std::fs::write(&out_path, &json).expect("failed to write results");
+    println!("\nResults written to {out_path}");
+}
diff --git a/aimdb-bench/benches/b1_b2_embassy.rs b/aimdb-bench/benches/b1_b2_embassy.rs
new file mode 100644
index 00000000..fdb719fb
--- /dev/null
+++ b/aimdb-bench/benches/b1_b2_embassy.rs
@@ -0,0 +1,195 @@
+//! B1/B2 — Latency & throughput on the Embassy adapter (host-driven, Criterion).
+//!
+//! The Embassy companion to [`b1_b2_tokio`], capturing **both** measurement
+//! classes from one set of runs against the **Embassy** buffer backend, driven
+//! on the host via `futures::executor::block_on` — no `embassy-runtime`, no
+//! cortex-m executor, no hardware:
+//!
+//! - **B1 latency** — per-iteration time for one `buf.push(msg)` →
+//!   `reader.recv()` cycle (the `time` column).
+//! - **B2 throughput** — messages/second from that same timing via
+//!   `Throughput::Elements(1)` (the `thrpt` column).
+//!
+//! Covers SPSC (1 producer, 1 consumer) for all three profiles plus a 1→4
+//! telemetry fan-out. These are host wall-clock numbers for trend tracking and
+//! Tokio-vs-Embassy comparison; on-target cycle counts are covered by the B3
+//! STM32H5 bench (`examples/embassy-bench-stm32h5`).
+//!
+//! **Fan-out safety (SpmcRing / PubSubChannel):** all readers are **primed**
+//! before any push so each holds its read position from the start (the embassy
+//! `Subscriber` is otherwise created lazily on first poll and would miss earlier
+//! messages); `SUBS = 4` on
+//! [`TelemetryBuffer`](aimdb_bench::profiles_embassy::TelemetryBuffer) provides
+//! the four subscriber slots, and strict lockstep keeps every reader within the
+//! fixed `CAP`, so none can lag.
+//!
+//! **Mailbox:** tight 1:1 push → recv loop. Do NOT batch pushes ahead of the
+//! consumer — the single slot overwrites earlier values.
+//!
+//! Run:
+//! ```text
+//! cargo bench -p aimdb-bench --bench b1_b2_embassy
+//! cargo bench -p aimdb-bench --bench b1_b2_embassy -- --save-baseline main
+//! cargo bench -p aimdb-bench --bench b1_b2_embassy -- --baseline main
+//! ```
+
+aimdb_embassy_adapter::host_test_stubs!();
+
+use aimdb_bench::profiles::{command_msg, state_msg, telemetry_msg, WARMUP_ITERS};
+use aimdb_bench::profiles_embassy::{command_buffer, prime, state_buffer, telemetry_buffer};
+use aimdb_core::buffer::{Buffer, Reader};
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+use futures::executor::block_on;
+
+// ── Telemetry SPSC ────────────────────────────────────────────────────────────
+
+fn bench_b1_b2_telemetry_spsc(c: &mut Criterion) {
+    let mut suite = c.benchmark_group("B1-B2-Embassy");
+    suite.throughput(Throughput::Elements(1));
+
+    suite.bench_function("telemetry_spsc", |bencher| {
+        bencher.iter_custom(|iters| {
+            block_on(async {
+                let ring = telemetry_buffer();
+                let mut sub = Reader::new(Box::new(ring.subscribe()));
+                prime(&mut sub);
+
+                // Untimed warmup: one push followed by one recv per round.
+                for seq in 0..(WARMUP_ITERS as u64) {
+                    ring.push(telemetry_msg(seq));
+                    let _ = sub.recv().await;
+                }
+
+                let clock = std::time::Instant::now();
+                for offset in 0..iters {
+                    ring.push(telemetry_msg(offset + WARMUP_ITERS as u64));
+                    let _ = sub.recv().await;
+                }
+                clock.elapsed()
+            })
+        });
+    });
+
+    suite.finish();
+}
+
+// ── Telemetry 1→4 fan-out ────────────────────────────────────────────────────
+//
+// Each iteration: 1 push + recv on all 4 readers (see module fan-out rules).
+
+fn bench_b1_b2_telemetry_fanout(c: &mut Criterion) {
+    let mut suite = c.benchmark_group("B1-B2-Embassy");
+    // Throughput counts produced messages: one per iteration, read by four consumers.
+    suite.throughput(Throughput::Elements(1));
+
+    suite.bench_function("telemetry_fanout_1x4", |bencher| {
+        bencher.iter_custom(|iters| {
+            block_on(async {
+                let ring = telemetry_buffer();
+                let mut sub_a = Reader::new(Box::new(ring.subscribe()));
+                let mut sub_b = Reader::new(Box::new(ring.subscribe()));
+                let mut sub_c = Reader::new(Box::new(ring.subscribe()));
+                let mut sub_d = Reader::new(Box::new(ring.subscribe()));
+                // All four readers are primed ahead of the first push (registers 4 subscribers).
+                prime(&mut sub_a);
+                prime(&mut sub_b);
+                prime(&mut sub_c);
+                prime(&mut sub_d);
+
+                // Untimed warmup: one push, then one recv on every reader.
+                for seq in 0..(WARMUP_ITERS as u64) {
+                    ring.push(telemetry_msg(seq));
+                    let _ = sub_a.recv().await;
+                    let _ = sub_b.recv().await;
+                    let _ = sub_c.recv().await;
+                    let _ = sub_d.recv().await;
+                }
+
+                let clock = std::time::Instant::now();
+                for offset in 0..iters {
+                    ring.push(telemetry_msg(offset + WARMUP_ITERS as u64));
+                    let _ = sub_a.recv().await;
+                    let _ = sub_b.recv().await;
+                    let _ = sub_c.recv().await;
+                    let _ = sub_d.recv().await;
+                }
+                clock.elapsed()
+            })
+        });
+    });
+
+    suite.finish();
+}
+
+// ── State SPSC ────────────────────────────────────────────────────────────────
+
+fn bench_b1_b2_state_spsc(c: &mut Criterion) {
+    let mut suite = c.benchmark_group("B1-B2-Embassy");
+    suite.throughput(Throughput::Elements(1));
+
+    suite.bench_function("state_spsc", |bencher| {
+        bencher.iter_custom(|iters| {
+            block_on(async {
+                let slot = state_buffer();
+                let mut sub = Reader::new(Box::new(slot.subscribe()));
+                prime(&mut sub);
+
+                // Untimed warmup: one push followed by one recv per round.
+                for seq in 0..(WARMUP_ITERS as u64) {
+                    slot.push(state_msg(seq));
+                    let _ = sub.recv().await;
+                }
+
+                let clock = std::time::Instant::now();
+                for offset in 0..iters {
+                    slot.push(state_msg(offset + WARMUP_ITERS as u64));
+                    let _ = sub.recv().await;
+                }
+                clock.elapsed()
+            })
+        });
+    });
+
+    suite.finish();
+}
+
+// ── Command / Mailbox SPSC ────────────────────────────────────────────────────
+
+fn bench_b1_b2_command_mailbox(c: &mut Criterion) {
+    let mut suite = c.benchmark_group("B1-B2-Embassy");
+    suite.throughput(Throughput::Elements(1));
+
+    suite.bench_function("command_mailbox", |bencher| {
+        bencher.iter_custom(|iters| {
+            block_on(async {
+                let mailbox = command_buffer();
+                let mut sub = Reader::new(Box::new(mailbox.subscribe()));
+                prime(&mut sub);
+
+                // Untimed warmup: strict 1:1 push then recv, never batched.
+                for seq in 0..(WARMUP_ITERS as u64) {
+                    mailbox.push(command_msg(seq));
+                    let _ = sub.recv().await;
+                }
+
+                let clock = std::time::Instant::now();
+                for offset in 0..iters {
+                    mailbox.push(command_msg(offset + WARMUP_ITERS as u64));
+                    let _ = sub.recv().await;
+                }
+                clock.elapsed()
+            })
+        });
+    });
+
+    suite.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_b1_b2_telemetry_spsc,
+    bench_b1_b2_telemetry_fanout,
+    bench_b1_b2_state_spsc,
+    bench_b1_b2_command_mailbox,
+);
+criterion_main!(benches);
diff --git a/aimdb-bench/benches/b1_b2_tokio.rs b/aimdb-bench/benches/b1_b2_tokio.rs
new file mode 100644
index 00000000..87212e38
--- /dev/null
+++ b/aimdb-bench/benches/b1_b2_tokio.rs
@@ -0,0 +1,198 @@
+//! B1/B2 — Latency & throughput benchmarks (Criterion).
+//!
+//! One Criterion suite capturing **both** measurement classes from a single set
+//! of runs, using `TokioBuffer<T>` directly to isolate the buffer layer from
+//! `AimDb` initialization overhead:
+//!
+//! - **B1 latency** — per-iteration time for one `buf.push(msg)` →
+//!   `reader.recv()` cycle (the `time` column).
+//! - **B2 throughput** — messages/second from that same timing via
+//!   `Throughput::Elements(1)` (the `thrpt` column).
+//!
+//! Covers SPSC (1 producer, 1 consumer) for all three profiles plus a 1→4
+//! telemetry fan-out, on a current-thread Tokio runtime (same as B0).
+//!
+//! **Fan-out safety (SpmcRing / broadcast):** all readers subscribe *before*
+//! any push so each holds its read position from the start, and the loop is
+//! strict lockstep (1 push, then `recv` on every reader) so at most one message
+//! is in flight — no reader can lag past `TELEMETRY_CAPACITY`.
+//!
+//! **Mailbox:** tight 1:1 push → recv loop. Do NOT batch pushes ahead of the
+//! consumer — the single slot overwrites earlier values, leaving only the last.
+//!
+//! Run:
+//! ```text
+//! cargo bench -p aimdb-bench --bench b1_b2_tokio
+//! cargo bench -p aimdb-bench --bench b1_b2_tokio -- --save-baseline main
+//! cargo bench -p aimdb-bench --bench b1_b2_tokio -- --baseline main
+//! ```
+
+use aimdb_bench::profiles::{
+    command_buffer, command_msg, state_buffer, state_msg, telemetry_buffer, telemetry_msg,
+    WARMUP_ITERS,
+};
+use aimdb_core::buffer::{Buffer, Reader};
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+
+// ── Telemetry SPSC ────────────────────────────────────────────────────────────
+
+fn bench_b1_b2_telemetry_spsc(c: &mut Criterion) {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut suite = c.benchmark_group("B1-B2");
+    suite.throughput(Throughput::Elements(1));
+
+    suite.bench_function("telemetry_spsc", |bencher| {
+        bencher.iter_custom(|iters| {
+            runtime.block_on(async {
+                // Subscribe first so the reader holds its position before any push.
+                let ring = telemetry_buffer();
+                let mut sub = Reader::new(Box::new(ring.subscribe()));
+
+                // Untimed warmup: one push followed by one recv per round.
+                for seq in 0..(WARMUP_ITERS as u64) {
+                    ring.push(telemetry_msg(seq));
+                    let _ = sub.recv().await;
+                }
+
+                let clock = std::time::Instant::now();
+                for offset in 0..iters {
+                    ring.push(telemetry_msg(offset + WARMUP_ITERS as u64));
+                    let _ = sub.recv().await;
+                }
+                clock.elapsed()
+            })
+        });
+    });
+
+    suite.finish();
+}
+
+// ── Telemetry 1→4 fan-out ────────────────────────────────────────────────────
+//
+// Each iteration: 1 push + recv on all 4 readers (see module fan-out rules).
+
+fn bench_b1_b2_telemetry_fanout(c: &mut Criterion) {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut suite = c.benchmark_group("B1-B2");
+    // Throughput counts produced messages: one per iteration, read by four consumers.
+    suite.throughput(Throughput::Elements(1));
+
+    suite.bench_function("telemetry_fanout_1x4", |bencher| {
+        bencher.iter_custom(|iters| {
+            runtime.block_on(async {
+                // All four readers subscribe ahead of the first push (see module docs).
+                let ring = telemetry_buffer();
+                let mut sub_a = Reader::new(Box::new(ring.subscribe()));
+                let mut sub_b = Reader::new(Box::new(ring.subscribe()));
+                let mut sub_c = Reader::new(Box::new(ring.subscribe()));
+                let mut sub_d = Reader::new(Box::new(ring.subscribe()));
+
+                // Warmup — not timed, same as the SPSC benches in this suite.
+                for seq in 0..(WARMUP_ITERS as u64) {
+                    ring.push(telemetry_msg(seq));
+                    let _ = sub_a.recv().await;
+                    let _ = sub_b.recv().await;
+                    let _ = sub_c.recv().await;
+                    let _ = sub_d.recv().await;
+                }
+
+                let clock = std::time::Instant::now();
+                for offset in 0..iters {
+                    ring.push(telemetry_msg(offset + WARMUP_ITERS as u64));
+                    let _ = sub_a.recv().await;
+                    let _ = sub_b.recv().await;
+                    let _ = sub_c.recv().await;
+                    let _ = sub_d.recv().await;
+                }
+                clock.elapsed()
+            })
+        });
+    });
+
+    suite.finish();
+}
+
+// ── State SPSC ────────────────────────────────────────────────────────────────
+
+fn bench_b1_b2_state_spsc(c: &mut Criterion) {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut suite = c.benchmark_group("B1-B2");
+    suite.throughput(Throughput::Elements(1));
+
+    suite.bench_function("state_spsc", |bencher| {
+        bencher.iter_custom(|iters| {
+            runtime.block_on(async {
+                let slot = state_buffer();
+                let mut sub = Reader::new(Box::new(slot.subscribe()));
+
+                // Untimed warmup: one push followed by one recv per round.
+                for seq in 0..(WARMUP_ITERS as u64) {
+                    slot.push(state_msg(seq));
+                    let _ = sub.recv().await;
+                }
+
+                let clock = std::time::Instant::now();
+                for offset in 0..iters {
+                    slot.push(state_msg(offset + WARMUP_ITERS as u64));
+                    let _ = sub.recv().await;
+                }
+                clock.elapsed()
+            })
+        });
+    });
+
+    suite.finish();
+}
+
+// ── Command / Mailbox SPSC ────────────────────────────────────────────────────
+
+fn bench_b1_b2_command_mailbox(c: &mut Criterion) {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut suite = c.benchmark_group("B1-B2");
+    suite.throughput(Throughput::Elements(1));
+
+    suite.bench_function("command_mailbox", |bencher| {
+        bencher.iter_custom(|iters| {
+            runtime.block_on(async {
+                let mailbox = command_buffer();
+                let mut sub = Reader::new(Box::new(mailbox.subscribe()));
+
+                // Untimed warmup: strict 1:1 push then recv, never batched.
+                for seq in 0..(WARMUP_ITERS as u64) {
+                    mailbox.push(command_msg(seq));
+                    let _ = sub.recv().await;
+                }
+
+                let clock = std::time::Instant::now();
+                for offset in 0..iters {
+                    mailbox.push(command_msg(offset + WARMUP_ITERS as u64));
+                    let _ = sub.recv().await;
+                }
+                clock.elapsed()
+            })
+        });
+    });
+
+    suite.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_b1_b2_telemetry_spsc,
+    bench_b1_b2_telemetry_fanout,
+    bench_b1_b2_state_spsc,
+    bench_b1_b2_command_mailbox,
+);
+criterion_main!(benches);
diff --git a/aimdb-bench/benches/b_alloc_pipeline.rs b/aimdb-bench/benches/b_alloc_pipeline.rs
new file mode 100644
index 00000000..748fe2c3
--- /dev/null
+++ b/aimdb-bench/benches/b_alloc_pipeline.rs
@@ -0,0 +1,266 @@
+//! B0-Pipeline — Allocation counting for a live runner-driven pipeline.
+//!
+//! Measures per-message allocation cost for a real `.source()` -> buffer ->
+//! `.tap()` pipeline driven by `AimDbRunner` — an integration-layer measurement
+//! that includes runner/stage machinery on top of the buffer consume path. The
+//! source generates each batch internally after a single start notification and
+//! the tap signals completion once the batch is consumed, so the measured window
+//! carries no per-message ingress or ack traffic.
+//!
+//! Treat this as an informational companion to the raw-buffer B0 gate in
+//! `b0_alloc_tokio`: if it regresses, that gate still isolates whether the
+//! consume path itself is at fault.
+//!
+//! Run `cargo bench -p aimdb-bench --bench b_alloc_pipeline`; results are written
+//! to `aimdb-bench/target/bench-results/b_alloc_pipeline.json` (anchored to the
+//! crate dir).
+//!
+//! **Executor dependency.** The source/tap pacing uses check-then-await (load an
+//! atomic, `.notified().await` only when there is no work). `notify_waiters()`
+//! stores no permit, so this avoids lost wakeups only because the bench runs on
+//! a **current-thread** runtime: nothing preempts between the load and the
+//! `.await`. Do not port to a multi-threaded executor without revisiting it.
+
+use std::fmt::Debug;
+use std::sync::{
+    atomic::{AtomicU64, AtomicUsize, Ordering},
+    Arc,
+};
+
+use aimdb_bench::{
+    alloc::{reset, snapshot},
+    profiles::{
+        command_msg, state_msg, telemetry_msg, CommandMsg, StateMsg, TelemetryMsg, BATCH_SIZE,
+        TELEMETRY_CAPACITY, WARMUP_ITERS,
+    },
+    reports::AllocReport,
+};
+use aimdb_core::{buffer::BufferCfg, AimDb, AimDbBuilder};
+use aimdb_tokio_adapter::{TokioAdapter, TokioRecordRegistrarExt};
+use tokio::sync::{oneshot, Notify};
+
+struct BatchState {
+    start_epoch: AtomicU64, // epoch of the latest batch announced by `run_batch`
+    completed_epoch: AtomicU64, // epoch the tap last reported as fully consumed
+    batch_size: AtomicUsize, // number of messages in the current batch
+    target_epoch: AtomicU64, // epoch the tap copies into `completed_epoch` when done
+    consumed_in_epoch: AtomicUsize, // values the tap has counted since `run_batch` reset it
+    start_notify: Notify, // signalled by `run_batch`; the source waits on it for a new epoch
+    pace_tokens: AtomicUsize, // permits the source spends per produce; refilled by the tap
+    pace_notify: Notify, // signalled when a pace token is made available
+    done_notify: Notify, // signalled by the tap when the batch completes
+}
+
+impl BatchState {
+    fn new() -> Self {
+        BatchState {
+            batch_size: AtomicUsize::new(0),
+            consumed_in_epoch: AtomicUsize::new(0),
+            pace_tokens: AtomicUsize::new(0),
+            start_epoch: AtomicU64::new(0),
+            target_epoch: AtomicU64::new(0),
+            completed_epoch: AtomicU64::new(0),
+            start_notify: Notify::new(),
+            pace_notify: Notify::new(),
+            done_notify: Notify::new(),
+        }
+    }
+}
+
+struct PipelineHarness {
+    _db: AimDb, // not read after construction; presumably held to keep the pipeline alive
+    state: Arc<BatchState>, // batch coordination shared with the source and tap stages
+}
+
+impl PipelineHarness {
+    async fn warmup(&self, batch_size: usize) {
+        self.run_batch(batch_size).await
+    }
+
+    async fn measure(&self, batch_size: usize) {
+        self.run_batch(batch_size).await
+    }
+
+    async fn run_batch(&self, batch_size: usize) {
+        let shared = &*self.state;
+        let next_epoch = shared.start_epoch.load(Ordering::Acquire) + 1;
+        shared.batch_size.store(batch_size, Ordering::Release);
+        shared.consumed_in_epoch.store(0, Ordering::Release);
+        shared.target_epoch.store(next_epoch, Ordering::Release);
+        shared.pace_tokens.store(1, Ordering::Release);
+        shared.start_epoch.store(next_epoch, Ordering::Release);
+        shared.start_notify.notify_waiters();
+        shared.pace_notify.notify_waiters();
+        while shared.completed_epoch.load(Ordering::Acquire) < next_epoch {
+            shared.done_notify.notified().await;
+        }
+    }
+}
+
+async fn build_pipeline_harness<T, Make>(
+    key: &'static str,
+    cfg: BufferCfg,
+    make_value: Make,
+) -> PipelineHarness
+where
+    T: Send + Sync + Clone + Debug + 'static,
+    Make: Fn(u64) -> T + Send + Sync + Clone + 'static,
+{
+    let state = Arc::new(BatchState::new());
+    let (tap_ready_tx, tap_ready_rx) = oneshot::channel::<()>();
+
+    let adapter = Arc::new(TokioAdapter);
+    let mut builder = AimDbBuilder::new().runtime(adapter);
+    builder.configure::<T>(key, {
+        let state = Arc::clone(&state);
+        move |reg| {
+            let source_state = Arc::clone(&state);
+            let tap_state = Arc::clone(&state);
+            let mut tap_ready_tx = Some(tap_ready_tx);
+            let make_value = make_value.clone();
+
+            reg.buffer(cfg)
+                .source(move |_ctx, producer| async move {
+                    let mut next_value_index = 0u64;
+                    let mut seen_epoch = 0u64;
+                    loop {
+                        while source_state.start_epoch.load(Ordering::Acquire) == seen_epoch {
+                            source_state.start_notify.notified().await;
+                        }
+                        // New batch announced: latch its epoch and size.
+                        seen_epoch = source_state.start_epoch.load(Ordering::Acquire);
+                        let batch_size = source_state.batch_size.load(Ordering::Acquire);
+                        for _ in 0..batch_size {
+                            loop {
+                                let available = source_state.pace_tokens.load(Ordering::Acquire);
+                                if available > 0 {
+                                    if source_state
+                                        .pace_tokens
+                                        .compare_exchange(
+                                            available,
+                                            available - 1,
+                                            Ordering::AcqRel,
+                                            Ordering::Acquire,
+                                        )
+                                        .is_ok()
+                                    {
+                                        break;
+                                    }
+                                } else {
+                                    source_state.pace_notify.notified().await;
+                                }
+                            }
+                            producer.produce(make_value(next_value_index));
+                            next_value_index += 1;
+                        }
+                    }
+                })
+                .with_name("bench_source")
+                .tap(move |_ctx, consumer| async move {
+                    let mut reader = consumer.subscribe();
+                    tap_ready_tx
+                        .take()
+                        .expect("tap readiness sender already used")
+                        .send(())
+                        .expect("failed to signal tap readiness");
+                    // Per value: count it, then refill a pace token or report the batch done.
+                    while let Ok(_value) = reader.recv().await {
+                        let current_epoch = tap_state.target_epoch.load(Ordering::Acquire);
+                        let seen = tap_state.consumed_in_epoch.fetch_add(1, Ordering::AcqRel) + 1;
+                        let batch_size = tap_state.batch_size.load(Ordering::Acquire);
+                        if seen < batch_size {
+                            tap_state.pace_tokens.fetch_add(1, Ordering::AcqRel);
+                            tap_state.pace_notify.notify_waiters();
+                        }
+                        if seen == batch_size {
+                            tap_state
+                                .completed_epoch
+                                .store(current_epoch, Ordering::Release);
+                            tap_state.done_notify.notify_waiters();
+                        }
+                    }
+                })
+                .with_name("bench_tap");
+        }
+    });
+    // Build the database, spawn its runner, then wait until the tap has subscribed.
+    let (db, runner) = builder
+        .build()
+        .await
+        .expect("alloc pipeline bench build failed");
+    tokio::spawn(runner.run());
+    tap_ready_rx
+        .await
+        .expect("pipeline tap exited before signalling readiness");
+
+    PipelineHarness { _db: db, state }
+}
+
+fn main() {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("failed to build current-thread Tokio runtime");
+
+    println!("=== B0 Allocation Benchmarks (Runner pipeline) ===");
+    println!("  Warmup batch : {WARMUP_ITERS}");
+    println!("  Measured batch: {BATCH_SIZE}");
+    println!();
+
+    let telemetry_report = runtime.block_on(async {
+        let pipeline = build_pipeline_harness::<TelemetryMsg, _>(
+            "bench::telemetry",
+            BufferCfg::SpmcRing {
+                capacity: TELEMETRY_CAPACITY,
+            },
+            telemetry_msg,
+        )
+        .await;
+        // Warm up first; only the measured batch falls between reset() and snapshot().
+        pipeline.warmup(WARMUP_ITERS).await;
+        reset();
+        pipeline.measure(BATCH_SIZE).await;
+        let (alloc_count, byte_count) = snapshot();
+        AllocReport::new("Telemetry", "SpmcRing", BATCH_SIZE, alloc_count, byte_count)
+    });
+    telemetry_report.print();
+
+    let state_report = runtime.block_on(async {
+        let pipeline = build_pipeline_harness::<StateMsg, _>(
+            "bench::state",
+            BufferCfg::SingleLatest,
+            state_msg,
+        )
+        .await;
+        // Warm up first; only the measured batch falls between reset() and snapshot().
+        pipeline.warmup(WARMUP_ITERS).await;
+        reset();
+        pipeline.measure(BATCH_SIZE).await;
+        let (alloc_count, byte_count) = snapshot();
+        AllocReport::new("State", "SingleLatest", BATCH_SIZE, alloc_count, byte_count)
+    });
+    state_report.print();
+
+    let command_report = runtime.block_on(async {
+        let pipeline = build_pipeline_harness::<CommandMsg, _>(
+            "bench::command",
+            BufferCfg::Mailbox,
+            command_msg,
+        )
+        .await;
+        // Warm up first; only the measured batch falls between reset() and snapshot().
+        pipeline.warmup(WARMUP_ITERS).await;
+        reset();
+        pipeline.measure(BATCH_SIZE).await;
+        let (alloc_count, byte_count) = snapshot();
+        AllocReport::new("Command", "Mailbox", BATCH_SIZE, alloc_count, byte_count)
+    });
+    command_report.print();
+
+    let all_reports = vec![telemetry_report, state_report, command_report];
+    let encoded = serde_json::to_string_pretty(&all_reports).expect("failed to serialize reports");
+    let out_dir = concat!(env!("CARGO_MANIFEST_DIR"), "/target/bench-results");
+    std::fs::create_dir_all(out_dir).expect("failed to create results directory");
+    let out_path = format!("{out_dir}/b_alloc_pipeline.json");
+    std::fs::write(&out_path, &encoded).expect("failed to write results");
+    println!("\nResults written to {out_path}");
+}
diff --git a/aimdb-bench/benches/b_runner_pipeline.rs b/aimdb-bench/benches/b_runner_pipeline.rs
new file mode 100644
index 00000000..9ff1e79f
--- /dev/null
+++ b/aimdb-bench/benches/b_runner_pipeline.rs
@@ -0,0 +1,216 @@
+//! B-Runner-Pipeline — Runner-driven in-process pipeline throughput.
+//!
+//! Exercises the same three profiles as B0/B1/B2 through a real `AimDbRunner`
+//! path (`.source()` -> buffer -> `.tap()`), so it measures stage wakeups and
+//! the runner-driven producer/consumer pipeline rather than direct
+//! `Producer<T>` / `Consumer<T>` calls. It is in-process only — no outbound
+//! connectors, serialization, transport, or kernel I/O — and the timing window
+//! includes the handshakes that feed the source stage and observe completion at
+//! the tap stage.
+//!
+//! One `AimDb` is built per benchmark function with its runner spawned once;
+//! Criterion samples push work into the source via an ingress channel and wait
+//! for the tap's completion signals.
+//!
+//! Run `cargo bench -p aimdb-bench --bench b_runner_pipeline`.
+
+use std::fmt::Debug;
+use std::sync::Arc;
+
+use aimdb_bench::profiles::{
+    command_msg, state_msg, telemetry_msg, CommandMsg, StateMsg, TelemetryMsg, TELEMETRY_CAPACITY,
+    WARMUP_ITERS,
+};
+use aimdb_core::{buffer::BufferCfg, AimDb, AimDbBuilder};
+use aimdb_tokio_adapter::{TokioAdapter, TokioRecordRegistrarExt};
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+use tokio::sync::{
+    mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender},
+    oneshot,
+};
+
+struct RunnerHarness<T> {
+    _db: AimDb, // not read after construction; presumably held to keep the pipeline alive
+    input_tx: UnboundedSender<T>, // ingress: values the source stage forwards to the buffer
+    ack_rx: UnboundedReceiver<()>, // one `()` per value the tap stage receives
+}
+
+impl<T> RunnerHarness<T> {
+    /// Sends `value` to the source stage, then waits for one tap acknowledgement.
+    ///
+    /// Panics if the input channel or the acknowledgement channel has closed.
+    async fn round_trip(&mut self, value: T) {
+        let sent = self.input_tx.send(value);
+        sent.expect("source input channel closed unexpectedly");
+        let ack = self.ack_rx.recv().await;
+        ack.expect("tap acknowledgement channel closed unexpectedly");
+    }
+}
+
+async fn build_runner_harness<T>(key: &'static str, cfg: BufferCfg) -> RunnerHarness<T>
+where
+    T: Send + Sync + Clone + Debug + 'static,
+{
+    let (input_tx, mut input_rx) = unbounded_channel::<T>();
+    let (ack_tx, ack_rx) = unbounded_channel::<()>();
+    let (tap_ready_tx, tap_ready_rx) = oneshot::channel::<()>();
+
+    let adapter = Arc::new(TokioAdapter);
+    let mut builder = AimDbBuilder::new().runtime(adapter);
+    builder.configure::<T>(key, move |reg| {
+        let ack_tx = ack_tx.clone();
+        let mut tap_ready_tx = Some(tap_ready_tx);
+        // Source: forward each ingress value into the buffer. Tap: ack each value received.
+        reg.buffer(cfg)
+            .source(move |_ctx, producer| async move {
+                while let Some(value) = input_rx.recv().await {
+                    producer.produce(value);
+                }
+            })
+            .with_name("bench_source")
+            .tap(move |_ctx, consumer| async move {
+                let mut reader = consumer.subscribe();
+                tap_ready_tx
+                    .take()
+                    .expect("tap readiness sender already used")
+                    .send(())
+                    .expect("failed to signal tap readiness");
+                while let Ok(_value) = reader.recv().await {
+                    ack_tx
+                        .send(())
+                        .expect("bench tap ack channel closed unexpectedly");
+                }
+            })
+            .with_name("bench_tap");
+    });
+    // Build the database, spawn its runner, then wait until the tap has subscribed.
+    let (db, runner) = builder.build().await.expect("runner bench build failed");
+    tokio::spawn(runner.run());
+    tap_ready_rx
+        .await
+        .expect("runner tap exited before signalling readiness");
+
+    RunnerHarness {
+        _db: db,
+        input_tx,
+        ack_rx,
+    }
+}
+
+// ── Telemetry E2E ─────────────────────────────────────────────────────────────
+
+fn bench_e2e_telemetry(c: &mut Criterion) {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut pipeline = runtime.block_on(build_runner_harness::<TelemetryMsg>(
+        "bench::telemetry",
+        BufferCfg::SpmcRing {
+            capacity: TELEMETRY_CAPACITY,
+        },
+    ));
+
+    let mut suite = c.benchmark_group("B-Runner-Pipeline");
+    suite.throughput(Throughput::Elements(1));
+
+    suite.bench_function("telemetry_spsc", |bencher| {
+        bencher.iter_custom(|iters| {
+            runtime.block_on(async {
+                // Untimed warmup round trips.
+                for seq in 0..(WARMUP_ITERS as u64) {
+                    pipeline.round_trip(telemetry_msg(seq)).await;
+                }
+
+                let clock = std::time::Instant::now();
+                for offset in 0..iters {
+                    let msg = telemetry_msg(offset + WARMUP_ITERS as u64);
+                    pipeline.round_trip(msg).await;
+                }
+                clock.elapsed()
+            })
+        });
+    });
+
+    suite.finish();
+}
+
+// ── State E2E ─────────────────────────────────────────────────────────────────
+
+fn bench_e2e_state(c: &mut Criterion) {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut pipeline = runtime.block_on(build_runner_harness::<StateMsg>(
+        "bench::state",
+        BufferCfg::SingleLatest,
+    ));
+
+    let mut suite = c.benchmark_group("B-Runner-Pipeline");
+    suite.throughput(Throughput::Elements(1));
+
+    suite.bench_function("state_spsc", |bencher| {
+        bencher.iter_custom(|iters| {
+            runtime.block_on(async {
+                // Untimed warmup round trips.
+                for seq in 0..(WARMUP_ITERS as u64) {
+                    pipeline.round_trip(state_msg(seq)).await;
+                }
+
+                let clock = std::time::Instant::now();
+                for offset in 0..iters {
+                    let msg = state_msg(offset + WARMUP_ITERS as u64);
+                    pipeline.round_trip(msg).await;
+                }
+                clock.elapsed()
+            })
+        });
+    });
+
+    suite.finish();
+}
+
+// ── Command E2E ───────────────────────────────────────────────────────────────
+
+fn bench_e2e_command(c: &mut Criterion) {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut pipeline = runtime.block_on(build_runner_harness::<CommandMsg>(
+        "bench::command",
+        BufferCfg::Mailbox,
+    ));
+
+    let mut suite = c.benchmark_group("B-Runner-Pipeline");
+    suite.throughput(Throughput::Elements(1));
+
+    suite.bench_function("command_mailbox", |bencher| {
+        bencher.iter_custom(|iters| {
+            runtime.block_on(async {
+                // Untimed warmup round trips.
+                for seq in 0..(WARMUP_ITERS as u64) {
+                    pipeline.round_trip(command_msg(seq)).await;
+                }
+
+                let clock = std::time::Instant::now();
+                for offset in 0..iters {
+                    let msg = command_msg(offset + WARMUP_ITERS as u64);
+                    pipeline.round_trip(msg).await;
+                }
+                clock.elapsed()
+            })
+        });
+    });
+
+    suite.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_e2e_telemetry,
+    bench_e2e_state,
+    bench_e2e_command,
+);
+criterion_main!(benches);
diff --git a/aimdb-bench/data/baselines/b0_alloc_embassy.json b/aimdb-bench/data/baselines/b0_alloc_embassy.json
new file mode 100644
index 00000000..8ee6134c
--- /dev/null
+++ b/aimdb-bench/data/baselines/b0_alloc_embassy.json
@@ -0,0 +1,29 @@
+[
+  {
+    "profile": "Telemetry",
+    "buffer_type": "SpmcRing",
+    "total_allocs": 0,
+    "total_bytes": 0,
+    "batch_size": 512,
+    "allocs_per_msg": 0.0,
+    "bytes_per_msg": 0.0
+  },
+  {
+    "profile": "State",
+    "buffer_type": "SingleLatest",
+    "total_allocs": 0,
+    "total_bytes": 0,
+    "batch_size": 512,
+    "allocs_per_msg": 0.0,
+    "bytes_per_msg": 0.0
+  },
+  {
+    "profile": "Command",
+    "buffer_type": "Mailbox",
+    "total_allocs": 0,
+    "total_bytes": 0,
+    "batch_size": 512,
+    "allocs_per_msg": 0.0,
+    "bytes_per_msg": 0.0
+  }
+]
\ No newline at end of file
diff --git a/aimdb-bench/data/baselines/b0_alloc_tokio.json b/aimdb-bench/data/baselines/b0_alloc_tokio.json
new file mode 100644
index 00000000..51fd3d96
--- /dev/null
+++ b/aimdb-bench/data/baselines/b0_alloc_tokio.json
@@ -0,0 +1,29 @@
+[
+  {
+    "profile": "Telemetry",
+    "buffer_type": "SpmcRing",
+    "total_allocs": 0,
+    "total_bytes": 0,
+    "batch_size": 512,
+    "allocs_per_msg": 0.0,
+    "bytes_per_msg": 0.0
+  },
+  {
+    "profile": "State",
+    "buffer_type": "SingleLatest",
+    "total_allocs": 0,
+    "total_bytes": 0,
+    "batch_size": 512,
+    "allocs_per_msg": 0.0,
+    "bytes_per_msg": 0.0
+  },
+  {
+    "profile": "Command",
+    "buffer_type": "Mailbox",
+    "total_allocs": 0,
+    "total_bytes": 0,
+    "batch_size": 512,
+    "allocs_per_msg": 0.0,
+    "bytes_per_msg": 0.0
+  }
+]
diff --git a/aimdb-bench/data/baselines/b3_cycles_stm32h5.json b/aimdb-bench/data/baselines/b3_cycles_stm32h5.json
new file mode 100644
index 00000000..4ff0dfef
--- /dev/null
+++ b/aimdb-bench/data/baselines/b3_cycles_stm32h5.json
@@ -0,0 +1,45 @@
+{
+  "bench": "b3_cycles_embassy_stm32h5",
+  "description": "On-target DWT CYCCNT cycles/msg + allocs/msg for the Embassy buffer consume path. Produced by examples/embassy-bench-stm32h5 (cannot run on a host).",
+  "target": "STM32H563ZI (Cortex-M33)",
+  "clock_hz": 250000000,
+  "build_profile": "release",
+  "warmup": 200,
+  "batch": 512,
+  "captured": "2026-06-21",
+  "note": "First capture on a Nucleo-H563ZI rig (release). Single run; re-run 2-3x and update if a stable average differs.",
+  "results": [
+    {
+      "profile": "Telemetry",
+      "buffer_type": "SpmcRing",
+      "deliveries_per_msg": 1,
+      "cycles_per_msg": 2013,
+      "total_cycles": 1030839,
+      "allocs_per_msg": 0
+    },
+    {
+      "profile": "State",
+      "buffer_type": "SingleLatest",
+      "deliveries_per_msg": 1,
+      "cycles_per_msg": 2009,
+      "total_cycles": 1029028,
+      "allocs_per_msg": 0
+    },
+    {
+      "profile": "Command",
+      "buffer_type": "Mailbox",
+      "deliveries_per_msg": 1,
+      "cycles_per_msg": 1661,
+      "total_cycles": 850440,
+      "allocs_per_msg": 0
+    },
+    {
+      "profile": "Telemetry",
+      "buffer_type": "SpmcRing(1->4)",
+      "deliveries_per_msg": 4,
+      "cycles_per_msg": 6239,
+      "total_cycles": 3194799,
+      "allocs_per_msg": 0
+    }
+  ]
+}
diff --git a/aimdb-bench/src/alloc.rs b/aimdb-bench/src/alloc.rs
new file mode 100644
index 00000000..6de1753c
--- /dev/null
+++ b/aimdb-bench/src/alloc.rs
@@ -0,0 +1,64 @@
+//! Allocation counting for B0 benchmarks.
+//!
+//! Wraps an inner `GlobalAlloc` with atomic counters to measure per-message
+//! allocation overhead. `#[global_allocator]` is a per-binary, link-time
+//! declaration, so `CountingAllocator` affects only the bench binaries and has
+//! zero impact on production crates. It is generic over the inner allocator so
+//! `System` can be swapped out (e.g. for `embedded-alloc`); the module itself uses `std`.
+
+use std::alloc::{GlobalAlloc, Layout, System};
+use std::sync::atomic::{AtomicU64, Ordering};
+
+/// Total allocation call count (since last [`reset`]).
+pub static ALLOC_COUNT: AtomicU64 = AtomicU64::new(0);
+
+/// Total bytes allocated (since last [`reset`]).
+pub static ALLOC_BYTES: AtomicU64 = AtomicU64::new(0);
+
+/// Wraps an inner `GlobalAlloc`, incrementing [`ALLOC_COUNT`] and
+/// [`ALLOC_BYTES`] on every allocation.
+pub struct CountingAllocator<A>(pub A);
+
+// SAFETY: we delegate every call to the inner allocator unchanged;
+// the only side-effect is the atomic counter updates.
+unsafe impl<A: GlobalAlloc> GlobalAlloc for CountingAllocator<A> {
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
+        ALLOC_BYTES.fetch_add(layout.size() as u64, Ordering::Relaxed);
+        // SAFETY: caller guarantees `layout` is valid; delegated to inner.
+        unsafe { self.0.alloc(layout) }
+    }
+
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        // SAFETY: caller guarantees `ptr` was allocated by this allocator.
+        unsafe { self.0.dealloc(ptr, layout) }
+    }
+}
+
+/// The global allocator used by all bench binaries.
+///
+/// Applies to every bench binary that links `aimdb-bench` — not to any
+/// production crate.
+#[global_allocator]
+static GLOBAL: CountingAllocator<System> = CountingAllocator(System);
+
+/// Reset both counters to zero.
+///
+/// Call once after the warmup phase, immediately before the measured window.
+#[inline]
+pub fn reset() {
+    ALLOC_COUNT.store(0, Ordering::Relaxed);
+    ALLOC_BYTES.store(0, Ordering::Relaxed);
+}
+
+/// Snapshot the current counters.
+///
+/// Returns `(count, bytes)` — total allocations and total bytes since the
+/// last [`reset`].
+#[inline]
+pub fn snapshot() -> (u64, u64) {
+    (
+        ALLOC_COUNT.load(Ordering::Relaxed),
+        ALLOC_BYTES.load(Ordering::Relaxed),
+    )
+}
diff --git a/aimdb-bench/src/lib.rs b/aimdb-bench/src/lib.rs
new file mode 100644
index 00000000..ae9500d6
--- /dev/null
+++ b/aimdb-bench/src/lib.rs
@@ -0,0 +1,27 @@
+//! AimDB benchmarking infrastructure. **Not for production use.**
+//!
+//! Reusable primitives for B0 (allocation counting), B1 (latency), and B2
+//! (throughput) benchmarks. The `alloc` module registers
+//! [`alloc::CountingAllocator`] as the `#[global_allocator]` for every bench
+//! binary that links this crate; nothing in the production dependency graph
+//! depends on `aimdb-bench`.
+//!
+//! # Bench entrypoints
+//!
+//! | File                              | Class | Purpose                                  |
+//! |-----------------------------------|-------|------------------------------------------|
+//! | `benches/b0_alloc_tokio.rs`       | B0    | Per-message allocation (Tokio buffer)    |
+//! | `benches/b1_b2_tokio.rs`          | B1+B2 | Latency (time/iter) + throughput (Tokio) |
+//! | `benches/b0_alloc_embassy.rs`     | B0    | Per-message allocation (Embassy buffer)  |
+//! | `benches/b1_b2_embassy.rs`        | B1+B2 | Latency (time/iter) + throughput (Embassy)|
+//! | `benches/b_alloc_pipeline.rs`     | info  | Per-message allocation (runner pipeline) |
+//! | `benches/b_runner_pipeline.rs`    | info  | Runner pipeline throughput (Criterion)   |
+//!
+//! On-target cycle profiling (B3) lives in the hardware-only
+//! `examples/embassy-bench-stm32h5` crate, since DWT cycle counting cannot run
+//! on a host.
+
+pub mod alloc;
+pub mod profiles;
+pub mod profiles_embassy;
+pub mod reports;
diff --git a/aimdb-bench/src/profiles.rs b/aimdb-bench/src/profiles.rs
new file mode 100644
index 00000000..c7129e1b
--- /dev/null
+++ b/aimdb-bench/src/profiles.rs
@@ -0,0 +1,112 @@
+//! Canonical workload profiles and deterministic message factories.
+//!
+//! Three profiles match the three buffer types:
+//!
+//! | Profile       | Buffer type    | Tokio primitive           | Payload |
+//! |---------------|----------------|---------------------------|---------|
+//! | **Telemetry** | `SpmcRing`     | `broadcast`               | small   |
+//! | **State**     | `SingleLatest` | `watch`                   | medium  |
+//! | **Command**   | `Mailbox`      | `Mutex` slot + waker list | small   |
+//!
+//! Buffers are constructed from a `BufferCfg` via the `Buffer<T>` trait so
+//! the bench code tests exactly the same code path that production uses.
+
+use aimdb_core::buffer::{Buffer, BufferCfg};
+use aimdb_tokio_adapter::TokioBuffer;
+
+// ── Payload types ─────────────────────────────────────────────────────────────
+
+/// Small telemetry reading pushed at high frequency (Telemetry profile).
+#[derive(Debug, Clone, PartialEq)]
+pub struct TelemetryMsg {
+    pub sensor_id: u32,
+    pub value: f64,
+    pub sequence: u64,
+}
+
+/// Medium state snapshot with several fields (State profile).
+#[derive(Debug, Clone, PartialEq)]
+pub struct StateMsg {
+    pub device_id: u32,
+    pub temperature: f64,
+    pub humidity: f64,
+    pub pressure: f64,
+    pub sequence: u64,
+}
+
+/// Control command payload (Command / Mailbox profile).
+#[derive(Debug, Clone, PartialEq)]
+pub struct CommandMsg {
+    pub command_id: u32,
+    pub target: u32,
+    pub value: f64,
+    pub sequence: u64,
+}
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+/// Ring capacity for the Telemetry profile.
+///
+/// Must be ≥ `BATCH_SIZE` so B2 fan-out measurements never trigger
+/// `BufferLagged` within a single iteration.
+pub const TELEMETRY_CAPACITY: usize = 1024;
+
+/// Number of messages in the B0 measured batch window.
+pub const BATCH_SIZE: usize = 512;
+
+/// Warmup iterations excluded from the B0 measurement window.
+pub const WARMUP_ITERS: usize = 200;
+
+// ── Deterministic message factories ──────────────────────────────────────────
+
+/// Produce a deterministic `TelemetryMsg` for iteration `i`.
+#[inline]
+pub fn telemetry_msg(i: u64) -> TelemetryMsg {
+    TelemetryMsg {
+        sensor_id: (i % 16) as u32,
+        value: i as f64 * 0.1,
+        sequence: i,
+    }
+}
+
+/// Produce a deterministic `StateMsg` for iteration `i`.
+#[inline]
+pub fn state_msg(i: u64) -> StateMsg {
+    StateMsg {
+        device_id: (i % 8) as u32,
+        temperature: 20.0 + i as f64 * 0.01,
+        humidity: 50.0 + i as f64 * 0.005,
+        pressure: 1013.25 + i as f64 * 0.001,
+        sequence: i,
+    }
+}
+
+/// Produce a deterministic `CommandMsg` for iteration `i`.
+#[inline]
+pub fn command_msg(i: u64) -> CommandMsg {
+    CommandMsg {
+        command_id: (i % 256) as u32,
+        target: (i % 4) as u32,
+        value: i as f64,
+        sequence: i,
+    }
+}
+
+// ── Buffer constructors ───────────────────────────────────────────────────────
+
+/// Build a `TokioBuffer<TelemetryMsg>` backed by `SpmcRing` (`broadcast`).
+pub fn telemetry_buffer() -> TokioBuffer<TelemetryMsg> {
+    TokioBuffer::new(&BufferCfg::SpmcRing {
+        capacity: TELEMETRY_CAPACITY,
+    })
+}
+
+/// Build a `TokioBuffer<StateMsg>` backed by `SingleLatest` (`watch`).
+pub fn state_buffer() -> TokioBuffer<StateMsg> {
+    TokioBuffer::new(&BufferCfg::SingleLatest)
+}
+
+/// Build a `TokioBuffer<CommandMsg>` backed by `Mailbox` (`Mutex` slot + waker list).
+pub fn command_buffer() -> TokioBuffer<CommandMsg> {
+    TokioBuffer::new(&BufferCfg::Mailbox)
+}
diff --git a/aimdb-bench/src/profiles_embassy.rs b/aimdb-bench/src/profiles_embassy.rs
new file mode 100644
index 00000000..705edc1a
--- /dev/null
+++ b/aimdb-bench/src/profiles_embassy.rs
@@ -0,0 +1,76 @@
+//! Embassy buffer constructors for the host-driven B0/B1/B2 suites.
+//!
+//! Reuse the same payload types and message factories as the Tokio profiles
+//! ([`crate::profiles`]) so both adapters are measured against identical
+//! workloads; only the backend differs. These are [`EmbassyBuffer`]s on
+//! embassy-sync primitives, driven on the host via
+//! `futures::executor::block_on`.
+//!
+//! # Const-generic sizing
+//!
+//! Embassy buffers are sized at compile time
+//! (`EmbassyBuffer<T, CAP, SUBS, PUBS, WATCH_N>`). The aliases below fix those
+//! parameters per profile:
+//!
+//! | Profile   | Backend        | CAP | SUBS | PUBS | WATCH_N | Notes                       |
+//! |-----------|----------------|-----|------|------|---------|-----------------------------|
+//! | Telemetry | `SpmcRing`     | 16  | 4    | 1    | 1       | SUBS=4 covers 1→4 fan-out   |
+//! | State     | `SingleLatest` | 1   | 1    | 1    | 4       | only WATCH_N is used        |
+//! | Command   | `Mailbox`      | 1   | 1    | 1    | 1       | Channel capacity is fixed=1 |
+//!
+//! Lockstep push→recv keeps at most one message in flight, so `CAP=16` for
+//! Telemetry never lags.
+//!
+//! # Lazy SpmcRing subscriber
+//!
+//! An `SpmcRing` reader registers its embassy `Subscriber` lazily, on its first
+//! poll — not at `subscribe()` time. A message published before that poll is
+//! missed and a later `recv()` blocks forever. Benches must [`prime`] each
+//! reader *before* the first `push` to force registration; priming is a no-op
+//! for Watch/Mailbox readers.
+
+use aimdb_core::buffer::Reader;
+use aimdb_embassy_adapter::EmbassyBuffer;
+
+use crate::profiles::{CommandMsg, StateMsg, TelemetryMsg};
+
+/// SpmcRing capacity for the Telemetry profile (compile-time const generic).
+pub const TELEMETRY_CAP: usize = 16;
+
+/// Telemetry buffer: `SpmcRing` with room for the 1→4 fan-out (SUBS=4).
+pub type TelemetryBuffer = EmbassyBuffer<TelemetryMsg, TELEMETRY_CAP, 4, 1, 1>;
+
+/// State buffer: `SingleLatest` (`Watch`); only `WATCH_N` is meaningful.
+pub type StateBuffer = EmbassyBuffer<StateMsg, 1, 1, 1, 4>;
+
+/// Command buffer: `Mailbox` (single-slot `Channel`).
+pub type CommandBuffer = EmbassyBuffer<CommandMsg, 1, 1, 1, 1>;
+
+/// Build a Telemetry `SpmcRing` Embassy buffer.
+pub fn telemetry_buffer() -> TelemetryBuffer {
+    EmbassyBuffer::new_spmc()
+}
+
+/// Build a State `SingleLatest` Embassy buffer.
+pub fn state_buffer() -> StateBuffer {
+    EmbassyBuffer::new_watch()
+}
+
+/// Build a Command `Mailbox` Embassy buffer.
+pub fn command_buffer() -> CommandBuffer {
+    EmbassyBuffer::new_mailbox()
+}
+
+/// Force lazy subscriber registration on an Embassy reader before the first
+/// `push` (see module docs).
+///
+/// For `SpmcRing` this registers the `Subscriber` at the current queue position
+/// so it does not miss the first message; for Watch/Mailbox it is a harmless
+/// empty read. Must be called *outside* the measured window — registration may
+/// allocate.
+#[inline]
+pub fn prime<T: Clone + Send>(reader: &mut Reader<T>) {
+    // The `BufferEmpty` error is ignored: we want the side effect of creating
+    // the subscriber, not the (absent) value.
+    let _ = reader.try_recv();
+}
diff --git a/aimdb-bench/src/reports.rs b/aimdb-bench/src/reports.rs
new file mode 100644
index 00000000..9b1f6d83
--- /dev/null
+++ b/aimdb-bench/src/reports.rs
@@ -0,0 +1,61 @@
+//! Result structs for B0 benchmark output.
+//!
+//! Serialized as JSON for storage in `data/baselines/`.  B1/B2 results are
+//! managed by Criterion's built-in baseline system (`target/criterion/`).
+
+use serde::{Deserialize, Serialize};
+
+/// B0 allocation report for a single workload profile.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AllocReport {
+    /// Profile name (e.g. "Telemetry", "State", "Command").
+    pub profile: String,
+    /// Buffer type (e.g. "SpmcRing", "SingleLatest", "Mailbox").
+    pub buffer_type: String,
+    /// Total allocations in the measured batch.
+    pub total_allocs: u64,
+    /// Total bytes allocated in the measured batch.
+    pub total_bytes: u64,
+    /// Number of messages in the batch.
+    pub batch_size: usize,
+    /// Mean allocations per message.
+    pub allocs_per_msg: f64,
+    /// Mean bytes allocated per message.
+    pub bytes_per_msg: f64,
+}
+
+impl AllocReport {
+    /// Build a report from raw counters; an empty batch is averaged as size 1 (finite means).
+    pub fn new(
+        profile: impl Into<String>,
+        buffer_type: impl Into<String>,
+        batch_size: usize,
+        total_allocs: u64,
+        total_bytes: u64,
+    ) -> Self {
+        let n = batch_size.max(1) as f64; // avoid NaN/inf (not representable in JSON)
+        Self {
+            profile: profile.into(),
+            buffer_type: buffer_type.into(),
+            total_allocs,
+            total_bytes,
+            batch_size,
+            allocs_per_msg: total_allocs as f64 / n,
+            bytes_per_msg: total_bytes as f64 / n,
+        }
+    }
+
+    /// Print a human-readable one-liner to stdout.
+    pub fn print(&self) {
+        println!(
+            "[B0] {:12} ({:12}): {:.3} allocs/msg  ({} total allocs, {} B/msg avg, {} B total, batch={})",
+            self.profile,
+            self.buffer_type,
+            self.allocs_per_msg,
+            self.total_allocs,
+            self.bytes_per_msg as u64,
+            self.total_bytes,
+            self.batch_size,
+        );
+    }
+}
diff --git a/aimdb-core/CHANGELOG.md b/aimdb-core/CHANGELOG.md
index 51c25e92..1f6e504d 100644
--- a/aimdb-core/CHANGELOG.md
+++ b/aimdb-core/CHANGELOG.md
@@ -14,6 +14,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed (breaking)
 
+- **Design 037 W8 — zero-allocation consume path: `BufferReader` is now poll-based ([design doc](../docs/design/037-zero-alloc-consume-path.md)).** The object-erased async `recv` returned a `Pin<Box<dyn Future>>`, heap-allocating on every call — the last AimDB-added per-message allocation on the consume path. It is replaced by an object-safe poll method, restoring async ergonomics through a new handle:
+  - **SPI break (adapter authors only):** `BufferReader<T>::recv(&mut self) -> Pin<Box<dyn Future>>` → `BufferReader<T>::poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>>`. `try_recv` is unchanged. Same break for the `remote-access` `JsonBufferReader`: `recv_json` → `poll_recv_json`. Object safety is preserved (poll, unlike `async fn`, is object-safe).
+  - **New consumer handles:** `buffer::Reader<T>` (and `buffer::JsonReader` under `remote-access`) wrap the erased reader and expose an `async fn recv()` implemented once via `core::future::poll_fn` — `core`-only, `no_std`-clean, zero-allocation, no `unsafe`. `Consumer::subscribe`, `TypedRecord::subscribe`, and `AimDb::subscribe` now return `Reader<T>` instead of `Box<dyn BufferReader<T> + Send>`.
+  - **Source-compatible for consumers:** `subscribe().recv().await` is unchanged at every call site; examples and aimdb-pro compile without edits. Holders of a concrete adapter reader wrap it once: `Reader::new(Box::new(reader))`.
+  - **Connector SPI unchanged (BYOC-stable, design 039 §2):** `SerializedReader::recv` keeps its boxed `RecvSerializedFuture`; only the *inner* per-message box is eliminated. The remote-access JSON path drops both its boxes (`poll_recv_json` + `JsonReader`).
+  - **Result:** **0 AimDB-added heap allocations per message** on the in-process consume path, enforced by the `aimdb-bench` B0 suite (1 → 0 allocs/msg across all three tokio buffer profiles). Tokio `broadcast`/`watch` expose no public poll API, so the reader round-trips its receiver through a single reused `tokio_util::sync::ReusableBoxFuture` — one allocation per subscriber lifetime, not per message — and the Mailbox replaces `Notify` with an explicit waker list beside the slot. Embassy drives embassy-sync's public poll methods directly (`Subscriber::poll_next_message`, `watch::Receiver::poll_changed`, `Channel::poll_receive`) — no future box and **no new `unsafe`**; `poll_next_message`/`poll_changed` were added to the vendored `embassy-sync` as small additive wrappers (upstream PR pending).
+
 - **Design 036 W1 — data-plane de-`Any`: the per-message `Box<dyn Any>` is gone from the connector SPI ([design doc §W1](../docs/design/036-followup-refactoring.md)).** Both ends of every erased hop were typed — `T` is known in the registrar where routes are wired, and the connector spine only wants bytes — so the typed pipeline is now built inside closures at registration time (`finish()`) and the SPI exposes only the wire level. The full break inventory:
   - **Inbound:** new `IngestFn = Arc<dyn Fn(&RuntimeContext, &[u8]) -> Result<(), String>>` + `IngestFactoryFn` replace deserializer + producer: deserialize + produce in one typed closure, **synchronous** (`Producer::produce` is sync + infallible per design 029 — the per-message `Box::pin` disappears along with the `Box<dyn Any>`). Deleted: `ProducerTrait`/`produce_any`, `ProducerFactoryFn`, `DeserializerFn`/`ContextDeserializerFn`/`DeserializerKind`, `TypedRecord::create_producer_trait`. `InboundConnectorLink` is `{ url, config, ingest_factory, topic_resolver }` (factory non-optional — `finish()` validates the deserializer before registering, error strings unchanged); `collect_inbound_routes` returns `Vec<(String, IngestFn)>`; `Route` is `{ resource_id, ingest }`.
   - **`Router::route` is a sync fn and its context is mandatory:** `route(&self, resource_id, payload, ctx: &RuntimeContext) -> Result<(), String>` (was `async` with `Option<&RuntimeContext>`). Every production caller already passed `Some(&ctx)`; the old "skip context-deserializers when no ctx" branch is unrepresentable now that raw-vs-context is invisible inside the fused closure.
diff --git a/aimdb-core/src/buffer/mod.rs b/aimdb-core/src/buffer/mod.rs
index 74591b9e..ac6a9387 100644
--- a/aimdb-core/src/buffer/mod.rs
+++ b/aimdb-core/src/buffer/mod.rs
@@ -59,11 +59,13 @@
 mod cfg;
 #[cfg(feature = "metrics")]
 mod counters;
+mod reader;
 mod traits;
 mod writer;
 
 // Public API exports
 pub use cfg::BufferCfg;
+pub use reader::Reader;
 pub use traits::{Buffer, BufferReader, DynBuffer};
 
 // Crate-private — used by Producer<T> to push without per-call lookup
@@ -72,6 +74,8 @@ pub(crate) use writer::RecordWriter;
 
 // JSON streaming support
 #[cfg(feature = "remote-access")]
+pub use reader::JsonReader;
+#[cfg(feature = "remote-access")]
 pub use traits::JsonBufferReader;
 
 // Buffer metrics (feature-gated; works in no_std with portable-atomic)
diff --git a/aimdb-core/src/buffer/reader.rs b/aimdb-core/src/buffer/reader.rs
new file mode 100644
index 00000000..1ee54219
--- /dev/null
+++ b/aimdb-core/src/buffer/reader.rs
@@ -0,0 +1,85 @@
+//! Consumer-facing reader handles (design 037 / W8).
+//!
+//! The [`BufferReader`] / [`JsonBufferReader`] SPIs are object-safe and
+//! poll-based so adapters can implement them without a per-message
+//! `Pin<Box<dyn Future>>` allocation. These handles restore the ergonomic
+//! `async fn recv().await` surface for consumers by wrapping the erased
+//! reader's `poll_*` method in [`core::future::poll_fn`] — which is `core`-only
+//! (no_std-clean), allocation-free, and `unsafe`-free.
+//!
+//! The wrapped future is `Send` because the boxed reader is `Send`.
+
+use alloc::boxed::Box;
+use core::future::poll_fn;
+
+use crate::buffer::BufferReader;
+use crate::DbError;
+
+#[cfg(feature = "remote-access")]
+use crate::buffer::JsonBufferReader;
+
+/// Owned, ergonomic handle over an erased [`BufferReader`].
+///
+/// Returned by `Consumer::subscribe`. This is the "boxed lane": one indirect
+/// call per `recv`, zero AimDB-added heap allocations per message. (The generic
+/// monomorphized `Reader<T, B>` fast lane remains dormant — see design 037 §7.)
+pub struct Reader<T: Clone + Send> {
+    inner: Box<dyn BufferReader<T> + Send>,
+}
+
+impl<T: Clone + Send> Reader<T> {
+    /// Wrap an erased reader in an ergonomic handle.
+    pub fn new(inner: Box<dyn BufferReader<T> + Send>) -> Self {
+        Self { inner }
+    }
+
+    /// Receive the next value, awaiting until one is available.
+    ///
+    /// Allocation-free: wraps the erased reader's
+    /// [`poll_recv`](BufferReader::poll_recv) via `core::future::poll_fn`.
+    ///
+    /// # Behavior by Buffer Type
+    /// - **SPMC Ring**: Returns the next value, or `Err(BufferLagged)` if the reader fell behind
+    /// - **SingleLatest**: Waits for value change, returns most recent
+    /// - **Mailbox**: Waits for slot value, takes and clears it
+    pub async fn recv(&mut self) -> Result<T, DbError> {
+        poll_fn(|cx| self.inner.poll_recv(cx)).await
+    }
+
+    /// Non-blocking receive — returns immediately.
+    ///
+    /// Returns `Err(DbError::BufferEmpty)` if no pending values.
+    pub fn try_recv(&mut self) -> Result<T, DbError> {
+        self.inner.try_recv()
+    }
+}
+
+/// Owned, ergonomic handle over an erased [`JsonBufferReader`].
+///
+/// Returned by `subscribe_json`. Awaiting `recv_json` is allocation-free: it
+/// wraps [`poll_recv_json`](JsonBufferReader::poll_recv_json) via
+/// `core::future::poll_fn`, so the pre-W8 remote-access double box is gone.
+#[cfg(feature = "remote-access")]
+pub struct JsonReader {
+    inner: Box<dyn JsonBufferReader + Send>,
+}
+
+#[cfg(feature = "remote-access")]
+impl JsonReader {
+    /// Wrap an erased JSON reader in an ergonomic handle.
+    pub fn new(inner: Box<dyn JsonBufferReader + Send>) -> Self {
+        Self { inner }
+    }
+
+    /// Receive the next value as JSON, awaiting until one is available.
+    pub async fn recv_json(&mut self) -> Result<serde_json::Value, DbError> {
+        poll_fn(|cx| self.inner.poll_recv_json(cx)).await
+    }
+
+    /// Non-blocking receive as JSON — returns immediately.
+    ///
+    /// Returns `Err(DbError::BufferEmpty)` if no pending values.
+    pub fn try_recv_json(&mut self) -> Result<serde_json::Value, DbError> {
+        self.inner.try_recv_json()
+    }
+}
diff --git a/aimdb-core/src/buffer/traits.rs b/aimdb-core/src/buffer/traits.rs
index e49b5e1a..c53badaf 100644
--- a/aimdb-core/src/buffer/traits.rs
+++ b/aimdb-core/src/buffer/traits.rs
@@ -5,8 +5,7 @@
 //!
 //! See `aimdb-tokio-adapter` and `aimdb-embassy-adapter` for implementations.
 
-use core::future::Future;
-use core::pin::Pin;
+use core::task::{Context, Poll};
 
 use alloc::boxed::Box;
 
@@ -124,22 +123,33 @@ pub(crate) trait WriteHandle<T: Clone + Send + 'static>: Send + Sync {
 
 /// Reader trait for consuming values from a buffer
 ///
-/// All read operations are async. Each reader is independent with its own state.
+/// This is the object-safe **service-provider interface** that runtime adapters
+/// implement. It is poll-based — and therefore object-safe and zero-allocation —
+/// rather than `async`: an `async fn` on an erased trait forces a
+/// `Pin<Box<dyn Future>>` heap allocation on every call (design 037 / W8).
+/// Consumers do not call this directly; they use the [`Reader<T>`](super::Reader)
+/// handle returned by `Consumer::subscribe`, whose `recv()` is `async` and wraps
+/// [`poll_recv`](BufferReader::poll_recv) via `core::future::poll_fn` with no
+/// allocation.
+///
+/// Each reader is independent with its own state.
 ///
 /// # Error Handling
 /// - `Ok(value)` - Successfully received a value
 /// - `Err(BufferLagged)` - Missed messages (SPMC ring only, can continue)
 /// - `Err(BufferClosed)` - Buffer closed (graceful shutdown)
 pub trait BufferReader<T: Clone + Send>: Send {
-    /// Receive the next value (async)
+    /// Poll for the next value.
     ///
-    /// Waits for the next available value. Returns immediately if buffered.
+    /// Returns `Poll::Ready(Ok(value))` when a value is available,
+    /// `Poll::Ready(Err(..))` on lag/closure, or `Poll::Pending` after
+    /// registering `cx.waker()` to be woken when the next value arrives.
     ///
     /// # Behavior by Buffer Type
     /// - **SPMC Ring**: Returns next value, or `Lagged(n)` if fell behind
     /// - **SingleLatest**: Waits for value change, returns most recent
     /// - **Mailbox**: Waits for slot value, takes and clears it
-    fn recv(&mut self) -> Pin<Box<dyn Future<Output = Result<T, DbError>> + Send + '_>>;
+    fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>>;
 
     /// Non-blocking receive — returns immediately.
     ///
@@ -158,25 +168,30 @@ pub trait BufferReader<T: Clone + Send>: Send {
 /// `serde_json::Value`. Used by remote access protocol for subscriptions.
 ///
 /// This trait enables subscribing to a buffer without knowing the concrete type `T`
-/// at compile time, by serializing values to JSON on each `recv_json()` call.
+/// at compile time, by serializing values to JSON on each poll.
+///
+/// Object-safe and poll-based for the same reason as [`BufferReader`] (design
+/// 037 / W8). Consumers use the [`JsonReader`](super::JsonReader) handle, whose
+/// `recv_json()` is `async` and wraps [`poll_recv_json`](JsonBufferReader::poll_recv_json)
+/// with no allocation.
 ///
 /// # Requirements
 /// - Record must be configured with `.with_remote_access()`
 /// - Only available with the `remote-access` feature (requires serde_json)
 #[cfg(feature = "remote-access")]
 pub trait JsonBufferReader: Send {
-    /// Receive the next value as JSON (async)
+    /// Poll for the next value, serialized to JSON.
     ///
-    /// Waits for the next value from the underlying buffer and serializes it to JSON.
+    /// Returns `Poll::Ready(Ok(json))` when a value is available and
+    /// serializes successfully, `Poll::Ready(Err(..))` on lag/closure/serialize
+    /// failure, or `Poll::Pending` after registering `cx.waker()`.
     ///
     /// # Returns
     /// - `Ok(JsonValue)` - Successfully received and serialized value
     /// - `Err(BufferLagged)` - Missed messages (can continue reading)
     /// - `Err(BufferClosed)` - Buffer closed (graceful shutdown)
     /// - `Err(SerializationFailed)` - Failed to serialize value to JSON
-    fn recv_json(
-        &mut self,
-    ) -> Pin<Box<dyn Future<Output = Result<serde_json::Value, DbError>> + Send + '_>>;
+    fn poll_recv_json(&mut self, cx: &mut Context<'_>) -> Poll<Result<serde_json::Value, DbError>>;
 
     /// Non-blocking receive as JSON — returns immediately.
     ///
@@ -287,13 +302,11 @@ mod tests {
     }
 
     impl<T: Clone + Send> BufferReader<T> for MockReader<T> {
-        fn recv(&mut self) -> Pin<Box<dyn Future<Output = Result<T, DbError>> + Send + '_>> {
-            Box::pin(async {
-                // Return closed for testing
-                Err(DbError::BufferClosed {
-                    buffer_name: "mock".to_string(),
-                })
-            })
+        fn poll_recv(&mut self, _cx: &mut Context<'_>) -> Poll<Result<T, DbError>> {
+            // Return closed for testing
+            Poll::Ready(Err(DbError::BufferClosed {
+                buffer_name: "mock".to_string(),
+            }))
         }
 
         fn try_recv(&mut self) -> Result<T, DbError> {
diff --git a/aimdb-core/src/builder.rs b/aimdb-core/src/builder.rs
index 09be8c2c..32962dfb 100644
--- a/aimdb-core/src/builder.rs
+++ b/aimdb-core/src/builder.rs
@@ -960,10 +960,7 @@ impl AimDb {
     ///
     /// # Arguments
     /// * `key` - The record key (e.g., "sensor.temperature")
-    pub fn subscribe<T>(
-        &self,
-        key: impl AsRef<str>,
-    ) -> DbResult<Box<dyn crate::buffer::BufferReader<T> + Send>>
+    pub fn subscribe<T>(&self, key: impl AsRef<str>) -> DbResult<crate::buffer::Reader<T>>
     where
         T: Send + Sync + 'static + Debug + Clone,
     {
diff --git a/aimdb-core/src/lib.rs b/aimdb-core/src/lib.rs
index cc48d1ee..9f9fceda 100644
--- a/aimdb-core/src/lib.rs
+++ b/aimdb-core/src/lib.rs
@@ -58,6 +58,9 @@ pub use extensions::Extensions;
 pub use aimdb_executor::{ExecutorError, ExecutorResult};
 
 // Producer-Consumer Pattern exports
+#[cfg(feature = "remote-access")]
+pub use buffer::JsonReader;
+pub use buffer::Reader;
 pub use builder::OutboundRoute;
 pub use builder::{AimDb, AimDbBuilder};
 pub use connector::ConnectorBuilder;
diff --git a/aimdb-core/src/profiling/mod.rs b/aimdb-core/src/profiling/mod.rs
index 9b71f32c..ca2f6daa 100644
--- a/aimdb-core/src/profiling/mod.rs
+++ b/aimdb-core/src/profiling/mod.rs
@@ -24,9 +24,8 @@ pub use record_profiling::{RecordProfilingMetrics, StageEntry};
 pub use stage_metrics::StageMetrics;
 
 use alloc::{boxed::Box, sync::Arc};
-use core::future::Future;
-use core::pin::Pin;
 use core::sync::atomic::Ordering;
+use core::task::{Context, Poll};
 use portable_atomic::AtomicU64;
 
 use crate::buffer::BufferReader;
@@ -85,6 +84,11 @@ pub(crate) struct ProfilingBufferReader<T: Clone + Send> {
     clock: Clock,
     /// Wall-clock (ns) at which the last value was handed to the consumer.
     last_yield_ns: Option<u64>,
+    /// Wall-clock (ns) of the first poll of the current recv cycle — the moment
+    /// the consumer asked for the next value. Memoized across re-polls so a
+    /// `Pending` wait for the producer is not counted as consumer processing
+    /// time; cleared when the cycle completes (see `poll_recv`).
+    pending_since: Option<u64>,
 }
 
 impl<T: Clone + Send> ProfilingBufferReader<T> {
@@ -98,6 +102,7 @@ impl<T: Clone + Send> ProfilingBufferReader<T> {
             metrics,
             clock,
             last_yield_ns: None,
+            pending_since: None,
         }
     }
 
@@ -110,17 +115,26 @@ impl<T: Clone + Send> ProfilingBufferReader<T> {
 }
 
 impl<T: Clone + Send> BufferReader<T> for ProfilingBufferReader<T> {
-    fn recv(&mut self) -> Pin<Box<dyn Future<Output = Result<T, DbError>> + Send + '_>> {
-        Box::pin(async move {
-            // `started_ns` ≈ the moment the consumer finished processing the
-            // previous value and asked for the next one.
-            let started_ns = (self.clock)();
-            let result = self.inner.recv().await;
-            if result.is_ok() {
+    fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>> {
+        // `started_ns` ≈ when the consumer finished the previous value and asked
+        // for the next one, i.e. the *first* poll of this recv cycle. Memoized in
+        // `pending_since` so re-polls after a `Pending` (waiting on the producer)
+        // reuse it: the clock is read once per cycle and the recorded interval
+        // stays consumer processing time, like the prior await-based `recv()`.
+        // NOTE(review): `pending_since` is cleared only on `Ready`. If a recv is
+        // abandoned while `Pending`, the next cycle presumably reuses the stale
+        // timestamp, unlike the old per-future sample — confirm this is intended.
+        let started_ns = *self.pending_since.get_or_insert_with(|| (self.clock)());
+        let result = self.inner.poll_recv(cx);
+        if result.is_ready() {
+            // The recv "future" completed (Ok or Err) — close out the cycle so
+            // the next ask resamples the clock.
+            self.pending_since = None;
+            if matches!(result, Poll::Ready(Ok(_))) {
                 self.on_yield(started_ns);
             }
-            result
-        })
+        }
+        result
     }
 
     fn try_recv(&mut self) -> Result<T, DbError> {
diff --git a/aimdb-core/src/remote/stream.rs b/aimdb-core/src/remote/stream.rs
index 7bdea5e7..6adddcc8 100644
--- a/aimdb-core/src/remote/stream.rs
+++ b/aimdb-core/src/remote/stream.rs
@@ -37,14 +37,16 @@ pub(crate) fn stream_record_updates(
     let record = inner
         .storage(id)
         .ok_or(DbError::InvalidRecordId { id: id.raw() })?;
-    let reader = record
-        .json_access()
-        .ok_or_else(|| {
-            DbError::runtime_error(alloc::format!(
-                "Record '{record_key}' does not support JSON remote access"
-            ))
-        })?
-        .subscribe_json()?;
+    let reader = crate::buffer::JsonReader::new(
+        record
+            .json_access()
+            .ok_or_else(|| {
+                DbError::runtime_error(alloc::format!(
+                    "Record '{record_key}' does not support JSON remote access"
+                ))
+            })?
+            .subscribe_json()?,
+    );
 
     // Pair the reader with an owned copy of the record key so lag/error
     // logs identify which record fell behind — the previous mpsc-based
@@ -81,9 +83,9 @@ pub(crate) fn stream_record_updates(
 #[cfg(all(test, feature = "std"))]
 mod tests {
     use super::*;
-    use crate::buffer::JsonBufferReader;
+    use crate::buffer::{JsonBufferReader, JsonReader};
+    use core::task::{Context, Poll};
     use futures_util::StreamExt;
-    use std::pin::Pin;
     use std::sync::atomic::{AtomicUsize, Ordering};
     use std::sync::Arc;
 
@@ -95,25 +97,21 @@ mod tests {
     }
 
     impl JsonBufferReader for FakeReader {
-        fn recv_json(
+        fn poll_recv_json(
             &mut self,
-        ) -> Pin<
-            Box<dyn std::future::Future<Output = Result<serde_json::Value, DbError>> + Send + '_>,
-        > {
-            let step = self.step.clone();
-            Box::pin(async move {
-                let s = step.fetch_add(1, Ordering::SeqCst);
-                match s {
-                    0 => Ok(serde_json::json!({"v": 1})),
-                    1 => Err(DbError::BufferLagged {
-                        buffer_name: "test".to_string(),
-                        lag_count: 7,
-                    }),
-                    2 => Ok(serde_json::json!({"v": 2})),
-                    _ => Err(DbError::BufferClosed {
-                        buffer_name: "test".to_string(),
-                    }),
-                }
+            _cx: &mut Context<'_>,
+        ) -> Poll<Result<serde_json::Value, DbError>> {
+            let s = self.step.fetch_add(1, Ordering::SeqCst);
+            Poll::Ready(match s {
+                0 => Ok(serde_json::json!({"v": 1})),
+                1 => Err(DbError::BufferLagged {
+                    buffer_name: "test".to_string(),
+                    lag_count: 7,
+                }),
+                2 => Ok(serde_json::json!({"v": 2})),
+                _ => Err(DbError::BufferClosed {
+                    buffer_name: "test".to_string(),
+                }),
             })
         }
 
@@ -124,9 +122,9 @@ mod tests {
 
     #[tokio::test]
     async fn unfold_skips_lag_and_terminates_on_closed() {
-        let reader: Box<dyn JsonBufferReader + Send> = Box::new(FakeReader {
+        let reader = JsonReader::new(Box::new(FakeReader {
             step: Arc::new(AtomicUsize::new(0)),
-        });
+        }));
 
         let stream = unfold(reader, |mut reader| async move {
             loop {
diff --git a/aimdb-core/src/typed_api.rs b/aimdb-core/src/typed_api.rs
index dda6e3f7..78c61075 100644
--- a/aimdb-core/src/typed_api.rs
+++ b/aimdb-core/src/typed_api.rs
@@ -215,19 +215,22 @@ where
 
     /// Subscribe to updates for this record type.
     ///
-    /// Returns a reader that yields values as they are produced.
+    /// Returns a [`Reader<T>`](crate::buffer::Reader) that yields values as they
+    /// are produced. Its `recv().await` is allocation-free (design 037 / W8).
     /// Infallible — the buffer is pre-resolved at `Consumer` construction.
-    pub fn subscribe(&self) -> Box<dyn crate::buffer::BufferReader<T> + Send> {
+    pub fn subscribe(&self) -> crate::buffer::Reader<T> {
         let reader = self.buffer.subscribe_boxed();
         #[cfg(feature = "profiling")]
         if let Some((metrics, clock)) = &self.profiling {
-            return Box::new(crate::profiling::ProfilingBufferReader::new(
-                reader,
-                metrics.clone(),
-                clock.clone(),
+            return crate::buffer::Reader::new(Box::new(
+                crate::profiling::ProfilingBufferReader::new(
+                    reader,
+                    metrics.clone(),
+                    clock.clone(),
+                ),
             ));
         }
-        reader
+        crate::buffer::Reader::new(reader)
     }
 }
 
@@ -281,8 +284,12 @@ where
 
 /// One subscription of a [`FusedSource`]: recv → resolve destination →
 /// serialize, all on the typed value.
+///
+/// The connector SPI keeps its boxed `RecvSerializedFuture` (BYOC stays stable,
+/// design 039 §2); only the *inner* per-message box is eliminated by reading
+/// through the allocation-free [`Reader<T>`](crate::buffer::Reader) (W8).
 struct FusedReader<T: Clone + Send + 'static> {
-    inner: Box<dyn crate::buffer::BufferReader<T> + Send>,
+    inner: crate::buffer::Reader<T>,
     serialize: FusedSerializeFn<T>,
     topic: Option<Arc<dyn crate::connector::TopicProvider<T>>>,
 }
@@ -1722,15 +1729,16 @@ mod tests {
     }
 
     impl crate::buffer::BufferReader<TestRecord> for ScriptedReader {
-        fn recv(
+        fn poll_recv(
             &mut self,
-        ) -> Pin<Box<dyn Future<Output = Result<TestRecord, crate::DbError>> + Send + '_>> {
+            _cx: &mut core::task::Context<'_>,
+        ) -> core::task::Poll<Result<TestRecord, crate::DbError>> {
             let next = if self.script.is_empty() {
                 Err(Self::closed())
             } else {
                 self.script.remove(0)
             };
-            Box::pin(async move { next })
+            core::task::Poll::Ready(next)
         }
         fn try_recv(&mut self) -> Result<TestRecord, crate::DbError> {
             unimplemented!("not needed for fused reader tests")
@@ -1750,7 +1758,7 @@ mod tests {
         topic: Option<Arc<dyn crate::connector::TopicProvider<TestRecord>>>,
     ) -> FusedReader<TestRecord> {
         FusedReader {
-            inner: Box::new(ScriptedReader { script }),
+            inner: crate::buffer::Reader::new(Box::new(ScriptedReader { script })),
             serialize,
             topic,
         }
diff --git a/aimdb-core/src/typed_record.rs b/aimdb-core/src/typed_record.rs
index d50d4e39..8407578a 100644
--- a/aimdb-core/src/typed_record.rs
+++ b/aimdb-core/src/typed_record.rs
@@ -19,6 +19,8 @@
 
 use core::any::Any;
 use core::fmt::Debug;
+#[cfg(feature = "remote-access")]
+use core::task::{Context, Poll};
 
 use alloc::{
     boxed::Box,
@@ -134,24 +136,21 @@ struct JsonReaderAdapter<T: Clone + Send + 'static> {
 
 #[cfg(feature = "remote-access")]
 impl<T: Clone + Send + 'static> crate::buffer::JsonBufferReader for JsonReaderAdapter<T> {
-    fn recv_json(
+    fn poll_recv_json(
         &mut self,
-    ) -> core::pin::Pin<
-        Box<
-            dyn core::future::Future<Output = Result<serde_json::Value, crate::DbError>>
-                + Send
-                + '_,
-        >,
-    > {
-        Box::pin(async move {
-            // Receive typed value from buffer
-            let value = self.inner.recv().await?;
-
-            // Serialize to JSON
-            self.codec
-                .encode(&value)
-                .ok_or_else(|| crate::DbError::runtime_error("Failed to serialize value to JSON"))
-        })
+        cx: &mut Context<'_>,
+    ) -> Poll<Result<serde_json::Value, crate::DbError>> {
+        // `Pending` and `Err` from the typed reader pass through unchanged; only
+        // a ready `Ok` value is encoded. Everything happens inline in this poll;
+        // no future is created or boxed here.
+        let codec = &self.codec;
+        self.inner.poll_recv(cx).map(|received| {
+            received.and_then(|value| {
+                codec.encode(&value).ok_or_else(|| {
+                    crate::DbError::runtime_error("Failed to serialize value to JSON")
+                })
+            })
+        })
     }
 
     fn try_recv_json(&mut self) -> Result<serde_json::Value, crate::DbError> {
@@ -861,15 +860,18 @@ impl<T: Send + 'static + Debug + Clone> TypedRecord<T> {
 
     /// Subscribes to the buffer for this record type
     ///
+    /// Returns an ergonomic, allocation-free [`Reader<T>`](crate::buffer::Reader)
+    /// handle (design 037 / W8).
+    ///
     /// # Errors
     /// Returns `DbError::MissingConfiguration` if no buffer configured
-    pub fn subscribe(&self) -> crate::DbResult<Box<dyn crate::buffer::BufferReader<T> + Send>> {
+    pub fn subscribe(&self) -> crate::DbResult<crate::buffer::Reader<T>> {
         let buffer = self
             .buffer
             .as_ref()
             .ok_or_else(|| crate::DbError::missing_configuration("buffer"))?;
 
-        Ok(buffer.subscribe_boxed())
+        Ok(crate::buffer::Reader::new(buffer.subscribe_boxed()))
     }
 
     /// Adds an outbound connector link for external system integration
@@ -1293,12 +1295,17 @@ impl<T: Send + Sync + 'static + Debug + Clone> JsonRecordAccess for TypedRecord<
             ))
         })?;
 
-        // 2. Subscribe to the buffer (get Box<dyn BufferReader<T>>)
-        let reader = self.subscribe()?;
+        // 2. Subscribe to the buffer (the adapter polls the erased reader
+        //    directly, so it takes the boxed reader rather than the ergonomic
+        //    `Reader<T>` wrapper).
+        let buffer = self
+            .buffer
+            .as_ref()
+            .ok_or_else(|| DbError::missing_configuration("buffer"))?;
 
         // 3. Wrap in JsonReaderAdapter
         let json_reader = JsonReaderAdapter {
-            inner: reader,
+            inner: buffer.subscribe_boxed(),
             codec,
         };
 
diff --git a/aimdb-embassy-adapter/src/buffer.rs b/aimdb-embassy-adapter/src/buffer.rs
index 8a79f151..dc38d002 100644
--- a/aimdb-embassy-adapter/src/buffer.rs
+++ b/aimdb-embassy-adapter/src/buffer.rs
@@ -43,13 +43,14 @@ extern crate alloc;
 use alloc::boxed::Box;
 use alloc::string::String;
 use alloc::sync::Arc;
+use core::task::{Context, Poll};
 
 use aimdb_core::buffer::{Buffer, BufferCfg, BufferReader};
 use aimdb_core::DbError;
 use embassy_sync::blocking_mutex::raw::CriticalSectionRawMutex;
 use embassy_sync::channel::Channel;
-use embassy_sync::pubsub::{PubSubChannel, WaitResult};
-use embassy_sync::watch::Watch;
+use embassy_sync::pubsub::{PubSubChannel, Subscriber, WaitResult};
+use embassy_sync::watch::{Receiver as WatchReceiver, Watch};
 
 #[cfg(feature = "metrics")]
 use aimdb_core::buffer::{BufferCounters, BufferMetrics, BufferMetricsSnapshot};
@@ -76,8 +77,11 @@ use aimdb_core::buffer::{BufferCounters, BufferMetrics, BufferMetricsSnapshot};
 /// type MyBuffer = EmbassyBuffer<u32, 32, 4, 2, 4>;
 ///
 /// # async fn example() {
+/// use aimdb_core::buffer::Reader;
 /// let buffer: MyBuffer = MyBuffer::new_spmc();
-/// let mut reader = buffer.subscribe();
+/// // `subscribe()` yields the concrete reader; wrap it in the ergonomic,
+/// // allocation-free `Reader<T>` handle for `recv().await` (design 037 / W8).
+/// let mut reader = Reader::new(Box::new(buffer.subscribe()));
 /// buffer.push(42);
 /// let value = reader.recv().await.unwrap();
 /// # }
@@ -206,12 +210,20 @@ impl<
         }
     }
 
+    /// The embassy subscriber/receiver is created **lazily on first poll**, not
+    /// here (unlike the Tokio adapter, which registers eagerly).
+    /// This matters only for `SpmcRing`: any message produced in the gap between
+    /// `subscribe()` and that first poll is missed, because the reader only
+    /// receives messages sent after it starts listening. In normal use the
+    /// consumer spawns and loops on `recv()`, so the gap is harmless. If you must
+    /// produce before the consumer has polled, call `try_recv()` once first to
+    /// start listening early (this is what the B0/B2 benches' `prime()` does).
     fn subscribe(&self) -> Self::Reader {
         // Clone the Arc for the reader
         EmbassyBufferReader {
             buffer: Arc::clone(&self.inner),
-            watch_receiver: None, // Will be initialized on first recv() for Watch buffers
-            spmc_subscriber: None, // Will be initialized on first recv() for SpmcRing buffers
+            watch_receiver: None, // Lazily initialized on first poll for Watch buffers
+            spmc_subscriber: None, // Lazily initialized on first poll for SpmcRing buffers
             #[cfg(feature = "metrics")]
             metrics: Arc::clone(&self.metrics),
         }
@@ -351,7 +363,9 @@ impl<
         F: Fn(T) -> Fut + Send + Sync,
         Fut: core::future::Future<Output = ()> + Send,
     {
-        let mut reader = self.subscribe();
+        // Wrap the concrete reader in the ergonomic, allocation-free
+        // `Reader<T>` handle so `recv().await` works (design 037 / W8).
+        let mut reader = aimdb_core::buffer::Reader::new(Box::new(self.subscribe()));
 
         loop {
             match reader.recv().await {
@@ -375,11 +389,75 @@ impl<
     }
 }
 
+// ============================================================================
+// Poll plumbing (design 037 / W8)
+// ============================================================================
+//
+// `poll_recv` drives embassy-sync's *public* poll methods directly —
+// `Subscriber::poll_next_message`, `Receiver::poll_changed`, and
+// `Channel::poll_receive` — so there is zero allocation per message and no
+// per-message future box. The reader stores the subscriber/receiver across
+// calls (lazily created on first poll); `try_recv` uses the matching
+// `try_*` methods. No `unsafe` beyond the pre-existing `'static` borrow
+// extension in the `make_*` helpers (the `Arc` keeps the primitive alive).
+
+/// Persistent SpmcRing subscriber with a lifetime extended to `'static` (the
+/// owning `Arc<EmbassyBufferInner>` keeps the channel alive for the reader).
+type SpmcSub<T, const CAP: usize, const SUBS: usize, const PUBS: usize> =
+    Subscriber<'static, CriticalSectionRawMutex, T, CAP, SUBS, PUBS>;
+
+/// Persistent Watch receiver, `'static` for the same reason as [`SpmcSub`].
+type WatchRx<T, const WATCH_N: usize> = WatchReceiver<'static, CriticalSectionRawMutex, T, WATCH_N>;
+
+/// Create the persistent SpmcRing subscriber, extending its borrow to `'static`.
+///
+/// SAFETY: the `Arc<EmbassyBufferInner>` in the reader keeps the `PubSubChannel`
+/// alive for the reader's whole life, so the `'static` subscriber never outlives
+/// the channel. (Same invariant the pre-W8 code relied on.)
+fn make_spmc_sub<
+    T: Clone + Send + 'static,
+    const CAP: usize,
+    const SUBS: usize,
+    const PUBS: usize,
+>(
+    channel: &PubSubChannel<CriticalSectionRawMutex, T, CAP, SUBS, PUBS>,
+) -> Result<SpmcSub<T, CAP, SUBS, PUBS>, DbError> {
+    let channel_static: &'static PubSubChannel<CriticalSectionRawMutex, T, CAP, SUBS, PUBS> =
+        unsafe { &*(channel as *const _) };
+    channel_static.subscriber().map_err(|_| {
+        defmt::error!(
+            "AimDB: SpmcRing subscriber slot exhausted (max SUBS={}). \
+             Increase the CONSUMERS const generic on buffer_sized<CAP, CONSUMERS>. \
+             Count one slot per .tap(), .link_to() connector, and each transform_join input.",
+            SUBS
+        );
+        DbError::BufferClosed {
+            buffer_name: String::from("embassy spmc ring"),
+        }
+    })
+}
+
+/// Create the persistent Watch receiver, extending its borrow to `'static`.
+///
+/// SAFETY: see [`make_spmc_sub`] — the `Arc` keeps the `Watch` alive.
+fn make_watch_rx<T: Clone + Send + 'static, const WATCH_N: usize>(
+    watch: &Watch<CriticalSectionRawMutex, T, WATCH_N>,
+) -> Result<WatchRx<T, WATCH_N>, DbError> {
+    let watch_static: &'static Watch<CriticalSectionRawMutex, T, WATCH_N> =
+        unsafe { &*(watch as *const _) };
+    watch_static.receiver().ok_or(DbError::BufferClosed {
+        buffer_name: String::from("embassy watch"),
+    })
+}
+
 /// Reader for Embassy buffers
 ///
-/// Holds persistent subscription state for each buffer type.
-/// For Watch buffers, stores a persistent Receiver to track which value has been seen.
-/// For SpmcRing buffers, stores a persistent Subscriber for cursor continuity.
+/// Holds persistent subscription state for each buffer type and drives it
+/// through embassy-sync's public poll methods, so `poll_recv` allocates nothing
+/// per message and stores no future box (design 037 / W8). For Watch a
+/// persistent Receiver tracks which value has been seen; for SpmcRing a
+/// persistent Subscriber keeps cursor continuity. Both are lazily created on the
+/// first poll.
 pub struct EmbassyBufferReader<
     T: Clone + Send + 'static,
     const CAP: usize,
@@ -388,14 +466,10 @@ pub struct EmbassyBufferReader<
     const WATCH_N: usize,
 > {
-    buffer: Arc<EmbassyBufferInner<T, CAP, SUBS, PUBS, WATCH_N>>,
-    /// Persistent Watch receiver. The 'static lifetime is safe because the Arc keeps the Watch alive.
-    watch_receiver:
-        Option<embassy_sync::watch::Receiver<'static, CriticalSectionRawMutex, T, WATCH_N>>,
-    /// Persistent SpmcRing subscriber (same pattern as watch_receiver).
-    /// The 'static lifetime is safe because the Arc keeps the PubSubChannel alive.
-    spmc_subscriber: Option<
-        embassy_sync::pubsub::Subscriber<'static, CriticalSectionRawMutex, T, CAP, SUBS, PUBS>,
-    >,
+    /// Persistent Watch receiver, lazily created on first poll (borrows `buffer`).
+    watch_receiver: Option<WatchRx<T, WATCH_N>>,
+    /// Persistent SpmcRing subscriber, same lifecycle; both must drop before `buffer`.
+    spmc_subscriber: Option<SpmcSub<T, CAP, SUBS, PUBS>>,
+    buffer: Arc<EmbassyBufferInner<T, CAP, SUBS, PUBS, WATCH_N>>, // last: fields drop in order
     /// Shared counter state (cloned from the parent buffer at subscribe time).
     #[cfg(feature = "metrics")]
     metrics: Arc<BufferCounters>,
@@ -409,124 +483,67 @@ impl<
         const WATCH_N: usize,
     > BufferReader<T> for EmbassyBufferReader<T, CAP, SUBS, PUBS, WATCH_N>
 {
-    fn recv(
-        &mut self,
-    ) -> core::pin::Pin<Box<dyn core::future::Future<Output = Result<T, DbError>> + Send + '_>>
-    {
-        Box::pin(async move {
-            match &*self.buffer {
-                EmbassyBufferInner::SpmcRing(channel) => {
-                    // Lazily create persistent subscriber (same pattern as watch_receiver)
-                    if self.spmc_subscriber.is_none() {
-                        // SAFETY: The Arc in self.buffer keeps the PubSubChannel alive for this reader's lifetime.
-                        // We extend the lifetime to 'static to store the subscriber, which is safe because
-                        // the subscriber is dropped with the reader.
-                        let channel_static: &'static embassy_sync::pubsub::PubSubChannel<
-                            CriticalSectionRawMutex,
-                            T,
-                            CAP,
-                            SUBS,
-                            PUBS,
-                        > = unsafe { &*(channel as *const _) };
-                        self.spmc_subscriber = Some(
-                            channel_static.subscriber().map_err(|_| {
-                                defmt::error!(
-                                    "AimDB: SpmcRing subscriber slot exhausted (max SUBS={}). \
-                                     Increase the CONSUMERS const generic on buffer_sized<CAP, CONSUMERS>. \
-                                     Count one slot per .tap(), .link_to() connector, and each transform_join input.",
-                                    SUBS
-                                );
-                                DbError::BufferClosed {
-                                    buffer_name: String::from("embassy spmc ring"),
-                                }
-                            })?,
-                        );
+    fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>> {
+        match &*self.buffer {
+            EmbassyBufferInner::SpmcRing(channel) => {
+                // Lazily create the persistent subscriber, then poll it directly
+                // via embassy-sync's public `poll_next_message` (no future box,
+                // no allocation per message; lag preserved).
+                if self.spmc_subscriber.is_none() {
+                    match make_spmc_sub(channel) {
+                        Ok(sub) => self.spmc_subscriber = Some(sub),
+                        Err(e) => return Poll::Ready(Err(e)),
                     }
-                    match self.spmc_subscriber.as_mut().unwrap().next_message().await {
-                        WaitResult::Message(value) => {
-                            #[cfg(feature = "metrics")]
-                            self.metrics.increment_consumed();
-                            Ok(value)
-                        }
-                        WaitResult::Lagged(n) => {
-                            #[cfg(feature = "metrics")]
-                            self.metrics.add_dropped(n);
-                            Err(DbError::BufferLagged {
-                                lag_count: n,
-                                buffer_name: String::from("embassy spmc ring"),
-                            })
-                        }
+                }
+                match self.spmc_subscriber.as_mut().unwrap().poll_next_message(cx) {
+                    Poll::Ready(WaitResult::Message(value)) => {
+                        #[cfg(feature = "metrics")]
+                        self.metrics.increment_consumed();
+                        Poll::Ready(Ok(value))
+                    }
+                    Poll::Ready(WaitResult::Lagged(n)) => {
+                        #[cfg(feature = "metrics")]
+                        self.metrics.add_dropped(n);
+                        Poll::Ready(Err(DbError::BufferLagged {
+                            lag_count: n,
+                            buffer_name: String::from("embassy spmc ring"),
+                        }))
                     }
+                    Poll::Pending => Poll::Pending,
                 }
-                EmbassyBufferInner::Watch(watch) => {
-                    // Watch requires a persistent receiver to track seen values.
-                    // Creating a new receiver each time causes infinite loops (always returns current value).
-                    if self.watch_receiver.is_none() {
-                        // SAFETY: The Arc in self.buffer keeps the Watch alive for this reader's lifetime.
-                        // We extend the lifetime to 'static to store the receiver, which is safe because
-                        // the receiver is just (&Watch, u64 counter) and will be dropped with the reader.
-                        let watch_static: &'static embassy_sync::watch::Watch<
-                            CriticalSectionRawMutex,
-                            T,
-                            WATCH_N,
-                        > = unsafe { &*(watch as *const _) };
-
-                        self.watch_receiver = watch_static.receiver();
-                        if self.watch_receiver.is_none() {
-                            return Err(DbError::BufferClosed {
-                                buffer_name: String::from("embassy watch"),
-                            });
-                        }
+            }
+            EmbassyBufferInner::Watch(watch) => {
+                if self.watch_receiver.is_none() {
+                    match make_watch_rx(watch) {
+                        Ok(rx) => self.watch_receiver = Some(rx),
+                        Err(e) => return Poll::Ready(Err(e)),
                     }
-
-                    // Use the persistent receiver to detect changes
-                    if let Some(ref mut rx) = self.watch_receiver {
-                        let value = rx.changed().await;
+                }
+                match self.watch_receiver.as_mut().unwrap().poll_changed(cx) {
+                    Poll::Ready(value) => {
                         #[cfg(feature = "metrics")]
                         self.metrics.increment_consumed();
-                        Ok(value)
-                    } else {
-                        Err(DbError::BufferClosed {
-                            buffer_name: String::from("embassy watch"),
-                        })
+                        Poll::Ready(Ok(value))
                     }
+                    Poll::Pending => Poll::Pending,
                 }
-                EmbassyBufferInner::Mailbox(channel) => {
-                    let rx = channel.receiver();
-                    let value = rx.receive().await;
+            }
+            EmbassyBufferInner::Mailbox(channel) => match channel.poll_receive(cx) {
+                Poll::Ready(value) => {
                     #[cfg(feature = "metrics")]
                     self.metrics.increment_consumed();
-                    Ok(value)
+                    Poll::Ready(Ok(value))
                 }
-            }
-        })
+                Poll::Pending => Poll::Pending,
+            },
+        }
     }
 
     fn try_recv(&mut self) -> Result<T, DbError> {
         match &*self.buffer {
             EmbassyBufferInner::SpmcRing(channel) => {
-                // Lazily create persistent subscriber (same as recv())
                 if self.spmc_subscriber.is_none() {
-                    let channel_static: &'static embassy_sync::pubsub::PubSubChannel<
-                        CriticalSectionRawMutex,
-                        T,
-                        CAP,
-                        SUBS,
-                        PUBS,
-                    > = unsafe { &*(channel as *const _) };
-                    self.spmc_subscriber = Some(
-                        channel_static.subscriber().map_err(|_| {
-                            defmt::error!(
-                                "AimDB: SpmcRing subscriber slot exhausted (max SUBS={}). \
-                                 Increase the CONSUMERS const generic on buffer_sized<CAP, CONSUMERS>. \
-                                 Count one slot per .tap(), .link_to() connector, and each transform_join input.",
-                                SUBS
-                            );
-                            DbError::BufferClosed {
-                                buffer_name: String::from("embassy spmc ring"),
-                            }
-                        })?,
-                    );
+                    self.spmc_subscriber = Some(make_spmc_sub(channel)?);
                 }
                 match self
                     .spmc_subscriber
@@ -542,18 +559,17 @@ impl<
                     None => Err(DbError::BufferEmpty),
                 }
             }
-            EmbassyBufferInner::Watch(_) => {
-                if let Some(ref mut rx) = self.watch_receiver {
-                    match rx.try_changed() {
-                        Some(value) => {
-                            #[cfg(feature = "metrics")]
-                            self.metrics.increment_consumed();
-                            Ok(value)
-                        }
-                        None => Err(DbError::BufferEmpty),
+            EmbassyBufferInner::Watch(watch) => {
+                if self.watch_receiver.is_none() {
+                    self.watch_receiver = Some(make_watch_rx(watch)?);
+                }
+                match self.watch_receiver.as_mut().unwrap().try_changed() {
+                    Some(value) => {
+                        #[cfg(feature = "metrics")]
+                        self.metrics.increment_consumed();
+                        Ok(value)
                     }
-                } else {
-                    Err(DbError::BufferEmpty)
+                    None => Err(DbError::BufferEmpty),
                 }
             }
             EmbassyBufferInner::Mailbox(channel) => match channel.try_receive() {
diff --git a/aimdb-tokio-adapter/Cargo.toml b/aimdb-tokio-adapter/Cargo.toml
index 4d2f1a81..7c5f8e94 100644
--- a/aimdb-tokio-adapter/Cargo.toml
+++ b/aimdb-tokio-adapter/Cargo.toml
@@ -18,7 +18,7 @@ default = ["std", "tokio-runtime"]
 std = ["aimdb-core/std"]
 
 # Runtime features
-tokio-runtime = ["tokio", "std"]
+tokio-runtime = ["tokio", "tokio-util", "std"]
 
 # Observability features
 tracing = ["aimdb-core/tracing", "dep:tracing"]
@@ -44,9 +44,15 @@ aimdb-core = { version = "1.1.0", path = "../aimdb-core", default-features = fal
 tokio = { workspace = true, optional = true, features = [
     "time",
     "rt-multi-thread",
-    "sync",            # For broadcast, watch, Mutex, Notify
+    "sync",            # For broadcast, watch, Mutex
 ] }
 
+# ReusableBoxFuture for the broadcast/watch readers' poll_recv (design 037 / W8):
+# one heap box per subscriber lifetime, reused for every message — zero
+# per-message allocation. broadcast/watch expose no public poll API, so the
+# reader round-trips the receiver through a stored, reused future.
+tokio-util = { version = "0.7", optional = true, default-features = false }
+
 # Observability (optional)
 tracing = { workspace = true, optional = true }
 
diff --git a/aimdb-tokio-adapter/src/buffer.rs b/aimdb-tokio-adapter/src/buffer.rs
index b9934dfe..8a39aa36 100644
--- a/aimdb-tokio-adapter/src/buffer.rs
+++ b/aimdb-tokio-adapter/src/buffer.rs
@@ -5,15 +5,21 @@
 //!
 //! - **SPMC Ring**: `tokio::sync::broadcast` for bounded multi-consumer queues
 //! - **SingleLatest**: `tokio::sync::watch` for latest-value semantics
-//! - **Mailbox**: `tokio::sync::Mutex` + `tokio::sync::Notify` for single-slot overwrite
+//! - **Mailbox**: `std::sync::Mutex` slot + a hand-rolled waker list for
+//!   single-slot overwrite (design 037 / W8 — no `Notify`, no per-message alloc)
+//!
+//! The broadcast/watch readers are poll-based ([`BufferReader::poll_recv`]) with
+//! no per-message heap allocation: each holds a [`ReusableBoxFuture`] that
+//! round-trips its receiver (these primitives expose no public poll API), so the
+//! single boxed future is allocated once per subscriber and reused per message.
 
-use std::future::Future;
-use std::pin::Pin;
 use std::sync::{Arc, Mutex as StdMutex};
+use std::task::{Context, Poll, Waker};
 
 use aimdb_core::buffer::{Buffer, BufferCfg, BufferReader};
 use aimdb_core::DbError;
-use tokio::sync::{broadcast, watch, Notify};
+use tokio::sync::{broadcast, watch};
+use tokio_util::sync::ReusableBoxFuture;
 
 #[cfg(feature = "metrics")]
 use aimdb_core::buffer::{BufferCounters, BufferMetrics, BufferMetricsSnapshot};
@@ -25,6 +31,23 @@ pub struct TokioBuffer<T: Clone + Send + Sync + 'static> {
     metrics: Arc<BufferCounters>,
 }
 
+/// Shared state for the Mailbox (single-slot overwrite) buffer.
+///
+/// Replaces the pre-W8 `Notify` with an explicit waker list beside the slot, so
+/// `poll_recv` registers a waker on `Pending` and `push` wakes them — no
+/// `Notify` permit subtleties, no per-message allocation (design 037 / W8 §6).
+///
+/// `pub` only because it appears in the `pub` [`TokioBufferReader`] reader enum;
+/// it is an implementation detail and not part of the supported API.
+#[doc(hidden)]
+pub struct MailboxState<T> {
+    /// The single value slot; a new `push` overwrites any unconsumed value.
+    slot: Option<T>,
+    /// Parked readers, woken on `push`. Deduplicated on registration and drained
+    /// on wake, so capacity stabilizes after warmup (mirrors the WASM adapter).
+    wakers: Vec<Waker>,
+}
+
 /// Internal buffer variants using Tokio primitives
 enum TokioBufferInner<T: Clone + Send + Sync + 'static> {
     Broadcast {
@@ -33,9 +56,8 @@ enum TokioBufferInner<T: Clone + Send + Sync + 'static> {
     Watch {
         tx: watch::Sender<Option<T>>,
     },
-    Notify {
-        slot: Arc<StdMutex<Option<T>>>,
-        notify: Arc<Notify>,
+    Mailbox {
+        state: Arc<StdMutex<MailboxState<T>>>,
     },
 }
 
@@ -59,9 +81,11 @@ impl<T: Clone + Send + Sync + 'static> Buffer<T> for TokioBuffer<T> {
                 let (tx, _rx) = watch::channel(None);
                 TokioBufferInner::Watch { tx }
             }
-            BufferCfg::Mailbox => TokioBufferInner::Notify {
-                slot: Arc::new(StdMutex::new(None)),
-                notify: Arc::new(Notify::new()),
+            BufferCfg::Mailbox => TokioBufferInner::Mailbox {
+                state: Arc::new(StdMutex::new(MailboxState {
+                    slot: None,
+                    wakers: Vec::new(),
+                })),
             },
         };
 
@@ -87,9 +111,15 @@ impl<T: Clone + Send + Sync + 'static> Buffer<T> for TokioBuffer<T> {
                 // before any subscriber attaches.
                 tx.send_replace(Some(value));
             }
-            TokioBufferInner::Notify { slot, notify } => {
-                *slot.lock().unwrap() = Some(value);
-                notify.notify_waiters();
+            TokioBufferInner::Mailbox { state } => {
+                let mut guard = state.lock().unwrap();
+                guard.slot = Some(value);
+                // Wake-all: spurious wakeups are benign — losers re-poll to
+                // `Pending` and re-register (design 037 §6). Drain so the list
+                // does not accumulate stale wakers.
+                for waker in guard.wakers.drain(..) {
+                    waker.wake();
+                }
             }
         }
     }
@@ -97,18 +127,19 @@ impl<T: Clone + Send + Sync + 'static> Buffer<T> for TokioBuffer<T> {
     fn subscribe(&self) -> Self::Reader {
         match &*self.inner {
             TokioBufferInner::Broadcast { tx } => TokioBufferReader::Broadcast {
-                rx: tx.subscribe(),
+                // Allocate the reusable future box once, here, capturing the
+                // freshly-subscribed receiver — reused for every message (W8).
+                recv: ReusableBoxFuture::new(broadcast_recv(tx.subscribe())),
                 #[cfg(feature = "metrics")]
                 metrics: Arc::clone(&self.metrics),
             },
             TokioBufferInner::Watch { tx } => TokioBufferReader::Watch {
-                rx: tx.subscribe(),
+                recv: ReusableBoxFuture::new(watch_recv(tx.subscribe())),
                 #[cfg(feature = "metrics")]
                 metrics: Arc::clone(&self.metrics),
             },
-            TokioBufferInner::Notify { slot, notify } => TokioBufferReader::Notify {
-                slot: Arc::clone(slot),
-                notify: Arc::clone(notify),
+            TokioBufferInner::Mailbox { state } => TokioBufferReader::Mailbox {
+                state: Arc::clone(state),
                 #[cfg(feature = "metrics")]
                 metrics: Arc::clone(&self.metrics),
             },
@@ -138,7 +169,7 @@ impl<T: Clone + Send + Sync + 'static> aimdb_core::buffer::DynBuffer<T> for Toki
             // watch::Sender::borrow() reads the slot non-destructively.
             TokioBufferInner::Watch { tx } => tx.borrow().clone(),
             // Same Mutex the Mailbox buffer already uses for the slot.
-            TokioBufferInner::Notify { slot, .. } => slot.lock().unwrap().clone(),
+            TokioBufferInner::Mailbox { state } => state.lock().unwrap().slot.clone(),
             // broadcast has no canonical latest — see design 031 §SPMC Ring.
             TokioBufferInner::Broadcast { .. } => None,
         }
@@ -176,9 +207,9 @@ impl<T: Clone + Send + Sync + 'static> BufferMetrics for TokioBuffer<T> {
                     1
                 }
             }
-            TokioBufferInner::Notify { slot, .. } => {
+            TokioBufferInner::Mailbox { state } => {
                 // Lock held only for is_some() check, released immediately.
-                if slot.lock().unwrap().is_some() {
+                if state.lock().unwrap().slot.is_some() {
                     1
                 } else {
                     0
@@ -214,7 +245,9 @@ impl<T: Clone + Send + Sync + 'static> TokioBuffer<T> {
         F: Fn(T) -> Fut + Send + Sync + 'static,
         Fut: std::future::Future<Output = ()> + Send + 'static,
     {
-        let mut reader = self.subscribe();
+        // Wrap the concrete reader in the ergonomic, allocation-free
+        // `Reader<T>` handle so `recv().await` works (design 037 / W8).
+        let mut reader = aimdb_core::buffer::Reader::new(Box::new(self.subscribe()));
 
         tokio::spawn(async move {
             loop {
@@ -258,153 +291,213 @@ impl<T: Clone + Send + Sync + 'static> TokioBuffer<T> {
     }
 }
 
+/// Output of the broadcast reader's reusable future: the `recv()` result paired
+/// with the receiver handed back so the next future can reuse it.
+type BroadcastRecvOutput<T> = (
+    Result<T, broadcast::error::RecvError>,
+    broadcast::Receiver<T>,
+);
+
+/// Output of the watch reader's reusable future: `Ok` carries a clone of the
+/// latest slot (`None` = never-written initial slot, reported as closed).
+type WatchRecvOutput<T> = (
+    Result<Option<T>, watch::error::RecvError>,
+    watch::Receiver<Option<T>>,
+);
+
+/// Await the next broadcast value, returning the receiver for reuse.
+///
+/// `broadcast::Receiver` exposes no public poll API, so the reader stores this
+/// future in a [`ReusableBoxFuture`] and round-trips the receiver through it —
+/// one allocation per subscriber, reused for every message (design 037 / W8).
+async fn broadcast_recv<T: Clone>(mut rx: broadcast::Receiver<T>) -> BroadcastRecvOutput<T> {
+    let res = rx.recv().await;
+    (res, rx)
+}
+
+/// Await the next watch change and return the receiver for reuse. The value is
+/// read with `borrow_and_update()` so a racing send is not delivered twice.
+async fn watch_recv<T: Clone>(mut rx: watch::Receiver<Option<T>>) -> WatchRecvOutput<T> {
+    let res = match rx.changed().await {
+        Ok(()) => Ok(rx.borrow_and_update().clone()),
+        Err(e) => Err(e),
+    };
+    (res, rx)
+}
+
 /// Tokio-based buffer reader
 pub enum TokioBufferReader<T: Clone + Send + Sync + 'static> {
     Broadcast {
-        rx: broadcast::Receiver<T>,
+        recv: ReusableBoxFuture<'static, BroadcastRecvOutput<T>>,
         #[cfg(feature = "metrics")]
         metrics: Arc<BufferCounters>,
     },
     Watch {
-        rx: watch::Receiver<Option<T>>,
+        recv: ReusableBoxFuture<'static, WatchRecvOutput<T>>,
         #[cfg(feature = "metrics")]
         metrics: Arc<BufferCounters>,
     },
-    Notify {
-        slot: Arc<StdMutex<Option<T>>>,
-        notify: Arc<Notify>,
+    Mailbox {
+        state: Arc<StdMutex<MailboxState<T>>>,
         #[cfg(feature = "metrics")]
         metrics: Arc<BufferCounters>,
     },
 }
 
+impl<T: Clone + Send + Sync + 'static> TokioBufferReader<T> {
+    /// Map a broadcast `recv()` result to `Result<T, DbError>`, recording consumed /
+    /// dropped metrics when enabled. Shared by `poll_recv` and `try_recv`.
+    fn map_broadcast(
+        result: Result<T, broadcast::error::RecvError>,
+        #[cfg(feature = "metrics")] metrics: &BufferCounters,
+    ) -> Result<T, DbError> {
+        match result {
+            Ok(value) => {
+                #[cfg(feature = "metrics")]
+                metrics.increment_consumed();
+                Ok(value)
+            }
+            Err(broadcast::error::RecvError::Lagged(n)) => {
+                #[cfg(feature = "metrics")]
+                metrics.add_dropped(n);
+                Err(DbError::BufferLagged {
+                    lag_count: n,
+                    buffer_name: "broadcast".to_string(),
+                })
+            }
+            Err(broadcast::error::RecvError::Closed) => Err(DbError::BufferClosed {
+                buffer_name: "broadcast".to_string(),
+            }),
+        }
+    }
+
+    /// Map a watch result to `Result<T, DbError>`; `Ok(None)` and `Err` both mean closed.
+    fn map_watch(
+        result: Result<Option<T>, watch::error::RecvError>,
+        #[cfg(feature = "metrics")] metrics: &BufferCounters,
+    ) -> Result<T, DbError> {
+        match result {
+            Ok(Some(v)) => {
+                #[cfg(feature = "metrics")]
+                metrics.increment_consumed();
+                Ok(v)
+            }
+            Ok(None) | Err(_) => Err(DbError::BufferClosed {
+                buffer_name: "watch".to_string(),
+            }),
+        }
+    }
+}
+
 impl<T: Clone + Send + Sync + 'static> BufferReader<T> for TokioBufferReader<T> {
-    fn recv(&mut self) -> Pin<Box<dyn Future<Output = Result<T, DbError>> + Send + '_>> {
-        Box::pin(async move {
-            match self {
-                TokioBufferReader::Broadcast {
-                    rx,
-                    #[cfg(feature = "metrics")]
-                    metrics,
-                } => match rx.recv().await {
-                    Ok(value) => {
+    fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>> {
+        match self {
+            TokioBufferReader::Broadcast {
+                recv,
+                #[cfg(feature = "metrics")]
+                metrics,
+            } => match recv.poll(cx) {
+                Poll::Ready((result, rx)) => {
+                    // Re-arm the reusable future with the returned receiver
+                    // before handing back the value (no allocation — same future
+                    // type reuses the box).
+                    recv.set(broadcast_recv(rx));
+                    Poll::Ready(Self::map_broadcast(
+                        result,
                         #[cfg(feature = "metrics")]
-                        metrics.increment_consumed();
-                        Ok(value)
-                    }
-                    Err(broadcast::error::RecvError::Lagged(n)) => {
+                        metrics,
+                    ))
+                }
+                Poll::Pending => Poll::Pending,
+            },
+            TokioBufferReader::Watch {
+                recv,
+                #[cfg(feature = "metrics")]
+                metrics,
+            } => match recv.poll(cx) {
+                Poll::Ready((result, rx)) => {
+                    recv.set(watch_recv(rx));
+                    Poll::Ready(Self::map_watch(
+                        result,
                         #[cfg(feature = "metrics")]
-                        metrics.add_dropped(n);
-                        Err(DbError::BufferLagged {
-                            lag_count: n,
-                            buffer_name: "broadcast".to_string(),
-                        })
-                    }
-                    Err(broadcast::error::RecvError::Closed) => Err(DbError::BufferClosed {
-                        buffer_name: "broadcast".to_string(),
-                    }),
-                },
-                TokioBufferReader::Watch {
-                    rx,
-                    #[cfg(feature = "metrics")]
-                    metrics,
-                } => {
-                    rx.changed().await.map_err(|_| DbError::BufferClosed {
-                        buffer_name: "watch".to_string(),
-                    })?;
-
-                    let value = rx.borrow().clone();
-                    match value {
-                        Some(v) => {
-                            #[cfg(feature = "metrics")]
-                            metrics.increment_consumed();
-                            Ok(v)
-                        }
-                        None => Err(DbError::BufferClosed {
-                            buffer_name: "watch".to_string(),
-                        }),
-                    }
+                        metrics,
+                    ))
                 }
-                TokioBufferReader::Notify {
-                    slot,
-                    notify,
+                Poll::Pending => Poll::Pending,
+            },
+            TokioBufferReader::Mailbox {
+                state,
+                #[cfg(feature = "metrics")]
+                metrics,
+            } => {
+                let mut guard = state.lock().unwrap();
+                if let Some(value) = guard.slot.take() {
                     #[cfg(feature = "metrics")]
-                    metrics,
-                } => {
-                    loop {
-                        // Check if there's already a value
-                        {
-                            let mut guard = slot.lock().unwrap();
-                            if let Some(value) = guard.take() {
-                                #[cfg(feature = "metrics")]
-                                metrics.increment_consumed();
-                                return Ok(value);
-                            }
-                        }
-                        // No value, wait for notification
-                        notify.notified().await;
+                    metrics.increment_consumed();
+                    Poll::Ready(Ok(value))
+                } else {
+                    // Register the waker (dedup so repeated polls without a push
+                    // don't grow the list — mirrors the WASM adapter).
+                    if !guard.wakers.iter().any(|w| w.will_wake(cx.waker())) {
+                        guard.wakers.push(cx.waker().clone());
                     }
+                    Poll::Pending
                 }
             }
-        })
+        }
     }
 
     fn try_recv(&mut self) -> Result<T, DbError> {
         match self {
+            // `broadcast`/`watch` have no public poll API, and their receivers
+            // live inside the reusable future. Poll that future with a no-op
+            // waker: `Ready` means a value/error is available now (try-recv
+            // semantics); `Pending` means empty. On `Ready`, re-arm the future.
             TokioBufferReader::Broadcast {
-                rx,
+                recv,
                 #[cfg(feature = "metrics")]
                 metrics,
-            } => match rx.try_recv() {
-                Ok(value) => {
-                    #[cfg(feature = "metrics")]
-                    metrics.increment_consumed();
-                    Ok(value)
-                }
-                Err(broadcast::error::TryRecvError::Empty) => Err(DbError::BufferEmpty),
-                Err(broadcast::error::TryRecvError::Lagged(n)) => {
-                    #[cfg(feature = "metrics")]
-                    metrics.add_dropped(n);
-                    Err(DbError::BufferLagged {
-                        lag_count: n,
-                        buffer_name: "broadcast".to_string(),
-                    })
+            } => {
+                let waker = Waker::noop();
+                let mut cx = Context::from_waker(waker);
+                match recv.poll(&mut cx) {
+                    Poll::Ready((result, rx)) => {
+                        recv.set(broadcast_recv(rx));
+                        Self::map_broadcast(
+                            result,
+                            #[cfg(feature = "metrics")]
+                            metrics,
+                        )
+                    }
+                    Poll::Pending => Err(DbError::BufferEmpty),
                 }
-                Err(broadcast::error::TryRecvError::Closed) => Err(DbError::BufferClosed {
-                    buffer_name: "broadcast".to_string(),
-                }),
-            },
+            }
             TokioBufferReader::Watch {
-                rx,
+                recv,
                 #[cfg(feature = "metrics")]
                 metrics,
-            } => match rx.has_changed() {
-                Err(_) => Err(DbError::BufferClosed {
-                    buffer_name: "watch".to_string(),
-                }),
-                Ok(false) => Err(DbError::BufferEmpty),
-                Ok(true) => {
-                    let val = rx.borrow_and_update().clone();
-                    match val {
-                        Some(v) => {
+            } => {
+                let waker = Waker::noop();
+                let mut cx = Context::from_waker(waker);
+                match recv.poll(&mut cx) {
+                    Poll::Ready((result, rx)) => {
+                        recv.set(watch_recv(rx));
+                        Self::map_watch(
+                            result,
                             #[cfg(feature = "metrics")]
-                            metrics.increment_consumed();
-                            Ok(v)
-                        }
-                        None => Err(DbError::BufferClosed {
-                            buffer_name: "watch".to_string(),
-                        }),
+                            metrics,
+                        )
                     }
+                    Poll::Pending => Err(DbError::BufferEmpty),
                 }
-            },
-            TokioBufferReader::Notify {
-                slot,
-                notify: _,
+            }
+            TokioBufferReader::Mailbox {
+                state,
                 #[cfg(feature = "metrics")]
                 metrics,
             } => {
-                let mut guard = slot.lock().unwrap();
-                match guard.take() {
+                let mut guard = state.lock().unwrap();
+                match guard.slot.take() {
                     Some(val) => {
                         #[cfg(feature = "metrics")]
                         metrics.increment_consumed();
@@ -420,12 +513,19 @@ impl<T: Clone + Send + Sync + 'static> BufferReader<T> for TokioBufferReader<T>
 #[cfg(test)]
 mod tests {
     use super::*;
+    use aimdb_core::buffer::Reader;
+
+    /// Wrap a concrete `TokioBufferReader` in the ergonomic `Reader<T>` so the
+    /// tests can keep exercising `recv().await` / `try_recv()` (design 037 / W8).
+    fn rdr<T: Clone + Send + Sync + 'static>(buffer: &TokioBuffer<T>) -> Reader<T> {
+        Reader::new(Box::new(buffer.subscribe()))
+    }
 
     #[tokio::test]
     async fn test_spmc_ring_basic() {
         let cfg = BufferCfg::SpmcRing { capacity: 10 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
         buffer.push(42);
         assert_eq!(reader.recv().await.unwrap(), 42);
     }
@@ -434,8 +534,8 @@ mod tests {
     async fn test_spmc_ring_multiple_consumers() {
         let cfg = BufferCfg::SpmcRing { capacity: 10 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader1 = buffer.subscribe();
-        let mut reader2 = buffer.subscribe();
+        let mut reader1 = rdr(&buffer);
+        let mut reader2 = rdr(&buffer);
         buffer.push(1);
         buffer.push(2);
         assert_eq!(reader1.recv().await.unwrap(), 1);
@@ -448,7 +548,7 @@ mod tests {
     async fn test_single_latest_basic() {
         let cfg = BufferCfg::SingleLatest;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
         buffer.push(42);
         assert_eq!(reader.recv().await.unwrap(), 42);
     }
@@ -457,7 +557,7 @@ mod tests {
     async fn test_single_latest_skip_intermediate() {
         let cfg = BufferCfg::SingleLatest;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
         buffer.push(1);
         buffer.push(2);
         buffer.push(3);
@@ -468,7 +568,7 @@ mod tests {
     async fn test_mailbox_basic() {
         let cfg = BufferCfg::Mailbox;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
         buffer.push(42);
         assert_eq!(reader.recv().await.unwrap(), 42);
     }
@@ -477,7 +577,7 @@ mod tests {
     async fn test_mailbox_overwrite() {
         let cfg = BufferCfg::Mailbox;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
         buffer.push(1);
         buffer.push(2);
         assert_eq!(reader.recv().await.unwrap(), 2);
@@ -564,7 +664,7 @@ mod tests {
         let cfg = BufferCfg::SpmcRing { capacity: 3 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
 
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Send more messages than capacity without reading
         // This will cause the slow reader to lag
@@ -607,9 +707,9 @@ mod tests {
         let buffer = TokioBuffer::<i32>::new(&cfg);
 
         // Create three independent readers
-        let mut reader1 = buffer.subscribe();
-        let mut reader2 = buffer.subscribe();
-        let mut reader3 = buffer.subscribe();
+        let mut reader1 = rdr(&buffer);
+        let mut reader2 = rdr(&buffer);
+        let mut reader3 = rdr(&buffer);
 
         // Send values
         for i in 0..5 {
@@ -631,7 +731,7 @@ mod tests {
         let cfg = BufferCfg::SingleLatest;
         let buffer = TokioBuffer::<i32>::new(&cfg);
 
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Send multiple values rapidly
         buffer.push(1);
@@ -659,8 +759,8 @@ mod tests {
         let buffer = TokioBuffer::<i32>::new(&cfg);
 
         // Create readers BEFORE sending values
-        let mut reader1 = buffer.subscribe();
-        let mut reader2 = buffer.subscribe();
+        let mut reader1 = rdr(&buffer);
+        let mut reader2 = rdr(&buffer);
 
         // Send values
         buffer.push(10);
@@ -681,7 +781,7 @@ mod tests {
         let cfg = BufferCfg::Mailbox;
         let buffer = TokioBuffer::<i32>::new(&cfg);
 
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Send first value
         buffer.push(1);
@@ -709,7 +809,7 @@ mod tests {
         let cfg = BufferCfg::Mailbox;
         let buffer = TokioBuffer::<i32>::new(&cfg);
 
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Send and immediately read
         buffer.push(10);
@@ -854,7 +954,7 @@ mod tests {
     async fn test_try_recv_broadcast_empty() {
         let cfg = BufferCfg::SpmcRing { capacity: 16 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // No values written — try_recv returns Empty
         assert!(matches!(reader.try_recv(), Err(DbError::BufferEmpty)));
@@ -864,7 +964,7 @@ mod tests {
     async fn test_try_recv_broadcast_single_value() {
         let cfg = BufferCfg::SpmcRing { capacity: 16 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         buffer.push(42);
         assert_eq!(reader.try_recv().unwrap(), 42);
@@ -877,7 +977,7 @@ mod tests {
     async fn test_try_recv_broadcast_drains_all_pending() {
         let cfg = BufferCfg::SpmcRing { capacity: 16 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Write 5 values
         for i in 0..5 {
@@ -901,7 +1001,7 @@ mod tests {
     async fn test_try_recv_broadcast_handles_lag() {
         let cfg = BufferCfg::SpmcRing { capacity: 4 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Write 10 values into capacity-4 ring — reader falls behind
         for i in 0..10 {
@@ -936,7 +1036,7 @@ mod tests {
     async fn test_try_recv_watch_empty() {
         let cfg = BufferCfg::SingleLatest;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // No values written — try_recv returns Empty
         assert!(matches!(reader.try_recv(), Err(DbError::BufferEmpty)));
@@ -946,7 +1046,7 @@ mod tests {
     async fn test_try_recv_watch_returns_latest() {
         let cfg = BufferCfg::SingleLatest;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         buffer.push(1);
         buffer.push(2);
@@ -968,7 +1068,7 @@ mod tests {
     async fn test_try_recv_mailbox_empty() {
         let cfg = BufferCfg::Mailbox;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // No values written — try_recv returns Empty
         assert!(matches!(reader.try_recv(), Err(DbError::BufferEmpty)));
@@ -978,7 +1078,7 @@ mod tests {
     async fn test_try_recv_mailbox_takes_value() {
         let cfg = BufferCfg::Mailbox;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         buffer.push(1);
         buffer.push(2); // overwrites
@@ -999,7 +1099,7 @@ mod tests {
     async fn test_try_recv_interleaved_push_and_drain() {
         let cfg = BufferCfg::SpmcRing { capacity: 16 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Push 3, drain all
         buffer.push(1);
@@ -1038,8 +1138,8 @@ mod tests {
     async fn test_try_recv_multiple_independent_readers() {
         let cfg = BufferCfg::SpmcRing { capacity: 16 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader_a = buffer.subscribe();
-        let mut reader_b = buffer.subscribe();
+        let mut reader_a = rdr(&buffer);
+        let mut reader_b = rdr(&buffer);
 
         // Push values
         for i in 0..5 {
@@ -1075,7 +1175,7 @@ mod tests {
     async fn test_try_recv_after_async_recv() {
         let cfg = BufferCfg::SpmcRing { capacity: 16 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Push 3 values
         buffer.push(10);
@@ -1104,6 +1204,7 @@ mod tests {
 
     mod peek_tests {
         use super::super::*;
+        use super::rdr;
         use aimdb_core::buffer::DynBuffer;
 
         #[tokio::test]
@@ -1127,7 +1228,7 @@ mod tests {
             // Subscribe BEFORE push so the receiver's version counter advances
             // on send_replace. (Watch receivers created after a push will only
             // wake on the *next* push — that's the gap peek() exists to fill.)
-            let mut reader = Buffer::subscribe(&buffer);
+            let mut reader = rdr(&buffer);
             DynBuffer::push(&buffer, 42);
 
             // Multiple peeks return the same value.
@@ -1169,7 +1270,7 @@ mod tests {
             DynBuffer::push(&buffer, 99);
             assert_eq!(buffer.peek(), Some(99));
             // Subscriber takes the slot.
-            let mut reader = Buffer::subscribe(&buffer);
+            let mut reader = rdr(&buffer);
             assert_eq!(reader.recv().await.unwrap(), 99);
             // After take(), peek sees the slot is empty.
             assert_eq!(buffer.peek(), None);
@@ -1228,7 +1329,7 @@ mod tests {
         async fn test_spmc_ring_consumed_count() {
             let cfg = BufferCfg::SpmcRing { capacity: 10 };
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             // Push and consume
             buffer.push(1);
@@ -1247,7 +1348,7 @@ mod tests {
         async fn test_spmc_ring_dropped_count_on_lag() {
             let cfg = BufferCfg::SpmcRing { capacity: 3 };
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             // Overfill buffer to cause lag
             for i in 0..10 {
@@ -1271,7 +1372,7 @@ mod tests {
         async fn test_metrics_reset() {
             let cfg = BufferCfg::SpmcRing { capacity: 10 };
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             // Generate some metrics
             buffer.push(1);
@@ -1295,7 +1396,7 @@ mod tests {
         async fn test_watch_buffer_metrics() {
             let cfg = BufferCfg::SingleLatest;
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             buffer.push(1);
             buffer.push(2);
@@ -1314,7 +1415,7 @@ mod tests {
         async fn test_mailbox_buffer_metrics() {
             let cfg = BufferCfg::Mailbox;
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             buffer.push(1);
             let _ = reader.recv().await.unwrap();
@@ -1350,7 +1451,7 @@ mod tests {
         async fn test_try_recv_tracks_consumed_metrics() {
             let cfg = BufferCfg::SpmcRing { capacity: 10 };
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             // Push and try_recv
             buffer.push(1);
@@ -1372,7 +1473,7 @@ mod tests {
         async fn test_try_recv_tracks_dropped_on_lag() {
             let cfg = BufferCfg::SpmcRing { capacity: 3 };
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             // Overfill to cause lag
             for i in 0..10 {
diff --git a/aimdb-wasm-adapter/src/buffer.rs b/aimdb-wasm-adapter/src/buffer.rs
index 45d01102..46dabd10 100644
--- a/aimdb-wasm-adapter/src/buffer.rs
+++ b/aimdb-wasm-adapter/src/buffer.rs
@@ -20,8 +20,6 @@ use alloc::collections::VecDeque;
 use alloc::rc::Rc;
 use alloc::vec::Vec;
 use core::cell::{Cell, RefCell};
-use core::future::Future;
-use core::pin::Pin;
 use core::task::{Context, Poll, Waker};
 
 use aimdb_core::buffer::{Buffer, BufferCfg, BufferReader, DynBuffer};
@@ -206,8 +204,38 @@ enum ReaderState {
 }
 
 impl<T: Clone + Send + 'static> BufferReader<T> for WasmBufferReader<T> {
-    fn recv(&mut self) -> Pin<Box<dyn Future<Output = Result<T, DbError>> + Send + '_>> {
-        Box::pin(WasmRecvFuture { reader: self })
+    /// Poll for the next value (design 037 / W8).
+    ///
+    /// On each poll:
+    /// 1. Try to read a value with `try_recv()` (non-blocking).
+    /// 2. If it yields a value, or any error other than `BufferEmpty`, return that as `Poll::Ready`.
+    /// 3. On `BufferEmpty`, store the task's waker (unless an equivalent one is stored) and return `Poll::Pending`.
+    ///
+    /// The waker is woken when [`WasmBuffer::push()`](WasmBuffer) fires. No future is
+    /// boxed per call — the pre-W8 `Box::pin(WasmRecvFuture { .. })` existed
+    /// solely to satisfy the old async trait signature.
+    fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>> {
+        // Non-blocking attempt first; only an empty buffer leads to waker registration.
+        match self.try_recv() {
+            Ok(value) => Poll::Ready(Ok(value)),
+            Err(e @ DbError::BufferLagged { .. }) => Poll::Ready(Err(e)),
+            Err(DbError::BufferEmpty) => {
+                // Nothing to read: store this task's waker in the buffer's shared waker list.
+                let mut inner = self.buffer.borrow_mut();
+                let wakers = match &mut *inner {
+                    WasmBufferInner::SpmcRing { wakers, .. } => wakers,
+                    WasmBufferInner::SingleLatest { wakers, .. } => wakers,
+                    WasmBufferInner::Mailbox { wakers, .. } => wakers,
+                };
+                // Deduplicate: skip the push when a stored waker already wakes this task.
+                // Otherwise the list would grow on every repeated poll of the same reader.
+                if !wakers.iter().any(|w| w.will_wake(cx.waker())) {
+                    wakers.push(cx.waker().clone());
+                }
+                Poll::Pending
+            }
+            Err(e) => Poll::Ready(Err(e)), // every other error is surfaced to the caller immediately
+        }
     }
 
     fn try_recv(&mut self) -> Result<T, DbError> {
@@ -264,55 +292,6 @@ impl<T: Clone + Send + 'static> BufferReader<T> for WasmBufferReader<T> {
     }
 }
 
-// ============================================================================
-// Async recv future
-// ============================================================================
-
-/// Future returned by `WasmBufferReader::recv()`.
-///
-/// On each poll:
-/// 1. Try to read a value (non-blocking).
-/// 2. If available, return `Poll::Ready(Ok(value))`.
-/// 3. If not, register the waker and return `Poll::Pending`.
-///
-/// The waker is woken when `WasmBuffer::push()` fires.
-struct WasmRecvFuture<'a, T> {
-    reader: &'a mut WasmBufferReader<T>,
-}
-
-// SAFETY: wasm32 is single-threaded
-unsafe impl<T> Send for WasmRecvFuture<'_, T> {}
-
-impl<T: Clone + Send + 'static> Future for WasmRecvFuture<'_, T> {
-    type Output = Result<T, DbError>;
-
-    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
-        let this = self.get_mut();
-
-        // Try non-blocking read first
-        match this.reader.try_recv() {
-            Ok(value) => Poll::Ready(Ok(value)),
-            Err(e @ DbError::BufferLagged { .. }) => Poll::Ready(Err(e)),
-            Err(DbError::BufferEmpty) => {
-                // Register waker so we get woken on next push
-                let mut inner = this.reader.buffer.borrow_mut();
-                let wakers = match &mut *inner {
-                    WasmBufferInner::SpmcRing { wakers, .. } => wakers,
-                    WasmBufferInner::SingleLatest { wakers, .. } => wakers,
-                    WasmBufferInner::Mailbox { wakers, .. } => wakers,
-                };
-                // Deduplicate: only add if no existing waker will wake the same task.
-                // Prevents unbounded growth when a single reader is polled repeatedly.
-                if !wakers.iter().any(|w| w.will_wake(cx.waker())) {
-                    wakers.push(cx.waker().clone());
-                }
-                Poll::Pending
-            }
-            Err(e) => Poll::Ready(Err(e)),
-        }
-    }
-}
-
 // ============================================================================
 // Helpers
 // ============================================================================
diff --git a/docs/design/037-zero-alloc-consume-path.md b/docs/design/037-zero-alloc-consume-path.md
new file mode 100644
index 00000000..cebb636e
--- /dev/null
+++ b/docs/design/037-zero-alloc-consume-path.md
@@ -0,0 +1,145 @@
+# 037 — Zero-Allocation Consume Path: Poll-Based `BufferReader` SPI (W8)
+
+**Status:** Implemented 2026-06-20 (pending review), stacked on the `aimdb-bench` crate (design 038). Builds on the 034/035/036 review cycle and the #141–#147 series. **Must land inside the currently-open breaking window** (same release as the W1/W2 SPI breaks) or it waits for the next major. Host B0–B2 measured (§9); B3 on-target and embassy-host B0 are follow-ups.
+
+---
+
+## 1. Where this sits
+
+036 W1 (PR #141) removed the per-message `dyn Any` erasure from the connector, session-pump, and AimX paths. Its acceptance criterion was a `dyn Any` grep — which structurally cannot see boxed *futures*. Post-W1, exactly one AimDB-added per-message heap allocation remains: the `Pin<Box<dyn Future>>` constructed on every `recv()`.
+
+The asymmetry is now stark:
+
+| Direction | State after #141–#147 |
+|---|---|
+| Write path | Solved by 029: sync `push`, pre-bound handle, one vtable call, zero alloc |
+| Inbound (connector → record) | Solved by W1: fused **sync** typed ingest closure, zero AimDB-added alloc |
+| Consume path (record → consumer / connector / remote) | **One heap allocation per message, per reader** — this doc |
+
+W8 closes the loop. End state: **zero AimDB-added heap allocations per message, end to end; no locks held across `await`; abstraction cost = one indirect call — enforced in CI, measured in `benches/`.**
+
+## 2. Current state (verified, pr147 HEAD)
+
+| Path | Mechanism | Per-message cost |
+|---|---|---|
+| In-process consumer | [`BufferReader::recv` → `Pin<Box<dyn Future>>`](../../aimdb-core/src/buffer/traits.rs#L142); `Box::pin` at [tokio `buffer.rs:283`](../../aimdb-tokio-adapter/src/buffer.rs#L283), [embassy `buffer.rs:416`](../../aimdb-embassy-adapter/src/buffer.rs#L416), [wasm `buffer.rs:210`](../../aimdb-wasm-adapter/src/buffer.rs#L210) | 1 heap box + 1 indirect call |
+| Outbound connector | [`SerializedSource::recv` → `RecvSerializedFuture`](../../aimdb-core/src/connector.rs#L101) (W1 removed the value-level `Box<dyn Any>`; the future box stayed per the 036 W1 risk note "manual boxed-future pattern — keep it") | 1 heap box |
+| Remote access / JSON | [`recv_json`](../../aimdb-core/src/buffer/traits.rs#L177) boxes around [`JsonReaderAdapter`](../../aimdb-core/src/typed_record.rs#L137)'s inner `recv()`, which boxes again | 2 heap boxes |
+| Inbound | fused sync ingest (W1) | 0 |
+| Produce | [`WriteHandle::push`](../../aimdb-core/src/buffer/traits.rs) sync | 0 (1 indirect call) |
+
+Notable: the WASM adapter already owns a hand-rolled poll struct — `Box::pin(WasmRecvFuture { reader: self })`. The allocation exists **only to satisfy the trait signature**. The signature is the problem, not the implementations.
+
+Out of scope, recorded in §7: the latest-snapshot [`spin::Mutex`](../../aimdb-core/src/typed_record.rs#L36) on the produce path.
+
+## 3. Approach
+
+### 3.1 The SPI change
+
+Object safety and `async fn` conflict; object safety and `poll` do not. Replace the async method on the erased trait with its poll form:
+
+```rust
+pub trait BufferReader<T: Clone + Send>: Send {
+    /// Poll for the next value. Registers `cx.waker()` when `Pending`.
+    fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>>;
+
+    fn try_recv(&mut self) -> Result<T, DbError>; // unchanged
+}
+```
+
+The **consumer-facing API does not move**. `Reader<T>::recv()` remains `async`, implemented once:
+
+```rust
+pub async fn recv(&mut self) -> Result<T, DbError> {
+    core::future::poll_fn(|cx| self.inner.poll_recv(cx)).await
+}
+```
+
+`core::future::poll_fn` is stable (1.64) and lives in `core` — `no_std`-clean, zero allocation, no `unsafe`. Call sites in examples and aimdb-pro compile unmodified. Only `BufferReader` *implementors* break: an SPI break, in the window that #131/#135/#141 already opened.
+
+### 3.2 Per adapter (in implementation order)
+
+1. **WASM — unbox what already exists.** `WasmRecvFuture`'s poll body *becomes* `poll_recv`; delete the `Box::pin`. Smallest diff; do it first to validate the trait shape.
+2. **Tokio Mailbox.** Currently `Mutex<slot>` + `Notify` ([`buffer.rs:8`](../../aimdb-tokio-adapter/src/buffer.rs#L8)). Replace `Notify` with waker storage beside the slot — single-slot take semantics is the textbook poll pattern. Drops the `Notify` permit subtleties entirely. Waker contract: see §6.
+3. **Embassy. (As built.)** `Channel` exposes `poll_receive(&self, cx)` natively → Mailbox maps directly, zero stored state, no `unsafe`. But the **pubsub `Subscriber` and watch `Receiver` had no lag-preserving public poll**: `Subscriber: Stream` exists but its `poll_next` silently drops `WaitResult::Lagged`, and the watch `Receiver`'s `poll_changed` sits on a *sealed* trait (not callable downstream). The poll-native methods exist internally (the async `next_message()`/`changed()` are built on them) but are deliberately private. Rather than hand-roll a `no_std` `ReusableBoxFuture` (which added ~80 lines of raw-pointer `unsafe` to the embedded adapter — the initial W8 cut did this), we added two **small, additive public wrappers** to the vendored `embassy-sync` and drive them directly: `Subscriber::poll_next_message(&mut self, cx) -> Poll<WaitResult<T>>` and `watch::Receiver::poll_changed(&mut self, cx) -> Poll<T>`. The reader stores the subscriber/receiver and polls them each call — zero per-message alloc, no future box, **no new `unsafe`** (only the pre-existing `'static` borrow extension, unchanged from pre-W8). The change is on a branch in the embassy submodule (`aimdb/public-poll-methods`) with an **upstream PR pending** — the parity argument is that `Channel::poll_receive` is already public, so this just gives pubsub/watch the matching method. See [tmp decision doc](tmp-embassy-poll-recv-unsafe-options.md).
+4. **Tokio Broadcast — the one residue.** `broadcast::Receiver` exposes no public poll API. Use the `BroadcastStream` technique: a `tokio_util::sync::ReusableBoxFuture` owned by the reader — **one allocation per subscriber lifetime, reused for every message**. This is a Tokio API limitation, not an AimDB design cost; documented as such.
+
+### 3.3 Fused and remote paths inherit it
+
+`SerializedSource` composes over `poll_recv` (subscribe → poll → serialize stays fused inside the registration closure); `RecvSerializedFuture` either becomes a poll method or keeps its async wrapper over the inner poll — either way the inner box is gone. `JsonBufferReader` collapses the same way; the remote-access **double** box disappears with no separate work item.
+
+## 4. Measurement program — the centerpiece (lands *with* the change, not after)
+
+W8 exists to make a claim provable; the proof ships in the same series. These benches instantiate the L0/L1 layers of the benchmark-pyramid plan for the consume path; the three canonical workload profiles — **Telemetry/`SpmcRing`**, **State/`SingleLatest`**, **Command/`Mailbox`** — are the unit of reporting throughout. New workspace member `benches/aimdb-bench` (criterion; host-only; dev-deps fenced from the `no_std` graph).
+
+**B0 — Allocation count (hard gate + headline number).** Counting `#[global_allocator]` wrapper in dedicated host test binaries — the W6 `host_test_stubs!` infrastructure is the natural home for the embassy-host build. Protocol: set up producer + subscribers; warmup absorbs one-time setup (including the tokio-broadcast `ReusableBoxFuture`); snapshot the counter; push/consume N = 10 000; assert **and report** allocations/message. Expected: **1 → 0**, per buffer profile × {tokio, embassy-host}. Doubles as the CI gate (§5) and the first table row of any publication. WASM: covered by the unboxed `WasmRecvFuture` unit tests; alloc-counting under wasm32 in CI is not worth the harness (justified per the #146 compile-delete-or-justify rule).
+
+**B1 — Leaf latency (L0).** Single-message publish→`recv`-return, ping-pong, **three-way per profile**: raw primitive (`broadcast` / `watch` / embassy `Channel` on host) vs AimDB-before vs AimDB-after. Tokio current-thread runtime, criterion `async_executor`, pinned core; report p50/p99 (criterion medians, never means). Two defined deltas, used consistently everywhere downstream: (after − raw) = **abstraction cost** — the README number; (before − after) = **what W8 bought** — the publication number.
+
+**B2 — Steady-state throughput (L1).** msgs/sec at saturation, SPSC and 1→4 fan-out per profile. Fan-out deliberately exposes the `T: Clone` per-consumer copy cost — measured and reported, not hidden. Same three-way comparison as B1.
+
+**B3 — On-target (Cortex-M).** The W3 KNX hardware rig already exists; reuse it. DWT `CYCCNT` around `recv` on the STM32H5, defmt-reported, N = 10 000: **cycles/message** before/after, plus the embedded-alloc **heap high-water mark** over the run — the fragmentation story no host bench can tell, and the number that makes the claim credible to the embedded audience. Ships as a flashable example with documented rig setup.
+
+**Methodology constraints (so the numbers survive review).** Current-thread executors and pinned cores isolate leaf cost from scheduler noise; warmup excluded from samples; rustc version, criterion configuration, host CPU, and B3 rig recorded in §9 alongside the results; results are explicitly microbenchmark scope — **no system-throughput claims are derived from them**. Reproduction is one command (`cargo bench -p aimdb-bench`) against the committed lockfile.
+
+**CI.** B0 is a required gate on PRs touching `aimdb-core/src/buffer/`, the adapter `buffer.rs` files, or `connector.rs`. B1/B2 run on the same trigger but report as trend only — CI runners are too noisy to gate on nanoseconds. B3 is a release-checklist item, not CI.
+
+## 5. Acceptance criteria
+
+- [x] `grep -rn "Box::pin" aimdb-{tokio,embassy,wasm}-adapter/src/buffer.rs aimdb-core/src/buffer/ aimdb-core/src/connector.rs` → zero hits on per-message construction sites. (Remaining hits are doc comments only: the wasm reader's note about the *removed* box, and the connector.rs BYOC doc example. The reused futures live in `ReusableBoxFuture::new` at subscribe time (tokio broadcast/watch) — per-subscriber, not per-message. The embassy adapter needs no reused future: the hand-rolled `no_std` equivalent from the initial cut was deleted in favour of polling the stored subscriber/receiver directly, see §3.2 step 3 and §6.)
+- [x] `BufferReader::recv`/`JsonBufferReader::recv_json` (boxed-future form) deleted; `poll_recv`/`poll_recv_json` object-safe; `thumbv7em-none-eabihf` and `wasm32-unknown-unknown` builds + clippy green.
+- [x] **B0: 0 allocations/message** across the 3 tokio buffer profiles (was 1). Committed to `aimdb-bench/data/baselines/b0_alloc_tokio.json`. CI wiring is **advisory (report-only) per design 038 §6**; a hard gate and the embassy-host adapter B0 are the documented follow-up.
+- [x] Examples and aimdb-pro compile with **no call-site changes** (`subscribe().recv().await` unchanged; `Consumer`/`AimDb`/`TypedRecord::subscribe` now return `Reader<T>`). Concrete-reader holders wrap once via `Reader::new(Box::new(..))`.
+- [x] SPI break recorded in `aimdb-core` CHANGELOG; ships in the same release as the W1/W2 breaks.
+- [x] §9 populated for B0–B2 (host), rustc + criterion recorded. **B3 (on-target, STM32H5) deferred** to a hardware session per §8/§4.
+
+## 6. Risk notes
+
+- **Cancellation semantics on tokio broadcast.** The in-flight inner `Recv` future now persists across caller polls (owned `ReusableBoxFuture`) instead of being dropped per call. Strictly fewer lost-wakeup hazards; behavioral note: a value claimed by the inner future just before caller cancellation is delivered on the *next* poll rather than dropped — consistent with broadcast cursor semantics. Document on the reader.
+- **Lagged mapping** unchanged: `Lagged(n)` → `DbError::BufferLagged(n)`; pinned by existing adapter tests.
+- **Mailbox waker contract.** Single-slot take semantics with potentially multiple readers: store wakers, wake-**all** on push (spurious wakeups are benign; losers re-poll to `Pending`). Wake-one is an optimization with a starvation analysis attached — not now.
+- **Auto-traits.** The `poll_fn` future is `Send` iff the reader is; verify at the session-pump and connector spawn sites (compiler enforces; listed so the error is expected, not surprising).
+- **Embassy poll surface** — RESOLVED. The watch `Receiver` and pubsub `Subscriber` had no lag-preserving public poll (sealed plumbing + a lossy `Stream`). Resolved by adding public `poll_changed` / `poll_next_message` wrappers to the vendored `embassy-sync` (§3.2 step 3), upstream PR pending — *not* by the hand-rolled `no_std` `ReusableBoxFuture` the initial cut used, which is deleted along with its `unsafe`.
+
+## 7. Dormant items (trigger-only)
+
+| Item | Decision | Re-open trigger |
+|---|---|---|
+| Generic fast lane `Reader<T, B>` / `Producer<T, B>` (default type param keeps `Reader<T>` = boxed lane; adapter seam already exists: [`subscribe() -> Self::Reader`](../../aimdb-tokio-adapter/src/buffer.rs#L97) vs [`subscribe_boxed()`](../../aimdb-tokio-adapter/src/buffer.rs#L128)) | Not shipped. Post-W8 the dyn lane costs one indirect call and zero allocs; monomorphization stamps `T×B` copies against the ~50 KB flash budget | §9 numbers (B1/B3) or an MCU flame graph show the per-message indirect call as a measurable fraction of a real workload's budget |
+| Latest-snapshot `spin::Mutex` → atomic ptr swap (portable-atomic) | Keep — bounded, tiny critical section on produce only | Producer-side profiling shows it, or a hard "no spinlocks" claim is wanted for marketing |
+
+## 8. Sequencing and size
+
+1. **B0–B2 scaffolding first**; baseline numbers on current HEAD committed (the "before" columns). B3 baseline captured on the W3 rig — fold into the next scheduled hardware session.
+2. WASM unbox → Tokio Mailbox waker rewrite → Embassy `poll_receive` mapping → Tokio broadcast `ReusableBoxFuture`.
+3. Flip the trait, delete `recv()`, migrate `SerializedSource` + JSON paths, fix fallout.
+4. Gates into CI; populate §9 after-columns; status row in 036 §5 pointing here; README/website wording PR ("zero allocations per message, end to end; overhead: one indirect call, X ns") in the same series as the after-numbers — never before.
+
+**Size:** M — wide but mechanical (three adapters + two fused paths). The only design-sensitive piece is the Mailbox waker contract (§6).
+
+## 9. Results (populated by §4)
+
+**Environment:** rustc `1.91.1` · criterion `0.5` · host CPU `dev container (shared, noisy — host B1/B2 are indicative trend only, not gated)` · B3: STM32H5 @ `—` MHz, embassy-executor `—`, embedded-alloc `—` (deferred follow-up).
+
+**Host (B0–B2), measured 2026-06-20 via `aimdb-bench`.** B0 is the gate and headline; B1/B2 are after-W8 medians on a shared container (the three-way raw/before split and embassy-host B0 are deferred to the embassy follow-up per scope).
+
+| Profile | allocs/msg (B0) before → **after** | p50 (B1) after | msgs/s (B2) after |
+|---|---|---|---|
+| Telemetry — Tokio SpmcRing (`broadcast`) | 1 → **0** | ~195 ns | ~5.2 M/s |
+| State — Tokio SingleLatest (`watch`) | 1 → **0** | ~446 ns | ~2.5 M/s |
+| Command — Tokio Mailbox | 1 → **0** | ~70 ns | ~13.6 M/s |
+| Telemetry — Embassy Channel (host) | deferred (embassy-host B0 follow-up) | — | — |
+
+**Headline:** the last AimDB-added per-message heap allocation on the in-process consume path is removed — **1 → 0 allocs/msg** on every tokio buffer profile, byte total 0 in the measured window. Abstraction cost is one indirect call per `recv` (the boxed `Reader<T>` lane). Bytes/msg before-W8 was 144 B (the `Box::pin(async move …)` future) → 0 B after.
+
+**On-target (B3, STM32H5):**
+
+| Metric | Before | After |
+|---|---|---|
+| cycles/message (DWT `CYCCNT`) | — | — |
+| heap high-water mark, 10 k msgs | — | — |
+| allocations/message | — | — |
+
+## 10. Publication note (r/rust)
+
+The §9 tables are the post's spine; **the post follows the merged numbers, never precedes them**. Framing is purely technical per the established channel split: the find (`WasmRecvFuture` boxed solely to satisfy a trait signature), the mechanism (object safety vs `async fn`; poll as the escape that keeps `dyn`), the trade analysis (the box was deadweight, not part of the dyn trade — and the monomorphized `Reader<T, B>` lane declined *with data*, §7), the B0–B3 numbers, and the one-command reproduction path with links to the PR and this doc. No claims beyond measured scope. Declining an optimization on the basis of measurements is itself the credibility move for that audience — it belongs in the post, not just the doc.
diff --git a/examples/embassy-bench-stm32h5/.cargo/config.toml b/examples/embassy-bench-stm32h5/.cargo/config.toml
new file mode 100644
index 00000000..5aac79b7
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/.cargo/config.toml
@@ -0,0 +1,8 @@
+[target.thumbv8m.main-none-eabihf]
+runner = 'probe-rs run --chip STM32H563ZITx'
+
+[build]
+target = "thumbv8m.main-none-eabihf"
+
+[env]
+DEFMT_LOG = "info"
diff --git a/examples/embassy-bench-stm32h5/.gitignore b/examples/embassy-bench-stm32h5/.gitignore
new file mode 100644
index 00000000..8112e1e6
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/.gitignore
@@ -0,0 +1,5 @@
+target/
+Cargo.lock
+*.bin
+*.elf
+*.hex
diff --git a/examples/embassy-bench-stm32h5/Cargo.toml b/examples/embassy-bench-stm32h5/Cargo.toml
new file mode 100644
index 00000000..2c921add
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/Cargo.toml
@@ -0,0 +1,62 @@
+[package]
+edition = "2024"
+name = "embassy-bench-stm32h5"
+version = "0.1.0"
+license = "MIT OR Apache-2.0"
+publish = false
+description = "AimDB B3 on-target benchmark: cycle & allocation profiling of the Embassy buffer consume path on an STM32H563ZI (design 038 §B3)"
+
+[features]
+default = ["embassy-runtime"]
+embassy-runtime = []
+
+[dependencies]
+# AimDB — buffer layer only (no full AimDb stack), isolated like the host B0/B1/B2.
+aimdb-core = { path = "../../aimdb-core", default-features = false, features = [
+    "alloc",
+] }
+aimdb-embassy-adapter = { path = "../../aimdb-embassy-adapter", default-features = false, features = [
+    "embassy-runtime",
+] }
+
+# Embassy ecosystem - STM32H563ZI (same board/clock as the other H563 demos)
+embassy-stm32 = { workspace = true, features = [
+    "defmt",
+    "stm32h563zi",
+    "memory-x",
+    "time-driver-any",
+    "unstable-pac",
+] }
+embassy-sync = { workspace = true, features = ["defmt"] }
+embassy-executor = { workspace = true, features = [
+    "platform-cortex-m",
+    "executor-thread",
+    "defmt",
+] }
+embassy-time = { workspace = true, features = [
+    "defmt",
+    "defmt-timestamp-uptime",
+    "tick-hz-32_768",
+    "generic-queue-16",
+] }
+embassy-futures = { workspace = true }
+
+# Embedded debugging and logging
+defmt = { workspace = true }
+defmt-rtt = { workspace = true }
+panic-probe = { workspace = true }
+
+# Cortex-M runtime + DWT cycle counter access
+cortex-m = { workspace = true }
+cortex-m-rt = { workspace = true }
+# Provides the on-target `critical-section` impl that embassy-sync's
+# `CriticalSectionRawMutex` (used by every `EmbassyBuffer`) links against.
+critical-section = { workspace = true }
+
+# Counting allocator wraps this heap to validate 0 allocs/msg on real hardware.
+embedded-alloc = { version = "0.6", features = ["llff"] }
+
+[package.metadata.embassy]
+build = [
+    { target = "thumbv8m.main-none-eabihf", artifact-dir = "out/examples/embassy-bench-stm32h5" },
+]
diff --git a/examples/embassy-bench-stm32h5/README.md b/examples/embassy-bench-stm32h5/README.md
new file mode 100644
index 00000000..6405fc5b
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/README.md
@@ -0,0 +1,113 @@
+# embassy-bench-stm32h5 — AimDB B3 on-target profiling
+
+The **B3** tier of the AimDB benchmark suite (design [038](../../docs/design/038-aimdb-bench-crate-design.md)
+§B3 / §Phase 4): the measurements that **cannot run on a host**. It reads the
+Cortex-M **DWT cycle counter** (`CYCCNT`) to report the real per-message cost, in
+CPU cycles, of the AimDB Embassy buffer `push` → `recv` consume path on an
+**STM32H563ZI** (Cortex-M33 @ 250 MHz), and re-validates the W8 zero-allocation
+claim against the real embedded allocator (`embedded-alloc`).
+
+The host-runnable tiers — **B0** (allocations), **B1** (latency), **B2**
+(throughput) — live in the [`aimdb-bench`](../../aimdb-bench) crate and exercise
+the same Embassy buffer backend on the host via `futures::executor::block_on`.
+This crate is the on-hardware complement, not a replacement.
+
+## What it measures
+
+It runs a tight, lockstep `push`→`recv` loop (after a warmup) for each of the
+following AimDB buffer profiles:
+
+| Profile     | Backend        | embassy-sync primitive |
+|-------------|----------------|------------------------|
+| Telemetry   | `SpmcRing`     | `PubSubChannel`        |
+| State       | `SingleLatest` | `Watch`                |
+| Command     | `Mailbox`      | `Channel<_, _, 1>`     |
+| Telemetry ×4| `SpmcRing` 1→4 | `PubSubChannel`        |
+
+Each line reports **cycles/msg** (CYCCNT delta ÷ batch) and **allocs/msg**
+(global-allocator calls ÷ batch). The target is **0 allocs/msg** — the same W8
+goal the host B0 suite gates on.
+
+## Hardware
+
+- **Board:** ST Nucleo-H563ZI (STM32H563ZI, Cortex-M33).
+- **Probe:** the onboard ST-LINK (SWD + RTT). No extra wiring — defmt logs stream
+  over RTT on the same USB cable you flash with.
+
+## Running
+
+From **this directory** (the local `.cargo/config.toml` selects the
+`thumbv8m.main-none-eabihf` target and the `probe-rs` runner):
+
+```bash
+cargo run --release
+```
+
+`--release` matters: debug vs release cycle counts differ by an order of
+magnitude (design 038 §15.8). Always record the build profile with a baseline.
+
+### Flashing from outside the dev container
+
+If `probe-rs` and the ST-LINK live on your host (not in the container), build in
+the container and flash from the host with [`flash.sh`](flash.sh):
+
+```bash
+# In the dev container:
+cd examples/embassy-bench-stm32h5 && cargo build --release
+
+# On the host (where the ST-LINK is attached):
+cd examples/embassy-bench-stm32h5 && ./flash.sh
+```
+
+`flash.sh` prefers the `--release` binary and falls back to debug with a warning;
+it runs `probe-rs run --chip STM32H563ZITx`, so B3 results stream over RTT to
+your terminal.
+
+First capture on a Nucleo-H563ZI @ 250 MHz (release), recorded in
+[`aimdb-bench/data/baselines/b3_cycles_stm32h5.json`](../../aimdb-bench/data/baselines/b3_cycles_stm32h5.json):
+
+```
+=== AimDB B3 — Embassy buffer profiling on STM32H563ZI @ 250 MHz ===
+cycle_counter=true  warmup=200  batch=512
+[B3] Telemetry SpmcRing    : 2013 cycles/msg, 0 allocs/msg  (1030839 cycles total, batch=512)
+[B3] State     SingleLatest: 2009 cycles/msg, 0 allocs/msg  (1029028 cycles total, batch=512)
+[B3] Command   Mailbox     : 1661 cycles/msg, 0 allocs/msg  (850440 cycles total, batch=512)
+[B3] Telemetry SpmcRing(1->4): 6239 cycles/msg, 0 allocs/msg  (4 deliveries/msg, 3194799 cycles total, batch=512)
+=== B3 complete — target=0 allocs/msg (W8 zero-alloc consume path) ===
+```
+
+Command (single-slot `Channel`) is cheapest; the 1→4 fan-out is sub-linear
+(~1560 cycles/delivery) since the single `push` is amortized across four reads.
+Treat these as the regression reference; re-run 2–3× and update the baseline JSON
+if a stable average drifts.
+
+## Notes & caveats
+
+- **Measurement window** excludes warmup. The one-time reader `Box`/lazy
+  `SpmcRing` subscriber registration happens during warmup, so the measured
+  window reflects steady-state per-message cost only.
+- **Payload construction is inside the timed loop**, identical to the host B1/B2
+  suites, so the figure is the end-to-end per-message consume cost (not the
+  buffer call alone).
+- **Clock governor / frequency:** DWT cycle counts assume the 250 MHz PLL1
+  config in `main.rs`. Record the baseline at a fixed clock (design 038 §15.6).
+- **CI compile check** uses `thumbv7em-none-eabihf` (the workspace's installed
+  embedded triple, see the `examples` Make target), matching the other H563
+  demos; the flashable artifact is the `thumbv8m.main-none-eabihf` build above.
+
+## Troubleshooting
+
+**`└─ <mod path> @ <invalid location: defmt frame-index: N>`** after each log line.
+The `[B3] …` messages themselves decode fine; only the file:line annotations are
+missing. The firmware is correct — it emits the same defmt 1.0 metadata
+(`_defmt_version_ = 4`, `.defmt` + `.symtab` present) as every other embassy
+example here, in both debug and release. The cause is host-side: the
+`defmt-decoder` bundled in your `probe-rs` decodes defmt-1.0 message payloads but
+not its location table. It affects any defmt-1.0 binary in this repo, not just
+this one — flash another example (e.g. `embassy-serial-connector-demo`) to
+confirm the same annotations appear.
+
+Fix: update `probe-rs` to a current release (its decoder is updated in lockstep
+with defmt) — check `probe-rs --version`, then reinstall via your usual method
+(e.g. `cargo install probe-rs-tools --locked`). The issue is purely cosmetic for
+B3: the cycle/alloc numbers are unaffected.
diff --git a/examples/embassy-bench-stm32h5/build.rs b/examples/embassy-bench-stm32h5/build.rs
new file mode 100644
index 00000000..8cd32d7e
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/build.rs
@@ -0,0 +1,5 @@
+fn main() {
+    println!("cargo:rustc-link-arg-bins=--nmagic"); // linker: no page alignment of sections (GNU ld `--nmagic`)
+    println!("cargo:rustc-link-arg-bins=-Tlink.x"); // linker script; presumably the one shipped by cortex-m-rt — confirm
+    println!("cargo:rustc-link-arg-bins=-Tdefmt.x"); // linker script; presumably the one shipped by defmt — confirm
+}
diff --git a/examples/embassy-bench-stm32h5/flash.sh b/examples/embassy-bench-stm32h5/flash.sh
new file mode 100755
index 00000000..a031763a
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/flash.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Flash script for embassy-bench-stm32h5 (AimDB B3 on-target profiling).
+#
+# Run on the HOST machine where probe-rs and the ST-LINK are accessible.
+# Build first (in the dev container):
+#   cd examples/embassy-bench-stm32h5 && cargo build --release
+#
+# B3 cycle counts are only meaningful in --release: debug vs release differ by an
+# order of magnitude (design 038 §15.8). This script therefore prefers the
+# release binary and only falls back to debug with a warning.
+#
+# Results (cycles/msg + allocs/msg per profile) stream over RTT (SWD) as defmt
+# logs — probe-rs prints them to this terminal.
+#
+# The script may be invoked from any directory: binary paths are resolved
+# relative to the script's own location, not the caller's working directory.
+set -e
+
+# Directory containing this script (examples/embassy-bench-stm32h5).
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TARGET_DIR="$SCRIPT_DIR/../../target/thumbv8m.main-none-eabihf"
+
+RELEASE_BINARY="$TARGET_DIR/release/embassy-bench-stm32h5"
+DEBUG_BINARY="$TARGET_DIR/debug/embassy-bench-stm32h5"
+
+if [ -f "$RELEASE_BINARY" ]; then
+    BINARY="$RELEASE_BINARY"
+elif [ -f "$DEBUG_BINARY" ]; then
+    BINARY="$DEBUG_BINARY"
+    # Diagnostics go to stderr so they are not mixed into captured results.
+    echo "Warning: using the DEBUG build — B3 cycle counts are not representative." >&2
+    echo "         Rebuild with --release for meaningful numbers:" >&2
+    echo "           cd examples/embassy-bench-stm32h5 && cargo build --release" >&2
+else
+    echo "Error: no binary found at:" >&2
+    echo "  $RELEASE_BINARY" >&2
+    echo "  $DEBUG_BINARY" >&2
+    echo "Build it first in the dev container:" >&2
+    echo "  cd examples/embassy-bench-stm32h5 && cargo build --release" >&2
+    exit 1
+fi
+
+echo "Flashing embassy-bench-stm32h5 to STM32H563ZITx (B3 results stream over RTT)..."
+probe-rs run --chip STM32H563ZITx "$BINARY"
diff --git a/examples/embassy-bench-stm32h5/rust-toolchain.toml b/examples/embassy-bench-stm32h5/rust-toolchain.toml
new file mode 100644
index 00000000..750ee6f7
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/rust-toolchain.toml
@@ -0,0 +1,4 @@
+# Toolchain pin for the embassy-bench-stm32h5 crate. The target listed below is
+# the triple whose build output `flash.sh` looks for and flashes.
+[toolchain]
+channel = "1.95"
+components = ["rust-src", "rustfmt", "llvm-tools"]
+targets = ["thumbv8m.main-none-eabihf"]
diff --git a/examples/embassy-bench-stm32h5/src/main.rs b/examples/embassy-bench-stm32h5/src/main.rs
new file mode 100644
index 00000000..12e79370
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/src/main.rs
@@ -0,0 +1,346 @@
+#![no_std]
+#![no_main]
+
+//! B3 — On-target cycle & allocation profiling of the AimDB Embassy buffer
+//! consume path (design 038 §B3 / §Phase 4).
+//!
+//! This is the part of the Embassy benchmark suite that **cannot run on a
+//! host**: it reads the Cortex-M **DWT cycle counter** (`CYCCNT`) to measure the
+//! real per-message cost, in CPU cycles, of `push` → `recv` for each AimDB
+//! buffer profile on an STM32H563ZI (Cortex-M33 @ 250 MHz). The host
+//! `aimdb-bench` suite covers B0 (allocations), B1 (wall-clock latency) and B2
+//! (throughput) for the same Embassy buffer backend; this binary adds the
+//! cycle-accurate B3 numbers and re-validates the W8 zero-allocation claim
+//! against the real embedded allocator (`embedded-alloc`), wrapped here in a
+//! counting allocator.
+//!
+//! ## What is measured
+//!
+//! For Telemetry (`SpmcRing`), State (`SingleLatest`) and Command (`Mailbox`),
+//! plus a 1→4 Telemetry fan-out, the binary runs a tight lockstep
+//! `push`→`recv` loop and reports, per message:
+//!   * **cycles/msg** — `DWT::cycle_count()` delta over the measured batch,
+//!   * **allocs/msg** — global-allocator call count over the measured batch.
+//!
+//! The measured window excludes a warmup phase; the one-time reader boxing and
+//! lazy `SpmcRing` subscriber registration happen during warmup. As with the
+//! host B1/B2 suites, payload construction is inside the timed loop, so the
+//! figure is the end-to-end per-message consume cost, not the buffer call in
+//! isolation.
+//!
+//! ## Running
+//!
+//! ```bash
+//! # From this crate dir, with a Nucleo-H563ZI connected via ST-LINK:
+//! cargo run --release
+//! ```
+//!
+//! Results stream over RTT (SWD) as defmt logs. `--release` is strongly
+//! recommended; debug vs release cycle counts differ by an order of magnitude
+//! (design 038 §15.8), so always record the build profile with a baseline.
+
+extern crate alloc;
+
+use alloc::boxed::Box;
+use core::alloc::{GlobalAlloc, Layout};
+use core::sync::atomic::{AtomicU32, Ordering};
+
+use aimdb_core::buffer::{Buffer, Reader};
+use aimdb_embassy_adapter::EmbassyBuffer;
+use cortex_m::peripheral::DWT;
+use defmt::info;
+use embassy_executor::Spawner;
+use embassy_futures::block_on;
+use {defmt_rtt as _, panic_probe as _};
+
+// ── Allocation-counting heap ─────────────────────────────────────────────────
+//
+// Wraps `embedded-alloc`'s `LlffHeap` so the B3 run can confirm 0 allocs/msg on
+// real hardware — the embedded analogue of the host `CountingAllocator<System>`
+// in `aimdb-bench` (design 038 §4 anticipated swapping `System` for an embedded
+// allocator without reworking the counter).
+
+/// Number of `alloc` calls made through [`CountingHeap`] since startup or the
+/// last `reset_allocs()`.
+static ALLOC_COUNT: AtomicU32 = AtomicU32::new(0);
+
+/// Global-allocator wrapper around `embedded_alloc::LlffHeap` that counts every
+/// `alloc` call in `ALLOC_COUNT`.
+struct CountingHeap(embedded_alloc::LlffHeap);
+
+// SAFETY: every call is delegated unchanged to the inner heap; the only added
+// side effect is the relaxed atomic increment.
+unsafe impl GlobalAlloc for CountingHeap {
+    /// Bumps `ALLOC_COUNT` (before delegating, so the call is counted whatever
+    /// the inner heap returns), then forwards `layout` to the inner heap.
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
+        unsafe { self.0.alloc(layout) }
+    }
+    /// Forwards to the inner heap. Deallocations are not counted.
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        unsafe { self.0.dealloc(ptr, layout) }
+    }
+}
+
+// Starts out empty; `main` hands it its backing memory via `HEAP.0.init(..)`.
+#[global_allocator]
+static HEAP: CountingHeap = CountingHeap(embedded_alloc::LlffHeap::empty());
+
+/// Resets the allocation counter to zero, marking the start of a measured window.
+#[inline]
+fn reset_allocs() {
+    ALLOC_COUNT.store(0, Ordering::Relaxed);
+}
+
+/// Returns the current value of the allocation counter, i.e. the number of
+/// `alloc` calls counted since it was last zeroed.
+#[inline]
+fn allocs() -> u32 {
+    ALLOC_COUNT.load(Ordering::Relaxed)
+}
+
+// ── Workload payloads ────────────────────────────────────────────────────────
+//
+// Mirror `aimdb_bench::profiles` exactly so B3 cycle numbers line up with the
+// host B0/B1/B2 figures for the same payload shapes. Kept in sync by hand — the
+// host crate is `std`-only and cannot be a dependency of this `no_std` binary.
+
+// The benchmark never reads these fields — they are payload "ballast" whose
+// size/shape matches the host profiles, so the per-message `clone` cost (and
+// thus the cycle count) is comparable. Hence `dead_code` is expected.
+/// Payload carried by the Telemetry buffers; built by `telemetry_msg`.
+#[derive(Clone)]
+#[allow(dead_code)]
+struct TelemetryMsg {
+    sensor_id: u32,
+    value: f64,
+    sequence: u64,
+}
+
+/// Payload carried by the State buffer; built by `state_msg`.
+// Fields are never read (payload ballast), hence `allow(dead_code)`.
+#[derive(Clone)]
+#[allow(dead_code)]
+struct StateMsg {
+    device_id: u32,
+    temperature: f64,
+    humidity: f64,
+    pressure: f64,
+    sequence: u64,
+}
+
+/// Payload carried by the Command buffer; built by `command_msg`.
+// Fields are never read (payload ballast), hence `allow(dead_code)`.
+#[derive(Clone)]
+#[allow(dead_code)]
+struct CommandMsg {
+    command_id: u32,
+    target: u32,
+    value: f64,
+    sequence: u64,
+}
+
+/// Builds the Telemetry payload for message index `i`.
+///
+/// `sensor_id` cycles through `0..16`, `value` is `i * 0.1`, and `sequence`
+/// is `i` itself.
+#[inline]
+fn telemetry_msg(i: u64) -> TelemetryMsg {
+    let sensor_id = (i % 16) as u32;
+    let value = i as f64 * 0.1;
+    TelemetryMsg {
+        sensor_id,
+        value,
+        sequence: i,
+    }
+}
+
+/// Builds the State payload for message index `i`.
+///
+/// `device_id` cycles through `0..8`; the three readings are fixed offsets
+/// plus a small per-index increment; `sequence` is `i` itself.
+#[inline]
+fn state_msg(i: u64) -> StateMsg {
+    // Converted once; each reading below scales this same value.
+    let step = i as f64;
+    let device_id = (i % 8) as u32;
+    StateMsg {
+        device_id,
+        temperature: 20.0 + step * 0.01,
+        humidity: 50.0 + step * 0.005,
+        pressure: 1013.25 + step * 0.001,
+        sequence: i,
+    }
+}
+
+/// Builds the Command payload for message index `i`.
+///
+/// `command_id` cycles through `0..256`, `target` through `0..4`; `value` is
+/// `i` as a float and `sequence` is `i` itself.
+#[inline]
+fn command_msg(i: u64) -> CommandMsg {
+    let command_id = (i % 256) as u32;
+    let target = (i % 4) as u32;
+    let value = i as f64;
+    CommandMsg {
+        command_id,
+        target,
+        value,
+        sequence: i,
+    }
+}
+
+// ── Buffer type aliases ──────────────────────────────────────────────────────
+//
+// Same backends as `aimdb_bench::profiles_embassy`, with smaller `CAP` to keep
+// the static `PubSubChannel` footprint modest on-target. The lockstep loops
+// keep at most one message in flight, so a small `CAP` never lags. `SUBS = 4`
+// on the Telemetry ring leaves room for the 1→4 fan-out.
+
+type TelemetryBuffer = EmbassyBuffer<TelemetryMsg, 8, 4, 1, 1>;
+type StateBuffer = EmbassyBuffer<StateMsg, 1, 1, 1, 2>;
+type CommandBuffer = EmbassyBuffer<CommandMsg, 1, 1, 1, 1>;
+
+const WARMUP: usize = 200;
+const BATCH: u32 = 512;
+
+/// Read CYCCNT, run `BATCH` lockstep `push`→`recv` cycles, return the cycle and
+/// allocation deltas over the measured window.
+///
+/// * `$reader` — expression with a `recv()` method; the returned future is
+///   driven to completion with `block_on`.
+/// * `$push` — callable invoked with the message index as a `u64`. Indices
+///   start at `WARMUP`, continuing on from the warmup loops' `0..WARMUP`.
+///
+/// Evaluates to the tuple `(cycles, allocs)`. The results of `$push` and of
+/// `recv()` are discarded.
+macro_rules! measure {
+    ($reader:expr, $push:expr) => {{
+        // Zero the counter so `allocs()` below covers only this batch.
+        reset_allocs();
+        let start = DWT::cycle_count();
+        for i in 0..BATCH {
+            let _ = $push(WARMUP as u64 + i as u64);
+            let _ = block_on($reader.recv());
+        }
+        // `wrapping_sub` keeps the delta valid if the counter wraps once mid-batch.
+        let cycles = DWT::cycle_count().wrapping_sub(start);
+        (cycles, allocs())
+    }};
+}
+
+/// Logs one B3 result line over defmt.
+///
+/// * `profile` / `buffer` — labels printed verbatim (callers pad them for alignment).
+/// * `cycles` — total cycle delta over the measured batch.
+/// * `allocs` — total allocation count over the measured batch.
+///
+/// The per-message figures are integer divisions by `BATCH`, so any total below
+/// `BATCH` rounds down to 0. The raw allocation total is therefore printed as
+/// well: a non-zero total with `0 allocs/msg` still reveals that the measured
+/// window allocated.
+fn report(profile: &str, buffer: &str, cycles: u32, allocs: u32) {
+    info!(
+        "[B3] {=str} {=str}: {=u32} cycles/msg, {=u32} allocs/msg  ({=u32} cycles total, {=u32} allocs total, batch={=u32})",
+        profile,
+        buffer,
+        cycles / BATCH,
+        allocs / BATCH,
+        cycles,
+        allocs,
+        BATCH,
+    );
+}
+
+/// Firmware entry point: runs the B3 measurements once and then idles.
+///
+/// Sequence: initialise the heap, enable the DWT cycle counter, configure the
+/// clocks, then measure Telemetry, State, Command and the 1→4 Telemetry
+/// fan-out in turn. Each section warms up for `WARMUP` messages before the
+/// measured `BATCH`, and logs its result over defmt.
+#[embassy_executor::main]
+async fn main(_spawner: Spawner) {
+    // Initialize the heap behind the counting allocator.
+    {
+        use core::mem::MaybeUninit;
+        const HEAP_SIZE: usize = 32 * 1024; // 32 KB
+        static mut MEM: [MaybeUninit<u8>; HEAP_SIZE] = [MaybeUninit::uninit(); HEAP_SIZE];
+        // SAFETY: `MEM` is private to this block and is only handed to the heap
+        // here, once, before anything allocates.
+        unsafe {
+            let mem_ptr = core::ptr::addr_of_mut!(MEM);
+            HEAP.0.init((*mem_ptr).as_ptr() as usize, HEAP_SIZE);
+        }
+    }
+
+    // DWT cycle counter. We only touch DCB/DWT, which Embassy does not use, so
+    // stealing the core peripherals here is sound regardless of init ordering.
+    // SAFETY: exclusive access to DCB/DWT for the lifetime of this benchmark;
+    // no other code in this binary touches them.
+    let mut cp = unsafe { cortex_m::Peripherals::steal() };
+    cp.DCB.enable_trace();
+    cp.DWT.enable_cycle_counter();
+
+    // Clock tree: HSE 8 MHz (from the ST-LINK MCO) → PLL1 → 250 MHz. Identical
+    // to the other H563 demos in this repo.
+    let mut config = embassy_stm32::Config::default();
+    {
+        use embassy_stm32::rcc::*;
+        use embassy_stm32::time::Hertz;
+
+        config.rcc.hsi = None;
+        config.rcc.hsi48 = Some(Default::default());
+        config.rcc.hse = Some(Hse {
+            freq: Hertz(8_000_000),
+            mode: HseMode::BypassDigital,
+        });
+        config.rcc.pll1 = Some(Pll {
+            source: PllSource::Hse,
+            prediv: PllPreDiv::Div2,
+            mul: PllMul::Mul125,
+            divp: Some(PllDiv::Div2),
+            divq: Some(PllDiv::Div2),
+            divr: None,
+        });
+        config.rcc.ahb_pre = AHBPrescaler::Div1;
+        config.rcc.apb1_pre = APBPrescaler::Div1;
+        config.rcc.apb2_pre = APBPrescaler::Div1;
+        config.rcc.apb3_pre = APBPrescaler::Div1;
+        config.rcc.sys = Sysclk::Pll1P;
+        config.rcc.voltage_scale = VoltageScale::Scale0;
+    }
+    let _p = embassy_stm32::init(config);
+
+    info!("=== AimDB B3 — Embassy buffer profiling on STM32H563ZI @ 250 MHz ===");
+    info!(
+        "cycle_counter={=bool}  warmup={=u32}  batch={=u32}",
+        DWT::has_cycle_counter(),
+        WARMUP as u32,
+        BATCH
+    );
+
+    // ── Telemetry: SpmcRing / PubSubChannel ──────────────────────────────────
+    //
+    // `try_recv` primes the lazily-created SpmcRing subscriber before the first
+    // push, otherwise the first message is missed and `recv` blocks forever.
+    {
+        let buf: TelemetryBuffer = EmbassyBuffer::new_spmc();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+        let _ = reader.try_recv();
+        for i in 0..WARMUP {
+            buf.push(telemetry_msg(i as u64));
+            let _ = block_on(reader.recv());
+        }
+        let (cycles, n_allocs) = measure!(reader, |i| buf.push(telemetry_msg(i)));
+        report("Telemetry", "SpmcRing    ", cycles, n_allocs);
+    }
+
+    // ── State: SingleLatest / Watch ──────────────────────────────────────────
+    {
+        let buf: StateBuffer = EmbassyBuffer::new_watch();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+        let _ = reader.try_recv();
+        for i in 0..WARMUP {
+            buf.push(state_msg(i as u64));
+            let _ = block_on(reader.recv());
+        }
+        let (cycles, n_allocs) = measure!(reader, |i| buf.push(state_msg(i)));
+        report("State    ", "SingleLatest", cycles, n_allocs);
+    }
+
+    // ── Command: Mailbox / Channel(capacity=1) ───────────────────────────────
+    {
+        let buf: CommandBuffer = EmbassyBuffer::new_mailbox();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+        let _ = reader.try_recv();
+        for i in 0..WARMUP {
+            buf.push(command_msg(i as u64));
+            let _ = block_on(reader.recv());
+        }
+        let (cycles, n_allocs) = measure!(reader, |i| buf.push(command_msg(i)));
+        report("Command  ", "Mailbox     ", cycles, n_allocs);
+    }
+
+    // ── Telemetry 1→4 fan-out ────────────────────────────────────────────────
+    //
+    // One publisher, four subscribers, lockstep. Reported per produced message
+    // (each observed by all four readers — 4 deliveries/msg).
+    {
+        let buf: TelemetryBuffer = EmbassyBuffer::new_spmc();
+        let mut r0 = Reader::new(Box::new(buf.subscribe()));
+        let mut r1 = Reader::new(Box::new(buf.subscribe()));
+        let mut r2 = Reader::new(Box::new(buf.subscribe()));
+        let mut r3 = Reader::new(Box::new(buf.subscribe()));
+        let _ = r0.try_recv();
+        let _ = r1.try_recv();
+        let _ = r2.try_recv();
+        let _ = r3.try_recv();
+        for i in 0..WARMUP {
+            buf.push(telemetry_msg(i as u64));
+            let _ = block_on(r0.recv());
+            let _ = block_on(r1.recv());
+            let _ = block_on(r2.recv());
+            let _ = block_on(r3.recv());
+        }
+        reset_allocs();
+        let start = DWT::cycle_count();
+        for i in 0..BATCH {
+            buf.push(telemetry_msg(WARMUP as u64 + i as u64));
+            let _ = block_on(r0.recv());
+            let _ = block_on(r1.recv());
+            let _ = block_on(r2.recv());
+            let _ = block_on(r3.recv());
+        }
+        let cycles = DWT::cycle_count().wrapping_sub(start);
+        let n_allocs = allocs();
+        // `n_allocs / BATCH` rounds any total below `BATCH` down to 0, so the raw
+        // total is logged too; otherwise a few stray allocations would be hidden.
+        info!(
+            "[B3] Telemetry SpmcRing(1->4): {=u32} cycles/msg, {=u32} allocs/msg  (4 deliveries/msg, {=u32} cycles total, {=u32} allocs total, batch={=u32})",
+            cycles / BATCH,
+            n_allocs / BATCH,
+            cycles,
+            n_allocs,
+            BATCH,
+        );
+    }
+
+    info!("=== B3 complete — target=0 allocs/msg (W8 zero-alloc consume path) ===");
+
+    // Measurements are done; park the core instead of returning.
+    loop {
+        cortex_m::asm::wfe();
+    }
+}