From cfcfad53ecb1c69f3e5979faec06c5df4fc6cb33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Tue, 16 Jun 2026 20:13:07 +0000
Subject: [PATCH 01/16] feat(bench): add benchmarking infrastructure for AimDB

- Introduced `aimdb-bench` crate for benchmarking AimDB with various profiles.
- Implemented allocation counting benchmarks (B0) using `CountingAllocator`.
- Added latency benchmarks (B1) to measure push-to-receive latency.
- Developed throughput benchmarks (B2) for steady-state performance.
- Created pipeline benchmarks for both allocation (B0-Pipeline) and runner-driven throughput (B-Runner-Pipeline).
- Established workload profiles for telemetry, state, and command messages.
- Results are serialized to JSON for easy analysis and comparison.
---
 Cargo.lock                               | 150 +++++++++++++
 Cargo.toml                               |   2 +
 Makefile                                 |   4 +-
 aimdb-bench/Cargo.toml                   |  51 +++++
 aimdb-bench/benches/b0_alloc_tokio.rs    | 133 +++++++++++
 aimdb-bench/benches/b1_latency.rs        | 138 ++++++++++++
 aimdb-bench/benches/b2_throughput.rs     | 174 +++++++++++++++
 aimdb-bench/benches/b_alloc_pipeline.rs  | 273 +++++++++++++++++++++++
 aimdb-bench/benches/b_runner_pipeline.rs | 217 ++++++++++++++++++
 aimdb-bench/src/alloc.rs                 |  72 ++++++
 aimdb-bench/src/lib.rs                   |  22 ++
 aimdb-bench/src/profiles.rs              | 112 ++++++++++
 aimdb-bench/src/reports.rs               |  61 +++++
 13 files changed, 1407 insertions(+), 2 deletions(-)
 create mode 100644 aimdb-bench/Cargo.toml
 create mode 100644 aimdb-bench/benches/b0_alloc_tokio.rs
 create mode 100644 aimdb-bench/benches/b1_latency.rs
 create mode 100644 aimdb-bench/benches/b2_throughput.rs
 create mode 100644 aimdb-bench/benches/b_alloc_pipeline.rs
 create mode 100644 aimdb-bench/benches/b_runner_pipeline.rs
 create mode 100644 aimdb-bench/src/alloc.rs
 create mode 100644 aimdb-bench/src/lib.rs
 create mode 100644 aimdb-bench/src/profiles.rs
 create mode 100644 aimdb-bench/src/reports.rs

diff --git a/Cargo.lock b/Cargo.lock
index 690940f..bcbe97d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -23,6 +23,18 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "aimdb-bench"
+version = "0.1.0"
+dependencies = [
+ "aimdb-core",
+ "aimdb-tokio-adapter",
+ "criterion",
+ "serde",
+ "serde_json",
+ "tokio",
+]
+
 [[package]]
 name = "aimdb-cli"
 version = "0.6.0"
@@ -383,6 +395,12 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "anes"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
+
 [[package]]
 name = "anstream"
 version = "0.6.21"
@@ -636,6 +654,12 @@ version = "1.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
 
+[[package]]
+name = "cast"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
+
 [[package]]
 name = "cc"
 version = "1.2.43"
@@ -682,6 +706,33 @@ dependencies = [
  "windows-link 0.2.1",
 ]
 
+[[package]]
+name = "ciborium"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
+dependencies = [
+ "ciborium-io",
+ "ciborium-ll",
+ "serde",
+]
+
+[[package]]
+name = "ciborium-io"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
+
+[[package]]
+name = "ciborium-ll"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
+dependencies = [
+ "ciborium-io",
+ "half",
+]
+
 [[package]]
 name = "clap"
 version = "4.5.51"
@@ -849,6 +900,40 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "criterion"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+dependencies = [
+ "anes",
+ "cast",
+ "ciborium",
+ "clap",
+ "criterion-plot",
+ "is-terminal",
+ "itertools",
+ "num-traits",
+ "once_cell",
+ "oorandom",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "walkdir",
+]
+
+[[package]]
+name = "criterion-plot"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+dependencies = [
+ "cast",
+ "itertools",
+]
+
 [[package]]
 name = "critical-section"
 version = "1.2.0"
@@ -861,6 +946,12 @@ version = "0.8.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
 
+[[package]]
+name = "crunchy"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
+
 [[package]]
 name = "crypto-common"
 version = "0.1.7"
@@ -1017,6 +1108,12 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "either"
+version = "1.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
+
 [[package]]
 name = "embassy-embedded-hal"
 version = "0.6.0"
@@ -1736,6 +1833,17 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "half"
+version = "2.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+ "zerocopy",
+]
+
 [[package]]
 name = "hash32"
 version = "0.3.1"
@@ -1824,6 +1932,12 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "hermit-abi"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
+
 [[package]]
 name = "http"
 version = "1.4.0"
@@ -2124,12 +2238,32 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "is-terminal"
+version = "0.4.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "is_terminal_polyfill"
 version = "1.70.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
 
+[[package]]
+name = "itertools"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.15"
@@ -2458,6 +2592,12 @@ version = "1.70.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
 
+[[package]]
+name = "oorandom"
+version = "11.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
+
 [[package]]
 name = "openssl"
 version = "0.10.80"
@@ -3460,6 +3600,16 @@ dependencies = [
  "zerovec",
 ]
 
+[[package]]
+name = "tinytemplate"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "tokio"
 version = "1.48.0"
diff --git a/Cargo.toml b/Cargo.toml
index 2b7dbb4..12faba1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,6 +36,8 @@ members = [
     "examples/weather-mesh-demo/weather-station-gamma",
     "examples/hello-mailbox",
     "examples/hello-single-latest-async",
+    # Benchmarking infrastructure — host-only, excluded from default-members
+    "aimdb-bench",
 ]
 exclude = ["_external"]
 resolver = "2"
diff --git a/Makefile b/Makefile
index 3ea61ce..fe97993 100644
--- a/Makefile
+++ b/Makefile
@@ -176,7 +176,7 @@ test:
 
 fmt:
 	@printf "$(GREEN)Formatting code (workspace members only)...$(NC)\n"
-	@for pkg in aimdb-executor aimdb-derive aimdb-data-contracts aimdb-core aimdb-client aimdb-embassy-adapter aimdb-tokio-adapter aimdb-wasm-adapter aimdb-sync aimdb-persistence aimdb-persistence-sqlite aimdb-mqtt-connector aimdb-knx-connector aimdb-ws-protocol aimdb-websocket-connector aimdb-uds-connector aimdb-serial-connector aimdb-codegen aimdb-cli aimdb-mcp sync-api-demo tokio-mqtt-connector-demo embassy-mqtt-connector-demo tokio-knx-connector-demo embassy-knx-connector-demo embassy-serial-connector-demo weather-mesh-common weather-hub weather-station-alpha weather-station-beta hello-mailbox hello-single-latest-async; do \
+	@for pkg in aimdb-executor aimdb-derive aimdb-data-contracts aimdb-core aimdb-client aimdb-embassy-adapter aimdb-tokio-adapter aimdb-wasm-adapter aimdb-sync aimdb-persistence aimdb-persistence-sqlite aimdb-mqtt-connector aimdb-knx-connector aimdb-ws-protocol aimdb-websocket-connector aimdb-uds-connector aimdb-serial-connector aimdb-codegen aimdb-cli aimdb-mcp sync-api-demo tokio-mqtt-connector-demo embassy-mqtt-connector-demo tokio-knx-connector-demo embassy-knx-connector-demo embassy-serial-connector-demo weather-mesh-common weather-hub weather-station-alpha weather-station-beta hello-mailbox hello-single-latest-async aimdb-bench; do \
 		printf "$(YELLOW)  → Formatting $$pkg$(NC)\n"; \
 		cargo fmt -p $$pkg 2>/dev/null || true; \
 	done
@@ -185,7 +185,7 @@ fmt:
 fmt-check:
 	@printf "$(GREEN)Checking code formatting (workspace members only)...$(NC)\n"
 	@FAILED=0; \
-	for pkg in aimdb-executor aimdb-derive aimdb-data-contracts aimdb-core aimdb-client aimdb-embassy-adapter aimdb-tokio-adapter aimdb-wasm-adapter aimdb-sync aimdb-persistence aimdb-persistence-sqlite aimdb-mqtt-connector aimdb-knx-connector aimdb-ws-protocol aimdb-websocket-connector aimdb-uds-connector aimdb-serial-connector aimdb-codegen aimdb-cli aimdb-mcp sync-api-demo tokio-mqtt-connector-demo embassy-mqtt-connector-demo tokio-knx-connector-demo embassy-knx-connector-demo embassy-serial-connector-demo weather-mesh-common weather-hub weather-station-alpha weather-station-beta hello-mailbox hello-single-latest-async; do \
+	for pkg in aimdb-executor aimdb-derive aimdb-data-contracts aimdb-core aimdb-client aimdb-embassy-adapter aimdb-tokio-adapter aimdb-wasm-adapter aimdb-sync aimdb-persistence aimdb-persistence-sqlite aimdb-mqtt-connector aimdb-knx-connector aimdb-ws-protocol aimdb-websocket-connector aimdb-uds-connector aimdb-serial-connector aimdb-codegen aimdb-cli aimdb-mcp sync-api-demo tokio-mqtt-connector-demo embassy-mqtt-connector-demo tokio-knx-connector-demo embassy-knx-connector-demo embassy-serial-connector-demo weather-mesh-common weather-hub weather-station-alpha weather-station-beta hello-mailbox hello-single-latest-async aimdb-bench; do \
 		printf "$(YELLOW)  → Checking $$pkg$(NC)\n"; \
 		if ! cargo fmt -p $$pkg -- --check 2>&1; then \
 			printf "$(RED)❌ Formatting check failed for $$pkg$(NC)\n"; \
diff --git a/aimdb-bench/Cargo.toml b/aimdb-bench/Cargo.toml
new file mode 100644
index 0000000..552f5ae
--- /dev/null
+++ b/aimdb-bench/Cargo.toml
@@ -0,0 +1,51 @@
+[package]
+name = "aimdb-bench"
+version = "0.1.0"
+edition = "2021"
+publish = false
+description = "Benchmarking infrastructure for AimDB — not for production use"
+
+[lib]
+name = "aimdb_bench"
+
+[[bench]]
+name = "b0_alloc_tokio"
+harness = false
+
+[[bench]]
+name = "b1_latency"
+harness = false
+
+[[bench]]
+name = "b2_throughput"
+harness = false
+
+[[bench]]
+name = "b_runner_pipeline"
+harness = false
+
+[[bench]]
+name = "b_alloc_pipeline"
+harness = false
+
+[dependencies]
+# Core AimDB types
+aimdb-core = { path = "../aimdb-core", features = ["std"] }
+
+# Tokio runtime adapter (the primary target for B0-B2)
+aimdb-tokio-adapter = { path = "../aimdb-tokio-adapter", features = [
+    "tokio-runtime",
+] }
+
+# Async runtime — current-thread executor is used for noise reduction in B0
+tokio = { workspace = true }
+
+# JSON output for baseline snapshots
+serde = { workspace = true }
+serde_json = { workspace = true }
+
+[dev-dependencies]
+# Statistical benchmarking for B1/B2
+criterion = { version = "0.5", default-features = false, features = [
+    "cargo_bench_support",
+] }
diff --git a/aimdb-bench/benches/b0_alloc_tokio.rs b/aimdb-bench/benches/b0_alloc_tokio.rs
new file mode 100644
index 0000000..b74c36a
--- /dev/null
+++ b/aimdb-bench/benches/b0_alloc_tokio.rs
@@ -0,0 +1,133 @@
+//! B0 — Allocation counting on the Tokio adapter.
+//!
+//! Measures per-message allocation cost for each workload profile using
+//! `TokioBuffer<T>` directly (not the full `AimDb` stack).
+//!
+//! **Pre-W8 baseline (expected):** 1 alloc/msg from the `Box::pin(async
+//! move { ... })` constructed inside `TokioBufferReader::recv()` on every
+//! call.  The target is **0 allocs/msg**.
+//!
+//! **Measurement model:**
+//! 1. Create buffer + reader.
+//! 2. Warmup ≥ `WARMUP_ITERS` push → recv cycles (excluded from counters).
+//! 3. `reset()` allocation counters.
+//! 4. Run `BATCH_SIZE` push → recv cycles.
+//! 5. `snapshot()` counters; divide by `BATCH_SIZE` for per-message figures.
+//!
+//! **Noise reduction:** a current-thread Tokio runtime is used so there are
+//! no work-stealing threads and Tokio's scheduler does not allocate per-poll
+//! in the hot path.  The counter then cleanly isolates AimDB's per-message
+//! contribution.
+//!
+//! Run:
+//! ```text
+//! cargo bench -p aimdb-bench --bench b0_alloc_tokio
+//! ```
+//!
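+//! Results are written to `target/bench-results/b0_alloc_tokio.json`, relative to the bench's working directory (Cargo runs benches from the package root, `aimdb-bench/`).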
+
+use aimdb_bench::{
+    alloc::{reset, snapshot},
+    profiles::{
+        command_buffer, command_msg, state_buffer, state_msg, telemetry_buffer, telemetry_msg,
+        BATCH_SIZE, WARMUP_ITERS,
+    },
+    reports::AllocReport,
+};
+use aimdb_core::buffer::{Buffer, BufferReader};
+
+fn main() {
+    // Current-thread executor — no work-stealing threads, minimal scheduler
+    // overhead, clean allocation signal.
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("failed to build current-thread Tokio runtime");
+
+    println!("=== B0 Allocation Benchmarks (Tokio adapter, buffer layer) ===");
+    println!("  Warmup iters : {WARMUP_ITERS}");
+    println!("  Batch size   : {BATCH_SIZE}");
+    println!();
+
+    // ── Telemetry: SpmcRing / broadcast ─────────────────────────────────────
+    //
+    // Subscribe before pushing so the reader holds its read position from the
+    // start — a reader created after sends are in flight misses them.
+    let telemetry_report = rt.block_on(async {
+        let buf = telemetry_buffer();
+        let mut reader = buf.subscribe();
+
+        for i in 0..WARMUP_ITERS {
+            buf.push(telemetry_msg(i as u64));
+            let _ = reader.recv().await;
+        }
+
+        reset();
+        for i in 0..BATCH_SIZE {
+            buf.push(telemetry_msg((WARMUP_ITERS + i) as u64));
+            let _ = reader.recv().await;
+        }
+        let (allocs, bytes) = snapshot();
+        AllocReport::new("Telemetry", "SpmcRing", BATCH_SIZE, allocs, bytes)
+    });
+    telemetry_report.print();
+
+    // ── State: SingleLatest / watch ──────────────────────────────────────────
+    let state_report = rt.block_on(async {
+        let buf = state_buffer();
+        let mut reader = buf.subscribe();
+
+        for i in 0..WARMUP_ITERS {
+            buf.push(state_msg(i as u64));
+            let _ = reader.recv().await;
+        }
+
+        reset();
+        for i in 0..BATCH_SIZE {
+            buf.push(state_msg((WARMUP_ITERS + i) as u64));
+            let _ = reader.recv().await;
+        }
+        let (allocs, bytes) = snapshot();
+        AllocReport::new("State", "SingleLatest", BATCH_SIZE, allocs, bytes)
+    });
+    state_report.print();
+
+    // ── Command: Mailbox / Mutex + Notify ────────────────────────────────────
+    //
+    // Tight 1:1 push → recv loop matches Mailbox semantics.  Do NOT batch
+    // pushes ahead of the consumer: the single slot overwrites earlier values,
+    // and only the last write survives — which would conflate Mailbox overwrite
+    // semantics with throughput measurement.
+    let command_report = rt.block_on(async {
+        let buf = command_buffer();
+        let mut reader = buf.subscribe();
+
+        for i in 0..WARMUP_ITERS {
+            buf.push(command_msg(i as u64));
+            let _ = reader.recv().await;
+        }
+
+        reset();
+        for i in 0..BATCH_SIZE {
+            buf.push(command_msg((WARMUP_ITERS + i) as u64));
+            let _ = reader.recv().await;
+        }
+        let (allocs, bytes) = snapshot();
+        AllocReport::new("Command", "Mailbox", BATCH_SIZE, allocs, bytes)
+    });
+    command_report.print();
+
+    println!();
+    println!(
+        "Pre-W8 expectation: ~1 alloc/msg (Box::pin in recv()). \
+         Target: 0 allocs/msg."
+    );
+
+    // Persist results for baseline comparison.
+    let reports = vec![telemetry_report, state_report, command_report];
+    let json = serde_json::to_string_pretty(&reports).expect("failed to serialize reports");
+    let out_dir = "target/bench-results";
+    std::fs::create_dir_all(out_dir).ok();
+    let out_path = format!("{out_dir}/b0_alloc_tokio.json");
+    std::fs::write(&out_path, &json).expect("failed to write results");
+    println!("\nResults written to {out_path}");
+}
diff --git a/aimdb-bench/benches/b1_latency.rs b/aimdb-bench/benches/b1_latency.rs
new file mode 100644
index 0000000..9b2e07a
--- /dev/null
+++ b/aimdb-bench/benches/b1_latency.rs
@@ -0,0 +1,138 @@
+//! B1 — Push-to-recv latency benchmarks (Criterion).
+//!
+//! Measures the wall-clock latency from `buf.push(msg)` to `reader.recv()`
+//! returning, for each workload profile.  Uses `TokioBuffer<T>` directly to
+//! isolate the buffer layer from `AimDb` initialization overhead.
+//!
+//! **Measurement model:** `iter_custom` gives Criterion the total elapsed time
+//! for *iters* push → recv cycles (post-warmup).  Criterion derives per-iteration
+//! mean/median estimates from many such samples (aggregates, not per-message p50/p99).
+//!
+//! **Executor:** a single current-thread Tokio runtime is shared across all
+//! bench iterations.  This eliminates work-stealing scheduler noise and keeps
+//! the signal comparable to the B0 allocator runs.
+//!
+//! Run:
+//! ```text
+//! cargo bench -p aimdb-bench --bench b1_latency
+//! # Save a named baseline:
+//! cargo bench -p aimdb-bench --bench b1_latency -- --save-baseline pre-w8
+//! # Compare against that baseline:
+//! cargo bench -p aimdb-bench --bench b1_latency -- --baseline pre-w8
+//! ```
+
+use aimdb_bench::profiles::{
+    command_buffer, command_msg, state_buffer, state_msg, telemetry_buffer, telemetry_msg,
+    WARMUP_ITERS,
+};
+use aimdb_core::buffer::{Buffer, BufferReader};
+use criterion::{criterion_group, criterion_main, Criterion};
+
+// ── Telemetry: SpmcRing / broadcast ──────────────────────────────────────────
+
+fn bench_latency_telemetry(c: &mut Criterion) {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut group = c.benchmark_group("B1-Latency");
+
+    group.bench_function("telemetry_spsc", |b| {
+        b.iter_custom(|iters| {
+            rt.block_on(async {
+                let buf = telemetry_buffer();
+                let mut reader = buf.subscribe();
+
+                // Warmup — not timed.
+                for i in 0..WARMUP_ITERS {
+                    buf.push(telemetry_msg(i as u64));
+                    let _ = reader.recv().await;
+                }
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(telemetry_msg((WARMUP_ITERS as u64) + i));
+                    let _ = reader.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+// ── State: SingleLatest / watch ───────────────────────────────────────────────
+
+fn bench_latency_state(c: &mut Criterion) {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut group = c.benchmark_group("B1-Latency");
+
+    group.bench_function("state_spsc", |b| {
+        b.iter_custom(|iters| {
+            rt.block_on(async {
+                let buf = state_buffer();
+                let mut reader = buf.subscribe();
+
+                for i in 0..WARMUP_ITERS {
+                    buf.push(state_msg(i as u64));
+                    let _ = reader.recv().await;
+                }
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(state_msg((WARMUP_ITERS as u64) + i));
+                    let _ = reader.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+// ── Command: Mailbox / Notify ─────────────────────────────────────────────────
+
+fn bench_latency_command(c: &mut Criterion) {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut group = c.benchmark_group("B1-Latency");
+
+    // Tight 1:1 push → recv loop — matches Mailbox semantics.
+    group.bench_function("command_mailbox", |b| {
+        b.iter_custom(|iters| {
+            rt.block_on(async {
+                let buf = command_buffer();
+                let mut reader = buf.subscribe();
+
+                for i in 0..WARMUP_ITERS {
+                    buf.push(command_msg(i as u64));
+                    let _ = reader.recv().await;
+                }
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(command_msg((WARMUP_ITERS as u64) + i));
+                    let _ = reader.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_latency_telemetry,
+    bench_latency_state,
+    bench_latency_command,
+);
+criterion_main!(benches);
diff --git a/aimdb-bench/benches/b2_throughput.rs b/aimdb-bench/benches/b2_throughput.rs
new file mode 100644
index 0000000..9722ff0
--- /dev/null
+++ b/aimdb-bench/benches/b2_throughput.rs
@@ -0,0 +1,174 @@
+//! B2 — Steady-state throughput benchmarks (Criterion).
+//!
+//! Measures messages per second for SPSC (1 producer, 1 consumer) and 1→4
+//! fan-out configurations, using `TokioBuffer<T>` directly.
+//!
+//! **Fan-out safety rules (SpmcRing / broadcast):**
+//! - All readers are subscribed *before* any messages are pushed so each
+//!   reader holds its read position from the start.
+//! - The fan-out ring is sized to `max(TELEMETRY_CAPACITY, iters + BATCH_SIZE)`,
+//!   which prevents `BufferLagged` within a single `iter_custom` call.
+//!
+//! **Mailbox throughput:** tight 1:1 push → recv loop.  Do NOT batch pushes
+//! ahead of the consumer — the single slot overwrites earlier values and
+//! only the last write survives, which conflates Mailbox overwrite semantics
+//! with throughput measurement.  See design 038 §4 for details.
+//!
+//! **Executor:** single current-thread Tokio runtime, same as B0/B1.
+//!
+//! Run:
+//! ```text
+//! cargo bench -p aimdb-bench --bench b2_throughput
+//! cargo bench -p aimdb-bench --bench b2_throughput -- --save-baseline pre-w8
+//! cargo bench -p aimdb-bench --bench b2_throughput -- --baseline pre-w8
+//! ```
+
+use aimdb_bench::profiles::{
+    command_buffer, command_msg, state_buffer, state_msg, telemetry_buffer, telemetry_msg,
+    BATCH_SIZE, TELEMETRY_CAPACITY,
+};
+use aimdb_core::buffer::{Buffer, BufferCfg, BufferReader};
+use aimdb_tokio_adapter::TokioBuffer;
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+
+// ── Telemetry SPSC ────────────────────────────────────────────────────────────
+
+fn bench_throughput_telemetry_spsc(c: &mut Criterion) {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut group = c.benchmark_group("B2-Throughput");
+    group.throughput(Throughput::Elements(1));
+
+    group.bench_function("telemetry_spsc", |b| {
+        b.iter_custom(|iters| {
+            rt.block_on(async {
+                // Subscribe before pushing — reader holds position from start.
+                let buf = telemetry_buffer();
+                let mut reader = buf.subscribe();
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(telemetry_msg(i));
+                    let _ = reader.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+// ── Telemetry 1→4 fan-out ────────────────────────────────────────────────────
+//
+// All 4 readers are subscribed before any messages are pushed.
+// Each iteration: 1 push + recv on all 4 readers (sequential in bench, as
+// they would all eventually converge on a current-thread executor).
+// The ring is sized to at least `iters + BATCH_SIZE`, so no reader lags.
+
+fn bench_throughput_telemetry_fanout(c: &mut Criterion) {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut group = c.benchmark_group("B2-Throughput");
+    // Each iteration produces 1 message observed by 4 consumers.
+    group.throughput(Throughput::Elements(1));
+
+    group.bench_function("telemetry_fanout_1x4", |b| {
+        b.iter_custom(|iters| {
+            rt.block_on(async {
+                // All readers subscribed before first push (required for
+                // broadcast correctness: capacity >= iters avoids lag).
+                let buf = TokioBuffer::<_>::new(&BufferCfg::SpmcRing {
+                    capacity: TELEMETRY_CAPACITY.max(iters as usize + BATCH_SIZE),
+                });
+                let mut r0 = buf.subscribe();
+                let mut r1 = buf.subscribe();
+                let mut r2 = buf.subscribe();
+                let mut r3 = buf.subscribe();
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(telemetry_msg(i));
+                    let _ = r0.recv().await;
+                    let _ = r1.recv().await;
+                    let _ = r2.recv().await;
+                    let _ = r3.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+// ── State SPSC ────────────────────────────────────────────────────────────────
+
+fn bench_throughput_state_spsc(c: &mut Criterion) {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut group = c.benchmark_group("B2-Throughput");
+    group.throughput(Throughput::Elements(1));
+
+    group.bench_function("state_spsc", |b| {
+        b.iter_custom(|iters| {
+            rt.block_on(async {
+                let buf = state_buffer();
+                let mut reader = buf.subscribe();
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(state_msg(i));
+                    let _ = reader.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+// ── Command / Mailbox SPSC ────────────────────────────────────────────────────
+
+fn bench_throughput_command_mailbox(c: &mut Criterion) {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut group = c.benchmark_group("B2-Throughput");
+    group.throughput(Throughput::Elements(1));
+
+    group.bench_function("command_mailbox", |b| {
+        b.iter_custom(|iters| {
+            rt.block_on(async {
+                let buf = command_buffer();
+                let mut reader = buf.subscribe();
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(command_msg(i));
+                    let _ = reader.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_throughput_telemetry_spsc,
+    bench_throughput_telemetry_fanout,
+    bench_throughput_state_spsc,
+    bench_throughput_command_mailbox,
+);
+criterion_main!(benches);
diff --git a/aimdb-bench/benches/b_alloc_pipeline.rs b/aimdb-bench/benches/b_alloc_pipeline.rs
new file mode 100644
index 0000000..ecac7d4
--- /dev/null
+++ b/aimdb-bench/benches/b_alloc_pipeline.rs
@@ -0,0 +1,273 @@
+//! B0-Pipeline — Allocation counting for a live runner-driven pipeline.
+//!
+//! Measures per-message allocation cost for a real `.source()` -> buffer ->
+//! `.tap()` pipeline driven by `AimDbRunner`. Unlike `b0_alloc_tokio`, this is
+//! an integration-layer measurement: it includes runner/stage machinery in
+//! addition to the buffer consume path.
+//!
+//! **Scope:** this bench intentionally minimizes harness-side noise. The source
+//! generates each batch internally after a single start notification, and the
+//! tap signals completion once, when the whole batch has been consumed. No
+//! per-message ingress or ack channels are used in the measured window; the
+//! tap paces the source per message via an atomic token counter and `Notify`.
+//!
+//! **Interpretation:** use this as an informational companion to the raw-buffer
+//! B0 gate in `b0_alloc_tokio`. If this regresses, the raw-buffer B0 still
+//! tells you whether the issue is in the consume path itself.
+//!
+//! Run:
+//! ```text
+//! cargo bench -p aimdb-bench --bench b_alloc_pipeline
+//! ```
+//!
+//! Results are written to `target/bench-results/b_alloc_pipeline.json`.
+
+use std::fmt::Debug;
+use std::sync::{
+    atomic::{AtomicU64, AtomicUsize, Ordering},
+    Arc,
+};
+
+use aimdb_bench::{
+    alloc::{reset, snapshot},
+    profiles::{
+        command_msg, state_msg, telemetry_msg, CommandMsg, StateMsg, TelemetryMsg, BATCH_SIZE,
+        TELEMETRY_CAPACITY, WARMUP_ITERS,
+    },
+    reports::AllocReport,
+};
+use aimdb_core::{buffer::BufferCfg, AimDb, AimDbBuilder};
+use aimdb_tokio_adapter::{TokioAdapter, TokioRecordRegistrarExt};
+use tokio::sync::{oneshot, Notify};
+
+struct BatchState {
+    start_epoch: AtomicU64,
+    completed_epoch: AtomicU64,
+    batch_size: AtomicUsize,
+    target_epoch: AtomicU64,
+    consumed_in_epoch: AtomicUsize,
+    start_notify: Notify,
+    pace_tokens: AtomicUsize,
+    pace_notify: Notify,
+    done_notify: Notify,
+}
+
+impl BatchState {
+    fn new() -> Self {
+        Self {
+            start_epoch: AtomicU64::new(0),
+            completed_epoch: AtomicU64::new(0),
+            batch_size: AtomicUsize::new(0),
+            target_epoch: AtomicU64::new(0),
+            consumed_in_epoch: AtomicUsize::new(0),
+            start_notify: Notify::new(),
+            pace_tokens: AtomicUsize::new(0),
+            pace_notify: Notify::new(),
+            done_notify: Notify::new(),
+        }
+    }
+}
+
+struct PipelineHarness {
+    _db: AimDb,
+    state: Arc<BatchState>,
+}
+
+impl PipelineHarness {
+    async fn warmup(&self, batch_size: usize) {
+        self.run_batch(batch_size).await;
+    }
+
+    async fn measure(&self, batch_size: usize) {
+        self.run_batch(batch_size).await;
+    }
+
+    async fn run_batch(&self, batch_size: usize) {
+        let epoch = self.state.start_epoch.load(Ordering::Acquire) + 1;
+        self.state.batch_size.store(batch_size, Ordering::Release);
+        self.state.consumed_in_epoch.store(0, Ordering::Release);
+        self.state.target_epoch.store(epoch, Ordering::Release);
+        self.state.pace_tokens.store(1, Ordering::Release);
+        self.state.start_epoch.store(epoch, Ordering::Release);
+        self.state.start_notify.notify_waiters();
+        self.state.pace_notify.notify_waiters();
+
+        while self.state.completed_epoch.load(Ordering::Acquire) < epoch {
+            self.state.done_notify.notified().await;
+        }
+    }
+}
+
+async fn build_pipeline_harness<T, Make>(
+    key: &'static str,
+    cfg: BufferCfg,
+    make_value: Make,
+) -> PipelineHarness
+where
+    T: Send + Sync + Clone + Debug + 'static,
+    Make: Fn(u64) -> T + Send + Sync + Clone + 'static,
+{
+    let state = Arc::new(BatchState::new());
+    let (tap_ready_tx, tap_ready_rx) = oneshot::channel::<()>();
+
+    let adapter = Arc::new(TokioAdapter);
+    let mut builder = AimDbBuilder::new().runtime(adapter);
+    builder.configure::<T>(key, {
+        let state = Arc::clone(&state);
+        move |reg| {
+            let source_state = Arc::clone(&state);
+            let tap_state = Arc::clone(&state);
+            let mut tap_ready_tx = Some(tap_ready_tx);
+            let make_value = make_value.clone();
+
+            reg.buffer(cfg)
+                .source(move |_ctx, producer| async move {
+                    let mut next_value_index = 0u64;
+                    let mut seen_epoch = 0u64;
+                    loop {
+                        while source_state.start_epoch.load(Ordering::Acquire) == seen_epoch {
+                            source_state.start_notify.notified().await;
+                        }
+
+                        seen_epoch = source_state.start_epoch.load(Ordering::Acquire);
+                        let batch_size = source_state.batch_size.load(Ordering::Acquire);
+                        for _ in 0..batch_size {
+                            loop {
+                                let available = source_state.pace_tokens.load(Ordering::Acquire);
+                                if available > 0 {
+                                    if source_state
+                                        .pace_tokens
+                                        .compare_exchange(
+                                            available,
+                                            available - 1,
+                                            Ordering::AcqRel,
+                                            Ordering::Acquire,
+                                        )
+                                        .is_ok()
+                                    {
+                                        break;
+                                    }
+                                } else {
+                                    source_state.pace_notify.notified().await;
+                                }
+                            }
+                            producer.produce(make_value(next_value_index));
+                            next_value_index += 1;
+                        }
+                    }
+                })
+                .with_name("bench_source")
+                .tap(move |_ctx, consumer| async move {
+                    let mut reader = consumer.subscribe();
+                    tap_ready_tx
+                        .take()
+                        .expect("tap readiness sender already used")
+                        .send(())
+                        .expect("failed to signal tap readiness");
+
+                    loop {
+                        match reader.recv().await {
+                            Ok(_value) => {
+                                let current_epoch = tap_state.target_epoch.load(Ordering::Acquire);
+                                let seen =
+                                    tap_state.consumed_in_epoch.fetch_add(1, Ordering::AcqRel) + 1;
+                                let batch_size = tap_state.batch_size.load(Ordering::Acquire);
+                                if seen < batch_size {
+                                    tap_state.pace_tokens.fetch_add(1, Ordering::AcqRel);
+                                    tap_state.pace_notify.notify_waiters();
+                                }
+                                if seen == batch_size {
+                                    tap_state
+                                        .completed_epoch
+                                        .store(current_epoch, Ordering::Release);
+                                    tap_state.done_notify.notify_waiters();
+                                }
+                            }
+                            Err(_) => break,
+                        }
+                    }
+                })
+                .with_name("bench_tap");
+        }
+    });
+
+    let (db, runner) = builder
+        .build()
+        .await
+        .expect("alloc pipeline bench build failed");
+    tokio::spawn(runner.run());
+    tap_ready_rx
+        .await
+        .expect("pipeline tap exited before signalling readiness");
+
+    PipelineHarness { _db: db, state }
+}
+
+fn main() {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("failed to build current-thread Tokio runtime");
+
+    println!("=== B0 Allocation Benchmarks (Runner pipeline) ===");
+    println!("  Warmup batch : {WARMUP_ITERS}");
+    println!("  Measured batch: {BATCH_SIZE}");
+    println!();
+
+    let telemetry_report = rt.block_on(async {
+        let harness = build_pipeline_harness::<TelemetryMsg, _>(
+            "bench::telemetry",
+            BufferCfg::SpmcRing {
+                capacity: TELEMETRY_CAPACITY,
+            },
+            telemetry_msg,
+        )
+        .await;
+
+        harness.warmup(WARMUP_ITERS).await;
+        reset();
+        harness.measure(BATCH_SIZE).await;
+        let (allocs, bytes) = snapshot();
+        AllocReport::new("Telemetry", "SpmcRing", BATCH_SIZE, allocs, bytes)
+    });
+    telemetry_report.print();
+
+    let state_report = rt.block_on(async {
+        let harness = build_pipeline_harness::<StateMsg, _>(
+            "bench::state",
+            BufferCfg::SingleLatest,
+            state_msg,
+        )
+        .await;
+
+        harness.warmup(WARMUP_ITERS).await;
+        reset();
+        harness.measure(BATCH_SIZE).await;
+        let (allocs, bytes) = snapshot();
+        AllocReport::new("State", "SingleLatest", BATCH_SIZE, allocs, bytes)
+    });
+    state_report.print();
+
+    let command_report = rt.block_on(async {
+        let harness = build_pipeline_harness::<CommandMsg, _>(
+            "bench::command",
+            BufferCfg::Mailbox,
+            command_msg,
+        )
+        .await;
+
+        harness.warmup(WARMUP_ITERS).await;
+        reset();
+        harness.measure(BATCH_SIZE).await;
+        let (allocs, bytes) = snapshot();
+        AllocReport::new("Command", "Mailbox", BATCH_SIZE, allocs, bytes)
+    });
+    command_report.print();
+
+    let reports = vec![telemetry_report, state_report, command_report];
+    let json = serde_json::to_string_pretty(&reports).expect("failed to serialize reports");
+    let out_dir = "target/bench-results";
+    std::fs::create_dir_all(out_dir).ok();
+    let out_path = format!("{out_dir}/b_alloc_pipeline.json");
+    std::fs::write(&out_path, &json).expect("failed to write results");
+    println!("\nResults written to {out_path}");
+}
diff --git a/aimdb-bench/benches/b_runner_pipeline.rs b/aimdb-bench/benches/b_runner_pipeline.rs
new file mode 100644
index 0000000..36efb60
--- /dev/null
+++ b/aimdb-bench/benches/b_runner_pipeline.rs
@@ -0,0 +1,217 @@
+//! B-Runner-Pipeline — Runner-driven in-process pipeline throughput.
+//!
+//! Exercises the same three profiles as B0/B1/B2 through a real `AimDbRunner`
+//! path: `.source()` -> buffer -> `.tap()`.  This makes the benchmark measure
+//! stage wakeups and the runner-driven producer/consumer pipeline rather than
+//! direct `Producer<T>` / `Consumer<T>` calls from the bench body.
+//!
+//! **Scope:** this is a real runner benchmark, but still in-process only.
+//! It does not include outbound connectors, serialization, transport, or
+//! kernel I/O. The timing window includes the coordination handshakes used to
+//! feed messages into the source stage and observe completion at the tap stage.
+//!
+//! **Setup:** one `AimDb` instance is built per profile, and its runner is
+//! spawned once. Criterion samples then push work into the source stage via an
+//! ingress channel and wait for completion signals emitted by the tap stage.
+//!
+//! Run:
+//! ```text
+//! cargo bench -p aimdb-bench --bench b_runner_pipeline
+//! ```
+
+use std::fmt::Debug;
+use std::sync::Arc;
+
+use aimdb_bench::profiles::{
+    command_msg, state_msg, telemetry_msg, CommandMsg, StateMsg, TelemetryMsg, TELEMETRY_CAPACITY,
+    WARMUP_ITERS,
+};
+use aimdb_core::{buffer::BufferCfg, AimDb, AimDbBuilder};
+use aimdb_tokio_adapter::{TokioAdapter, TokioRecordRegistrarExt};
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+use tokio::sync::{
+    mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender},
+    oneshot,
+};
+
+struct RunnerHarness<T> {
+    _db: AimDb,
+    input_tx: UnboundedSender<T>,
+    ack_rx: UnboundedReceiver<()>,
+}
+
+impl<T> RunnerHarness<T> {
+    async fn round_trip(&mut self, value: T) {
+        self.input_tx
+            .send(value)
+            .expect("source input channel closed unexpectedly");
+        self.ack_rx
+            .recv()
+            .await
+            .expect("tap acknowledgement channel closed unexpectedly");
+    }
+}
+
+async fn build_runner_harness<T>(key: &'static str, cfg: BufferCfg) -> RunnerHarness<T>
+where
+    T: Send + Sync + Clone + Debug + 'static,
+{
+    let (input_tx, mut input_rx) = unbounded_channel::<T>();
+    let (ack_tx, ack_rx) = unbounded_channel::<()>();
+    let (tap_ready_tx, tap_ready_rx) = oneshot::channel::<()>();
+
+    let adapter = Arc::new(TokioAdapter);
+    let mut builder = AimDbBuilder::new().runtime(adapter);
+    builder.configure::<T>(key, move |reg| {
+        let ack_tx = ack_tx.clone();
+        let mut tap_ready_tx = Some(tap_ready_tx);
+
+        reg.buffer(cfg)
+            .source(move |_ctx, producer| async move {
+                while let Some(value) = input_rx.recv().await {
+                    producer.produce(value);
+                }
+            })
+            .with_name("bench_source")
+            .tap(move |_ctx, consumer| async move {
+                let mut reader = consumer.subscribe();
+                tap_ready_tx
+                    .take()
+                    .expect("tap readiness sender already used")
+                    .send(())
+                    .expect("failed to signal tap readiness");
+                while let Ok(_value) = reader.recv().await {
+                    ack_tx
+                        .send(())
+                        .expect("bench tap ack channel closed unexpectedly");
+                }
+            })
+            .with_name("bench_tap");
+    });
+
+    let (db, runner) = builder.build().await.expect("runner bench build failed");
+    tokio::spawn(runner.run());
+    tap_ready_rx
+        .await
+        .expect("runner tap exited before signalling readiness");
+
+    RunnerHarness {
+        _db: db,
+        input_tx,
+        ack_rx,
+    }
+}
+
+// ── Telemetry E2E ─────────────────────────────────────────────────────────────
+
+fn bench_e2e_telemetry(c: &mut Criterion) {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut harness = rt.block_on(build_runner_harness::<TelemetryMsg>(
+        "bench::telemetry",
+        BufferCfg::SpmcRing {
+            capacity: TELEMETRY_CAPACITY,
+        },
+    ));
+
+    let mut group = c.benchmark_group("B-Runner-Pipeline");
+    group.throughput(Throughput::Elements(1));
+
+    group.bench_function("telemetry_spsc", |b| {
+        b.iter_custom(|iters| {
+            rt.block_on(async {
+                for i in 0..WARMUP_ITERS {
+                    harness.round_trip(telemetry_msg(i as u64)).await;
+                }
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    harness
+                        .round_trip(telemetry_msg((WARMUP_ITERS as u64) + i))
+                        .await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+// ── State E2E ─────────────────────────────────────────────────────────────────
+
+fn bench_e2e_state(c: &mut Criterion) {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut harness = rt.block_on(build_runner_harness::<StateMsg>(
+        "bench::state",
+        BufferCfg::SingleLatest,
+    ));
+
+    let mut group = c.benchmark_group("B-Runner-Pipeline");
+    group.throughput(Throughput::Elements(1));
+
+    group.bench_function("state_spsc", |b| {
+        b.iter_custom(|iters| {
+            rt.block_on(async {
+                for i in 0..WARMUP_ITERS {
+                    harness.round_trip(state_msg(i as u64)).await;
+                }
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    harness.round_trip(state_msg((WARMUP_ITERS as u64) + i)).await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+// ── Command E2E ───────────────────────────────────────────────────────────────
+
+fn bench_e2e_command(c: &mut Criterion) {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .build()
+        .expect("tokio runtime");
+
+    let mut harness = rt.block_on(build_runner_harness::<CommandMsg>(
+        "bench::command",
+        BufferCfg::Mailbox,
+    ));
+
+    let mut group = c.benchmark_group("B-Runner-Pipeline");
+    group.throughput(Throughput::Elements(1));
+
+    group.bench_function("command_mailbox", |b| {
+        b.iter_custom(|iters| {
+            rt.block_on(async {
+                for i in 0..WARMUP_ITERS {
+                    harness.round_trip(command_msg(i as u64)).await;
+                }
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    harness.round_trip(command_msg((WARMUP_ITERS as u64) + i)).await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_e2e_telemetry,
+    bench_e2e_state,
+    bench_e2e_command,
+);
+criterion_main!(benches);
diff --git a/aimdb-bench/src/alloc.rs b/aimdb-bench/src/alloc.rs
new file mode 100644
index 0000000..41f56e8
--- /dev/null
+++ b/aimdb-bench/src/alloc.rs
@@ -0,0 +1,72 @@
+//! Allocation counting for B0 benchmarks.
+//!
+//! Wraps the system allocator with atomic counters so B0 benchmarks can
+//! measure per-message allocation overhead precisely.
+//!
+//! **Production isolation.** `#[global_allocator]` is a per-binary
+//! link-time declaration.  `CountingAllocator` exists only in the bench
+//! binaries produced by `aimdb-bench`.  Nothing in the production
+//! dependency graph — `aimdb-core`, `aimdb-tokio-adapter`, application
+//! binaries — depends on `aimdb-bench`, so this has zero impact on
+//! production code.
+//!
+//! **Generic inner allocator.** `CountingAllocator<A>` is generic over the
+//! inner `GlobalAlloc` so a future embedded B3 target can swap `System` for
+//! `embedded-alloc` or similar without changing this module.
+
+use std::alloc::{GlobalAlloc, Layout, System};
+use std::sync::atomic::{AtomicU64, Ordering};
+
+/// Total allocation call count (since last [`reset`]).
+pub static ALLOC_COUNT: AtomicU64 = AtomicU64::new(0);
+
+/// Total bytes allocated (since last [`reset`]).
+pub static ALLOC_BYTES: AtomicU64 = AtomicU64::new(0);
+
+/// Wraps an inner `GlobalAlloc`, incrementing [`ALLOC_COUNT`] and
+/// [`ALLOC_BYTES`] on every allocation.
+pub struct CountingAllocator<A>(pub A);
+
+// SAFETY: we delegate every call to the inner allocator unchanged;
+// the only side-effect is the atomic counter updates.
+unsafe impl<A: GlobalAlloc> GlobalAlloc for CountingAllocator<A> {
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
+        ALLOC_BYTES.fetch_add(layout.size() as u64, Ordering::Relaxed);
+        // SAFETY: caller guarantees `layout` is valid; delegated to inner.
+        unsafe { self.0.alloc(layout) }
+    }
+
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        // SAFETY: caller guarantees `ptr` was allocated by this allocator.
+        unsafe { self.0.dealloc(ptr, layout) }
+    }
+}
+
+/// The global allocator used by all bench binaries.
+///
+/// Applies to every bench binary that links `aimdb-bench` — not to any
+/// production crate.
+#[global_allocator]
+static GLOBAL: CountingAllocator<System> = CountingAllocator(System);
+
+/// Reset both counters to zero.
+///
+/// Call once after the warmup phase, immediately before the measured window.
+#[inline]
+pub fn reset() {
+    ALLOC_COUNT.store(0, Ordering::Relaxed);
+    ALLOC_BYTES.store(0, Ordering::Relaxed);
+}
+
+/// Snapshot the current counters.
+///
+/// Returns `(count, bytes)` — total allocations and total bytes since the
+/// last [`reset`].
+#[inline]
+pub fn snapshot() -> (u64, u64) {
+    (
+        ALLOC_COUNT.load(Ordering::Relaxed),
+        ALLOC_BYTES.load(Ordering::Relaxed),
+    )
+}
diff --git a/aimdb-bench/src/lib.rs b/aimdb-bench/src/lib.rs
new file mode 100644
index 0000000..9835521
--- /dev/null
+++ b/aimdb-bench/src/lib.rs
@@ -0,0 +1,22 @@
+//! AimDB benchmarking infrastructure.
+//!
+//! Provides reusable primitives for B0 (allocation counting), B1 (latency),
+//! and B2 (throughput) benchmarks.  **Not for production use.**
+//!
+//! The `alloc` module registers [`alloc::CountingAllocator`] as the
+//! `#[global_allocator]` for every bench binary that links this crate.
+//! Nothing in the production dependency graph depends on `aimdb-bench`.
+//!
+//! # Bench entrypoints
+//!
+//! | File                           | Class | Purpose                                      |
+//! |--------------------------------|-------|----------------------------------------------|
+//! | `benches/b0_alloc_tokio.rs`    | B0    | Per-message allocation (buffer layer)        |
+//! | `benches/b1_latency.rs`        | B1    | Push-to-recv latency (buffer layer)          |
+//! | `benches/b2_throughput.rs`     | B2    | Steady-state throughput (buffer layer)       |
+//! | `benches/b_alloc_pipeline.rs`  | info  | Per-message allocation (runner pipeline)     |
+//! | `benches/b_runner_pipeline.rs` | info  | Runner pipeline throughput (Criterion)       |
+
+pub mod alloc;
+pub mod profiles;
+pub mod reports;
diff --git a/aimdb-bench/src/profiles.rs b/aimdb-bench/src/profiles.rs
new file mode 100644
index 0000000..b57cc4b
--- /dev/null
+++ b/aimdb-bench/src/profiles.rs
@@ -0,0 +1,112 @@
+//! Canonical workload profiles and deterministic message factories.
+//!
+//! Three profiles match the three buffer types:
+//!
+//! | Profile      | Buffer type   | Tokio primitive     | Payload   |
+//! |--------------|---------------|---------------------|-----------|
+//! | **Telemetry**| `SpmcRing`    | `broadcast`         | small     |
+//! | **State**    | `SingleLatest`| `watch`             | medium    |
+//! | **Command**  | `Mailbox`     | `Mutex + Notify`    | small     |
+//!
+//! Buffers are constructed from a `BufferCfg` via the `Buffer<T>` trait so
+//! the bench code tests exactly the same code path that production uses.
+
+use aimdb_core::buffer::{Buffer, BufferCfg};
+use aimdb_tokio_adapter::TokioBuffer;
+
+// ── Payload types ─────────────────────────────────────────────────────────────
+
+/// Small telemetry reading pushed at high frequency (Telemetry profile).
+#[derive(Debug, Clone, PartialEq)]
+pub struct TelemetryMsg {
+    pub sensor_id: u32,
+    pub value: f64,
+    pub sequence: u64,
+}
+
+/// Medium state snapshot with several fields (State profile).
+#[derive(Debug, Clone, PartialEq)]
+pub struct StateMsg {
+    pub device_id: u32,
+    pub temperature: f64,
+    pub humidity: f64,
+    pub pressure: f64,
+    pub sequence: u64,
+}
+
+/// Control command payload (Command / Mailbox profile).
+#[derive(Debug, Clone, PartialEq)]
+pub struct CommandMsg {
+    pub command_id: u32,
+    pub target: u32,
+    pub value: f64,
+    pub sequence: u64,
+}
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+/// Ring capacity for the Telemetry profile.
+///
+/// Must be ≥ `BATCH_SIZE` so B2 fan-out measurements never trigger
+/// `BufferLagged` within a single iteration.
+pub const TELEMETRY_CAPACITY: usize = 1024;
+
+/// Number of messages in the B0 measured batch window.
+pub const BATCH_SIZE: usize = 512;
+
+/// Warmup iterations run before, and excluded from, each measured window (B0 and runner pipeline).
+pub const WARMUP_ITERS: usize = 200;
+
+// ── Deterministic message factories ──────────────────────────────────────────
+
+/// Produce a deterministic `TelemetryMsg` for iteration `i`.
+#[inline]
+pub fn telemetry_msg(i: u64) -> TelemetryMsg {
+    TelemetryMsg {
+        sensor_id: (i % 16) as u32,
+        value: i as f64 * 0.1,
+        sequence: i,
+    }
+}
+
+/// Produce a deterministic `StateMsg` for iteration `i`.
+#[inline]
+pub fn state_msg(i: u64) -> StateMsg {
+    StateMsg {
+        device_id: (i % 8) as u32,
+        temperature: 20.0 + i as f64 * 0.01,
+        humidity: 50.0 + i as f64 * 0.005,
+        pressure: 1013.25 + i as f64 * 0.001,
+        sequence: i,
+    }
+}
+
+/// Produce a deterministic `CommandMsg` for iteration `i`.
+#[inline]
+pub fn command_msg(i: u64) -> CommandMsg {
+    CommandMsg {
+        command_id: (i % 256) as u32,
+        target: (i % 4) as u32,
+        value: i as f64,
+        sequence: i,
+    }
+}
+
+// ── Buffer constructors ───────────────────────────────────────────────────────
+
+/// Build a `TokioBuffer<TelemetryMsg>` backed by `SpmcRing` (`broadcast`).
+pub fn telemetry_buffer() -> TokioBuffer<TelemetryMsg> {
+    TokioBuffer::new(&BufferCfg::SpmcRing {
+        capacity: TELEMETRY_CAPACITY,
+    })
+}
+
+/// Build a `TokioBuffer<StateMsg>` backed by `SingleLatest` (`watch`).
+pub fn state_buffer() -> TokioBuffer<StateMsg> {
+    TokioBuffer::new(&BufferCfg::SingleLatest)
+}
+
+/// Build a `TokioBuffer<CommandMsg>` backed by `Mailbox` (`Mutex + Notify`).
+pub fn command_buffer() -> TokioBuffer<CommandMsg> {
+    TokioBuffer::new(&BufferCfg::Mailbox)
+}
diff --git a/aimdb-bench/src/reports.rs b/aimdb-bench/src/reports.rs
new file mode 100644
index 0000000..9b1f6d8
--- /dev/null
+++ b/aimdb-bench/src/reports.rs
@@ -0,0 +1,61 @@
+//! Result structs for B0 benchmark output.
+//!
+//! Serialized as JSON for storage in `data/baselines/`.  B1/B2 results are
+//! managed by Criterion's built-in baseline system (`target/criterion/`).
+
+use serde::{Deserialize, Serialize};
+
+/// B0 allocation report for a single workload profile.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AllocReport {
+    /// Profile name (e.g. "Telemetry", "State", "Command").
+    pub profile: String,
+    /// Buffer type (e.g. "SpmcRing", "SingleLatest", "Mailbox").
+    pub buffer_type: String,
+    /// Total allocations in the measured batch.
+    pub total_allocs: u64,
+    /// Total bytes allocated in the measured batch.
+    pub total_bytes: u64,
+    /// Number of messages in the batch.
+    pub batch_size: usize,
+    /// Mean allocations per message.
+    pub allocs_per_msg: f64,
+    /// Mean bytes allocated per message.
+    pub bytes_per_msg: f64,
+}
+
+impl AllocReport {
+    /// Construct from raw counter snapshot and batch metadata.
+    pub fn new(
+        profile: impl Into<String>,
+        buffer_type: impl Into<String>,
+        batch_size: usize,
+        total_allocs: u64,
+        total_bytes: u64,
+    ) -> Self {
+        let n = batch_size as f64;
+        Self {
+            profile: profile.into(),
+            buffer_type: buffer_type.into(),
+            total_allocs,
+            total_bytes,
+            batch_size,
+            allocs_per_msg: total_allocs as f64 / n,
+            bytes_per_msg: total_bytes as f64 / n,
+        }
+    }
+
+    /// Print a human-readable one-liner to stdout.
+    pub fn print(&self) {
+        println!(
+            "[B0] {:12} ({:12}): {:.3} allocs/msg  ({} total allocs, {} B/msg avg, {} B total, batch={})",
+            self.profile,
+            self.buffer_type,
+            self.allocs_per_msg,
+            self.total_allocs,
+            self.bytes_per_msg as u64,
+            self.total_bytes,
+            self.batch_size,
+        );
+    }
+}

From aab1cd689f250970329e195ecc26311e00c92cd9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Tue, 16 Jun 2026 20:21:58 +0000
Subject: [PATCH 02/16] feat(bench): update output directory for benchmark
 results and add baseline data for b0_alloc_tokio

---
 aimdb-bench/benches/b0_alloc_tokio.rs         |  4 +--
 aimdb-bench/benches/b_alloc_pipeline.rs       |  4 +--
 .../data/baselines/b0_alloc_tokio.json        | 29 +++++++++++++++++++
 3 files changed, 33 insertions(+), 4 deletions(-)
 create mode 100644 aimdb-bench/data/baselines/b0_alloc_tokio.json

diff --git a/aimdb-bench/benches/b0_alloc_tokio.rs b/aimdb-bench/benches/b0_alloc_tokio.rs
index b74c36a..09c7b53 100644
--- a/aimdb-bench/benches/b0_alloc_tokio.rs
+++ b/aimdb-bench/benches/b0_alloc_tokio.rs
@@ -125,8 +125,8 @@ fn main() {
     // Persist results for baseline comparison.
     let reports = vec![telemetry_report, state_report, command_report];
     let json = serde_json::to_string_pretty(&reports).expect("failed to serialize reports");
-    let out_dir = "target/bench-results";
-    std::fs::create_dir_all(out_dir).ok();
+    let out_dir = "aimdb-bench/target/bench-results";
+    std::fs::create_dir_all(out_dir).expect("failed to create results directory");
     let out_path = format!("{out_dir}/b0_alloc_tokio.json");
     std::fs::write(&out_path, &json).expect("failed to write results");
     println!("\nResults written to {out_path}");
diff --git a/aimdb-bench/benches/b_alloc_pipeline.rs b/aimdb-bench/benches/b_alloc_pipeline.rs
index ecac7d4..35619ab 100644
--- a/aimdb-bench/benches/b_alloc_pipeline.rs
+++ b/aimdb-bench/benches/b_alloc_pipeline.rs
@@ -265,8 +265,8 @@ fn main() {
 
     let reports = vec![telemetry_report, state_report, command_report];
     let json = serde_json::to_string_pretty(&reports).expect("failed to serialize reports");
-    let out_dir = "target/bench-results";
-    std::fs::create_dir_all(out_dir).ok();
+    let out_dir = "aimdb-bench/target/bench-results";
+    std::fs::create_dir_all(out_dir).expect("failed to create results directory");
     let out_path = format!("{out_dir}/b_alloc_pipeline.json");
     std::fs::write(&out_path, &json).expect("failed to write results");
     println!("\nResults written to {out_path}");
diff --git a/aimdb-bench/data/baselines/b0_alloc_tokio.json b/aimdb-bench/data/baselines/b0_alloc_tokio.json
new file mode 100644
index 0000000..afeb6ff
--- /dev/null
+++ b/aimdb-bench/data/baselines/b0_alloc_tokio.json
@@ -0,0 +1,29 @@
+[
+  {
+    "profile": "Telemetry",
+    "buffer_type": "SpmcRing",
+    "total_allocs": 512,
+    "total_bytes": 73728,
+    "batch_size": 512,
+    "allocs_per_msg": 1.0,
+    "bytes_per_msg": 144.0
+  },
+  {
+    "profile": "State",
+    "buffer_type": "SingleLatest",
+    "total_allocs": 512,
+    "total_bytes": 73728,
+    "batch_size": 512,
+    "allocs_per_msg": 1.0,
+    "bytes_per_msg": 144.0
+  },
+  {
+    "profile": "Command",
+    "buffer_type": "Mailbox",
+    "total_allocs": 512,
+    "total_bytes": 73728,
+    "batch_size": 512,
+    "allocs_per_msg": 1.0,
+    "bytes_per_msg": 144.0
+  }
+]
\ No newline at end of file

From 5b73b012c310ddb38af3ac8f44ff2200ce115717 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Tue, 16 Jun 2026 20:30:52 +0000
Subject: [PATCH 03/16] feat(bench): add README for benchmarking infrastructure
 and usage instructions

---
 aimdb-bench/README.md | 106 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 aimdb-bench/README.md

diff --git a/aimdb-bench/README.md b/aimdb-bench/README.md
new file mode 100644
index 0000000..37fd300
--- /dev/null
+++ b/aimdb-bench/README.md
@@ -0,0 +1,106 @@
+# aimdb-bench
+
+Benchmarking infrastructure for AimDB. **Not for production use.**
+
+Measures three classes of performance across three canonical workload profiles:
+
+| Class | Tool | Purpose | CI gate? |
+|---|---|---|---|
+| **B0** — allocations/msg | hand-rolled `CountingAllocator` | regression detection on the consume path | phase 5 (planned) |
+| **B1** — push-to-recv latency | Criterion p50/p99 | trend tracking | no |
+| **B2** — steady-state throughput | Criterion msgs/sec | trend tracking | no |
+
+Plus two informational benches that exercise the full runner-driven pipeline.
+
+**Adapters covered:** Tokio only. Embassy is a planned follow-up once it can be exercised through host-test stubs without pulling in `embassy-runtime`.
+
+---
+
+## Workload profiles
+
+Every bench runs the same three profiles, matching the three AimDB buffer types:
+
+| Profile | Buffer | Tokio primitive | Payload |
+|---|---|---|---|
+| **Telemetry** | `SpmcRing` | `broadcast` | small (32 B) |
+| **State** | `SingleLatest` | `watch` | medium (48 B) |
+| **Command** | `Mailbox` | `Mutex + Notify` | small (32 B) |
+
+---
+
+## Running
+
+Always run from the workspace root (`/aimdb_ws/aimdb`).
+
+```sh
+# B0 — allocation gate (buffer layer)
+cargo bench -p aimdb-bench --bench b0_alloc_tokio
+
+# B1 — latency (Criterion)
+cargo bench -p aimdb-bench --bench b1_latency
+
+# B2 — throughput (Criterion)
+cargo bench -p aimdb-bench --bench b2_throughput
+
+# Informational: allocation count through the runner pipeline
+cargo bench -p aimdb-bench --bench b_alloc_pipeline
+
+# Informational: runner-pipeline throughput (Criterion)
+cargo bench -p aimdb-bench --bench b_runner_pipeline
+
+# All at once
+cargo bench -p aimdb-bench
+```
+
+### Criterion baselines
+
+B1 and B2 use Criterion's built-in baseline system:
+
+```sh
+# Save a named baseline before a change
+cargo bench -p aimdb-bench --bench b1_latency -- --save-baseline pre-w8
+
+# Compare against it after
+cargo bench -p aimdb-bench --bench b1_latency -- --baseline pre-w8
+```
+
+Criterion writes HTML reports to `target/criterion/`.
+
+---
+
+## B0 — allocation gate
+
+`b0_alloc_tokio` does not use Criterion. It runs a fixed warmup + batch cycle and writes JSON results to `target/bench-results/b0_alloc_tokio.json`.
+
+**Measurement model:**
+1. Create buffer + reader.
+2. Warmup ≥ 200 push → recv cycles (excluded from counters).
+3. Reset allocation counters.
+4. Run 512 push → recv cycles.
+5. Snapshot counters; divide by 512 for per-message figures.
+
+The committed baseline lives in `data/baselines/b0_alloc_tokio.json`. When a change intentionally improves or changes allocation behaviour, re-run the bench and commit the updated JSON with a clear rationale in the commit message.
+
+**Noise reduction:** a `new_current_thread()` Tokio executor is used so there are no work-stealing threads and Tokio's scheduler does not allocate per-poll in the hot path.
+
+**Production isolation:** `#[global_allocator]` is a per-binary link-time declaration. `CountingAllocator` exists only in bench binaries. Nothing in the production dependency graph is affected.
+
+---
+
+## Informational pipeline benches
+
+`b_alloc_pipeline` and `b_runner_pipeline` exercise the same three profiles through a real `AimDbRunner` pipeline (`.source()` → buffer → `.tap()`). These include runner/stage machinery overhead on top of the buffer consume path.
+
+Use them as a comparison point, not a regression gate. If they regress, `b0_alloc_tokio` tells you whether the issue is in the consume path itself.
+
+---
+
+## Caveats
+
+- All benches measure a single current-thread Tokio executor. Results do not predict multi-threaded or work-stealing scheduler behavior.
+- B0 is a counter, not a memory profiler. It reports allocation count and byte total; not per-call precision or heap fragmentation.
+- Criterion p99 can vary ±5–10% on noisy CI runners. Use p50 medians for trend comparisons.
+- Always specify `--release` or debug build consistently when comparing runs; optimizations differ by 5–50×.
+- `b_alloc_pipeline` uses a paced source: per-message pace tokens and notification channels. The coordination overhead is included in the measured window.
+
+

From 631b7e7d4db046c7bf1e8ac1b4235711c0930499 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Tue, 16 Jun 2026 20:31:35 +0000
Subject: [PATCH 04/16] style: format async calls for better readability in
 benchmark functions

---
 aimdb-bench/benches/b_runner_pipeline.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/aimdb-bench/benches/b_runner_pipeline.rs b/aimdb-bench/benches/b_runner_pipeline.rs
index 36efb60..7c62181 100644
--- a/aimdb-bench/benches/b_runner_pipeline.rs
+++ b/aimdb-bench/benches/b_runner_pipeline.rs
@@ -164,7 +164,9 @@ fn bench_e2e_state(c: &mut Criterion) {
 
                 let start = std::time::Instant::now();
                 for i in 0..iters {
-                    harness.round_trip(state_msg((WARMUP_ITERS as u64) + i)).await;
+                    harness
+                        .round_trip(state_msg((WARMUP_ITERS as u64) + i))
+                        .await;
                 }
                 start.elapsed()
             })
@@ -198,7 +200,9 @@ fn bench_e2e_command(c: &mut Criterion) {
 
                 let start = std::time::Instant::now();
                 for i in 0..iters {
-                    harness.round_trip(command_msg((WARMUP_ITERS as u64) + i)).await;
+                    harness
+                        .round_trip(command_msg((WARMUP_ITERS as u64) + i))
+                        .await;
                 }
                 start.elapsed()
             })

From de94dd6c497fef72c2c91d6476f3f9c5b0371e60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Tue, 16 Jun 2026 20:42:22 +0000
Subject: [PATCH 05/16] fix: add missing license field in Cargo.toml

---
 aimdb-bench/Cargo.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aimdb-bench/Cargo.toml b/aimdb-bench/Cargo.toml
index 552f5ae..00a5fc7 100644
--- a/aimdb-bench/Cargo.toml
+++ b/aimdb-bench/Cargo.toml
@@ -3,6 +3,7 @@ name = "aimdb-bench"
 version = "0.1.0"
 edition = "2021"
 publish = false
+license = "Apache-2.0"
 description = "Benchmarking infrastructure for AimDB — not for production use"
 
 [lib]

From 319ef3f174e4507e73cd5fb7599ab0cf95ab5c78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Fri, 19 Jun 2026 19:29:10 +0000
Subject: [PATCH 06/16] feat(bench): enhance Makefile and README for
 benchmarking infrastructure, update output paths for results

---
 Makefile                                |  4 +++
 aimdb-bench/README.md                   |  3 +-
 aimdb-bench/benches/b0_alloc_tokio.rs   |  5 +--
 aimdb-bench/benches/b2_throughput.rs    | 46 +++++++++++++++++++------
 aimdb-bench/benches/b_alloc_pipeline.rs | 44 ++++++++++++-----------
 aimdb-bench/src/lib.rs                  | 14 ++++----
 6 files changed, 75 insertions(+), 41 deletions(-)

diff --git a/Makefile b/Makefile
index fe97993..3ba668a 100644
--- a/Makefile
+++ b/Makefile
@@ -106,6 +106,8 @@ build:
 	cargo build --package aimdb-serial-connector --no-default-features --features "tokio-runtime"
 	@printf "$(YELLOW)  → Building WASM adapter$(NC)\n"
 	cargo build --package aimdb-wasm-adapter --target wasm32-unknown-unknown --features "wasm-runtime"
+	@printf "$(YELLOW)  → Building benchmarking infrastructure (host-only, incl. benches)$(NC)\n"
+	cargo build --package aimdb-bench --benches
 
 test:
 	@printf "$(GREEN)Running all tests (valid combinations)...$(NC)\n"
@@ -262,6 +264,8 @@ clippy:
 	cargo clippy --package aimdb-serial-connector --target thumbv7em-none-eabihf --no-default-features --features "embassy-runtime,defmt" -- -D warnings
 	@printf "$(YELLOW)  → Clippy on WASM adapter$(NC)\n"
 	cargo clippy --package aimdb-wasm-adapter --target wasm32-unknown-unknown --features "wasm-runtime" -- -D warnings
+	@printf "$(YELLOW)  → Clippy on benchmarking infrastructure (host-only, incl. benches)$(NC)\n"
+	cargo clippy --package aimdb-bench --all-targets -- -D warnings
 
 doc:
 	@printf "$(GREEN)Generating dual-platform documentation...$(NC)\n"
diff --git a/aimdb-bench/README.md b/aimdb-bench/README.md
index 37fd300..95192d1 100644
--- a/aimdb-bench/README.md
+++ b/aimdb-bench/README.md
@@ -70,7 +70,7 @@ Criterion writes HTML reports to `target/criterion/`.
 
 ## B0 — allocation gate
 
-`b0_alloc_tokio` does not use Criterion. It runs a fixed warmup + batch cycle and writes JSON results to `target/bench-results/b0_alloc_tokio.json`.
+`b0_alloc_tokio` does not use Criterion. It runs a fixed warmup + batch cycle and writes JSON results to `aimdb-bench/target/bench-results/b0_alloc_tokio.json` (the path is anchored to the crate dir, so it is the same regardless of the directory you run from).
 
 **Measurement model:**
 1. Create buffer + reader.
@@ -99,6 +99,7 @@ Use them as a comparison point, not a regression gate. If they regress, `b0_allo
 
 - All benches measure a single current-thread Tokio executor. Results do not predict multi-threaded or work-stealing scheduler behavior.
 - B0 is a counter, not a memory profiler. It reports allocation count and byte total; not per-call precision or heap fragmentation.
+- B0's `bytes_per_msg` measures the **boxed `recv()` future**, not the message payload. The single per-message allocation is the `Box::pin` in `TokioBufferReader::recv()`, and because all three buffer arms share one `async` block the future is a single type sized to its largest arm — so all three profiles report the same byte count regardless of payload size.
 - Criterion p99 can vary ±5–10% on noisy CI runners. Use p50 medians for trend comparisons.
 - Always specify `--release` or debug build consistently when comparing runs; optimizations differ by 5–50×.
 - `b_alloc_pipeline` uses a paced source: per-message pace tokens and notification channels. The coordination overhead is included in the measured window.
diff --git a/aimdb-bench/benches/b0_alloc_tokio.rs b/aimdb-bench/benches/b0_alloc_tokio.rs
index 09c7b53..f5a09df 100644
--- a/aimdb-bench/benches/b0_alloc_tokio.rs
+++ b/aimdb-bench/benches/b0_alloc_tokio.rs
@@ -24,7 +24,8 @@
 //! cargo bench -p aimdb-bench --bench b0_alloc_tokio
 //! ```
 //!
-//! Results are written to `target/bench-results/b0_alloc_tokio.json`.
+//! Results are written to `aimdb-bench/target/bench-results/b0_alloc_tokio.json`
+//! (anchored to the crate dir, so the path is the same regardless of CWD).
 
 use aimdb_bench::{
     alloc::{reset, snapshot},
@@ -125,7 +126,7 @@ fn main() {
     // Persist results for baseline comparison.
     let reports = vec![telemetry_report, state_report, command_report];
     let json = serde_json::to_string_pretty(&reports).expect("failed to serialize reports");
-    let out_dir = "aimdb-bench/target/bench-results";
+    let out_dir = concat!(env!("CARGO_MANIFEST_DIR"), "/target/bench-results");
     std::fs::create_dir_all(out_dir).expect("failed to create results directory");
     let out_path = format!("{out_dir}/b0_alloc_tokio.json");
     std::fs::write(&out_path, &json).expect("failed to write results");
diff --git a/aimdb-bench/benches/b2_throughput.rs b/aimdb-bench/benches/b2_throughput.rs
index 9722ff0..80c5041 100644
--- a/aimdb-bench/benches/b2_throughput.rs
+++ b/aimdb-bench/benches/b2_throughput.rs
@@ -6,8 +6,9 @@
 //! **Fan-out safety rules (SpmcRing / broadcast):**
 //! - All readers are subscribed *before* any messages are pushed so each
 //!   reader holds its read position from the start.
-//! - `TELEMETRY_CAPACITY >= BATCH_SIZE` prevents `BufferLagged` within a
-//!   single Criterion iteration.
+//! - The loop is strict lockstep (1 push, then `recv` on every reader), so at
+//!   most one message is ever in flight; the ring capacity (`TELEMETRY_CAPACITY`)
+//!   is far more than enough to keep any reader from lagging within an iteration.
 //!
 //! **Mailbox throughput:** tight 1:1 push → recv loop.  Do NOT batch pushes
 //! ahead of the consumer — the single slot overwrites earlier values and
@@ -25,10 +26,9 @@
 
 use aimdb_bench::profiles::{
     command_buffer, command_msg, state_buffer, state_msg, telemetry_buffer, telemetry_msg,
-    BATCH_SIZE, TELEMETRY_CAPACITY,
+    WARMUP_ITERS,
 };
-use aimdb_core::buffer::{Buffer, BufferCfg, BufferReader};
-use aimdb_tokio_adapter::TokioBuffer;
+use aimdb_core::buffer::{Buffer, BufferReader};
 use criterion::{criterion_group, criterion_main, Criterion, Throughput};
 
 // ── Telemetry SPSC ────────────────────────────────────────────────────────────
@@ -48,6 +48,12 @@ fn bench_throughput_telemetry_spsc(c: &mut Criterion) {
                 let buf = telemetry_buffer();
                 let mut reader = buf.subscribe();
 
+                // Warmup — not timed.
+                for i in 0..WARMUP_ITERS {
+                    buf.push(telemetry_msg(i as u64));
+                    let _ = reader.recv().await;
+                }
+
                 let start = std::time::Instant::now();
                 for i in 0..iters {
                     buf.push(telemetry_msg(i));
@@ -80,16 +86,24 @@ fn bench_throughput_telemetry_fanout(c: &mut Criterion) {
     group.bench_function("telemetry_fanout_1x4", |b| {
         b.iter_custom(|iters| {
             rt.block_on(async {
-                // All readers subscribed before first push (required for
-                // broadcast correctness: capacity >= iters avoids lag).
-                let buf = TokioBuffer::<_>::new(&BufferCfg::SpmcRing {
-                    capacity: TELEMETRY_CAPACITY.max(iters as usize + BATCH_SIZE),
-                });
+                // All readers subscribed before first push so each holds its
+                // read position from the start. Lockstep below keeps at most one
+                // message in flight, so the fixed ring capacity never lags.
+                let buf = telemetry_buffer();
                 let mut r0 = buf.subscribe();
                 let mut r1 = buf.subscribe();
                 let mut r2 = buf.subscribe();
                 let mut r3 = buf.subscribe();
 
+                // Warmup — not timed (mirrors B1 and the SPSC benches).
+                for i in 0..WARMUP_ITERS {
+                    buf.push(telemetry_msg(i as u64));
+                    let _ = r0.recv().await;
+                    let _ = r1.recv().await;
+                    let _ = r2.recv().await;
+                    let _ = r3.recv().await;
+                }
+
                 let start = std::time::Instant::now();
                 for i in 0..iters {
                     buf.push(telemetry_msg(i));
@@ -122,6 +136,12 @@ fn bench_throughput_state_spsc(c: &mut Criterion) {
                 let buf = state_buffer();
                 let mut reader = buf.subscribe();
 
+                // Warmup — not timed.
+                for i in 0..WARMUP_ITERS {
+                    buf.push(state_msg(i as u64));
+                    let _ = reader.recv().await;
+                }
+
                 let start = std::time::Instant::now();
                 for i in 0..iters {
                     buf.push(state_msg(i));
@@ -151,6 +171,12 @@ fn bench_throughput_command_mailbox(c: &mut Criterion) {
                 let buf = command_buffer();
                 let mut reader = buf.subscribe();
 
+                // Warmup — not timed.
+                for i in 0..WARMUP_ITERS {
+                    buf.push(command_msg(i as u64));
+                    let _ = reader.recv().await;
+                }
+
                 let start = std::time::Instant::now();
                 for i in 0..iters {
                     buf.push(command_msg(i));
diff --git a/aimdb-bench/benches/b_alloc_pipeline.rs b/aimdb-bench/benches/b_alloc_pipeline.rs
index 35619ab..978c5c9 100644
--- a/aimdb-bench/benches/b_alloc_pipeline.rs
+++ b/aimdb-bench/benches/b_alloc_pipeline.rs
@@ -20,7 +20,15 @@
 //! cargo bench -p aimdb-bench --bench b_alloc_pipeline
 //! ```
 //!
-//! Results are written to `target/bench-results/b_alloc_pipeline.json`.
+//! Results are written to `aimdb-bench/target/bench-results/b_alloc_pipeline.json`
+//! (anchored to the crate dir, so the path is the same regardless of CWD).
+//!
+//! **Executor dependency.** The source/tap pacing below uses a check-then-await
+//! pattern (load an atomic, and only `.notified().await` if there is no work).
+//! `Notify::notify_waiters()` does not store a permit, so this is only free of
+//! lost wakeups because the bench runs on a **current-thread** Tokio runtime:
+//! nothing can preempt between the atomic load and the `.await`. Do not port
+//! this harness to a multi-threaded executor without revisiting the pacing.
 
 use std::fmt::Debug;
 use std::sync::{
@@ -165,25 +173,19 @@ where
                         .send(())
                         .expect("failed to signal tap readiness");
 
-                    loop {
-                        match reader.recv().await {
-                            Ok(_value) => {
-                                let current_epoch = tap_state.target_epoch.load(Ordering::Acquire);
-                                let seen =
-                                    tap_state.consumed_in_epoch.fetch_add(1, Ordering::AcqRel) + 1;
-                                let batch_size = tap_state.batch_size.load(Ordering::Acquire);
-                                if seen < batch_size {
-                                    tap_state.pace_tokens.fetch_add(1, Ordering::AcqRel);
-                                    tap_state.pace_notify.notify_waiters();
-                                }
-                                if seen == batch_size {
-                                    tap_state
-                                        .completed_epoch
-                                        .store(current_epoch, Ordering::Release);
-                                    tap_state.done_notify.notify_waiters();
-                                }
-                            }
-                            Err(_) => break,
+                    while let Ok(_value) = reader.recv().await {
+                        let current_epoch = tap_state.target_epoch.load(Ordering::Acquire);
+                        let seen = tap_state.consumed_in_epoch.fetch_add(1, Ordering::AcqRel) + 1;
+                        let batch_size = tap_state.batch_size.load(Ordering::Acquire);
+                        if seen < batch_size {
+                            tap_state.pace_tokens.fetch_add(1, Ordering::AcqRel);
+                            tap_state.pace_notify.notify_waiters();
+                        }
+                        if seen == batch_size {
+                            tap_state
+                                .completed_epoch
+                                .store(current_epoch, Ordering::Release);
+                            tap_state.done_notify.notify_waiters();
                         }
                     }
                 })
@@ -265,7 +267,7 @@ fn main() {
 
     let reports = vec![telemetry_report, state_report, command_report];
     let json = serde_json::to_string_pretty(&reports).expect("failed to serialize reports");
-    let out_dir = "aimdb-bench/target/bench-results";
+    let out_dir = concat!(env!("CARGO_MANIFEST_DIR"), "/target/bench-results");
     std::fs::create_dir_all(out_dir).expect("failed to create results directory");
     let out_path = format!("{out_dir}/b_alloc_pipeline.json");
     std::fs::write(&out_path, &json).expect("failed to write results");
diff --git a/aimdb-bench/src/lib.rs b/aimdb-bench/src/lib.rs
index 9835521..6400509 100644
--- a/aimdb-bench/src/lib.rs
+++ b/aimdb-bench/src/lib.rs
@@ -9,13 +9,13 @@
 //!
 //! # Bench entrypoints
 //!
-//! | File                         | Class | Purpose                                      |
-//! |------------------------------|-------|----------------------------------------------|
-//! | `benches/b0_alloc_tokio.rs`  | B0    | Per-message allocation (buffer layer)        |
-//! | `benches/b1_latency.rs`      | B1    | Push-to-recv latency (buffer layer)          |
-//! | `benches/b2_throughput.rs`   | B2    | Steady-state throughput (buffer layer)       |
-//! | `benches/b_alloc_pipeline.rs`| info  | Per-message allocation (runner pipeline)     |
-//! | `benches/b_runner_pipeline.rs`     | info  | Runner pipeline throughput (Criterion)       |
+//! | File                            | Class | Purpose                                  |
+//! |---------------------------------|-------|------------------------------------------|
+//! | `benches/b0_alloc_tokio.rs`     | B0    | Per-message allocation (buffer layer)    |
+//! | `benches/b1_latency.rs`         | B1    | Push-to-recv latency (buffer layer)      |
+//! | `benches/b2_throughput.rs`      | B2    | Steady-state throughput (buffer layer)   |
+//! | `benches/b_alloc_pipeline.rs`   | info  | Per-message allocation (runner pipeline) |
+//! | `benches/b_runner_pipeline.rs`  | info  | Runner pipeline throughput (Criterion)   |
 
 pub mod alloc;
 pub mod profiles;

From f852034906253b9ec649906c95a133cedb8aab4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Sat, 20 Jun 2026 06:50:05 +0000
Subject: [PATCH 07/16] feat(buffer): implement zero-allocation consume path
 for BufferReader SPI

- Replace the boxed, object-erased async `recv` on the `BufferReader` SPI with a poll-based `poll_recv`, keeping the trait object-safe while removing the per-call heap allocation.
- Remove WasmRecvFuture and its associated heap allocation, simplifying the reader implementation.
- Update documentation to reflect the new design and performance metrics.
- Preserve the consumer-facing async `recv` by providing it on the new `Reader` handle (which wraps a boxed reader) rather than on the `BufferReader` trait itself.
- Introduce measurement program to validate allocation and performance improvements across different buffer profiles.
---
 Cargo.lock                                    |   1 +
 aimdb-bench/README.md                         |   4 +-
 aimdb-bench/benches/b0_alloc_tokio.rs         |   8 +-
 aimdb-bench/benches/b1_latency.rs             |   8 +-
 aimdb-bench/benches/b2_throughput.rs          |  16 +-
 .../data/baselines/b0_alloc_tokio.json        |  26 +-
 aimdb-core/CHANGELOG.md                       |   7 +
 aimdb-core/src/buffer/mod.rs                  |   4 +
 aimdb-core/src/buffer/reader.rs               |  85 ++++
 aimdb-core/src/buffer/traits.rs               |  51 ++-
 aimdb-core/src/builder.rs                     |   5 +-
 aimdb-core/src/lib.rs                         |   3 +
 aimdb-core/src/profiling/mod.rs               |  25 +-
 aimdb-core/src/remote/stream.rs               |  58 ++-
 aimdb-core/src/typed_api.rs                   |  32 +-
 aimdb-core/src/typed_record.rs                |  51 ++-
 aimdb-embassy-adapter/src/buffer.rs           | 417 +++++++++++------
 aimdb-tokio-adapter/Cargo.toml                |  10 +-
 aimdb-tokio-adapter/src/buffer.rs             | 433 +++++++++++-------
 aimdb-wasm-adapter/src/buffer.rs              |  85 ++--
 docs/design/037-zero-alloc-consume-path.md    | 145 ++++++
 21 files changed, 985 insertions(+), 489 deletions(-)
 create mode 100644 aimdb-core/src/buffer/reader.rs
 create mode 100644 docs/design/037-zero-alloc-consume-path.md

diff --git a/Cargo.lock b/Cargo.lock
index bcbe97d..2d8ffc3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -313,6 +313,7 @@ dependencies = [
  "serde_json",
  "tokio",
  "tokio-test",
+ "tokio-util",
  "tracing",
 ]
 
diff --git a/aimdb-bench/README.md b/aimdb-bench/README.md
index 95192d1..2c57eb2 100644
--- a/aimdb-bench/README.md
+++ b/aimdb-bench/README.md
@@ -81,6 +81,8 @@ Criterion writes HTML reports to `target/criterion/`.
 
 The committed baseline lives in `data/baselines/b0_alloc_tokio.json`. When a change intentionally improves or changes allocation behaviour, re-run the bench and commit the updated JSON with a clear rationale in the commit message.
 
+> **W8 result (design 037).** Since the zero-allocation consume path landed, the baseline records **0 allocs/msg** across all three tokio profiles (down from 1 — the boxed `recv()` future is gone). The committed baseline is therefore the target value; any nonzero B0 on these profiles is a regression to investigate.
+
 **Noise reduction:** a `new_current_thread()` Tokio executor is used so there are no work-stealing threads and Tokio's scheduler does not allocate per-poll in the hot path.
 
 **Production isolation:** `#[global_allocator]` is a per-binary link-time declaration. `CountingAllocator` exists only in bench binaries. Nothing in the production dependency graph is affected.
@@ -99,7 +101,7 @@ Use them as a comparison point, not a regression gate. If they regress, `b0_allo
 
 - All benches measure a single current-thread Tokio executor. Results do not predict multi-threaded or work-stealing scheduler behavior.
 - B0 is a counter, not a memory profiler. It reports allocation count and byte total; not per-call precision or heap fragmentation.
-- B0's `bytes_per_msg` measures the **boxed `recv()` future**, not the message payload. The single per-message allocation is the `Box::pin` in `TokioBufferReader::recv()`, and because all three buffer arms share one `async` block the future is a single type sized to its largest arm — so all three profiles report the same byte count regardless of payload size.
+- B0's `bytes_per_msg` measures AimDB-added per-message heap allocations, not the message payload. Pre-W8 this was the `Box::pin` boxed `recv()` future (a single ~144 B type shared across all three buffer arms, hence identical byte counts); since design 037 / W8 the consume path is poll-based and this is **0 B/msg** on the clean path. A nonzero value flags a regression — e.g. the broadcast error path still allocates its `buffer_name` string, so a B0 run that triggers `BufferLagged`/`BufferClosed` will report > 0.
 - Criterion p99 can vary ±5–10% on noisy CI runners. Use p50 medians for trend comparisons.
 - Always specify `--release` or debug build consistently when comparing runs; optimizations differ by 5–50×.
 - `b_alloc_pipeline` uses a paced source: per-message pace tokens and notification channels. The coordination overhead is included in the measured window.
diff --git a/aimdb-bench/benches/b0_alloc_tokio.rs b/aimdb-bench/benches/b0_alloc_tokio.rs
index f5a09df..332e76f 100644
--- a/aimdb-bench/benches/b0_alloc_tokio.rs
+++ b/aimdb-bench/benches/b0_alloc_tokio.rs
@@ -35,7 +35,7 @@ use aimdb_bench::{
     },
     reports::AllocReport,
 };
-use aimdb_core::buffer::{Buffer, BufferReader};
+use aimdb_core::buffer::{Buffer, Reader};
 
 fn main() {
     // Current-thread executor — no work-stealing threads, minimal scheduler
@@ -55,7 +55,7 @@ fn main() {
     // start — a reader created after sends are in flight misses them.
     let telemetry_report = rt.block_on(async {
         let buf = telemetry_buffer();
-        let mut reader = buf.subscribe();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
 
         for i in 0..WARMUP_ITERS {
             buf.push(telemetry_msg(i as u64));
@@ -75,7 +75,7 @@ fn main() {
     // ── State: SingleLatest / watch ──────────────────────────────────────────
     let state_report = rt.block_on(async {
         let buf = state_buffer();
-        let mut reader = buf.subscribe();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
 
         for i in 0..WARMUP_ITERS {
             buf.push(state_msg(i as u64));
@@ -100,7 +100,7 @@ fn main() {
     // semantics with throughput measurement.
     let command_report = rt.block_on(async {
         let buf = command_buffer();
-        let mut reader = buf.subscribe();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
 
         for i in 0..WARMUP_ITERS {
             buf.push(command_msg(i as u64));
diff --git a/aimdb-bench/benches/b1_latency.rs b/aimdb-bench/benches/b1_latency.rs
index 9b2e07a..1d85b96 100644
--- a/aimdb-bench/benches/b1_latency.rs
+++ b/aimdb-bench/benches/b1_latency.rs
@@ -25,7 +25,7 @@ use aimdb_bench::profiles::{
     command_buffer, command_msg, state_buffer, state_msg, telemetry_buffer, telemetry_msg,
     WARMUP_ITERS,
 };
-use aimdb_core::buffer::{Buffer, BufferReader};
+use aimdb_core::buffer::{Buffer, Reader};
 use criterion::{criterion_group, criterion_main, Criterion};
 
 // ── Telemetry: SpmcRing / broadcast ──────────────────────────────────────────
@@ -41,7 +41,7 @@ fn bench_latency_telemetry(c: &mut Criterion) {
         b.iter_custom(|iters| {
             rt.block_on(async {
                 let buf = telemetry_buffer();
-                let mut reader = buf.subscribe();
+                let mut reader = Reader::new(Box::new(buf.subscribe()));
 
                 // Warmup — not timed.
                 for i in 0..WARMUP_ITERS {
@@ -75,7 +75,7 @@ fn bench_latency_state(c: &mut Criterion) {
         b.iter_custom(|iters| {
             rt.block_on(async {
                 let buf = state_buffer();
-                let mut reader = buf.subscribe();
+                let mut reader = Reader::new(Box::new(buf.subscribe()));
 
                 for i in 0..WARMUP_ITERS {
                     buf.push(state_msg(i as u64));
@@ -109,7 +109,7 @@ fn bench_latency_command(c: &mut Criterion) {
         b.iter_custom(|iters| {
             rt.block_on(async {
                 let buf = command_buffer();
-                let mut reader = buf.subscribe();
+                let mut reader = Reader::new(Box::new(buf.subscribe()));
 
                 for i in 0..WARMUP_ITERS {
                     buf.push(command_msg(i as u64));
diff --git a/aimdb-bench/benches/b2_throughput.rs b/aimdb-bench/benches/b2_throughput.rs
index 80c5041..54dec89 100644
--- a/aimdb-bench/benches/b2_throughput.rs
+++ b/aimdb-bench/benches/b2_throughput.rs
@@ -28,7 +28,7 @@ use aimdb_bench::profiles::{
     command_buffer, command_msg, state_buffer, state_msg, telemetry_buffer, telemetry_msg,
     WARMUP_ITERS,
 };
-use aimdb_core::buffer::{Buffer, BufferReader};
+use aimdb_core::buffer::{Buffer, Reader};
 use criterion::{criterion_group, criterion_main, Criterion, Throughput};
 
 // ── Telemetry SPSC ────────────────────────────────────────────────────────────
@@ -46,7 +46,7 @@ fn bench_throughput_telemetry_spsc(c: &mut Criterion) {
             rt.block_on(async {
                 // Subscribe before pushing — reader holds position from start.
                 let buf = telemetry_buffer();
-                let mut reader = buf.subscribe();
+                let mut reader = Reader::new(Box::new(buf.subscribe()));
 
                 // Warmup — not timed.
                 for i in 0..WARMUP_ITERS {
@@ -90,10 +90,10 @@ fn bench_throughput_telemetry_fanout(c: &mut Criterion) {
                 // read position from the start. Lockstep below keeps at most one
                 // message in flight, so the fixed ring capacity never lags.
                 let buf = telemetry_buffer();
-                let mut r0 = buf.subscribe();
-                let mut r1 = buf.subscribe();
-                let mut r2 = buf.subscribe();
-                let mut r3 = buf.subscribe();
+                let mut r0 = Reader::new(Box::new(buf.subscribe()));
+                let mut r1 = Reader::new(Box::new(buf.subscribe()));
+                let mut r2 = Reader::new(Box::new(buf.subscribe()));
+                let mut r3 = Reader::new(Box::new(buf.subscribe()));
 
                 // Warmup — not timed (mirrors B1 and the SPSC benches).
                 for i in 0..WARMUP_ITERS {
@@ -134,7 +134,7 @@ fn bench_throughput_state_spsc(c: &mut Criterion) {
         b.iter_custom(|iters| {
             rt.block_on(async {
                 let buf = state_buffer();
-                let mut reader = buf.subscribe();
+                let mut reader = Reader::new(Box::new(buf.subscribe()));
 
                 // Warmup — not timed.
                 for i in 0..WARMUP_ITERS {
@@ -169,7 +169,7 @@ fn bench_throughput_command_mailbox(c: &mut Criterion) {
         b.iter_custom(|iters| {
             rt.block_on(async {
                 let buf = command_buffer();
-                let mut reader = buf.subscribe();
+                let mut reader = Reader::new(Box::new(buf.subscribe()));
 
                 // Warmup — not timed.
                 for i in 0..WARMUP_ITERS {
diff --git a/aimdb-bench/data/baselines/b0_alloc_tokio.json b/aimdb-bench/data/baselines/b0_alloc_tokio.json
index afeb6ff..51fd3d9 100644
--- a/aimdb-bench/data/baselines/b0_alloc_tokio.json
+++ b/aimdb-bench/data/baselines/b0_alloc_tokio.json
@@ -2,28 +2,28 @@
   {
     "profile": "Telemetry",
     "buffer_type": "SpmcRing",
-    "total_allocs": 512,
-    "total_bytes": 73728,
+    "total_allocs": 0,
+    "total_bytes": 0,
     "batch_size": 512,
-    "allocs_per_msg": 1.0,
-    "bytes_per_msg": 144.0
+    "allocs_per_msg": 0.0,
+    "bytes_per_msg": 0.0
   },
   {
     "profile": "State",
     "buffer_type": "SingleLatest",
-    "total_allocs": 512,
-    "total_bytes": 73728,
+    "total_allocs": 0,
+    "total_bytes": 0,
     "batch_size": 512,
-    "allocs_per_msg": 1.0,
-    "bytes_per_msg": 144.0
+    "allocs_per_msg": 0.0,
+    "bytes_per_msg": 0.0
   },
   {
     "profile": "Command",
     "buffer_type": "Mailbox",
-    "total_allocs": 512,
-    "total_bytes": 73728,
+    "total_allocs": 0,
+    "total_bytes": 0,
     "batch_size": 512,
-    "allocs_per_msg": 1.0,
-    "bytes_per_msg": 144.0
+    "allocs_per_msg": 0.0,
+    "bytes_per_msg": 0.0
   }
-]
\ No newline at end of file
+]
diff --git a/aimdb-core/CHANGELOG.md b/aimdb-core/CHANGELOG.md
index 51c25e9..fce4fca 100644
--- a/aimdb-core/CHANGELOG.md
+++ b/aimdb-core/CHANGELOG.md
@@ -14,6 +14,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed (breaking)
 
+- **Design 037 W8 — zero-allocation consume path: `BufferReader` is now poll-based ([design doc](../docs/design/037-zero-alloc-consume-path.md)).** The object-erased async `recv` returned a `Pin<Box<dyn Future>>`, heap-allocating on every call — the last AimDB-added per-message allocation on the consume path. It is replaced by an object-safe poll method, restoring async ergonomics through a new handle:
+  - **SPI break (adapter authors only):** `BufferReader<T>::recv(&mut self) -> Pin<Box<dyn Future>>` → `BufferReader<T>::poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>>`. `try_recv` is unchanged. Same break for the `remote-access` `JsonBufferReader`: `recv_json` → `poll_recv_json`. Object safety is preserved (poll, unlike `async fn`, is object-safe).
+  - **New consumer handles:** `buffer::Reader<T>` (and `buffer::JsonReader` under `remote-access`) wrap the erased reader and expose an `async fn recv()` implemented once via `core::future::poll_fn` — `core`-only, `no_std`-clean, zero-allocation, no `unsafe`. `Consumer::subscribe`, `TypedRecord::subscribe`, and `AimDb::subscribe` now return `Reader<T>` instead of `Box<dyn BufferReader<T> + Send>`.
+  - **Source-compatible for consumers:** `subscribe().recv().await` is unchanged at every call site; examples and aimdb-pro compile without edits. Holders of a concrete adapter reader wrap it once: `Reader::new(Box::new(reader))`.
+  - **Connector SPI unchanged (BYOC-stable, design 039 §2):** `SerializedReader::recv` keeps its boxed `RecvSerializedFuture`; only the *inner* per-message box is eliminated. The remote-access JSON path drops both its boxes (`poll_recv_json` + `JsonReader`).
+  - **Result:** **0 AimDB-added heap allocations per message** on the in-process consume path, enforced by the `aimdb-bench` B0 suite (1 → 0 allocs/msg across all three tokio buffer profiles). Adapters that expose no public poll API (tokio `broadcast`/`watch`, embassy `pubsub`/`watch`) round-trip their receiver through a single reused boxed future (`tokio_util::sync::ReusableBoxFuture` on tokio; a `no_std` equivalent on embassy) — one allocation per subscriber lifetime, not per message. The tokio Mailbox replaces `Notify` with an explicit waker list beside the slot.
+
 - **Design 036 W1 — data-plane de-`Any`: the per-message `Box<dyn Any>` is gone from the connector SPI ([design doc §W1](../docs/design/036-followup-refactoring.md)).** Both ends of every erased hop were typed — `T` is known in the registrar where routes are wired, and the connector spine only wants bytes — so the typed pipeline is now built inside closures at registration time (`finish()`) and the SPI exposes only the wire level. The full break inventory:
   - **Inbound:** new `IngestFn = Arc<dyn Fn(&RuntimeContext, &[u8]) -> Result<(), String>>` + `IngestFactoryFn` replace deserializer + producer: deserialize + produce in one typed closure, **synchronous** (`Producer::produce` is sync + infallible per design 029 — the per-message `Box::pin` disappears along with the `Box<dyn Any>`). Deleted: `ProducerTrait`/`produce_any`, `ProducerFactoryFn`, `DeserializerFn`/`ContextDeserializerFn`/`DeserializerKind`, `TypedRecord::create_producer_trait`. `InboundConnectorLink` is `{ url, config, ingest_factory, topic_resolver }` (factory non-optional — `finish()` validates the deserializer before registering, error strings unchanged); `collect_inbound_routes` returns `Vec<(String, IngestFn)>`; `Route` is `{ resource_id, ingest }`.
   - **`Router::route` is a sync fn and its context is mandatory:** `route(&self, resource_id, payload, ctx: &RuntimeContext) -> Result<(), String>` (was `async` with `Option<&RuntimeContext>`). Every production caller already passed `Some(&ctx)`; the old "skip context-deserializers when no ctx" branch is unrepresentable now that raw-vs-context is invisible inside the fused closure.
diff --git a/aimdb-core/src/buffer/mod.rs b/aimdb-core/src/buffer/mod.rs
index 74591b9..ac6a938 100644
--- a/aimdb-core/src/buffer/mod.rs
+++ b/aimdb-core/src/buffer/mod.rs
@@ -59,11 +59,13 @@
 mod cfg;
 #[cfg(feature = "metrics")]
 mod counters;
+mod reader;
 mod traits;
 mod writer;
 
 // Public API exports
 pub use cfg::BufferCfg;
+pub use reader::Reader;
 pub use traits::{Buffer, BufferReader, DynBuffer};
 
 // Crate-private — used by Producer<T> to push without per-call lookup
@@ -72,6 +74,8 @@ pub(crate) use writer::RecordWriter;
 
 // JSON streaming support
 #[cfg(feature = "remote-access")]
+pub use reader::JsonReader;
+#[cfg(feature = "remote-access")]
 pub use traits::JsonBufferReader;
 
 // Buffer metrics (feature-gated; works in no_std with portable-atomic)
diff --git a/aimdb-core/src/buffer/reader.rs b/aimdb-core/src/buffer/reader.rs
new file mode 100644
index 0000000..1ee5421
--- /dev/null
+++ b/aimdb-core/src/buffer/reader.rs
@@ -0,0 +1,85 @@
+//! Consumer-facing reader handles (design 037 / W8).
+//!
+//! The [`BufferReader`] / [`JsonBufferReader`] SPIs are object-safe and
+//! poll-based so adapters can implement them without a per-message
+//! `Pin<Box<dyn Future>>` allocation. These handles restore the ergonomic
+//! `async fn recv().await` surface for consumers by wrapping the erased
+//! reader's `poll_*` method in [`core::future::poll_fn`] — which is `core`-only
+//! (no_std-clean), allocation-free, and `unsafe`-free.
+//!
+//! The wrapped future is `Send` because the boxed reader is `Send`.
+
+use alloc::boxed::Box;
+use core::future::poll_fn;
+
+use crate::buffer::BufferReader;
+use crate::DbError;
+
+#[cfg(feature = "remote-access")]
+use crate::buffer::JsonBufferReader;
+
+/// Owned, ergonomic handle over an erased [`BufferReader`].
+///
+/// Returned by `Consumer::subscribe`. This is the "boxed lane": one indirect
+/// call per `recv`, zero AimDB-added heap allocations per message. (The generic
+/// monomorphized `Reader<T, B>` fast lane remains dormant — see design 037 §7.)
+pub struct Reader<T: Clone + Send> {
+    inner: Box<dyn BufferReader<T> + Send>,
+}
+
+impl<T: Clone + Send> Reader<T> {
+    /// Wrap an erased reader in an ergonomic handle.
+    pub fn new(inner: Box<dyn BufferReader<T> + Send>) -> Self {
+        Self { inner }
+    }
+
+    /// Receive the next value, awaiting until one is available.
+    ///
+    /// Allocation-free: wraps the erased reader's
+    /// [`poll_recv`](BufferReader::poll_recv) via `core::future::poll_fn`.
+    ///
+    /// # Behavior by Buffer Type
+    /// - **SPMC Ring**: Returns next value, or `Err(BufferLagged)` if the reader fell behind
+    /// - **SingleLatest**: Waits for value change, returns most recent
+    /// - **Mailbox**: Waits for slot value, takes and clears it
+    pub async fn recv(&mut self) -> Result<T, DbError> {
+        poll_fn(|cx| self.inner.poll_recv(cx)).await
+    }
+
+    /// Non-blocking receive — returns immediately.
+    ///
+    /// Returns `Err(DbError::BufferEmpty)` if no pending values.
+    pub fn try_recv(&mut self) -> Result<T, DbError> {
+        self.inner.try_recv()
+    }
+}
+
+/// Owned, ergonomic handle over an erased [`JsonBufferReader`].
+///
+/// Wraps the boxed reader that `subscribe_json` returns. Awaiting `recv_json` is
+/// allocation-free: it wraps [`poll_recv_json`](JsonBufferReader::poll_recv_json) via
+/// `core::future::poll_fn`, so the pre-W8 remote-access double box is gone.
+#[cfg(feature = "remote-access")]
+pub struct JsonReader {
+    inner: Box<dyn JsonBufferReader + Send>,
+}
+
+#[cfg(feature = "remote-access")]
+impl JsonReader {
+    /// Wrap an erased JSON reader in an ergonomic handle.
+    pub fn new(inner: Box<dyn JsonBufferReader + Send>) -> Self {
+        Self { inner }
+    }
+
+    /// Receive the next value as JSON, awaiting until one is available.
+    pub async fn recv_json(&mut self) -> Result<serde_json::Value, DbError> {
+        poll_fn(|cx| self.inner.poll_recv_json(cx)).await
+    }
+
+    /// Non-blocking receive as JSON — returns immediately.
+    ///
+    /// Returns `Err(DbError::BufferEmpty)` if no pending values.
+    pub fn try_recv_json(&mut self) -> Result<serde_json::Value, DbError> {
+        self.inner.try_recv_json()
+    }
+}
diff --git a/aimdb-core/src/buffer/traits.rs b/aimdb-core/src/buffer/traits.rs
index e49b5e1..c53bada 100644
--- a/aimdb-core/src/buffer/traits.rs
+++ b/aimdb-core/src/buffer/traits.rs
@@ -5,8 +5,7 @@
 //!
 //! See `aimdb-tokio-adapter` and `aimdb-embassy-adapter` for implementations.
 
-use core::future::Future;
-use core::pin::Pin;
+use core::task::{Context, Poll};
 
 use alloc::boxed::Box;
 
@@ -124,22 +123,33 @@ pub(crate) trait WriteHandle<T: Clone + Send + 'static>: Send + Sync {
 
 /// Reader trait for consuming values from a buffer
 ///
-/// All read operations are async. Each reader is independent with its own state.
+/// This is the object-safe **service-provider interface** that runtime adapters
+/// implement. It is poll-based — and therefore object-safe and zero-allocation —
+/// rather than `async`: an `async fn` on an erased trait forces a
+/// `Pin<Box<dyn Future>>` heap allocation on every call (design 037 / W8).
+/// Consumers do not call this directly; they use the [`Reader<T>`](super::Reader)
+/// handle returned by `Consumer::subscribe`, whose `recv()` is `async` and wraps
+/// [`poll_recv`](BufferReader::poll_recv) via `core::future::poll_fn` with no
+/// allocation.
+///
+/// Each reader is independent with its own state.
 ///
 /// # Error Handling
 /// - `Ok(value)` - Successfully received a value
 /// - `Err(BufferLagged)` - Missed messages (SPMC ring only, can continue)
 /// - `Err(BufferClosed)` - Buffer closed (graceful shutdown)
 pub trait BufferReader<T: Clone + Send>: Send {
-    /// Receive the next value (async)
+    /// Poll for the next value.
     ///
-    /// Waits for the next available value. Returns immediately if buffered.
+    /// Returns `Poll::Ready(Ok(value))` when a value is available,
+    /// `Poll::Ready(Err(..))` on lag/closure, or `Poll::Pending` after
+    /// registering `cx.waker()` to be woken when the next value arrives.
     ///
     /// # Behavior by Buffer Type
     /// - **SPMC Ring**: Returns next value, or `Lagged(n)` if fell behind
     /// - **SingleLatest**: Waits for value change, returns most recent
     /// - **Mailbox**: Waits for slot value, takes and clears it
-    fn recv(&mut self) -> Pin<Box<dyn Future<Output = Result<T, DbError>> + Send + '_>>;
+    fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>>;
 
     /// Non-blocking receive — returns immediately.
     ///
@@ -158,25 +168,30 @@ pub trait BufferReader<T: Clone + Send>: Send {
 /// `serde_json::Value`. Used by remote access protocol for subscriptions.
 ///
 /// This trait enables subscribing to a buffer without knowing the concrete type `T`
-/// at compile time, by serializing values to JSON on each `recv_json()` call.
+/// at compile time, by serializing values to JSON on each poll.
+///
+/// Object-safe and poll-based for the same reason as [`BufferReader`] (design
+/// 037 / W8). Consumers use the [`JsonReader`](super::JsonReader) handle, whose
+/// `recv_json()` is `async` and wraps [`poll_recv_json`](JsonBufferReader::poll_recv_json)
+/// with no allocation.
 ///
 /// # Requirements
 /// - Record must be configured with `.with_remote_access()`
 /// - Only available with the `remote-access` feature (requires serde_json)
 #[cfg(feature = "remote-access")]
 pub trait JsonBufferReader: Send {
-    /// Receive the next value as JSON (async)
+    /// Poll for the next value, serialized to JSON.
     ///
-    /// Waits for the next value from the underlying buffer and serializes it to JSON.
+    /// Returns `Poll::Ready(Ok(json))` when a value is available and
+    /// serializes successfully, `Poll::Ready(Err(..))` on lag/closure/serialize
+    /// failure, or `Poll::Pending` after registering `cx.waker()`.
     ///
     /// # Returns
     /// - `Ok(JsonValue)` - Successfully received and serialized value
     /// - `Err(BufferLagged)` - Missed messages (can continue reading)
     /// - `Err(BufferClosed)` - Buffer closed (graceful shutdown)
     /// - `Err(SerializationFailed)` - Failed to serialize value to JSON
-    fn recv_json(
-        &mut self,
-    ) -> Pin<Box<dyn Future<Output = Result<serde_json::Value, DbError>> + Send + '_>>;
+    fn poll_recv_json(&mut self, cx: &mut Context<'_>) -> Poll<Result<serde_json::Value, DbError>>;
 
     /// Non-blocking receive as JSON — returns immediately.
     ///
@@ -287,13 +302,11 @@ mod tests {
     }
 
     impl<T: Clone + Send> BufferReader<T> for MockReader<T> {
-        fn recv(&mut self) -> Pin<Box<dyn Future<Output = Result<T, DbError>> + Send + '_>> {
-            Box::pin(async {
-                // Return closed for testing
-                Err(DbError::BufferClosed {
-                    buffer_name: "mock".to_string(),
-                })
-            })
+        fn poll_recv(&mut self, _cx: &mut Context<'_>) -> Poll<Result<T, DbError>> {
+            // Return closed for testing
+            Poll::Ready(Err(DbError::BufferClosed {
+                buffer_name: "mock".to_string(),
+            }))
         }
 
         fn try_recv(&mut self) -> Result<T, DbError> {
diff --git a/aimdb-core/src/builder.rs b/aimdb-core/src/builder.rs
index 09be8c2..32962df 100644
--- a/aimdb-core/src/builder.rs
+++ b/aimdb-core/src/builder.rs
@@ -960,10 +960,7 @@ impl AimDb {
     ///
     /// # Arguments
     /// * `key` - The record key (e.g., "sensor.temperature")
-    pub fn subscribe<T>(
-        &self,
-        key: impl AsRef<str>,
-    ) -> DbResult<Box<dyn crate::buffer::BufferReader<T> + Send>>
+    pub fn subscribe<T>(&self, key: impl AsRef<str>) -> DbResult<crate::buffer::Reader<T>>
     where
         T: Send + Sync + 'static + Debug + Clone,
     {
diff --git a/aimdb-core/src/lib.rs b/aimdb-core/src/lib.rs
index cc48d1e..9f9fced 100644
--- a/aimdb-core/src/lib.rs
+++ b/aimdb-core/src/lib.rs
@@ -58,6 +58,9 @@ pub use extensions::Extensions;
 pub use aimdb_executor::{ExecutorError, ExecutorResult};
 
 // Producer-Consumer Pattern exports
+#[cfg(feature = "remote-access")]
+pub use buffer::JsonReader;
+pub use buffer::Reader;
 pub use builder::OutboundRoute;
 pub use builder::{AimDb, AimDbBuilder};
 pub use connector::ConnectorBuilder;
diff --git a/aimdb-core/src/profiling/mod.rs b/aimdb-core/src/profiling/mod.rs
index 9b71f32..a3d4dfc 100644
--- a/aimdb-core/src/profiling/mod.rs
+++ b/aimdb-core/src/profiling/mod.rs
@@ -24,9 +24,8 @@ pub use record_profiling::{RecordProfilingMetrics, StageEntry};
 pub use stage_metrics::StageMetrics;
 
 use alloc::{boxed::Box, sync::Arc};
-use core::future::Future;
-use core::pin::Pin;
 use core::sync::atomic::Ordering;
+use core::task::{Context, Poll};
 use portable_atomic::AtomicU64;
 
 use crate::buffer::BufferReader;
@@ -110,17 +109,17 @@ impl<T: Clone + Send> ProfilingBufferReader<T> {
 }
 
 impl<T: Clone + Send> BufferReader<T> for ProfilingBufferReader<T> {
-    fn recv(&mut self) -> Pin<Box<dyn Future<Output = Result<T, DbError>> + Send + '_>> {
-        Box::pin(async move {
-            // `started_ns` ≈ the moment the consumer finished processing the
-            // previous value and asked for the next one.
-            let started_ns = (self.clock)();
-            let result = self.inner.recv().await;
-            if result.is_ok() {
-                self.on_yield(started_ns);
-            }
-            result
-        })
+    fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>> {
+        // `started_ns` ≈ the moment the consumer finished processing the
+        // previous value and asked for the next one. NOTE(review): sampled on
+        // every poll, so after a `Pending` the timestamp passed to `on_yield` is
+        // the final poll's, not the initial request's as in the old
+        // await-based behavior.
+        let started_ns = (self.clock)();
+        let result = self.inner.poll_recv(cx);
+        if let Poll::Ready(Ok(_)) = &result {
+            self.on_yield(started_ns);
+        }
+        result
     }
 
     fn try_recv(&mut self) -> Result<T, DbError> {
diff --git a/aimdb-core/src/remote/stream.rs b/aimdb-core/src/remote/stream.rs
index 7bdea5e..6adddcc 100644
--- a/aimdb-core/src/remote/stream.rs
+++ b/aimdb-core/src/remote/stream.rs
@@ -37,14 +37,16 @@ pub(crate) fn stream_record_updates(
     let record = inner
         .storage(id)
         .ok_or(DbError::InvalidRecordId { id: id.raw() })?;
-    let reader = record
-        .json_access()
-        .ok_or_else(|| {
-            DbError::runtime_error(alloc::format!(
-                "Record '{record_key}' does not support JSON remote access"
-            ))
-        })?
-        .subscribe_json()?;
+    let reader = crate::buffer::JsonReader::new(
+        record
+            .json_access()
+            .ok_or_else(|| {
+                DbError::runtime_error(alloc::format!(
+                    "Record '{record_key}' does not support JSON remote access"
+                ))
+            })?
+            .subscribe_json()?,
+    );
 
     // Pair the reader with an owned copy of the record key so lag/error
     // logs identify which record fell behind — the previous mpsc-based
@@ -81,9 +83,9 @@ pub(crate) fn stream_record_updates(
 #[cfg(all(test, feature = "std"))]
 mod tests {
     use super::*;
-    use crate::buffer::JsonBufferReader;
+    use crate::buffer::{JsonBufferReader, JsonReader};
+    use core::task::{Context, Poll};
     use futures_util::StreamExt;
-    use std::pin::Pin;
     use std::sync::atomic::{AtomicUsize, Ordering};
     use std::sync::Arc;
 
@@ -95,25 +97,21 @@ mod tests {
     }
 
     impl JsonBufferReader for FakeReader {
-        fn recv_json(
+        fn poll_recv_json(
             &mut self,
-        ) -> Pin<
-            Box<dyn std::future::Future<Output = Result<serde_json::Value, DbError>> + Send + '_>,
-        > {
-            let step = self.step.clone();
-            Box::pin(async move {
-                let s = step.fetch_add(1, Ordering::SeqCst);
-                match s {
-                    0 => Ok(serde_json::json!({"v": 1})),
-                    1 => Err(DbError::BufferLagged {
-                        buffer_name: "test".to_string(),
-                        lag_count: 7,
-                    }),
-                    2 => Ok(serde_json::json!({"v": 2})),
-                    _ => Err(DbError::BufferClosed {
-                        buffer_name: "test".to_string(),
-                    }),
-                }
+            _cx: &mut Context<'_>,
+        ) -> Poll<Result<serde_json::Value, DbError>> {
+            let s = self.step.fetch_add(1, Ordering::SeqCst);
+            Poll::Ready(match s {
+                0 => Ok(serde_json::json!({"v": 1})),
+                1 => Err(DbError::BufferLagged {
+                    buffer_name: "test".to_string(),
+                    lag_count: 7,
+                }),
+                2 => Ok(serde_json::json!({"v": 2})),
+                _ => Err(DbError::BufferClosed {
+                    buffer_name: "test".to_string(),
+                }),
             })
         }
 
@@ -124,9 +122,9 @@ mod tests {
 
     #[tokio::test]
     async fn unfold_skips_lag_and_terminates_on_closed() {
-        let reader: Box<dyn JsonBufferReader + Send> = Box::new(FakeReader {
+        let reader = JsonReader::new(Box::new(FakeReader {
             step: Arc::new(AtomicUsize::new(0)),
-        });
+        }));
 
         let stream = unfold(reader, |mut reader| async move {
             loop {
diff --git a/aimdb-core/src/typed_api.rs b/aimdb-core/src/typed_api.rs
index dda6e3f..78c6107 100644
--- a/aimdb-core/src/typed_api.rs
+++ b/aimdb-core/src/typed_api.rs
@@ -215,19 +215,22 @@ where
 
     /// Subscribe to updates for this record type.
     ///
-    /// Returns a reader that yields values as they are produced.
+    /// Returns a [`Reader<T>`](crate::buffer::Reader) that yields values as they
+    /// are produced. Its `recv().await` is allocation-free (design 037 / W8).
     /// Infallible — the buffer is pre-resolved at `Consumer` construction.
-    pub fn subscribe(&self) -> Box<dyn crate::buffer::BufferReader<T> + Send> {
+    pub fn subscribe(&self) -> crate::buffer::Reader<T> {
         let reader = self.buffer.subscribe_boxed();
         #[cfg(feature = "profiling")]
         if let Some((metrics, clock)) = &self.profiling {
-            return Box::new(crate::profiling::ProfilingBufferReader::new(
-                reader,
-                metrics.clone(),
-                clock.clone(),
+            return crate::buffer::Reader::new(Box::new(
+                crate::profiling::ProfilingBufferReader::new(
+                    reader,
+                    metrics.clone(),
+                    clock.clone(),
+                ),
             ));
         }
-        reader
+        crate::buffer::Reader::new(reader)
     }
 }
 
@@ -281,8 +284,12 @@ where
 
 /// One subscription of a [`FusedSource`]: recv → resolve destination →
 /// serialize, all on the typed value.
+///
+/// The connector SPI keeps its boxed `RecvSerializedFuture` (BYOC stays stable,
+/// design 039 §2); only the *inner* per-message box is eliminated by reading
+/// through the allocation-free [`Reader<T>`](crate::buffer::Reader) (W8).
 struct FusedReader<T: Clone + Send + 'static> {
-    inner: Box<dyn crate::buffer::BufferReader<T> + Send>,
+    inner: crate::buffer::Reader<T>,
     serialize: FusedSerializeFn<T>,
     topic: Option<Arc<dyn crate::connector::TopicProvider<T>>>,
 }
@@ -1722,15 +1729,16 @@ mod tests {
     }
 
     impl crate::buffer::BufferReader<TestRecord> for ScriptedReader {
-        fn recv(
+        fn poll_recv(
             &mut self,
-        ) -> Pin<Box<dyn Future<Output = Result<TestRecord, crate::DbError>> + Send + '_>> {
+            _cx: &mut core::task::Context<'_>,
+        ) -> core::task::Poll<Result<TestRecord, crate::DbError>> {
             let next = if self.script.is_empty() {
                 Err(Self::closed())
             } else {
                 self.script.remove(0)
             };
-            Box::pin(async move { next })
+            core::task::Poll::Ready(next)
         }
         fn try_recv(&mut self) -> Result<TestRecord, crate::DbError> {
             unimplemented!("not needed for fused reader tests")
@@ -1750,7 +1758,7 @@ mod tests {
         topic: Option<Arc<dyn crate::connector::TopicProvider<TestRecord>>>,
     ) -> FusedReader<TestRecord> {
         FusedReader {
-            inner: Box::new(ScriptedReader { script }),
+            inner: crate::buffer::Reader::new(Box::new(ScriptedReader { script })),
             serialize,
             topic,
         }
diff --git a/aimdb-core/src/typed_record.rs b/aimdb-core/src/typed_record.rs
index d50d4e3..8407578 100644
--- a/aimdb-core/src/typed_record.rs
+++ b/aimdb-core/src/typed_record.rs
@@ -19,6 +19,8 @@
 
 use core::any::Any;
 use core::fmt::Debug;
+#[cfg(feature = "remote-access")]
+use core::task::{Context, Poll};
 
 use alloc::{
     boxed::Box,
@@ -134,24 +136,21 @@ struct JsonReaderAdapter<T: Clone + Send + 'static> {
 
 #[cfg(feature = "remote-access")]
 impl<T: Clone + Send + 'static> crate::buffer::JsonBufferReader for JsonReaderAdapter<T> {
-    fn recv_json(
+    fn poll_recv_json(
         &mut self,
-    ) -> core::pin::Pin<
-        Box<
-            dyn core::future::Future<Output = Result<serde_json::Value, crate::DbError>>
-                + Send
-                + '_,
-        >,
-    > {
-        Box::pin(async move {
-            // Receive typed value from buffer
-            let value = self.inner.recv().await?;
-
-            // Serialize to JSON
-            self.codec
-                .encode(&value)
-                .ok_or_else(|| crate::DbError::runtime_error("Failed to serialize value to JSON"))
-        })
+        cx: &mut Context<'_>,
+    ) -> Poll<Result<serde_json::Value, crate::DbError>> {
+        // Poll the inner typed reader (allocation-free), then serialize on the
+        // ready value — the pre-W8 outer + inner double box are both gone.
+        match self.inner.poll_recv(cx) {
+            Poll::Ready(Ok(value)) => {
+                Poll::Ready(self.codec.encode(&value).ok_or_else(|| {
+                    crate::DbError::runtime_error("Failed to serialize value to JSON")
+                }))
+            }
+            Poll::Ready(Err(e)) => Poll::Ready(Err(e)),
+            Poll::Pending => Poll::Pending,
+        }
     }
 
     fn try_recv_json(&mut self) -> Result<serde_json::Value, crate::DbError> {
@@ -861,15 +860,18 @@ impl<T: Send + 'static + Debug + Clone> TypedRecord<T> {
 
     /// Subscribes to the buffer for this record type
     ///
+    /// Returns an ergonomic, allocation-free [`Reader<T>`](crate::buffer::Reader)
+    /// handle (design 037 / W8).
+    ///
     /// # Errors
     /// Returns `DbError::MissingConfiguration` if no buffer configured
-    pub fn subscribe(&self) -> crate::DbResult<Box<dyn crate::buffer::BufferReader<T> + Send>> {
+    pub fn subscribe(&self) -> crate::DbResult<crate::buffer::Reader<T>> {
         let buffer = self
             .buffer
             .as_ref()
             .ok_or_else(|| crate::DbError::missing_configuration("buffer"))?;
 
-        Ok(buffer.subscribe_boxed())
+        Ok(crate::buffer::Reader::new(buffer.subscribe_boxed()))
     }
 
     /// Adds an outbound connector link for external system integration
@@ -1293,12 +1295,17 @@ impl<T: Send + Sync + 'static + Debug + Clone> JsonRecordAccess for TypedRecord<
             ))
         })?;
 
-        // 2. Subscribe to the buffer (get Box<dyn BufferReader<T>>)
-        let reader = self.subscribe()?;
+        // 2. Subscribe to the buffer (the adapter polls the erased reader
+        //    directly, so it takes the boxed reader rather than the ergonomic
+        //    `Reader<T>` wrapper).
+        let buffer = self
+            .buffer
+            .as_ref()
+            .ok_or_else(|| DbError::missing_configuration("buffer"))?;
 
         // 3. Wrap in JsonReaderAdapter
         let json_reader = JsonReaderAdapter {
-            inner: reader,
+            inner: buffer.subscribe_boxed(),
             codec,
         };
 
diff --git a/aimdb-embassy-adapter/src/buffer.rs b/aimdb-embassy-adapter/src/buffer.rs
index 8a79f15..b720480 100644
--- a/aimdb-embassy-adapter/src/buffer.rs
+++ b/aimdb-embassy-adapter/src/buffer.rs
@@ -43,13 +43,18 @@ extern crate alloc;
 use alloc::boxed::Box;
 use alloc::string::String;
 use alloc::sync::Arc;
+use core::alloc::Layout;
+use core::future::Future;
+use core::pin::Pin;
+use core::ptr::{self, NonNull};
+use core::task::{Context, Poll};
 
 use aimdb_core::buffer::{Buffer, BufferCfg, BufferReader};
 use aimdb_core::DbError;
 use embassy_sync::blocking_mutex::raw::CriticalSectionRawMutex;
 use embassy_sync::channel::Channel;
-use embassy_sync::pubsub::{PubSubChannel, WaitResult};
-use embassy_sync::watch::Watch;
+use embassy_sync::pubsub::{PubSubChannel, Subscriber, WaitResult};
+use embassy_sync::watch::{Receiver as WatchReceiver, Watch};
 
 #[cfg(feature = "metrics")]
 use aimdb_core::buffer::{BufferCounters, BufferMetrics, BufferMetricsSnapshot};
@@ -76,8 +81,11 @@ use aimdb_core::buffer::{BufferCounters, BufferMetrics, BufferMetricsSnapshot};
 /// type MyBuffer = EmbassyBuffer<u32, 32, 4, 2, 4>;
 ///
 /// # async fn example() {
+/// use aimdb_core::buffer::Reader;
 /// let buffer: MyBuffer = MyBuffer::new_spmc();
-/// let mut reader = buffer.subscribe();
+/// // `subscribe()` yields the concrete reader; wrap it in the ergonomic,
+/// // allocation-free `Reader<T>` handle for `recv().await` (design 037 / W8).
+/// let mut reader = Reader::new(Box::new(buffer.subscribe()));
 /// buffer.push(42);
 /// let value = reader.recv().await.unwrap();
 /// # }
@@ -210,8 +218,8 @@ impl<
         // Clone the Arc for the reader
         EmbassyBufferReader {
             buffer: Arc::clone(&self.inner),
-            watch_receiver: None, // Will be initialized on first recv() for Watch buffers
-            spmc_subscriber: None, // Will be initialized on first recv() for SpmcRing buffers
+            watch_recv: None, // Lazily initialized on first poll for Watch buffers
+            spmc_recv: None,  // Lazily initialized on first poll for SpmcRing buffers
             #[cfg(feature = "metrics")]
             metrics: Arc::clone(&self.metrics),
         }
@@ -351,7 +359,9 @@ impl<
         F: Fn(T) -> Fut + Send + Sync,
         Fut: core::future::Future<Output = ()> + Send,
     {
-        let mut reader = self.subscribe();
+        // Wrap the concrete reader in the ergonomic, allocation-free
+        // `Reader<T>` handle so `recv().await` works (design 037 / W8).
+        let mut reader = aimdb_core::buffer::Reader::new(Box::new(self.subscribe()));
 
         loop {
             match reader.recv().await {
@@ -375,11 +385,173 @@ impl<
     }
 }
 
+// ============================================================================
+// Zero-allocation poll plumbing (design 037 / W8)
+// ============================================================================
+
+/// Persistent SpmcRing subscriber with a lifetime extended to `'static` (the
+/// owning `Arc<EmbassyBufferInner>` keeps the channel alive for the reader).
+type SpmcSub<T, const CAP: usize, const SUBS: usize, const PUBS: usize> =
+    Subscriber<'static, CriticalSectionRawMutex, T, CAP, SUBS, PUBS>;
+
+/// Persistent Watch receiver, `'static` for the same reason as [`SpmcSub`].
+type WatchRx<T, const WATCH_N: usize> = WatchReceiver<'static, CriticalSectionRawMutex, T, WATCH_N>;
+
+/// Output of the SpmcRing reusable future: the `next_message()` result paired
+/// with the subscriber handed back so the next future can reuse it.
+type SpmcRecvOutput<T, const CAP: usize, const SUBS: usize, const PUBS: usize> =
+    (WaitResult<T>, SpmcSub<T, CAP, SUBS, PUBS>);
+
+/// Output of the Watch reusable future: the changed value plus the receiver.
+type WatchRecvOutput<T, const WATCH_N: usize> = (T, WatchRx<T, WATCH_N>);
+
+/// Await the next SpmcRing message, returning the subscriber for reuse.
+///
+/// Uses the existing async `next_message()` so lag (`WaitResult::Lagged`) is
+/// surfaced exactly as before; the subscriber is round-tripped through the
+/// future so the reader can store it without self-referencing.
+async fn spmc_recv_fut<T: Clone, const CAP: usize, const SUBS: usize, const PUBS: usize>(
+    mut sub: SpmcSub<T, CAP, SUBS, PUBS>,
+) -> SpmcRecvOutput<T, CAP, SUBS, PUBS> {
+    let result = sub.next_message().await;
+    (result, sub)
+}
+
+/// Await the next Watch change, returning the receiver for reuse.
+async fn watch_recv_fut<T: Clone, const WATCH_N: usize>(
+    mut rx: WatchRx<T, WATCH_N>,
+) -> WatchRecvOutput<T, WATCH_N> {
+    let value = rx.changed().await;
+    (value, rx)
+}
+
+/// Create the persistent SpmcRing subscriber, extending its borrow to `'static`.
+///
+/// SAFETY: the `Arc<EmbassyBufferInner>` in the reader keeps the `PubSubChannel`
+/// alive for the reader's whole life, so the `'static` subscriber never outlives
+/// the channel. (Same invariant the pre-W8 code relied on.)
+fn make_spmc_sub<
+    T: Clone + Send + 'static,
+    const CAP: usize,
+    const SUBS: usize,
+    const PUBS: usize,
+>(
+    channel: &PubSubChannel<CriticalSectionRawMutex, T, CAP, SUBS, PUBS>,
+) -> Result<SpmcSub<T, CAP, SUBS, PUBS>, DbError> {
+    let channel_static: &'static PubSubChannel<CriticalSectionRawMutex, T, CAP, SUBS, PUBS> =
+        unsafe { &*(channel as *const _) };
+    channel_static.subscriber().map_err(|_| {
+        defmt::error!(
+            "AimDB: SpmcRing subscriber slot exhausted (max SUBS={}). \
+             Increase the CONSUMERS const generic on buffer_sized<CAP, CONSUMERS>. \
+             Count one slot per .tap(), .link_to() connector, and each transform_join input.",
+            SUBS
+        );
+        DbError::BufferClosed {
+            buffer_name: String::from("embassy spmc ring"),
+        }
+    })
+}
+
+/// Create the persistent Watch receiver, extending its borrow to `'static`.
+///
+/// SAFETY: see [`make_spmc_sub`] — the `Arc` keeps the `Watch` alive.
+fn make_watch_rx<T: Clone + Send + 'static, const WATCH_N: usize>(
+    watch: &Watch<CriticalSectionRawMutex, T, WATCH_N>,
+) -> Result<WatchRx<T, WATCH_N>, DbError> {
+    let watch_static: &'static Watch<CriticalSectionRawMutex, T, WATCH_N> =
+        unsafe { &*(watch as *const _) };
+    watch_static.receiver().ok_or(DbError::BufferClosed {
+        buffer_name: String::from("embassy watch"),
+    })
+}
+
+/// Minimal `no_std` port of `tokio_util::sync::ReusableBoxFuture`.
+///
+/// Stores a boxed future and, on [`set`](ReusableBoxFuture::set), reuses the
+/// existing heap allocation whenever the replacement future has the same layout
+/// — which it always does here, since each reader stores exactly one concrete
+/// async-fn future type. That is what makes the embassy consume path
+/// zero-allocation per message (design 037 / W8): the single box is allocated
+/// once per subscriber and reused for every message.
+///
+/// `unsafe` is confined to this type and mirrors the upstream tokio-util impl.
+struct ReusableBoxFuture<O> {
+    boxed: NonNull<dyn Future<Output = O> + Send + 'static>,
+}
+
+// SAFETY: the boxed future is owned exclusively and is itself `Send`.
+unsafe impl<O> Send for ReusableBoxFuture<O> {}
+
+impl<O> ReusableBoxFuture<O> {
+    fn new<F>(future: F) -> Self
+    where
+        F: Future<Output = O> + Send + 'static,
+    {
+        let boxed: Box<dyn Future<Output = O> + Send + 'static> = Box::new(future);
+        // SAFETY: `Box::into_raw` never yields a null pointer.
+        Self {
+            boxed: unsafe { NonNull::new_unchecked(Box::into_raw(boxed)) },
+        }
+    }
+
+    /// Replace the stored future, reusing the allocation when layouts match
+    /// (the common path) and reallocating only as a fallback.
+    fn set<F>(&mut self, future: F)
+    where
+        F: Future<Output = O> + Send + 'static,
+    {
+        if let Err(future) = self.try_set(future) {
+            *self = Self::new(future);
+        }
+    }
+
+    fn try_set<F: Future<Output = O> + Send + 'static>(&mut self, future: F) -> Result<(), F> {
+        // Drop guard: refills the slot even if the old future's destructor unwinds.
+        struct Refill<N>(*mut N, Option<N>);
+        impl<N> Drop for Refill<N> {
+            fn drop(&mut self) {
+                if let Some(new) = self.1.take() {
+                    // SAFETY: the slot has `N`'s layout and holds no live value.
+                    unsafe { ptr::write(self.0, new) };
+                }
+            }
+        }
+        // SAFETY: `self.boxed` is always a valid, owned boxed future.
+        if Layout::new::<F>() != Layout::for_value(unsafe { self.boxed.as_ref() }) {
+            return Err(future);
+        }
+        let (old, slot) = (self.boxed.as_ptr(), self.boxed.cast::<F>());
+        let _refill = Refill(slot.as_ptr(), Some(future));
+        self.boxed = slot; // retargeted to `F`'s vtable; valid once `_refill` has run
+        // SAFETY: same layout; the old future already resolved (`set` follows `Ready`).
+        unsafe { ptr::drop_in_place(old) };
+        Ok(())
+    }
+
+    fn poll(&mut self, cx: &mut Context<'_>) -> Poll<O> {
+        // SAFETY: the future lives behind a stable heap allocation and is never
+        // moved while borrowed; `&mut self` guarantees unique access.
+        let fut = unsafe { Pin::new_unchecked(self.boxed.as_mut()) };
+        fut.poll(cx)
+    }
+}
+
+impl<O> Drop for ReusableBoxFuture<O> {
+    fn drop(&mut self) {
+        // SAFETY: reconstruct and drop the owning `Box`.
+        unsafe {
+            drop(Box::from_raw(self.boxed.as_ptr()));
+        }
+    }
+}
+
 /// Reader for Embassy buffers
 ///
-/// Holds persistent subscription state for each buffer type.
-/// For Watch buffers, stores a persistent Receiver to track which value has been seen.
-/// For SpmcRing buffers, stores a persistent Subscriber for cursor continuity.
+/// Holds persistent subscription state for each buffer type, driven through a
+/// reused boxed future so `poll_recv` allocates nothing per message (W8).
+/// For Watch buffers the future owns a persistent Receiver (tracks which value
+/// has been seen); for SpmcRing a persistent Subscriber (cursor continuity).
 pub struct EmbassyBufferReader<
     T: Clone + Send + 'static,
     const CAP: usize,
@@ -388,14 +560,11 @@ pub struct EmbassyBufferReader<
     const WATCH_N: usize,
 > {
-    buffer: Arc<EmbassyBufferInner<T, CAP, SUBS, PUBS, WATCH_N>>,
-    /// Persistent Watch receiver. The 'static lifetime is safe because the Arc keeps the Watch alive.
-    watch_receiver:
-        Option<embassy_sync::watch::Receiver<'static, CriticalSectionRawMutex, T, WATCH_N>>,
-    /// Persistent SpmcRing subscriber (same pattern as watch_receiver).
-    /// The 'static lifetime is safe because the Arc keeps the PubSubChannel alive.
-    spmc_subscriber: Option<
-        embassy_sync::pubsub::Subscriber<'static, CriticalSectionRawMutex, T, CAP, SUBS, PUBS>,
-    >,
+    /// Reusable future round-tripping the persistent Watch receiver (lazily
+    /// created on first poll). Must be declared, hence dropped, before `buffer`.
+    watch_recv: Option<ReusableBoxFuture<WatchRecvOutput<T, WATCH_N>>>,
+    /// Same for the SpmcRing subscriber; both borrow the channel in `buffer`.
+    spmc_recv: Option<ReusableBoxFuture<SpmcRecvOutput<T, CAP, SUBS, PUBS>>>,
+    buffer: Arc<EmbassyBufferInner<T, CAP, SUBS, PUBS, WATCH_N>>,
     /// Shared counter state (cloned from the parent buffer at subscribe time).
     #[cfg(feature = "metrics")]
     metrics: Arc<BufferCounters>,
@@ -409,151 +578,123 @@ impl<
         const WATCH_N: usize,
     > BufferReader<T> for EmbassyBufferReader<T, CAP, SUBS, PUBS, WATCH_N>
 {
-    fn recv(
-        &mut self,
-    ) -> core::pin::Pin<Box<dyn core::future::Future<Output = Result<T, DbError>> + Send + '_>>
-    {
-        Box::pin(async move {
-            match &*self.buffer {
-                EmbassyBufferInner::SpmcRing(channel) => {
-                    // Lazily create persistent subscriber (same pattern as watch_receiver)
-                    if self.spmc_subscriber.is_none() {
-                        // SAFETY: The Arc in self.buffer keeps the PubSubChannel alive for this reader's lifetime.
-                        // We extend the lifetime to 'static to store the subscriber, which is safe because
-                        // the subscriber is dropped with the reader.
-                        let channel_static: &'static embassy_sync::pubsub::PubSubChannel<
-                            CriticalSectionRawMutex,
-                            T,
-                            CAP,
-                            SUBS,
-                            PUBS,
-                        > = unsafe { &*(channel as *const _) };
-                        self.spmc_subscriber = Some(
-                            channel_static.subscriber().map_err(|_| {
-                                defmt::error!(
-                                    "AimDB: SpmcRing subscriber slot exhausted (max SUBS={}). \
-                                     Increase the CONSUMERS const generic on buffer_sized<CAP, CONSUMERS>. \
-                                     Count one slot per .tap(), .link_to() connector, and each transform_join input.",
-                                    SUBS
-                                );
-                                DbError::BufferClosed {
-                                    buffer_name: String::from("embassy spmc ring"),
-                                }
-                            })?,
-                        );
-                    }
-                    match self.spmc_subscriber.as_mut().unwrap().next_message().await {
-                        WaitResult::Message(value) => {
-                            #[cfg(feature = "metrics")]
-                            self.metrics.increment_consumed();
-                            Ok(value)
+    fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>> {
+        match &*self.buffer {
+            EmbassyBufferInner::SpmcRing(channel) => {
+                // Lazily create the reusable future (allocates its single box
+                // once; reused for every message thereafter).
+                if self.spmc_recv.is_none() {
+                    match make_spmc_sub(channel) {
+                        Ok(sub) => {
+                            self.spmc_recv = Some(ReusableBoxFuture::new(spmc_recv_fut(sub)))
                         }
-                        WaitResult::Lagged(n) => {
-                            #[cfg(feature = "metrics")]
-                            self.metrics.add_dropped(n);
-                            Err(DbError::BufferLagged {
-                                lag_count: n,
-                                buffer_name: String::from("embassy spmc ring"),
-                            })
+                        Err(e) => return Poll::Ready(Err(e)),
+                    }
+                }
+                let recv = self.spmc_recv.as_mut().unwrap();
+                match recv.poll(cx) {
+                    Poll::Ready((result, sub)) => {
+                        recv.set(spmc_recv_fut(sub));
+                        match result {
+                            WaitResult::Message(value) => {
+                                #[cfg(feature = "metrics")]
+                                self.metrics.increment_consumed();
+                                Poll::Ready(Ok(value))
+                            }
+                            WaitResult::Lagged(n) => {
+                                #[cfg(feature = "metrics")]
+                                self.metrics.add_dropped(n);
+                                Poll::Ready(Err(DbError::BufferLagged {
+                                    lag_count: n,
+                                    buffer_name: String::from("embassy spmc ring"),
+                                }))
+                            }
                         }
                     }
+                    Poll::Pending => Poll::Pending,
                 }
-                EmbassyBufferInner::Watch(watch) => {
-                    // Watch requires a persistent receiver to track seen values.
-                    // Creating a new receiver each time causes infinite loops (always returns current value).
-                    if self.watch_receiver.is_none() {
-                        // SAFETY: The Arc in self.buffer keeps the Watch alive for this reader's lifetime.
-                        // We extend the lifetime to 'static to store the receiver, which is safe because
-                        // the receiver is just (&Watch, u64 counter) and will be dropped with the reader.
-                        let watch_static: &'static embassy_sync::watch::Watch<
-                            CriticalSectionRawMutex,
-                            T,
-                            WATCH_N,
-                        > = unsafe { &*(watch as *const _) };
-
-                        self.watch_receiver = watch_static.receiver();
-                        if self.watch_receiver.is_none() {
-                            return Err(DbError::BufferClosed {
-                                buffer_name: String::from("embassy watch"),
-                            });
+            }
+            EmbassyBufferInner::Watch(watch) => {
+                if self.watch_recv.is_none() {
+                    match make_watch_rx(watch) {
+                        Ok(rx) => {
+                            self.watch_recv = Some(ReusableBoxFuture::new(watch_recv_fut(rx)))
                         }
+                        Err(e) => return Poll::Ready(Err(e)),
                     }
-
-                    // Use the persistent receiver to detect changes
-                    if let Some(ref mut rx) = self.watch_receiver {
-                        let value = rx.changed().await;
+                }
+                let recv = self.watch_recv.as_mut().unwrap();
+                match recv.poll(cx) {
+                    Poll::Ready((value, rx)) => {
+                        recv.set(watch_recv_fut(rx));
                         #[cfg(feature = "metrics")]
                         self.metrics.increment_consumed();
-                        Ok(value)
-                    } else {
-                        Err(DbError::BufferClosed {
-                            buffer_name: String::from("embassy watch"),
-                        })
+                        Poll::Ready(Ok(value))
                     }
+                    Poll::Pending => Poll::Pending,
                 }
-                EmbassyBufferInner::Mailbox(channel) => {
-                    let rx = channel.receiver();
-                    let value = rx.receive().await;
+            }
+            EmbassyBufferInner::Mailbox(channel) => match channel.poll_receive(cx) {
+                Poll::Ready(value) => {
                     #[cfg(feature = "metrics")]
                     self.metrics.increment_consumed();
-                    Ok(value)
+                    Poll::Ready(Ok(value))
                 }
-            }
-        })
+                Poll::Pending => Poll::Pending,
+            },
+        }
     }
 
     fn try_recv(&mut self) -> Result<T, DbError> {
+        // SpmcRing/Watch cursor state lives inside the reusable future; poll it
+        // with a no-op waker for non-blocking semantics (mirrors the tokio adapter).
         match &*self.buffer {
             EmbassyBufferInner::SpmcRing(channel) => {
-                // Lazily create persistent subscriber (same as recv())
-                if self.spmc_subscriber.is_none() {
-                    let channel_static: &'static embassy_sync::pubsub::PubSubChannel<
-                        CriticalSectionRawMutex,
-                        T,
-                        CAP,
-                        SUBS,
-                        PUBS,
-                    > = unsafe { &*(channel as *const _) };
-                    self.spmc_subscriber = Some(
-                        channel_static.subscriber().map_err(|_| {
-                            defmt::error!(
-                                "AimDB: SpmcRing subscriber slot exhausted (max SUBS={}). \
-                                 Increase the CONSUMERS const generic on buffer_sized<CAP, CONSUMERS>. \
-                                 Count one slot per .tap(), .link_to() connector, and each transform_join input.",
-                                SUBS
-                            );
-                            DbError::BufferClosed {
-                                buffer_name: String::from("embassy spmc ring"),
+                if self.spmc_recv.is_none() {
+                    self.spmc_recv = Some(ReusableBoxFuture::new(spmc_recv_fut(make_spmc_sub(
+                        channel,
+                    )?)));
+                }
+                let recv = self.spmc_recv.as_mut().unwrap();
+                let mut cx = Context::from_waker(core::task::Waker::noop());
+                match recv.poll(&mut cx) {
+                    Poll::Ready((result, sub)) => {
+                        recv.set(spmc_recv_fut(sub));
+                        match result {
+                            WaitResult::Message(value) => {
+                                #[cfg(feature = "metrics")]
+                                self.metrics.increment_consumed();
+                                Ok(value)
                             }
-                        })?,
-                    );
+                            WaitResult::Lagged(n) => {
+                                #[cfg(feature = "metrics")]
+                                self.metrics.add_dropped(n);
+                                Err(DbError::BufferLagged {
+                                    lag_count: n,
+                                    buffer_name: String::from("embassy spmc ring"),
+                                })
+                            }
+                        }
+                    }
+                    Poll::Pending => Err(DbError::BufferEmpty),
                 }
-                match self
-                    .spmc_subscriber
-                    .as_mut()
-                    .unwrap()
-                    .try_next_message_pure()
-                {
-                    Some(value) => {
+            }
+            EmbassyBufferInner::Watch(watch) => {
+                if self.watch_recv.is_none() {
+                    self.watch_recv = Some(ReusableBoxFuture::new(watch_recv_fut(make_watch_rx(
+                        watch,
+                    )?)));
+                }
+                let recv = self.watch_recv.as_mut().unwrap();
+                let mut cx = Context::from_waker(core::task::Waker::noop());
+                match recv.poll(&mut cx) {
+                    Poll::Ready((value, rx)) => {
+                        recv.set(watch_recv_fut(rx));
                         #[cfg(feature = "metrics")]
                         self.metrics.increment_consumed();
                         Ok(value)
                     }
-                    None => Err(DbError::BufferEmpty),
-                }
-            }
-            EmbassyBufferInner::Watch(_) => {
-                if let Some(ref mut rx) = self.watch_receiver {
-                    match rx.try_changed() {
-                        Some(value) => {
-                            #[cfg(feature = "metrics")]
-                            self.metrics.increment_consumed();
-                            Ok(value)
-                        }
-                        None => Err(DbError::BufferEmpty),
-                    }
-                } else {
-                    Err(DbError::BufferEmpty)
+                    Poll::Pending => Err(DbError::BufferEmpty),
                 }
             }
             EmbassyBufferInner::Mailbox(channel) => match channel.try_receive() {
diff --git a/aimdb-tokio-adapter/Cargo.toml b/aimdb-tokio-adapter/Cargo.toml
index 4d2f1a8..7c5f8e9 100644
--- a/aimdb-tokio-adapter/Cargo.toml
+++ b/aimdb-tokio-adapter/Cargo.toml
@@ -18,7 +18,7 @@ default = ["std", "tokio-runtime"]
 std = ["aimdb-core/std"]
 
 # Runtime features
-tokio-runtime = ["tokio", "std"]
+tokio-runtime = ["tokio", "tokio-util", "std"]
 
 # Observability features
 tracing = ["aimdb-core/tracing", "dep:tracing"]
@@ -44,9 +44,15 @@ aimdb-core = { version = "1.1.0", path = "../aimdb-core", default-features = fal
 tokio = { workspace = true, optional = true, features = [
     "time",
     "rt-multi-thread",
-    "sync",            # For broadcast, watch, Mutex, Notify
+    "sync",            # For broadcast, watch, Mutex
 ] }
 
+# ReusableBoxFuture for the broadcast/watch readers' poll_recv (design 037 / W8):
+# one heap box per subscriber lifetime, reused for every message — zero
+# per-message allocation. broadcast/watch expose no public poll API, so the
+# reader round-trips the receiver through a stored, reused future.
+tokio-util = { version = "0.7", optional = true, default-features = false }
+
 # Observability (optional)
 tracing = { workspace = true, optional = true }
 
diff --git a/aimdb-tokio-adapter/src/buffer.rs b/aimdb-tokio-adapter/src/buffer.rs
index b9934df..8a39aa3 100644
--- a/aimdb-tokio-adapter/src/buffer.rs
+++ b/aimdb-tokio-adapter/src/buffer.rs
@@ -5,15 +5,21 @@
 //!
 //! - **SPMC Ring**: `tokio::sync::broadcast` for bounded multi-consumer queues
 //! - **SingleLatest**: `tokio::sync::watch` for latest-value semantics
-//! - **Mailbox**: `tokio::sync::Mutex` + `tokio::sync::Notify` for single-slot overwrite
+//! - **Mailbox**: `std::sync::Mutex` slot + a hand-rolled waker list for
+//!   single-slot overwrite (design 037 / W8 — no `Notify`, no per-message alloc)
+//!
+//! The broadcast/watch readers are poll-based ([`BufferReader::poll_recv`]) with
+//! no per-message heap allocation: each holds a [`ReusableBoxFuture`] that
+//! round-trips its receiver (these primitives expose no public poll API), so the
+//! single boxed future is allocated once per subscriber and reused per message.
 
-use std::future::Future;
-use std::pin::Pin;
 use std::sync::{Arc, Mutex as StdMutex};
+use std::task::{Context, Poll, Waker};
 
 use aimdb_core::buffer::{Buffer, BufferCfg, BufferReader};
 use aimdb_core::DbError;
-use tokio::sync::{broadcast, watch, Notify};
+use tokio::sync::{broadcast, watch};
+use tokio_util::sync::ReusableBoxFuture;
 
 #[cfg(feature = "metrics")]
 use aimdb_core::buffer::{BufferCounters, BufferMetrics, BufferMetricsSnapshot};
@@ -25,6 +31,23 @@ pub struct TokioBuffer<T: Clone + Send + Sync + 'static> {
     metrics: Arc<BufferCounters>,
 }
 
+/// Shared state for the Mailbox (single-slot overwrite) buffer.
+///
+/// Replaces the pre-W8 `Notify` with an explicit waker list beside the slot, so
+/// `poll_recv` registers a waker on `Pending` and `push` wakes them — no
+/// `Notify` permit subtleties, no per-message allocation (design 037 / W8 §6).
+///
+/// `pub` only because it appears in the `pub` [`TokioBufferReader`] reader enum;
+/// it is an implementation detail and not part of the supported API.
+#[doc(hidden)]
+pub struct MailboxState<T> {
+    /// The single value slot; a new `push` overwrites any unconsumed value.
+    slot: Option<T>,
+    /// Parked readers, woken on `push`. Deduplicated on registration and drained
+    /// on wake, so capacity stabilizes after warmup (mirrors the WASM adapter).
+    wakers: Vec<Waker>,
+}
+
 /// Internal buffer variants using Tokio primitives
 enum TokioBufferInner<T: Clone + Send + Sync + 'static> {
     Broadcast {
@@ -33,9 +56,8 @@ enum TokioBufferInner<T: Clone + Send + Sync + 'static> {
     Watch {
         tx: watch::Sender<Option<T>>,
     },
-    Notify {
-        slot: Arc<StdMutex<Option<T>>>,
-        notify: Arc<Notify>,
+    Mailbox {
+        state: Arc<StdMutex<MailboxState<T>>>,
     },
 }
 
@@ -59,9 +81,11 @@ impl<T: Clone + Send + Sync + 'static> Buffer<T> for TokioBuffer<T> {
                 let (tx, _rx) = watch::channel(None);
                 TokioBufferInner::Watch { tx }
             }
-            BufferCfg::Mailbox => TokioBufferInner::Notify {
-                slot: Arc::new(StdMutex::new(None)),
-                notify: Arc::new(Notify::new()),
+            BufferCfg::Mailbox => TokioBufferInner::Mailbox {
+                state: Arc::new(StdMutex::new(MailboxState {
+                    slot: None,
+                    wakers: Vec::new(),
+                })),
             },
         };
 
@@ -87,9 +111,15 @@ impl<T: Clone + Send + Sync + 'static> Buffer<T> for TokioBuffer<T> {
                 // before any subscriber attaches.
                 tx.send_replace(Some(value));
             }
-            TokioBufferInner::Notify { slot, notify } => {
-                *slot.lock().unwrap() = Some(value);
-                notify.notify_waiters();
+            TokioBufferInner::Mailbox { state } => {
+                let mut guard = state.lock().unwrap();
+                guard.slot = Some(value);
+                // Wake-all: spurious wakeups are benign — losers re-poll to
+                // `Pending` and re-register (design 037 §6). Drain so the list
+                // does not accumulate stale wakers.
+                for waker in guard.wakers.drain(..) {
+                    waker.wake();
+                }
             }
         }
     }
@@ -97,18 +127,19 @@ impl<T: Clone + Send + Sync + 'static> Buffer<T> for TokioBuffer<T> {
     fn subscribe(&self) -> Self::Reader {
         match &*self.inner {
             TokioBufferInner::Broadcast { tx } => TokioBufferReader::Broadcast {
-                rx: tx.subscribe(),
+                // Allocate the reusable future box once, here, capturing the
+                // freshly-subscribed receiver — reused for every message (W8).
+                recv: ReusableBoxFuture::new(broadcast_recv(tx.subscribe())),
                 #[cfg(feature = "metrics")]
                 metrics: Arc::clone(&self.metrics),
             },
             TokioBufferInner::Watch { tx } => TokioBufferReader::Watch {
-                rx: tx.subscribe(),
+                recv: ReusableBoxFuture::new(watch_recv(tx.subscribe())),
                 #[cfg(feature = "metrics")]
                 metrics: Arc::clone(&self.metrics),
             },
-            TokioBufferInner::Notify { slot, notify } => TokioBufferReader::Notify {
-                slot: Arc::clone(slot),
-                notify: Arc::clone(notify),
+            TokioBufferInner::Mailbox { state } => TokioBufferReader::Mailbox {
+                state: Arc::clone(state),
                 #[cfg(feature = "metrics")]
                 metrics: Arc::clone(&self.metrics),
             },
@@ -138,7 +169,7 @@ impl<T: Clone + Send + Sync + 'static> aimdb_core::buffer::DynBuffer<T> for Toki
             // watch::Sender::borrow() reads the slot non-destructively.
             TokioBufferInner::Watch { tx } => tx.borrow().clone(),
             // Same Mutex the Mailbox buffer already uses for the slot.
-            TokioBufferInner::Notify { slot, .. } => slot.lock().unwrap().clone(),
+            TokioBufferInner::Mailbox { state } => state.lock().unwrap().slot.clone(),
             // broadcast has no canonical latest — see design 031 §SPMC Ring.
             TokioBufferInner::Broadcast { .. } => None,
         }
@@ -176,9 +207,9 @@ impl<T: Clone + Send + Sync + 'static> BufferMetrics for TokioBuffer<T> {
                     1
                 }
             }
-            TokioBufferInner::Notify { slot, .. } => {
+            TokioBufferInner::Mailbox { state } => {
                 // Lock held only for is_some() check, released immediately.
-                if slot.lock().unwrap().is_some() {
+                if state.lock().unwrap().slot.is_some() {
                     1
                 } else {
                     0
@@ -214,7 +245,9 @@ impl<T: Clone + Send + Sync + 'static> TokioBuffer<T> {
         F: Fn(T) -> Fut + Send + Sync + 'static,
         Fut: std::future::Future<Output = ()> + Send + 'static,
     {
-        let mut reader = self.subscribe();
+        // Wrap the concrete reader in the ergonomic, allocation-free
+        // `Reader<T>` handle so `recv().await` works (design 037 / W8).
+        let mut reader = aimdb_core::buffer::Reader::new(Box::new(self.subscribe()));
 
         tokio::spawn(async move {
             loop {
@@ -258,153 +291,213 @@ impl<T: Clone + Send + Sync + 'static> TokioBuffer<T> {
     }
 }
 
+/// Output of the broadcast reader's reusable future: the `recv()` result paired
+/// with the receiver handed back so the next future can reuse it.
+type BroadcastRecvOutput<T> = (
+    Result<T, broadcast::error::RecvError>,
+    broadcast::Receiver<T>,
+);
+
+/// Output of the watch reader's reusable future. `Ok(Option<T>)` carries the
+/// cloned latest value; `map_watch` reports `None` as `BufferClosed`, like a closed channel.
+type WatchRecvOutput<T> = (
+    Result<Option<T>, watch::error::RecvError>,
+    watch::Receiver<Option<T>>,
+);
+
+/// Await the next broadcast value, returning the receiver for reuse.
+///
+/// `broadcast::Receiver` exposes no public poll API, so the reader stores this
+/// future in a [`ReusableBoxFuture`] and round-trips the receiver through it —
+/// one allocation per subscriber, reused for every message (design 037 / W8).
+async fn broadcast_recv<T: Clone>(mut rx: broadcast::Receiver<T>) -> BroadcastRecvOutput<T> {
+    let res = rx.recv().await;
+    (res, rx)
+}
+
+/// Await the next watch change and hand the receiver back for reuse. Reads via
+/// `borrow_and_update()` so a racing push is not delivered again by the next call.
+async fn watch_recv<T: Clone>(mut rx: watch::Receiver<Option<T>>) -> WatchRecvOutput<T> {
+    let res = match rx.changed().await {
+        Ok(()) => Ok(rx.borrow_and_update().clone()),
+        Err(e) => Err(e),
+    };
+    (res, rx)
+}
+
 /// Tokio-based buffer reader
 pub enum TokioBufferReader<T: Clone + Send + Sync + 'static> {
     Broadcast {
-        rx: broadcast::Receiver<T>,
+        recv: ReusableBoxFuture<'static, BroadcastRecvOutput<T>>,
         #[cfg(feature = "metrics")]
         metrics: Arc<BufferCounters>,
     },
     Watch {
-        rx: watch::Receiver<Option<T>>,
+        recv: ReusableBoxFuture<'static, WatchRecvOutput<T>>,
         #[cfg(feature = "metrics")]
         metrics: Arc<BufferCounters>,
     },
-    Notify {
-        slot: Arc<StdMutex<Option<T>>>,
-        notify: Arc<Notify>,
+    Mailbox {
+        state: Arc<StdMutex<MailboxState<T>>>,
         #[cfg(feature = "metrics")]
         metrics: Arc<BufferCounters>,
     },
 }
 
+impl<T: Clone + Send + Sync + 'static> TokioBufferReader<T> {
+    /// Map a broadcast `recv()` result into the AimDB error space (and record
+    /// metrics). Shared by `poll_recv` and `try_recv`.
+    fn map_broadcast(
+        result: Result<T, broadcast::error::RecvError>,
+        #[cfg(feature = "metrics")] metrics: &BufferCounters,
+    ) -> Result<T, DbError> {
+        match result {
+            Ok(value) => {
+                #[cfg(feature = "metrics")]
+                metrics.increment_consumed();
+                Ok(value)
+            }
+            Err(broadcast::error::RecvError::Lagged(n)) => {
+                #[cfg(feature = "metrics")]
+                metrics.add_dropped(n);
+                Err(DbError::BufferLagged {
+                    lag_count: n,
+                    buffer_name: "broadcast".to_string(),
+                })
+            }
+            Err(broadcast::error::RecvError::Closed) => Err(DbError::BufferClosed {
+                buffer_name: "broadcast".to_string(),
+            }),
+        }
+    }
+
+    /// Map a watch `changed()` result into the AimDB error space.
+    fn map_watch(
+        result: Result<Option<T>, watch::error::RecvError>,
+        #[cfg(feature = "metrics")] metrics: &BufferCounters,
+    ) -> Result<T, DbError> {
+        match result {
+            Ok(Some(v)) => {
+                #[cfg(feature = "metrics")]
+                metrics.increment_consumed();
+                Ok(v)
+            }
+            Ok(None) | Err(_) => Err(DbError::BufferClosed {
+                buffer_name: "watch".to_string(),
+            }),
+        }
+    }
+}
+
 impl<T: Clone + Send + Sync + 'static> BufferReader<T> for TokioBufferReader<T> {
-    fn recv(&mut self) -> Pin<Box<dyn Future<Output = Result<T, DbError>> + Send + '_>> {
-        Box::pin(async move {
-            match self {
-                TokioBufferReader::Broadcast {
-                    rx,
-                    #[cfg(feature = "metrics")]
-                    metrics,
-                } => match rx.recv().await {
-                    Ok(value) => {
+    fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>> {
+        match self {
+            TokioBufferReader::Broadcast {
+                recv,
+                #[cfg(feature = "metrics")]
+                metrics,
+            } => match recv.poll(cx) {
+                Poll::Ready((result, rx)) => {
+                    // Re-arm the reusable future with the returned receiver
+                    // before handing back the value (no allocation — same future
+                    // type reuses the box).
+                    recv.set(broadcast_recv(rx));
+                    Poll::Ready(Self::map_broadcast(
+                        result,
                         #[cfg(feature = "metrics")]
-                        metrics.increment_consumed();
-                        Ok(value)
-                    }
-                    Err(broadcast::error::RecvError::Lagged(n)) => {
+                        metrics,
+                    ))
+                }
+                Poll::Pending => Poll::Pending,
+            },
+            TokioBufferReader::Watch {
+                recv,
+                #[cfg(feature = "metrics")]
+                metrics,
+            } => match recv.poll(cx) {
+                Poll::Ready((result, rx)) => {
+                    recv.set(watch_recv(rx));
+                    Poll::Ready(Self::map_watch(
+                        result,
                         #[cfg(feature = "metrics")]
-                        metrics.add_dropped(n);
-                        Err(DbError::BufferLagged {
-                            lag_count: n,
-                            buffer_name: "broadcast".to_string(),
-                        })
-                    }
-                    Err(broadcast::error::RecvError::Closed) => Err(DbError::BufferClosed {
-                        buffer_name: "broadcast".to_string(),
-                    }),
-                },
-                TokioBufferReader::Watch {
-                    rx,
-                    #[cfg(feature = "metrics")]
-                    metrics,
-                } => {
-                    rx.changed().await.map_err(|_| DbError::BufferClosed {
-                        buffer_name: "watch".to_string(),
-                    })?;
-
-                    let value = rx.borrow().clone();
-                    match value {
-                        Some(v) => {
-                            #[cfg(feature = "metrics")]
-                            metrics.increment_consumed();
-                            Ok(v)
-                        }
-                        None => Err(DbError::BufferClosed {
-                            buffer_name: "watch".to_string(),
-                        }),
-                    }
+                        metrics,
+                    ))
                 }
-                TokioBufferReader::Notify {
-                    slot,
-                    notify,
+                Poll::Pending => Poll::Pending,
+            },
+            TokioBufferReader::Mailbox {
+                state,
+                #[cfg(feature = "metrics")]
+                metrics,
+            } => {
+                let mut guard = state.lock().unwrap();
+                if let Some(value) = guard.slot.take() {
                     #[cfg(feature = "metrics")]
-                    metrics,
-                } => {
-                    loop {
-                        // Check if there's already a value
-                        {
-                            let mut guard = slot.lock().unwrap();
-                            if let Some(value) = guard.take() {
-                                #[cfg(feature = "metrics")]
-                                metrics.increment_consumed();
-                                return Ok(value);
-                            }
-                        }
-                        // No value, wait for notification
-                        notify.notified().await;
+                    metrics.increment_consumed();
+                    Poll::Ready(Ok(value))
+                } else {
+                    // Register the waker (dedup so repeated polls without a push
+                    // don't grow the list — mirrors the WASM adapter).
+                    if !guard.wakers.iter().any(|w| w.will_wake(cx.waker())) {
+                        guard.wakers.push(cx.waker().clone());
                     }
+                    Poll::Pending
                 }
             }
-        })
+        }
     }
 
     fn try_recv(&mut self) -> Result<T, DbError> {
         match self {
+            // `broadcast`/`watch` have no public poll API, and their receivers
+            // live inside the reusable future. Poll that future with a no-op
+            // waker: `Ready` means a value/error is available now (try-recv
+            // semantics); `Pending` means empty. On `Ready`, re-arm the future.
             TokioBufferReader::Broadcast {
-                rx,
+                recv,
                 #[cfg(feature = "metrics")]
                 metrics,
-            } => match rx.try_recv() {
-                Ok(value) => {
-                    #[cfg(feature = "metrics")]
-                    metrics.increment_consumed();
-                    Ok(value)
-                }
-                Err(broadcast::error::TryRecvError::Empty) => Err(DbError::BufferEmpty),
-                Err(broadcast::error::TryRecvError::Lagged(n)) => {
-                    #[cfg(feature = "metrics")]
-                    metrics.add_dropped(n);
-                    Err(DbError::BufferLagged {
-                        lag_count: n,
-                        buffer_name: "broadcast".to_string(),
-                    })
+            } => {
+                let waker = Waker::noop();
+                let mut cx = Context::from_waker(waker);
+                match recv.poll(&mut cx) {
+                    Poll::Ready((result, rx)) => {
+                        recv.set(broadcast_recv(rx));
+                        Self::map_broadcast(
+                            result,
+                            #[cfg(feature = "metrics")]
+                            metrics,
+                        )
+                    }
+                    Poll::Pending => Err(DbError::BufferEmpty),
                 }
-                Err(broadcast::error::TryRecvError::Closed) => Err(DbError::BufferClosed {
-                    buffer_name: "broadcast".to_string(),
-                }),
-            },
+            }
             TokioBufferReader::Watch {
-                rx,
+                recv,
                 #[cfg(feature = "metrics")]
                 metrics,
-            } => match rx.has_changed() {
-                Err(_) => Err(DbError::BufferClosed {
-                    buffer_name: "watch".to_string(),
-                }),
-                Ok(false) => Err(DbError::BufferEmpty),
-                Ok(true) => {
-                    let val = rx.borrow_and_update().clone();
-                    match val {
-                        Some(v) => {
+            } => {
+                let waker = Waker::noop();
+                let mut cx = Context::from_waker(waker);
+                match recv.poll(&mut cx) {
+                    Poll::Ready((result, rx)) => {
+                        recv.set(watch_recv(rx));
+                        Self::map_watch(
+                            result,
                             #[cfg(feature = "metrics")]
-                            metrics.increment_consumed();
-                            Ok(v)
-                        }
-                        None => Err(DbError::BufferClosed {
-                            buffer_name: "watch".to_string(),
-                        }),
+                            metrics,
+                        )
                     }
+                    Poll::Pending => Err(DbError::BufferEmpty),
                 }
-            },
-            TokioBufferReader::Notify {
-                slot,
-                notify: _,
+            }
+            TokioBufferReader::Mailbox {
+                state,
                 #[cfg(feature = "metrics")]
                 metrics,
             } => {
-                let mut guard = slot.lock().unwrap();
-                match guard.take() {
+                let mut guard = state.lock().unwrap();
+                match guard.slot.take() {
                     Some(val) => {
                         #[cfg(feature = "metrics")]
                         metrics.increment_consumed();
@@ -420,12 +513,19 @@ impl<T: Clone + Send + Sync + 'static> BufferReader<T> for TokioBufferReader<T>
 #[cfg(test)]
 mod tests {
     use super::*;
+    use aimdb_core::buffer::Reader;
+
+    /// Wrap a concrete `TokioBufferReader` in the ergonomic `Reader<T>` so the
+    /// tests can keep exercising `recv().await` / `try_recv()` (design 037 / W8).
+    fn rdr<T: Clone + Send + Sync + 'static>(buffer: &TokioBuffer<T>) -> Reader<T> {
+        Reader::new(Box::new(buffer.subscribe()))
+    }
 
     #[tokio::test]
     async fn test_spmc_ring_basic() {
         let cfg = BufferCfg::SpmcRing { capacity: 10 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
         buffer.push(42);
         assert_eq!(reader.recv().await.unwrap(), 42);
     }
@@ -434,8 +534,8 @@ mod tests {
     async fn test_spmc_ring_multiple_consumers() {
         let cfg = BufferCfg::SpmcRing { capacity: 10 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader1 = buffer.subscribe();
-        let mut reader2 = buffer.subscribe();
+        let mut reader1 = rdr(&buffer);
+        let mut reader2 = rdr(&buffer);
         buffer.push(1);
         buffer.push(2);
         assert_eq!(reader1.recv().await.unwrap(), 1);
@@ -448,7 +548,7 @@ mod tests {
     async fn test_single_latest_basic() {
         let cfg = BufferCfg::SingleLatest;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
         buffer.push(42);
         assert_eq!(reader.recv().await.unwrap(), 42);
     }
@@ -457,7 +557,7 @@ mod tests {
     async fn test_single_latest_skip_intermediate() {
         let cfg = BufferCfg::SingleLatest;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
         buffer.push(1);
         buffer.push(2);
         buffer.push(3);
@@ -468,7 +568,7 @@ mod tests {
     async fn test_mailbox_basic() {
         let cfg = BufferCfg::Mailbox;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
         buffer.push(42);
         assert_eq!(reader.recv().await.unwrap(), 42);
     }
@@ -477,7 +577,7 @@ mod tests {
     async fn test_mailbox_overwrite() {
         let cfg = BufferCfg::Mailbox;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
         buffer.push(1);
         buffer.push(2);
         assert_eq!(reader.recv().await.unwrap(), 2);
@@ -564,7 +664,7 @@ mod tests {
         let cfg = BufferCfg::SpmcRing { capacity: 3 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
 
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Send more messages than capacity without reading
         // This will cause the slow reader to lag
@@ -607,9 +707,9 @@ mod tests {
         let buffer = TokioBuffer::<i32>::new(&cfg);
 
         // Create three independent readers
-        let mut reader1 = buffer.subscribe();
-        let mut reader2 = buffer.subscribe();
-        let mut reader3 = buffer.subscribe();
+        let mut reader1 = rdr(&buffer);
+        let mut reader2 = rdr(&buffer);
+        let mut reader3 = rdr(&buffer);
 
         // Send values
         for i in 0..5 {
@@ -631,7 +731,7 @@ mod tests {
         let cfg = BufferCfg::SingleLatest;
         let buffer = TokioBuffer::<i32>::new(&cfg);
 
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Send multiple values rapidly
         buffer.push(1);
@@ -659,8 +759,8 @@ mod tests {
         let buffer = TokioBuffer::<i32>::new(&cfg);
 
         // Create readers BEFORE sending values
-        let mut reader1 = buffer.subscribe();
-        let mut reader2 = buffer.subscribe();
+        let mut reader1 = rdr(&buffer);
+        let mut reader2 = rdr(&buffer);
 
         // Send values
         buffer.push(10);
@@ -681,7 +781,7 @@ mod tests {
         let cfg = BufferCfg::Mailbox;
         let buffer = TokioBuffer::<i32>::new(&cfg);
 
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Send first value
         buffer.push(1);
@@ -709,7 +809,7 @@ mod tests {
         let cfg = BufferCfg::Mailbox;
         let buffer = TokioBuffer::<i32>::new(&cfg);
 
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Send and immediately read
         buffer.push(10);
@@ -854,7 +954,7 @@ mod tests {
     async fn test_try_recv_broadcast_empty() {
         let cfg = BufferCfg::SpmcRing { capacity: 16 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // No values written — try_recv returns Empty
         assert!(matches!(reader.try_recv(), Err(DbError::BufferEmpty)));
@@ -864,7 +964,7 @@ mod tests {
     async fn test_try_recv_broadcast_single_value() {
         let cfg = BufferCfg::SpmcRing { capacity: 16 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         buffer.push(42);
         assert_eq!(reader.try_recv().unwrap(), 42);
@@ -877,7 +977,7 @@ mod tests {
     async fn test_try_recv_broadcast_drains_all_pending() {
         let cfg = BufferCfg::SpmcRing { capacity: 16 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Write 5 values
         for i in 0..5 {
@@ -901,7 +1001,7 @@ mod tests {
     async fn test_try_recv_broadcast_handles_lag() {
         let cfg = BufferCfg::SpmcRing { capacity: 4 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Write 10 values into capacity-4 ring — reader falls behind
         for i in 0..10 {
@@ -936,7 +1036,7 @@ mod tests {
     async fn test_try_recv_watch_empty() {
         let cfg = BufferCfg::SingleLatest;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // No values written — try_recv returns Empty
         assert!(matches!(reader.try_recv(), Err(DbError::BufferEmpty)));
@@ -946,7 +1046,7 @@ mod tests {
     async fn test_try_recv_watch_returns_latest() {
         let cfg = BufferCfg::SingleLatest;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         buffer.push(1);
         buffer.push(2);
@@ -968,7 +1068,7 @@ mod tests {
     async fn test_try_recv_mailbox_empty() {
         let cfg = BufferCfg::Mailbox;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // No values written — try_recv returns Empty
         assert!(matches!(reader.try_recv(), Err(DbError::BufferEmpty)));
@@ -978,7 +1078,7 @@ mod tests {
     async fn test_try_recv_mailbox_takes_value() {
         let cfg = BufferCfg::Mailbox;
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         buffer.push(1);
         buffer.push(2); // overwrites
@@ -999,7 +1099,7 @@ mod tests {
     async fn test_try_recv_interleaved_push_and_drain() {
         let cfg = BufferCfg::SpmcRing { capacity: 16 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Push 3, drain all
         buffer.push(1);
@@ -1038,8 +1138,8 @@ mod tests {
     async fn test_try_recv_multiple_independent_readers() {
         let cfg = BufferCfg::SpmcRing { capacity: 16 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader_a = buffer.subscribe();
-        let mut reader_b = buffer.subscribe();
+        let mut reader_a = rdr(&buffer);
+        let mut reader_b = rdr(&buffer);
 
         // Push values
         for i in 0..5 {
@@ -1075,7 +1175,7 @@ mod tests {
     async fn test_try_recv_after_async_recv() {
         let cfg = BufferCfg::SpmcRing { capacity: 16 };
         let buffer = TokioBuffer::<i32>::new(&cfg);
-        let mut reader = buffer.subscribe();
+        let mut reader = rdr(&buffer);
 
         // Push 3 values
         buffer.push(10);
@@ -1104,6 +1204,7 @@ mod tests {
 
     mod peek_tests {
         use super::super::*;
+        use super::rdr;
         use aimdb_core::buffer::DynBuffer;
 
         #[tokio::test]
@@ -1127,7 +1228,7 @@ mod tests {
             // Subscribe BEFORE push so the receiver's version counter advances
             // on send_replace. (Watch receivers created after a push will only
             // wake on the *next* push — that's the gap peek() exists to fill.)
-            let mut reader = Buffer::subscribe(&buffer);
+            let mut reader = rdr(&buffer);
             DynBuffer::push(&buffer, 42);
 
             // Multiple peeks return the same value.
@@ -1169,7 +1270,7 @@ mod tests {
             DynBuffer::push(&buffer, 99);
             assert_eq!(buffer.peek(), Some(99));
             // Subscriber takes the slot.
-            let mut reader = Buffer::subscribe(&buffer);
+            let mut reader = rdr(&buffer);
             assert_eq!(reader.recv().await.unwrap(), 99);
             // After take(), peek sees the slot is empty.
             assert_eq!(buffer.peek(), None);
@@ -1228,7 +1329,7 @@ mod tests {
         async fn test_spmc_ring_consumed_count() {
             let cfg = BufferCfg::SpmcRing { capacity: 10 };
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             // Push and consume
             buffer.push(1);
@@ -1247,7 +1348,7 @@ mod tests {
         async fn test_spmc_ring_dropped_count_on_lag() {
             let cfg = BufferCfg::SpmcRing { capacity: 3 };
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             // Overfill buffer to cause lag
             for i in 0..10 {
@@ -1271,7 +1372,7 @@ mod tests {
         async fn test_metrics_reset() {
             let cfg = BufferCfg::SpmcRing { capacity: 10 };
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             // Generate some metrics
             buffer.push(1);
@@ -1295,7 +1396,7 @@ mod tests {
         async fn test_watch_buffer_metrics() {
             let cfg = BufferCfg::SingleLatest;
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             buffer.push(1);
             buffer.push(2);
@@ -1314,7 +1415,7 @@ mod tests {
         async fn test_mailbox_buffer_metrics() {
             let cfg = BufferCfg::Mailbox;
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             buffer.push(1);
             let _ = reader.recv().await.unwrap();
@@ -1350,7 +1451,7 @@ mod tests {
         async fn test_try_recv_tracks_consumed_metrics() {
             let cfg = BufferCfg::SpmcRing { capacity: 10 };
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             // Push and try_recv
             buffer.push(1);
@@ -1372,7 +1473,7 @@ mod tests {
         async fn test_try_recv_tracks_dropped_on_lag() {
             let cfg = BufferCfg::SpmcRing { capacity: 3 };
             let buffer = TokioBuffer::<i32>::new(&cfg);
-            let mut reader = buffer.subscribe();
+            let mut reader = rdr(&buffer);
 
             // Overfill to cause lag
             for i in 0..10 {
diff --git a/aimdb-wasm-adapter/src/buffer.rs b/aimdb-wasm-adapter/src/buffer.rs
index 45d0110..46dabd1 100644
--- a/aimdb-wasm-adapter/src/buffer.rs
+++ b/aimdb-wasm-adapter/src/buffer.rs
@@ -20,8 +20,6 @@ use alloc::collections::VecDeque;
 use alloc::rc::Rc;
 use alloc::vec::Vec;
 use core::cell::{Cell, RefCell};
-use core::future::Future;
-use core::pin::Pin;
 use core::task::{Context, Poll, Waker};
 
 use aimdb_core::buffer::{Buffer, BufferCfg, BufferReader, DynBuffer};
@@ -206,8 +204,38 @@ enum ReaderState {
 }
 
 impl<T: Clone + Send + 'static> BufferReader<T> for WasmBufferReader<T> {
-    fn recv(&mut self) -> Pin<Box<dyn Future<Output = Result<T, DbError>> + Send + '_>> {
-        Box::pin(WasmRecvFuture { reader: self })
+    /// Poll for the next value (design 037 / W8).
+    ///
+    /// On each poll:
+    /// 1. Try to read a value (non-blocking).
+    /// 2. If available, return `Poll::Ready(Ok(value))`.
+    /// 3. If not, register the waker and return `Poll::Pending`.
+    ///
+    /// The waker is woken when [`WasmBuffer::push()`](WasmBuffer) fires. This is
+    /// allocation-free — the pre-W8 `Box::pin(WasmRecvFuture { .. })` existed
+    /// solely to satisfy the old async trait signature.
+    fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>> {
+        // Try non-blocking read first
+        match self.try_recv() {
+            Ok(value) => Poll::Ready(Ok(value)),
+            Err(e @ DbError::BufferLagged { .. }) => Poll::Ready(Err(e)),
+            Err(DbError::BufferEmpty) => {
+                // Register waker so we get woken on next push
+                let mut inner = self.buffer.borrow_mut();
+                let wakers = match &mut *inner {
+                    WasmBufferInner::SpmcRing { wakers, .. } => wakers,
+                    WasmBufferInner::SingleLatest { wakers, .. } => wakers,
+                    WasmBufferInner::Mailbox { wakers, .. } => wakers,
+                };
+                // Deduplicate: only add if no existing waker will wake the same task.
+                // Prevents unbounded growth when a single reader is polled repeatedly.
+                if !wakers.iter().any(|w| w.will_wake(cx.waker())) {
+                    wakers.push(cx.waker().clone());
+                }
+                Poll::Pending
+            }
+            Err(e) => Poll::Ready(Err(e)),
+        }
     }
 
     fn try_recv(&mut self) -> Result<T, DbError> {
@@ -264,55 +292,6 @@ impl<T: Clone + Send + 'static> BufferReader<T> for WasmBufferReader<T> {
     }
 }
 
-// ============================================================================
-// Async recv future
-// ============================================================================
-
-/// Future returned by `WasmBufferReader::recv()`.
-///
-/// On each poll:
-/// 1. Try to read a value (non-blocking).
-/// 2. If available, return `Poll::Ready(Ok(value))`.
-/// 3. If not, register the waker and return `Poll::Pending`.
-///
-/// The waker is woken when `WasmBuffer::push()` fires.
-struct WasmRecvFuture<'a, T> {
-    reader: &'a mut WasmBufferReader<T>,
-}
-
-// SAFETY: wasm32 is single-threaded
-unsafe impl<T> Send for WasmRecvFuture<'_, T> {}
-
-impl<T: Clone + Send + 'static> Future for WasmRecvFuture<'_, T> {
-    type Output = Result<T, DbError>;
-
-    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
-        let this = self.get_mut();
-
-        // Try non-blocking read first
-        match this.reader.try_recv() {
-            Ok(value) => Poll::Ready(Ok(value)),
-            Err(e @ DbError::BufferLagged { .. }) => Poll::Ready(Err(e)),
-            Err(DbError::BufferEmpty) => {
-                // Register waker so we get woken on next push
-                let mut inner = this.reader.buffer.borrow_mut();
-                let wakers = match &mut *inner {
-                    WasmBufferInner::SpmcRing { wakers, .. } => wakers,
-                    WasmBufferInner::SingleLatest { wakers, .. } => wakers,
-                    WasmBufferInner::Mailbox { wakers, .. } => wakers,
-                };
-                // Deduplicate: only add if no existing waker will wake the same task.
-                // Prevents unbounded growth when a single reader is polled repeatedly.
-                if !wakers.iter().any(|w| w.will_wake(cx.waker())) {
-                    wakers.push(cx.waker().clone());
-                }
-                Poll::Pending
-            }
-            Err(e) => Poll::Ready(Err(e)),
-        }
-    }
-}
-
 // ============================================================================
 // Helpers
 // ============================================================================
diff --git a/docs/design/037-zero-alloc-consume-path.md b/docs/design/037-zero-alloc-consume-path.md
new file mode 100644
index 0000000..fe392cc
--- /dev/null
+++ b/docs/design/037-zero-alloc-consume-path.md
@@ -0,0 +1,145 @@
+# 037 — Zero-Allocation Consume Path: Poll-Based `BufferReader` SPI (W8)
+
+**Status:** Implemented 2026-06-20 (pending review), stacked on the `aimdb-bench` crate (design 038). Builds on the 034/035/036 review cycle and the #141–#147 series. **Must land inside the currently-open breaking window** (same release as the W1/W2 SPI breaks) or it waits for the next major. Host B0–B2 measured (§9); B3 on-target and embassy-host B0 are follow-ups.
+
+---
+
+## 1. Where this sits
+
+036 W1 (PR #141) removed the per-message `dyn Any` erasure from the connector, session-pump, and AimX paths. Its acceptance criterion was a `dyn Any` grep — which structurally cannot see boxed *futures*. Post-W1, exactly one AimDB-added per-message heap allocation remains: the `Pin<Box<dyn Future>>` constructed on every `recv()`.
+
+The asymmetry is now stark:
+
+| Direction | State after #141–#147 |
+|---|---|
+| Write path | Solved by 029: sync `push`, pre-bound handle, one vtable call, zero alloc |
+| Inbound (connector → record) | Solved by W1: fused **sync** typed ingest closure, zero AimDB-added alloc |
+| Consume path (record → consumer / connector / remote) | **One heap allocation per message, per reader** — this doc |
+
+W8 closes the loop. End state: **zero AimDB-added heap allocations per message, end to end; no locks held across `await`; abstraction cost = one indirect call — enforced in CI, measured in `aimdb-bench/benches/`.**
+
+## 2. Current state (verified, pr147 HEAD)
+
+| Path | Mechanism | Per-message cost |
+|---|---|---|
+| In-process consumer | [`BufferReader::recv` → `Pin<Box<dyn Future>>`](../../aimdb-core/src/buffer/traits.rs#L142); `Box::pin` at [tokio `buffer.rs:283`](../../aimdb-tokio-adapter/src/buffer.rs#L283), [embassy `buffer.rs:416`](../../aimdb-embassy-adapter/src/buffer.rs#L416), [wasm `buffer.rs:210`](../../aimdb-wasm-adapter/src/buffer.rs#L210) | 1 heap box + 1 indirect call |
+| Outbound connector | [`SerializedSource::recv` → `RecvSerializedFuture`](../../aimdb-core/src/connector.rs#L101) (W1 removed the value-level `Box<dyn Any>`; the future box stayed per the 036 W1 risk note "manual boxed-future pattern — keep it") | 1 heap box |
+| Remote access / JSON | [`recv_json`](../../aimdb-core/src/buffer/traits.rs#L177) boxes around [`JsonReaderAdapter`](../../aimdb-core/src/typed_record.rs#L137)'s inner `recv()`, which boxes again | 2 heap boxes |
+| Inbound | fused sync ingest (W1) | 0 |
+| Produce | [`WriteHandle::push`](../../aimdb-core/src/buffer/traits.rs) sync | 0 (1 indirect call) |
+
+Notable: the WASM adapter already owns a hand-rolled poll struct — `Box::pin(WasmRecvFuture { reader: self })`. The allocation exists **only to satisfy the trait signature**. The signature is the problem, not the implementations.
+
+Out of scope, recorded in §7: the latest-snapshot [`spin::Mutex`](../../aimdb-core/src/typed_record.rs#L36) on the produce path.
+
+## 3. Approach
+
+### 3.1 The SPI change
+
+Object safety and `async fn` conflict; object safety and `poll` do not. Replace the async method on the erased trait with its poll form:
+
+```rust
+pub trait BufferReader<T: Clone + Send>: Send {
+    /// Poll for the next value. Registers `cx.waker()` when `Pending`.
+    fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>>;
+
+    fn try_recv(&mut self) -> Result<T, DbError>; // unchanged
+}
+```
+
+The **consumer-facing API does not move**. `Reader<T>::recv()` remains `async`, implemented once:
+
+```rust
+pub async fn recv(&mut self) -> Result<T, DbError> {
+    core::future::poll_fn(|cx| self.inner.poll_recv(cx)).await
+}
+```
+
+`core::future::poll_fn` is stable (1.64) and lives in `core` — `no_std`-clean, zero allocation, no `unsafe`. Call sites in examples and aimdb-pro compile unmodified. Only `BufferReader` *implementors* break: an SPI break, in the window that #131/#135/#141 already opened.
+
+### 3.2 Per adapter (in implementation order)
+
+1. **WASM — unbox what already exists.** `WasmRecvFuture`'s poll body *becomes* `poll_recv`; delete the `Box::pin`. Smallest diff; do it first to validate the trait shape.
+2. **Tokio Mailbox.** Currently `Mutex<slot>` + `Notify` ([`buffer.rs:8`](../../aimdb-tokio-adapter/src/buffer.rs#L8)). Replace `Notify` with waker storage beside the slot — single-slot take semantics is the textbook poll pattern. Drops the `Notify` permit subtleties entirely. Waker contract: see §6.
+3. **Embassy.** Verified against the locked embassy-sync **0.8.0**: `Channel` exposes [`poll_receive(&self, cx)`](https://docs.rs/embassy-sync/0.8.0/embassy_sync/channel/struct.Channel.html) natively (channel.rs:332 in the crate source) and pubsub `Subscriber` implements `Stream` (`poll_next`). Direct mapping, zero stored state. If the Watch-backed path lacks a public poll fn, mirror its `changed()` poll body — embassy futures are hand-rolled poll structs; mechanical.
+4. **Tokio Broadcast — the one residue.** `broadcast::Receiver` exposes no public poll API. Use the `BroadcastStream` technique: a `tokio_util::sync::ReusableBoxFuture` owned by the reader — **one allocation per subscriber lifetime, reused for every message**. This is a Tokio API limitation, not an AimDB design cost; documented as such.
+
+### 3.3 Fused and remote paths inherit it
+
+`SerializedSource` composes over `poll_recv` (subscribe → poll → serialize stays fused inside the registration closure); `RecvSerializedFuture` either becomes a poll method or keeps its async wrapper over the inner poll — either way the inner box is gone. `JsonBufferReader` collapses the same way; the remote-access **double** box disappears with no separate work item.
+
+## 4. Measurement program — the centerpiece (lands *with* the change, not after)
+
+W8 exists to make a claim provable; the proof ships in the same series. These benches instantiate the L0/L1 layers of the benchmark-pyramid plan for the consume path; the three canonical workload profiles — **Telemetry/`SpmcRing`**, **State/`SingleLatest`**, **Command/`Mailbox`** — are the unit of reporting throughout. New workspace member `aimdb-bench` (criterion; host-only; dev-deps fenced from the `no_std` graph).
+
+**B0 — Allocation count (hard gate + headline number).** Counting `#[global_allocator]` wrapper in dedicated host test binaries — the W6 `host_test_stubs!` infrastructure is the natural home for the embassy-host build. Protocol: set up producer + subscribers; warmup absorbs one-time setup (including the tokio-broadcast `ReusableBoxFuture`); snapshot the counter; push/consume N = 10 000; assert **and report** allocations/message. Expected: **1 → 0**, per buffer profile × {tokio, embassy-host}. Doubles as the CI gate (§5) and the first table row of any publication. WASM: covered by the unboxed `WasmRecvFuture` unit tests; alloc-counting under wasm32 in CI is not worth the harness (justified per the #146 compile-delete-or-justify rule).
+
+**B1 — Leaf latency (L0).** Single-message publish→`recv`-return, ping-pong, **three-way per profile**: raw primitive (`broadcast` / `watch` / embassy `Channel` on host) vs AimDB-before vs AimDB-after. Tokio current-thread runtime, criterion `async_executor`, pinned core; report p50/p99 (criterion medians, never means). Two defined deltas, used consistently everywhere downstream: (after − raw) = **abstraction cost** — the README number; (before − after) = **what W8 bought** — the publication number.
+
+**B2 — Steady-state throughput (L1).** msgs/sec at saturation, SPSC and 1→4 fan-out per profile. Fan-out deliberately exposes the `T: Clone` per-consumer copy cost — measured and reported, not hidden. Same three-way comparison as B1.
+
+**B3 — On-target (Cortex-M).** The W3 KNX hardware rig already exists; reuse it. DWT `CYCCNT` around `recv` on the STM32H5, defmt-reported, N = 10 000: **cycles/message** before/after, plus the embedded-alloc **heap high-water mark** over the run — the fragmentation story no host bench can tell, and the number that makes the claim credible to the embedded audience. Ships as a flashable example with documented rig setup.
+
+**Methodology constraints (so the numbers survive review).** Current-thread executors and pinned cores isolate leaf cost from scheduler noise; warmup excluded from samples; rustc version, criterion configuration, host CPU, and B3 rig recorded in §9 alongside the results; results are explicitly microbenchmark scope — **no system-throughput claims are derived from them**. Reproduction is one command (`cargo bench -p aimdb-bench`) against the committed lockfile.
+
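+**CI.** B0 is intended as a required gate on PRs touching `aimdb-core/src/buffer/`, the adapter `buffer.rs` files, or `connector.rs`; as shipped, the B0 CI wiring is advisory (report-only) per design 038 §6, and promoting it to a hard gate is a follow-up (see §5). B1/B2 run on the same trigger but report as trend only — CI runners are too noisy to gate on nanoseconds. B3 is a release-checklist item, not CI.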
+
+## 5. Acceptance criteria
+
+- [x] `grep -rn "Box::pin" aimdb-{tokio,embassy,wasm}-adapter/src/buffer.rs aimdb-core/src/buffer/ aimdb-core/src/connector.rs` → zero hits on per-message construction sites. (Remaining hits are doc comments only: the wasm reader's note about the *removed* box, and the connector.rs BYOC doc example. The reused futures live in `ReusableBoxFuture::new` at subscribe time — tokio broadcast/watch — and the embassy `no_std` equivalent, all per-subscriber, not per-message.)
+- [x] `BufferReader::recv`/`JsonBufferReader::recv_json` (boxed-future form) deleted; `poll_recv`/`poll_recv_json` object-safe; `thumbv7em-none-eabihf` and `wasm32-unknown-unknown` builds + clippy green.
+- [x] **B0: 0 allocations/message** across the 3 tokio buffer profiles (was 1). Committed to `aimdb-bench/data/baselines/b0_alloc_tokio.json`. CI wiring is **advisory (report-only) per design 038 §6**; a hard gate and the embassy-host adapter B0 are the documented follow-up.
+- [x] Examples and aimdb-pro compile with **no call-site changes** (`subscribe().recv().await` unchanged; `Consumer`/`AimDb`/`TypedRecord::subscribe` now return `Reader<T>`). Concrete-reader holders wrap once via `Reader::new(Box::new(..))`.
+- [x] SPI break recorded in `aimdb-core` CHANGELOG; ships in the same release as the W1/W2 breaks.
+- [x] §9 populated for B0–B2 (host), rustc + criterion recorded. **B3 (on-target, STM32H5) deferred** to a hardware session per §8/§4.
+
+## 6. Risk notes
+
+- **Cancellation semantics on tokio broadcast.** The in-flight inner `Recv` future now persists across caller polls (owned `ReusableBoxFuture`) instead of being dropped per call. Strictly fewer lost-wakeup hazards; behavioral note: a value claimed by the inner future just before caller cancellation is delivered on the *next* poll rather than dropped — consistent with broadcast cursor semantics. Document on the reader.
+- **Lagged mapping** unchanged: `Lagged(n)` → `DbError::BufferLagged { lag_count: n, .. }`; pinned by existing adapter tests.
+- **Mailbox waker contract.** Single-slot take semantics with potentially multiple readers: store wakers, wake-**all** on push (spurious wakeups are benign; losers re-poll to `Pending`). Wake-one is an optimization with a starvation analysis attached — not now.
+- **Auto-traits.** The `poll_fn` future is `Send` iff the reader is; verify at the session-pump and connector spawn sites (compiler enforces; listed so the error is expected, not surprising).
+- **Embassy Watch poll surface** unverified on 0.8.0 — confirm during step 3; fallback is the mechanical mirror noted in §3.2.
+
+## 7. Dormant items (trigger-only)
+
+| Item | Decision | Re-open trigger |
+|---|---|---|
+| Generic fast lane `Reader<T, B>` / `Producer<T, B>` (default type param keeps `Reader<T>` = boxed lane; adapter seam already exists: [`subscribe() -> Self::Reader`](../../aimdb-tokio-adapter/src/buffer.rs#L97) vs [`subscribe_boxed()`](../../aimdb-tokio-adapter/src/buffer.rs#L128)) | Not shipped. Post-W8 the dyn lane costs one indirect call and zero allocs; monomorphization stamps `T×B` copies against the ~50 KB flash budget | §9 numbers (B1/B3) or an MCU flame graph show the per-message indirect call as a measurable fraction of a real workload's budget |
+| Latest-snapshot `spin::Mutex` → atomic ptr swap (portable-atomic) | Keep — bounded, tiny critical section on produce only | Producer-side profiling shows it, or a hard "no spinlocks" claim is wanted for marketing |
+
+## 8. Sequencing and size
+
+1. **B0–B2 scaffolding first**; baseline numbers on current HEAD committed (the "before" columns). B3 baseline captured on the W3 rig — fold into the next scheduled hardware session.
+2. WASM unbox → Tokio Mailbox waker rewrite → Embassy `poll_receive` mapping → Tokio broadcast `ReusableBoxFuture`.
+3. Flip the trait, delete `recv()`, migrate `SerializedSource` + JSON paths, fix fallout.
+4. Gates into CI; populate §9 after-columns; status row in 036 §5 pointing here; README/website wording PR ("zero allocations per message, end to end; overhead: one indirect call, X ns") in the same series as the after-numbers — never before.
+
+**Size:** M — wide but mechanical (three adapters + two fused paths). The only design-sensitive piece is the Mailbox waker contract (§6).
+
+## 9. Results (populated by §4)
+
+**Environment:** rustc `1.91.1` · criterion `0.5` · host CPU `dev container (shared, noisy — host B1/B2 are indicative trend only, not gated)` · B3: STM32H5 @ `—` MHz, embassy-executor `—`, embedded-alloc `—` (deferred follow-up).
+
+**Host (B0–B2), measured 2026-06-20 via `aimdb-bench`.** B0 is the gate and headline; B1/B2 are after-W8 medians on a shared container (the three-way raw/before split and embassy-host B0 are deferred to the embassy follow-up per scope).
+
+| Profile | allocs/msg (B0) before → **after** | p50 (B1) after | msgs/s (B2) after |
+|---|---|---|---|
+| Telemetry — Tokio SpmcRing (`broadcast`) | 1 → **0** | ~195 ns | ~5.2 M/s |
+| State — Tokio SingleLatest (`watch`) | 1 → **0** | ~446 ns | ~2.5 M/s |
+| Command — Tokio Mailbox | 1 → **0** | ~70 ns | ~13.6 M/s |
+| Telemetry — Embassy Channel (host) | deferred (embassy-host B0 follow-up) | — | — |
+
+**Headline:** the last AimDB-added per-message heap allocation on the in-process consume path is removed — **1 → 0 allocs/msg** on every tokio buffer profile, byte total 0 in the measured window. Abstraction cost is one indirect call per `recv` (the boxed `Reader<T>` lane). Bytes/msg before-W8 was 144 B (the `Box::pin(async move …)` future) → 0 B after.
+
+**On-target (B3, STM32H5):**
+
+| Metric | Before | After |
+|---|---|---|
+| cycles/message (DWT `CYCCNT`) | — | — |
+| heap high-water mark, 10 k msgs | — | — |
+| allocations/message | — | — |
+
+## 10. Publication note (r/rust)
+
+The §9 tables are the post's spine; **the post follows the merged numbers, never precedes them**. Framing is purely technical per the established channel split: the find (`WasmRecvFuture` boxed solely to satisfy a trait signature), the mechanism (object safety vs `async fn`; poll as the escape that keeps `dyn`), the trade analysis (the box was deadweight, not part of the dyn trade — and the monomorphized `Reader<T, B>` lane declined *with data*, §7), the B0–B3 numbers, and the one-command reproduction path with links to the PR and this doc. No claims beyond measured scope. Declining an optimization on the basis of measurements is itself the credibility move for that audience — it belongs in the post, not just the doc.

From ce10d9184c9ee07df052454ab7490bed17ea7a71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Sat, 20 Jun 2026 21:42:03 +0000
Subject: [PATCH 08/16] feat(buffer): refactor EmbassyBufferReader to eliminate
 per-message allocations

---
 Cargo.lock                          |   4 +-
 aimdb-core/CHANGELOG.md             |   2 +-
 aimdb-embassy-adapter/src/buffer.rs | 255 +++++++---------------------
 3 files changed, 64 insertions(+), 197 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2d8ffc3..eb3d5d1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3390,11 +3390,11 @@ dependencies = [
 [[package]]
 name = "stm32-metapac"
 version = "21.0.0"
-source = "git+https://github.com/embassy-rs/stm32-data-generated?tag=stm32-data-be62608f8f93a21fe76c8f70c0fa9d30c9ab2503#46619beefd7015dffaa1f8e756f614718f0dd0bf"
+source = "git+https://github.com/embassy-rs/stm32-data-generated?tag=stm32-data-98c747c5a5eb2fe1bfa452d6375445a1c3c51628#ff0350aebb88f1498cbf5800305de327df02012c"
 dependencies = [
  "cortex-m",
  "cortex-m-rt",
- "defmt 0.3.100",
+ "defmt 1.0.1",
 ]
 
 [[package]]
diff --git a/aimdb-core/CHANGELOG.md b/aimdb-core/CHANGELOG.md
index fce4fca..1f6e504 100644
--- a/aimdb-core/CHANGELOG.md
+++ b/aimdb-core/CHANGELOG.md
@@ -19,7 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - **New consumer handles:** `buffer::Reader<T>` (and `buffer::JsonReader` under `remote-access`) wrap the erased reader and expose an `async fn recv()` implemented once via `core::future::poll_fn` — `core`-only, `no_std`-clean, zero-allocation, no `unsafe`. `Consumer::subscribe`, `TypedRecord::subscribe`, and `AimDb::subscribe` now return `Reader<T>` instead of `Box<dyn BufferReader<T> + Send>`.
   - **Source-compatible for consumers:** `subscribe().recv().await` is unchanged at every call site; examples and aimdb-pro compile without edits. Holders of a concrete adapter reader wrap it once: `Reader::new(Box::new(reader))`.
   - **Connector SPI unchanged (BYOC-stable, design 039 §2):** `SerializedReader::recv` keeps its boxed `RecvSerializedFuture`; only the *inner* per-message box is eliminated. The remote-access JSON path drops both its boxes (`poll_recv_json` + `JsonReader`).
-  - **Result:** **0 AimDB-added heap allocations per message** on the in-process consume path, enforced by the `aimdb-bench` B0 suite (1 → 0 allocs/msg across all three tokio buffer profiles). Adapters that expose no public poll API (tokio `broadcast`/`watch`, embassy `pubsub`/`watch`) round-trip their receiver through a single reused boxed future (`tokio_util::sync::ReusableBoxFuture` on tokio; a `no_std` equivalent on embassy) — one allocation per subscriber lifetime, not per message. The tokio Mailbox replaces `Notify` with an explicit waker list beside the slot.
+  - **Result:** **0 AimDB-added heap allocations per message** on the in-process consume path, enforced by the `aimdb-bench` B0 suite (1 → 0 allocs/msg across all three tokio buffer profiles). Tokio `broadcast`/`watch` expose no public poll API, so the reader round-trips its receiver through a single reused `tokio_util::sync::ReusableBoxFuture` — one allocation per subscriber lifetime, not per message — and the Mailbox replaces `Notify` with an explicit waker list beside the slot. Embassy drives embassy-sync's public poll methods directly (`Subscriber::poll_next_message`, `watch::Receiver::poll_changed`, `Channel::poll_receive`) — no future box and **no new `unsafe`**; `poll_next_message`/`poll_changed` were added to the vendored `embassy-sync` as small additive wrappers (upstream PR pending).
 
 - **Design 036 W1 — data-plane de-`Any`: the per-message `Box<dyn Any>` is gone from the connector SPI ([design doc §W1](../docs/design/036-followup-refactoring.md)).** Both ends of every erased hop were typed — `T` is known in the registrar where routes are wired, and the connector spine only wants bytes — so the typed pipeline is now built inside closures at registration time (`finish()`) and the SPI exposes only the wire level. The full break inventory:
   - **Inbound:** new `IngestFn = Arc<dyn Fn(&RuntimeContext, &[u8]) -> Result<(), String>>` + `IngestFactoryFn` replace deserializer + producer: deserialize + produce in one typed closure, **synchronous** (`Producer::produce` is sync + infallible per design 029 — the per-message `Box::pin` disappears along with the `Box<dyn Any>`). Deleted: `ProducerTrait`/`produce_any`, `ProducerFactoryFn`, `DeserializerFn`/`ContextDeserializerFn`/`DeserializerKind`, `TypedRecord::create_producer_trait`. `InboundConnectorLink` is `{ url, config, ingest_factory, topic_resolver }` (factory non-optional — `finish()` validates the deserializer before registering, error strings unchanged); `collect_inbound_routes` returns `Vec<(String, IngestFn)>`; `Route` is `{ resource_id, ingest }`.
diff --git a/aimdb-embassy-adapter/src/buffer.rs b/aimdb-embassy-adapter/src/buffer.rs
index b720480..d674c38 100644
--- a/aimdb-embassy-adapter/src/buffer.rs
+++ b/aimdb-embassy-adapter/src/buffer.rs
@@ -43,10 +43,6 @@ extern crate alloc;
 use alloc::boxed::Box;
 use alloc::string::String;
 use alloc::sync::Arc;
-use core::alloc::Layout;
-use core::future::Future;
-use core::pin::Pin;
-use core::ptr::{self, NonNull};
 use core::task::{Context, Poll};
 
 use aimdb_core::buffer::{Buffer, BufferCfg, BufferReader};
@@ -218,8 +214,8 @@ impl<
         // Clone the Arc for the reader
         EmbassyBufferReader {
             buffer: Arc::clone(&self.inner),
-            watch_recv: None, // Lazily initialized on first poll for Watch buffers
-            spmc_recv: None,  // Lazily initialized on first poll for SpmcRing buffers
+            watch_receiver: None, // Lazily initialized on first poll for Watch buffers
+            spmc_subscriber: None, // Lazily initialized on first poll for SpmcRing buffers
             #[cfg(feature = "metrics")]
             metrics: Arc::clone(&self.metrics),
         }
@@ -386,8 +382,16 @@ impl<
 }
 
 // ============================================================================
-// Zero-allocation poll plumbing (design 037 / W8)
+// Poll plumbing (design 037 / W8)
 // ============================================================================
+//
+// `poll_recv` drives embassy-sync's *public* poll methods directly —
+// `Subscriber::poll_next_message`, `Receiver::poll_changed`, and
+// `Channel::poll_receive` — so there is zero allocation per message and no
+// per-message future box. The reader stores the subscriber/receiver across
+// calls (lazily created on first poll); `try_recv` uses the matching
+// `try_*` methods. No `unsafe` beyond the pre-existing `'static` borrow
+// extension in the `make_*` helpers (the `Arc` keeps the primitive alive).
 
 /// Persistent SpmcRing subscriber with a lifetime extended to `'static` (the
 /// owning `Arc<EmbassyBufferInner>` keeps the channel alive for the reader).
@@ -397,34 +401,6 @@ type SpmcSub<T, const CAP: usize, const SUBS: usize, const PUBS: usize> =
 /// Persistent Watch receiver, `'static` for the same reason as [`SpmcSub`].
 type WatchRx<T, const WATCH_N: usize> = WatchReceiver<'static, CriticalSectionRawMutex, T, WATCH_N>;
 
-/// Output of the SpmcRing reusable future: the `next_message()` result paired
-/// with the subscriber handed back so the next future can reuse it.
-type SpmcRecvOutput<T, const CAP: usize, const SUBS: usize, const PUBS: usize> =
-    (WaitResult<T>, SpmcSub<T, CAP, SUBS, PUBS>);
-
-/// Output of the Watch reusable future: the changed value plus the receiver.
-type WatchRecvOutput<T, const WATCH_N: usize> = (T, WatchRx<T, WATCH_N>);
-
-/// Await the next SpmcRing message, returning the subscriber for reuse.
-///
-/// Uses the existing async `next_message()` so lag (`WaitResult::Lagged`) is
-/// surfaced exactly as before; the subscriber is round-tripped through the
-/// future so the reader can store it without self-referencing.
-async fn spmc_recv_fut<T: Clone, const CAP: usize, const SUBS: usize, const PUBS: usize>(
-    mut sub: SpmcSub<T, CAP, SUBS, PUBS>,
-) -> SpmcRecvOutput<T, CAP, SUBS, PUBS> {
-    let result = sub.next_message().await;
-    (result, sub)
-}
-
-/// Await the next Watch change, returning the receiver for reuse.
-async fn watch_recv_fut<T: Clone, const WATCH_N: usize>(
-    mut rx: WatchRx<T, WATCH_N>,
-) -> WatchRecvOutput<T, WATCH_N> {
-    let value = rx.changed().await;
-    (value, rx)
-}
-
 /// Create the persistent SpmcRing subscriber, extending its borrow to `'static`.
 ///
 /// SAFETY: the `Arc<EmbassyBufferInner>` in the reader keeps the `PubSubChannel`
@@ -466,92 +442,14 @@ fn make_watch_rx<T: Clone + Send + 'static, const WATCH_N: usize>(
     })
 }
 
-/// Minimal `no_std` port of `tokio_util::sync::ReusableBoxFuture`.
-///
-/// Stores a boxed future and, on [`set`](ReusableBoxFuture::set), reuses the
-/// existing heap allocation whenever the replacement future has the same layout
-/// — which it always does here, since each reader stores exactly one concrete
-/// async-fn future type. That is what makes the embassy consume path
-/// zero-allocation per message (design 037 / W8): the single box is allocated
-/// once per subscriber and reused for every message.
-///
-/// `unsafe` is confined to this type and mirrors the upstream tokio-util impl.
-struct ReusableBoxFuture<O> {
-    boxed: NonNull<dyn Future<Output = O> + Send + 'static>,
-}
-
-// SAFETY: the boxed future is owned exclusively and is itself `Send`.
-unsafe impl<O> Send for ReusableBoxFuture<O> {}
-
-impl<O> ReusableBoxFuture<O> {
-    fn new<F>(future: F) -> Self
-    where
-        F: Future<Output = O> + Send + 'static,
-    {
-        let boxed: Box<dyn Future<Output = O> + Send + 'static> = Box::new(future);
-        // SAFETY: `Box::into_raw` never yields a null pointer.
-        Self {
-            boxed: unsafe { NonNull::new_unchecked(Box::into_raw(boxed)) },
-        }
-    }
-
-    /// Replace the stored future, reusing the allocation when layouts match
-    /// (the common path) and reallocating only as a fallback.
-    fn set<F>(&mut self, future: F)
-    where
-        F: Future<Output = O> + Send + 'static,
-    {
-        if let Err(future) = self.try_set(future) {
-            *self = Self::new(future);
-        }
-    }
-
-    fn try_set<F>(&mut self, future: F) -> Result<(), F>
-    where
-        F: Future<Output = O> + Send + 'static,
-    {
-        // SAFETY: `self.boxed` is always a valid, owned boxed future.
-        let existing_layout = Layout::for_value(unsafe { self.boxed.as_ref() });
-        if Layout::new::<F>() != existing_layout {
-            return Err(future);
-        }
-        // SAFETY: same layout — drop the completed future in place, write the
-        // new one into the same allocation, and rebuild the fat pointer with
-        // `F`'s vtable. The previous future has already resolved (we only `set`
-        // after a poll returned `Ready`), so the pin contract is upheld.
-        unsafe {
-            let ptr = self.boxed.as_ptr();
-            ptr::drop_in_place(ptr);
-            let ptr = ptr as *mut F;
-            ptr::write(ptr, future);
-            self.boxed = NonNull::new_unchecked(ptr as *mut (dyn Future<Output = O> + Send));
-        }
-        Ok(())
-    }
-
-    fn poll(&mut self, cx: &mut Context<'_>) -> Poll<O> {
-        // SAFETY: the future lives behind a stable heap allocation and is never
-        // moved while borrowed; `&mut self` guarantees unique access.
-        let fut = unsafe { Pin::new_unchecked(self.boxed.as_mut()) };
-        fut.poll(cx)
-    }
-}
-
-impl<O> Drop for ReusableBoxFuture<O> {
-    fn drop(&mut self) {
-        // SAFETY: reconstruct and drop the owning `Box`.
-        unsafe {
-            drop(Box::from_raw(self.boxed.as_ptr()));
-        }
-    }
-}
-
 /// Reader for Embassy buffers
 ///
-/// Holds persistent subscription state for each buffer type, driven through a
-/// reused boxed future so `poll_recv` allocates nothing per message (W8).
-/// For Watch buffers the future owns a persistent Receiver (tracks which value
-/// has been seen); for SpmcRing a persistent Subscriber (cursor continuity).
+/// Holds persistent subscription state for each buffer type and drives it
+/// through embassy-sync's public poll methods, so `poll_recv` allocates nothing
+/// per message and stores no future box (design 037 / W8). For Watch a
+/// persistent Receiver tracks which value has been seen; for SpmcRing a
+/// persistent Subscriber keeps cursor continuity. Both are lazily created on the
+/// first poll.
 pub struct EmbassyBufferReader<
     T: Clone + Send + 'static,
     const CAP: usize,
@@ -560,11 +458,10 @@ pub struct EmbassyBufferReader<
     const WATCH_N: usize,
 > {
     buffer: Arc<EmbassyBufferInner<T, CAP, SUBS, PUBS, WATCH_N>>,
-    /// Reusable future round-tripping the persistent Watch receiver. Lazily
-    /// created on first poll (the receiver borrows the Arc-kept-alive Watch).
-    watch_recv: Option<ReusableBoxFuture<WatchRecvOutput<T, WATCH_N>>>,
-    /// Reusable future round-tripping the persistent SpmcRing subscriber.
-    spmc_recv: Option<ReusableBoxFuture<SpmcRecvOutput<T, CAP, SUBS, PUBS>>>,
+    /// Persistent Watch receiver, lazily created on first poll.
+    watch_receiver: Option<WatchRx<T, WATCH_N>>,
+    /// Persistent SpmcRing subscriber, lazily created on first poll.
+    spmc_subscriber: Option<SpmcSub<T, CAP, SUBS, PUBS>>,
     /// Shared counter state (cloned from the parent buffer at subscribe time).
     #[cfg(feature = "metrics")]
     metrics: Arc<BufferCounters>,
@@ -581,52 +478,41 @@ impl<
     fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>> {
         match &*self.buffer {
             EmbassyBufferInner::SpmcRing(channel) => {
-                // Lazily create the reusable future (allocates its single box
-                // once; reused for every message thereafter).
-                if self.spmc_recv.is_none() {
+                // Lazily create the persistent subscriber, then poll it directly
+                // via embassy-sync's public `poll_next_message` (no future box,
+                // no allocation per message; lag preserved).
+                if self.spmc_subscriber.is_none() {
                     match make_spmc_sub(channel) {
-                        Ok(sub) => {
-                            self.spmc_recv = Some(ReusableBoxFuture::new(spmc_recv_fut(sub)))
-                        }
+                        Ok(sub) => self.spmc_subscriber = Some(sub),
                         Err(e) => return Poll::Ready(Err(e)),
                     }
                 }
-                let recv = self.spmc_recv.as_mut().unwrap();
-                match recv.poll(cx) {
-                    Poll::Ready((result, sub)) => {
-                        recv.set(spmc_recv_fut(sub));
-                        match result {
-                            WaitResult::Message(value) => {
-                                #[cfg(feature = "metrics")]
-                                self.metrics.increment_consumed();
-                                Poll::Ready(Ok(value))
-                            }
-                            WaitResult::Lagged(n) => {
-                                #[cfg(feature = "metrics")]
-                                self.metrics.add_dropped(n);
-                                Poll::Ready(Err(DbError::BufferLagged {
-                                    lag_count: n,
-                                    buffer_name: String::from("embassy spmc ring"),
-                                }))
-                            }
-                        }
+                match self.spmc_subscriber.as_mut().unwrap().poll_next_message(cx) {
+                    Poll::Ready(WaitResult::Message(value)) => {
+                        #[cfg(feature = "metrics")]
+                        self.metrics.increment_consumed();
+                        Poll::Ready(Ok(value))
+                    }
+                    Poll::Ready(WaitResult::Lagged(n)) => {
+                        #[cfg(feature = "metrics")]
+                        self.metrics.add_dropped(n);
+                        Poll::Ready(Err(DbError::BufferLagged {
+                            lag_count: n,
+                            buffer_name: String::from("embassy spmc ring"),
+                        }))
                     }
                     Poll::Pending => Poll::Pending,
                 }
             }
             EmbassyBufferInner::Watch(watch) => {
-                if self.watch_recv.is_none() {
+                if self.watch_receiver.is_none() {
                     match make_watch_rx(watch) {
-                        Ok(rx) => {
-                            self.watch_recv = Some(ReusableBoxFuture::new(watch_recv_fut(rx)))
-                        }
+                        Ok(rx) => self.watch_receiver = Some(rx),
                         Err(e) => return Poll::Ready(Err(e)),
                     }
                 }
-                let recv = self.watch_recv.as_mut().unwrap();
-                match recv.poll(cx) {
-                    Poll::Ready((value, rx)) => {
-                        recv.set(watch_recv_fut(rx));
+                match self.watch_receiver.as_mut().unwrap().poll_changed(cx) {
+                    Poll::Ready(value) => {
                         #[cfg(feature = "metrics")]
                         self.metrics.increment_consumed();
                         Poll::Ready(Ok(value))
@@ -646,55 +532,41 @@ impl<
     }
 
     fn try_recv(&mut self) -> Result<T, DbError> {
-        // `broadcast`/watch state lives inside the reusable future; poll it with
-        // a no-op waker for non-blocking semantics (mirrors the tokio adapter).
         match &*self.buffer {
             EmbassyBufferInner::SpmcRing(channel) => {
-                if self.spmc_recv.is_none() {
-                    self.spmc_recv = Some(ReusableBoxFuture::new(spmc_recv_fut(make_spmc_sub(
-                        channel,
-                    )?)));
+                if self.spmc_subscriber.is_none() {
+                    self.spmc_subscriber = Some(make_spmc_sub(channel)?);
                 }
-                let recv = self.spmc_recv.as_mut().unwrap();
-                let mut cx = Context::from_waker(core::task::Waker::noop());
-                match recv.poll(&mut cx) {
-                    Poll::Ready((result, sub)) => {
-                        recv.set(spmc_recv_fut(sub));
-                        match result {
-                            WaitResult::Message(value) => {
-                                #[cfg(feature = "metrics")]
-                                self.metrics.increment_consumed();
-                                Ok(value)
-                            }
-                            WaitResult::Lagged(n) => {
-                                #[cfg(feature = "metrics")]
-                                self.metrics.add_dropped(n);
-                                Err(DbError::BufferLagged {
-                                    lag_count: n,
-                                    buffer_name: String::from("embassy spmc ring"),
-                                })
-                            }
-                        }
+                match self.spmc_subscriber.as_mut().unwrap().try_next_message() {
+                    Some(WaitResult::Message(value)) => {
+                        #[cfg(feature = "metrics")]
+                        self.metrics.increment_consumed();
+                        Ok(value)
                     }
-                    Poll::Pending => Err(DbError::BufferEmpty),
+                    Some(WaitResult::Lagged(n)) => {
+                        // Report lag exactly like `poll_recv` does; the `_pure`
+                        // variant would skip it silently and lose the drop count.
+                        #[cfg(feature = "metrics")]
+                        self.metrics.add_dropped(n);
+                        Err(DbError::BufferLagged {
+                            lag_count: n,
+                            buffer_name: String::from("embassy spmc ring"),
+                        })
+                    }
+                    None => Err(DbError::BufferEmpty),
                 }
             }
             EmbassyBufferInner::Watch(watch) => {
-                if self.watch_recv.is_none() {
-                    self.watch_recv = Some(ReusableBoxFuture::new(watch_recv_fut(make_watch_rx(
-                        watch,
-                    )?)));
+                if self.watch_receiver.is_none() {
+                    self.watch_receiver = Some(make_watch_rx(watch)?);
                 }
-                let recv = self.watch_recv.as_mut().unwrap();
-                let mut cx = Context::from_waker(core::task::Waker::noop());
-                match recv.poll(&mut cx) {
-                    Poll::Ready((value, rx)) => {
-                        recv.set(watch_recv_fut(rx));
+                match self.watch_receiver.as_mut().unwrap().try_changed() {
+                    Some(value) => {
                         #[cfg(feature = "metrics")]
                         self.metrics.increment_consumed();
                         Ok(value)
                     }
-                    Poll::Pending => Err(DbError::BufferEmpty),
+                    None => Err(DbError::BufferEmpty),
                 }
             }
             EmbassyBufferInner::Mailbox(channel) => match channel.try_receive() {

From f53e979bd66b230a4599e0bf4b4cca4a2f77fa81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Sat, 20 Jun 2026 21:59:33 +0000
Subject: [PATCH 09/16] feat(profiling): add pending_since to track consumer
 processing time in ProfilingBufferReader

---
 aimdb-core/src/profiling/mod.rs | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/aimdb-core/src/profiling/mod.rs b/aimdb-core/src/profiling/mod.rs
index a3d4dfc..ca2f6da 100644
--- a/aimdb-core/src/profiling/mod.rs
+++ b/aimdb-core/src/profiling/mod.rs
@@ -84,6 +84,11 @@ pub(crate) struct ProfilingBufferReader<T: Clone + Send> {
     clock: Clock,
     /// Wall-clock (ns) at which the last value was handed to the consumer.
     last_yield_ns: Option<u64>,
+    /// Wall-clock (ns) of the first poll of the current recv cycle — the moment
+    /// the consumer asked for the next value. Memoized across re-polls so a
+    /// `Pending` wait for the producer is not counted as consumer processing
+    /// time; cleared when the cycle completes (see `poll_recv`).
+    pending_since: Option<u64>,
 }
 
 impl<T: Clone + Send> ProfilingBufferReader<T> {
@@ -97,6 +102,7 @@ impl<T: Clone + Send> ProfilingBufferReader<T> {
             metrics,
             clock,
             last_yield_ns: None,
+            pending_since: None,
         }
     }
 
@@ -111,13 +117,22 @@ impl<T: Clone + Send> ProfilingBufferReader<T> {
 impl<T: Clone + Send> BufferReader<T> for ProfilingBufferReader<T> {
     fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<Result<T, DbError>> {
         // `started_ns` ≈ the moment the consumer finished processing the
-        // previous value and asked for the next one. Sampled per poll; only the
-        // poll that yields a value records the interval, matching the prior
-        // await-based behavior.
-        let started_ns = (self.clock)();
+        // previous value and asked for the next one — i.e. the *first* poll of
+        // this recv cycle. Memoized in `pending_since` so re-polls after a
+        // `Pending` (waiting on the producer) reuse it instead of resampling;
+        // this keeps the recorded interval equal to consumer processing time and
+        // matches the prior await-based `recv()`, which captured `started_ns`
+        // once when the future was first polled. (Clock is read once per cycle,
+        // not once per poll.)
+        let started_ns = *self.pending_since.get_or_insert_with(|| (self.clock)());
         let result = self.inner.poll_recv(cx);
-        if let Poll::Ready(Ok(_)) = &result {
-            self.on_yield(started_ns);
+        if result.is_ready() {
+            // The recv "future" completed (Ok or Err) — close out the cycle so
+            // the next ask resamples the clock.
+            self.pending_since = None;
+            if matches!(result, Poll::Ready(Ok(_))) {
+                self.on_yield(started_ns);
+            }
         }
         result
     }

From 2ff39840575effe2ed0b3dcf98df2b8e8a68ad95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Sun, 21 Jun 2026 09:27:25 +0000
Subject: [PATCH 10/16] feat: add Embassy adapter benchmarks for B2 throughput

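- Implemented `b0_alloc_embassy.rs`, `b1_latency_embassy.rs`, and `b2_throughput_embassy.rs` to measure per-message allocations, push-to-recv latency, and steady-state throughput using the Embassy buffer backend.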
- Added baseline data for allocation metrics in `b0_alloc_embassy.json`.
- Created cycle profiling baseline for STM32H563ZI in `b3_cycles_stm32h5.json`.
- Updated `lib.rs` to include new Embassy benchmarks and profiles.
- Introduced `profiles_embassy.rs` for buffer constructors tailored for the Embassy adapter.
- Set up STM32H563ZI example with necessary configurations and dependencies in `Cargo.toml`.
- Added README documentation for the STM32H563ZI example, detailing usage and results.
- Implemented a build script for linking and memory configuration in `build.rs`.
- Created a flash script for easy deployment to the STM32H563ZI board.
- Established a Rust toolchain file for consistent development environment.
- Developed main benchmarking logic in `main.rs` to measure cycles and allocations for various buffer profiles.
---
 Cargo.lock                                    |  25 ++
 Cargo.toml                                    |   1 +
 Makefile                                      |   6 +-
 aimdb-bench/Cargo.toml                        |  30 ++
 aimdb-bench/README.md                         |  23 +-
 aimdb-bench/benches/b0_alloc_embassy.rs       | 132 +++++++
 aimdb-bench/benches/b1_latency_embassy.rs     | 132 +++++++
 aimdb-bench/benches/b2_throughput_embassy.rs  | 194 ++++++++++
 .../data/baselines/b0_alloc_embassy.json      |  29 ++
 .../data/baselines/b3_cycles_stm32h5.json     |  45 +++
 aimdb-bench/src/lib.rs                        |  22 +-
 aimdb-bench/src/profiles_embassy.rs           |  79 ++++
 .../embassy-bench-stm32h5/.cargo/config.toml  |   8 +
 examples/embassy-bench-stm32h5/.gitignore     |   5 +
 examples/embassy-bench-stm32h5/Cargo.toml     |  62 ++++
 examples/embassy-bench-stm32h5/README.md      | 113 ++++++
 examples/embassy-bench-stm32h5/build.rs       |   5 +
 examples/embassy-bench-stm32h5/flash.sh       |  36 ++
 .../embassy-bench-stm32h5/rust-toolchain.toml |   4 +
 examples/embassy-bench-stm32h5/src/main.rs    | 346 ++++++++++++++++++
 20 files changed, 1287 insertions(+), 10 deletions(-)
 create mode 100644 aimdb-bench/benches/b0_alloc_embassy.rs
 create mode 100644 aimdb-bench/benches/b1_latency_embassy.rs
 create mode 100644 aimdb-bench/benches/b2_throughput_embassy.rs
 create mode 100644 aimdb-bench/data/baselines/b0_alloc_embassy.json
 create mode 100644 aimdb-bench/data/baselines/b3_cycles_stm32h5.json
 create mode 100644 aimdb-bench/src/profiles_embassy.rs
 create mode 100644 examples/embassy-bench-stm32h5/.cargo/config.toml
 create mode 100644 examples/embassy-bench-stm32h5/.gitignore
 create mode 100644 examples/embassy-bench-stm32h5/Cargo.toml
 create mode 100644 examples/embassy-bench-stm32h5/README.md
 create mode 100644 examples/embassy-bench-stm32h5/build.rs
 create mode 100755 examples/embassy-bench-stm32h5/flash.sh
 create mode 100644 examples/embassy-bench-stm32h5/rust-toolchain.toml
 create mode 100644 examples/embassy-bench-stm32h5/src/main.rs

diff --git a/Cargo.lock b/Cargo.lock
index eb3d5d1..41910f1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -28,8 +28,13 @@ name = "aimdb-bench"
 version = "0.1.0"
 dependencies = [
  "aimdb-core",
+ "aimdb-embassy-adapter",
  "aimdb-tokio-adapter",
  "criterion",
+ "critical-section",
+ "defmt 1.0.1",
+ "embassy-time-driver",
+ "futures",
  "serde",
  "serde_json",
  "tokio",
@@ -1115,6 +1120,26 @@ version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
 
+[[package]]
+name = "embassy-bench-stm32h5"
+version = "0.1.0"
+dependencies = [
+ "aimdb-core",
+ "aimdb-embassy-adapter",
+ "cortex-m",
+ "cortex-m-rt",
+ "critical-section",
+ "defmt 1.0.1",
+ "defmt-rtt",
+ "embassy-executor",
+ "embassy-futures",
+ "embassy-stm32",
+ "embassy-sync",
+ "embassy-time",
+ "embedded-alloc",
+ "panic-probe",
+]
+
 [[package]]
 name = "embassy-embedded-hal"
 version = "0.6.0"
diff --git a/Cargo.toml b/Cargo.toml
index 12faba1..f3b5ef5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -27,6 +27,7 @@ members = [
     "examples/embassy-mqtt-connector-demo",
     "examples/embassy-knx-connector-demo",
     "examples/embassy-serial-connector-demo",
+    "examples/embassy-bench-stm32h5",
     "examples/sync-api-demo",
     "examples/remote-access-demo",
     "examples/weather-mesh-demo/weather-mesh-common",
diff --git a/Makefile b/Makefile
index 3ba668a..eeb97b7 100644
--- a/Makefile
+++ b/Makefile
@@ -178,7 +178,7 @@ test:
 
 fmt:
 	@printf "$(GREEN)Formatting code (workspace members only)...$(NC)\n"
-	@for pkg in aimdb-executor aimdb-derive aimdb-data-contracts aimdb-core aimdb-client aimdb-embassy-adapter aimdb-tokio-adapter aimdb-wasm-adapter aimdb-sync aimdb-persistence aimdb-persistence-sqlite aimdb-mqtt-connector aimdb-knx-connector aimdb-ws-protocol aimdb-websocket-connector aimdb-uds-connector aimdb-serial-connector aimdb-codegen aimdb-cli aimdb-mcp sync-api-demo tokio-mqtt-connector-demo embassy-mqtt-connector-demo tokio-knx-connector-demo embassy-knx-connector-demo embassy-serial-connector-demo weather-mesh-common weather-hub weather-station-alpha weather-station-beta hello-mailbox hello-single-latest-async aimdb-bench; do \
+	@for pkg in aimdb-executor aimdb-derive aimdb-data-contracts aimdb-core aimdb-client aimdb-embassy-adapter aimdb-tokio-adapter aimdb-wasm-adapter aimdb-sync aimdb-persistence aimdb-persistence-sqlite aimdb-mqtt-connector aimdb-knx-connector aimdb-ws-protocol aimdb-websocket-connector aimdb-uds-connector aimdb-serial-connector aimdb-codegen aimdb-cli aimdb-mcp sync-api-demo tokio-mqtt-connector-demo embassy-mqtt-connector-demo tokio-knx-connector-demo embassy-knx-connector-demo embassy-serial-connector-demo embassy-bench-stm32h5 weather-mesh-common weather-hub weather-station-alpha weather-station-beta hello-mailbox hello-single-latest-async aimdb-bench; do \
 		printf "$(YELLOW)  → Formatting $$pkg$(NC)\n"; \
 		cargo fmt -p $$pkg 2>/dev/null || true; \
 	done
@@ -187,7 +187,7 @@ fmt:
 fmt-check:
 	@printf "$(GREEN)Checking code formatting (workspace members only)...$(NC)\n"
 	@FAILED=0; \
-	for pkg in aimdb-executor aimdb-derive aimdb-data-contracts aimdb-core aimdb-client aimdb-embassy-adapter aimdb-tokio-adapter aimdb-wasm-adapter aimdb-sync aimdb-persistence aimdb-persistence-sqlite aimdb-mqtt-connector aimdb-knx-connector aimdb-ws-protocol aimdb-websocket-connector aimdb-uds-connector aimdb-serial-connector aimdb-codegen aimdb-cli aimdb-mcp sync-api-demo tokio-mqtt-connector-demo embassy-mqtt-connector-demo tokio-knx-connector-demo embassy-knx-connector-demo embassy-serial-connector-demo weather-mesh-common weather-hub weather-station-alpha weather-station-beta hello-mailbox hello-single-latest-async aimdb-bench; do \
+	for pkg in aimdb-executor aimdb-derive aimdb-data-contracts aimdb-core aimdb-client aimdb-embassy-adapter aimdb-tokio-adapter aimdb-wasm-adapter aimdb-sync aimdb-persistence aimdb-persistence-sqlite aimdb-mqtt-connector aimdb-knx-connector aimdb-ws-protocol aimdb-websocket-connector aimdb-uds-connector aimdb-serial-connector aimdb-codegen aimdb-cli aimdb-mcp sync-api-demo tokio-mqtt-connector-demo embassy-mqtt-connector-demo tokio-knx-connector-demo embassy-knx-connector-demo embassy-serial-connector-demo embassy-bench-stm32h5 weather-mesh-common weather-hub weather-station-alpha weather-station-beta hello-mailbox hello-single-latest-async aimdb-bench; do \
 		printf "$(YELLOW)  → Checking $$pkg$(NC)\n"; \
 		if ! cargo fmt -p $$pkg -- --check 2>&1; then \
 			printf "$(RED)❌ Formatting check failed for $$pkg$(NC)\n"; \
@@ -376,6 +376,8 @@ examples:
 	cargo build --package embassy-knx-connector-demo --target thumbv7em-none-eabihf
 	@printf "$(YELLOW)  → Building embassy-serial-connector-demo (embedded, embassy runtime)$(NC)\n"
 	cargo build --package embassy-serial-connector-demo --target thumbv7em-none-eabihf
+	@printf "$(YELLOW)  → Building embassy-bench-stm32h5 (B3 on-target profiling, embassy runtime)$(NC)\n"
+	cargo build --package embassy-bench-stm32h5 --target thumbv7em-none-eabihf
 	@printf "$(YELLOW)  → Building weather-mesh-demo: weather-mesh-common$(NC)\n"
 	cargo build --package weather-mesh-common
 	@printf "$(YELLOW)  → Building weather-mesh-demo: weather-hub (cloud aggregator)$(NC)\n"
diff --git a/aimdb-bench/Cargo.toml b/aimdb-bench/Cargo.toml
index 00a5fc7..619ff96 100644
--- a/aimdb-bench/Cargo.toml
+++ b/aimdb-bench/Cargo.toml
@@ -29,6 +29,18 @@ harness = false
 name = "b_alloc_pipeline"
 harness = false
 
+[[bench]]
+name = "b0_alloc_embassy"
+harness = false
+
+[[bench]]
+name = "b1_latency_embassy"
+harness = false
+
+[[bench]]
+name = "b2_throughput_embassy"
+harness = false
+
 [dependencies]
 # Core AimDB types
 aimdb-core = { path = "../aimdb-core", features = ["std"] }
@@ -38,6 +50,12 @@ aimdb-tokio-adapter = { path = "../aimdb-tokio-adapter", features = [
     "tokio-runtime",
 ] }
 
+aimdb-embassy-adapter = { path = "../aimdb-embassy-adapter", default-features = false, features = [
+    "alloc",
+    "embassy-sync",
+    "embassy-time",
+] }
+
 # Async runtime — current-thread executor is used for noise reduction in B0
 tokio = { workspace = true }
 
@@ -50,3 +68,15 @@ serde_json = { workspace = true }
 criterion = { version = "0.5", default-features = false, features = [
     "cargo_bench_support",
 ] }
+
+# Host driver for the Embassy buffer futures (matches the adapter's own host
+# tests, which also use `futures::executor::block_on`).
+futures = "0.3"
+
+# `critical-section` host impl for embassy-sync's `CriticalSectionRawMutex`.
+# Linked only into the embassy bench binaries; the lib rlib does not need it.
+critical-section = { version = "1.1", features = ["std"] }
+
+# defmt logger / panic-handler + embassy-time driver stubs for the host bench
+defmt = { workspace = true }
+embassy-time-driver = { path = "../_external/embassy/embassy-time-driver" }
diff --git a/aimdb-bench/README.md b/aimdb-bench/README.md
index 2c57eb2..6fde577 100644
--- a/aimdb-bench/README.md
+++ b/aimdb-bench/README.md
@@ -12,7 +12,19 @@ Measures three classes of performance across three canonical workload profiles:
 
 Plus two informational benches that exercise the full runner-driven pipeline.
 
-**Adapters covered:** Tokio only. Embassy is a planned follow-up once it can be exercised through host-test stubs without pulling in `embassy-runtime`.
+**Adapters covered:**
+
+- **Tokio** — `b0_alloc_tokio`, `b1_latency`, `b2_throughput` (host).
+- **Embassy** — `b0_alloc_embassy`, `b1_latency_embassy`, `b2_throughput_embassy`
+  (host). These drive the real [`EmbassyBuffer`] backend via
+  `futures::executor::block_on` over embassy-sync's poll methods — no
+  `embassy-runtime`, no cortex-m executor, no hardware. The buffer constructors
+  live in [`profiles_embassy`](src/profiles_embassy.rs).
+- **Embassy on-target (B3)** — cycle-accurate per-message profiling (`DWT`
+  `CYCCNT`) on an STM32H563ZI lives in a separate hardware-only crate,
+  [`examples/embassy-bench-stm32h5`](../examples/embassy-bench-stm32h5), because
+  it cannot run on a host. It also re-validates 0 allocs/msg against the real
+  embedded allocator.
 
 ---
 
@@ -42,6 +54,11 @@ cargo bench -p aimdb-bench --bench b1_latency
 # B2 — throughput (Criterion)
 cargo bench -p aimdb-bench --bench b2_throughput
 
+# Embassy buffer backend (host) — same three classes
+cargo bench -p aimdb-bench --bench b0_alloc_embassy
+cargo bench -p aimdb-bench --bench b1_latency_embassy
+cargo bench -p aimdb-bench --bench b2_throughput_embassy
+
 # Informational: allocation count through the runner pipeline
 cargo bench -p aimdb-bench --bench b_alloc_pipeline
 
@@ -83,6 +100,10 @@ The committed baseline lives in `data/baselines/b0_alloc_tokio.json`. When a cha
 
 > **W8 result (design 037).** Since the zero-allocation consume path landed, the baseline records **0 allocs/msg** across all three tokio profiles (down from 1 — the boxed `recv()` future is gone). The committed baseline is therefore the target value; any nonzero B0 on these profiles is a regression to investigate.
 
+`b0_alloc_embassy` mirrors this against the Embassy buffer backend. It writes its results to `target/bench-results/b0_alloc_embassy.json`; the committed baseline lives in `data/baselines/b0_alloc_embassy.json` and also records **0 allocs/msg** across all three profiles, confirming the Embassy `poll_recv` path is allocation-free on the host. The on-target B3 bench (`examples/embassy-bench-stm32h5`) re-checks the same 0-alloc claim against the real embedded allocator.
+
+> **Embassy priming.** Unlike Tokio's `broadcast`, an Embassy `SpmcRing` reader registers its embassy `Subscriber` *lazily, on first poll* — a message pushed before that first poll is missed, and the next `recv()` would block forever. The embassy benches call `profiles_embassy::prime()` on each reader before the first `push` to force registration (a no-op for Watch/Mailbox readers).
+
 **Noise reduction:** a `new_current_thread()` Tokio executor is used so there are no work-stealing threads and Tokio's scheduler does not allocate per-poll in the hot path.
 
 **Production isolation:** `#[global_allocator]` is a per-binary link-time declaration. `CountingAllocator` exists only in bench binaries. Nothing in the production dependency graph is affected.
diff --git a/aimdb-bench/benches/b0_alloc_embassy.rs b/aimdb-bench/benches/b0_alloc_embassy.rs
new file mode 100644
index 0000000..4f2a924
--- /dev/null
+++ b/aimdb-bench/benches/b0_alloc_embassy.rs
@@ -0,0 +1,132 @@
+//! B0 — Allocation counting on the Embassy adapter (host-driven).
+//!
+//! The Embassy companion to [`b0_alloc_tokio`]. Measures per-message
+//! allocation cost for each workload profile against the **Embassy** buffer
+//! backend ([`EmbassyBuffer`]), driven on the host via
+//! `futures::executor::block_on` over embassy-sync's poll methods — no
+//! `embassy-runtime`, no cortex-m executor, no hardware.
+//!
+//! After the zero-allocation consume path (design 037 / W8), the Embassy
+//! `poll_recv` drives embassy-sync's public `poll_*` methods directly with no
+//! per-message future box, so the target here is the same **0 allocs/msg** as
+//! the Tokio suite. The one-time `Box::new(reader)` and the lazy subscriber
+//! registration happen during setup/warmup, before the counters are reset.
+//!
+//! **Measurement model** (identical to `b0_alloc_tokio`):
+//! 1. Create buffer + reader; **prime** the reader (forces lazy SpmcRing
+//!    subscriber registration — see [`profiles_embassy`]).
+//! 2. Warmup ≥ `WARMUP_ITERS` push → recv cycles (excluded from counters).
+//! 3. `reset()` allocation counters.
+//! 4. Run `BATCH_SIZE` push → recv cycles.
+//! 5. `snapshot()` counters; divide by `BATCH_SIZE` for per-message figures.
+//!
+//! Run:
+//! ```text
+//! cargo bench -p aimdb-bench --bench b0_alloc_embassy
+//! ```
+//!
+//! Results are written to `aimdb-bench/target/bench-results/b0_alloc_embassy.json`
+//! (anchored to the crate dir, so the path is the same regardless of CWD).
+
+// The Embassy adapter calls `defmt::*` unconditionally and links embassy-time;
+// on the host neither a logger nor a time driver exists. This expands no-op
+// stubs so the bench binary links. Must appear exactly once, at top level.
+aimdb_embassy_adapter::host_test_stubs!();
+
+use aimdb_bench::{
+    alloc::{reset, snapshot},
+    profiles::{command_msg, state_msg, telemetry_msg, BATCH_SIZE, WARMUP_ITERS},
+    profiles_embassy::{command_buffer, prime, state_buffer, telemetry_buffer},
+    reports::AllocReport,
+};
+use aimdb_core::buffer::{Buffer, Reader};
+use futures::executor::block_on;
+
+fn main() {
+    println!("=== B0 Allocation Benchmarks (Embassy adapter, buffer layer, host) ===");
+    println!("  Warmup iters : {WARMUP_ITERS}");
+    println!("  Batch size   : {BATCH_SIZE}");
+    println!();
+
+    // ── Telemetry: SpmcRing / PubSubChannel ──────────────────────────────────
+    //
+    // `prime()` is REQUIRED here: the SpmcRing subscriber is created on the
+    // reader's first poll, so without priming the first pushed message would be
+    // missed and the first `recv()` would block forever.
+    let telemetry_report = block_on(async {
+        let buf = telemetry_buffer();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+        prime(&mut reader);
+
+        for i in 0..WARMUP_ITERS {
+            buf.push(telemetry_msg(i as u64));
+            let _ = reader.recv().await;
+        }
+
+        reset();
+        for i in 0..BATCH_SIZE {
+            buf.push(telemetry_msg((WARMUP_ITERS + i) as u64));
+            let _ = reader.recv().await;
+        }
+        let (allocs, bytes) = snapshot();
+        AllocReport::new("Telemetry", "SpmcRing", BATCH_SIZE, allocs, bytes)
+    });
+    telemetry_report.print();
+
+    // ── State: SingleLatest / Watch ──────────────────────────────────────────
+    let state_report = block_on(async {
+        let buf = state_buffer();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+        prime(&mut reader);
+
+        for i in 0..WARMUP_ITERS {
+            buf.push(state_msg(i as u64));
+            let _ = reader.recv().await;
+        }
+
+        reset();
+        for i in 0..BATCH_SIZE {
+            buf.push(state_msg((WARMUP_ITERS + i) as u64));
+            let _ = reader.recv().await;
+        }
+        let (allocs, bytes) = snapshot();
+        AllocReport::new("State", "SingleLatest", BATCH_SIZE, allocs, bytes)
+    });
+    state_report.print();
+
+    // ── Command: Mailbox / Channel(capacity=1) ───────────────────────────────
+    //
+    // Tight 1:1 push → recv loop matches Mailbox semantics. Do NOT batch pushes
+    // ahead of the consumer: the single slot overwrites earlier values.
+    let command_report = block_on(async {
+        let buf = command_buffer();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+        prime(&mut reader);
+
+        for i in 0..WARMUP_ITERS {
+            buf.push(command_msg(i as u64));
+            let _ = reader.recv().await;
+        }
+
+        reset();
+        for i in 0..BATCH_SIZE {
+            buf.push(command_msg((WARMUP_ITERS + i) as u64));
+            let _ = reader.recv().await;
+        }
+        let (allocs, bytes) = snapshot();
+        AllocReport::new("Command", "Mailbox", BATCH_SIZE, allocs, bytes)
+    });
+    command_report.print();
+
+    println!();
+    println!("Target: 0 allocs/msg (W8 zero-alloc consume path, same as the Tokio B0 suite).");
+
+    // Persist results for baseline comparison.
+    let reports = vec![telemetry_report, state_report, command_report];
+    let json = serde_json::to_string_pretty(&reports).expect("failed to serialize reports");
+    let out_dir = concat!(env!("CARGO_MANIFEST_DIR"), "/target/bench-results");
+    std::fs::create_dir_all(out_dir).expect("failed to create results directory");
+    let out_path = format!("{out_dir}/b0_alloc_embassy.json");
+    std::fs::write(&out_path, &json).expect("failed to write results");
+    println!("\nResults written to {out_path}");
+}
diff --git a/aimdb-bench/benches/b1_latency_embassy.rs b/aimdb-bench/benches/b1_latency_embassy.rs
new file mode 100644
index 0000000..06971a2
--- /dev/null
+++ b/aimdb-bench/benches/b1_latency_embassy.rs
@@ -0,0 +1,132 @@
+//! B1 — Push-to-recv latency on the Embassy adapter (host-driven, Criterion).
+//!
+//! The Embassy companion to [`b1_latency`]. Measures the wall-clock latency
+//! from `buf.push(msg)` to `reader.recv()` returning, for each workload
+//! profile, against the **Embassy** buffer backend. Driven on the host via
+//! `futures::executor::block_on` over embassy-sync's poll methods — no
+//! `embassy-runtime`, no cortex-m executor, no hardware.
+//!
+//! These are host wall-clock numbers for trend tracking and Tokio-vs-Embassy
+//! comparison; they are **not** a substitute for on-target cycle counts. Real
+//! embedded latency is measured in CPU cycles by the B3 STM32H5 bench
+//! (`examples/embassy-bench-stm32h5`).
+//!
+//! **Measurement model:** `iter_custom` gives Criterion the total elapsed time
+//! for *iters* push → recv cycles (post-warmup). Each reader is **primed**
+//! before the first push so the lazy SpmcRing subscriber is registered (see
+//! [`profiles_embassy`]).
+//!
+//! Run:
+//! ```text
+//! cargo bench -p aimdb-bench --bench b1_latency_embassy
+//! cargo bench -p aimdb-bench --bench b1_latency_embassy -- --save-baseline pre-x
+//! cargo bench -p aimdb-bench --bench b1_latency_embassy -- --baseline pre-x
+//! ```
+
+aimdb_embassy_adapter::host_test_stubs!();
+
+use aimdb_bench::profiles::{command_msg, state_msg, telemetry_msg, WARMUP_ITERS};
+use aimdb_bench::profiles_embassy::{command_buffer, prime, state_buffer, telemetry_buffer};
+use aimdb_core::buffer::{Buffer, Reader};
+use criterion::{criterion_group, criterion_main, Criterion};
+use futures::executor::block_on;
+
+// ── Telemetry: SpmcRing / PubSubChannel ──────────────────────────────────────
+
+fn bench_latency_telemetry(c: &mut Criterion) {
+    let mut group = c.benchmark_group("B1-Latency-Embassy");
+
+    group.bench_function("telemetry_spsc", |b| {
+        b.iter_custom(|iters| {
+            block_on(async {
+                let buf = telemetry_buffer();
+                let mut reader = Reader::new(Box::new(buf.subscribe()));
+                prime(&mut reader);
+
+                // Warmup — not timed.
+                for i in 0..WARMUP_ITERS {
+                    buf.push(telemetry_msg(i as u64));
+                    let _ = reader.recv().await;
+                }
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(telemetry_msg((WARMUP_ITERS as u64) + i));
+                    let _ = reader.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+// ── State: SingleLatest / Watch ───────────────────────────────────────────────
+
+fn bench_latency_state(c: &mut Criterion) {
+    let mut group = c.benchmark_group("B1-Latency-Embassy");
+
+    group.bench_function("state_spsc", |b| {
+        b.iter_custom(|iters| {
+            block_on(async {
+                let buf = state_buffer();
+                let mut reader = Reader::new(Box::new(buf.subscribe()));
+                prime(&mut reader);
+
+                for i in 0..WARMUP_ITERS {
+                    buf.push(state_msg(i as u64));
+                    let _ = reader.recv().await;
+                }
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(state_msg((WARMUP_ITERS as u64) + i));
+                    let _ = reader.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+// ── Command: Mailbox / Channel(capacity=1) ────────────────────────────────────
+
+fn bench_latency_command(c: &mut Criterion) {
+    let mut group = c.benchmark_group("B1-Latency-Embassy");
+
+    // Tight 1:1 push → recv loop — matches Mailbox semantics.
+    group.bench_function("command_mailbox", |b| {
+        b.iter_custom(|iters| {
+            block_on(async {
+                let buf = command_buffer();
+                let mut reader = Reader::new(Box::new(buf.subscribe()));
+                prime(&mut reader);
+
+                for i in 0..WARMUP_ITERS {
+                    buf.push(command_msg(i as u64));
+                    let _ = reader.recv().await;
+                }
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(command_msg((WARMUP_ITERS as u64) + i));
+                    let _ = reader.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_latency_telemetry,
+    bench_latency_state,
+    bench_latency_command,
+);
+criterion_main!(benches);
diff --git a/aimdb-bench/benches/b2_throughput_embassy.rs b/aimdb-bench/benches/b2_throughput_embassy.rs
new file mode 100644
index 0000000..8517776
--- /dev/null
+++ b/aimdb-bench/benches/b2_throughput_embassy.rs
@@ -0,0 +1,194 @@
+//! B2 — Steady-state throughput on the Embassy adapter (host-driven, Criterion).
+//!
+//! The Embassy companion to `benches/b2_throughput.rs`. Measures messages per second for
+//! SPSC (1 producer, 1 consumer) and 1→4 fan-out configurations against the
+//! **Embassy** buffer backend, driven on the host via
+//! `futures::executor::block_on` — no `embassy-runtime`, no cortex-m executor,
+//! no hardware.
+//!
+//! These are host throughput numbers for trend tracking and Tokio-vs-Embassy
+//! comparison; on-target throughput in CPU cycles is covered by the B3 STM32H5
+//! bench (`examples/embassy-bench-stm32h5`).
+//!
+//! **Fan-out safety rules (SpmcRing / PubSubChannel):**
+//! - All readers are **primed** before any messages are pushed, so each holds
+//!   its read position from the start (the embassy `Subscriber` is otherwise
+//!   created lazily on first poll and would miss earlier messages).
+//! - `SUBS = 4` on [`TelemetryBuffer`](aimdb_bench::profiles_embassy::TelemetryBuffer)
+//!   provides exactly four subscriber slots for the fan-out.
+//! - The loop is strict lockstep (1 push, then `recv` on every reader), so at
+//!   most one message is ever in flight and the fixed `CAP` never lags.
+//!
+//! **Mailbox throughput:** tight 1:1 push → recv loop. Do NOT batch pushes
+//! ahead of the consumer — the single slot overwrites earlier values.
+//!
+//! Run:
+//! ```text
+//! cargo bench -p aimdb-bench --bench b2_throughput_embassy
+//! cargo bench -p aimdb-bench --bench b2_throughput_embassy -- --save-baseline pre-x
+//! cargo bench -p aimdb-bench --bench b2_throughput_embassy -- --baseline pre-x
+//! ```
+
+aimdb_embassy_adapter::host_test_stubs!();
+
+use aimdb_bench::profiles::{command_msg, state_msg, telemetry_msg, WARMUP_ITERS};
+use aimdb_bench::profiles_embassy::{command_buffer, prime, state_buffer, telemetry_buffer};
+use aimdb_core::buffer::{Buffer, Reader};
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+use futures::executor::block_on;
+
+// ── Telemetry SPSC ────────────────────────────────────────────────────────────
+
+fn bench_throughput_telemetry_spsc(c: &mut Criterion) {
+    let mut group = c.benchmark_group("B2-Throughput-Embassy");
+    group.throughput(Throughput::Elements(1));
+
+    group.bench_function("telemetry_spsc", |b| {
+        b.iter_custom(|iters| {
+            block_on(async {
+                let buf = telemetry_buffer();
+                let mut reader = Reader::new(Box::new(buf.subscribe()));
+                prime(&mut reader);
+
+                // Warmup — not timed.
+                for i in 0..WARMUP_ITERS {
+                    buf.push(telemetry_msg(i as u64));
+                    let _ = reader.recv().await;
+                }
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(telemetry_msg(i));
+                    let _ = reader.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+// ── Telemetry 1→4 fan-out ────────────────────────────────────────────────────
+//
+// All 4 readers are primed before any messages are pushed, so each registers
+// its subscriber at the current position. Each iteration: 1 push + recv on all
+// 4 readers. Lockstep keeps at most one message in flight, so the fixed CAP
+// never lags.
+
+fn bench_throughput_telemetry_fanout(c: &mut Criterion) {
+    let mut group = c.benchmark_group("B2-Throughput-Embassy");
+    // Throughput counts published messages (1 per iteration), not the 4 per-reader deliveries.
+    group.throughput(Throughput::Elements(1));
+
+    group.bench_function("telemetry_fanout_1x4", |b| {
+        b.iter_custom(|iters| {
+            block_on(async {
+                let buf = telemetry_buffer();
+                let mut r0 = Reader::new(Box::new(buf.subscribe()));
+                let mut r1 = Reader::new(Box::new(buf.subscribe()));
+                let mut r2 = Reader::new(Box::new(buf.subscribe()));
+                let mut r3 = Reader::new(Box::new(buf.subscribe()));
+                // Prime all four BEFORE the first push (registers 4 subscribers).
+                prime(&mut r0);
+                prime(&mut r1);
+                prime(&mut r2);
+                prime(&mut r3);
+
+                // Warmup — not timed.
+                for i in 0..WARMUP_ITERS {
+                    buf.push(telemetry_msg(i as u64));
+                    let _ = r0.recv().await;
+                    let _ = r1.recv().await;
+                    let _ = r2.recv().await;
+                    let _ = r3.recv().await;
+                }
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(telemetry_msg(i));
+                    let _ = r0.recv().await;
+                    let _ = r1.recv().await;
+                    let _ = r2.recv().await;
+                    let _ = r3.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+// ── State SPSC ────────────────────────────────────────────────────────────────
+
+fn bench_throughput_state_spsc(c: &mut Criterion) {
+    let mut group = c.benchmark_group("B2-Throughput-Embassy");
+    group.throughput(Throughput::Elements(1));
+
+    group.bench_function("state_spsc", |b| {
+        b.iter_custom(|iters| {
+            block_on(async {
+                let buf = state_buffer();
+                let mut reader = Reader::new(Box::new(buf.subscribe()));
+                prime(&mut reader);
+
+                // Warmup — not timed.
+                for i in 0..WARMUP_ITERS {
+                    buf.push(state_msg(i as u64));
+                    let _ = reader.recv().await;
+                }
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(state_msg(i));
+                    let _ = reader.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+// ── Command / Mailbox SPSC ────────────────────────────────────────────────────
+
+fn bench_throughput_command_mailbox(c: &mut Criterion) {
+    let mut group = c.benchmark_group("B2-Throughput-Embassy");
+    group.throughput(Throughput::Elements(1));
+
+    group.bench_function("command_mailbox", |b| {
+        b.iter_custom(|iters| {
+            block_on(async {
+                let buf = command_buffer();
+                let mut reader = Reader::new(Box::new(buf.subscribe()));
+                prime(&mut reader);
+
+                // Warmup — not timed.
+                for i in 0..WARMUP_ITERS {
+                    buf.push(command_msg(i as u64));
+                    let _ = reader.recv().await;
+                }
+
+                let start = std::time::Instant::now();
+                for i in 0..iters {
+                    buf.push(command_msg(i));
+                    let _ = reader.recv().await;
+                }
+                start.elapsed()
+            })
+        });
+    });
+
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_throughput_telemetry_spsc,
+    bench_throughput_telemetry_fanout,
+    bench_throughput_state_spsc,
+    bench_throughput_command_mailbox,
+);
+criterion_main!(benches);
diff --git a/aimdb-bench/data/baselines/b0_alloc_embassy.json b/aimdb-bench/data/baselines/b0_alloc_embassy.json
new file mode 100644
index 0000000..8ee6134
--- /dev/null
+++ b/aimdb-bench/data/baselines/b0_alloc_embassy.json
@@ -0,0 +1,29 @@
+[
+  {
+    "profile": "Telemetry",
+    "buffer_type": "SpmcRing",
+    "total_allocs": 0,
+    "total_bytes": 0,
+    "batch_size": 512,
+    "allocs_per_msg": 0.0,
+    "bytes_per_msg": 0.0
+  },
+  {
+    "profile": "State",
+    "buffer_type": "SingleLatest",
+    "total_allocs": 0,
+    "total_bytes": 0,
+    "batch_size": 512,
+    "allocs_per_msg": 0.0,
+    "bytes_per_msg": 0.0
+  },
+  {
+    "profile": "Command",
+    "buffer_type": "Mailbox",
+    "total_allocs": 0,
+    "total_bytes": 0,
+    "batch_size": 512,
+    "allocs_per_msg": 0.0,
+    "bytes_per_msg": 0.0
+  }
+]
\ No newline at end of file
diff --git a/aimdb-bench/data/baselines/b3_cycles_stm32h5.json b/aimdb-bench/data/baselines/b3_cycles_stm32h5.json
new file mode 100644
index 0000000..4ff0dfe
--- /dev/null
+++ b/aimdb-bench/data/baselines/b3_cycles_stm32h5.json
@@ -0,0 +1,45 @@
+{
+  "bench": "b3_cycles_embassy_stm32h5",
+  "description": "On-target DWT CYCCNT cycles/msg + allocs/msg for the Embassy buffer consume path. Produced by examples/embassy-bench-stm32h5 (cannot run on a host).",
+  "target": "STM32H563ZI (Cortex-M33)",
+  "clock_hz": 250000000,
+  "build_profile": "release",
+  "warmup": 200,
+  "batch": 512,
+  "captured": "2026-06-21",
+  "note": "First capture on a Nucleo-H563ZI rig (release). Single run; re-run 2-3x and update if a stable average differs.",
+  "results": [
+    {
+      "profile": "Telemetry",
+      "buffer_type": "SpmcRing",
+      "deliveries_per_msg": 1,
+      "cycles_per_msg": 2013,
+      "total_cycles": 1030839,
+      "allocs_per_msg": 0
+    },
+    {
+      "profile": "State",
+      "buffer_type": "SingleLatest",
+      "deliveries_per_msg": 1,
+      "cycles_per_msg": 2009,
+      "total_cycles": 1029028,
+      "allocs_per_msg": 0
+    },
+    {
+      "profile": "Command",
+      "buffer_type": "Mailbox",
+      "deliveries_per_msg": 1,
+      "cycles_per_msg": 1661,
+      "total_cycles": 850440,
+      "allocs_per_msg": 0
+    },
+    {
+      "profile": "Telemetry",
+      "buffer_type": "SpmcRing(1->4)",
+      "deliveries_per_msg": 4,
+      "cycles_per_msg": 6239,
+      "total_cycles": 3194799,
+      "allocs_per_msg": 0
+    }
+  ]
+}
diff --git a/aimdb-bench/src/lib.rs b/aimdb-bench/src/lib.rs
index 6400509..109d27c 100644
--- a/aimdb-bench/src/lib.rs
+++ b/aimdb-bench/src/lib.rs
@@ -9,14 +9,22 @@
 //!
 //! # Bench entrypoints
 //!
-//! | File                            | Class | Purpose                                  |
-//! |---------------------------------|-------|------------------------------------------|
-//! | `benches/b0_alloc_tokio.rs`     | B0    | Per-message allocation (buffer layer)    |
-//! | `benches/b1_latency.rs`         | B1    | Push-to-recv latency (buffer layer)      |
-//! | `benches/b2_throughput.rs`      | B2    | Steady-state throughput (buffer layer)   |
-//! | `benches/b_alloc_pipeline.rs`   | info  | Per-message allocation (runner pipeline) |
-//! | `benches/b_runner_pipeline.rs`  | info  | Runner pipeline throughput (Criterion)   |
+//! | File                              | Class | Purpose                                  |
+//! |-----------------------------------|-------|------------------------------------------|
+//! | `benches/b0_alloc_tokio.rs`       | B0    | Per-message allocation (Tokio buffer)    |
+//! | `benches/b1_latency.rs`           | B1    | Push-to-recv latency (Tokio buffer)      |
+//! | `benches/b2_throughput.rs`        | B2    | Steady-state throughput (Tokio buffer)   |
+//! | `benches/b0_alloc_embassy.rs`     | B0    | Per-message allocation (Embassy buffer)  |
+//! | `benches/b1_latency_embassy.rs`   | B1    | Push-to-recv latency (Embassy buffer)    |
+//! | `benches/b2_throughput_embassy.rs`| B2    | Steady-state throughput (Embassy buffer) |
+//! | `benches/b_alloc_pipeline.rs`     | info  | Per-message allocation (runner pipeline) |
+//! | `benches/b_runner_pipeline.rs`    | info  | Runner pipeline throughput (Criterion)   |
+//!
+//! On-target cycle profiling (B3) is a separate hardware-only crate,
+//! `examples/embassy-bench-stm32h5`, because DWT cycle counting cannot run on a
+//! host. See design doc 038 §Phase 4 / §B3.
 
 pub mod alloc;
 pub mod profiles;
+pub mod profiles_embassy;
 pub mod reports;
diff --git a/aimdb-bench/src/profiles_embassy.rs b/aimdb-bench/src/profiles_embassy.rs
new file mode 100644
index 0000000..3546091
--- /dev/null
+++ b/aimdb-bench/src/profiles_embassy.rs
@@ -0,0 +1,79 @@
+//! Embassy buffer constructors for the host-driven B0/B1/B2 suites.
+//!
+//! These reuse the same payload types and message factories as the Tokio
+//! profiles ([`crate::profiles`]) so the two adapters are measured against
+//! identical workloads. Only the buffer backend differs: here the buffers are
+//! [`EmbassyBuffer`]s built on embassy-sync primitives, driven on the host via
+//! `futures::executor::block_on`.
+//!
+//! # Const-generic sizing
+//!
+//! Unlike Tokio's runtime-sized buffers, Embassy buffers are sized at
+//! compile time (`EmbassyBuffer<T, CAP, SUBS, PUBS, WATCH_N>`). The aliases
+//! below fix those parameters per profile:
+//!
+//! | Profile   | Backend        | CAP | SUBS | PUBS | WATCH_N | Notes                       |
+//! |-----------|----------------|-----|------|------|---------|-----------------------------|
+//! | Telemetry | `SpmcRing`     | 16  | 4    | 1    | 1       | SUBS=4 covers 1→4 fan-out   |
+//! | State     | `SingleLatest` | 1   | 1    | 1    | 4       | only WATCH_N is used        |
+//! | Command   | `Mailbox`      | 1   | 1    | 1    | 1       | Channel capacity is fixed=1 |
+//!
+//! The lockstep push→recv loops in the benches keep at most one message in
+//! flight, so `CAP=16` for Telemetry is far more than enough to avoid lagging.
+//!
+//! # Lazy SpmcRing subscriber (important)
+//!
+//! An [`EmbassyBuffer`] `SpmcRing` reader registers its underlying embassy
+//! `Subscriber` **lazily, on its first poll** — not at `subscribe()` time. A
+//! message published before that first poll is therefore missed, and a
+//! subsequent `recv()` would block forever. Benches must call
+//! [`prime`] on each reader *before* the first `push`, which forces subscriber
+//! registration via `try_recv`. For Watch/Mailbox readers, priming before the first
+//! `push` is only an empty read, so it is safe (and clearer) to prime every reader uniformly.
+
+use aimdb_core::buffer::Reader;
+use aimdb_embassy_adapter::EmbassyBuffer;
+
+use crate::profiles::{CommandMsg, StateMsg, TelemetryMsg};
+
+/// SpmcRing capacity for the Telemetry profile (compile-time const generic).
+pub const TELEMETRY_CAP: usize = 16;
+
+/// Telemetry buffer: `SpmcRing` with room for the 1→4 fan-out (SUBS=4).
+pub type TelemetryBuffer = EmbassyBuffer<TelemetryMsg, TELEMETRY_CAP, 4, 1, 1>;
+
+/// State buffer: `SingleLatest` (`Watch`); only `WATCH_N` is meaningful.
+pub type StateBuffer = EmbassyBuffer<StateMsg, 1, 1, 1, 4>;
+
+/// Command buffer: `Mailbox` (single-slot `Channel`).
+pub type CommandBuffer = EmbassyBuffer<CommandMsg, 1, 1, 1, 1>;
+
+/// Build a Telemetry `SpmcRing` Embassy buffer.
+pub fn telemetry_buffer() -> TelemetryBuffer {
+    EmbassyBuffer::new_spmc()
+}
+
+/// Build a State `SingleLatest` Embassy buffer.
+pub fn state_buffer() -> StateBuffer {
+    EmbassyBuffer::new_watch()
+}
+
+/// Build a Command `Mailbox` Embassy buffer.
+pub fn command_buffer() -> CommandBuffer {
+    EmbassyBuffer::new_mailbox()
+}
+
+/// Force lazy subscriber registration on an Embassy reader before the first
+/// `push`.
+///
+/// For an `SpmcRing` reader this registers the embassy `Subscriber` at the
+/// current queue position so it does not miss the first published message (see
+/// the module docs). For Watch/Mailbox readers it is a harmless empty read.
+/// Must be called *outside* the measured window — registration may allocate.
+#[inline]
+pub fn prime<T: Clone + Send>(reader: &mut Reader<T>) {
+    // `Reader<T>` exposes `try_recv`; the only expected error here is
+    // `BufferEmpty`, which we deliberately ignore — the point is the side
+    // effect of creating the subscriber, not the (absent) value.
+    let _ = reader.try_recv();
+}
diff --git a/examples/embassy-bench-stm32h5/.cargo/config.toml b/examples/embassy-bench-stm32h5/.cargo/config.toml
new file mode 100644
index 0000000..5aac79b
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/.cargo/config.toml
@@ -0,0 +1,8 @@
+[target.thumbv8m.main-none-eabihf]
+runner = 'probe-rs run --chip STM32H563ZITx'
+
+[build]
+target = "thumbv8m.main-none-eabihf"
+
+[env]
+DEFMT_LOG = "info"
diff --git a/examples/embassy-bench-stm32h5/.gitignore b/examples/embassy-bench-stm32h5/.gitignore
new file mode 100644
index 0000000..8112e1e
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/.gitignore
@@ -0,0 +1,5 @@
+target/
+Cargo.lock
+*.bin
+*.elf
+*.hex
diff --git a/examples/embassy-bench-stm32h5/Cargo.toml b/examples/embassy-bench-stm32h5/Cargo.toml
new file mode 100644
index 0000000..2c921ad
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/Cargo.toml
@@ -0,0 +1,62 @@
+[package]
+edition = "2024"
+name = "embassy-bench-stm32h5"
+version = "0.1.0"
+license = "MIT OR Apache-2.0"
+publish = false
+description = "AimDB B3 on-target benchmark: cycle & allocation profiling of the Embassy buffer consume path on an STM32H563ZI (design 038 §B3)"
+
+[features]
+default = ["embassy-runtime"]
+embassy-runtime = []
+
+[dependencies]
+# AimDB — buffer layer only (no full AimDb stack), isolated like the host B0/B1/B2.
+aimdb-core = { path = "../../aimdb-core", default-features = false, features = [
+    "alloc",
+] }
+aimdb-embassy-adapter = { path = "../../aimdb-embassy-adapter", default-features = false, features = [
+    "embassy-runtime",
+] }
+
+# Embassy ecosystem - STM32H563ZI (same board/clock as the other H563 demos)
+embassy-stm32 = { workspace = true, features = [
+    "defmt",
+    "stm32h563zi",
+    "memory-x",
+    "time-driver-any",
+    "unstable-pac",
+] }
+embassy-sync = { workspace = true, features = ["defmt"] }
+embassy-executor = { workspace = true, features = [
+    "platform-cortex-m",
+    "executor-thread",
+    "defmt",
+] }
+embassy-time = { workspace = true, features = [
+    "defmt",
+    "defmt-timestamp-uptime",
+    "tick-hz-32_768",
+    "generic-queue-16",
+] }
+embassy-futures = { workspace = true }
+
+# Embedded debugging and logging
+defmt = { workspace = true }
+defmt-rtt = { workspace = true }
+panic-probe = { workspace = true }
+
+# Cortex-M runtime + DWT cycle counter access
+cortex-m = { workspace = true }
+cortex-m-rt = { workspace = true }
+# Provides the on-target `critical-section` impl that embassy-sync's
+# `CriticalSectionRawMutex` (used by every `EmbassyBuffer`) links against.
+critical-section = { workspace = true }
+
+# Counting allocator wraps this heap to validate 0 allocs/msg on real hardware.
+embedded-alloc = { version = "0.6", features = ["llff"] }
+
+[package.metadata.embassy]
+build = [
+    { target = "thumbv8m.main-none-eabihf", artifact-dir = "out/examples/embassy-bench-stm32h5" },
+]
diff --git a/examples/embassy-bench-stm32h5/README.md b/examples/embassy-bench-stm32h5/README.md
new file mode 100644
index 0000000..6405fc5
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/README.md
@@ -0,0 +1,113 @@
+# embassy-bench-stm32h5 — AimDB B3 on-target profiling
+
+The **B3** tier of the AimDB benchmark suite (design [038](../../docs/design/038-aimdb-bench-crate-design.md)
+§B3 / §Phase 4): the measurements that **cannot run on a host**. It reads the
+Cortex-M **DWT cycle counter** (`CYCCNT`) to report the real per-message cost, in
+CPU cycles, of the AimDB Embassy buffer `push` → `recv` consume path on an
+**STM32H563ZI** (Cortex-M33 @ 250 MHz), and re-validates the W8 zero-allocation
+claim against the real embedded allocator (`embedded-alloc`).
+
+The host-runnable tiers — **B0** (allocations), **B1** (latency), **B2**
+(throughput) — live in the [`aimdb-bench`](../../aimdb-bench) crate and exercise
+the same Embassy buffer backend on the host via `futures::executor::block_on`.
+This crate is the on-hardware complement, not a replacement.
+
+## What it measures
+
+For each AimDB buffer profile it runs a tight, lockstep `push`→`recv` loop after
+a warmup and reports per message:
+
+| Profile     | Backend        | embassy-sync primitive |
+|-------------|----------------|------------------------|
+| Telemetry   | `SpmcRing`     | `PubSubChannel`        |
+| State       | `SingleLatest` | `Watch`                |
+| Command     | `Mailbox`      | `Channel<_, _, 1>`     |
+| Telemetry ×4| `SpmcRing` 1→4 | `PubSubChannel`        |
+
+Each line reports **cycles/msg** (CYCCNT delta ÷ batch) and **allocs/msg**
+(global-allocator calls ÷ batch). The target is **0 allocs/msg** — the same W8
+goal the host B0 suite gates on.
+
+## Hardware
+
+- **Board:** ST Nucleo-H563ZI (STM32H563ZI, Cortex-M33).
+- **Probe:** the onboard ST-LINK (SWD + RTT). No extra wiring — defmt logs stream
+  over RTT on the same USB cable you flash with.
+
+## Running
+
+From **this directory** (the local `.cargo/config.toml` selects the
+`thumbv8m.main-none-eabihf` target and the `probe-rs` runner):
+
+```bash
+cargo run --release
+```
+
+`--release` matters: debug vs release cycle counts differ by an order of
+magnitude (design 038 §15.8). Always record the build profile with a baseline.
+
+### Flashing from outside the dev container
+
+If `probe-rs` and the ST-LINK live on your host (not in the container), build in
+the container and flash from the host with [`flash.sh`](flash.sh):
+
+```bash
+# In the dev container:
+cd examples/embassy-bench-stm32h5 && cargo build --release
+
+# On the host (where the ST-LINK is attached):
+cd examples/embassy-bench-stm32h5 && ./flash.sh
+```
+
+`flash.sh` prefers the `--release` binary and falls back to debug with a warning;
+it runs `probe-rs run --chip STM32H563ZITx`, so B3 results stream over RTT to
+your terminal.
+
+First capture on a Nucleo-H563ZI @ 250 MHz (release), recorded in
+[`aimdb-bench/data/baselines/b3_cycles_stm32h5.json`](../../aimdb-bench/data/baselines/b3_cycles_stm32h5.json):
+
+```text
+=== AimDB B3 — Embassy buffer profiling on STM32H563ZI @ 250 MHz ===
+cycle_counter=true  warmup=200  batch=512
+[B3] Telemetry SpmcRing    : 2013 cycles/msg, 0 allocs/msg  (1030839 cycles total, batch=512)
+[B3] State     SingleLatest: 2009 cycles/msg, 0 allocs/msg  (1029028 cycles total, batch=512)
+[B3] Command   Mailbox     : 1661 cycles/msg, 0 allocs/msg  (850440 cycles total, batch=512)
+[B3] Telemetry SpmcRing(1->4): 6239 cycles/msg, 0 allocs/msg  (4 deliveries/msg, 3194799 cycles total, batch=512)
+=== B3 complete — target=0 allocs/msg (W8 zero-alloc consume path) ===
+```
+
+Command (single-slot `Channel`) is cheapest; the 1→4 fan-out is sub-linear
+(~1560 cycles/delivery) since the single `push` is amortized across four reads.
+Treat these as the regression reference; re-run 2–3× and update the baseline JSON
+if a stable average drifts.
+
+## Notes & caveats
+
+- **Measurement window** excludes warmup. The one-time reader `Box`/lazy
+  `SpmcRing` subscriber registration happens during warmup, so the measured
+  window reflects steady-state per-message cost only.
+- **Payload construction is inside the timed loop**, identical to the host B1/B2
+  suites, so the figure is the end-to-end per-message consume cost (not the
+  buffer call alone).
+- **Clock governor / frequency:** DWT cycle counts assume the 250 MHz PLL1
+  config in `main.rs`. Record the baseline at a fixed clock (design 038 §15.6).
+- **CI compile check** uses `thumbv7em-none-eabihf` (the workspace's installed
+  embedded triple, see the `examples` Make target), matching the other H563
+  demos; the flashable artifact is the `thumbv8m.main-none-eabihf` build above.
+
+## Troubleshooting
+
+**`└─ <mod path> @ <invalid location: defmt frame-index: N>`** after each log line.
+The `[B3] …` messages themselves decode fine; only the file:line annotations are
+missing. The firmware is correct — it emits the same defmt 1.0 metadata
+(`_defmt_version_ = 4`, `.defmt` + `.symtab` present) as every other embassy
+example here, in both debug and release. The cause is host-side: the
+`defmt-decoder` bundled in your `probe-rs` decodes defmt-1.0 message payloads but
+not its location table. It affects any defmt-1.0 binary in this repo, not just
+this one — flash another example (e.g. `embassy-serial-connector-demo`) to
+confirm the same annotations appear.
+
+Fix: update `probe-rs` to a current release (its decoder is updated in lockstep
+with defmt) — `probe-rs --version`, then reinstall via your usual method (e.g.
+`cargo install probe-rs-tools --locked`). It is purely cosmetic for B3: the
+cycle/alloc numbers are unaffected.
diff --git a/examples/embassy-bench-stm32h5/build.rs b/examples/embassy-bench-stm32h5/build.rs
new file mode 100644
index 0000000..8cd32d7
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/build.rs
@@ -0,0 +1,5 @@
+fn main() {
+    println!("cargo:rustc-link-arg-bins=--nmagic");
+    println!("cargo:rustc-link-arg-bins=-Tlink.x");
+    println!("cargo:rustc-link-arg-bins=-Tdefmt.x");
+}
diff --git a/examples/embassy-bench-stm32h5/flash.sh b/examples/embassy-bench-stm32h5/flash.sh
new file mode 100755
index 0000000..a031763
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/flash.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Flash script for embassy-bench-stm32h5 (AimDB B3 on-target profiling).
+#
+# Run on the HOST machine where probe-rs and the ST-LINK are accessible.
+# Build first (in the dev container):
+#   cd examples/embassy-bench-stm32h5 && cargo build --release
+#
+# B3 cycle counts are only meaningful in --release: debug vs release differ by an
+# order of magnitude (design 038 §15.8). This script therefore prefers the
+# release binary and only falls back to debug with a warning.
+#
+# Results (cycles/msg + allocs/msg per profile) stream over RTT (SWD) as defmt
+# logs — probe-rs prints them to this terminal.
+set -e
+# Resolve binaries relative to this script so it works from any working directory.
+RELEASE_BINARY="$(dirname "$0")/../../target/thumbv8m.main-none-eabihf/release/embassy-bench-stm32h5"
+DEBUG_BINARY="$(dirname "$0")/../../target/thumbv8m.main-none-eabihf/debug/embassy-bench-stm32h5"
+
+if [ -f "$RELEASE_BINARY" ]; then
+    BINARY="$RELEASE_BINARY"
+elif [ -f "$DEBUG_BINARY" ]; then
+    BINARY="$DEBUG_BINARY"
+    echo "Warning: using the DEBUG build — B3 cycle counts are not representative."
+    echo "         Rebuild with --release for meaningful numbers:"
+    echo "           cd examples/embassy-bench-stm32h5 && cargo build --release"
+else
+    echo "Error: no binary found at:"
+    echo "  $RELEASE_BINARY"
+    echo "  $DEBUG_BINARY"
+    echo "Build it first in the dev container:"
+    echo "  cd examples/embassy-bench-stm32h5 && cargo build --release"
+    exit 1
+fi
+
+echo "Flashing embassy-bench-stm32h5 to STM32H563ZITx (B3 results stream over RTT)..."
+probe-rs run --chip STM32H563ZITx "$BINARY"
diff --git a/examples/embassy-bench-stm32h5/rust-toolchain.toml b/examples/embassy-bench-stm32h5/rust-toolchain.toml
new file mode 100644
index 0000000..750ee6f
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/rust-toolchain.toml
@@ -0,0 +1,4 @@
+[toolchain]
+channel = "1.95"
+components = ["rust-src", "rustfmt", "llvm-tools"]
+targets = ["thumbv8m.main-none-eabihf"]
diff --git a/examples/embassy-bench-stm32h5/src/main.rs b/examples/embassy-bench-stm32h5/src/main.rs
new file mode 100644
index 0000000..12e7937
--- /dev/null
+++ b/examples/embassy-bench-stm32h5/src/main.rs
@@ -0,0 +1,346 @@
+#![no_std]
+#![no_main]
+
+//! B3 — On-target cycle & allocation profiling of the AimDB Embassy buffer
+//! consume path (design 038 §B3 / §Phase 4).
+//!
+//! This is the part of the Embassy benchmark suite that **cannot run on a
+//! host**: it reads the Cortex-M **DWT cycle counter** (`CYCCNT`) to measure the
+//! real per-message cost, in CPU cycles, of `push` → `recv` for each AimDB
+//! buffer profile on an STM32H563ZI (Cortex-M33 @ 250 MHz). The host
+//! `aimdb-bench` suite covers B0 (allocations), B1 (wall-clock latency) and B2
+//! (throughput) for the same Embassy buffer backend; this binary adds the
+//! cycle-accurate B3 numbers and re-validates the W8 zero-allocation claim
+//! against the real embedded allocator (`embedded-alloc`), wrapped here in a
+//! counting allocator.
+//!
+//! ## What is measured
+//!
+//! For Telemetry (`SpmcRing`), State (`SingleLatest`) and Command (`Mailbox`),
+//! plus a 1→4 Telemetry fan-out, the binary runs a tight lockstep
+//! `push`→`recv` loop and reports, per message:
+//!   * **cycles/msg** — `DWT::cycle_count()` delta over the measured batch,
+//!   * **allocs/msg** — global-allocator call count over the measured batch.
+//!
+//! The measured window excludes a warmup phase; the one-time reader boxing and
+//! lazy `SpmcRing` subscriber registration happen during warmup. As with the
+//! host B1/B2 suites, payload construction is inside the timed loop, so the
+//! figure is the end-to-end per-message consume cost, not the buffer call in
+//! isolation.
+//!
+//! ## Running
+//!
+//! ```bash
+//! # From this crate dir, with a Nucleo-H563ZI connected via ST-LINK:
+//! cargo run --release
+//! ```
+//!
+//! Results stream over RTT (SWD) as defmt logs. `--release` is strongly
+//! recommended; debug vs release cycle counts differ by an order of magnitude
+//! (design 038 §15.8), so always record the build profile with a baseline.
+
+extern crate alloc;
+
+use alloc::boxed::Box;
+use core::alloc::{GlobalAlloc, Layout};
+use core::sync::atomic::{AtomicU32, Ordering};
+
+use aimdb_core::buffer::{Buffer, Reader};
+use aimdb_embassy_adapter::EmbassyBuffer;
+use cortex_m::peripheral::DWT;
+use defmt::info;
+use embassy_executor::Spawner;
+use embassy_futures::block_on;
+use {defmt_rtt as _, panic_probe as _};
+
+// ── Allocation-counting heap ─────────────────────────────────────────────────
+//
+// Wraps `embedded-alloc`'s `LlffHeap` so the B3 run can confirm 0 allocs/msg on
+// real hardware — the embedded analogue of the host `CountingAllocator<System>`
+// in `aimdb-bench` (design 038 §4 anticipated swapping `System` for an embedded
+// allocator without reworking the counter).
+
+static ALLOC_COUNT: AtomicU32 = AtomicU32::new(0);
+
+struct CountingHeap(embedded_alloc::LlffHeap);
+
+// SAFETY: every call is delegated unchanged to the inner heap; the only added
+// side effect is the relaxed atomic increment.
+unsafe impl GlobalAlloc for CountingHeap {
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        ALLOC_COUNT.fetch_add(1, Ordering::Relaxed); // default realloc/alloc_zeroed count too
+        unsafe { self.0.alloc(layout) }
+    }
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        unsafe { self.0.dealloc(ptr, layout) }
+    }
+}
+
+#[global_allocator]
+static HEAP: CountingHeap = CountingHeap(embedded_alloc::LlffHeap::empty());
+
+#[inline]
+fn reset_allocs() {
+    ALLOC_COUNT.store(0, Ordering::Relaxed);
+}
+
+#[inline]
+fn allocs() -> u32 {
+    ALLOC_COUNT.load(Ordering::Relaxed)
+}
+
+// ── Workload payloads ────────────────────────────────────────────────────────
+//
+// Mirror `aimdb_bench::profiles` exactly so B3 cycle numbers line up with the
+// host B0/B1/B2 figures for the same payload shapes. Kept in sync by hand — the
+// host crate is `std`-only and cannot be a dependency of this `no_std` binary.
+
+// The benchmark never reads these fields — they are payload "ballast" whose
+// size/shape matches the host profiles, so the per-message `clone` cost (and
+// thus the cycle count) is comparable. Hence `dead_code` is expected.
+#[derive(Clone)]
+#[allow(dead_code)]
+struct TelemetryMsg {
+    sensor_id: u32,
+    value: f64,
+    sequence: u64,
+}
+
+#[derive(Clone)]
+#[allow(dead_code)]
+struct StateMsg {
+    device_id: u32,
+    temperature: f64,
+    humidity: f64,
+    pressure: f64,
+    sequence: u64,
+}
+
+#[derive(Clone)]
+#[allow(dead_code)]
+struct CommandMsg {
+    command_id: u32,
+    target: u32,
+    value: f64,
+    sequence: u64,
+}
+
+#[inline]
+fn telemetry_msg(i: u64) -> TelemetryMsg {
+    TelemetryMsg {
+        sensor_id: (i % 16) as u32,
+        value: i as f64 * 0.1,
+        sequence: i,
+    }
+}
+
+#[inline]
+fn state_msg(i: u64) -> StateMsg {
+    StateMsg {
+        device_id: (i % 8) as u32,
+        temperature: 20.0 + i as f64 * 0.01,
+        humidity: 50.0 + i as f64 * 0.005,
+        pressure: 1013.25 + i as f64 * 0.001,
+        sequence: i,
+    }
+}
+
+#[inline]
+fn command_msg(i: u64) -> CommandMsg {
+    CommandMsg {
+        command_id: (i % 256) as u32,
+        target: (i % 4) as u32,
+        value: i as f64,
+        sequence: i,
+    }
+}
+
+// ── Buffer type aliases ──────────────────────────────────────────────────────
+//
+// Same backends as `aimdb_bench::profiles_embassy`, with smaller `CAP` to keep
+// the static `PubSubChannel` footprint modest on-target. The lockstep loops
+// keep at most one message in flight, so a small `CAP` never lags. `SUBS = 4`
+// on the Telemetry ring leaves room for the 1→4 fan-out.
+
+type TelemetryBuffer = EmbassyBuffer<TelemetryMsg, 8, 4, 1, 1>;
+type StateBuffer = EmbassyBuffer<StateMsg, 1, 1, 1, 2>;
+type CommandBuffer = EmbassyBuffer<CommandMsg, 1, 1, 1, 1>;
+
+const WARMUP: usize = 200;
+const BATCH: u32 = 512;
+
+/// Read CYCCNT, run `BATCH` lockstep `push`→`recv` cycles, return the cycle and
+/// allocation deltas over the measured window.
+macro_rules! measure {
+    ($reader:expr, $push:expr) => {{
+        reset_allocs();
+        let start = DWT::cycle_count();
+        for i in 0..BATCH {
+            let _ = $push(WARMUP as u64 + i as u64);
+            let _ = block_on($reader.recv());
+        }
+        let cycles = DWT::cycle_count().wrapping_sub(start);
+        (cycles, allocs())
+    }};
+}
+
+fn report(profile: &str, buffer: &str, cycles: u32, allocs: u32) {
+    info!(
+        "[B3] {=str} {=str}: {=u32} cycles/msg, {=u32} allocs/msg  ({=u32} cycles total, batch={=u32})",
+        profile,
+        buffer,
+        cycles / BATCH, // mean cycles per message over the batch (truncated)
+        allocs.div_ceil(BATCH), // rounded up so stray allocations never print as 0
+        cycles,
+        BATCH,
+    );
+}
+
+#[embassy_executor::main]
+async fn main(_spawner: Spawner) {
+    // Initialize the heap behind the counting allocator. SAFETY: `MEM` is scoped to this block, so `init` is its only user.
+    {
+        use core::mem::MaybeUninit;
+        const HEAP_SIZE: usize = 32 * 1024; // 32 KB
+        static mut MEM: [MaybeUninit<u8>; HEAP_SIZE] = [MaybeUninit::uninit(); HEAP_SIZE];
+        unsafe {
+            let mem_ptr = core::ptr::addr_of_mut!(MEM);
+            HEAP.0.init(mem_ptr as usize, HEAP_SIZE); // address only; no reference to MEM is formed
+        }
+    }
+
+    // DWT cycle counter. We only touch DCB/DWT, which Embassy does not use, so
+    // stealing the core peripherals here is sound regardless of init ordering.
+    // SAFETY: exclusive access to DCB/DWT for the lifetime of this benchmark;
+    // no other code in this binary touches them.
+    let mut cp = unsafe { cortex_m::Peripherals::steal() };
+    cp.DCB.enable_trace();
+    cp.DWT.enable_cycle_counter();
+
+    // Clock tree: HSE 8 MHz (from the ST-LINK MCO) → PLL1 → 250 MHz. Identical
+    // to the other H563 demos in this repo.
+    let mut config = embassy_stm32::Config::default();
+    {
+        use embassy_stm32::rcc::*;
+        use embassy_stm32::time::Hertz;
+
+        config.rcc.hsi = None;
+        config.rcc.hsi48 = Some(Default::default());
+        config.rcc.hse = Some(Hse {
+            freq: Hertz(8_000_000),
+            mode: HseMode::BypassDigital,
+        });
+        config.rcc.pll1 = Some(Pll {
+            source: PllSource::Hse,
+            prediv: PllPreDiv::Div2,
+            mul: PllMul::Mul125,
+            divp: Some(PllDiv::Div2),
+            divq: Some(PllDiv::Div2),
+            divr: None,
+        });
+        config.rcc.ahb_pre = AHBPrescaler::Div1;
+        config.rcc.apb1_pre = APBPrescaler::Div1;
+        config.rcc.apb2_pre = APBPrescaler::Div1;
+        config.rcc.apb3_pre = APBPrescaler::Div1;
+        config.rcc.sys = Sysclk::Pll1P;
+        config.rcc.voltage_scale = VoltageScale::Scale0;
+    }
+    let _p = embassy_stm32::init(config);
+
+    info!("=== AimDB B3 — Embassy buffer profiling on STM32H563ZI @ 250 MHz ===");
+    info!(
+        "cycle_counter={=bool}  warmup={=u32}  batch={=u32}",
+        DWT::has_cycle_counter(),
+        WARMUP as u32,
+        BATCH
+    );
+
+    // ── Telemetry: SpmcRing / PubSubChannel ──────────────────────────────────
+    //
+    // `try_recv` primes the lazily-created SpmcRing subscriber before the first
+    // push, otherwise the first message is missed and `recv` blocks forever.
+    {
+        let buf: TelemetryBuffer = EmbassyBuffer::new_spmc();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+        let _ = reader.try_recv();
+        for i in 0..WARMUP {
+            buf.push(telemetry_msg(i as u64));
+            let _ = block_on(reader.recv());
+        }
+        let (cycles, n_allocs) = measure!(reader, |i| buf.push(telemetry_msg(i)));
+        report("Telemetry", "SpmcRing    ", cycles, n_allocs);
+    }
+
+    // ── State: SingleLatest / Watch ──────────────────────────────────────────
+    {
+        let buf: StateBuffer = EmbassyBuffer::new_watch();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+        let _ = reader.try_recv();
+        for i in 0..WARMUP {
+            buf.push(state_msg(i as u64));
+            let _ = block_on(reader.recv());
+        }
+        let (cycles, n_allocs) = measure!(reader, |i| buf.push(state_msg(i)));
+        report("State    ", "SingleLatest", cycles, n_allocs);
+    }
+
+    // ── Command: Mailbox / Channel(capacity=1) ───────────────────────────────
+    {
+        let buf: CommandBuffer = EmbassyBuffer::new_mailbox();
+        let mut reader = Reader::new(Box::new(buf.subscribe()));
+        let _ = reader.try_recv();
+        for i in 0..WARMUP {
+            buf.push(command_msg(i as u64));
+            let _ = block_on(reader.recv());
+        }
+        let (cycles, n_allocs) = measure!(reader, |i| buf.push(command_msg(i)));
+        report("Command  ", "Mailbox     ", cycles, n_allocs);
+    }
+
+    // ── Telemetry 1→4 fan-out ────────────────────────────────────────────────
+    //
+    // One publisher, four subscribers, lockstep. Reported per produced message
+    // (each observed by all four readers — 4 deliveries/msg).
+    {
+        let buf: TelemetryBuffer = EmbassyBuffer::new_spmc();
+        let mut r0 = Reader::new(Box::new(buf.subscribe()));
+        let mut r1 = Reader::new(Box::new(buf.subscribe()));
+        let mut r2 = Reader::new(Box::new(buf.subscribe()));
+        let mut r3 = Reader::new(Box::new(buf.subscribe()));
+        let _ = r0.try_recv();
+        let _ = r1.try_recv();
+        let _ = r2.try_recv();
+        let _ = r3.try_recv();
+        for i in 0..WARMUP {
+            buf.push(telemetry_msg(i as u64));
+            let _ = block_on(r0.recv());
+            let _ = block_on(r1.recv());
+            let _ = block_on(r2.recv());
+            let _ = block_on(r3.recv());
+        }
+        reset_allocs();
+        let start = DWT::cycle_count();
+        for i in 0..BATCH {
+            buf.push(telemetry_msg(WARMUP as u64 + i as u64));
+            let _ = block_on(r0.recv());
+            let _ = block_on(r1.recv());
+            let _ = block_on(r2.recv());
+            let _ = block_on(r3.recv());
+        }
+        let cycles = DWT::cycle_count().wrapping_sub(start);
+        let n_allocs = allocs();
+        info!(
+            "[B3] Telemetry SpmcRing(1->4): {=u32} cycles/msg, {=u32} allocs/msg  (4 deliveries/msg, {=u32} cycles total, batch={=u32})",
+            cycles / BATCH,
+            n_allocs.div_ceil(BATCH), // rounded up so stray allocations never print as 0
+            cycles,
+            BATCH,
+        );
+    }
+
+    info!("=== B3 complete — target=0 allocs/msg (W8 zero-alloc consume path) ===");
+
+    loop {
+        cortex_m::asm::wfe();
+    }
+}

From 34b8b5c5b32d7082baf548f51bb39586dbd3c330 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Sun, 21 Jun 2026 19:52:02 +0000
Subject: [PATCH 11/16] feat(bench): update baseline references in benchmark
 scripts to 'main'

---
 aimdb-bench/benches/b0_alloc_embassy.rs      | 10 +++++-----
 aimdb-bench/benches/b0_alloc_tokio.rs        | 13 ++++++-------
 aimdb-bench/benches/b1_latency.rs            |  4 ++--
 aimdb-bench/benches/b1_latency_embassy.rs    |  4 ++--
 aimdb-bench/benches/b2_throughput.rs         |  4 ++--
 aimdb-bench/benches/b2_throughput_embassy.rs |  4 ++--
 aimdb-bench/src/lib.rs                       |  2 +-
 7 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/aimdb-bench/benches/b0_alloc_embassy.rs b/aimdb-bench/benches/b0_alloc_embassy.rs
index 4f2a924..ba57aef 100644
--- a/aimdb-bench/benches/b0_alloc_embassy.rs
+++ b/aimdb-bench/benches/b0_alloc_embassy.rs
@@ -6,10 +6,10 @@
 //! `futures::executor::block_on` over embassy-sync's poll methods — no
 //! `embassy-runtime`, no cortex-m executor, no hardware.
 //!
-//! After the zero-allocation consume path (design 037 / W8), the Embassy
-//! `poll_recv` drives embassy-sync's public `poll_*` methods directly with no
-//! per-message future box, so the target here is the same **0 allocs/msg** as
-//! the Tokio suite. The one-time `Box::new(reader)` and the lazy subscriber
+//! The Embassy `poll_recv` drives embassy-sync's public `poll_*` methods
+//! directly with no per-message future box, so the steady-state hot path is
+//! allocation-free — the expected result is **0 allocs/msg**, the same as the
+//! Tokio suite. The one-time `Box::new(reader)` and the lazy subscriber
 //! registration happen during setup/warmup, before the counters are reset.
 //!
 //! **Measurement model** (identical to `b0_alloc_tokio`):
@@ -119,7 +119,7 @@ fn main() {
     command_report.print();
 
     println!();
-    println!("Target: 0 allocs/msg (W8 zero-alloc consume path, same as the Tokio B0 suite).");
+    println!("Expected: 0 allocs/msg — allocation-free consume path, same as the Tokio B0 suite.");
 
     // Persist results for baseline comparison.
     let reports = vec![telemetry_report, state_report, command_report];
diff --git a/aimdb-bench/benches/b0_alloc_tokio.rs b/aimdb-bench/benches/b0_alloc_tokio.rs
index 332e76f..b52ab4d 100644
--- a/aimdb-bench/benches/b0_alloc_tokio.rs
+++ b/aimdb-bench/benches/b0_alloc_tokio.rs
@@ -3,9 +3,11 @@
 //! Measures per-message allocation cost for each workload profile using
 //! `TokioBuffer<T>` directly (not the full `AimDb` stack).
 //!
-//! **Pre-W8 baseline (expected):** 1 alloc/msg from the `Box::pin(async
-//! move { ... })` constructed inside `TokioBufferReader::recv()` on every
-//! call.  The target is **0 allocs/msg**.
+//! **Expected result: 0 allocs/msg.** The zero-allocation consume path drives
+//! the reader without boxing a future per `recv()`, so the steady-state hot
+//! path is allocation-free. This bench is the hard gate that keeps it that way:
+//! any per-message allocation that creeps back into the consume path shows up
+//! here as a non-zero `allocs/msg`.
 //!
 //! **Measurement model:**
 //! 1. Create buffer + reader.
@@ -118,10 +120,7 @@ fn main() {
     command_report.print();
 
     println!();
-    println!(
-        "Pre-W8 expectation: ~1 alloc/msg (Box::pin in recv()). \
-         Target: 0 allocs/msg."
-    );
+    println!("Expected: 0 allocs/msg — the consume path is allocation-free in steady state.");
 
     // Persist results for baseline comparison.
     let reports = vec![telemetry_report, state_report, command_report];
diff --git a/aimdb-bench/benches/b1_latency.rs b/aimdb-bench/benches/b1_latency.rs
index 1d85b96..82360b7 100644
--- a/aimdb-bench/benches/b1_latency.rs
+++ b/aimdb-bench/benches/b1_latency.rs
@@ -16,9 +16,9 @@
 //! ```text
 //! cargo bench -p aimdb-bench --bench b1_latency
 //! # Save a named baseline:
-//! cargo bench -p aimdb-bench --bench b1_latency -- --save-baseline pre-w8
+//! cargo bench -p aimdb-bench --bench b1_latency -- --save-baseline main
 //! # Compare against that baseline:
-//! cargo bench -p aimdb-bench --bench b1_latency -- --baseline pre-w8
+//! cargo bench -p aimdb-bench --bench b1_latency -- --baseline main
 //! ```
 
 use aimdb_bench::profiles::{
diff --git a/aimdb-bench/benches/b1_latency_embassy.rs b/aimdb-bench/benches/b1_latency_embassy.rs
index 06971a2..99f7ac5 100644
--- a/aimdb-bench/benches/b1_latency_embassy.rs
+++ b/aimdb-bench/benches/b1_latency_embassy.rs
@@ -19,8 +19,8 @@
 //! Run:
 //! ```text
 //! cargo bench -p aimdb-bench --bench b1_latency_embassy
-//! cargo bench -p aimdb-bench --bench b1_latency_embassy -- --save-baseline pre-x
-//! cargo bench -p aimdb-bench --bench b1_latency_embassy -- --baseline pre-x
+//! cargo bench -p aimdb-bench --bench b1_latency_embassy -- --save-baseline main
+//! cargo bench -p aimdb-bench --bench b1_latency_embassy -- --baseline main
 //! ```
 
 aimdb_embassy_adapter::host_test_stubs!();
diff --git a/aimdb-bench/benches/b2_throughput.rs b/aimdb-bench/benches/b2_throughput.rs
index 54dec89..4f8ada6 100644
--- a/aimdb-bench/benches/b2_throughput.rs
+++ b/aimdb-bench/benches/b2_throughput.rs
@@ -20,8 +20,8 @@
 //! Run:
 //! ```text
 //! cargo bench -p aimdb-bench --bench b2_throughput
-//! cargo bench -p aimdb-bench --bench b2_throughput -- --save-baseline pre-w8
-//! cargo bench -p aimdb-bench --bench b2_throughput -- --baseline pre-w8
+//! cargo bench -p aimdb-bench --bench b2_throughput -- --save-baseline main
+//! cargo bench -p aimdb-bench --bench b2_throughput -- --baseline main
 //! ```
 
 use aimdb_bench::profiles::{
diff --git a/aimdb-bench/benches/b2_throughput_embassy.rs b/aimdb-bench/benches/b2_throughput_embassy.rs
index 8517776..e38474e 100644
--- a/aimdb-bench/benches/b2_throughput_embassy.rs
+++ b/aimdb-bench/benches/b2_throughput_embassy.rs
@@ -25,8 +25,8 @@
 //! Run:
 //! ```text
 //! cargo bench -p aimdb-bench --bench b2_throughput_embassy
-//! cargo bench -p aimdb-bench --bench b2_throughput_embassy -- --save-baseline pre-x
-//! cargo bench -p aimdb-bench --bench b2_throughput_embassy -- --baseline pre-x
+//! cargo bench -p aimdb-bench --bench b2_throughput_embassy -- --save-baseline main
+//! cargo bench -p aimdb-bench --bench b2_throughput_embassy -- --baseline main
 //! ```
 
 aimdb_embassy_adapter::host_test_stubs!();
diff --git a/aimdb-bench/src/lib.rs b/aimdb-bench/src/lib.rs
index 109d27c..8e776ee 100644
--- a/aimdb-bench/src/lib.rs
+++ b/aimdb-bench/src/lib.rs
@@ -22,7 +22,7 @@
 //!
 //! On-target cycle profiling (B3) is a separate hardware-only crate,
 //! `examples/embassy-bench-stm32h5`, because DWT cycle counting cannot run on a
-//! host. See design doc 038 §Phase 4 / §B3.
+//! host. See design doc 038 for the on-target B3 harness.
 
 pub mod alloc;
 pub mod profiles;

From 5d9630cbb7ff8167782ef5eec349a7d2ef410995 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Sun, 21 Jun 2026 20:20:29 +0000
Subject: [PATCH 12/16] feat(bench): consolidate latency and throughput
 benchmarks for Tokio and Embassy adapters

---
 aimdb-bench/Cargo.toml                        |  15 +-
 aimdb-bench/README.md                         |  25 ++--
 ...throughput_embassy.rs => b1_b2_embassy.rs} |  63 ++++----
 .../{b2_throughput.rs => b1_b2_tokio.rs}      |  55 ++++---
 aimdb-bench/benches/b1_latency.rs             | 138 ------------------
 aimdb-bench/benches/b1_latency_embassy.rs     | 132 -----------------
 aimdb-bench/src/lib.rs                        |   6 +-
 7 files changed, 87 insertions(+), 347 deletions(-)
 rename aimdb-bench/benches/{b2_throughput_embassy.rs => b1_b2_embassy.rs} (75%)
 rename aimdb-bench/benches/{b2_throughput.rs => b1_b2_tokio.rs} (79%)
 delete mode 100644 aimdb-bench/benches/b1_latency.rs
 delete mode 100644 aimdb-bench/benches/b1_latency_embassy.rs

diff --git a/aimdb-bench/Cargo.toml b/aimdb-bench/Cargo.toml
index 619ff96..9d3ab4c 100644
--- a/aimdb-bench/Cargo.toml
+++ b/aimdb-bench/Cargo.toml
@@ -13,12 +13,11 @@ name = "aimdb_bench"
 name = "b0_alloc_tokio"
 harness = false
 
+# Combined B1 (latency) + B2 (throughput) — a throughput-annotated Criterion
+# bench already reports per-iteration latency, so a separate b1_latency target
+# would be redundant. See design 038 §3–§4.
 [[bench]]
-name = "b1_latency"
-harness = false
-
-[[bench]]
-name = "b2_throughput"
+name = "b1_b2_tokio"
 harness = false
 
 [[bench]]
@@ -34,11 +33,7 @@ name = "b0_alloc_embassy"
 harness = false
 
 [[bench]]
-name = "b1_latency_embassy"
-harness = false
-
-[[bench]]
-name = "b2_throughput_embassy"
+name = "b1_b2_embassy"
 harness = false
 
 [dependencies]
diff --git a/aimdb-bench/README.md b/aimdb-bench/README.md
index 6fde577..c87c940 100644
--- a/aimdb-bench/README.md
+++ b/aimdb-bench/README.md
@@ -10,12 +10,17 @@ Measures three classes of performance across three canonical workload profiles:
 | **B1** — push-to-recv latency | Criterion p50/p99 | trend tracking | no |
 | **B2** — steady-state throughput | Criterion msgs/sec | trend tracking | no |
 
+**B1 and B2 share one bench per adapter** (`b1_b2_tokio`, `b1_b2_embassy`). A throughput-annotated Criterion
+bench reports both the per-iteration time (**B1 latency**, the `time` column) and
+messages/second (**B2 throughput**, the `thrpt` column) from the same runs, so
+there is no separate `b1_latency` target.
+
 Plus two informational benches that exercise the full runner-driven pipeline.
 
 **Adapters covered:**
 
-- **Tokio** — `b0_alloc_tokio`, `b1_latency`, `b2_throughput` (host).
-- **Embassy** — `b0_alloc_embassy`, `b1_latency_embassy`, `b2_throughput_embassy`
+- **Tokio** — `b0_alloc_tokio`, `b1_b2_tokio` (host).
+- **Embassy** — `b0_alloc_embassy`, `b1_b2_embassy`
   (host). These drive the real [`EmbassyBuffer`] backend via
   `futures::executor::block_on` over embassy-sync's poll methods — no
   `embassy-runtime`, no cortex-m executor, no hardware. The buffer constructors
@@ -48,16 +53,12 @@ Always run from the workspace root (`/aimdb_ws/aimdb`).
 # B0 — allocation gate (buffer layer)
 cargo bench -p aimdb-bench --bench b0_alloc_tokio
 
-# B1 — latency (Criterion)
-cargo bench -p aimdb-bench --bench b1_latency
-
-# B2 — throughput (Criterion)
-cargo bench -p aimdb-bench --bench b2_throughput
+# B1 + B2 — latency (time/iter) and throughput (msgs/sec), one Criterion suite
+cargo bench -p aimdb-bench --bench b1_b2_tokio
 
-# Embassy buffer backend (host) — same three classes
+# Embassy buffer backend (host) — same classes
 cargo bench -p aimdb-bench --bench b0_alloc_embassy
-cargo bench -p aimdb-bench --bench b1_latency_embassy
-cargo bench -p aimdb-bench --bench b2_throughput_embassy
+cargo bench -p aimdb-bench --bench b1_b2_embassy
 
 # Informational: allocation count through the runner pipeline
 cargo bench -p aimdb-bench --bench b_alloc_pipeline
@@ -75,10 +76,10 @@ B1 and B2 use Criterion's built-in baseline system:
 
 ```sh
 # Save a named baseline before a change
-cargo bench -p aimdb-bench --bench b1_latency -- --save-baseline pre-w8
+cargo bench -p aimdb-bench --bench b1_b2_tokio -- --save-baseline main
 
 # Compare against it after
-cargo bench -p aimdb-bench --bench b1_latency -- --baseline pre-w8
+cargo bench -p aimdb-bench --bench b1_b2_tokio -- --baseline main
 ```
 
 Criterion writes HTML reports to `target/criterion/`.
diff --git a/aimdb-bench/benches/b2_throughput_embassy.rs b/aimdb-bench/benches/b1_b2_embassy.rs
similarity index 75%
rename from aimdb-bench/benches/b2_throughput_embassy.rs
rename to aimdb-bench/benches/b1_b2_embassy.rs
index e38474e..b43c1dd 100644
--- a/aimdb-bench/benches/b2_throughput_embassy.rs
+++ b/aimdb-bench/benches/b1_b2_embassy.rs
@@ -1,14 +1,21 @@
-//! B2 — Steady-state throughput on the Embassy adapter (host-driven, Criterion).
+//! B1/B2 — Latency & throughput on the Embassy adapter (host-driven, Criterion).
 //!
-//! The Embassy companion to [`b2_throughput`]. Measures messages per second for
-//! SPSC (1 producer, 1 consumer) and 1→4 fan-out configurations against the
-//! **Embassy** buffer backend, driven on the host via
-//! `futures::executor::block_on` — no `embassy-runtime`, no cortex-m executor,
-//! no hardware.
+//! The Embassy companion to [`b1_b2_tokio`], capturing **both** the B1 and B2
+//! measurement classes from one set of runs against the **Embassy** buffer
+//! backend, driven on the host via `futures::executor::block_on` — no
+//! `embassy-runtime`, no cortex-m executor, no hardware:
 //!
-//! These are host throughput numbers for trend tracking and Tokio-vs-Embassy
-//! comparison; on-target throughput in CPU cycles is covered by the B3 STM32H5
-//! bench (`examples/embassy-bench-stm32h5`).
+//! - **B1 latency** — the per-iteration time Criterion reports for one
+//!   `buf.push(msg)` → `reader.recv()` cycle (the `time` column).
+//! - **B2 throughput** — messages/second, derived from that same timing via the
+//!   `Throughput::Elements(1)` annotation (the `thrpt` column).
+//!
+//! Covers SPSC (1 producer, 1 consumer) for all three profiles plus a 1→4
+//! telemetry fan-out.
+//!
+//! These are host wall-clock numbers for trend tracking and Tokio-vs-Embassy
+//! comparison; on-target latency/throughput in CPU cycles is covered by the B3
+//! STM32H5 bench (`examples/embassy-bench-stm32h5`).
 //!
 //! **Fan-out safety rules (SpmcRing / PubSubChannel):**
 //! - All readers are **primed** before any messages are pushed, so each holds
@@ -24,9 +31,9 @@
 //!
 //! Run:
 //! ```text
-//! cargo bench -p aimdb-bench --bench b2_throughput_embassy
-//! cargo bench -p aimdb-bench --bench b2_throughput_embassy -- --save-baseline main
-//! cargo bench -p aimdb-bench --bench b2_throughput_embassy -- --baseline main
+//! cargo bench -p aimdb-bench --bench b1_b2_embassy
+//! cargo bench -p aimdb-bench --bench b1_b2_embassy -- --save-baseline main
+//! cargo bench -p aimdb-bench --bench b1_b2_embassy -- --baseline main
 //! ```
 
 aimdb_embassy_adapter::host_test_stubs!();
@@ -39,8 +46,8 @@ use futures::executor::block_on;
 
 // ── Telemetry SPSC ────────────────────────────────────────────────────────────
 
-fn bench_throughput_telemetry_spsc(c: &mut Criterion) {
-    let mut group = c.benchmark_group("B2-Throughput-Embassy");
+fn bench_b1_b2_telemetry_spsc(c: &mut Criterion) {
+    let mut group = c.benchmark_group("B1-B2-Embassy");
     group.throughput(Throughput::Elements(1));
 
     group.bench_function("telemetry_spsc", |b| {
@@ -58,7 +65,7 @@ fn bench_throughput_telemetry_spsc(c: &mut Criterion) {
 
                 let start = std::time::Instant::now();
                 for i in 0..iters {
-                    buf.push(telemetry_msg(i));
+                    buf.push(telemetry_msg((WARMUP_ITERS as u64) + i));
                     let _ = reader.recv().await;
                 }
                 start.elapsed()
@@ -76,8 +83,8 @@ fn bench_throughput_telemetry_spsc(c: &mut Criterion) {
 // 4 readers. Lockstep keeps at most one message in flight, so the fixed CAP
 // never lags.
 
-fn bench_throughput_telemetry_fanout(c: &mut Criterion) {
-    let mut group = c.benchmark_group("B2-Throughput-Embassy");
+fn bench_b1_b2_telemetry_fanout(c: &mut Criterion) {
+    let mut group = c.benchmark_group("B1-B2-Embassy");
     // Each iteration produces 1 message observed by 4 consumers.
     group.throughput(Throughput::Elements(1));
 
@@ -106,7 +113,7 @@ fn bench_throughput_telemetry_fanout(c: &mut Criterion) {
 
                 let start = std::time::Instant::now();
                 for i in 0..iters {
-                    buf.push(telemetry_msg(i));
+                    buf.push(telemetry_msg((WARMUP_ITERS as u64) + i));
                     let _ = r0.recv().await;
                     let _ = r1.recv().await;
                     let _ = r2.recv().await;
@@ -122,8 +129,8 @@ fn bench_throughput_telemetry_fanout(c: &mut Criterion) {
 
 // ── State SPSC ────────────────────────────────────────────────────────────────
 
-fn bench_throughput_state_spsc(c: &mut Criterion) {
-    let mut group = c.benchmark_group("B2-Throughput-Embassy");
+fn bench_b1_b2_state_spsc(c: &mut Criterion) {
+    let mut group = c.benchmark_group("B1-B2-Embassy");
     group.throughput(Throughput::Elements(1));
 
     group.bench_function("state_spsc", |b| {
@@ -141,7 +148,7 @@ fn bench_throughput_state_spsc(c: &mut Criterion) {
 
                 let start = std::time::Instant::now();
                 for i in 0..iters {
-                    buf.push(state_msg(i));
+                    buf.push(state_msg((WARMUP_ITERS as u64) + i));
                     let _ = reader.recv().await;
                 }
                 start.elapsed()
@@ -154,8 +161,8 @@ fn bench_throughput_state_spsc(c: &mut Criterion) {
 
 // ── Command / Mailbox SPSC ────────────────────────────────────────────────────
 
-fn bench_throughput_command_mailbox(c: &mut Criterion) {
-    let mut group = c.benchmark_group("B2-Throughput-Embassy");
+fn bench_b1_b2_command_mailbox(c: &mut Criterion) {
+    let mut group = c.benchmark_group("B1-B2-Embassy");
     group.throughput(Throughput::Elements(1));
 
     group.bench_function("command_mailbox", |b| {
@@ -173,7 +180,7 @@ fn bench_throughput_command_mailbox(c: &mut Criterion) {
 
                 let start = std::time::Instant::now();
                 for i in 0..iters {
-                    buf.push(command_msg(i));
+                    buf.push(command_msg((WARMUP_ITERS as u64) + i));
                     let _ = reader.recv().await;
                 }
                 start.elapsed()
@@ -186,9 +193,9 @@ fn bench_throughput_command_mailbox(c: &mut Criterion) {
 
 criterion_group!(
     benches,
-    bench_throughput_telemetry_spsc,
-    bench_throughput_telemetry_fanout,
-    bench_throughput_state_spsc,
-    bench_throughput_command_mailbox,
+    bench_b1_b2_telemetry_spsc,
+    bench_b1_b2_telemetry_fanout,
+    bench_b1_b2_state_spsc,
+    bench_b1_b2_command_mailbox,
 );
 criterion_main!(benches);
diff --git a/aimdb-bench/benches/b2_throughput.rs b/aimdb-bench/benches/b1_b2_tokio.rs
similarity index 79%
rename from aimdb-bench/benches/b2_throughput.rs
rename to aimdb-bench/benches/b1_b2_tokio.rs
index 4f8ada6..293696a 100644
--- a/aimdb-bench/benches/b2_throughput.rs
+++ b/aimdb-bench/benches/b1_b2_tokio.rs
@@ -1,7 +1,16 @@
-//! B2 — Steady-state throughput benchmarks (Criterion).
+//! B1/B2 — Latency & throughput benchmarks (Criterion).
 //!
-//! Measures messages per second for SPSC (1 producer, 1 consumer) and 1→4
-//! fan-out configurations, using `TokioBuffer<T>` directly.
+//! A single Criterion suite that captures **both** the B1 and B2 measurement
+//! classes from one set of runs, using `TokioBuffer<T>` directly to isolate the
+//! buffer layer from `AimDb` initialization overhead:
+//!
+//! - **B1 latency** — the per-iteration time Criterion reports for one
+//!   `buf.push(msg)` → `reader.recv()` cycle (the `time` column).
+//! - **B2 throughput** — messages/second, derived from that same timing via the
+//!   `Throughput::Elements(1)` annotation (the `thrpt` column).
+//!
+//! Covers SPSC (1 producer, 1 consumer) for all three profiles plus a 1→4
+//! telemetry fan-out.
 //!
 //! **Fan-out safety rules (SpmcRing / broadcast):**
 //! - All readers are subscribed *before* any messages are pushed so each
@@ -15,13 +24,13 @@
 //! only the last write survives, which conflates Mailbox overwrite semantics
 //! with throughput measurement.  See design 038 §4 for details.
 //!
-//! **Executor:** single current-thread Tokio runtime, same as B0/B1.
+//! **Executor:** single current-thread Tokio runtime, same as B0.
 //!
 //! Run:
 //! ```text
-//! cargo bench -p aimdb-bench --bench b2_throughput
-//! cargo bench -p aimdb-bench --bench b2_throughput -- --save-baseline main
-//! cargo bench -p aimdb-bench --bench b2_throughput -- --baseline main
+//! cargo bench -p aimdb-bench --bench b1_b2_tokio
+//! cargo bench -p aimdb-bench --bench b1_b2_tokio -- --save-baseline main
+//! cargo bench -p aimdb-bench --bench b1_b2_tokio -- --baseline main
 //! ```
 
 use aimdb_bench::profiles::{
@@ -33,12 +42,12 @@ use criterion::{criterion_group, criterion_main, Criterion, Throughput};
 
 // ── Telemetry SPSC ────────────────────────────────────────────────────────────
 
-fn bench_throughput_telemetry_spsc(c: &mut Criterion) {
+fn bench_b1_b2_telemetry_spsc(c: &mut Criterion) {
     let rt = tokio::runtime::Builder::new_current_thread()
         .build()
         .expect("tokio runtime");
 
-    let mut group = c.benchmark_group("B2-Throughput");
+    let mut group = c.benchmark_group("B1-B2");
     group.throughput(Throughput::Elements(1));
 
     group.bench_function("telemetry_spsc", |b| {
@@ -56,7 +65,7 @@ fn bench_throughput_telemetry_spsc(c: &mut Criterion) {
 
                 let start = std::time::Instant::now();
                 for i in 0..iters {
-                    buf.push(telemetry_msg(i));
+                    buf.push(telemetry_msg((WARMUP_ITERS as u64) + i));
                     let _ = reader.recv().await;
                 }
                 start.elapsed()
@@ -74,12 +83,12 @@ fn bench_throughput_telemetry_spsc(c: &mut Criterion) {
 // they would all eventually converge on a current-thread executor).
 // TELEMETRY_CAPACITY >= BATCH_SIZE ensures no reader lags.
 
-fn bench_throughput_telemetry_fanout(c: &mut Criterion) {
+fn bench_b1_b2_telemetry_fanout(c: &mut Criterion) {
     let rt = tokio::runtime::Builder::new_current_thread()
         .build()
         .expect("tokio runtime");
 
-    let mut group = c.benchmark_group("B2-Throughput");
+    let mut group = c.benchmark_group("B1-B2");
     // Each iteration produces 1 message observed by 4 consumers.
     group.throughput(Throughput::Elements(1));
 
@@ -106,7 +115,7 @@ fn bench_throughput_telemetry_fanout(c: &mut Criterion) {
 
                 let start = std::time::Instant::now();
                 for i in 0..iters {
-                    buf.push(telemetry_msg(i));
+                    buf.push(telemetry_msg((WARMUP_ITERS as u64) + i));
                     let _ = r0.recv().await;
                     let _ = r1.recv().await;
                     let _ = r2.recv().await;
@@ -122,12 +131,12 @@ fn bench_throughput_telemetry_fanout(c: &mut Criterion) {
 
 // ── State SPSC ────────────────────────────────────────────────────────────────
 
-fn bench_throughput_state_spsc(c: &mut Criterion) {
+fn bench_b1_b2_state_spsc(c: &mut Criterion) {
     let rt = tokio::runtime::Builder::new_current_thread()
         .build()
         .expect("tokio runtime");
 
-    let mut group = c.benchmark_group("B2-Throughput");
+    let mut group = c.benchmark_group("B1-B2");
     group.throughput(Throughput::Elements(1));
 
     group.bench_function("state_spsc", |b| {
@@ -144,7 +153,7 @@ fn bench_throughput_state_spsc(c: &mut Criterion) {
 
                 let start = std::time::Instant::now();
                 for i in 0..iters {
-                    buf.push(state_msg(i));
+                    buf.push(state_msg((WARMUP_ITERS as u64) + i));
                     let _ = reader.recv().await;
                 }
                 start.elapsed()
@@ -157,12 +166,12 @@ fn bench_throughput_state_spsc(c: &mut Criterion) {
 
 // ── Command / Mailbox SPSC ────────────────────────────────────────────────────
 
-fn bench_throughput_command_mailbox(c: &mut Criterion) {
+fn bench_b1_b2_command_mailbox(c: &mut Criterion) {
     let rt = tokio::runtime::Builder::new_current_thread()
         .build()
         .expect("tokio runtime");
 
-    let mut group = c.benchmark_group("B2-Throughput");
+    let mut group = c.benchmark_group("B1-B2");
     group.throughput(Throughput::Elements(1));
 
     group.bench_function("command_mailbox", |b| {
@@ -179,7 +188,7 @@ fn bench_throughput_command_mailbox(c: &mut Criterion) {
 
                 let start = std::time::Instant::now();
                 for i in 0..iters {
-                    buf.push(command_msg(i));
+                    buf.push(command_msg((WARMUP_ITERS as u64) + i));
                     let _ = reader.recv().await;
                 }
                 start.elapsed()
@@ -192,9 +201,9 @@ fn bench_throughput_command_mailbox(c: &mut Criterion) {
 
 criterion_group!(
     benches,
-    bench_throughput_telemetry_spsc,
-    bench_throughput_telemetry_fanout,
-    bench_throughput_state_spsc,
-    bench_throughput_command_mailbox,
+    bench_b1_b2_telemetry_spsc,
+    bench_b1_b2_telemetry_fanout,
+    bench_b1_b2_state_spsc,
+    bench_b1_b2_command_mailbox,
 );
 criterion_main!(benches);
diff --git a/aimdb-bench/benches/b1_latency.rs b/aimdb-bench/benches/b1_latency.rs
deleted file mode 100644
index 82360b7..0000000
--- a/aimdb-bench/benches/b1_latency.rs
+++ /dev/null
@@ -1,138 +0,0 @@
-//! B1 — Push-to-recv latency benchmarks (Criterion).
-//!
-//! Measures the wall-clock latency from `buf.push(msg)` to `reader.recv()`
-//! returning, for each workload profile.  Uses `TokioBuffer<T>` directly to
-//! isolate the buffer layer from `AimDb` initialization overhead.
-//!
-//! **Measurement model:** `iter_custom` gives Criterion the total elapsed time
-//! for *iters* push → recv cycles (post-warmup).  Criterion computes the
-//! per-iteration p50/p99 distribution over many samples.
-//!
-//! **Executor:** a single current-thread Tokio runtime is shared across all
-//! bench iterations.  This eliminates work-stealing scheduler noise and keeps
-//! the signal comparable to the B0 allocator runs.
-//!
-//! Run:
-//! ```text
-//! cargo bench -p aimdb-bench --bench b1_latency
-//! # Save a named baseline:
-//! cargo bench -p aimdb-bench --bench b1_latency -- --save-baseline main
-//! # Compare against that baseline:
-//! cargo bench -p aimdb-bench --bench b1_latency -- --baseline main
-//! ```
-
-use aimdb_bench::profiles::{
-    command_buffer, command_msg, state_buffer, state_msg, telemetry_buffer, telemetry_msg,
-    WARMUP_ITERS,
-};
-use aimdb_core::buffer::{Buffer, Reader};
-use criterion::{criterion_group, criterion_main, Criterion};
-
-// ── Telemetry: SpmcRing / broadcast ──────────────────────────────────────────
-
-fn bench_latency_telemetry(c: &mut Criterion) {
-    let rt = tokio::runtime::Builder::new_current_thread()
-        .build()
-        .expect("tokio runtime");
-
-    let mut group = c.benchmark_group("B1-Latency");
-
-    group.bench_function("telemetry_spsc", |b| {
-        b.iter_custom(|iters| {
-            rt.block_on(async {
-                let buf = telemetry_buffer();
-                let mut reader = Reader::new(Box::new(buf.subscribe()));
-
-                // Warmup — not timed.
-                for i in 0..WARMUP_ITERS {
-                    buf.push(telemetry_msg(i as u64));
-                    let _ = reader.recv().await;
-                }
-
-                let start = std::time::Instant::now();
-                for i in 0..iters {
-                    buf.push(telemetry_msg((WARMUP_ITERS as u64) + i));
-                    let _ = reader.recv().await;
-                }
-                start.elapsed()
-            })
-        });
-    });
-
-    group.finish();
-}
-
-// ── State: SingleLatest / watch ───────────────────────────────────────────────
-
-fn bench_latency_state(c: &mut Criterion) {
-    let rt = tokio::runtime::Builder::new_current_thread()
-        .build()
-        .expect("tokio runtime");
-
-    let mut group = c.benchmark_group("B1-Latency");
-
-    group.bench_function("state_spsc", |b| {
-        b.iter_custom(|iters| {
-            rt.block_on(async {
-                let buf = state_buffer();
-                let mut reader = Reader::new(Box::new(buf.subscribe()));
-
-                for i in 0..WARMUP_ITERS {
-                    buf.push(state_msg(i as u64));
-                    let _ = reader.recv().await;
-                }
-
-                let start = std::time::Instant::now();
-                for i in 0..iters {
-                    buf.push(state_msg((WARMUP_ITERS as u64) + i));
-                    let _ = reader.recv().await;
-                }
-                start.elapsed()
-            })
-        });
-    });
-
-    group.finish();
-}
-
-// ── Command: Mailbox / Notify ─────────────────────────────────────────────────
-
-fn bench_latency_command(c: &mut Criterion) {
-    let rt = tokio::runtime::Builder::new_current_thread()
-        .build()
-        .expect("tokio runtime");
-
-    let mut group = c.benchmark_group("B1-Latency");
-
-    // Tight 1:1 push → recv loop — matches Mailbox semantics.
-    group.bench_function("command_mailbox", |b| {
-        b.iter_custom(|iters| {
-            rt.block_on(async {
-                let buf = command_buffer();
-                let mut reader = Reader::new(Box::new(buf.subscribe()));
-
-                for i in 0..WARMUP_ITERS {
-                    buf.push(command_msg(i as u64));
-                    let _ = reader.recv().await;
-                }
-
-                let start = std::time::Instant::now();
-                for i in 0..iters {
-                    buf.push(command_msg((WARMUP_ITERS as u64) + i));
-                    let _ = reader.recv().await;
-                }
-                start.elapsed()
-            })
-        });
-    });
-
-    group.finish();
-}
-
-criterion_group!(
-    benches,
-    bench_latency_telemetry,
-    bench_latency_state,
-    bench_latency_command,
-);
-criterion_main!(benches);
diff --git a/aimdb-bench/benches/b1_latency_embassy.rs b/aimdb-bench/benches/b1_latency_embassy.rs
deleted file mode 100644
index 99f7ac5..0000000
--- a/aimdb-bench/benches/b1_latency_embassy.rs
+++ /dev/null
@@ -1,132 +0,0 @@
-//! B1 — Push-to-recv latency on the Embassy adapter (host-driven, Criterion).
-//!
-//! The Embassy companion to [`b1_latency`]. Measures the wall-clock latency
-//! from `buf.push(msg)` to `reader.recv()` returning, for each workload
-//! profile, against the **Embassy** buffer backend. Driven on the host via
-//! `futures::executor::block_on` over embassy-sync's poll methods — no
-//! `embassy-runtime`, no cortex-m executor, no hardware.
-//!
-//! These are host wall-clock numbers for trend tracking and Tokio-vs-Embassy
-//! comparison; they are **not** a substitute for on-target cycle counts. Real
-//! embedded latency is measured in CPU cycles by the B3 STM32H5 bench
-//! (`examples/embassy-bench-stm32h5`).
-//!
-//! **Measurement model:** `iter_custom` gives Criterion the total elapsed time
-//! for *iters* push → recv cycles (post-warmup). Each reader is **primed**
-//! before the first push so the lazy SpmcRing subscriber is registered (see
-//! [`profiles_embassy`]).
-//!
-//! Run:
-//! ```text
-//! cargo bench -p aimdb-bench --bench b1_latency_embassy
-//! cargo bench -p aimdb-bench --bench b1_latency_embassy -- --save-baseline main
-//! cargo bench -p aimdb-bench --bench b1_latency_embassy -- --baseline main
-//! ```
-
-aimdb_embassy_adapter::host_test_stubs!();
-
-use aimdb_bench::profiles::{command_msg, state_msg, telemetry_msg, WARMUP_ITERS};
-use aimdb_bench::profiles_embassy::{command_buffer, prime, state_buffer, telemetry_buffer};
-use aimdb_core::buffer::{Buffer, Reader};
-use criterion::{criterion_group, criterion_main, Criterion};
-use futures::executor::block_on;
-
-// ── Telemetry: SpmcRing / PubSubChannel ──────────────────────────────────────
-
-fn bench_latency_telemetry(c: &mut Criterion) {
-    let mut group = c.benchmark_group("B1-Latency-Embassy");
-
-    group.bench_function("telemetry_spsc", |b| {
-        b.iter_custom(|iters| {
-            block_on(async {
-                let buf = telemetry_buffer();
-                let mut reader = Reader::new(Box::new(buf.subscribe()));
-                prime(&mut reader);
-
-                // Warmup — not timed.
-                for i in 0..WARMUP_ITERS {
-                    buf.push(telemetry_msg(i as u64));
-                    let _ = reader.recv().await;
-                }
-
-                let start = std::time::Instant::now();
-                for i in 0..iters {
-                    buf.push(telemetry_msg((WARMUP_ITERS as u64) + i));
-                    let _ = reader.recv().await;
-                }
-                start.elapsed()
-            })
-        });
-    });
-
-    group.finish();
-}
-
-// ── State: SingleLatest / Watch ───────────────────────────────────────────────
-
-fn bench_latency_state(c: &mut Criterion) {
-    let mut group = c.benchmark_group("B1-Latency-Embassy");
-
-    group.bench_function("state_spsc", |b| {
-        b.iter_custom(|iters| {
-            block_on(async {
-                let buf = state_buffer();
-                let mut reader = Reader::new(Box::new(buf.subscribe()));
-                prime(&mut reader);
-
-                for i in 0..WARMUP_ITERS {
-                    buf.push(state_msg(i as u64));
-                    let _ = reader.recv().await;
-                }
-
-                let start = std::time::Instant::now();
-                for i in 0..iters {
-                    buf.push(state_msg((WARMUP_ITERS as u64) + i));
-                    let _ = reader.recv().await;
-                }
-                start.elapsed()
-            })
-        });
-    });
-
-    group.finish();
-}
-
-// ── Command: Mailbox / Channel(capacity=1) ────────────────────────────────────
-
-fn bench_latency_command(c: &mut Criterion) {
-    let mut group = c.benchmark_group("B1-Latency-Embassy");
-
-    // Tight 1:1 push → recv loop — matches Mailbox semantics.
-    group.bench_function("command_mailbox", |b| {
-        b.iter_custom(|iters| {
-            block_on(async {
-                let buf = command_buffer();
-                let mut reader = Reader::new(Box::new(buf.subscribe()));
-                prime(&mut reader);
-
-                for i in 0..WARMUP_ITERS {
-                    buf.push(command_msg(i as u64));
-                    let _ = reader.recv().await;
-                }
-
-                let start = std::time::Instant::now();
-                for i in 0..iters {
-                    buf.push(command_msg((WARMUP_ITERS as u64) + i));
-                    let _ = reader.recv().await;
-                }
-                start.elapsed()
-            })
-        });
-    });
-
-    group.finish();
-}
-
-criterion_group!(
-    benches,
-    bench_latency_telemetry,
-    bench_latency_state,
-    bench_latency_command,
-);
-criterion_main!(benches);
diff --git a/aimdb-bench/src/lib.rs b/aimdb-bench/src/lib.rs
index 8e776ee..ba55abc 100644
--- a/aimdb-bench/src/lib.rs
+++ b/aimdb-bench/src/lib.rs
@@ -12,11 +12,9 @@
 //! | File                              | Class | Purpose                                  |
 //! |-----------------------------------|-------|------------------------------------------|
 //! | `benches/b0_alloc_tokio.rs`       | B0    | Per-message allocation (Tokio buffer)    |
-//! | `benches/b1_latency.rs`           | B1    | Push-to-recv latency (Tokio buffer)      |
-//! | `benches/b2_throughput.rs`        | B2    | Steady-state throughput (Tokio buffer)   |
+//! | `benches/b1_b2_tokio.rs`          | B1+B2 | Latency (time/iter) + throughput (Tokio) |
 //! | `benches/b0_alloc_embassy.rs`     | B0    | Per-message allocation (Embassy buffer)  |
-//! | `benches/b1_latency_embassy.rs`   | B1    | Push-to-recv latency (Embassy buffer)    |
-//! | `benches/b2_throughput_embassy.rs`| B2    | Steady-state throughput (Embassy buffer) |
+//! | `benches/b1_b2_embassy.rs`        | B1+B2 | Latency (time/iter) + throughput (Embassy)|
 //! | `benches/b_alloc_pipeline.rs`     | info  | Per-message allocation (runner pipeline) |
 //! | `benches/b_runner_pipeline.rs`    | info  | Runner pipeline throughput (Criterion)   |
 //!

From 51de07da92aa742da6c7c5e5b52274557191ec78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Sun, 21 Jun 2026 20:28:01 +0000
Subject: [PATCH 13/16] Refactor and enhance documentation for aimdb-bench

- Updated comments and documentation in `b_alloc_pipeline.rs` to clarify the measurement scope and execution details.
- Improved clarity in `b_runner_pipeline.rs` regarding the benchmarking process and its scope.
- Simplified and clarified the `alloc.rs` documentation, emphasizing the isolation of the counting allocator from production code.
- Revised `lib.rs` documentation to highlight the non-production nature of the benchmarking infrastructure.
- Enhanced comments in `profiles_embassy.rs` to better explain the behavior of Embassy buffers and the importance of lazy subscriber registration.
- Added a new design document `038-aimdb-bench-crate-design.md` outlining the structured benchmarking infrastructure for AimDB.
- Introduced `039-proof-artifact-and-story-roadmap.md` to detail the sequencing of proof artifacts and story publication.
---
 _external/embassy                             |   2 +-
 aimdb-bench/benches/b0_alloc_embassy.rs       |  46 +--
 aimdb-bench/benches/b0_alloc_tokio.rs         |  48 +--
 aimdb-bench/benches/b1_b2_embassy.rs          |  46 ++-
 aimdb-bench/benches/b1_b2_tokio.rs            |  43 +--
 aimdb-bench/benches/b_alloc_pipeline.rs       |  41 +--
 aimdb-bench/benches/b_runner_pipeline.rs      |  25 +-
 aimdb-bench/src/alloc.rs                      |  18 +-
 aimdb-bench/src/lib.rs                        |  19 +-
 aimdb-bench/src/profiles_embassy.rs           |  47 ++-
 docs/design/037-zero-alloc-consume-path.md    |   4 +-
 docs/design/038-aimdb-bench-crate-design.md   | 309 ++++++++++++++++++
 .../039-proof-artifact-and-story-roadmap.md   |  54 +++
 13 files changed, 498 insertions(+), 204 deletions(-)
 create mode 100644 docs/design/038-aimdb-bench-crate-design.md
 create mode 100644 docs/design/039-proof-artifact-and-story-roadmap.md

diff --git a/_external/embassy b/_external/embassy
index 9e31679..20f6d85 160000
--- a/_external/embassy
+++ b/_external/embassy
@@ -1 +1 @@
-Subproject commit 9e31679810ce57f10d0466fbe62afd3502d98357
+Subproject commit 20f6d85e827d3ecf50419001f2746fe5ec2186cc
diff --git a/aimdb-bench/benches/b0_alloc_embassy.rs b/aimdb-bench/benches/b0_alloc_embassy.rs
index ba57aef..9975358 100644
--- a/aimdb-bench/benches/b0_alloc_embassy.rs
+++ b/aimdb-bench/benches/b0_alloc_embassy.rs
@@ -1,32 +1,22 @@
 //! B0 — Allocation counting on the Embassy adapter (host-driven).
 //!
-//! The Embassy companion to [`b0_alloc_tokio`]. Measures per-message
-//! allocation cost for each workload profile against the **Embassy** buffer
-//! backend ([`EmbassyBuffer`]), driven on the host via
-//! `futures::executor::block_on` over embassy-sync's poll methods — no
-//! `embassy-runtime`, no cortex-m executor, no hardware.
+//! The Embassy companion to [`b0_alloc_tokio`]. Measures per-message allocation
+//! cost for each workload profile against the **Embassy** buffer backend
+//! ([`EmbassyBuffer`]), driven on the host via `futures::executor::block_on`
+//! over embassy-sync's `poll_*` methods — no `embassy-runtime`, no cortex-m
+//! executor, no hardware. `poll_recv` drives those methods with no per-message
+//! future box, so the expected result is **0 allocs/msg**, same as the Tokio
+//! suite; the one-time `Box::new(reader)` and lazy subscriber registration
+//! happen during setup/warmup, before the counters are reset.
 //!
-//! The Embassy `poll_recv` drives embassy-sync's public `poll_*` methods
-//! directly with no per-message future box, so the steady-state hot path is
-//! allocation-free — the expected result is **0 allocs/msg**, the same as the
-//! Tokio suite. The one-time `Box::new(reader)` and the lazy subscriber
-//! registration happen during setup/warmup, before the counters are reset.
+//! **Measurement model** (identical to `b0_alloc_tokio`): create buffer +
+//! reader and **prime** it (forces lazy SpmcRing subscriber registration — see
+//! [`profiles_embassy`]), warm up `WARMUP_ITERS` cycles, `reset()`, run
+//! `BATCH_SIZE` cycles, then `snapshot()` and divide by `BATCH_SIZE`.
 //!
-//! **Measurement model** (identical to `b0_alloc_tokio`):
-//! 1. Create buffer + reader; **prime** the reader (forces lazy SpmcRing
-//!    subscriber registration — see [`profiles_embassy`]).
-//! 2. Warmup ≥ `WARMUP_ITERS` push → recv cycles (excluded from counters).
-//! 3. `reset()` allocation counters.
-//! 4. Run `BATCH_SIZE` push → recv cycles.
-//! 5. `snapshot()` counters; divide by `BATCH_SIZE` for per-message figures.
-//!
-//! Run:
-//! ```text
-//! cargo bench -p aimdb-bench --bench b0_alloc_embassy
-//! ```
-//!
-//! Results are written to `aimdb-bench/target/bench-results/b0_alloc_embassy.json`
-//! (anchored to the crate dir, so the path is the same regardless of CWD).
+//! Run `cargo bench -p aimdb-bench --bench b0_alloc_embassy`; results are
+//! written to `aimdb-bench/target/bench-results/b0_alloc_embassy.json` (anchored
+//! to the crate dir).
 
 // The Embassy adapter calls `defmt::*` unconditionally and links embassy-time;
 // on the host neither a logger nor a time driver exists. This expands no-op
@@ -50,9 +40,9 @@ fn main() {
 
     // ── Telemetry: SpmcRing / PubSubChannel ──────────────────────────────────
     //
-    // `prime()` is REQUIRED here: the SpmcRing subscriber is created on the
-    // reader's first poll, so without priming the first pushed message would be
-    // missed and the first `recv()` would block forever.
+    // `prime()` is REQUIRED: the SpmcRing subscriber is created on the reader's
+    // first poll, so without it the first pushed message is missed and `recv()`
+    // blocks forever.
     let telemetry_report = block_on(async {
         let buf = telemetry_buffer();
         let mut reader = Reader::new(Box::new(buf.subscribe()));
diff --git a/aimdb-bench/benches/b0_alloc_tokio.rs b/aimdb-bench/benches/b0_alloc_tokio.rs
index b52ab4d..ae7aefb 100644
--- a/aimdb-bench/benches/b0_alloc_tokio.rs
+++ b/aimdb-bench/benches/b0_alloc_tokio.rs
@@ -1,33 +1,20 @@
 //! B0 — Allocation counting on the Tokio adapter.
 //!
-//! Measures per-message allocation cost for each workload profile using
-//! `TokioBuffer<T>` directly (not the full `AimDb` stack).
+//! Measures per-message allocation cost for each workload profile against
+//! `TokioBuffer<T>` directly (not the full `AimDb` stack). The consume path
+//! polls the reader without boxing a future per `recv()`, so the expected
+//! result is **0 allocs/msg**; a non-zero figure flags an allocation regression
+//! in the consume path.
 //!
-//! **Expected result: 0 allocs/msg.** The zero-allocation consume path drives
-//! the reader without boxing a future per `recv()`, so the steady-state hot
-//! path is allocation-free. This bench is the hard gate that keeps it that way:
-//! any per-message allocation that creeps back into the consume path shows up
-//! here as a non-zero `allocs/msg`.
+//! **Measurement model:** create buffer + reader, warm up `WARMUP_ITERS`
+//! push → recv cycles, `reset()` the counters, run `BATCH_SIZE` cycles, then
+//! `snapshot()` and divide by `BATCH_SIZE`. A current-thread Tokio runtime
+//! keeps scheduler allocation out of the hot path so the counter isolates
+//! AimDB's per-message contribution.
 //!
-//! **Measurement model:**
-//! 1. Create buffer + reader.
-//! 2. Warmup ≥ `WARMUP_ITERS` push → recv cycles (excluded from counters).
-//! 3. `reset()` allocation counters.
-//! 4. Run `BATCH_SIZE` push → recv cycles.
-//! 5. `snapshot()` counters; divide by `BATCH_SIZE` for per-message figures.
-//!
-//! **Noise reduction:** a current-thread Tokio runtime is used so there are
-//! no work-stealing threads and Tokio's scheduler does not allocate per-poll
-//! in the hot path.  The counter then cleanly isolates AimDB's per-message
-//! contribution.
-//!
-//! Run:
-//! ```text
-//! cargo bench -p aimdb-bench --bench b0_alloc_tokio
-//! ```
-//!
-//! Results are written to `aimdb-bench/target/bench-results/b0_alloc_tokio.json`
-//! (anchored to the crate dir, so the path is the same regardless of CWD).
+//! Run `cargo bench -p aimdb-bench --bench b0_alloc_tokio`; results are written
+//! to `aimdb-bench/target/bench-results/b0_alloc_tokio.json` (anchored to the
+//! crate dir).
 
 use aimdb_bench::{
     alloc::{reset, snapshot},
@@ -40,8 +27,7 @@ use aimdb_bench::{
 use aimdb_core::buffer::{Buffer, Reader};
 
 fn main() {
-    // Current-thread executor — no work-stealing threads, minimal scheduler
-    // overhead, clean allocation signal.
+    // Current-thread executor — no work-stealing threads, clean allocation signal.
     let rt = tokio::runtime::Builder::new_current_thread()
         .build()
         .expect("failed to build current-thread Tokio runtime");
@@ -96,10 +82,8 @@ fn main() {
 
     // ── Command: Mailbox / Mutex + Notify ────────────────────────────────────
     //
-    // Tight 1:1 push → recv loop matches Mailbox semantics.  Do NOT batch
-    // pushes ahead of the consumer: the single slot overwrites earlier values,
-    // and only the last write survives — which would conflate Mailbox overwrite
-    // semantics with throughput measurement.
+    // Tight 1:1 push → recv loop. Do NOT batch pushes ahead of the consumer:
+    // the single slot overwrites earlier values, leaving only the last write.
     let command_report = rt.block_on(async {
         let buf = command_buffer();
         let mut reader = Reader::new(Box::new(buf.subscribe()));
diff --git a/aimdb-bench/benches/b1_b2_embassy.rs b/aimdb-bench/benches/b1_b2_embassy.rs
index b43c1dd..fdb719f 100644
--- a/aimdb-bench/benches/b1_b2_embassy.rs
+++ b/aimdb-bench/benches/b1_b2_embassy.rs
@@ -1,33 +1,30 @@
 //! B1/B2 — Latency & throughput on the Embassy adapter (host-driven, Criterion).
 //!
-//! The Embassy companion to [`b1_b2_tokio`], capturing **both** the B1 and B2
-//! measurement classes from one set of runs against the **Embassy** buffer
-//! backend, driven on the host via `futures::executor::block_on` — no
-//! `embassy-runtime`, no cortex-m executor, no hardware:
+//! The Embassy companion to [`b1_b2_tokio`], capturing **both** measurement
+//! classes from one set of runs against the **Embassy** buffer backend, driven
+//! on the host via `futures::executor::block_on` — no `embassy-runtime`, no
+//! cortex-m executor, no hardware:
 //!
-//! - **B1 latency** — the per-iteration time Criterion reports for one
-//!   `buf.push(msg)` → `reader.recv()` cycle (the `time` column).
-//! - **B2 throughput** — messages/second, derived from that same timing via the
-//!   `Throughput::Elements(1)` annotation (the `thrpt` column).
+//! - **B1 latency** — per-iteration time for one `buf.push(msg)` →
+//!   `reader.recv()` cycle (the `time` column).
+//! - **B2 throughput** — messages/second from that same timing via
+//!   `Throughput::Elements(1)` (the `thrpt` column).
 //!
 //! Covers SPSC (1 producer, 1 consumer) for all three profiles plus a 1→4
-//! telemetry fan-out.
-//!
-//! These are host wall-clock numbers for trend tracking and Tokio-vs-Embassy
-//! comparison; on-target latency/throughput in CPU cycles is covered by the B3
+//! telemetry fan-out. These are host wall-clock numbers for trend tracking and
+//! Tokio-vs-Embassy comparison; on-target cycle counts are covered by the B3
 //! STM32H5 bench (`examples/embassy-bench-stm32h5`).
 //!
-//! **Fan-out safety rules (SpmcRing / PubSubChannel):**
-//! - All readers are **primed** before any messages are pushed, so each holds
-//!   its read position from the start (the embassy `Subscriber` is otherwise
-//!   created lazily on first poll and would miss earlier messages).
-//! - `SUBS = 4` on [`TelemetryBuffer`](aimdb_bench::profiles_embassy::TelemetryBuffer)
-//!   provides exactly four subscriber slots for the fan-out.
-//! - The loop is strict lockstep (1 push, then `recv` on every reader), so at
-//!   most one message is ever in flight and the fixed `CAP` never lags.
+//! **Fan-out safety (SpmcRing / PubSubChannel):** all readers are **primed**
+//! before any push so each holds its read position from the start (the embassy
+//! `Subscriber` is otherwise created lazily on first poll and would miss earlier
+//! messages); `SUBS = 4` on
+//! [`TelemetryBuffer`](aimdb_bench::profiles_embassy::TelemetryBuffer) provides
+//! the four subscriber slots, and strict lockstep keeps any reader from
+//! lagging behind the fixed `CAP`.
 //!
-//! **Mailbox throughput:** tight 1:1 push → recv loop. Do NOT batch pushes
-//! ahead of the consumer — the single slot overwrites earlier values.
+//! **Mailbox:** tight 1:1 push → recv loop. Do NOT batch pushes ahead of the
+//! consumer — the single slot overwrites earlier values.
 //!
 //! Run:
 //! ```text
@@ -78,10 +75,7 @@ fn bench_b1_b2_telemetry_spsc(c: &mut Criterion) {
 
 // ── Telemetry 1→4 fan-out ────────────────────────────────────────────────────
 //
-// All 4 readers are primed before any messages are pushed, so each registers
-// its subscriber at the current position. Each iteration: 1 push + recv on all
-// 4 readers. Lockstep keeps at most one message in flight, so the fixed CAP
-// never lags.
+// Each iteration: 1 push + recv on all 4 readers (see module fan-out rules).
 
 fn bench_b1_b2_telemetry_fanout(c: &mut Criterion) {
     let mut group = c.benchmark_group("B1-B2-Embassy");
diff --git a/aimdb-bench/benches/b1_b2_tokio.rs b/aimdb-bench/benches/b1_b2_tokio.rs
index 293696a..87212e3 100644
--- a/aimdb-bench/benches/b1_b2_tokio.rs
+++ b/aimdb-bench/benches/b1_b2_tokio.rs
@@ -1,30 +1,24 @@
 //! B1/B2 — Latency & throughput benchmarks (Criterion).
 //!
-//! A single Criterion suite that captures **both** the B1 and B2 measurement
-//! classes from one set of runs, using `TokioBuffer<T>` directly to isolate the
-//! buffer layer from `AimDb` initialization overhead:
+//! One Criterion suite capturing **both** measurement classes from a single set
+//! of runs, using `TokioBuffer<T>` directly to isolate the buffer layer from
+//! `AimDb` initialization overhead:
 //!
-//! - **B1 latency** — the per-iteration time Criterion reports for one
-//!   `buf.push(msg)` → `reader.recv()` cycle (the `time` column).
-//! - **B2 throughput** — messages/second, derived from that same timing via the
-//!   `Throughput::Elements(1)` annotation (the `thrpt` column).
+//! - **B1 latency** — per-iteration time for one `buf.push(msg)` →
+//!   `reader.recv()` cycle (the `time` column).
+//! - **B2 throughput** — messages/second from that same timing via
+//!   `Throughput::Elements(1)` (the `thrpt` column).
 //!
 //! Covers SPSC (1 producer, 1 consumer) for all three profiles plus a 1→4
-//! telemetry fan-out.
+//! telemetry fan-out, on a current-thread Tokio runtime (same as B0).
 //!
-//! **Fan-out safety rules (SpmcRing / broadcast):**
-//! - All readers are subscribed *before* any messages are pushed so each
-//!   reader holds its read position from the start.
-//! - The loop is strict lockstep (1 push, then `recv` on every reader), so at
-//!   most one message is ever in flight; the ring capacity (`TELEMETRY_CAPACITY`)
-//!   is far more than enough to keep any reader from lagging within an iteration.
+//! **Fan-out safety (SpmcRing / broadcast):** all readers subscribe *before*
+//! any push so each holds its read position from the start, and the loop is
+//! strict lockstep (1 push, then `recv` on every reader) so at most one message
+//! is in flight — no reader can lag behind `TELEMETRY_CAPACITY`.
 //!
-//! **Mailbox throughput:** tight 1:1 push → recv loop.  Do NOT batch pushes
-//! ahead of the consumer — the single slot overwrites earlier values and
-//! only the last write survives, which conflates Mailbox overwrite semantics
-//! with throughput measurement.  See design 038 §4 for details.
-//!
-//! **Executor:** single current-thread Tokio runtime, same as B0.
+//! **Mailbox:** tight 1:1 push → recv loop. Do NOT batch pushes ahead of the
+//! consumer — the single slot overwrites earlier values, leaving only the last.
 //!
 //! Run:
 //! ```text
@@ -78,10 +72,7 @@ fn bench_b1_b2_telemetry_spsc(c: &mut Criterion) {
 
 // ── Telemetry 1→4 fan-out ────────────────────────────────────────────────────
 //
-// All 4 readers are subscribed before any messages are pushed.
-// Each iteration: 1 push + recv on all 4 readers (sequential in bench, as
-// they would all eventually converge on a current-thread executor).
-// TELEMETRY_CAPACITY >= BATCH_SIZE ensures no reader lags.
+// Each iteration: 1 push + recv on all 4 readers (see module fan-out rules).
 
 fn bench_b1_b2_telemetry_fanout(c: &mut Criterion) {
     let rt = tokio::runtime::Builder::new_current_thread()
@@ -95,9 +86,7 @@ fn bench_b1_b2_telemetry_fanout(c: &mut Criterion) {
     group.bench_function("telemetry_fanout_1x4", |b| {
         b.iter_custom(|iters| {
             rt.block_on(async {
-                // All readers subscribed before first push so each holds its
-                // read position from the start. Lockstep below keeps at most one
-                // message in flight, so the fixed ring capacity never lags.
+                // Subscribe all readers before the first push (see module docs).
                 let buf = telemetry_buffer();
                 let mut r0 = Reader::new(Box::new(buf.subscribe()));
                 let mut r1 = Reader::new(Box::new(buf.subscribe()));
diff --git a/aimdb-bench/benches/b_alloc_pipeline.rs b/aimdb-bench/benches/b_alloc_pipeline.rs
index 978c5c9..748fe2c 100644
--- a/aimdb-bench/benches/b_alloc_pipeline.rs
+++ b/aimdb-bench/benches/b_alloc_pipeline.rs
@@ -1,34 +1,25 @@
 //! B0-Pipeline — Allocation counting for a live runner-driven pipeline.
 //!
 //! Measures per-message allocation cost for a real `.source()` -> buffer ->
-//! `.tap()` pipeline driven by `AimDbRunner`. Unlike `b0_alloc_tokio`, this is
-//! an integration-layer measurement: it includes runner/stage machinery in
-//! addition to the buffer consume path.
+//! `.tap()` pipeline driven by `AimDbRunner` — an integration-layer measurement
+//! that includes runner/stage machinery on top of the buffer consume path. The
+//! source generates each batch internally after a single start notification and
+//! the tap signals completion once the batch is consumed, so the measured window
+//! carries no per-message ingress or ack traffic.
 //!
-//! **Scope:** this bench intentionally minimizes harness-side noise. The source
-//! generates each batch internally after a single start notification, and the
-//! tap emits a single completion notification when the whole batch has been
-//! consumed. There is no per-message ingress or ack channel traffic in the
-//! measured window.
+//! Treat this as an informational companion to the raw-buffer B0 gate in
+//! `b0_alloc_tokio`: if it regresses, that gate still isolates whether the
+//! consume path itself is at fault.
 //!
-//! **Interpretation:** use this as an informational companion to the raw-buffer
-//! B0 gate in `b0_alloc_tokio`. If this regresses, the raw-buffer B0 still
-//! tells you whether the issue is in the consume path itself.
+//! Run `cargo bench -p aimdb-bench --bench b_alloc_pipeline`; results are written
+//! to `aimdb-bench/target/bench-results/b_alloc_pipeline.json` (anchored to the
+//! crate dir).
 //!
-//! Run:
-//! ```text
-//! cargo bench -p aimdb-bench --bench b_alloc_pipeline
-//! ```
-//!
-//! Results are written to `aimdb-bench/target/bench-results/b_alloc_pipeline.json`
-//! (anchored to the crate dir, so the path is the same regardless of CWD).
-//!
-//! **Executor dependency.** The source/tap pacing below uses a check-then-await
-//! pattern (load an atomic, and only `.notified().await` if there is no work).
-//! `Notify::notify_waiters()` does not store a permit, so this is only free of
-//! lost wakeups because the bench runs on a **current-thread** Tokio runtime:
-//! nothing can preempt between the atomic load and the `.await`. Do not port
-//! this harness to a multi-threaded executor without revisiting the pacing.
+//! **Executor dependency.** The source/tap pacing uses check-then-await (load an
+//! atomic, `.notified().await` only when there is no work). `notify_waiters()`
+//! stores no permit, so this avoids lost wakeups only because the bench runs on
+//! a **current-thread** runtime: nothing preempts between the load and the
+//! `.await`. Do not port to a multi-threaded executor without revisiting the pacing.
 
 use std::fmt::Debug;
 use std::sync::{
diff --git a/aimdb-bench/benches/b_runner_pipeline.rs b/aimdb-bench/benches/b_runner_pipeline.rs
index 7c62181..9ff1e79 100644
--- a/aimdb-bench/benches/b_runner_pipeline.rs
+++ b/aimdb-bench/benches/b_runner_pipeline.rs
@@ -1,23 +1,18 @@
 //! B-Runner-Pipeline — Runner-driven in-process pipeline throughput.
 //!
 //! Exercises the same three profiles as B0/B1/B2 through a real `AimDbRunner`
-//! path: `.source()` -> buffer -> `.tap()`.  This makes the benchmark measure
-//! stage wakeups and the runner-driven producer/consumer pipeline rather than
-//! direct `Producer<T>` / `Consumer<T>` calls from the bench body.
+//! path (`.source()` -> buffer -> `.tap()`), so it measures stage wakeups and
+//! the runner-driven producer/consumer pipeline rather than direct
+//! `Producer<T>` / `Consumer<T>` calls. It is in-process only — no outbound
+//! connectors, serialization, transport, or kernel I/O — and the timing window
+//! includes the handshakes that feed the source stage and observe completion at
+//! the tap stage.
 //!
-//! **Scope:** this is a real runner benchmark, but still in-process only.
-//! It does not include outbound connectors, serialization, transport, or
-//! kernel I/O. The timing window includes the coordination handshakes used to
-//! feed messages into the source stage and observe completion at the tap stage.
+//! One `AimDb` is built per bench group with its runner spawned once; Criterion
+//! samples push work into the source via an ingress channel and wait for the
+//! tap's completion signals.
 //!
-//! **Setup:** one `AimDb` instance is built per bench group, and its runner is
-//! spawned once. Criterion samples then push work into the source stage via an
-//! ingress channel and wait for completion signals emitted by the tap stage.
-//!
-//! Run:
-//! ```text
-//! cargo bench -p aimdb-bench --bench b_runner_pipeline
-//! ```
+//! Run `cargo bench -p aimdb-bench --bench b_runner_pipeline`.
 
 use std::fmt::Debug;
 use std::sync::Arc;
diff --git a/aimdb-bench/src/alloc.rs b/aimdb-bench/src/alloc.rs
index 41f56e8..6de1753 100644
--- a/aimdb-bench/src/alloc.rs
+++ b/aimdb-bench/src/alloc.rs
@@ -1,18 +1,10 @@
 //! Allocation counting for B0 benchmarks.
 //!
-//! Wraps the system allocator with atomic counters so B0 benchmarks can
-//! measure per-message allocation overhead precisely.
-//!
-//! **Production isolation.** `#[global_allocator]` is a per-binary
-//! link-time declaration.  `CountingAllocator` exists only in the bench
-//! binaries produced by `aimdb-bench`.  Nothing in the production
-//! dependency graph — `aimdb-core`, `aimdb-tokio-adapter`, application
-//! binaries — depends on `aimdb-bench`, so this has zero impact on
-//! production code.
-//!
-//! **Generic inner allocator.** `CountingAllocator<A>` is generic over the
-//! inner `GlobalAlloc` so a future embedded B3 target can swap `System` for
-//! `embedded-alloc` or similar without changing this module.
+//! Wraps an inner `GlobalAlloc` with atomic counters to measure per-message
+//! allocation overhead. `#[global_allocator]` is a per-binary, link-time
+//! declaration, so `CountingAllocator` affects only the bench binaries and has
+//! zero impact on production crates. It is generic over the inner allocator so
+//! an embedded target can swap `System` for `embedded-alloc`.
 
 use std::alloc::{GlobalAlloc, Layout, System};
 use std::sync::atomic::{AtomicU64, Ordering};
diff --git a/aimdb-bench/src/lib.rs b/aimdb-bench/src/lib.rs
index ba55abc..ae9500d 100644
--- a/aimdb-bench/src/lib.rs
+++ b/aimdb-bench/src/lib.rs
@@ -1,11 +1,10 @@
-//! AimDB benchmarking infrastructure.
+//! AimDB benchmarking infrastructure. **Not for production use.**
 //!
-//! Provides reusable primitives for B0 (allocation counting), B1 (latency),
-//! and B2 (throughput) benchmarks.  **Not for production use.**
-//!
-//! The `alloc` module registers [`alloc::CountingAllocator`] as the
-//! `#[global_allocator]` for every bench binary that links this crate.
-//! Nothing in the production dependency graph depends on `aimdb-bench`.
+//! Reusable primitives for B0 (allocation counting), B1 (latency), and B2
+//! (throughput) benchmarks. The `alloc` module registers
+//! [`alloc::CountingAllocator`] as the `#[global_allocator]` for every bench
+//! binary that links this crate; nothing in the production dependency graph
+//! depends on `aimdb-bench`.
 //!
 //! # Bench entrypoints
 //!
@@ -18,9 +17,9 @@
 //! | `benches/b_alloc_pipeline.rs`     | info  | Per-message allocation (runner pipeline) |
 //! | `benches/b_runner_pipeline.rs`    | info  | Runner pipeline throughput (Criterion)   |
 //!
-//! On-target cycle profiling (B3) is a separate hardware-only crate,
-//! `examples/embassy-bench-stm32h5`, because DWT cycle counting cannot run on a
-//! host. See design doc 038 for the on-target B3 harness.
+//! On-target cycle profiling (B3) lives in the hardware-only
+//! `examples/embassy-bench-stm32h5` crate, since DWT cycle counting cannot run
+//! on a host.
 
 pub mod alloc;
 pub mod profiles;
diff --git a/aimdb-bench/src/profiles_embassy.rs b/aimdb-bench/src/profiles_embassy.rs
index 3546091..705edc1 100644
--- a/aimdb-bench/src/profiles_embassy.rs
+++ b/aimdb-bench/src/profiles_embassy.rs
@@ -1,16 +1,16 @@
 //! Embassy buffer constructors for the host-driven B0/B1/B2 suites.
 //!
-//! These reuse the same payload types and message factories as the Tokio
-//! profiles ([`crate::profiles`]) so the two adapters are measured against
-//! identical workloads. Only the buffer backend differs: here the buffers are
-//! [`EmbassyBuffer`]s built on embassy-sync primitives, driven on the host via
+//! Reuse the same payload types and message factories as the Tokio profiles
+//! ([`crate::profiles`]) so both adapters are measured against identical
+//! workloads; only the backend differs. These are [`EmbassyBuffer`]s on
+//! embassy-sync primitives, driven on the host via
 //! `futures::executor::block_on`.
 //!
 //! # Const-generic sizing
 //!
-//! Unlike Tokio's runtime-sized buffers, Embassy buffers are sized at
-//! compile time (`EmbassyBuffer<T, CAP, SUBS, PUBS, WATCH_N>`). The aliases
-//! below fix those parameters per profile:
+//! Embassy buffers are sized at compile time
+//! (`EmbassyBuffer<T, CAP, SUBS, PUBS, WATCH_N>`). The aliases below fix those
+//! parameters per profile:
 //!
 //! | Profile   | Backend        | CAP | SUBS | PUBS | WATCH_N | Notes                       |
 //! |-----------|----------------|-----|------|------|---------|-----------------------------|
@@ -18,18 +18,16 @@
 //! | State     | `SingleLatest` | 1   | 1    | 1    | 4       | only WATCH_N is used        |
 //! | Command   | `Mailbox`      | 1   | 1    | 1    | 1       | Channel capacity is fixed=1 |
 //!
-//! The lockstep push→recv loops in the benches keep at most one message in
-//! flight, so `CAP=16` for Telemetry is far more than enough to avoid lagging.
+//! Lockstep push→recv keeps at most one message in flight, so `CAP=16` for
+//! Telemetry never lags.
 //!
-//! # Lazy SpmcRing subscriber (important)
+//! # Lazy SpmcRing subscriber
 //!
-//! An [`EmbassyBuffer`] `SpmcRing` reader registers its underlying embassy
-//! `Subscriber` **lazily, on its first poll** — not at `subscribe()` time. A
-//! message published before that first poll is therefore missed, and a
-//! subsequent `recv()` would block forever. Benches must call
-//! [`prime`] on each reader *before* the first `push`, which forces subscriber
-//! registration via `try_recv`. This is a no-op for Watch/Mailbox readers, so
-//! it is safe (and clearer) to prime every reader uniformly.
+//! An `SpmcRing` reader registers its embassy `Subscriber` lazily, on its first
+//! poll — not at `subscribe()` time. A message published before that poll is
+//! missed and a later `recv()` blocks forever. Benches must [`prime`] each
+//! reader *before* the first `push` to force registration; priming is a no-op
+//! for Watch/Mailbox readers.
 
 use aimdb_core::buffer::Reader;
 use aimdb_embassy_adapter::EmbassyBuffer;
@@ -64,16 +62,15 @@ pub fn command_buffer() -> CommandBuffer {
 }
 
 /// Force lazy subscriber registration on an Embassy reader before the first
-/// `push`.
+/// `push` (see module docs).
 ///
-/// For an `SpmcRing` reader this registers the embassy `Subscriber` at the
-/// current queue position so it does not miss the first published message (see
-/// the module docs). For Watch/Mailbox readers it is a harmless empty read.
-/// Must be called *outside* the measured window — registration may allocate.
+/// For `SpmcRing` this registers the `Subscriber` at the current queue position
+/// so it does not miss the first message; for Watch/Mailbox it is a harmless
+/// empty read. Must be called *outside* the measured window — registration may
+/// allocate.
 #[inline]
 pub fn prime<T: Clone + Send>(reader: &mut Reader<T>) {
-    // `Reader<T>` exposes `try_recv`; the only expected error here is
-    // `BufferEmpty`, which we deliberately ignore — the point is the side
-    // effect of creating the subscriber, not the (absent) value.
+    // The `BufferEmpty` error is ignored: we want the side effect of creating
+    // the subscriber, not the (absent) value.
     let _ = reader.try_recv();
 }
diff --git a/docs/design/037-zero-alloc-consume-path.md b/docs/design/037-zero-alloc-consume-path.md
index fe392cc..cebb636 100644
--- a/docs/design/037-zero-alloc-consume-path.md
+++ b/docs/design/037-zero-alloc-consume-path.md
@@ -61,7 +61,7 @@ pub async fn recv(&mut self) -> Result<T, DbError> {
 
 1. **WASM — unbox what already exists.** `WasmRecvFuture`'s poll body *becomes* `poll_recv`; delete the `Box::pin`. Smallest diff; do it first to validate the trait shape.
 2. **Tokio Mailbox.** Currently `Mutex<slot>` + `Notify` ([`buffer.rs:8`](../../aimdb-tokio-adapter/src/buffer.rs#L8)). Replace `Notify` with waker storage beside the slot — single-slot take semantics is the textbook poll pattern. Drops the `Notify` permit subtleties entirely. Waker contract: see §6.
-3. **Embassy.** Verified against the locked embassy-sync **0.8.0**: `Channel` exposes [`poll_receive(&self, cx)`](https://docs.rs/embassy-sync/0.8.0/embassy_sync/channel/struct.Channel.html) natively (channel.rs:332 in the crate source) and pubsub `Subscriber` implements `Stream` (`poll_next`). Direct mapping, zero stored state. If the Watch-backed path lacks a public poll fn, mirror its `changed()` poll body — embassy futures are hand-rolled poll structs; mechanical.
+3. **Embassy. (As built.)** `Channel` exposes `poll_receive(&self, cx)` natively → Mailbox maps directly, zero stored state, no `unsafe`. But the **pubsub `Subscriber` and watch `Receiver` had no lag-preserving public poll**: `Subscriber: Stream` exists but its `poll_next` silently drops `WaitResult::Lagged`, and the watch `Receiver`'s `poll_changed` sits on a *sealed* trait (not callable downstream). The poll-native methods exist internally (the async `next_message()`/`changed()` are built on them) but are deliberately private. Rather than hand-roll a `no_std` `ReusableBoxFuture` (which added ~80 lines of raw-pointer `unsafe` to the embedded adapter — the initial W8 cut did this), we added two **small, additive public wrappers** to the vendored `embassy-sync` and drive them directly: `Subscriber::poll_next_message(&mut self, cx) -> Poll<WaitResult<T>>` and `watch::Receiver::poll_changed(&mut self, cx) -> Poll<T>`. The reader stores the subscriber/receiver and polls them each call — zero per-message alloc, no future box, **no new `unsafe`** (only the pre-existing `'static` borrow extension, unchanged from pre-W8). The change is on a branch in the embassy submodule (`aimdb/public-poll-methods`) with an **upstream PR pending** — the parity argument is that `Channel::poll_receive` is already public, so this just gives pubsub/watch the matching method. See [tmp decision doc](tmp-embassy-poll-recv-unsafe-options.md).
 4. **Tokio Broadcast — the one residue.** `broadcast::Receiver` exposes no public poll API. Use the `BroadcastStream` technique: a `tokio_util::sync::ReusableBoxFuture` owned by the reader — **one allocation per subscriber lifetime, reused for every message**. This is a Tokio API limitation, not an AimDB design cost; documented as such.
 
 ### 3.3 Fused and remote paths inherit it
@@ -99,7 +99,7 @@ W8 exists to make a claim provable; the proof ships in the same series. These be
 - **Lagged mapping** unchanged: `Lagged(n)` → `DbError::BufferLagged(n)`; pinned by existing adapter tests.
 - **Mailbox waker contract.** Single-slot take semantics with potentially multiple readers: store wakers, wake-**all** on push (spurious wakeups are benign; losers re-poll to `Pending`). Wake-one is an optimization with a starvation analysis attached — not now.
 - **Auto-traits.** The `poll_fn` future is `Send` iff the reader is; verify at the session-pump and connector spawn sites (compiler enforces; listed so the error is expected, not surprising).
-- **Embassy Watch poll surface** unverified on 0.8.0 — confirm during step 3; fallback is the mechanical mirror noted in §3.2.
+- **Embassy poll surface** — RESOLVED. The watch `Receiver` and pubsub `Subscriber` had no lag-preserving public poll (sealed plumbing + a lossy `Stream`). Resolved by adding public `poll_changed` / `poll_next_message` wrappers to the vendored `embassy-sync` (§3.2 step 3), upstream PR pending — *not* by the hand-rolled `no_std` `ReusableBoxFuture` the initial cut used, which is deleted along with its `unsafe`.
 
 ## 7. Dormant items (trigger-only)
 
diff --git a/docs/design/038-aimdb-bench-crate-design.md b/docs/design/038-aimdb-bench-crate-design.md
new file mode 100644
index 0000000..5994984
--- /dev/null
+++ b/docs/design/038-aimdb-bench-crate-design.md
@@ -0,0 +1,309 @@
+# 038 — The `aimdb-bench` Crate: Structured Benchmarking Infrastructure (W8 Companion)
+
+**Status:** Design Doc 2026-06-13. Infrastructure for **continuous performance measurement** throughout AimDB development. Companion to [037-zero-alloc-consume-path.md](037-zero-alloc-consume-path.md), which uses these tools to validate the W8 changes. The crate is **not** W8-specific; it's a foundational framework for ongoing performance tracking and regression detection. Intentionally lands **before** 037 to capture the pre-W8 baseline; see §4 for sequencing detail.
+
+---
+
+## 1. Overview
+
+`aimdb-bench` is a **sustainable, long-term benchmarking infrastructure** for AimDB development. It captures three classes of measurements — B0 (allocations, hard gate), B1 (leaf latency, trend), B2 (throughput, trend) — plus B3 (on-target embedded profiling). Unlike PR-specific benchmarks, this framework runs continuously and flags regressions through baseline drift.
+
+**Design philosophy:**
+- Measure the same workloads repeatedly; detect regressions organically
+- Baseline capture + optional regression gates per metric
+- Sustainable data collection, not W8-specific before/after
+- Land before 037 (W8); first B0 baseline documents the pre-W8 allocation cost; 037 acceptance criterion is B0 reaching 0
+- Extensible to new profiles, adapters, and measurement classes as AimDB evolves
+
+**Key constraints:**
+- Host-only; dev-dependency fenced from `no_std` graph
+- Uses Criterion for B1/B2; hand-rolled `CountingAllocator<A>` for B0 (zero external deps, generic inner allocator for future embedded adaptation)
+- Profiles: **Telemetry** (SpmcRing/`broadcast`), **State** (SingleLatest/`watch`), **Command** (Mailbox)
+- Adapters: **Tokio** (std) and **Embassy** (on-host via test stubs)
+- Isolated from production code; no leaked measurements or side effects
+
+**Non-goals:**
+- PR-specific before/after comparisons (use git history and result snapshots instead)
+- System-level throughput or end-to-end workload simulation
+- WASM benchmarking (unit tests cover WASM allocation changes)
+- Real-time OS benchmarking or external scheduler effects
+- Third-party trend storage (results captured in repo; CI integration optional)
+
+---
+
+## 2. Repo-aligned scope
+
+`aimdb-bench` should be a host-only, opt-in workspace crate for measuring the current AimDB API surface. The first cut should stay on the codepaths that already exist in the repo:
+
+- `TokioBuffer<T>::push` / `TokioBuffer<T>::subscribe` — used directly (not via the full `AimDb` stack) to isolate the buffer layer from database initialization noise
+- `BufferReader::recv`
+- `BufferReader::try_recv`
+- existing buffer metrics and profiling gates in `aimdb-core`
+
+It should not introduce a new runtime harness, a per-adapter spawn abstraction, or production-facing API changes. Tokio is the first target. Embassy is a follow-up only if it can be exercised through the existing host-test stubs without enabling `embassy-runtime`; otherwise it belongs in a separate embedded or hardware-only follow-up.
+
+A separate bench file (`benches/b_aimdb_e2e.rs`) exercises the same three profiles through the full `AimDb` + `Producer<T>` + `Consumer<T>` stack. It is informational only (not a CI gate) and serves as a comparison point for the overhead added by the database initialization and record-lookup layers. It does **not** replace B0/B1/B2: those measure the buffer layer in isolation precisely because that is what 037 changes.
+
+### 2.1 Suggested crate shape
+
+```
+aimdb-bench/
+├── Cargo.toml
+├── src/
+│   ├── lib.rs
+│   ├── profiles.rs
+│   ├── alloc.rs
+│   └── reports.rs
+├── benches/
+│   ├── b0_alloc_tokio.rs
+│   ├── b1_b2_tokio.rs          # B1 latency (time/iter) + B2 throughput (msgs/sec)
+│   └── b_aimdb_e2e.rs          # full AimDb spinup comparison (informational, not a CI gate)
+└── data/
+    └── baselines/
+        └── (JSON baseline files checked into repo)
+```
+
+### 2.2 Dependency rules and workspace placement
+
+- Add `aimdb-bench` as a workspace member in `Cargo.toml` (under `members = [...]`).
+- Exclude it from `default-members` so `cargo build` does not build it by default.
+- Depend on `aimdb-core` and the Tokio adapter only.
+- Avoid pulling Embassy runtime features into the host benchmark crate.
+- If Embassy coverage is added later, prefer the existing host-test stubs or a separate hardware example, not a host-side `embassy-runtime` executor.
+- Update the Makefile `fmt` and `fmt-check` package loops to include `aimdb-bench` so CI formatting checks cover it.
+
+## 3. Phased delivery plan
+
+### Phase 1 — scaffold and measurement primitives
+
+- Add `aimdb-bench` as a workspace member (not in `default-members`).
+- Add `aimdb-bench` to the Makefile `fmt` and `fmt-check` package loops.
+- Implement workload profiles and deterministic message factories.
+- Implement allocation tracking and result structs.
+- B0/B1/B2 suites use `TokioBuffer<T>` directly; `b_aimdb_e2e.rs` uses the full `AimDb` + `Producer<T>` + `Consumer<T>` stack.
+
+### Phase 2 — B0 allocation gate on Tokio
+
+- Add the first benchmark suite for allocation counting on the Tokio adapter.
+- Measure Telemetry, State, and Command profiles using `TokioBuffer<T>` directly.
+- Exclude warmup from the measured window.
+- Compare against baseline and apply profile-specific regression budgets (optionally strict zero for W8-specific goals).
+
+### Phase 3 — B1 latency and B2 throughput
+
+- Add a combined latency + throughput bench (`b1_b2_tokio`) for SPSC and 1→4
+  fan-out. A throughput-annotated Criterion bench reports both the per-iteration
+  time (B1 latency) and msgs/sec (B2 throughput) from the same runs, so the two
+  classes share one bench rather than duplicating the same push→recv loop.
+- Keep raw Tokio broadcast/channel measurements only as context, not as a gate.
+- Store results in repo-local baselines.
+
+### Phase 4 — Embassy follow-up — **DONE**
+
+Resolved: Embassy coverage **can** run through the existing host-test stubs without `embassy-runtime`, so it lands in `aimdb-bench` alongside Tokio (no host-side Embassy executor).
+
+- **Host B0/B1/B2** (`b0_alloc_embassy`, `b1_b2_embassy` — the latter covers both B1 latency and B2 throughput): drive the real `EmbassyBuffer` backend (enabled via the adapter's `embassy-sync` + `embassy-time` features — **not** `embassy-runtime`, which would pull the cortex-m executor) through `futures::executor::block_on` over embassy-sync's poll methods. Buffer constructors live in `src/profiles_embassy.rs`. Each bench expands `aimdb_embassy_adapter::host_test_stubs!()` for the defmt logger / panic-handler / time-driver stubs, and links `critical-section/std` for `CriticalSectionRawMutex`.
+  - **Lazy-subscriber gotcha:** an Embassy `SpmcRing` reader registers its `Subscriber` lazily on first poll, so a message pushed before that poll is missed and `recv()` blocks forever. The benches call `profiles_embassy::prime()` (a `try_recv`) on each reader before the first `push`. Harmless for Watch/Mailbox.
+  - B0 result: **0 allocs/msg** across all three profiles (`data/baselines/b0_alloc_embassy.json`), matching the Tokio suite.
+- **On-target B3** (`examples/embassy-bench-stm32h5`): the measurements that *cannot* run on a host — DWT `CYCCNT` cycle-per-message counts on an STM32H563ZI (Cortex-M33 @ 250 MHz) for the same three profiles plus a 1→4 fan-out, plus a counting allocator around `embedded-alloc` to re-validate 0 allocs/msg on real hardware. Flashed/run via `probe-rs`; results stream over RTT as defmt logs.
+
+### Phase 5 — CI and baselines
+
+- Make B0 the required CI gate for buffer and connector changes.
+- Keep B1 and B2 informational.
+- Add baseline capture and comparison scripts.
+- Treat B3 as a manual, hardware-facing follow-up rather than a blocker for landing the crate.
+
+## 4. Measurement model
+
+### Workload profiles
+
+- Telemetry: small, high-frequency values over SPMC Ring.
+- State: larger latest-state snapshots over SingleLatest.
+- Command: overwrite-style control payloads over Mailbox.
+
+### Measurement semantics
+
+- B0 counts allocations per message.
+- B1 measures latency from publish to `recv` return.
+- B2 measures steady-state messages per second for SPSC and fan-out.
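+- B3 (on-target cycles per message) is not measured by this crate; it runs in a separate embedded profiling workflow (`examples/embassy-bench-stm32h5`, see Phase 4).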
+
+B1 and B2 are produced by a single Criterion bench (`b1_b2_tokio`, and its
+Embassy twin `b1_b2_embassy`): a throughput-annotated bench reports the
+per-iteration time (B1 latency) and msgs/sec (B2 throughput) from the same runs,
+so there is no separate `b1_latency` target — the SPSC loops would be identical.
+
+For B0, the primary purpose is comparison and regression detection over time, not a universal absolute-zero invariant across all contexts. The benchmark should measure a stable post-warmup window, report allocation metrics (for example total allocations and allocations/message), and compare those results against a committed baseline. CI pass/fail is then policy-driven from delta-to-baseline budgets per profile (strict or relaxed), while still hard-failing invalid runs (for example missing baseline, incomplete run, or invalid sample window).
+
+**Pre-037 sequencing:** The initial B0 baseline (captured before 037 lands) will show **1 alloc/msg** — the `Box::pin(async move { ... })` constructed on every `BufferReader::recv()` call in the Tokio adapter. This is the expected pre-W8 state and a valid, useful baseline. After 037 lands and replaces the async trait method with `poll_recv` + `core::future::poll_fn`, B0 should drop to 0. The delta — from 1 to 0 — is the acceptance criterion for the 037 SPI break.
+
+**B2 fan-out and lagging:** For `SpmcRing` 1→4 fan-out benchmarks, the bench must be designed to avoid `BufferLagged` errors; lagging is a distinct overload scenario, not part of peak throughput. Two rules guarantee a clean-path measurement: (1) subscribe all consumers *before* producing any messages — a `broadcast::Receiver` created after sends are in flight misses them, but one registered before holds its read position; (2) set `capacity >= messages_per_iteration` so no consumer can fall behind within a single bench iteration. If the lagging path is of interest, add a separate explicitly-named bench file (e.g., `b1_b2_saturated.rs`); mixing it into the main B2 suite conflates peak throughput with overload handling. Note: the error path in `TokioBufferReader` allocates unconditionally (`buffer_name: "broadcast".to_string()`), so its B0 will always be > 0 regardless of 037.
+
+**B2 Mailbox and `notify_waiters` semantics:** The Mailbox buffer uses `tokio::sync::Notify::notify_waiters()`, which wakes currently registered waiters but does **not** store a permit — a notification fired with no current waiters is silently dropped. This is not a bug: `TokioBufferReader::Notify::recv()` checks the value slot *before* calling `notified().await`, so a value written while no consumer is parked remains in the slot and is retrieved on the next `recv()` call without blocking. For B2, the correct measurement pattern is a tight 1:1 push→recv loop (producer sends one message, consumer calls `recv()`, repeat). Do **not** batch pushes ahead of the consumer in the Mailbox B2 suite: the slot holds exactly one value, intermediate writes overwrite earlier ones, and only the last value survives — which conflates Mailbox overwrite semantics with throughput measurement. A full `AimDb` spinup (see `b_aimdb_e2e.rs`) goes through exactly the same `Notify` + slot mechanism and does **not** change these semantics; it cannot work around them.
+
+### Allocation counter
+
+B0 uses a hand-rolled `CountingAllocator<A: GlobalAlloc>` wrapping the platform allocator — no external crates. The generic inner allocator `A` makes it adaptable for a future embedded B3 target without rework (swap `System` for `embedded-alloc` or similar). Byte tracking is included alongside call counting: it costs nothing extra and tells you the size of the future being eliminated, which is useful context when evaluating the 037 SPI break.
+
+```rust
+// aimdb-bench/src/alloc.rs
+use std::alloc::{GlobalAlloc, Layout, System};
+use std::sync::atomic::{AtomicU64, Ordering};
+
+pub static ALLOC_COUNT: AtomicU64 = AtomicU64::new(0);
+pub static ALLOC_BYTES: AtomicU64 = AtomicU64::new(0);
+
+pub struct CountingAllocator<A>(pub A);
+
+unsafe impl<A: GlobalAlloc> GlobalAlloc for CountingAllocator<A> {
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
+        ALLOC_BYTES.fetch_add(layout.size() as u64, Ordering::Relaxed);
+        self.0.alloc(layout)
+    }
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        self.0.dealloc(ptr, layout)
+    }
+}
+
+#[global_allocator]
+static GLOBAL: CountingAllocator<System> = CountingAllocator(System);
+```
+
+**Production isolation:** `#[global_allocator]` is a per-binary link-time declaration. `CountingAllocator` exists only in the bench binaries produced by `aimdb-bench`. Nothing in the production dependency graph — `aimdb-core`, `aimdb-tokio-adapter`, application binaries — depends on `aimdb-bench` or is affected in any way. The dependency graph runs one way only: `aimdb-bench → aimdb-core, aimdb-tokio-adapter`.
+
+**Noise reduction:** B0 bench binaries use `tokio::runtime::Builder::new_current_thread()`. On a current-thread executor there are no work-stealing threads; Tokio's scheduler does not allocate per-poll in the hot path. The counter then cleanly isolates AimDB's per-message contribution.
+
+**Measurement window:** Reset counters after warmup (≥100 recv iterations), then measure a batch of N messages and divide by N. Single-call measurement is less reliable even post-warmup; the batch average is the stable signal.
+
+## 5. Running order
+
+- `cargo bench -p aimdb-bench --bench b0_alloc_tokio`
+- `cargo bench -p aimdb-bench --bench b1_b2_tokio`
+- `cargo bench -p aimdb-bench --bench b0_alloc_embassy`
+- `cargo bench -p aimdb-bench --bench b1_b2_embassy`
+
+On-target B3 (cannot run on a host) — from `examples/embassy-bench-stm32h5/`, with a Nucleo-H563ZI attached:
+
+- `cargo run --release` (flashes via `probe-rs`; defmt/RTT reports cycles/msg + allocs/msg)
+
+## 6. CI and baselines
+
+- Store baselines in `data/baselines/`.
+- Compare new runs against the last known-good commit locally and in CI.
+- Gate only B0 on PRs that touch buffer or connector code.
+- Keep B1 and B2 as trend data for regressions, not hard failures.
+
+**Phase-1 governance (run in CI + manual PR review):** B0/B1/B2 always run and produce comparison artifacts; CI never hard-fails on regression deltas. PR reviewers apply manual judgment: B0 regressions warrant investigation, B1/B2 within expected variance proceed, sustained regressions trigger follow-up or mitigation notes. Hard gates are deferred until baseline history stabilizes (typically after 2–4 weeks of data).
+
+## 7. What stays out of scope for v1
+
+- Custom `RuntimeHarness` and `spawn_producer`
+- Host-side `embassy-runtime`
+- A new production-facing runtime layer
+- Embedded DWT profiling in the initial crate
+- PR-specific before/after automation as the primary workflow
+
+---
+
+## 14. Non-goals, future work, and extensibility
+
+**Not included (post-phase 5 or out of scope):**
+- Historical trending database or cloud storage (repo-local JSON + git is sufficient; external systems optional)
+- Automated per-commit trend analysis or ML-based regression detection
+- Real-time system benchmarking or workload simulation (L2+ pyramid; future work)
+- WASM benchmarking (unit tests cover WASM allocation changes)
+- Multi-node or distributed AimDB benchmarks (not yet supported)
+- GPU or FPGA acceleration profiling
+
+**Future work (extensible by design):**
+- **L2 pyramid**: Workload simulation (e.g., sensor fusion, multi-record chains, fan-out stress tests)
+- **Hot-path profiling**: Flame graph generation (using `perf` or `cargo-flamegraph`) for CPU-bound investigation
+- **Memory profiling**: Heap fragmentation analysis (post-B3)
+- **Regression dashboards**: Web UI for browsing baseline trends (integrates with repo JSON)
+- **New adapters**: Add benchmarks as new runtimes ship (e.g., `smol`, `async-std`)
+- **New profiles**: Domain-specific workloads (e.g., geospatial, financial timeseries)
+- **Adaptive sampling**: For noisy CI runners (reduce sample size dynamically)
+- **Platform-specific gates**: Per-architecture B0 budgets (e.g., stricter on MCU targets)
+
+**Extensibility**:
+- The profile model and report structs can grow with new measurement classes without changing the published API.
+- `WorkloadProfile` can be expanded with additional canonical workloads as AimDB grows.
+- `AllocationTracker` can be swapped for alternative instrumentation (e.g., `prophesize`, `valgrind`).
+- Result format (JSON) is human-readable and tool-friendly for downstream analysis.
+
+---
+
+## 15. Measurement assumptions and caveats
+
+1. **Microbenchmark scope**: B0–B2 isolate the consume path; no claim to system-level throughput or realistic workload behavior.
+2. **Single-threaded schedulers**: B0 and the Criterion-based B1/B2 benches use current-thread executors to reduce scheduler noise; results do not predict behavior on multi-threaded or work-stealing schedulers.
+3. **Criterion noise**: p50 (median) values are robust; p99 may vary by ±5–10% on noisy CI runners (acceptable for trend tracking, not for hard gates).
+4. **Warmup excluded**: Criterion and manual setup both exclude warmup samples from the measured set; first-run cache effects are not included.
+5. **Allocation tracker precision**: Counter-based (not true memory profiling); it reports allocation counts and byte totals aggregated per message, not per-call detail or heap fragmentation. Production binaries are fully isolated — `#[global_allocator]` is a per-binary link-time declaration that exists only in the bench binaries.
+6. **B3 hardware variation**: DWT cycle counts vary by CPU frequency scaling and pipeline state; B3 baseline must be recorded with CPU governor fixed to max frequency.
+7. **Host platform dependence**: Results are specific to the host CPU; cross-platform comparison requires normalization (not automated).
+8. **Development vs. release builds**: results from debug and `--release` builds can differ by 5–50×; always record the build flags when storing baselines.
+
+---
+
+## 16. Implementation and adoption
+
+### Pre-implementation checklist
+
+- [ ] Crate directory structure approved
+- [ ] B0 allocation gate budget (target: 0 allocs/msg) finalized
+- [ ] Criterion configuration tuned on target hardware (current-thread executor, pinning strategy)
+- [ ] Developer workflow documented
+
+### Post-implementation checklist
+
+- [x] B0–B2 scaffold + Criterion wiring compiles and runs
+- [x] B0 gate captures pre-W8 baseline (1 alloc/msg from `Box::pin` in `recv()`); **confirmed dropped to 0 once 037 / W8 landed** — `data/baselines/b0_alloc_tokio.json` now records 0 allocs/msg across all three tokio profiles
+- [x] **Embassy host B0/B1/B2 land** (`b0_alloc_embassy`, `b1_b2_embassy` — the latter covers both B1 latency and B2 throughput); `data/baselines/b0_alloc_embassy.json` records 0 allocs/msg across all three Embassy profiles
+- [ ] B1/B2 produce stable p50/p99 across multiple runs (\< 5% variance)
+- [x] **B3 harness runs on hardware** (`examples/embassy-bench-stm32h5`, Nucleo-H563ZI @ 250 MHz, release): first cycle-count baseline captured in `data/baselines/b3_cycles_stm32h5.json` — Telemetry 2013, State 2009, Command 1661, Telemetry 1→4 6239 cycles/msg, and **0 allocs/msg confirmed on the real `embedded-alloc` heap** across all profiles
+- [x] `data/baselines/` directory initialized with HEAD measurements
+- [ ] README, troubleshooting, and comparison workflow complete
+- [x] One-command workflow verified: `cargo bench -p aimdb-bench`
+
+### Initial rollout
+
+1. Land infrastructure (phases 1–3) in foundational PR
+2. Capture baseline measurements on HEAD; commit baseline JSON files to `data/baselines/`
+3. Integrate B0/B1/B2 CI job (advisory; no hard gate in phase 1)
+4. Document developer workflow: run `cargo bench -p aimdb-bench` locally, inspect JSON output in `target/criterion/`, or review action logs
+5. Use on next performance-critical PR (e.g., W8); refine based on developer feedback
+6. Schedule B3 baseline capture on hardware rig during next release preparation
+
+### Phase-1 CI Governance
+
+**Rationale:** Manual review balances regression visibility with implementation maturity. Criterion benches can be noisy on shared CI runners, so hard gates prematurely flag false positives. After 2–4 weeks of baseline history, the team can transition to policy-driven automated gates.
+
+**Phase-1 rules:**
+1. CI always runs B0/B1/B2 for PRs touching `aimdb-core/src/buffer/`, adapter `buffer.rs` files, or `connector.rs`.
+2. CI emits comparison JSON (new vs baseline) but never hard-fails.
+3. PR reviewer manually examines deltas:
+   - **B0 regression** (allocations/msg increased): investigate root cause before merge.
+   - **B1/B2 variance** (within ±5–10%): acceptable; proceed unless sustained across multiple runs.
+   - **Clear sustained regression** (>10% or repeated across commits): request mitigation or document in commit message.
+4. After stable baseline history, migrate to policy-driven thresholds (for example fail if B0 delta > +0.05 allocs/msg).
+
+### Ongoing usage
+
+- **Development**: Run `cargo bench -p aimdb-bench` locally to generate JSON logs in `target/criterion/`. Review output to inspect allocations, latency, and throughput.
+- **PRs with baseline changes**: Benchmark harness emits JSON files to `target/criterion/` and CI artifacts. Developer can checkout the PR and run `cargo bench` locally to inspect JSON output, or review action logs in the CI run.
+- **Baseline updates**: When measurements improve or degrade intentionally (e.g., after W8), commit updated baseline JSON files to `data/baselines/` with clear rationale in the commit message.
+- **Release prep**: Capture B3 on-target results; review against previous baseline; document in release notes if significant.
+- **Maintenance**: Update baselines as reference architecture improves; document rationale in git commit message.
+
+---
+
+## References
+
+- **037**: [Zero-Allocation Consume Path](037-zero-alloc-consume-path.md)
+- **Criterion.rs**: https://docs.rs/criterion
+- **DWT CYCCNT**: ARM Cortex-M debug spec, section D3.3
+- **STM32H5 HAL**: https://github.com/stm32-rs/stm32h5xx-hal
+- **`defmt`**: https://github.com/knurling-rs/defmt
diff --git a/docs/design/039-proof-artifact-and-story-roadmap.md b/docs/design/039-proof-artifact-and-story-roadmap.md
new file mode 100644
index 0000000..5e752a6
--- /dev/null
+++ b/docs/design/039-proof-artifact-and-story-roadmap.md
@@ -0,0 +1,54 @@
+# 038 — Proof-Artifact & Story Roadmap
+
+**Status:** Draft 2026-06-12. Companion to 037; sequences the claims-hardening artifacts from the June review against the content calendar. Publication anchor: **all stories release after the Bring-Your-Own-Connector (BYOC) and Bring-Your-Own-Adapter (BYOA) series.**
+
+---
+
+## 1. Operating principles
+
+1. **Two clocks, decoupled.** Artifacts land on *engineering time* (breaking window, release gates, hardware availability). Stories publish on *content time* (after BYOC/BYOA, in queue order). Nothing about the publication order is allowed to delay an artifact whose engineering deadline is earlier.
+2. **Artifact-gated, not calendar-gated.** A story ships when its §9-style numbers are merged — never before (037 §10). Calendar slots below are targets, not promises.
+3. **The series must not rot on publish.** BYOC and BYOA are SPI documentation. They get written against the SPI that will exist *after* the open breaking window closes — see §2.
+
+## 2. Hard constraints on the series itself
+
+- **BYOA teaches the post-W8 SPI.** W8 (037) changes `BufferReader` from boxed-future `recv` to `poll_recv`, and it must land inside the currently-open breaking window regardless of content plans. Writing the adapter series against the old SPI means it documents a deleted trait within weeks. Upside: `poll_recv` is the *better tutorial* — "implement one poll function, register the waker before `Pending`" is a cleaner adapter contract to teach than "return a `Pin<Box<dyn Future>>`", and the series quietly sets up story S1.
+- **BYOC is already stable.** The connector SPI post-W1 (fused `SerializedSource` + sync ingest closures, #141) is the shape to document. One recommendation to keep it stable: when W8 propagates (037 §3.3), keep `SerializedSource`'s **async wrapper** rather than exposing a poll method on the connector SPI — connector authors' world then doesn't change, and the BYOC series survives W8 untouched.
+- **Conformance suite runs early even though its story ships late.** First-run divergences between the Tokio/Embassy/WASM buffer implementations may require *behavioral* fixes — and behavioral fixes may be breaking. Discovering that after the window closes is the expensive version. Build and run the suite now (it reuses the W6 `host_test_stubs!` infra); publish S3 whenever the queue reaches it.
+
+## 3. Phase 0 — lands now, silently (before/during the series)
+
+| Item | Why now | Story attached |
+|---|---|---|
+| W8 + B0/B1/B2 baselines (037 §8) | Breaking window; "before" columns captured pre-merge | S1, later |
+| Conformance suite first run | Divergences may need the window (§2) | S3, later |
+| `#![forbid(unsafe_code)]` in `aimdb-core` | Trivial; enables the S5 claim | S5, later |
+| Size job in CI (cargo-size/bloat per feature set, thumbv7em) | Defends ~50 KB before anyone attacks it | S4, later |
+| Wording fixes: executor "zero dependencies" → "no required dependencies"; drop "real-time" until S7; serial + UDS rows in the connector table; retire the low-count star badge | Cheap correctness; none of these are stories | — |
+
+**One hardware day, two stories.** The next W3 rig session should cover *both* the remaining KNX matrix scenarios (soak, queue-flush, stale-channel NACK, backoff pacing, inbound flood, clean detection number) *and* the B3 before/after capture (DWT cycles + heap high-water, main vs W8 branch). That single session completes the artifact gates for S1 and S2.
+
+## 4. Phase 1–2 — the series (Alex's existing plan)
+
+BYOC, then BYOA, on their own schedule. Each installment may close with a one-line forward tease ("the adapter contract is one poll function — in a few weeks we'll show what that bought us, with cycle counts"), but the series carries no claims that the Phase-0 artifacts haven't already locked in.
+
+## 5. Phase 3 — the story queue (post-series, in order)
+
+| # | Story | Hook | Artifact gate | Channel |
+|---|---|---|---|---|
+| S1 | **One heap allocation per message — found, measured, removed** | `WasmRecvFuture` boxed solely to satisfy a trait signature; object safety vs `async fn`; declined the monomorphized lane *with data* | W8 merged; 037 §9 populated incl. B3 | r/rust (TWIR follow) |
+| S2 | **Our hardware matrix caught silent data loss before release** | Ten KNX writes vanishing behind warn-only AckTimeouts; spec-3.8.4 retransmit; same-day hardware re-validation | W3 matrix complete (release gate anyway) | r/embedded + blog; repurposed for DACH pilot decks & LinkedIn |
+| S3 | **Three runtimes, one semantics suite** | Proving "same buffer semantics on Tokio/Embassy/WASM" instead of asserting it; what diverged on first run | Suite in CI; divergences fixed or documented | r/rust |
+| S4 | **What a typed dataplane costs on a Cortex-M** | Binary-size archaeology: what actually dominates flash (fmt? panic infra? per-record monomorphization?); the honest ~50 KB breakdown | Size job + bloat analysis per feature set | embedded Rust; feeds the `Reader<T,B>` dormant trigger and grant TRL credibility |
+| S5 | **Proving the unsafe away** | `forbid(unsafe_code)` core; Miri on the WASM/Embassy `Send+Sync` impls; loom on the W8 Mailbox waker we just hand-rolled | Miri + loom jobs green in CI | r/rust |
+| S6 | **Schema evolution without a registry — proven** | proptest round-trips (up∘down = identity), old-reader/new-writer matrix | Property suite merged | r/rust + distributed-systems |
+| S7 | **Earning the word "real-time" back** | p99/p99.9 publish→wake jitter on Embassy, measured on the B3 rig; soft-RT claims with numbers or not at all | B3 infra + jitter harness; long soak | embedded; gated last deliberately |
+
+Parallel, different channel, existing plan: the MCP live-demo **Show HN** rides its own track and is unaffected by this queue.
+
+## 6. Cadence and sequencing notes
+
+- Suggested rhythm: one story per 2–3 weeks post-series, S1 first as the BYOA segue. S2 is independent and may interleave anywhere once the matrix closes.
+- If an artifact slips, the queue *reorders* rather than waits — every story is self-contained by design.
+- Each published story closes the loop in-repo: README/website wording updated in the same PR as the story's numbers (037 §8 rule, generalized), and the claim moves from "asserted" to "CI-enforced" in whatever doc owns it.
+- After S7, the original audit is fully discharged: every public claim is either CI-enforced, hardware-measured, or deleted.

From f82e81c46127683e3e7f1ba8921a1f22cc1d2e69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Sun, 21 Jun 2026 20:41:05 +0000
Subject: [PATCH 14/16] feat(buffer): add lazy subscriber creation explanation
 for SpmcRing

---
 aimdb-embassy-adapter/src/buffer.rs | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/aimdb-embassy-adapter/src/buffer.rs b/aimdb-embassy-adapter/src/buffer.rs
index d674c38..dc38d00 100644
--- a/aimdb-embassy-adapter/src/buffer.rs
+++ b/aimdb-embassy-adapter/src/buffer.rs
@@ -210,6 +210,14 @@ impl<
         }
     }
 
+    /// The Embassy subscriber/receiver is created **lazily on first poll**, not
+    /// here (unlike the Tokio adapter, which registers eagerly).
+    /// This matters only for `SpmcRing`: any message produced in the gap between
+    /// `subscribe()` and that first poll is missed, because the reader only
+    /// receives messages sent after it starts listening. In normal use the
+    /// consumer is spawned and loops on `recv()`, so the gap is harmless. If you
+    /// must produce before the consumer has polled, call `try_recv()` once first
+    /// to start listening early (this is what the B0/B2 benches' `prime()` does).
     fn subscribe(&self) -> Self::Reader {
         // Clone the Arc for the reader
         EmbassyBufferReader {

From 871e0e346f035feb13e12f171c511a1b2b54040b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Sun, 21 Jun 2026 21:13:29 +0000
Subject: [PATCH 15/16] chore: remove outdated design documents for aimdb-bench
 and proof artifacts

---
 docs/design/038-aimdb-bench-crate-design.md   | 309 ------------------
 .../039-proof-artifact-and-story-roadmap.md   |  54 ---
 2 files changed, 363 deletions(-)
 delete mode 100644 docs/design/038-aimdb-bench-crate-design.md
 delete mode 100644 docs/design/039-proof-artifact-and-story-roadmap.md

diff --git a/docs/design/038-aimdb-bench-crate-design.md b/docs/design/038-aimdb-bench-crate-design.md
deleted file mode 100644
index 5994984..0000000
--- a/docs/design/038-aimdb-bench-crate-design.md
+++ /dev/null
@@ -1,309 +0,0 @@
-# 038 — The `aimdb-bench` Crate: Structured Benchmarking Infrastructure (W8 Companion)
-
-**Status:** Design Doc 2026-06-13. Infrastructure for **continuous performance measurement** throughout AimDB development. Companion to [037-zero-alloc-consume-path.md](037-zero-alloc-consume-path.md), which uses these tools to validate the W8 changes. The crate is **not** W8-specific; it's a foundational framework for ongoing performance tracking and regression detection. Intentionally lands **before** 037 to capture the pre-W8 baseline; see §4 for sequencing detail.
-
----
-
-## 1. Overview
-
-`aimdb-bench` is a **sustainable, long-term benchmarking infrastructure** for AimDB development. It captures three classes of measurements — B0 (allocations, hard gate), B1 (leaf latency, trend), B2 (throughput, trend) — plus B3 (on-target embedded profiling). Unlike PR-specific benchmarks, this framework runs continuously and flags regressions through baseline drift.
-
-**Design philosophy:**
-- Measure the same workloads repeatedly; detect regressions organically
-- Baseline capture + optional regression gates per metric
-- Sustainable data collection, not W8-specific before/after
-- Land before 037 (W8); first B0 baseline documents the pre-W8 allocation cost; 037 acceptance criterion is B0 reaching 0
-- Extensible to new profiles, adapters, and measurement classes as AimDB evolves
-
-**Key constraints:**
-- Host-only; dev-dependency fenced from `no_std` graph
-- Uses Criterion for B1/B2; hand-rolled `CountingAllocator<A>` for B0 (zero external deps, generic inner allocator for future embedded adaptation)
-- Profiles: **Telemetry** (SpmcRing/`broadcast`), **State** (SingleLatest/`watch`), **Command** (Mailbox)
-- Adapters: **Tokio** (std) and **Embassy** (on-host via test stubs)
-- Isolated from production code; no leaked measurements or side effects
-
-**Non-goals:**
-- PR-specific before/after comparisons (use git history and result snapshots instead)
-- System-level throughput or end-to-end workload simulation
-- WASM benchmarking (unit tests cover WASM allocation changes)
-- Real-time OS benchmarking or external scheduler effects
-- Third-party trend storage (results captured in repo; CI integration optional)
-
----
-
-## 2. Repo-aligned scope
-
-`aimdb-bench` should be a host-only, opt-in workspace crate for measuring the current AimDB API surface. The first cut should stay on the codepaths that already exist in the repo:
-
-- `TokioBuffer<T>::push` / `TokioBuffer<T>::subscribe` — used directly (not via the full `AimDb` stack) to isolate the buffer layer from database initialization noise
-- `BufferReader::recv`
-- `BufferReader::try_recv`
-- existing buffer metrics and profiling gates in `aimdb-core`
-
-It should not introduce a new runtime harness, a per-adapter spawn abstraction, or production-facing API changes. Tokio is the first target. Embassy is a follow-up only if it can be exercised through the existing host-test stubs without enabling `embassy-runtime`; otherwise it belongs in a separate embedded or hardware-only follow-up.
-
-A separate bench file (`benches/b_aimdb_e2e.rs`) exercises the same three profiles through the full `AimDb` + `Producer<T>` + `Consumer<T>` stack. It is informational only (not a CI gate) and serves as a comparison point for the overhead added by the database initialization and record-lookup layers. It does **not** replace B0/B1/B2: those measure the buffer layer in isolation precisely because that is what 037 changes.
-
-### 2.1 Suggested crate shape
-
-```
-aimdb-bench/
-├── Cargo.toml
-├── src/
-│   ├── lib.rs
-│   ├── profiles.rs
-│   ├── alloc.rs
-│   └── reports.rs
-├── benches/
-│   ├── b0_alloc_tokio.rs
-│   ├── b1_b2_tokio.rs          # B1 latency (time/iter) + B2 throughput (msgs/sec)
-│   └── b_aimdb_e2e.rs          # full AimDb spinup comparison (informational, not a CI gate)
-└── data/
-    └── baselines/
-        └── (JSON baseline files checked into repo)
-```
-
-### 2.2 Dependency rules and workspace placement
-
-- Add `aimdb-bench` as a workspace member in `Cargo.toml` (under `members = [...]`).
-- Exclude it from `default-members` so `cargo build` does not build it by default.
-- Depend on `aimdb-core` and the Tokio adapter only.
-- Avoid pulling Embassy runtime features into the host benchmark crate.
-- If Embassy coverage is added later, prefer the existing host-test stubs or a separate hardware example, not a host-side `embassy-runtime` executor.
-- Update the Makefile `fmt` and `fmt-check` package loops to include `aimdb-bench` so CI formatting checks cover it.
-
-## 3. Phased delivery plan
-
-### Phase 1 — scaffold and measurement primitives
-
-- Add `aimdb-bench` as a workspace member (not in `default-members`).
-- Add `aimdb-bench` to the Makefile `fmt` and `fmt-check` package loops.
-- Implement workload profiles and deterministic message factories.
-- Implement allocation tracking and result structs.
-- B0/B1/B2 suites use `TokioBuffer<T>` directly; `b_aimdb_e2e.rs` uses the full `AimDb` + `Producer<T>` + `Consumer<T>` stack.
-
-### Phase 2 — B0 allocation gate on Tokio
-
-- Add the first benchmark suite for allocation counting on the Tokio adapter.
-- Measure Telemetry, State, and Command profiles using `TokioBuffer<T>` directly.
-- Exclude warmup from the measured window.
-- Compare against baseline and apply profile-specific regression budgets (optionally strict zero for W8-specific goals).
-
-### Phase 3 — B1 latency and B2 throughput
-
-- Add a combined latency + throughput bench (`b1_b2_tokio`) for SPSC and 1→4
-  fan-out. A throughput-annotated Criterion bench reports both the per-iteration
-  time (B1 latency) and msgs/sec (B2 throughput) from the same runs, so the two
-  classes share one bench rather than duplicating the same push→recv loop.
-- Keep raw Tokio broadcast/channel measurements only as context, not as a gate.
-- Store results in repo-local baselines.
-
-### Phase 4 — Embassy follow-up — **DONE**
-
-Resolved: Embassy coverage **can** run through the existing host-test stubs without `embassy-runtime`, so it lands in `aimdb-bench` alongside Tokio (no host-side Embassy executor).
-
-- **Host B0/B1/B2** (`b0_alloc_embassy`, `b1_b2_embassy` — the latter covers both B1 latency and B2 throughput): drive the real `EmbassyBuffer` backend (enabled via the adapter's `embassy-sync` + `embassy-time` features — **not** `embassy-runtime`, which would pull the cortex-m executor) through `futures::executor::block_on` over embassy-sync's poll methods. Buffer constructors live in `src/profiles_embassy.rs`. Each bench expands `aimdb_embassy_adapter::host_test_stubs!()` for the defmt logger / panic-handler / time-driver stubs, and links `critical-section/std` for `CriticalSectionRawMutex`.
-  - **Lazy-subscriber gotcha:** an Embassy `SpmcRing` reader registers its `Subscriber` lazily on first poll, so a message pushed before that poll is missed and `recv()` blocks forever. The benches call `profiles_embassy::prime()` (a `try_recv`) on each reader before the first `push`. Harmless for Watch/Mailbox.
-  - B0 result: **0 allocs/msg** across all three profiles (`data/baselines/b0_alloc_embassy.json`), matching the Tokio suite.
-- **On-target B3** (`examples/embassy-bench-stm32h5`): the measurements that *cannot* run on a host — DWT `CYCCNT` cycle-per-message counts on an STM32H563ZI (Cortex-M33 @ 250 MHz) for the same three profiles plus a 1→4 fan-out, plus a counting allocator around `embedded-alloc` to re-validate 0 allocs/msg on real hardware. Flashed/run via `probe-rs`; results stream over RTT as defmt logs.
-
-### Phase 5 — CI and baselines
-
-- Make B0 the required CI gate for buffer and connector changes.
-- Keep B1 and B2 informational.
-- Add baseline capture and comparison scripts.
-- Treat B3 as a manual, hardware-facing follow-up rather than a blocker for landing the crate.
-
-## 4. Measurement model
-
-### Workload profiles
-
-- Telemetry: small, high-frequency values over SPMC Ring.
-- State: larger latest-state snapshots over SingleLatest.
-- Command: overwrite-style control payloads over Mailbox.
-
-### Measurement semantics
-
-- B0 counts allocations per message.
-- B1 measures latency from publish to `recv` return.
-- B2 measures steady-state messages per second for SPSC and fan-out.
-- B3 is deferred to a separate embedded profiling workflow.
-
-B1 and B2 are produced by a single Criterion bench (`b1_b2_tokio`, and its
-Embassy twin `b1_b2_embassy`): a throughput-annotated bench reports the
-per-iteration time (B1 latency) and msgs/sec (B2 throughput) from the same runs,
-so there is no separate `b1_latency` target — the SPSC loops would be identical.
-
-For B0, the primary purpose is comparison and regression detection over time, not a universal absolute-zero invariant across all contexts. The benchmark should measure a stable post-warmup window, report allocation metrics (for example total allocations and allocations/message), and compare those results against a committed baseline. CI pass/fail is then policy-driven from delta-to-baseline budgets per profile (strict or relaxed), while still hard-failing invalid runs (for example missing baseline, incomplete run, or invalid sample window).
-
-**Pre-037 sequencing:** The initial B0 baseline (captured before 037 lands) will show **1 alloc/msg** — the `Box::pin(async move { ... })` constructed on every `BufferReader::recv()` call in the Tokio adapter. This is the expected pre-W8 state and a valid, useful baseline. After 037 lands and replaces the async trait method with `poll_recv` + `core::future::poll_fn`, B0 should drop to 0. The delta — from 1 to 0 — is the acceptance criterion for the 037 SPI break.
-
-**B2 fan-out and lagging:** For `SpmcRing` 1→4 fan-out benchmarks, the bench must be designed to avoid `BufferLagged` errors; lagging is a distinct overload scenario, not part of peak throughput. Two rules guarantee a clean-path measurement: (1) subscribe all consumers *before* producing any messages — a `broadcast::Receiver` created after sends are in flight misses them, but one registered before holds its read position; (2) set `capacity >= messages_per_iteration` so no consumer can fall behind within a single bench iteration. If the lagging path is of interest, add a separate explicitly-named bench file (e.g., `b1_b2_saturated.rs`); mixing it into the main B2 suite conflates peak throughput with overload handling. Note: the error path in `TokioBufferReader` allocates unconditionally (`buffer_name: "broadcast".to_string()`), so its B0 will always be > 0 regardless of 037.
-
-**B2 Mailbox and `notify_waiters` semantics:** The Mailbox buffer uses `tokio::sync::Notify::notify_waiters()`, which wakes currently registered waiters but does **not** store a permit — a notification fired with no current waiters is silently dropped. This is not a bug: `TokioBufferReader::Notify::recv()` checks the value slot *before* calling `notified().await`, so a value written while no consumer is parked remains in the slot and is retrieved on the next `recv()` call without blocking. For B2, the correct measurement pattern is a tight 1:1 push→recv loop (producer sends one message, consumer calls `recv()`, repeat). Do **not** batch pushes ahead of the consumer in the Mailbox B2 suite: the slot holds exactly one value, intermediate writes overwrite earlier ones, and only the last value survives — which conflates Mailbox overwrite semantics with throughput measurement. A full `AimDb` spinup (see `b_aimdb_e2e.rs`) goes through exactly the same `Notify` + slot mechanism and does **not** change these semantics; it cannot work around them.
-
-### Allocation counter
-
-B0 uses a hand-rolled `CountingAllocator<A: GlobalAlloc>` wrapping the platform allocator — no external crates. The generic inner allocator `A` makes it adaptable for a future embedded B3 target without rework (swap `System` for `embedded-alloc` or similar). Byte tracking is included alongside call counting: it costs nothing extra and tells you the size of the future being eliminated, which is useful context when evaluating the 037 SPI break.
-
-```rust
-// aimdb-bench/src/alloc.rs
-use std::alloc::{GlobalAlloc, Layout, System};
-use std::sync::atomic::{AtomicU64, Ordering};
-
-pub static ALLOC_COUNT: AtomicU64 = AtomicU64::new(0);
-pub static ALLOC_BYTES: AtomicU64 = AtomicU64::new(0);
-
-pub struct CountingAllocator<A>(pub A);
-
-unsafe impl<A: GlobalAlloc> GlobalAlloc for CountingAllocator<A> {
-    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
-        ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
-        ALLOC_BYTES.fetch_add(layout.size() as u64, Ordering::Relaxed);
-        self.0.alloc(layout)
-    }
-    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
-        self.0.dealloc(ptr, layout)
-    }
-}
-
-#[global_allocator]
-static GLOBAL: CountingAllocator<System> = CountingAllocator(System);
-```
-
-**Production isolation:** `#[global_allocator]` is a per-binary link-time declaration. `CountingAllocator` exists only in the bench binaries produced by `aimdb-bench`. Nothing in the production dependency graph — `aimdb-core`, `aimdb-tokio-adapter`, application binaries — depends on `aimdb-bench` or is affected in any way. The dependency graph runs one way only: `aimdb-bench → aimdb-core, aimdb-tokio-adapter`.
-
-**Noise reduction:** B0 bench binaries use `tokio::runtime::Builder::new_current_thread()`. On a current-thread executor there are no work-stealing threads; Tokio's scheduler does not allocate per-poll in the hot path. The counter then cleanly isolates AimDB's per-message contribution.
-
-**Measurement window:** Reset counters after warmup (≥100 recv iterations), then measure a batch of N messages and divide by N. Single-call measurement is less reliable even post-warmup; the batch average is the stable signal.
-
-## 5. Running order
-
-- `cargo bench -p aimdb-bench --bench b0_alloc_tokio`
-- `cargo bench -p aimdb-bench --bench b1_b2_tokio`
-- `cargo bench -p aimdb-bench --bench b0_alloc_embassy`
-- `cargo bench -p aimdb-bench --bench b1_b2_embassy`
-
-On-target B3 (cannot run on a host) — from `examples/embassy-bench-stm32h5/`, with a Nucleo-H563ZI attached:
-
-- `cargo run --release` (flashes via `probe-rs`; defmt/RTT reports cycles/msg + allocs/msg)
-
-## 6. CI and baselines
-
-- Store baselines in `data/baselines/`.
-- Compare new runs against the last known-good commit locally and in CI.
-- Gate only B0 on PRs that touch buffer or connector code.
-- Keep B1 and B2 as trend data for regressions, not hard failures.
-
-**Phase-1 governance (run in CI + manual PR review):** B0/B1/B2 always run and produce comparison artifacts; CI never hard-fails on regression deltas. PR reviewers apply manual judgment: B0 regressions warrant investigation, B1/B2 within expected variance proceed, sustained regressions trigger follow-up or mitigation notes. Hard gates are deferred until baseline history stabilizes (typically after 2–4 weeks of data).
-
-## 7. What stays out of scope for v1
-
-- Custom `RuntimeHarness` and `spawn_producer`
-- Host-side `embassy-runtime`
-- A new production-facing runtime layer
-- Embedded DWT profiling in the initial crate
-- PR-specific before/after automation as the primary workflow
-
----
-
-## 14. Non-goals, future work, and extensibility
-
-**Not included (post-phase 5 or out of scope):**
-- Historical trending database or cloud storage (repo-local JSON + git is sufficient; external systems optional)
-- Automated per-commit trend analysis or ML-based regression detection
-- Real-time system benchmarking or workload simulation (L2+ pyramid; future work)
-- WASM benchmarking (unit tests cover WASM allocation changes)
-- Multi-node or distributed AimDB benchmarks (not yet supported)
-- GPU or FPGA acceleration profiling
-
-**Future work (extensible by design):**
-- **L2 pyramid**: Workload simulation (e.g., sensor fusion, multi-record chains, fan-out stress tests)
-- **Hot-path profiling**: Flame graph generation (using `perf` or `cargo-flamegraph`) for CPU-bound investigation
-- **Memory profiling**: Heap fragmentation analysis (post-B3)
-- **Regression dashboards**: Web UI for browsing baseline trends (integrates with repo JSON)
-- **New adapters**: Add benchmarks as new runtimes ship (e.g., `smol`, `async-std`)
-- **New profiles**: Domain-specific workloads (e.g., geospatial, financial timeseries)
-- **Adaptive sampling**: For noisy CI runners (reduce sample size dynamically)
-- **Platform-specific gates**: Per-architecture B0 budgets (e.g., stricter on MCU targets)
-
-**Extensibility**:
-- The profile model and report structs can grow with new measurement classes without changing the published API.
-- `WorkloadProfile` can be expanded with additional canonical workloads as AimDB grows.
-- `AllocationTracker` can be swapped for alternative instrumentation (e.g., `prophesize`, `valgrind`).
-- Result format (JSON) is human-readable and tool-friendly for downstream analysis.
-
----
-
-## 15. Measurement assumptions and caveats
-
-1. **Microbenchmark scope**: B0–B2 isolate the consume path; no claim to system-level throughput or realistic workload behavior.
-2. **Single-threaded schedulers**: Criterion and B1/B2 use current-thread executors to isolate noise; results do not predict multi-threaded behavior or work-stealing schedulers.
-3. **Criterion noise**: p50 medians are robust; p99 may vary ±5–10% on noisy CI runners (acceptable for trend tracking, not hard gates).
-4. **Warmup excluded**: Criterion and manual setup both exclude warmup samples from the measured set; first-run cache effects are not included.
-5. **Allocation tracker precision**: Counter-based (not true memory profiling); reports allocation count and byte total per-message aggregate, not per-call precision or heap fragmentation. Production binaries are fully isolated — `#[global_allocator]` is a per-binary link-time declaration scoped only to bench binaries.
-6. **B3 hardware variation**: DWT cycle counts vary by CPU frequency scaling and pipeline state; B3 baseline must be recorded with CPU governor fixed to max frequency.
-7. **Host platform dependence**: Results are specific to the host CPU; cross-platform comparison requires normalization (not automated).
-8. **Development vs. release builds**: `--release` vs. debug optimizations can differ by 5–50×; always specify build flags when storing baselines.
-
----
-
-## 16. Implementation and adoption
-
-### Pre-implementation checklist
-
-- [ ] Crate directory structure approved
-- [ ] B0 allocation gate budget (target: 0 allocs/msg) finalized
-- [ ] Criterion configuration tuned on target hardware (current-thread executor, pinning strategy)
-- [ ] Developer workflow documented
-
-### Post-implementation checklist
-
-- [x] B0–B2 scaffold + Criterion wiring compiles and runs
-- [x] B0 gate captures pre-W8 baseline (1 alloc/msg from `Box::pin` in `recv()`); **confirmed dropped to 0 once 037 / W8 landed** — `data/baselines/b0_alloc_tokio.json` now records 0 allocs/msg across all three tokio profiles
-- [x] **Embassy host B0/B1/B2 land** (`b0_alloc_embassy`, `b1_b2_embassy` — the latter covers both B1 latency and B2 throughput); `data/baselines/b0_alloc_embassy.json` records 0 allocs/msg across all three Embassy profiles
-- [ ] B1/B2 produce stable p50/p99 across multiple runs (\< 5% variance)
-- [x] **B3 harness runs on hardware** (`examples/embassy-bench-stm32h5`, Nucleo-H563ZI @ 250 MHz, release): first cycle-count baseline captured in `data/baselines/b3_cycles_stm32h5.json` — Telemetry 2013, State 2009, Command 1661, Telemetry 1→4 6239 cycles/msg, and **0 allocs/msg confirmed on the real `embedded-alloc` heap** across all profiles
-- [x] `data/baselines/` directory initialized with HEAD measurements
-- [ ] README, troubleshooting, and comparison workflow complete
-- [x] One-command workflow verified: `cargo bench -p aimdb-bench`
-
-### Initial rollout
-
-1. Land infrastructure (phases 1–3) in foundational PR
-2. Capture baseline measurements on HEAD; commit baseline JSON files to `data/baselines/`
-3. Integrate B0/B1/B2 CI job (advisory; no hard gate in phase 1)
-4. Document developer workflow: run `cargo bench -p aimdb-bench` locally, inspect JSON output in `target/criterion/`, or review action logs
-5. Use on next performance-critical PR (e.g., W8); refine based on developer feedback
-6. Schedule B3 baseline capture on hardware rig during next release preparation
-
-### Phase-1 CI Governance
-
-**Rationale:** Manual review balances regression visibility with implementation maturity. Criterion benches can be noisy on shared CI runners, so hard gates prematurely flag false positives. After 2–4 weeks of baseline history, the team can transition to policy-driven automated gates.
-
-**Phase-1 rules:**
-1. CI always runs B0/B1/B2 for PRs touching `aimdb-core/src/buffer/`, adapter `buffer.rs` files, or `connector.rs`.
-2. CI emits comparison JSON (new vs baseline) but never hard-fails.
-3. PR reviewer manually examines deltas:
-   - **B0 regression** (allocations/msg increased): investigate root cause before merge.
-   - **B1/B2 variance** (within ±5–10%): acceptable; proceed unless sustained across multiple runs.
-   - **Clear sustained regression** (>10% or repeated across commits): request mitigation or document in commit message.
-4. After stable baseline history, migrate to policy-driven thresholds (for example fail if B0 delta > +0.05 allocs/msg).
-
-### Ongoing usage
-
-- **Development**: Run `cargo bench -p aimdb-bench` locally to generate JSON logs in `target/criterion/`. Review output to inspect allocations, latency, and throughput.
-- **PRs with baseline changes**: Benchmark harness emits JSON files to `target/criterion/` and CI artifacts. Developer can checkout the PR and run `cargo bench` locally to inspect JSON output, or review action logs in the CI run.
-- **Baseline updates**: When measurements improve or degrade intentionally (e.g., after W8), commit updated baseline JSON files to `data/baselines/` with clear rationale in the commit message.
-- **Release prep**: Capture B3 on-target results; review against previous baseline; document in release notes if significant.
-- **Maintenance**: Update baselines as reference architecture improves; document rationale in git commit message.
-
----
-
-## References
-
-- **037**: [Zero-Allocation Consume Path](037-zero-alloc-consume-path.md)
-- **Criterion.rs**: https://docs.rs/criterion
-- **DWT CYCCNT**: ARM Cortex-M debug spec, section D3.3
-- **STM32H5 HAL**: https://github.com/stm32-rs/stm32h5xx-hal
-- **`defmt`**: https://github.com/knurling-rs/defmt
diff --git a/docs/design/039-proof-artifact-and-story-roadmap.md b/docs/design/039-proof-artifact-and-story-roadmap.md
deleted file mode 100644
index 5e752a6..0000000
--- a/docs/design/039-proof-artifact-and-story-roadmap.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# 038 — Proof-Artifact & Story Roadmap
-
-**Status:** Draft 2026-06-12. Companion to 037; sequences the claims-hardening artifacts from the June review against the content calendar. Publication anchor: **all stories release after the Bring-Your-Own-Connector (BYOC) and Bring-Your-Own-Adapter (BYOA) series.**
-
----
-
-## 1. Operating principles
-
-1. **Two clocks, decoupled.** Artifacts land on *engineering time* (breaking window, release gates, hardware availability). Stories publish on *content time* (after BYOC/BYOA, in queue order). Nothing about the publication order is allowed to delay an artifact whose engineering deadline is earlier.
-2. **Artifact-gated, not calendar-gated.** A story ships when its §9-style numbers are merged — never before (037 §10). Calendar slots below are targets, not promises.
-3. **The series must not rot on publish.** BYOC and BYOA are SPI documentation. They get written against the SPI that will exist *after* the open breaking window closes — see §2.
-
-## 2. Hard constraints on the series itself
-
-- **BYOA teaches the post-W8 SPI.** W8 (037) changes `BufferReader` from boxed-future `recv` to `poll_recv`, and it must land inside the currently-open breaking window regardless of content plans. Writing the adapter series against the old SPI means it documents a deleted trait within weeks. Upside: `poll_recv` is the *better tutorial* — "implement one poll function, register the waker before `Pending`" is a cleaner adapter contract to teach than "return a `Pin<Box<dyn Future>>`", and the series quietly sets up story S1.
-- **BYOC is already stable.** The connector SPI post-W1 (fused `SerializedSource` + sync ingest closures, #141) is the shape to document. One recommendation to keep it stable: when W8 propagates (037 §3.3), keep `SerializedSource`'s **async wrapper** rather than exposing a poll method on the connector SPI — connector authors' world then doesn't change, and the BYOC series survives W8 untouched.
-- **Conformance suite runs early even though its story ships late.** First-run divergences between the Tokio/Embassy/WASM buffer implementations may require *behavioral* fixes — and behavioral fixes may be breaking. Discovering that after the window closes is the expensive version. Build and run the suite now (it reuses the W6 `host_test_stubs!` infra); publish S3 whenever the queue reaches it.
-
-## 3. Phase 0 — lands now, silently (before/during the series)
-
-| Item | Why now | Story attached |
-|---|---|---|
-| W8 + B0/B1/B2 baselines (037 §8) | Breaking window; "before" columns captured pre-merge | S1, later |
-| Conformance suite first run | Divergences may need the window (§2) | S3, later |
-| `#![forbid(unsafe_code)]` in `aimdb-core` | Trivial; enables the S5 claim | S5, later |
-| Size job in CI (cargo-size/bloat per feature set, thumbv7em) | Defends ~50 KB before anyone attacks it | S4, later |
-| Wording fixes: executor "zero dependencies" → "no required dependencies"; drop "real-time" until S7; serial + UDS rows in the connector table; retire the low-count star badge | Cheap correctness; none of these are stories | — |
-
-**One hardware day, two stories.** The next W3 rig session should cover *both* the remaining KNX matrix scenarios (soak, queue-flush, stale-channel NACK, backoff pacing, inbound flood, clean detection number) *and* the B3 before/after capture (DWT cycles + heap high-water, main vs W8 branch). That single session completes the artifact gates for S1 and S2.
-
-## 4. Phase 1–2 — the series (Alex's existing plan)
-
-BYOC, then BYOA, on their own schedule. Each installment may close with a one-line forward tease ("the adapter contract is one poll function — in a few weeks we'll show what that bought us, with cycle counts"), but the series carries no claims that the Phase-0 artifacts haven't already locked in.
-
-## 5. Phase 3 — the story queue (post-series, in order)
-
-| # | Story | Hook | Artifact gate | Channel |
-|---|---|---|---|---|
-| S1 | **One heap allocation per message — found, measured, removed** | `WasmRecvFuture` boxed solely to satisfy a trait signature; object safety vs `async fn`; declined the monomorphized lane *with data* | W8 merged; 037 §9 populated incl. B3 | r/rust (TWIR follow) |
-| S2 | **Our hardware matrix caught silent data loss before release** | Ten KNX writes vanishing behind warn-only AckTimeouts; spec-3.8.4 retransmit; same-day hardware re-validation | W3 matrix complete (release gate anyway) | r/embedded + blog; repurposed for DACH pilot decks & LinkedIn |
-| S3 | **Three runtimes, one semantics suite** | Proving "same buffer semantics on Tokio/Embassy/WASM" instead of asserting it; what diverged on first run | Suite in CI; divergences fixed or documented | r/rust |
-| S4 | **What a typed dataplane costs on a Cortex-M** | Binary-size archaeology: what actually dominates flash (fmt? panic infra? per-record monomorphization?); the honest ~50 KB breakdown | Size job + bloat analysis per feature set | embedded Rust; feeds the `Reader<T,B>` dormant trigger and grant TRL credibility |
-| S5 | **Proving the unsafe away** | `forbid(unsafe_code)` core; Miri on the WASM/Embassy `Send+Sync` impls; loom on the W8 Mailbox waker we just hand-rolled | Miri + loom jobs green in CI | r/rust |
-| S6 | **Schema evolution without a registry — proven** | proptest round-trips (up∘down = identity), old-reader/new-writer matrix | Property suite merged | r/rust + distributed-systems |
-| S7 | **Earning the word "real-time" back** | p99/p99.9 publish→wake jitter on Embassy, measured on the B3 rig; soft-RT claims with numbers or not at all | B3 infra + jitter harness; long soak | embedded; gated last deliberately |
-
-Parallel, different channel, existing plan: the MCP live-demo **Show HN** rides its own track and is unaffected by this queue.
-
-## 6. Cadence and sequencing notes
-
-- Suggested rhythm: one story per 2–3 weeks post-series, S1 first as the BYOA segue. S2 is independent and may interleave anywhere once the matrix closes.
-- If an artifact slips, the queue *reorders* rather than waits — every story is self-contained by design.
-- Each published story closes the loop in-repo: README/website wording updated in the same PR as the story's numbers (037 §8 rule, generalized), and the claim moves from "asserted" to "CI-enforced" in whatever doc owns it.
-- After S7, the original audit is fully discharged: every public claim is either CI-enforced, hardware-measured, or deleted.

From 2b549742574a23300fce9ed96551e1eb7acf3a27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20Schn=C3=B6rch?=
 <alexander.schnoerch@outlook.com>
Date: Sun, 21 Jun 2026 21:26:10 +0000
Subject: [PATCH 16/16] docs: update comments for clarity on buffer types and
 Tokio primitives

---
 aimdb-bench/benches/b0_alloc_tokio.rs |  2 +-
 aimdb-bench/src/profiles.rs           | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/aimdb-bench/benches/b0_alloc_tokio.rs b/aimdb-bench/benches/b0_alloc_tokio.rs
index ae7aefb..db15212 100644
--- a/aimdb-bench/benches/b0_alloc_tokio.rs
+++ b/aimdb-bench/benches/b0_alloc_tokio.rs
@@ -80,7 +80,7 @@ fn main() {
     });
     state_report.print();
 
-    // ── Command: Mailbox / Mutex + Notify ────────────────────────────────────
+    // ── Command: Mailbox / Mutex slot + waker list ───────────────────────────
     //
     // Tight 1:1 push → recv loop. Do NOT batch pushes ahead of the consumer:
     // the single slot overwrites earlier values, leaving only the last write.
diff --git a/aimdb-bench/src/profiles.rs b/aimdb-bench/src/profiles.rs
index b57cc4b..c7129e1 100644
--- a/aimdb-bench/src/profiles.rs
+++ b/aimdb-bench/src/profiles.rs
@@ -2,11 +2,11 @@
 //!
 //! Three profiles match the three buffer types:
 //!
-//! | Profile      | Buffer type   | Tokio primitive     | Payload   |
-//! |--------------|---------------|---------------------|-----------|
-//! | **Telemetry**| `SpmcRing`    | `broadcast`         | small     |
-//! | **State**    | `SingleLatest`| `watch`             | medium    |
-//! | **Command**  | `Mailbox`     | `Mutex + Notify`    | small     |
+//! | Profile       | Buffer type    | Tokio primitive           | Payload |
+//! |---------------|----------------|---------------------------|---------|
+//! | **Telemetry** | `SpmcRing`     | `broadcast`               | small   |
+//! | **State**     | `SingleLatest` | `watch`                   | medium  |
+//! | **Command**   | `Mailbox`      | `Mutex` slot + waker list | small   |
 //!
 //! Buffers are constructed from a `BufferCfg` via the `Buffer<T>` trait so
 //! the bench code tests exactly the same code path that production uses.
@@ -106,7 +106,7 @@ pub fn state_buffer() -> TokioBuffer<StateMsg> {
     TokioBuffer::new(&BufferCfg::SingleLatest)
 }
 
-/// Build a `TokioBuffer<CommandMsg>` backed by `Mailbox` (`Mutex + Notify`).
+/// Build a `TokioBuffer<CommandMsg>` backed by `Mailbox` (`Mutex` slot + waker list).
 pub fn command_buffer() -> TokioBuffer<CommandMsg> {
     TokioBuffer::new(&BufferCfg::Mailbox)
 }